# Dave Jarvis' Repositories
#
# git clone https://repo.autonoma.ca/repo/sales.git
#!/bin/bash

# ########################################################################
#
# PURPOSE
#
# Filter data from a Safeway flyer.
#
# TODO: Delete the notification files if the subscriber added more
# products after a notification was sent, so they'll always get notified
# of the newly added product.
#
# ########################################################################

# Sets DATE_EXPIRY and DATE_EXPIRY_SEC to the expiration date in the
# vendor's flyer.
#
# Globals written:
#   DATE_EXPIRY     - Last whitespace-separated field on the flyer's first
#                     line (expected to be in %Y-%m-%d format); empty when
#                     the first line has fewer than two fields.
#   DATE_EXPIRY_SEC - DATE_EXPIRY in seconds since the epoch; empty when no
#                     date was found or date(1) could not parse it.
#
# $1 - Full path to the flyer file.
#
# Returns 0 on success, non-zero when no usable expiry date was found.
expiry() {
  # The expiry date is the last "word" on the line in %Y-%m-%d format.
  DATE_EXPIRY=$(head -n 1 -- "$1" | awk 'NF > 1 { print $NF }')
  DATE_EXPIRY_SEC=

  # An empty date string must not reach "date -d": GNU date would interpret
  # a quoted empty string as today rather than reporting an error.
  if [ -z "$DATE_EXPIRY" ]; then
    printf 'expiry: no expiry date found in %s\n' "$1" >&2
    return 1
  fi

  # Last command: its exit status is the function's return value.
  DATE_EXPIRY_SEC=$(date -d "$DATE_EXPIRY" '+%s')
}

# Downloads the vendor's flyer when needed, searches it for a product, and
# writes the notification message body and subject for a subscriber.
#
# When a notification was already sent before the current flyer's expiry
# date, the function returns without writing anything.
#
# Globals read:
#   FILE_MESSAGE_BODY, FILE_MESSAGE_SUBJECT, FILE_FLYER, FILE_NOTIFIED -
#     File names, resolved relative to the subscriber's directory.
#   TEXT_WIDTH  - Column width passed to lynx when dumping the flyer.
#   DATE_FORMAT - date(1) output format for the expiry date in the subject.
#
# Globals written:
#   DATE_NOTIFIED_SEC, XHTML, STORE_ADDRESS, STORE_CODE, FIND_PRODUCT,
#   expires; also DATE_EXPIRY and DATE_EXPIRY_SEC (via expiry). These are
#   deliberately left non-local in case callers read them.
#
# $1 - Subscriber's directory.
# $2 - Vendor name (used in the message body and subject).
# $3 - Product name to search for in the flyer.
# $4 - Product page (not used by this function).
# $5 - Postal code used to look up the nearest store.
scrape() {
  local DIR_SUBSCRIBER=$1
  local VENDOR_NAME=$2
  local PRODUCT_NAME=$3
  local PRODUCT_PAGE=$4
  local POSTAL_CODE=$5

  local \
    URL="http://specials.safeway.ca/flyers/accessibility/safewaycanada-flyer"

  # Write the message body and subject to these files.
  local PATH_BODY="$DIR_SUBSCRIBER/$FILE_MESSAGE_BODY"
  local PATH_SUBJECT="$DIR_SUBSCRIBER/$FILE_MESSAGE_SUBJECT"

  local PATH_FLYER="$DIR_SUBSCRIBER/$FILE_FLYER"
  local PATH_NOTIFIED="$DIR_SUBSCRIBER/$FILE_NOTIFIED"

  # Don't download the flyer unless it has expired (or doesn't exist).
  local download_flyer=false

  # If a flyer exists, check its expiration date against the notified date.
  if [ -s "$PATH_FLYER" ]; then

    # The flyer exists, but has a notification been sent?
    if [ -s "$PATH_NOTIFIED" ]; then
      DATE_NOTIFIED_SEC=$(cat -- "$PATH_NOTIFIED")

      # Set DATE_EXPIRY and DATE_EXPIRY_SEC.
      expiry "$PATH_FLYER"

      # If the notification date happened before the expiration date, then
      # don't send another notification. If either value is not an integer
      # the test itself fails, which falls through to a fresh download.
      if [ "$DATE_NOTIFIED_SEC" -lt "$DATE_EXPIRY_SEC" ]; then
        return
      fi

      download_flyer=true
    fi
  else
    # No flyer exists; download anew and notify, regardless of notified state.
    download_flyer=true
  fi

  if [ "$download_flyer" == true ]; then
    # Get the store code and address for a postal code.
    XHTML=$(curl -s "$URL?postal_code=$POSTAL_CODE" | hxnormalize -x)

    # Collapse all whitespace runs (including newlines) to single spaces so
    # the line-oriented sed expression below sees the markup on one line.
    # This replaces an unquoted "echo $XHTML", which flattened the text the
    # same way but also subjected the page content to pathname expansion.
    local xhtml_flat
    xhtml_flat=$(printf '%s' "$XHTML" | tr -s '[:space:]' ' ')

    STORE_ADDRESS=$(printf '%s\n' "$xhtml_flat" \
      | hxselect -c "select > option:first-child" \
      | recode html..ascii)
    STORE_CODE=$(printf '%s\n' "$xhtml_flat" \
      | hxselect -i "select > option:first-child" \
      | sed -n -e "s/^.*value=['\"]\(.*\)['\"].*/\1/p")

    # Download the flyer.
    lynx -nolist -nolog -accept_all_cookies -dump -width="$TEXT_WIDTH" \
      "$URL?store_code=$STORE_CODE" > "$PATH_FLYER"

    # Ensure the notification goes out by deleting the last notified file.
    rm -f -- "$PATH_NOTIFIED"

    printf '%s\n' "$VENDOR_NAME, $STORE_ADDRESS" > "$PATH_BODY"
  fi

  # Ensure flyer's expiry date is valid.
  expiry "$PATH_FLYER"

  # Replace every space in the product name so that the words match with
  # anything in between. This will match "Turkey Breast" against the vendor
  # text of "Turkey or Chicken Breast" (for example). Quotation marks are
  # not stripped here; the product name is used as given.
  #
  # Using bash internal is probably faster than sed.
  FIND_PRODUCT=${PRODUCT_NAME// /.*}

  # Store the message body in a temporary file. An empty pattern is skipped
  # because it would match every line of the flyer.
  if [ -n "$FIND_PRODUCT" ] \
    && grep -i -e "$FIND_PRODUCT" -- "$PATH_FLYER" > "$PATH_BODY.tmp"
  then
    {
      printf '\n'
      printf '%s\n' "$PRODUCT_NAME"
      printf '%s\n' "-------------------------"

      # Paste the message body after the product header and horizontal rule.
      # Remove all leading spaces, as well, so that conversion from Markdown
      # to HTML will work (e.g., via pandoc).
      #
      # Second expression fixes vendor's punctuation problem.
      sed -e "s/^[ \t]*//" -e "s/ \./\./g" -e "s/$/\./" < "$PATH_BODY.tmp"
    } >> "$PATH_BODY"

    # Put flyer expiry date in an unambiguous, human-readable format.
    expires=$(date -d "@$DATE_EXPIRY_SEC" +"$DATE_FORMAT")

    printf '%s\n' "[$VENDOR_NAME] Sale until $expires" > "$PATH_SUBJECT"
  fi

  # Clean up.
  rm -f -- "$PATH_BODY.tmp"
}