| Author | djarvis <email> |
|---|---|
| Date | 2016-06-10 14:09:46 GMT-0700 |
| Commit | 7952c02a47510d98035f8da826342e3c9ef6c0b5 |
| Parent | eeed0b7 |
| +# Overview | ||
| + | ||
| +Scripts in this directory extract product sale prices from vendor websites | ||
| +and create the notification files (subject and body) described under Outputs. | ||
| + | ||
| +# Inputs | ||
| + | ||
| +Each script receives the following inputs: | ||
| + | ||
| +* $1 - subscriber directory | ||
| +* $2 - vendor name | ||
| +* $3 - product name | ||
| +* $4 - product page | ||
| +* $5 - postal code | ||
| + | ||
| +Not all scripts use all the parameters. | ||
| + | ||
| +# Outputs | ||
| + | ||
| +Each script writes the following files into the subscriber's directory: | ||
| + | ||
| +* subject.txt - contains the subject line for the notification | ||
| +* body.txt - contains the body text for the notification | ||
| + | ||
| +If these files are not present, no notification will be sent. | ||
| + | ||
| +# Dependencies | ||
| + | ||
| +The scripts depend on the following tools: | ||
| + | ||
| +* **awk** - parse text files | ||
| +* **curl** - command-line tool for data transfer (http://curl.haxx.se) | ||
| +* **grep** - search files using regular expressions | ||
| +* **head** - extract the first N lines of a file | ||
| +* **html-xml-utils** - to parse HTML (https://www.w3.org/Tools/HTML-XML-utils) | ||
| +* **jq** - parses JSON | ||
| +* **lynx** - download web pages | ||
| +* **pandoc** - convert markup (http://pandoc.org) | ||
| +* **recode** - convert HTML entities to text strings | ||
| +* **sed** - search and replace text using regular expressions | ||
| +* **xmlstarlet** - select values from XML (or XHTML) elements | ||
| + | ||
| +#!/bin/bash | ||
| + | ||
| +# Dumps the Costco warehouse instant-savings page for the subscriber's | ||
| +# region to standard output. | ||
| +# | ||
| +# $5 - postal code (the position used by the other vendor scripts); when | ||
| +#      fewer than five arguments are passed, $3 is used instead so that | ||
| +#      existing three-argument callers keep working. | ||
| +# | ||
| +# Sets the global variables REGION and URL as a side effect. | ||
| +scrape() { | ||
| +  # Prefer the fifth argument; fall back to the legacy third argument. | ||
| +  local POSTAL_CODE=${5-$3} | ||
| + | ||
| +  # 1. Convert the first letter of the postal code to upper case. | ||
| +  POSTAL_CODE=${POSTAL_CODE:0:1} | ||
| +  POSTAL_CODE="${POSTAL_CODE^^}" | ||
| + | ||
| +  # 2. Convert the postal code into a region recognised by Costco. | ||
| +  #    Unknown (or missing) postal codes default to "bc". | ||
| +  case "$POSTAL_CODE" in | ||
| +    A) REGION=nl ;; | ||
| +    B) REGION=ns ;; | ||
| +    C) REGION=pe ;; | ||
| +    E) REGION=nb ;; | ||
| +    G|H|J) REGION=qb ;; | ||
| +    K|L|M|N|P) REGION=on ;; | ||
| +    R) REGION=mb ;; | ||
| +    S) REGION=sk ;; | ||
| +    T) REGION=ab ;; | ||
| +    V) REGION=bc ;; | ||
| +    X) REGION=nw ;; | ||
| +    Y) REGION=yt ;; | ||
| +    *) REGION=bc ;; | ||
| +  esac | ||
| + | ||
| +  # 3. Use the region to get the savings URL. | ||
| +  URL="http://www.costco.ca/warehouse-instant-savings-${REGION}.html" | ||
| + | ||
| +  # 4. Dump the data (quoted so the URL is always passed as one argument). | ||
| +  lynx -nolist -nolog -accept_all_cookies -dump -width=2048 \ | ||
| +    -useragent="Mozilla/5.0 Lynx" "$URL" | ||
| +} | ||
| + | ||
| +#!/bin/bash | ||
| + | ||
| +# ######################################################################## | ||
| +# | ||
| +# PURPOSE | ||
| +# | ||
| +# Filter data from a Safeway flyer. | ||
| +# | ||
| +# TODO: Delete the notification files if the subscriber added more | ||
| +# products after a notification was sent, so they'll always get notified | ||
| +# of the newly added product. | ||
| +# | ||
| +# ######################################################################## | ||
| + | ||
| +# Sets DATE_EXPIRY and DATE_EXPIRY_SEC to the expiration date in the | ||
| +# vendor's flyer. DATE_EXPIRY_SEC is the same date in seconds since the | ||
| +# epoch; it is left empty (and a non-zero status is returned) when no date | ||
| +# can be read from the flyer. | ||
| +# | ||
| +# $1 - Full path to the flyer file. | ||
| +expiry() { | ||
| +  # The expiry date is the last "word" on the first line, in %Y-%m-%d | ||
| +  # format. Lines with a single word are ignored by the awk filter. | ||
| +  DATE_EXPIRY=$(head -n 1 "$1" | awk 'NF>1{print $NF}') | ||
| +  DATE_EXPIRY_SEC= | ||
| + | ||
| +  # Without a date there is nothing to convert; an empty date string must | ||
| +  # not be handed to date(1). | ||
| +  [ -n "$DATE_EXPIRY" ] || return 1 | ||
| + | ||
| +  DATE_EXPIRY_SEC=$(date -d "$DATE_EXPIRY" '+%s') | ||
| +} | ||
| + | ||
| +# Searches the Safeway flyer for a subscribed product and, on a match, | ||
| +# appends the matching flyer lines to the message body and writes the | ||
| +# subject line. | ||
| +# | ||
| +# $1 - subscriber directory | ||
| +# $2 - vendor name | ||
| +# $3 - product name | ||
| +# $4 - product page (not used by this script) | ||
| +# $5 - postal code | ||
| +# | ||
| +# Reads FILE_MESSAGE_BODY, FILE_MESSAGE_SUBJECT, FILE_FLYER, FILE_NOTIFIED, | ||
| +# TEXT_WIDTH and DATE_FORMAT, none of which are defined in this script. | ||
| +scrape() { | ||
| +  local DIR_SUBSCRIBER=$1 | ||
| +  local VENDOR_NAME=$2 | ||
| +  local PRODUCT_NAME=$3 | ||
| +  local POSTAL_CODE=$5 | ||
| + | ||
| +  local URL="http://specials.safeway.ca/flyers/accessibility/safewaycanada-flyer" | ||
| + | ||
| +  # Write the message body and subject to these files. | ||
| +  local PATH_BODY="$DIR_SUBSCRIBER/$FILE_MESSAGE_BODY" | ||
| +  local PATH_SUBJECT="$DIR_SUBSCRIBER/$FILE_MESSAGE_SUBJECT" | ||
| + | ||
| +  local PATH_FLYER="$DIR_SUBSCRIBER/$FILE_FLYER" | ||
| +  local PATH_NOTIFIED="$DIR_SUBSCRIBER/$FILE_NOTIFIED" | ||
| + | ||
| +  # Don't download the flyer unless it has expired (or doesn't exist). | ||
| +  local download_flyer=false | ||
| + | ||
| +  # If a flyer exists, check its expiration date against the notified date. | ||
| +  # NOTE(review): when the flyer exists but no notification file does, the | ||
| +  # flyer is reused without looking at its expiry date -- confirm that | ||
| +  # something outside this script removes stale flyers. | ||
| +  if [ -s "$PATH_FLYER" ]; then | ||
| + | ||
| +    # The flyer exists, but has a notification been sent? | ||
| +    if [ -s "$PATH_NOTIFIED" ]; then | ||
| +      DATE_NOTIFIED_SEC=$(cat "$PATH_NOTIFIED") | ||
| + | ||
| +      # Set DATE_EXPIRY and DATE_EXPIRY_SEC. | ||
| +      expiry "$PATH_FLYER" | ||
| + | ||
| +      # If the notification date happened before the expiration date, then | ||
| +      # don't send another notification. | ||
| +      if [ "$DATE_NOTIFIED_SEC" -lt "$DATE_EXPIRY_SEC" ]; then | ||
| +        return | ||
| +      fi | ||
| + | ||
| +      download_flyer=true | ||
| +    fi | ||
| +  else | ||
| +    # No flyer exists; download anew and notify, regardless of notified state. | ||
| +    download_flyer=true | ||
| +  fi | ||
| + | ||
| +  if [ "$download_flyer" == true ]; then | ||
| +    # Get the store code and address for a postal code. | ||
| +    XHTML=$(curl -s "$URL?postal_code=$POSTAL_CODE" | hxnormalize -x) | ||
| + | ||
| +    # $XHTML is deliberately left unquoted, as in the original: word | ||
| +    # splitting flattens the document onto a single line before hxselect | ||
| +    # and sed see it. | ||
| +    STORE_ADDRESS=$(echo $XHTML | hxselect -c "select > option:first-child" | recode html..ascii) | ||
| +    STORE_CODE=$(echo $XHTML | hxselect -i "select > option:first-child" | sed -n -e "s/^.*value=['\"]\(.*\)['\"].*/\1/p") | ||
| + | ||
| +    # Download the flyer. | ||
| +    lynx -nolist -nolog -accept_all_cookies -dump -width="$TEXT_WIDTH" \ | ||
| +      "$URL?store_code=$STORE_CODE" > "$PATH_FLYER" | ||
| + | ||
| +    # Ensure the notification goes out by deleting the last notified file. | ||
| +    rm -f "$PATH_NOTIFIED" | ||
| + | ||
| +    # A fresh flyer starts a fresh message body. | ||
| +    echo "$VENDOR_NAME, $STORE_ADDRESS" > "$PATH_BODY" | ||
| +  fi | ||
| + | ||
| +  # Ensure flyer's expiry date is valid. | ||
| +  expiry "$PATH_FLYER" | ||
| + | ||
| +  # Strip the quotation marks from the product string (i.e., don't match 'em). | ||
| +  # If the product contains spaces, replace every space such that it matches | ||
| +  # anything in between. This will match "Turkey Breast" against the vendor | ||
| +  # text of "Turkey or Chicken Breast" (for example). | ||
| +  # | ||
| +  # The double slash replaces all occurrences, not only the first one. | ||
| +  FIND_PRODUCT=${PRODUCT_NAME//\"/} | ||
| +  FIND_PRODUCT=${FIND_PRODUCT// /.*} | ||
| + | ||
| +  # Store the message body in a temporary file. The pattern is quoted and | ||
| +  # passed via -e so it is never word-split, glob-expanded or taken for an | ||
| +  # option. | ||
| +  if grep -i -e "$FIND_PRODUCT" "$PATH_FLYER" > "$PATH_BODY.tmp" | ||
| +  then | ||
| +    echo "" >> "$PATH_BODY" | ||
| +    echo "$PRODUCT_NAME" >> "$PATH_BODY" | ||
| +    echo "-------------------------" >> "$PATH_BODY" | ||
| + | ||
| +    # Paste the message body after the product header and horizontal rule. | ||
| +    # Remove all leading spaces, as well, so that conversion from Markdown | ||
| +    # to HTML will work (e.g., via pandoc). | ||
| +    # | ||
| +    # Second expression fixes vendor's punctuation problem. | ||
| +    sed -e "s/^[ \t]*//" -e "s/ \./\./g" -e "s/$/\./" \ | ||
| +      < "$PATH_BODY.tmp" >> "$PATH_BODY" | ||
| + | ||
| +    # Put flyer expiry date in an unambiguous, human-readable format. | ||
| +    expires=$(date -d "@$DATE_EXPIRY_SEC" +"$DATE_FORMAT") | ||
| + | ||
| +    echo "[$VENDOR_NAME] Sale until $expires" > "$PATH_SUBJECT" | ||
| +  fi | ||
| + | ||
| +  # Clean up. | ||
| +  rm -f "$PATH_BODY.tmp" | ||
| +} | ||
| + | ||
| +#!/bin/bash | ||
| + | ||
| +# Checks a Hudson's Bay product page and, when the product is flagged as | ||
| +# being on sale, writes the subject line and appends the price details to | ||
| +# the message body. When the product is not on sale, the notification | ||
| +# files are removed. | ||
| +# | ||
| +# $1 - subscriber directory | ||
| +# $2 - vendor name | ||
| +# $3 - product name | ||
| +# $4 - product page | ||
| +# $5 - postal code (not used by this script) | ||
| +# | ||
| +# Reads FILE_MESSAGE_BODY, FILE_MESSAGE_SUBJECT, FILE_NOTIFIED and | ||
| +# DATE_FORMAT, none of which are defined in this script. | ||
| +scrape() { | ||
| +  local DIR_SUBSCRIBER=$1 | ||
| +  local VENDOR_NAME=$2 | ||
| +  local PRODUCT_NAME=$3 | ||
| +  local PRODUCT_PAGE=$4 | ||
| + | ||
| +  local URL="http://www.thebay.com/webapp/wcs/stores/servlet/en/thebay" | ||
| + | ||
| +  local PATH_BODY="$DIR_SUBSCRIBER/$FILE_MESSAGE_BODY" | ||
| +  local PATH_SUBJECT="$DIR_SUBSCRIBER/$FILE_MESSAGE_SUBJECT" | ||
| + | ||
| +  local PATH_NOTIFIED="$DIR_SUBSCRIBER/$FILE_NOTIFIED" | ||
| +  local PATH_PAGE="$DIR_SUBSCRIBER/page.html" | ||
| + | ||
| +  # Download the product page and normalise it into XHTML. | ||
| +  curl -s "$URL/$PRODUCT_PAGE" | hxnormalize -x 2>/dev/null > "$PATH_PAGE" | ||
| + | ||
| +  # The product identifier is stored in a form input named "productId". | ||
| +  local PRODUCT_ID=$(hxselect "input[name='productId']" < "$PATH_PAGE" | \ | ||
| +    xmlstarlet sel -t -v "input/@value") | ||
| + | ||
| +  # The element with this id holds the JSON that is queried below. | ||
| +  JSON=$(hxselect -c "div[id='entitledItem_$PRODUCT_ID']" < "$PATH_PAGE") | ||
| + | ||
| +  # Quote the JSON so the shell neither word-splits nor glob-expands it. | ||
| +  FLAG_SALE=$(echo "$JSON" | jq ".[0].saleFlag") | ||
| +  PRICE_CURRENT=$(echo "$JSON" | jq ".[0] | .offerPrice") | ||
| +  PRICE_PREVIOUS=$(echo "$JSON" | jq ".[0] | .listPrice") | ||
| + | ||
| +  # NOTE(review): a failed download leaves FLAG_SALE empty and therefore | ||
| +  # takes the "not on sale" branch below -- confirm that is intended. | ||
| +  if [ "$FLAG_SALE" == "true" ]; then | ||
| +    # Drop the surrounding double quotes that jq prints around strings. | ||
| +    PRICE_CURRENT="${PRICE_CURRENT%\"}" | ||
| +    PRICE_CURRENT="${PRICE_CURRENT#\"}" | ||
| + | ||
| +    PRICE_PREVIOUS="${PRICE_PREVIOUS%\"}" | ||
| +    PRICE_PREVIOUS="${PRICE_PREVIOUS#\"}" | ||
| + | ||
| +    echo "[$VENDOR_NAME] $PRODUCT_NAME now $PRICE_CURRENT" > "$PATH_SUBJECT" | ||
| + | ||
| +    local TIME_CURRENT=$(date +%0R) | ||
| + | ||
| +    # Reformat the date to be human-friendly. | ||
| +    local TIMESTAMP_CURRENT="$(date +"$DATE_FORMAT") $TIME_CURRENT" | ||
| + | ||
| +    # Put full details in the message body. | ||
| +    cat >> "$PATH_BODY" << EOL | ||
| + | ||
| +$PRODUCT_NAME | ||
| +------------------------- | ||
| +Price is $PRICE_CURRENT (was $PRICE_PREVIOUS) as of $TIMESTAMP_CURRENT. | ||
| + | ||
| +See [store]($URL/$PRODUCT_PAGE) for details. | ||
| +EOL | ||
| +  else | ||
| +    # Send out a notification if the item goes on sale again. | ||
| +    rm -f "$PATH_NOTIFIED" "$PATH_BODY" "$PATH_SUBJECT" | ||
| +  fi | ||
| +} | ||
| + | ||
| +#!/bin/bash | ||
| + | ||
| +# ######################################################################## | ||
| +# | ||
| +# PURPOSE | ||
| +# | ||
| +# Filter data from Thrifty Foods webpage. This creates a CSV file that | ||
| +# records any price changes since previous webpage downloads. | ||
| +# | ||
| +# DEPENDENCIES | ||
| +# | ||
| +# The hashed() function must be available to create a safe filename for | ||
| +# the product name. | ||
| +# | ||
| +# ######################################################################## | ||
| + | ||
| +# Records the current Thrifty Foods comparison price for a product and, | ||
| +# when it differs from the previously recorded price, writes the subject | ||
| +# line and appends the price details to the message body. | ||
| +# | ||
| +# $1 - subscriber directory | ||
| +# $2 - vendor name | ||
| +# $3 - product name | ||
| +# $4 - product page | ||
| +# $5 - postal code (not used by this script) | ||
| +# | ||
| +# Reads FILE_MESSAGE_BODY, FILE_MESSAGE_SUBJECT and DATE_FORMAT, and calls | ||
| +# hashed(), none of which are defined in this script. | ||
| +scrape() { | ||
| +  local DIR_SUBSCRIBER=$1 | ||
| +  local VENDOR_NAME=$2 | ||
| +  local PRODUCT_NAME=$3 | ||
| +  local PRODUCT_PAGE=$4 | ||
| + | ||
| +  local URL="https://www.thriftyfoods.com/product/$PRODUCT_PAGE" | ||
| + | ||
| +  # Write the message body and subject to these files. | ||
| +  local PATH_BODY="$DIR_SUBSCRIBER/$FILE_MESSAGE_BODY" | ||
| +  local PATH_SUBJECT="$DIR_SUBSCRIBER/$FILE_MESSAGE_SUBJECT" | ||
| + | ||
| +  # Store product data inside a file hashed from the product's name. The | ||
| +  # name is quoted so that a multi-word product reaches hashed() as a | ||
| +  # single argument. | ||
| +  local PATH_PRODUCT="$DIR_SUBSCRIBER/$(hashed "$PRODUCT_NAME").csv" | ||
| + | ||
| +  # Download the product webpage. | ||
| +  # Determine the price. | ||
| +  # Extract bulk cost (per 100G). | ||
| +  # Get the dollar amount. | ||
| +  # Remove the dollar sign. | ||
| +  PRICE_CURRENT=$(lynx -nolist -nolog -accept_all_cookies -dump "$URL" | \ | ||
| +    grep -A1 "Comparison price:" | \ | ||
| +    tail -n 1 | \ | ||
| +    awk '{print $1}' | \ | ||
| +    tr -d '$') | ||
| + | ||
| +  # Without a numeric price (download failed, page layout changed) there is | ||
| +  # nothing to compare; stop before bc and test are fed invalid input. | ||
| +  if ! [[ "$PRICE_CURRENT" =~ ^[0-9]*\.?[0-9]+$ ]]; then | ||
| +    return 0 | ||
| +  fi | ||
| + | ||
| +  PRICE_PREVIOUS=0 | ||
| + | ||
| +  if [ -s "$PATH_PRODUCT" ]; then | ||
| +    # Extract the previously recorded price from the end of the file. | ||
| +    PRICE_PREVIOUS=$(tail -n 1 "$PATH_PRODUCT" | awk -F "," '{print $3}') | ||
| + | ||
| +    # A malformed last record counts as "no previous price". | ||
| +    PRICE_PREVIOUS=${PRICE_PREVIOUS:-0} | ||
| +  fi | ||
| + | ||
| +  # Determine whether the price has changed (bc prints 1 when it has). | ||
| +  PRICE_CHANGED=$(echo "$PRICE_PREVIOUS != $PRICE_CURRENT" | bc) | ||
| + | ||
| +  # Only notify when a different dollar amount is found. | ||
| +  if [ "$PRICE_CHANGED" = "1" ]; then | ||
| +    local DATE_CURRENT=$(date +%0F) | ||
| +    local TIME_CURRENT=$(date +%0R) | ||
| + | ||
| +    # Save the new value (in CSV format) for a later comparison. | ||
| +    echo "$DATE_CURRENT,$TIME_CURRENT,$PRICE_CURRENT" >> "$PATH_PRODUCT" | ||
| + | ||
| +    # Reformat the date to be human-friendly. | ||
| +    TIMESTAMP_CURRENT="$(date +"$DATE_FORMAT") $TIME_CURRENT" | ||
| + | ||
| +    # Set the subject line with the final subscribed product and price. | ||
| +    echo "[$VENDOR_NAME] $PRODUCT_NAME now \$$PRICE_CURRENT" > "$PATH_SUBJECT" | ||
| + | ||
| +    # Put full details in the message body. | ||
| +    cat >> "$PATH_BODY" << EOL | ||
| + | ||
| +$PRODUCT_NAME | ||
| +------------------------- | ||
| +Price is \$$PRICE_CURRENT (was \$$PRICE_PREVIOUS) as of $TIMESTAMP_CURRENT. | ||
| +EOL | ||
| +  fi | ||
| +} | ||
| + | ||
| +1,Thrifty Foods,com_thriftyfoods | ||
| +2,Safeway,ca_safeway | ||
| +3,Costco,ca_costco | ||
| +4,Hudson's Bay,com_thebay | ||
| + | ||
| <goal>install</goal> | ||
| </goals> | ||
| + <properties> | ||
| + <skipTests>true</skipTests> | ||
| + </properties> | ||
| </action> | ||
| <action> | ||
| <properties> | ||
| <skipTests>true</skipTests> | ||
| - </properties> | ||
| - </action> | ||
| - <action> | ||
| - <actionName>run</actionName> | ||
| - <packagings> | ||
| - <packaging>jar</packaging> | ||
| - </packagings> | ||
| - <goals> | ||
| - <goal>process-classes</goal> | ||
| - <goal>org.codehaus.mojo:exec-maven-plugin:1.2.1:exec</goal> | ||
| - </goals> | ||
| - <properties> | ||
| - <exec.args>-classpath %classpath ${packageClassName}</exec.args> | ||
| - <exec.executable>java</exec.executable> | ||
| - | ||
| </properties> | ||
| </action> | ||
| + | ||
| </actions> | ||
| Product product = new Product.Builder() | ||
| .withName( "Sunflower Seeds" ) | ||
| - .withUrlPath( "sunflower-seeds/00000_000000000000005091" ) | ||
| .build(); | ||
| Scraper scraper = getScraper( "com_thriftyfoods.xml" ); | ||
| - | ||
| - scraper.addVariableToContext( "product", product ); | ||
| + scraper.addVariableToContext( "path", "sunflower-seeds/00000_000000000000005091" ); | ||
| scraper.execute(); | ||
| Variable price = scraper.getContext().getVar( "price" ); | ||
| -// Variable content = scraper.getContext().getVar( "content" ); | ||
| - | ||
| - System.out.println( "Content = " + price ); | ||
| + Variable sale = scraper.getContext().getVar( "sale" ); | ||
| -// Jarvest jarvest = new Jarvest(); | ||
| -// | ||
| -// String[] results = jarvest.exec( script( "com_thriftyfoods.rb" ), | ||
| -// "https://www.thriftyfoods.com/product/sunflower-seeds/00000_000000000000005091" ); | ||
| -// | ||
| -// results = jarvest.exec( script( "com_thriftyfoods.rb" ), | ||
| -// "https://www.thriftyfoods.com/product/crisp-applesparkling/00000_000000007468212834" ); | ||
| -// | ||
| -// results = jarvest.exec( script( "com_thriftyfoods.rb" ), | ||
| -// "https://www.thriftyfoods.com/product/four-cheese-manicottipasta/00000_000000005844146203" ); | ||
| + System.out.println( "Sale = " + sale ); | ||
| + System.out.println( "Price = " + price ); | ||
| } | ||
| Path working = Paths.get( getWorkingDirectory( resource ) ); | ||
| working = Files.createDirectories( working ); | ||
| - return Files.createTempDirectory( working, DIRECTORY_PREFIX ); | ||
| + working = Files.createTempDirectory( working, DIRECTORY_PREFIX ); | ||
| + | ||
| + working.toFile().deleteOnExit(); | ||
| + | ||
| + return working; | ||
| } | ||
| <config charset="UTF-8"> | ||
| - <exit condition='${!sys.isVariableDefined("product")}' message="Provide a product." /> | ||
| + <var-def name="vendor">https://www.thriftyfoods.com/product/</var-def> | ||
| - <var-def name="vendor">https://www.thriftyfoods.com/product/sunflower-seeds/00000_000000000000005091</var-def> | ||
| - | ||
| + <var-def name="page"> | ||
| + <html-to-xml outputtype="pretty" prunetags="script"> | ||
| + <http url="${vendor}${path}" /> | ||
| + </html-to-xml> | ||
| + </var-def> | ||
| + | ||
| <var-def name="price"> | ||
| - <xpath expression="(//span[@class='price' and @itemprop='price'])[last()]"> | ||
| - <html-to-xml> | ||
| - <http url="${vendor}" /> | ||
| - </html-to-xml> | ||
| + <xpath expression="(//span[@class='price' and @itemprop='price'])[last()]/text()"> | ||
| + <var name="page" /> | ||
| </xpath> | ||
| </var-def> | ||
| - | ||
| - <!-- | ||
| - xpath( '(//span[@class="price" and @itemprop="price"])[last()]' ) | ||
| - --> | ||
| -</config> | ||
| + <var-def name="sale"> | ||
| + <xpath expression="boolean(//div[contains(@class,'on-sale') and @itemprop='offerDetails'])"> | ||
| + <var name="page" /> | ||
| + </xpath> | ||
| + </var-def> | ||
| +</config> | ||
| Delta | 373 lines added, 44 lines removed, 329-line increase |
|---|