Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/sales.git

Added scrapers. Removed jARVEST. Set variables for script. Get return values from script. Update xpath for scraping.

Author djarvis <email>
Date 2016-06-10 14:09:46 GMT-0700
Commit 7952c02a47510d98035f8da826342e3c9ef6c0b5
Parent eeed0b7
database/vendors/README.md
+# Overview
+
+Scripts in this directory extract product sale prices from vendor websites
+and create the notification files described under Outputs.
+
+# Inputs
+
+Each script receives the following inputs:
+
+* $1 - subscriber directory
+* $2 - vendor name
+* $3 - product name
+* $4 - product page
+* $5 - postal code
+
+Not all scripts use all the parameters.
+
+# Outputs
+
+Each script writes the following files into the subscriber's directory:
+
+* subject.txt - contains the subject line for the notification
+* body.txt - contains the body text for the notification
+
+If these files are not present, no notification will be sent.
+
+# Dependencies
+
+The scripts depend on the following tools:
+
+* **awk** - parse text files
+* **curl** - command-line tool for data transfer (http://curl.haxx.se)
+* **grep** - search files using regular expressions
+* **head** - extract the first N lines of a file
+* **html-xml-utils** - to parse HTML (https://www.w3.org/Tools/HTML-XML-utils)
+* **jq** - parses JSON
+* **lynx** - download web pages
+* **pandoc** - convert markup (http://pandoc.org)
+* **recode** - convert HTML entities to text strings
+* **sed** - search and replace text using regular expressions
+* **xmlstarlet** - select values from XML (or XHTML) elements
+
database/vendors/ca_costco/scrape.sh
+#!/bin/bash
+
+scrape() {
+ local SCRIPT_DIR=$1
+ local SUBSCRIBER_DIR=$2
+ local POSTAL_CODE=$3
+
+ # 1. Convert the first letter of the postal code to upper case.
+ POSTAL_CODE=${3:0:1}
+ POSTAL_CODE="${POSTAL_CODE^^}"
+
+ # 2. Convert the postal code into a region recognised by Costco.
+ case "$POSTAL_CODE" in
+ A) REGION=nl ;;
+ B) REGION=ns ;;
+ C) REGION=pe ;;
+ E) REGION=nb ;;
+ G) REGION=qb ;;
+ H) REGION=qb ;;
+ J) REGION=qb ;;
+ K) REGION=on ;;
+ L) REGION=on ;;
+ M) REGION=on ;;
+ N) REGION=on ;;
+ P) REGION=on ;;
+ R) REGION=mb ;;
+ S) REGION=sk ;;
+ T) REGION=ab ;;
+ V) REGION=bc ;;
+ X) REGION=nw ;;
+ Y) REGION=yt ;;
+ *) REGION=bc ;;
+ esac
+
+ # 3. Use the region to get the savings URL.
+ URL="http://www.costco.ca/warehouse-instant-savings-${REGION}.html"
+
+ # 3. Dump the data.
+ lynx -nolist -nolog -accept_all_cookies -dump -width=2048 \
+ -useragent="Mozilla/5.0 Lynx" $URL
+}
+
database/vendors/ca_safeway/scrape.sh
+#!/bin/bash
+
+# ########################################################################
+#
+# PURPOSE
+#
+# Filter data from a Safeway flyer.
+#
+# TODO: Delete the notification files if the subscriber added more
+# products after a notification was sent, so they'll always get notified
+# of the newly added product.
+#
+# ########################################################################
+
+# Sets DATE_EXPIRY and DATE_EXPIRY_SEC to the expiration date in the
+# vendor's flyer.
+#
+# DATE_EXPIRY is the last whitespace-separated word on the first line of the
+# flyer; DATE_EXPIRY_SEC is that date converted to seconds since the epoch.
+# Both are left empty when the first line has fewer than two words.
+#
+# $1 - Full path to the flyer file.
+#
+# Returns non-zero when no date was found or date(1) could not parse it.
+expiry() {
+  # The expiry date is the last "word" on the line in %Y-%m-%d format.
+  DATE_EXPIRY=$(head -n 1 "$1" | awk 'NF>1{print $NF}')
+
+  # Without a date, an unquoted expansion would make date(1) treat the
+  # '+%s' format as the date string; handle the empty case explicitly.
+  if [ -z "$DATE_EXPIRY" ]; then
+    DATE_EXPIRY_SEC=
+    return 1
+  fi
+
+  DATE_EXPIRY_SEC=$(date -d "$DATE_EXPIRY" '+%s')
+}
+
+scrape() {
+ local DIR_SUBSCRIBER=$1
+ local VENDOR_NAME=$2
+ local PRODUCT_NAME=$3
+ local PRODUCT_PAGE=$4
+ local POSTAL_CODE=$5
+
+ local \
+ URL="http://specials.safeway.ca/flyers/accessibility/safewaycanada-flyer"
+
+ # Write the message body and subject to these files.
+ local PATH_BODY="$DIR_SUBSCRIBER/$FILE_MESSAGE_BODY"
+ local PATH_SUBJECT="$DIR_SUBSCRIBER/$FILE_MESSAGE_SUBJECT"
+
+ local PATH_FLYER="$DIR_SUBSCRIBER/$FILE_FLYER"
+ local PATH_NOTIFIED="$DIR_SUBSCRIBER/$FILE_NOTIFIED"
+
+ # Don't download the flyer unless it has expired (or doesn't exist).
+ local download_flyer=false
+
+ # If a flyer exists, check its expiration date against the notified date.
+ if [ -s "$PATH_FLYER" ]; then
+
+ # The flyer exists, but has a notification been sent?
+ if [ -s "$PATH_NOTIFIED" ]; then
+ DATE_NOTIFIED_SEC=$(cat $PATH_NOTIFIED)
+
+ # Set DATE_EXPIRY and DATE_EXPIRY_SEC.
+ expiry $PATH_FLYER
+
+ # If the notification date happened before the expiration date, then
+ # don't send another notification.
+ if [ "$DATE_NOTIFIED_SEC" -lt "$DATE_EXPIRY_SEC" ]; then
+ return
+ fi
+
+ download_flyer=true
+ fi
+ else
+ # No flyer exists; download anew and notify, regardless of notified state.
+ download_flyer=true
+ fi
+
+ if [ "$download_flyer" == true ]; then
+ # Get the store code and address for a postal code.
+ XHTML=$(curl -s "$URL?postal_code=$POSTAL_CODE" | hxnormalize -x)
+
+ STORE_ADDRESS=$(echo $XHTML | hxselect -c "select > option:first-child" | recode html..ascii)
+ STORE_CODE=$(echo $XHTML | hxselect -i "select > option:first-child" | sed -n -e "s/^.*value=['\"]\(.*\)['\"].*/\1/p")
+
+ # Download the flyer.
+ lynx -nolist -nolog -accept_all_cookies -dump -width=$TEXT_WIDTH \
+ "$URL?store_code=$STORE_CODE" > "$PATH_FLYER"
+
+ # Ensure the notification goes out by deleting the last notified file.
+ rm -f $PATH_NOTIFIED
+
+ echo "$VENDOR_NAME, $STORE_ADDRESS" > $PATH_BODY
+ fi
+
+ # Ensure flyer's expiry date is valid.
+ expiry $PATH_FLYER
+
+ # Strip the quotation marks from the product string (i.e., don't match 'em).
+ # If the product contains spaces, replace the spaces such that they match
+ # anything in between. This will match "Turkey Breast" against the vendor
+ # text of "Turkey or Chicken Breast" (for example).
+ #
+ # Using bash internal is probably faster than sed.
+ FIND_PRODUCT=${PRODUCT_NAME/ /\.\*}
+
+ # Store the message body in a temporary file.
+ if grep -i $FIND_PRODUCT "$PATH_FLYER" > "$PATH_BODY.tmp"
+ then
+ echo "" >> "$PATH_BODY"
+ echo "$PRODUCT_NAME" >> "$PATH_BODY"
+ echo "-------------------------" >> "$PATH_BODY"
+
+ # Paste the message body after the product header and horizontal rule.
+ # Remove all leading spaces, as well, so that conversion from Markdown
+ # to HTML will work (e.g., via pandoc).
+ #
+ # Second expression fixes vendor's punctuation problem.
+ sed -e "s/^[ \t]*//" -e "s/ \./\./g" -e "s/$/\./" \
+ < $PATH_BODY.tmp >> $PATH_BODY
+
+ # Put flyer expiry date in an unambiguous, human-readable format.
+ expires=$(date -d @$DATE_EXPIRY_SEC +"$DATE_FORMAT")
+
+ echo "[$VENDOR_NAME] Sale until $expires" > $PATH_SUBJECT
+ fi
+
+ # Clean up.
+ rm -f $PATH_BODY.tmp
+}
+
database/vendors/com_thebay/scrape.sh
+#!/bin/bash
+
+scrape() {
+ local DIR_SUBSCRIBER=$1
+ local VENDOR_NAME=$2
+ local PRODUCT_NAME=$3
+ local PRODUCT_PAGE=$4
+ local POSTAL_CODE=$5
+
+ local URL="http://www.thebay.com/webapp/wcs/stores/servlet/en/thebay"
+
+ local PATH_BODY="$DIR_SUBSCRIBER/$FILE_MESSAGE_BODY"
+ local PATH_SUBJECT="$DIR_SUBSCRIBER/$FILE_MESSAGE_SUBJECT"
+
+ local PATH_NOTIFIED="$DIR_SUBSCRIBER/$FILE_NOTIFIED"
+ local PATH_PAGE="$DIR_SUBSCRIBER/page.html"
+
+ curl -s "$URL/$PRODUCT_PAGE" | hxnormalize -x 2>/dev/null > $PATH_PAGE
+
+ local PRODUCT_ID=$(cat $PATH_PAGE | hxselect "input[name='productId']" | \
+ xmlstarlet sel -t -v "input/@value")
+
+ JSON=$(hxselect -c "div[id='entitledItem_$PRODUCT_ID']" < $PATH_PAGE)
+
+ FLAG_SALE=$(echo $JSON | jq ".[0].saleFlag")
+ PRICE_CURRENT=$(echo $JSON | jq ".[0] | .offerPrice")
+ PRICE_PREVIOUS=$(echo $JSON | jq ".[0] | .listPrice")
+
+ if [ "$FLAG_SALE" == "true" ]; then
+ temp="${PRICE_CURRENT%\"}"
+ temp="${temp#\"}"
+ PRICE_CURRENT=$temp
+
+ temp="${PRICE_PREVIOUS%\"}"
+ temp="${temp#\"}"
+ PRICE_PREVIOUS=$temp
+
+ echo "[$VENDOR_NAME] $PRODUCT_NAME now $PRICE_CURRENT" > $PATH_SUBJECT
+
+ local TIME_CURRENT=$(date +%0R)
+
+ # Reformat the date to be human-friendly.
+ local TIMESTAMP_CURRENT="$(date +"$DATE_FORMAT") $TIME_CURRENT"
+
+ # Put full details in the message body.
+ cat >> "$PATH_BODY" << EOL
+
+$PRODUCT_NAME
+-------------------------
+Price is $PRICE_CURRENT (was $PRICE_PREVIOUS) as of $TIMESTAMP_CURRENT.
+
+See [store]($URL/$PRODUCT_PAGE) for details.
+EOL
+ else
+ # Send out a notification if the item goes on sale again.
+ rm -f $PATH_NOTIFIED $PATH_BODY $PATH_SUBJECT
+ fi
+}
+
database/vendors/com_thriftyfoods/scrape.sh
+#!/bin/bash
+
+# ########################################################################
+#
+# PURPOSE
+#
+# Filter data from Thrifty Foods webpage. This creates a CSV file that
+# records any price changes since previous webpage downloads.
+#
+# DEPENDENCIES
+#
+# The hashed() function must be available to create a safe filename for
+# the product name.
+#
+# ########################################################################
+
+scrape() {
+ local DIR_SUBSCRIBER=$1
+ local VENDOR_NAME=$2
+ local PRODUCT_NAME=$3
+ local PRODUCT_PAGE=$4
+ local POSTAL_CODE=$5
+
+ local URL="https://www.thriftyfoods.com/product/$PRODUCT_PAGE"
+
+ # Write the message body and subject to these files.
+ local PATH_BODY="$DIR_SUBSCRIBER/$FILE_MESSAGE_BODY"
+ local PATH_SUBJECT="$DIR_SUBSCRIBER/$FILE_MESSAGE_SUBJECT"
+
+ # Store product data inside a file hashed from the product's name.
+ local PATH_PRODUCT="$DIR_SUBSCRIBER/$(hashed $PRODUCT_NAME).csv"
+
+ # Download the product webpage.
+ # Determine the price.
+ # Extract bulk cost (per 100G).
+ # Get the dollar amount.
+ # Remove the dollar sign.
+ PRICE_CURRENT=$(lynx -nolist -nolog -accept_all_cookies -dump "$URL" | \
+ grep -A1 "Comparison price:" | \
+ tail -1 | \
+ awk '{print $1}' | \
+ tr -d '$')
+
+ PRICE_PREVIOUS=0
+
+ if [ -s "$PATH_PRODUCT" ]; then
+ # Extract the previously recorded price from the end of the file.
+ PRICE_PREVIOUS=$(tail -1 $PATH_PRODUCT | awk -F "," '{print $3}')
+ fi
+
+ # Determine whether the price has changed.
+ PRICE_CHANGED=$(echo "$PRICE_PREVIOUS != $PRICE_CURRENT" | bc)
+
+ # Only notify when a different dollar amount is found.
+ if [ "$PRICE_CHANGED" -eq "1" ]; then
+ local DATE_CURRENT=$(date +%0F)
+ local TIME_CURRENT=$(date +%0R)
+
+ # Save the new value (in CSV format) for a later comparison.
+ echo "$DATE_CURRENT,$TIME_CURRENT,$PRICE_CURRENT" >> $PATH_PRODUCT
+
+ # Reformat the date to be human-friendly.
+ TIMESTAMP_CURRENT="$(date +"$DATE_FORMAT") $TIME_CURRENT"
+
+ # Set the subject line with the final subscribed product and price.
+ echo "[$VENDOR_NAME] $PRODUCT_NAME now \$$PRICE_CURRENT" > $PATH_SUBJECT
+
+ # Put full details in the message body.
+ cat >> "$PATH_BODY" << EOL
+
+$PRODUCT_NAME
+-------------------------
+Price is \$$PRICE_CURRENT (was \$$PRICE_PREVIOUS) as of $TIMESTAMP_CURRENT.
+EOL
+ fi
+}
+
database/vendors/list.csv
+1,Thrifty Foods,com_thriftyfoods
+2,Safeway,ca_safeway
+3,Costco,ca_costco
+4,Hudson's Bay,com_thebay
+
nbactions.xml
<goal>install</goal>
</goals>
+ <properties>
+ <skipTests>true</skipTests>
+ </properties>
</action>
<action>
<properties>
<skipTests>true</skipTests>
- </properties>
- </action>
- <action>
- <actionName>run</actionName>
- <packagings>
- <packaging>jar</packaging>
- </packagings>
- <goals>
- <goal>process-classes</goal>
- <goal>org.codehaus.mojo:exec-maven-plugin:1.2.1:exec</goal>
- </goals>
- <properties>
- <exec.args>-classpath %classpath ${packageClassName}</exec.args>
- <exec.executable>java</exec.executable>
-
</properties>
</action>
+
</actions>
src/main/java/com/whitemagicsoftware/sales/Main.java
Product product = new Product.Builder()
.withName( "Sunflower Seeds" )
- .withUrlPath( "sunflower-seeds/00000_000000000000005091" )
.build();
Scraper scraper = getScraper( "com_thriftyfoods.xml" );
-
- scraper.addVariableToContext( "product", product );
+ scraper.addVariableToContext( "path", "sunflower-seeds/00000_000000000000005091" );
scraper.execute();
Variable price = scraper.getContext().getVar( "price" );
-// Variable content = scraper.getContext().getVar( "content" );
-
- System.out.println( "Content = " + price );
+ Variable sale = scraper.getContext().getVar( "sale" );
-// Jarvest jarvest = new Jarvest();
-//
-// String[] results = jarvest.exec( script( "com_thriftyfoods.rb" ),
-// "https://www.thriftyfoods.com/product/sunflower-seeds/00000_000000000000005091" );
-//
-// results = jarvest.exec( script( "com_thriftyfoods.rb" ),
-// "https://www.thriftyfoods.com/product/crisp-applesparkling/00000_000000007468212834" );
-//
-// results = jarvest.exec( script( "com_thriftyfoods.rb" ),
-// "https://www.thriftyfoods.com/product/four-cheese-manicottipasta/00000_000000005844146203" );
+ System.out.println( "Sale = " + sale );
+ System.out.println( "Price = " + price );
}
Path working = Paths.get( getWorkingDirectory( resource ) );
working = Files.createDirectories( working );
- return Files.createTempDirectory( working, DIRECTORY_PREFIX );
+ working = Files.createTempDirectory( working, DIRECTORY_PREFIX );
+
+ working.toFile().deleteOnExit();
+
+ return working;
}
src/main/resources/com_thriftyfoods.xml
<config charset="UTF-8">
- <exit condition='${!sys.isVariableDefined("product")}' message="Provide a product." />
+ <var-def name="vendor">https://www.thriftyfoods.com/product/</var-def>
- <var-def name="vendor">https://www.thriftyfoods.com/product/sunflower-seeds/00000_000000000000005091</var-def>
-
+ <var-def name="page">
+ <html-to-xml outputtype="pretty" prunetags="script">
+ <http url="${vendor}${path}" />
+ </html-to-xml>
+ </var-def>
+
<var-def name="price">
- <xpath expression="(//span[@class='price' and @itemprop='price'])[last()]">
- <html-to-xml>
- <http url="${vendor}" />
- </html-to-xml>
+ <xpath expression="(//span[@class='price' and @itemprop='price'])[last()]/text()">
+ <var name="page" />
</xpath>
</var-def>
-
- <!--
- xpath( '(//span[@class="price" and @itemprop="price"])[last()]' )
- -->
-</config>
+ <var-def name="sale">
+ <xpath expression="boolean(//div[contains(@class,'on-sale') and @itemprop='offerDetails'])">
+ <var name="page" />
+ </xpath>
+ </var-def>
+</config>
Delta 373 lines added, 44 lines removed, 329-line increase