| | import java.io.IOException; |
| | import java.io.InputStream; |
| | -import java.nio.file.Files; |
| | -import java.nio.file.Path; |
| | -import java.nio.file.Paths; |
| | import java.util.List; |
| | import javax.mail.internet.AddressException; |
| | -import static org.apache.commons.io.FilenameUtils.removeExtension; |
| | +import org.apache.log4j.BasicConfigurator; |
| | import org.webharvest.definition.ScraperConfiguration; |
| | import org.webharvest.runtime.Scraper; |
 |
| | |
| | private void run() throws Exception { |
| | - process(getSubscribers()); |
| | + process( getSubscribers() ); |
| | } |
| | |
| | private void process( |
| | - List<Subscriber> subscribers) { |
| | - for (Subscriber subscriber : subscribers) { |
| | + List<Subscriber> subscribers ) { |
| | + for( Subscriber subscriber : subscribers ) { |
| | try { |
| | - String message = process(subscriber, getVendors(subscriber)); |
| | - notify(subscriber, message); |
| | - } catch (Exception e) { |
| | - notify(e); |
| | + String message = process( subscriber, getVendors( subscriber ) ); |
| | + notify( subscriber, message ); |
| | + } catch( Exception e ) { |
| | + notify( e ); |
| | } |
| | } |
| | } |
| | |
| | - private String process(Subscriber subscriber, List<Vendor> vendors) |
| | + private String process( Subscriber subscriber, List<Vendor> vendors ) |
| | throws Exception { |
| | StringBuilder result = new StringBuilder(); |
| | |
| | - for (Vendor vendor : vendors) { |
| | - String s = process(subscriber, vendor, getProducts(subscriber, vendor)); |
| | - result.append(s); |
| | + for( Vendor vendor : vendors ) { |
| | + String s = process( subscriber, vendor, getProducts( subscriber, vendor ) ); |
| | + result.append( s ); |
| | } |
| | |
| | return result.toString(); |
| | } |
| | |
| | private String process( |
| | Subscriber subscriber, |
| | Vendor vendor, |
| | - List<Product> products) throws Exception { |
| | + List<Product> products ) throws Exception { |
| | StringBuilder result = new StringBuilder(); |
| | |
| | // Include the vendor name on the first products loop iteration. |
| | boolean includeVendorName = true; |
| | |
| | - for (Product product : products) { |
| | - Scraper scraper = getScraper(vendor); |
| | + for( Product product : products ) { |
| | + Scraper scraper = getScraper( vendor ); |
| | |
| | - scraper.addVariableToContext("includeVendorName", includeVendorName); |
| | - scraper.addVariableToContext("locationCode", subscriber.getLocationCode()); |
| | - scraper.addVariableToContext("vendorName", vendor.getName()); |
| | - scraper.addVariableToContext("productName", product.getName()); |
| | - scraper.addVariableToContext("productPath", product.getUrlPath()); |
| | + scraper.addVariableToContext( "includeVendorName", includeVendorName ); |
| | + scraper.addVariableToContext( "locationCode", subscriber.getLocationCode() ); |
| | + scraper.addVariableToContext( "vendorName", vendor.getName() ); |
| | + scraper.addVariableToContext( "productName", product.getName() ); |
| | + scraper.addVariableToContext( "productPath", product.getUrlPath() ); |
| | scraper.execute(); |
| | |
| | - Variable message = scraper.getContext().getVar("message"); |
| | - result.append(message.toString()); |
| | + Variable message = scraper.getContext().getVar( "message" ); |
| | + result.append( message.toString() ); |
| | |
| | // Presumably, the template added the vendor's name to delineate its |
 |
| | * @param subscriber The person to notify. |
| | * @param message The text to send; nothing is sent when it is null or blank. |
| | + * |
| | * @throws Exception |
| | */ |
| | - private void notify(Subscriber subscriber, String message) throws Exception { |
| | - if (!empty(message)) { |
| | - getNotifyService().notify(subscriber.getAddress(), "Subject", message); |
| | + private void notify( Subscriber subscriber, String message ) throws Exception { |
| | + if( !empty( message ) ) { |
| | + getNotifyService().notify( subscriber.getAddress(), "Subject", message ); |
| | } |
| | } |
| | |
| | - private void notify(Exception e) { |
| | - getNotifyService().notify(e); |
| | + private void notify( Exception e ) { |
| | + getNotifyService().notify( e ); |
| | } |
| | |
| | /** |
| | * Answers whether the string is null or empty or contains only whitespace. |
| | * |
| | * @param s The string to validate. |
| | + * |
| | * @return true if the string is null, empty, or only whitespace; false otherwise. |
| | */ |
| | - private boolean empty(String s) { |
| | + private boolean empty( String s ) { |
| | return s == null || s.trim().length() == 0; |
| | } |
 |
| | * @return A Scraper that can extract data. |
| | */ |
| | - private Scraper getScraper(Vendor vendor) throws IOException { |
| | - String resource = vendor.getScriptName(); |
| | - final ScraperConfiguration config = getScraperConfiguration(resource); |
| | - return new Scraper(config, createTemporaryDirectory(resource).toString()); |
| | + private Scraper getScraper( Vendor vendor ) throws IOException { |
| | + String resource = "scripts/" + vendor.getScriptName(); |
| | + final ScraperConfiguration config = getScraperConfiguration( resource ); |
| | + return new Scraper( config, getWorkingDirectory() ); |
| | } |
| | |
| | - private ScraperConfiguration getScraperConfiguration(String resource) { |
| | - final InputSource in = getResource(resource); |
| | - return new ScraperConfiguration(in); |
| | + /** |
| | + * Root directory containing scripts/ and templates/ directories. |
| | + * |
| | + * @return A non-null String to the resources' root directory. |
| | + */ |
| | + private String getWorkingDirectory() { |
| | + return new File( "src/main/resources" ).toString(); |
| | } |
| | |
| | - private InputSource getResource(String resource) { |
| | - final InputStream in = getResourceAsStream(resource); |
| | - return new InputSource(in); |
| | + private ScraperConfiguration getScraperConfiguration( String resource ) { |
| | + final InputSource in = getResource( resource ); |
| | + return new ScraperConfiguration( in ); |
| | } |
| | |
| | - private InputStream getResourceAsStream(String resource) { |
| | + private InputSource getResource( String resource ) { |
| | + final InputStream in = getResourceAsStream( resource ); |
| | + return new InputSource( in ); |
| | + } |
| | + |
| | + private InputStream getResourceAsStream( String resource ) { |
| | final InputStream in |
| | - = getContextClassLoader().getResourceAsStream(resource); |
| | + = getContextClassLoader().getResourceAsStream( resource ); |
| | |
| | - return in == null ? getClass().getResourceAsStream(resource) : in; |
| | + return in == null ? getClass().getResourceAsStream( resource ) : in; |
| | } |
| | |
| | private ClassLoader getContextClassLoader() { |
| | return Thread.currentThread().getContextClassLoader(); |
| | - } |
| | - |
| | - /** |
| | - * Creates a fully qualified path for web pages downloaded by the scraper. |
| | - * |
| | - * @param resource The site (with filename extension) to scrap. |
| | - * |
| | - * @return A path to the directory that was created (e.g., |
| | - * $HOME/.../sales/working/website/scrape4443161710900...). |
| | - * |
| | - * @throws IOException |
| | - */ |
| | - private File createTemporaryDirectory(String resource) throws IOException { |
| | - Path working = Paths.get(getWorkingDirectory(resource)); |
| | - working = Files.createDirectories(working); |
| | - working = Files.createTempDirectory(working, DIRECTORY_PREFIX); |
| | - |
| | - File file = working.toFile(); |
| | - file.deleteOnExit(); |
| | - |
| | - return file; |
| | - } |
| | - |
| | - private String getWorkingDirectory(String resource) { |
| | - return System.getProperty("user.dir") |
| | - + File.separator + "working" |
| | - + File.separator + removeExtension(resource); |
| | } |
| | |
 |
| | } |
| | |
| | - private List<Vendor> getVendors(Subscriber subscriber) { |
| | - return getVendorService().list(subscriber); |
| | + private List<Vendor> getVendors( Subscriber subscriber ) { |
| | + return getVendorService().list( subscriber ); |
| | } |
| | |
| | - private List<Product> getProducts(Subscriber subscriber, Vendor vendor) { |
| | - return getProductService().list(subscriber, vendor); |
| | + private List<Product> getProducts( Subscriber subscriber, Vendor vendor ) { |
| | + return getProductService().list( subscriber, vendor ); |
| | } |
| | |
| | - public static void main(String args[]) throws Exception { |
| | + public static void main( String args[] ) throws Exception { |
| | new Main().run(); |
| | } |
| | } |
| | - |
| | -/* |
| | - |
| | - # Filename to the message subject and body to send to the subscriber. |
| | - FILE_MESSAGE_SUBJECT="subject.txt" |
| | - FILE_MESSAGE_BODY="body.txt" |
| | - |
| | - FILE_FLYER="flyer.txt" |
| | - FILE_NOTIFIED="notified.txt" |
| | - |
| | - # Significantly reduce line wrap (to help with text parsing). The width |
| | - # is required because line items can be longer than 80 characters. Without |
| | - # the width, long item descriptions, which contain the item cost, might |
| | - # not parse correctly from some vendors. |
| | - TEXT_WIDTH=8192 |
| | - |
| | - # Human-readable date format. |
| | - DATE_FORMAT="%A, %b %d" |
| | - |
| | - # Generate a hash value for the $1 parameter. |
| | - hashed() { |
| | - echo "$1" | sha256sum | awk {'print $1'} |
| | - } |
| | - |
| | - # Determine whether products are on sale for each subscriber. |
| | - while IFS=',' read email postal_code store product_name product_page |
| | - do |
| | - # Ignore blank lines. |
| | - if [ ! "$email" ]; then continue; fi |
| | - |
| | - # Extract the vendor details for the subscriber's desired product. |
| | - line=$(head -$store $DIR/stores.csv | tail -1) |
| | - vendor_name=$(echo $line | awk -F, '{print $2}') |
| | - |
| | - # Drop the scrape function so it can be reloaded (fails silently). |
| | - unset -f scrape |
| | - |
| | - # Load the scraper for the subscriber's vendor. |
| | - source $DIR/vendors/$(echo $line | awk -F, '{print $3}')/scrape.sh |
| | - |
| | - # Hash the email for a safe directory name. |
| | - dir_subscriber="$DIR_SUBSCRIBERS/$(hashed $email)" |
| | - |
| | - # Create a place to store this subscriber's information (fail silent). |
| | - mkdir -p "$dir_subscriber" |
| | - |
| | - # Remove the quotes around the product name and product page. |
| | - temp="${product_name%\"}" |
| | - temp="${temp#\"}" |
| | - product_name=$temp |
| | - |
| | - temp="${product_page%\"}" |
| | - temp="${temp#\"}" |
| | - product_page=$temp |
| | - |
| | - # Extract product information for subscriber notifications. |
| | - # Builds the message subject and body for notifications. |
| | - # Will delete the notification file if it is time to send a new message. |
| | - scrape "$dir_subscriber" "$vendor_name" "$product_name" "$product_page" \ |
| | - "$postal_code" |
| | - done < "$DIR_SUBSCRIBERS/list.csv" |
| | - |
| | - # Load function to notify a subscriber. |
| | - source "$DIR/notify.sh" |
| | - |
| | - # Notification happens after sale determination because subscribers can |
| | - # subscribe to notifications for multiple product items. Once all the |
| | - # on sale products have been extracted, the message content can be sent. |
| | - while IFS=',' read email postal_code store product_name product_page |
| | - do |
| | - # Ignore blank lines. |
| | - if [ ! "$email" ]; then continue; fi |
| | - |
| | - dir_subscriber="$DIR_SUBSCRIBERS/$(hashed $email)" |
| | - path_notified="$dir_subscriber/$FILE_NOTIFIED" |
| | - |
| | - # If the notified file doesn't exist, send a notification. |
| | - if [ ! -f "$path_notified" ]; then |
| | - # Full path to the message subject and body. |
| | - path_subject="$dir_subscriber/$FILE_MESSAGE_SUBJECT" |
| | - path_body="$dir_subscriber/$FILE_MESSAGE_BODY" |
| | - |
| | - # Only send the notification if both a message body and subject exist. |
| | - # This allows for writing a body prefix (e.g., store address) to the |
| | - # message body without sending a message if no subscriber products are |
| | - # on sale. |
| | - if [ -s "$path_subject" -a -s "$path_body" ]; then |
| | - # Notify the subscriber. |
| | - notify $DIR $email $path_subject $path_body |
| | - |
| | - # Allow comparing flyer expiry date against last notified date. |
| | - date +%s > "$path_notified" |
| | - fi |
| | - fi |
| | - done < "$DIR_SUBSCRIBERS/list.csv" |
| | - */ |
| | |