Add word count to metadata

Author	DaveJarvis <email>
Date	2021-05-04 21:30:30 GMT-0700
Commit	cb0b2f36472f4de7281bb8bde94162ecce3a3238
Parent	f6c247c

src/main/java/com/keenwrite/preferences/PreferencesController.java

		),
		Group.of(
		+ get( KEY_DOC_PSEUDONYM ),
		+ Setting.of( label( KEY_DOC_PSEUDONYM ) ),
		+ Setting.of( title( KEY_DOC_PSEUDONYM ),
		+ stringProperty( KEY_DOC_PSEUDONYM ) )
		+ ),
		+ Group.of(
		get( KEY_DOC_KEYWORDS ),
		Setting.of( label( KEY_DOC_KEYWORDS ) ),

src/main/java/com/keenwrite/preferences/Workspace.java

		entry( KEY_DOC_TITLE, asStringProperty( "title" ) ),
		entry( KEY_DOC_AUTHOR, asStringProperty( getProperty( "user.name" ) ) ),
		+ entry( KEY_DOC_PSEUDONYM, asStringProperty( getProperty( "user.name" ) ) ),
		entry( KEY_DOC_KEYWORDS, asStringProperty( "science, nature" ) ),
		entry( KEY_DOC_COPYRIGHT, asStringProperty( getYear() ) ),

src/main/java/com/keenwrite/preferences/WorkspaceKeys.java

		public static final Key KEY_DOC_TITLE = key( KEY_DOC, "title" );
		public static final Key KEY_DOC_AUTHOR = key( KEY_DOC, "author" );
		+ public static final Key KEY_DOC_PSEUDONYM = key( KEY_DOC, "pseudonym" );
		public static final Key KEY_DOC_KEYWORDS = key( KEY_DOC, "keywords" );
		public static final Key KEY_DOC_DATE = key( KEY_DOC, "date" );

src/main/java/com/keenwrite/processors/XhtmlProcessor.java

		import com.keenwrite.preferences.Key;
		import com.keenwrite.preferences.Workspace;
		+import com.keenwrite.ui.heuristics.WordCounter;
		import javafx.beans.property.StringProperty;
		import org.jsoup.nodes.Document;

		import javax.xml.parsers.DocumentBuilderFactory;
		import javax.xml.transform.TransformerFactory;
		import javax.xml.transform.dom.DOMSource;
		import javax.xml.transform.stream.StreamResult;
		import java.io.FileNotFoundException;
		import java.nio.file.Path;
		+import java.util.Locale;
		import java.util.Map;
		import java.util.Map.Entry;
		import java.util.regex.Pattern;

		import static com.keenwrite.Bootstrap.APP_TITLE_LOWERCASE;
		import static com.keenwrite.Messages.get;
		import static com.keenwrite.events.StatusEvent.clue;
		import static com.keenwrite.io.HttpFacade.httpGet;
		import static com.keenwrite.preferences.WorkspaceKeys.*;
		+import static com.keenwrite.processors.text.TextReplacementFactory.replace;
		import static com.keenwrite.util.ProtocolScheme.getProtocol;
		import static java.lang.String.format;
		+import static java.lang.String.valueOf;
		import static java.nio.file.Files.copy;
		import static java.nio.file.StandardCopyOption.REPLACE_EXISTING;

		doc.title( getTitle() );

		- final var metadata = createMetaData();
		+ final var metadata = createMetaData( doc );
		final var head = doc.head();
		metadata.entrySet().forEach( entry -> head.append( createMeta( entry ) ) );


		/**
		- * <p>
		- * TODO: Convert the workspace fields using variables (processor context).
		- * </p>
		+ * Generates document metadata, including word count.
		*
		+ * @param doc The document containing the text to tally.
		* @return A map of metadata key/value pairs.
		*/
		- private Map<String, String> createMetaData() {
		+ private Map<String, String> createMetaData( final Document doc ) {
		return Map.of(
		"author", getAuthor(),
		+ "count", getWordCount( doc ),
		"keywords", getKeywords(),
		"copyright", getCopyright(),

		return mContext.getWorkspace();
		}
		+
		+ private Locale getLocale() { return getWorkspace().getLocale(); }

		private String getTitle() {
		- return asString( KEY_DOC_TITLE );
		+ return resolve( KEY_DOC_TITLE );
		}

		private String getAuthor() {
		- return asString( KEY_DOC_AUTHOR );
		+ return resolve( KEY_DOC_AUTHOR );
		+ }
		+
		+ private String getWordCount( final Document doc ) {
		+ final var text = doc.wholeText();
		+ final var wordCounter = WordCounter.create( getLocale() );
		+ return valueOf( wordCounter.countWords( text ) );
		}

		private String getKeywords() {
		- return asString( KEY_DOC_KEYWORDS );
		+ return resolve( KEY_DOC_KEYWORDS );
		}

		private String getCopyright() {
		- return asString( KEY_DOC_COPYRIGHT );
		+ return resolve( KEY_DOC_COPYRIGHT );
		}

		private String getDate() {
		- return asString( KEY_DOC_DATE );
		+ return resolve( KEY_DOC_DATE );
		+ }
		+
		+ private String resolve( final Key key ) {
		+ return replace( asString( key ), mContext.getResolvedMap() );
		}

		private String asString( final Key key ) {
		return stringProperty( key ).get();
		}
		-

		private StringProperty stringProperty( final Key key ) {

src/main/java/com/keenwrite/ui/heuristics/DocumentStatistics.java

		import com.keenwrite.preview.HtmlPanel;
		import com.keenwrite.util.MurmurHash;
		-import com.whitemagicsoftware.wordcount.Tokenizer;
		import com.whitemagicsoftware.wordcount.TokenizerException;
		-import com.whitemagicsoftware.wordcount.TokenizerFactory;
		import javafx.beans.property.IntegerProperty;
		import javafx.beans.property.SimpleIntegerProperty;

		import org.greenrobot.eventbus.Subscribe;
		import org.jsoup.Jsoup;
		-
		-import java.util.Locale;

		import static com.keenwrite.events.Bus.register;
		import static com.keenwrite.events.StatusEvent.clue;
		import static com.keenwrite.events.WordCountEvent.fireWordCountEvent;
		import static com.keenwrite.preferences.WorkspaceKeys.KEY_LANGUAGE_LOCALE;
		import static com.keenwrite.preferences.WorkspaceKeys.KEY_UI_FONT_EDITOR_NAME;
		+import static com.keenwrite.ui.heuristics.DocumentStatistics.StatEntry;
		import static java.lang.String.format;
		-import static java.util.Locale.ENGLISH;
		import static javafx.application.Platform.runLater;
		import static javafx.collections.FXCollections.observableArrayList;

		/**
		* Responsible for displaying document statistics, such as word count and
		* word frequency.
		*/
		-public final class DocumentStatistics
		- extends TableView<DocumentStatistics.StatEntry> {
		- /**
		- * Parses documents into word counts.
		- */
		- private static Tokenizer sTokenizer = createTokenizer( ENGLISH );
		+public final class DocumentStatistics extends TableView<StatEntry> {

		+ private WordCounter mWordCounter;
		private final ObservableList<StatEntry> mItems = observableArrayList();

		/**
		* Creates a new observer of document change events that will gather and
		* display document statistics (e.g., word counts).
		*
		* @param workspace Settings used to configure the statistics engine.
		*/
		public DocumentStatistics( final Workspace workspace ) {
		+ mWordCounter = WordCounter.create( workspace.getLocale() );
		+
		final var sortedItems = new SortedList<>( mItems );
		sortedItems.comparatorProperty().bind( comparatorProperty() );

		public void handle( final DocumentChangedEvent event ) {
		try {
		- final var tokens = sTokenizer.tokenize( event.getDocument() );
		- final var sum = new int[]{0};
		-
		runLater( () -> {
		mItems.clear();
		- tokens.forEach( ( k, v ) -> {
		- final var count = v[ 0 ];
		- if( count > 2 ) {
		- mItems.add( new StatEntry( k, count ) );
		+ final var document = event.getDocument();
		+ final var wordCount = mWordCounter.countWords(
		+ document, ( k, count ) -> {
		+ // Generate statistics for words that occur thrice or more.
		+ if( count > 2 ) {
		+ mItems.add( new StatEntry( k, count ) );
		+ }
		}
		- sum[ 0 ] += count;
		- } );
		+ );

		- fireWordCountEvent( sum[ 0 ] );
		+ fireWordCountEvent( wordCount );
		} );
		-
		} catch( final TokenizerException ex ) {
		clue( ex );

		final var property = workspace.localeProperty( KEY_LANGUAGE_LOCALE );
		property.addListener(
		- ( c, o, n ) -> sTokenizer = createTokenizer( property.toLocale() )
		+ ( c, o, n ) -> mWordCounter = WordCounter.create( property.toLocale() )
		);
		- }
		-
		- /**
		- * Creates a tokenizer for English text (can handle most Latin languages).
		- *
		- * @return An English-based tokenizer for counting words.
		- */
		- private static Tokenizer createTokenizer( final Locale language ) {
		- return TokenizerFactory.create( language );
		}

src/main/java/com/keenwrite/ui/heuristics/WordCounter.java

		+/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
		+package com.keenwrite.ui.heuristics;
		+
		+import com.whitemagicsoftware.wordcount.Tokenizer;
		+import com.whitemagicsoftware.wordcount.TokenizerFactory;
		+
		+import java.util.Locale;
		+import java.util.function.BiConsumer;
		+
		+/**
		+ * Responsible for counting unique words as well as total words in a document.
		+ */
		+public class WordCounter {
		+ /**
		+ * Parses documents into word counts.
		+ */
		+ private final Tokenizer mTokenizer;
		+
		+ /**
		+ * Constructs a new {@link WordCounter} instance using the given tokenizer.
		+ *
		+ * @param tokenizer The class responsible for parsing a document into unique
		+ * and total word counts.
		+ */
		+ private WordCounter( final Tokenizer tokenizer ) {
		+ mTokenizer = tokenizer;
		+ }
		+
		+ /**
		+ * Counts the total number of words in the document, ignoring per-word tallies.
		+ *
		+ * @param document The document to tally.
		+ * @return The total number of words in the document.
		+ */
		+ public int countWords( final String document ) {
		+ return countWords( document, ( k, count ) -> {} );
		+ }
		+
		+ /**
		+ * Counts the total number of words in the document, reporting each tally.
		+ *
		+ * @param document The document to tally.
		+ * @param consumer The action to take for each unique word/count pair.
		+ * @return The total number of words in the document.
		+ */
		+ public int countWords(
		+ final String document, final BiConsumer<String, Integer> consumer ) {
		+ final var tokens = mTokenizer.tokenize( document );
		+ final var sum = new int[]{0};
		+
		+ tokens.forEach( ( k, v ) -> {
		+ final var count = v[ 0 ];
		+ consumer.accept( k, count );
		+ sum[ 0 ] += count;
		+ } );
		+
		+ return sum[ 0 ];
		+ }
		+
		+ /**
		+ * Constructs a new {@link WordCounter} capable of tokenizing a document
		+ * into words using the given {@link Locale}.
		+ *
		+ * @param locale The {@link Tokenizer}'s language settings.
		+ */
		+ public static WordCounter create( final Locale locale ) {
		+ return new WordCounter( createTokenizer( locale ) );
		+ }
		+
		+ /**
		+ * Creates a tokenizer for text written in the given language.
		+ *
		+ * @return A tokenizer for counting words, configured by the given language.
		+ */
		+ private static Tokenizer createTokenizer( final Locale language ) {
		+ return TokenizerFactory.create( language );
		+ }
		+}

src/main/resources/com/keenwrite/messages.properties

		workspace.document=Document
		workspace.document.author=Author Name
		-workspace.document.author.desc=Full name of primary author.
		+workspace.document.author.desc=Full name of primary author, or variable reference (e.g., '{{'book.author'}}').
		workspace.document.author.title=Author
		+workspace.document.pseudonym=Pseudonym
		+workspace.document.pseudonym.desc=Full name of author pseudonym, or variable reference.
		+workspace.document.pseudonym.title=Pseudonym
		workspace.document.title=Title Name
		-workspace.document.title.desc=Full document title.
		+workspace.document.title.desc=Full document title, or variable reference (e.g., '{{'book.title'}}').
		workspace.document.title.title=Title
		workspace.document.keywords=Keywords
		-workspace.document.keywords.desc=Comma-separated words relating to subject matter.
		+workspace.document.keywords.desc=Comma-separated words relating to subject matter, or variable reference.
		workspace.document.keywords.title=Words
		workspace.document.copyright=Copyright
		-workspace.document.copyright.desc=Continuous years of publication.
		+workspace.document.copyright.desc=Continuous years of publication, or variable reference.
		workspace.document.copyright.title=Year(s)
		workspace.document.date=Publish Date
		-workspace.document.date.desc=Date and time document was published.
		+workspace.document.date.desc=Date and time document was published, or variable reference.
		workspace.document.Date.title=Timestamp

Delta	135 lines added, 48 lines removed, 87-line increase