Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenwrite.git

Add word count to metadata

Author: Dave Jarvis <email>
Date: 2021-05-04 21:30:30 GMT-0700
Commit: cb0b2f36472f4de7281bb8bde94162ecce3a3238
Parent: f6c247c
src/main/java/com/keenwrite/preferences/PreferencesController.java
),
Group.of(
+ get( KEY_DOC_PSEUDONYM ),
+ Setting.of( label( KEY_DOC_PSEUDONYM ) ),
+ Setting.of( title( KEY_DOC_PSEUDONYM ),
+ stringProperty( KEY_DOC_PSEUDONYM ) )
+ ),
+ Group.of(
get( KEY_DOC_KEYWORDS ),
Setting.of( label( KEY_DOC_KEYWORDS ) ),
src/main/java/com/keenwrite/preferences/Workspace.java
entry( KEY_DOC_TITLE, asStringProperty( "title" ) ),
entry( KEY_DOC_AUTHOR, asStringProperty( getProperty( "user.name" ) ) ),
+ entry( KEY_DOC_PSEUDONYM, asStringProperty( getProperty( "user.name" ) ) ),
entry( KEY_DOC_KEYWORDS, asStringProperty( "science, nature" ) ),
entry( KEY_DOC_COPYRIGHT, asStringProperty( getYear() ) ),
src/main/java/com/keenwrite/preferences/WorkspaceKeys.java
public static final Key KEY_DOC_TITLE = key( KEY_DOC, "title" );
public static final Key KEY_DOC_AUTHOR = key( KEY_DOC, "author" );
+ public static final Key KEY_DOC_PSEUDONYM = key( KEY_DOC, "pseudonym" );
public static final Key KEY_DOC_KEYWORDS = key( KEY_DOC, "keywords" );
public static final Key KEY_DOC_DATE = key( KEY_DOC, "date" );
src/main/java/com/keenwrite/processors/XhtmlProcessor.java
import com.keenwrite.preferences.Key;
import com.keenwrite.preferences.Workspace;
+import com.keenwrite.ui.heuristics.WordCounter;
import javafx.beans.property.StringProperty;
import org.jsoup.nodes.Document;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.FileNotFoundException;
import java.nio.file.Path;
+import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import static com.keenwrite.Bootstrap.APP_TITLE_LOWERCASE;
import static com.keenwrite.Messages.get;
import static com.keenwrite.events.StatusEvent.clue;
import static com.keenwrite.io.HttpFacade.httpGet;
import static com.keenwrite.preferences.WorkspaceKeys.*;
+import static com.keenwrite.processors.text.TextReplacementFactory.replace;
import static com.keenwrite.util.ProtocolScheme.getProtocol;
import static java.lang.String.format;
+import static java.lang.String.valueOf;
import static java.nio.file.Files.copy;
import static java.nio.file.StandardCopyOption.REPLACE_EXISTING;
doc.title( getTitle() );
- final var metadata = createMetaData();
+ final var metadata = createMetaData( doc );
final var head = doc.head();
metadata.entrySet().forEach( entry -> head.append( createMeta( entry ) ) );
/**
- * <p>
- * TODO: Convert the workspace fields using variables (processor context).
- * </p>
+ * Generates document metadata, including word count.
*
+ * @param doc The document containing the text to tally.
* @return A map of metadata key/value pairs.
*/
- private Map<String, String> createMetaData() {
+ private Map<String, String> createMetaData( final Document doc ) {
return Map.of(
"author", getAuthor(),
+ "count", getWordCount( doc ),
"keywords", getKeywords(),
"copyright", getCopyright(),
return mContext.getWorkspace();
}
+
+ private Locale getLocale() { return getWorkspace().getLocale(); }
private String getTitle() {
- return asString( KEY_DOC_TITLE );
+ return resolve( KEY_DOC_TITLE );
}
private String getAuthor() {
- return asString( KEY_DOC_AUTHOR );
+ return resolve( KEY_DOC_AUTHOR );
+ }
+
+ private String getWordCount( final Document doc ) {
+ final var text = doc.wholeText();
+ final var wordCounter = WordCounter.create( getLocale() );
+ return valueOf( wordCounter.countWords( text ) );
}
private String getKeywords() {
- return asString( KEY_DOC_KEYWORDS );
+ return resolve( KEY_DOC_KEYWORDS );
}
private String getCopyright() {
- return asString( KEY_DOC_COPYRIGHT );
+ return resolve( KEY_DOC_COPYRIGHT );
}
private String getDate() {
- return asString( KEY_DOC_DATE );
+ return resolve( KEY_DOC_DATE );
+ }
+
+ private String resolve( final Key key ) {
+ return replace( asString( key ), mContext.getResolvedMap() );
}
private String asString( final Key key ) {
return stringProperty( key ).get();
}
-
private StringProperty stringProperty( final Key key ) {
src/main/java/com/keenwrite/ui/heuristics/DocumentStatistics.java
import com.keenwrite.preview.HtmlPanel;
import com.keenwrite.util.MurmurHash;
-import com.whitemagicsoftware.wordcount.Tokenizer;
import com.whitemagicsoftware.wordcount.TokenizerException;
-import com.whitemagicsoftware.wordcount.TokenizerFactory;
import javafx.beans.property.IntegerProperty;
import javafx.beans.property.SimpleIntegerProperty;
import org.greenrobot.eventbus.Subscribe;
import org.jsoup.Jsoup;
-
-import java.util.Locale;
import static com.keenwrite.events.Bus.register;
import static com.keenwrite.events.StatusEvent.clue;
import static com.keenwrite.events.WordCountEvent.fireWordCountEvent;
import static com.keenwrite.preferences.WorkspaceKeys.KEY_LANGUAGE_LOCALE;
import static com.keenwrite.preferences.WorkspaceKeys.KEY_UI_FONT_EDITOR_NAME;
+import static com.keenwrite.ui.heuristics.DocumentStatistics.StatEntry;
import static java.lang.String.format;
-import static java.util.Locale.ENGLISH;
import static javafx.application.Platform.runLater;
import static javafx.collections.FXCollections.observableArrayList;
/**
* Responsible for displaying document statistics, such as word count and
* word frequency.
*/
-public final class DocumentStatistics
- extends TableView<DocumentStatistics.StatEntry> {
- /**
- * Parses documents into word counts.
- */
- private static Tokenizer sTokenizer = createTokenizer( ENGLISH );
+public final class DocumentStatistics extends TableView<StatEntry> {
+ private WordCounter mWordCounter;
private final ObservableList<StatEntry> mItems = observableArrayList();
/**
* Creates a new observer of document change events that will gather and
* display document statistics (e.g., word counts).
*
* @param workspace Settings used to configure the statistics engine.
*/
public DocumentStatistics( final Workspace workspace ) {
+ mWordCounter = WordCounter.create( workspace.getLocale() );
+
final var sortedItems = new SortedList<>( mItems );
sortedItems.comparatorProperty().bind( comparatorProperty() );
public void handle( final DocumentChangedEvent event ) {
try {
- final var tokens = sTokenizer.tokenize( event.getDocument() );
- final var sum = new int[]{0};
-
runLater( () -> {
mItems.clear();
- tokens.forEach( ( k, v ) -> {
- final var count = v[ 0 ];
- if( count > 2 ) {
- mItems.add( new StatEntry( k, count ) );
+ final var document = event.getDocument();
+ final var wordCount = mWordCounter.countWords(
+ document, ( k, count ) -> {
+ // Generate statistics for words that occur thrice or more.
+ if( count > 2 ) {
+ mItems.add( new StatEntry( k, count ) );
+ }
}
- sum[ 0 ] += count;
- } );
+ );
- fireWordCountEvent( sum[ 0 ] );
+ fireWordCountEvent( wordCount );
} );
-
} catch( final TokenizerException ex ) {
clue( ex );
final var property = workspace.localeProperty( KEY_LANGUAGE_LOCALE );
property.addListener(
- ( c, o, n ) -> sTokenizer = createTokenizer( property.toLocale() )
+ ( c, o, n ) -> mWordCounter = WordCounter.create( property.toLocale() )
);
- }
-
- /**
- * Creates a tokenizer for English text (can handle most Latin languages).
- *
- * @return An English-based tokenizer for counting words.
- */
- private static Tokenizer createTokenizer( final Locale language ) {
- return TokenizerFactory.create( language );
}
src/main/java/com/keenwrite/ui/heuristics/WordCounter.java
+/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.keenwrite.ui.heuristics;
+
+import com.whitemagicsoftware.wordcount.Tokenizer;
+import com.whitemagicsoftware.wordcount.TokenizerFactory;
+
+import java.util.Locale;
+import java.util.function.BiConsumer;
+
+/**
+ * Responsible for counting unique words as well as total words in a document.
+ */
+public class WordCounter {
+ /**
+ * Parses documents into word counts.
+ */
+ private final Tokenizer mTokenizer;
+
+ /**
+ * Constructs a new {@link WordCounter} instance using the given tokenizer.
+ *
+ * @param tokenizer The class responsible for parsing a document into unique
+ * and total word counts.
+ */
+ private WordCounter( final Tokenizer tokenizer ) {
+ mTokenizer = tokenizer;
+ }
+
+ /**
+   * Counts the total number of words in the document.
+ *
+ * @param document The document to tally.
+ * @return The total number of words in the document.
+ */
+ public int countWords( final String document ) {
+ return countWords( document, ( k, count ) -> {} );
+ }
+
+ /**
+   * Counts all words in the document, reporting each unique word's tally.
+ *
+ * @param document The document to tally.
+ * @param consumer The action to take for each unique word/count pair.
+ * @return The total number of words in the document.
+ */
+ public int countWords(
+ final String document, final BiConsumer<String, Integer> consumer ) {
+ final var tokens = mTokenizer.tokenize( document );
+ final var sum = new int[]{0};
+
+ tokens.forEach( ( k, v ) -> {
+ final var count = v[ 0 ];
+ consumer.accept( k, count );
+ sum[ 0 ] += count;
+ } );
+
+ return sum[ 0 ];
+ }
+
+ /**
+ * Constructs a new {@link WordCounter} capable of tokenizing a document
+ * into words using the given {@link Locale}.
+ *
+ * @param locale The {@link Tokenizer}'s language settings.
+ */
+ public static WordCounter create( final Locale locale ) {
+ return new WordCounter( createTokenizer( locale ) );
+ }
+
+ /**
+   * Creates a tokenizer for counting words in the given language.
+   *
+   * @return A tokenizer configured for the given language.
+ */
+ private static Tokenizer createTokenizer( final Locale language ) {
+ return TokenizerFactory.create( language );
+ }
+}
src/main/resources/com/keenwrite/messages.properties
workspace.document=Document
workspace.document.author=Author Name
-workspace.document.author.desc=Full name of primary author.
+workspace.document.author.desc=Full name of primary author, or variable reference (e.g., '{{'book.author'}}').
workspace.document.author.title=Author
+workspace.document.pseudonym=Pseudonym
+workspace.document.pseudonym.desc=Full name of author pseudonym, or variable reference.
+workspace.document.pseudonym.title=Pseudonym
workspace.document.title=Title Name
-workspace.document.title.desc=Full document title.
+workspace.document.title.desc=Full document title, or variable reference (e.g., '{{'book.title'}}').
workspace.document.title.title=Title
workspace.document.keywords=Keywords
-workspace.document.keywords.desc=Comma-separated words relating to subject matter.
+workspace.document.keywords.desc=Comma-separated words relating to subject matter, or variable reference.
workspace.document.keywords.title=Words
workspace.document.copyright=Copyright
-workspace.document.copyright.desc=Continuous years of publication.
+workspace.document.copyright.desc=Continuous years of publication, or variable reference.
workspace.document.copyright.title=Year(s)
workspace.document.date=Publish Date
-workspace.document.date.desc=Date and time document was published.
+workspace.document.date.desc=Date and time document was published, or variable reference.
workspace.document.Date.title=Timestamp
Delta: 135 lines added, 48 lines removed, 87-line increase