| Author | DaveJarvis <email> |
|---|---|
| Date | 2021-05-04 21:30:30 GMT-0700 |
| Commit | cb0b2f36472f4de7281bb8bde94162ecce3a3238 |
| Parent | f6c247c |
| ), | ||
| Group.of( | ||
| + get( KEY_DOC_PSEUDONYM ), | ||
| + Setting.of( label( KEY_DOC_PSEUDONYM ) ), | ||
| + Setting.of( title( KEY_DOC_PSEUDONYM ), | ||
| + stringProperty( KEY_DOC_PSEUDONYM ) ) | ||
| + ), | ||
| + Group.of( | ||
| get( KEY_DOC_KEYWORDS ), | ||
| Setting.of( label( KEY_DOC_KEYWORDS ) ), |
| entry( KEY_DOC_TITLE, asStringProperty( "title" ) ), | ||
| entry( KEY_DOC_AUTHOR, asStringProperty( getProperty( "user.name" ) ) ), | ||
| + entry( KEY_DOC_PSEUDONYM, asStringProperty( getProperty( "user.name" ) ) ), | ||
| entry( KEY_DOC_KEYWORDS, asStringProperty( "science, nature" ) ), | ||
| entry( KEY_DOC_COPYRIGHT, asStringProperty( getYear() ) ), |
| public static final Key KEY_DOC_TITLE = key( KEY_DOC, "title" ); | ||
| public static final Key KEY_DOC_AUTHOR = key( KEY_DOC, "author" ); | ||
| + public static final Key KEY_DOC_PSEUDONYM = key( KEY_DOC, "pseudonym" ); | ||
| public static final Key KEY_DOC_KEYWORDS = key( KEY_DOC, "keywords" ); | ||
| public static final Key KEY_DOC_DATE = key( KEY_DOC, "date" ); |
| import com.keenwrite.preferences.Key; | ||
| import com.keenwrite.preferences.Workspace; | ||
| +import com.keenwrite.ui.heuristics.WordCounter; | ||
| import javafx.beans.property.StringProperty; | ||
| import org.jsoup.nodes.Document; | ||
| import javax.xml.parsers.DocumentBuilderFactory; | ||
| import javax.xml.transform.TransformerFactory; | ||
| import javax.xml.transform.dom.DOMSource; | ||
| import javax.xml.transform.stream.StreamResult; | ||
| import java.io.FileNotFoundException; | ||
| import java.nio.file.Path; | ||
| +import java.util.Locale; | ||
| import java.util.Map; | ||
| import java.util.Map.Entry; | ||
| import java.util.regex.Pattern; | ||
| import static com.keenwrite.Bootstrap.APP_TITLE_LOWERCASE; | ||
| import static com.keenwrite.Messages.get; | ||
| import static com.keenwrite.events.StatusEvent.clue; | ||
| import static com.keenwrite.io.HttpFacade.httpGet; | ||
| import static com.keenwrite.preferences.WorkspaceKeys.*; | ||
| +import static com.keenwrite.processors.text.TextReplacementFactory.replace; | ||
| import static com.keenwrite.util.ProtocolScheme.getProtocol; | ||
| import static java.lang.String.format; | ||
| +import static java.lang.String.valueOf; | ||
| import static java.nio.file.Files.copy; | ||
| import static java.nio.file.StandardCopyOption.REPLACE_EXISTING; | ||
| doc.title( getTitle() ); | ||
| - final var metadata = createMetaData(); | ||
| + final var metadata = createMetaData( doc ); | ||
| final var head = doc.head(); | ||
| metadata.entrySet().forEach( entry -> head.append( createMeta( entry ) ) ); | ||
| /** | ||
| - * <p> | ||
| - * TODO: Convert the workspace fields using variables (processor context). | ||
| - * </p> | ||
| + * Generates document metadata, including word count. | ||
| * | ||
| + * @param doc The document containing the text to tally. | ||
| * @return A map of metadata key/value pairs. | ||
| */ | ||
| - private Map<String, String> createMetaData() { | ||
| + private Map<String, String> createMetaData( final Document doc ) { | ||
| return Map.of( | ||
| "author", getAuthor(), | ||
| + "count", getWordCount( doc ), | ||
| "keywords", getKeywords(), | ||
| "copyright", getCopyright(), | ||
| return mContext.getWorkspace(); | ||
| } | ||
| + | ||
| + private Locale getLocale() { return getWorkspace().getLocale(); } | ||
| private String getTitle() { | ||
| - return asString( KEY_DOC_TITLE ); | ||
| + return resolve( KEY_DOC_TITLE ); | ||
| } | ||
| private String getAuthor() { | ||
| - return asString( KEY_DOC_AUTHOR ); | ||
| + return resolve( KEY_DOC_AUTHOR ); | ||
| + } | ||
| + | ||
| + private String getWordCount( final Document doc ) { | ||
| + final var text = doc.wholeText(); | ||
| + final var wordCounter = WordCounter.create( getLocale() ); | ||
| + return valueOf( wordCounter.countWords( text ) ); | ||
| } | ||
| private String getKeywords() { | ||
| - return asString( KEY_DOC_KEYWORDS ); | ||
| + return resolve( KEY_DOC_KEYWORDS ); | ||
| } | ||
| private String getCopyright() { | ||
| - return asString( KEY_DOC_COPYRIGHT ); | ||
| + return resolve( KEY_DOC_COPYRIGHT ); | ||
| } | ||
| private String getDate() { | ||
| - return asString( KEY_DOC_DATE ); | ||
| + return resolve( KEY_DOC_DATE ); | ||
| + } | ||
| + | ||
| + private String resolve( final Key key ) { | ||
| + return replace( asString( key ), mContext.getResolvedMap() ); | ||
| } | ||
| private String asString( final Key key ) { | ||
| return stringProperty( key ).get(); | ||
| } | ||
| - | ||
| private StringProperty stringProperty( final Key key ) { | ||
| import com.keenwrite.preview.HtmlPanel; | ||
| import com.keenwrite.util.MurmurHash; | ||
| -import com.whitemagicsoftware.wordcount.Tokenizer; | ||
| import com.whitemagicsoftware.wordcount.TokenizerException; | ||
| -import com.whitemagicsoftware.wordcount.TokenizerFactory; | ||
| import javafx.beans.property.IntegerProperty; | ||
| import javafx.beans.property.SimpleIntegerProperty; | ||
| import org.greenrobot.eventbus.Subscribe; | ||
| import org.jsoup.Jsoup; | ||
| - | ||
| -import java.util.Locale; | ||
| import static com.keenwrite.events.Bus.register; | ||
| import static com.keenwrite.events.StatusEvent.clue; | ||
| import static com.keenwrite.events.WordCountEvent.fireWordCountEvent; | ||
| import static com.keenwrite.preferences.WorkspaceKeys.KEY_LANGUAGE_LOCALE; | ||
| import static com.keenwrite.preferences.WorkspaceKeys.KEY_UI_FONT_EDITOR_NAME; | ||
| +import static com.keenwrite.ui.heuristics.DocumentStatistics.StatEntry; | ||
| import static java.lang.String.format; | ||
| -import static java.util.Locale.ENGLISH; | ||
| import static javafx.application.Platform.runLater; | ||
| import static javafx.collections.FXCollections.observableArrayList; | ||
| /** | ||
| * Responsible for displaying document statistics, such as word count and | ||
| * word frequency. | ||
| */ | ||
| -public final class DocumentStatistics | ||
| - extends TableView<DocumentStatistics.StatEntry> { | ||
| - /** | ||
| - * Parses documents into word counts. | ||
| - */ | ||
| - private static Tokenizer sTokenizer = createTokenizer( ENGLISH ); | ||
| +public final class DocumentStatistics extends TableView<StatEntry> { | ||
| + private WordCounter mWordCounter; | ||
| private final ObservableList<StatEntry> mItems = observableArrayList(); | ||
| /** | ||
| * Creates a new observer of document change events that will gather and | ||
| * display document statistics (e.g., word counts). | ||
| * | ||
| * @param workspace Settings used to configure the statistics engine. | ||
| */ | ||
| public DocumentStatistics( final Workspace workspace ) { | ||
| + mWordCounter = WordCounter.create( workspace.getLocale() ); | ||
| + | ||
| final var sortedItems = new SortedList<>( mItems ); | ||
| sortedItems.comparatorProperty().bind( comparatorProperty() ); | ||
| public void handle( final DocumentChangedEvent event ) { | ||
| try { | ||
| - final var tokens = sTokenizer.tokenize( event.getDocument() ); | ||
| - final var sum = new int[]{0}; | ||
| - | ||
| runLater( () -> { | ||
| mItems.clear(); | ||
| - tokens.forEach( ( k, v ) -> { | ||
| - final var count = v[ 0 ]; | ||
| - if( count > 2 ) { | ||
| - mItems.add( new StatEntry( k, count ) ); | ||
| + final var document = event.getDocument(); | ||
| + final var wordCount = mWordCounter.countWords( | ||
| + document, ( k, count ) -> { | ||
| + // Generate statistics for words that occur thrice or more. | ||
| + if( count > 2 ) { | ||
| + mItems.add( new StatEntry( k, count ) ); | ||
| + } | ||
| } | ||
| - sum[ 0 ] += count; | ||
| - } ); | ||
| + ); | ||
| - fireWordCountEvent( sum[ 0 ] ); | ||
| + fireWordCountEvent( wordCount ); | ||
| } ); | ||
| - | ||
| } catch( final TokenizerException ex ) { | ||
| clue( ex ); | ||
| final var property = workspace.localeProperty( KEY_LANGUAGE_LOCALE ); | ||
| property.addListener( | ||
| - ( c, o, n ) -> sTokenizer = createTokenizer( property.toLocale() ) | ||
| + ( c, o, n ) -> mWordCounter = WordCounter.create( property.toLocale() ) | ||
| ); | ||
| - } | ||
| - | ||
| - /** | ||
| - * Creates a tokenizer for English text (can handle most Latin languages). | ||
| - * | ||
| - * @return An English-based tokenizer for counting words. | ||
| - */ | ||
| - private static Tokenizer createTokenizer( final Locale language ) { | ||
| - return TokenizerFactory.create( language ); | ||
| } | ||
| +/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.keenwrite.ui.heuristics; | ||
| + | ||
| +import com.whitemagicsoftware.wordcount.Tokenizer; | ||
| +import com.whitemagicsoftware.wordcount.TokenizerFactory; | ||
| + | ||
| +import java.util.Locale; | ||
| +import java.util.function.BiConsumer; | ||
| + | ||
| +/** | ||
| + * Responsible for counting unique words as well as total words in a document. | ||
| + */ | ||
| +public class WordCounter { | ||
| + /** | ||
| + * Parses documents into word counts. | ||
| + */ | ||
| + private final Tokenizer mTokenizer; | ||
| + | ||
| + /** | ||
| + * Constructs a new {@link WordCounter} instance using the given tokenizer. | ||
| + * Instances are obtained through {@link #create(Locale)}. | ||
| + * | ||
| + * @param tokenizer The class responsible for parsing a document into unique | ||
| + * and total word counts. | ||
| + */ | ||
| + private WordCounter( final Tokenizer tokenizer ) { | ||
| + mTokenizer = tokenizer; | ||
| + } | ||
| + | ||
| + /** | ||
| + * Counts the total number of words in the document. This delegates to | ||
| + * {@link #countWords(String, BiConsumer)} with a consumer that ignores | ||
| + * the individual word/count pairs. | ||
| + * | ||
| + * @param document The document to tally. | ||
| + * @return The total number of words in the document. | ||
| + */ | ||
| + public int countWords( final String document ) { | ||
| + return countWords( document, ( k, count ) -> {} ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Counts the total number of words in the document, notifying the given | ||
| + * consumer once for each unique word/count pair produced by the tokenizer. | ||
| + * The returned total is the sum of all the counts passed to the consumer. | ||
| + * | ||
| + * @param document The document to tally. | ||
| + * @param consumer The action to take for each unique word/count pair. | ||
| + * @return The total number of words in the document. | ||
| + */ | ||
| + public int countWords( | ||
| + final String document, final BiConsumer<String, Integer> consumer ) { | ||
| + final var tokens = mTokenizer.tokenize( document ); | ||
| + final var sum = new int[]{0}; | ||
| + | ||
| + tokens.forEach( ( k, v ) -> { | ||
| + final var count = v[ 0 ]; | ||
| + consumer.accept( k, count ); | ||
| + sum[ 0 ] += count; | ||
| + } ); | ||
| + | ||
| + return sum[ 0 ]; | ||
| + } | ||
| + | ||
| + /** | ||
| + * Constructs a new {@link WordCounter} capable of tokenizing a document | ||
| + * into words using the given {@link Locale}. | ||
| + * | ||
| + * @param locale The {@link Tokenizer}'s language settings. | ||
| + * @return A new {@link WordCounter} backed by a tokenizer for the locale. | ||
| + */ | ||
| + public static WordCounter create( final Locale locale ) { | ||
| + return new WordCounter( createTokenizer( locale ) ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Creates a tokenizer for the given language by delegating to the | ||
| + * {@link TokenizerFactory}. | ||
| + * | ||
| + * @param language The locale passed to the {@link TokenizerFactory}. | ||
| + * @return A tokenizer for counting words, created for the given locale. | ||
| + */ | ||
| + private static Tokenizer createTokenizer( final Locale language ) { | ||
| + return TokenizerFactory.create( language ); | ||
| + } | ||
| +} | ||
| workspace.document=Document | ||
| workspace.document.author=Author Name | ||
| -workspace.document.author.desc=Full name of primary author. | ||
| +workspace.document.author.desc=Full name of primary author, or variable reference (e.g., '{{'book.author'}}'). | ||
| workspace.document.author.title=Author | ||
| +workspace.document.pseudonym=Pseudonym | ||
| +workspace.document.pseudonym.desc=Full name of author pseudonym, or variable reference. | ||
| +workspace.document.pseudonym.title=Pseudonym | ||
| workspace.document.title=Title Name | ||
| -workspace.document.title.desc=Full document title. | ||
| +workspace.document.title.desc=Full document title, or variable reference (e.g., '{{'book.title'}}'). | ||
| workspace.document.title.title=Title | ||
| workspace.document.keywords=Keywords | ||
| -workspace.document.keywords.desc=Comma-separated words relating to subject matter. | ||
| +workspace.document.keywords.desc=Comma-separated words relating to subject matter, or variable reference. | ||
| workspace.document.keywords.title=Words | ||
| workspace.document.copyright=Copyright | ||
| -workspace.document.copyright.desc=Continuous years of publication. | ||
| +workspace.document.copyright.desc=Continuous years of publication, or variable reference. | ||
| workspace.document.copyright.title=Year(s) | ||
| workspace.document.date=Publish Date | ||
| -workspace.document.date.desc=Date and time document was published. | ||
| +workspace.document.date.desc=Date and time document was published, or variable reference. | ||
| workspace.document.Date.title=Timestamp | ||
| Delta | 135 lines added, 48 lines removed, 87-line increase |
|---|