| Author | Dave Jarvis <email> |
|---|---|
| Date | 2021-05-30 09:47:16 GMT-0700 |
| Commit | 0b537930c425135782d2564fae46b58fad3715ee |
| Delta | 411 lines added, 0 lines removed, 411-line increase |
|---|
| +# | ||
| +# https://help.github.com/articles/dealing-with-line-endings/ | ||
| +# | ||
| +# These are explicitly Windows files and should use CRLF line endings. | ||
| +*.bat text eol=crlf | ||
| + | ||
| +# Ignore Gradle project-specific cache directory | ||
| +.gradle | ||
| + | ||
| +# Ignore Gradle build output directory | ||
| +build | ||
| +# KeenQuotes | ||
| + | ||
| +A lightweight natural language parser that converts straight quotes | ||
| +to curly quotes. | ||
| + | ||
| +# Build | ||
| + | ||
| +Clone the repository and change to the root directory as usual, then run: | ||
| + | ||
| + gradle clean build | ||
| + | ||
| +/* | ||
| + * This file was generated by the Gradle 'init' task. | ||
| + * | ||
| + * This generated file contains a sample Java library project to get you started. | ||
| + * For more details take a look at the 'Building Java & JVM projects' chapter in the Gradle | ||
| + * User Manual available at https://docs.gradle.org/7.0/userguide/building_java_projects.html | ||
| + */ | ||
| + | ||
| +plugins { | ||
| + // Apply the java-library plugin for API and implementation separation. | ||
| + id 'java-library' | ||
| +} | ||
| + | ||
| +repositories { | ||
| + // Use Maven Central for resolving dependencies. | ||
| + mavenCentral() | ||
| +} | ||
| + | ||
| +dependencies { | ||
| + // Use JUnit Jupiter API for testing. | ||
| + testImplementation 'org.junit.jupiter:junit-jupiter-api:5.7.1' | ||
| + | ||
| + // Use JUnit Jupiter Engine for testing. | ||
| + testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine' | ||
| + | ||
| + // This dependency is exported to consumers, that is to say found on their compile classpath. | ||
| + api 'org.apache.commons:commons-math3:3.6.1' | ||
| + | ||
| + // This dependency is used internally, and not exposed to consumers on their own compile classpath. | ||
| + implementation 'com.google.guava:guava:30.0-jre' | ||
| +} | ||
| + | ||
| +tasks.named('test') { | ||
| + // Use junit platform for unit tests. | ||
| + useJUnitPlatform() | ||
| +} | ||
| +/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.keenwrite.quotes; | ||
| + | ||
| +import java.util.function.Consumer; | ||
| + | ||
| +/** | ||
| + * Converts straight double/single quotes and apostrophes to curly equivalents. | ||
| + */ | ||
| +public class Parser implements Consumer<Token> { | ||
| + private final String mText; | ||
| + | ||
| + public Parser( final String text ) { | ||
| + mText = text; | ||
| + } | ||
| + | ||
| + public void parse() { | ||
| + final var tokenizer = new Tokenizer(); | ||
| + tokenizer.tokenize( mText, this ); | ||
| + } | ||
| + | ||
| + @Override | ||
| + public void accept( final Token token ) { | ||
| + System.out.print( token.getType() + " " ); | ||
| + } | ||
| +} | ||
| +/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.keenwrite.quotes; | ||
| + | ||
| +import static com.keenwrite.quotes.TokenType.INVALID; | ||
| + | ||
| +/** | ||
| + * Responsible for tracking the beginning and ending offsets of a token within | ||
| + * a text string. | ||
| + */ | ||
| +public class Token { | ||
| + /** | ||
| + * Denotes there are no more tokens: the end of text (EOT) has been reached. | ||
| + */ | ||
| + public static final Token EOT = new Token(); | ||
| + | ||
| + private final TokenType mType; | ||
| + private final int mBegan; | ||
| + private final int mEnded; | ||
| + | ||
| + /** | ||
| + * Set to {@code true} if there are more tokens to parse. | ||
| + */ | ||
| + private final boolean mHasNext; | ||
| + | ||
| + /** | ||
| + * Create an end of text token. | ||
| + */ | ||
| + private Token() { | ||
| + this( INVALID, -1, -1, false ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Create a token that indicates there are more tokens to follow. | ||
| + */ | ||
| + private Token( final TokenType type, final int began, final int ended ) { | ||
| + this( type, began, ended, true ); | ||
| + } | ||
| + | ||
| + /** | ||
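| + * Create a token spanning the given offsets. The ending offset is given | ||
| + * as an inclusive index and stored as an exclusive one. | ||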
| + */ | ||
| + private Token( | ||
| + final TokenType type, final int began, final int ended, | ||
| + final boolean hasNext ) { | ||
| + mType = type; | ||
| + mBegan = began; | ||
| + mEnded = ended + 1; | ||
| + mHasNext = hasNext; | ||
| + } | ||
| + | ||
| + /** | ||
| + * Extracts a sequence of characters from the given text at the offsets | ||
| + * captured by this token. | ||
| + * | ||
| + * @param text The text that was tokenized to produce this token. | ||
| + * @return The character string captured by the token. | ||
| + */ | ||
| + public String toString( final String text ) { | ||
| + return text.substring( mBegan, mEnded ); | ||
| + } | ||
| + | ||
| + public boolean hasNext() { | ||
| + return mHasNext; | ||
| + } | ||
| + | ||
| + public boolean isType( final TokenType type ) { | ||
| + return mType == type; | ||
| + } | ||
| + | ||
| + TokenType getType() { | ||
| + return mType; | ||
| + } | ||
| + | ||
| + static Token createToken( | ||
| + final TokenType token, final int began, final int ended ) { | ||
| + return new Token( token, began, ended ); | ||
| + } | ||
| +} | ||
| +/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.keenwrite.quotes; | ||
| + | ||
| +public enum TokenType { | ||
| + QSINGLE, | ||
| + QDOUBLE, | ||
| + WORD, | ||
| + NUMBER, | ||
| + NEWLINE, | ||
| + SPACE, | ||
| + PUNCT, | ||
| + PERIOD, | ||
| + INVALID | ||
| +} | ||
| + | ||
| +/* | ||
| + private enum TokenType { | ||
| +// QUOTE_OPENING_SINGLE, | ||
| +// QUOTE_OPENING_DOUBLE, | ||
| +// QUOTE_CLOSING_SINGLE, | ||
| +// QUOTE_CLOSING_DOUBLE, | ||
| +// QUOTE_APOSTROPHE, | ||
| +// QUOTE_PRIME_SINGLE, | ||
| +// QUOTE_PRIME_DOUBLE, | ||
| + TEXT | ||
| + } | ||
| + */ |
| +/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.keenwrite.quotes; | ||
| + | ||
| +import java.text.CharacterIterator; | ||
| +import java.text.StringCharacterIterator; | ||
| +import java.util.function.Consumer; | ||
| + | ||
| +import static com.keenwrite.quotes.Token.createToken; | ||
| +import static com.keenwrite.quotes.TokenType.*; | ||
| +import static java.lang.Character.*; | ||
| +import static java.text.CharacterIterator.DONE; | ||
| + | ||
| +/** | ||
| + * Tokenizes text into words, numbers, punctuation, spaces, and more. | ||
| + */ | ||
| +public class Tokenizer { | ||
| + /** | ||
| + * Default constructor, no state. | ||
| + */ | ||
| + public Tokenizer() { | ||
| + } | ||
| + | ||
| + /** | ||
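| + * Emits a series of tokens that represent the given text, in the order | ||
| + * they appear, stopping once the end of the text is reached. | ||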
| + * | ||
| + * @param text The text to split into tokens. | ||
| + * @param consumer Receives each token as a separate event. | ||
| + */ | ||
| + public void tokenize( final String text, final Consumer<Token> consumer ) { | ||
| + final var iterator = new StringCharacterIterator( text ); | ||
| + Token token; | ||
| + | ||
| + while( (token = tokenize( iterator )).hasNext() ) { | ||
| + consumer.accept( token ); | ||
| + } | ||
| + } | ||
| + | ||
| + private Token tokenize( final CharacterIterator i ) { | ||
| + int began = i.getIndex(); | ||
| + boolean isWord = false; | ||
| + Token token = null; | ||
| + | ||
| + do { | ||
| + final var curr = i.current(); | ||
| + | ||
| + if( curr == DONE ) { | ||
| + return Token.EOT; | ||
| + } | ||
| + | ||
| + if( isLetter( curr ) ) { | ||
| + isWord = true; | ||
| + | ||
| + if( !isLetterOrDigit( peek( i ) ) ) { | ||
| + token = createToken( WORD, began, i.getIndex() ); | ||
| + } | ||
| + } | ||
| + else if( isDigit( curr ) || isNumeric( curr ) && isDigit( peek( i ) ) ) { | ||
| + // Tokenize all consecutive number characters to prevent the main | ||
| + // loop from switching back to word tokens. | ||
| + char next; | ||
| + | ||
| + do { | ||
| + next = i.next(); | ||
| + } | ||
| + while( isDigit( next ) || isNumeric( next ) && isDigit( peek( i ) ) ); | ||
| + | ||
| + // The loop above will overshoot the number by one character. | ||
| + i.previous(); | ||
| + | ||
| + token = createToken( isWord ? WORD : NUMBER, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '\r' ) { | ||
| + token = createToken( NEWLINE, began, i.getIndex() ); | ||
| + | ||
| + // Swallow the LF in CRLF; peeking won't work here. | ||
| + if( i.next() != '\n' ) { | ||
| + // Push back the non-LF char. | ||
| + i.previous(); | ||
| + } | ||
| + } | ||
| + else if( curr == '\n' ) { | ||
| + token = createToken( NEWLINE, began, i.getIndex() ); | ||
| + } | ||
| + else if( isWhitespace( curr ) ) { | ||
| + token = createToken( SPACE, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '\'' ) { | ||
| + token = createToken( QSINGLE, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '"' ) { | ||
| + token = createToken( QDOUBLE, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '.' ) { | ||
| + token = createToken( PERIOD, began, i.getIndex() ); | ||
| + } | ||
| + else { | ||
| + token = createToken( PUNCT, began, i.getIndex() ); | ||
| + } | ||
| + | ||
| + i.next(); | ||
| + } | ||
| + while( token == null ); | ||
| + | ||
| + return token; | ||
| + } | ||
| + | ||
| + private static boolean isNumeric( final char curr ) { | ||
| + return curr == '.' || curr == ','; | ||
| + } | ||
| + | ||
| + private static char peek( final CharacterIterator ci ) { | ||
| + final var ch = ci.next(); | ||
| + ci.previous(); | ||
| + return ch; | ||
| + } | ||
| +} | ||
| +/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.keenwrite.quotes; | ||
| + | ||
| +import org.junit.jupiter.api.Test; | ||
| + | ||
| +class ParserTest { | ||
| + @Test | ||
| + void test_Conversion_Straight_Curly() { | ||
| + final var parser = new Parser( "Salut tout le monde!\nÇa va?"); | ||
| + parser.parse(); | ||
| + System.out.println(); | ||
| + } | ||
| +} | ||
| +/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.keenwrite.quotes; | ||
| + | ||
| +import org.junit.jupiter.api.Test; | ||
| + | ||
| +import java.util.concurrent.atomic.AtomicInteger; | ||
| + | ||
| +import static com.keenwrite.quotes.TokenType.*; | ||
| +import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| + | ||
| +/** | ||
| + * Tests tokenizing words, numbers, punctuation, spaces, and newlines. | ||
| + */ | ||
| +class TokenizerTest { | ||
| + @Test | ||
| + void test_Tokenize_Words_TokenValues() { | ||
| + testText( "abc 123", "abc", " ", "123" ); | ||
| + testText( "-123 abc", "-", "123", " ", "abc" ); | ||
| + } | ||
| + | ||
| + @Test | ||
| + void test_Tokenize_Numbers_EmitNumbers() { | ||
| + testType( ".123", NUMBER ); | ||
| + testType( "-123.", PUNCT, NUMBER, PERIOD ); | ||
| + testType( " 123.123.123", SPACE, NUMBER ); | ||
| + testType( "123 123\"", NUMBER, SPACE, NUMBER, QDOUBLE ); | ||
| + testType( "-123,123.123", PUNCT, NUMBER ); | ||
| + } | ||
| + | ||
| + @Test | ||
| + void test_Tokenize_Words_EmitWords() { | ||
| + testType( "abc", WORD ); | ||
| + testType( "abc abc", WORD, SPACE, WORD ); | ||
| + testType( "abc123", WORD ); | ||
| + testType( "-123abc", PUNCT, NUMBER, WORD ); | ||
| + testType( "abc-o'-abc", WORD, PUNCT, WORD, QSINGLE, PUNCT, WORD ); | ||
| + } | ||
| + | ||
| + @Test | ||
| + void test_Tokenize_PunctuationMarks_EmitPunctuationMarks() { | ||
| + testType( "!", PUNCT ); | ||
| + testType( ";", PUNCT ); | ||
| + testType( ".", PERIOD ); | ||
| + } | ||
| + | ||
| + @Test | ||
| + void test_Tokenize_Quotes_EmitQuotes() { | ||
| + testType( "'", QSINGLE ); | ||
| + testType( "\"", QDOUBLE ); | ||
| + testType( "3 o'clock", NUMBER, SPACE, WORD, QSINGLE, WORD ); | ||
| + } | ||
| + | ||
| + @Test | ||
| + void test_Tokenize_Newlines_EmitNewlines() { | ||
| + testType( "\r", NEWLINE ); | ||
| + testType( "\n", NEWLINE ); | ||
| + testType( "\r\n", NEWLINE ); | ||
| + testType( "\r\n\r\n", NEWLINE, NEWLINE ); | ||
| + testType( "\r\n\n\r", NEWLINE, NEWLINE, NEWLINE ); | ||
| + testType( "abc \r\nabc\n", WORD, SPACE, NEWLINE, WORD, NEWLINE ); | ||
| + } | ||
| + | ||
| + private void testType( | ||
| + final String actual, final TokenType... expected ) { | ||
| + final var tokenizer = new Tokenizer(); | ||
| + final var counter = new AtomicInteger(); | ||
| + | ||
| + tokenizer.tokenize( actual, ( token ) -> { | ||
| + final var expectedType = expected[ counter.getAndIncrement() ]; | ||
| + final var actualType = token.getType(); | ||
| + assertEquals( expectedType, actualType ); | ||
| + } ); | ||
| + | ||
| + // Ensure all expected tokens are matched (verify end of text reached). | ||
| + assertEquals( expected.length, counter.get() ); | ||
| + } | ||
| + | ||
| + private void testText( final String actual, final String... expected ) { | ||
| + final var tokenizer = new Tokenizer(); | ||
| + final var counter = new AtomicInteger(); | ||
| + | ||
| + tokenizer.tokenize( actual, ( token ) -> { | ||
| + final var expectedText = expected[ counter.getAndIncrement() ]; | ||
| + final var actualText = token.toString( actual ); | ||
| + assertEquals( expectedText, actualText ); | ||
| + } ); | ||
| + | ||
| + // Ensure all expected tokens are matched (verify end of text reached). | ||
| + assertEquals( expected.length, counter.get() ); | ||
| + } | ||
| +} | ||
| +rootProject.name = 'quotes' | ||
| +include('lib') | ||
| + | ||