Dave Jarvis' Repositories

Author	Dave Jarvis <email>
Date	2021-05-30 09:47:16 GMT-0700
Commit	0b537930c425135782d2564fae46b58fad3715ee

Delta	411 lines added, 0 lines removed, 411-line increase

.gitattributes

		+#
		+# https://help.github.com/articles/dealing-with-line-endings/
		+#
		+# These are explicitly Windows files and should use CRLF line endings
		+*.bat text eol=crlf
		+

.gitignore

		+# Ignore Gradle project-specific cache directory
		+.gradle
		+
		+# Ignore Gradle build output directory
		+build

README.md

		+# KeenQuotes
		+
		+A lightweight natural language parser that converts straight quotes
		+to curly quotes.
		+
		+# Build
		+
		+Clone the repository and change to the root directory as usual, then run:
		+
		+ gradle clean build
		+

lib/build.gradle

		+/*
		+ * This file was generated by the Gradle 'init' task.
		+ *
		+ * This generated file contains a sample Java library project to get you started.
		+ * For more details take a look at the 'Building Java & JVM projects' chapter in the Gradle
		+ * User Manual available at https://docs.gradle.org/7.0/userguide/building_java_projects.html
		+ */
		+
		+plugins {
		+ // Apply the java-library plugin for API and implementation separation.
		+ id 'java-library'
		+}
		+
		+repositories {
		+ // Use Maven Central for resolving dependencies.
		+ mavenCentral()
		+}
		+
		+dependencies {
		+ // Use JUnit Jupiter API for testing.
		+ testImplementation 'org.junit.jupiter:junit-jupiter-api:5.7.1'
		+
		+ // Use JUnit Jupiter Engine for testing.
		+ testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
		+
		+ // This dependency is exported to consumers, that is to say found on their compile classpath. NOTE(review): no source file in this commit imports commons-math3; presumably left over from the 'gradle init' sample -- confirm before removing.
		+ api 'org.apache.commons:commons-math3:3.6.1'
		+
		+ // This dependency is used internally, and not exposed to consumers on their own compile classpath. NOTE(review): no source file in this commit imports guava; presumably left over from the 'gradle init' sample -- confirm before removing.
		+ implementation 'com.google.guava:guava:30.0-jre'
		+}
		+
		+tasks.named('test') {
		+ // Use junit platform for unit tests.
		+ useJUnitPlatform()
		+}

lib/src/main/java/com/keenwrite/quotes/Parser.java

		+/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
		+package com.keenwrite.quotes;
		+
		+import java.util.function.Consumer;
		+
		+/**
		+ * Intended to convert straight double/single quotes and apostrophes to
		+ * their curly equivalents. In this revision no conversion takes place: the
		+ * parser only receives tokens and prints their types.
		+ * <p>
		+ * The constructor stores the text to parse. Calling {@link #parse()}
		+ * creates a new {@link Tokenizer} and hands it the stored text together
		+ * with this instance as the token consumer. For every token delivered to
		+ * {@link #accept(Token)}, the token's {@link TokenType} followed by a
		+ * single space is written to standard output.
		+ * </p>
		+ */
		+public class Parser implements Consumer<Token> {
		+ private final String mText;
		+
		+ public Parser( final String text ) {
		+ mText = text;
		+ }
		+
		+ public void parse() {
		+ final var tokenizer = new Tokenizer();
		+ tokenizer.tokenize( mText, this );
		+ }
		+
		+ @Override
		+ public void accept( final Token token ) {
		+ System.out.print( token.getType() + " " );
		+ }
		+}

lib/src/main/java/com/keenwrite/quotes/Token.java

		+/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
		+package com.keenwrite.quotes;
		+
		+import static com.keenwrite.quotes.TokenType.INVALID;
		+
		+/**
		+ * Responsible for tracking the beginning and ending offsets of a token within
		+ * a text string. Each instance records a {@link TokenType}, a beginning
		+ * offset, an ending offset, and whether more tokens follow; all fields are
		+ * final.
		+ * <p>
		+ * The ending offset is supplied to the constructors as the index of the
		+ * token's last character (inclusive) and is stored incremented by one, so
		+ * that the stored pair can be passed directly to
		+ * {@link String#substring(int, int)}.
		+ * </p>
		+ * <p>
		+ * All constructors are private. Tokens are obtained through the
		+ * package-private {@code createToken} factory, which always produces a
		+ * token whose {@link #hasNext()} returns {@code true}; the only token
		+ * created with {@code false} is {@link #EOT}. {@link #isType(TokenType)}
		+ * compares this token's type with the given type using {@code ==}, and the
		+ * package-private {@code getType} returns the type itself.
		+ * </p>
		+ */
		+public class Token {
		+ /**
		+ * Denotes there are no more tokens: the end of text (EOT) has been reached.
		+ * Its type is {@code INVALID} and {@link #hasNext()} returns {@code false}.
		+ */
		+ public static final Token EOT = new Token();
		+
		+ private final TokenType mType;
		+ private final int mBegan;
		+ private final int mEnded;
		+
		+ /**
		+ * Set to {@code true} if there are more tokens to parse; {@code false}
		+ * only for {@link #EOT}.
		+ */
		+ private final boolean mHasNext;
		+
		+ /**
		+ * Creates the end of text token: type {@code INVALID}, both offsets given
		+ * as -1, and no more tokens to follow.
		+ */
		+ private Token() {
		+ this( INVALID, -1, -1, false );
		+ }
		+
		+ /**
		+ * Creates a regular token, which indicates that more tokens may follow
		+ * (the has-next flag is set to {@code true}).
		+ *
		+ * @param type The kind of token.
		+ * @param began Index of the token's first character.
		+ * @param ended Index of the token's last character (inclusive).
		+ */
		+ private Token( final TokenType type, final int began, final int ended ) {
		+ this( type, began, ended, true );
		+ }
		+
		+ /**
		+ * Creates a token, storing the ending offset as {@code ended + 1} so that
		+ * it is exclusive.
		+ *
		+ * @param type The kind of token.
		+ * @param began Index of the token's first character.
		+ * @param ended Index of the token's last character (inclusive).
		+ * @param hasNext {@code true} if more tokens follow this one.
		+ */
		+ private Token(
		+ final TokenType type, final int began, final int ended,
		+ final boolean hasNext ) {
		+ mType = type;
		+ mBegan = began;
		+ mEnded = ended + 1;
		+ mHasNext = hasNext;
		+ }
		+
		+ /**
		+ * Extracts a sequence of characters from the given text at the offsets
		+ * captured by this token. Must not be called on {@link #EOT}, whose
		+ * beginning offset of -1 is outside any string.
		+ *
		+ * @param text The text that was parsed using this class.
		+ * @return The character string captured by the token.
		+ * @throws StringIndexOutOfBoundsException If the captured offsets do not
		+ * fall within the given text.
		+ */
		+ public String toString( final String text ) {
		+ return text.substring( mBegan, mEnded );
		+ }
		+
		+ public boolean hasNext() {
		+ return mHasNext;
		+ }
		+
		+ public boolean isType( final TokenType type ) {
		+ return mType == type;
		+ }
		+
		+ TokenType getType() {
		+ return mType;
		+ }
		+
		+ static Token createToken(
		+ final TokenType token, final int began, final int ended ) {
		+ return new Token( token, began, ended );
		+ }
		+}

lib/src/main/java/com/keenwrite/quotes/TokenType.java

		+/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
		+package com.keenwrite.quotes;
		+/**
		+ * Categories assigned to tokens. As used by {@code Tokenizer} and
		+ * {@code Token} in this package: {@code QSINGLE} is a straight single
		+ * quote ('); {@code QDOUBLE} is a straight double quote (");
		+ * {@code WORD} starts with a letter; {@code NUMBER} is a run of digits
		+ * that may contain '.' or ',' when followed by a digit; {@code NEWLINE}
		+ * is CR, LF, or CRLF; {@code SPACE} is any other whitespace character;
		+ * {@code PERIOD} is a '.' that is not part of a number; {@code PUNCT} is
		+ * any character not otherwise classified; {@code INVALID} is the type of
		+ * the end of text token.
		+ */
		+public enum TokenType {
		+ QSINGLE,
		+ QDOUBLE,
		+ WORD,
		+ NUMBER,
		+ NEWLINE,
		+ SPACE,
		+ PUNCT,
		+ PERIOD,
		+ INVALID
		+}
		+
		+/*
		+ private enum TokenType {
		+// QUOTE_OPENING_SINGLE,
		+// QUOTE_OPENING_DOUBLE,
		+// QUOTE_CLOSING_SINGLE,
		+// QUOTE_CLOSING_DOUBLE,
		+// QUOTE_APOSTROPHE,
		+// QUOTE_PRIME_SINGLE,
		+// QUOTE_PRIME_DOUBLE,
		+ TEXT
		+ }

		+ */

lib/src/main/java/com/keenwrite/quotes/Tokenizer.java

		+/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
		+package com.keenwrite.quotes;
		+
		+import java.text.CharacterIterator;
		+import java.text.StringCharacterIterator;
		+import java.util.function.Consumer;
		+
		+import static com.keenwrite.quotes.Token.createToken;
		+import static com.keenwrite.quotes.TokenType.*;
		+import static java.lang.Character.*;
		+import static java.text.CharacterIterator.DONE;
		+
		+/**
		+ * Tokenizes text into words, numbers, punctuation, spaces, and more.
		+ * <p>
		+ * Each call to the private {@code tokenize(CharacterIterator)} method
		+ * starts at the iterator's current index and advances until exactly one
		+ * token has been produced, leaving the iterator on the character after
		+ * that token. Characters are classified in this order: letters
		+ * accumulate until the following character is neither a letter nor a
		+ * digit, yielding WORD; a run of digits (including '.' or ',' when
		+ * immediately followed by a digit) yields NUMBER, or WORD when letters
		+ * preceded the run within the same token, and letters after such a run
		+ * begin a new token; CR, LF, and CRLF each yield one NEWLINE; any other
		+ * whitespace character yields SPACE; ' yields QSINGLE; " yields QDOUBLE;
		+ * '.' yields PERIOD; every remaining character yields PUNCT. When the
		+ * iterator reports {@code DONE}, {@code Token.EOT} is returned.
		+ * </p>
		+ * <p>
		+ * In the two numeric conditions {@code &&} binds tighter than the logical
		+ * OR: a character qualifies when it is a digit, or when it is '.' or ','
		+ * (see {@code isNumeric}) and the character after it is a digit.
		+ * </p>
		+ * <p>
		+ * The {@code peek} helper returns the character after the current one by
		+ * calling {@code next()} followed by {@code previous()} on the iterator,
		+ * restoring the iterator's index before returning.
		+ * </p>
		+ * <p>
		+ * NOTE(review): for CRLF the NEWLINE token is created before the LF is
		+ * swallowed, so the token's offsets cover only the CR even though the LF
		+ * is consumed -- confirm this is intended.
		+ * </p>
		+ */
		+public class Tokenizer {
		+ /**
		+ * Default constructor, no state.
		+ */
		+ public Tokenizer() {
		+ }
		+
		+ /**
		+ * Emits a series of tokens that together cover the given text, passing
		+ * each one to the consumer in order of appearance. Emission stops when
		+ * the end of text is reached; the end of text token itself is not passed
		+ * to the consumer.
		+ *
		+ * @param text The text to split into tokens.
		+ * @param consumer Receives each token as a separate event.
		+ */
		+ public void tokenize( final String text, final Consumer<Token> consumer ) {
		+ final var iterator = new StringCharacterIterator( text );
		+ Token token;
		+
		+ while( (token = tokenize( iterator )).hasNext() ) {
		+ consumer.accept( token );
		+ }
		+ }
		+
		+ private Token tokenize( final CharacterIterator i ) {
		+ int began = i.getIndex();
		+ boolean isWord = false;
		+ Token token = null;
		+
		+ do {
		+ final var curr = i.current();
		+
		+ if( curr == DONE ) {
		+ return Token.EOT;
		+ }
		+
		+ if( isLetter( curr ) ) {
		+ isWord = true;
		+
		+ if( !isLetterOrDigit( peek( i ) ) ) {
		+ token = createToken( WORD, began, i.getIndex() );
		+ }
		+ }
		+ else if( isDigit( curr ) \|\| isNumeric( curr ) && isDigit( peek( i ) ) ) {
		+ // Tokenize all consecutive number characters to prevent the main
		+ // loop from switching back to word tokens.
		+ char next;
		+
		+ do {
		+ next = i.next();
		+ }
		+ while( isDigit( next ) \|\| isNumeric( next ) && isDigit( peek( i ) ) );
		+
		+ // The loop above will overshoot the number by one character.
		+ i.previous();
		+
		+ token = createToken( isWord ? WORD : NUMBER, began, i.getIndex() );
		+ }
		+ else if( curr == '\r' ) {
		+ token = createToken( NEWLINE, began, i.getIndex() );
		+
		+ // Swallow the LF in CRLF; peeking won't work here.
		+ if( i.next() != '\n' ) {
		+ // Push back the non-LF char.
		+ i.previous();
		+ }
		+ }
		+ else if( curr == '\n' ) {
		+ token = createToken( NEWLINE, began, i.getIndex() );
		+ }
		+ else if( isWhitespace( curr ) ) {
		+ token = createToken( SPACE, began, i.getIndex() );
		+ }
		+ else if( curr == '\'' ) {
		+ token = createToken( QSINGLE, began, i.getIndex() );
		+ }
		+ else if( curr == '"' ) {
		+ token = createToken( QDOUBLE, began, i.getIndex() );
		+ }
		+ else if( curr == '.' ) {
		+ token = createToken( PERIOD, began, i.getIndex() );
		+ }
		+ else {
		+ token = createToken( PUNCT, began, i.getIndex() );
		+ }
		+
		+ i.next();
		+ }
		+ while( token == null );
		+
		+ return token;
		+ }
		+
		+ private static boolean isNumeric( final char curr ) {
		+ return curr == '.' \|\| curr == ',';
		+ }
		+
		+ private static char peek( final CharacterIterator ci ) {
		+ final var ch = ci.next();
		+ ci.previous();
		+ return ch;
		+ }
		+}

lib/src/test/java/com/keenwrite/quotes/ParserTest.java

		+/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
		+package com.keenwrite.quotes;
		+
		+import org.junit.jupiter.api.Test;
		+/**
		+ * Runs {@link Parser} over a short French sample and ends the parser's
		+ * printed output with a line break. The test makes no assertions, so it
		+ * passes whenever parsing completes without throwing; the sample text
		+ * contains no quotation marks.
		+ */
		+class ParserTest {
		+ @Test
		+ void test_Conversion_Straight_Curly() {
		+ final var parser = new Parser( "Salut tout le monde!\nÇa va?");
		+ parser.parse();
		+ System.out.println();
		+ }
		+}

lib/src/test/java/com/keenwrite/quotes/TokenizerTest.java

		+/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
		+package com.keenwrite.quotes;
		+
		+import org.junit.jupiter.api.Test;
		+
		+import java.util.concurrent.atomic.AtomicInteger;
		+
		+import static com.keenwrite.quotes.TokenType.*;
		+import static org.junit.jupiter.api.Assertions.assertEquals;
		+
		+/**
		+ * Tests tokenizing words, numbers, punctuation, spaces, and newlines.
		+ * <p>
		+ * Both helpers take the text to tokenize as their first argument (named
		+ * {@code actual}) followed by the expected values in emission order.
		+ * {@code testType} compares each emitted token's type with the expected
		+ * {@link TokenType}; {@code testText} compares the substring captured by
		+ * each emitted token with the expected string. A counter tracks how many
		+ * tokens were emitted and is finally compared with the number of expected
		+ * values, which catches too few tokens. If more tokens are emitted than
		+ * expected, the lookup into the expected array throws
		+ * {@link ArrayIndexOutOfBoundsException} rather than an assertion failure.
		+ * </p>
		+ */
		+class TokenizerTest {
		+ @Test
		+ void test_Tokenize_Words_TokenValues() {
		+ testText( "abc 123", "abc", " ", "123" );
		+ testText( "-123 abc", "-", "123", " ", "abc" );
		+ }
		+
		+ @Test
		+ void test_Tokenize_Numbers_EmitNumbers() {
		+ testType( ".123", NUMBER );
		+ testType( "-123.", PUNCT, NUMBER, PERIOD );
		+ testType( " 123.123.123", SPACE, NUMBER );
		+ testType( "123 123\"", NUMBER, SPACE, NUMBER, QDOUBLE );
		+ testType( "-123,123.123", PUNCT, NUMBER );
		+ }
		+
		+ @Test
		+ void test_Tokenize_Words_EmitWords() {
		+ testType( "abc", WORD );
		+ testType( "abc abc", WORD, SPACE, WORD );
		+ testType( "abc123", WORD );
		+ testType( "-123abc", PUNCT, NUMBER, WORD );
		+ testType( "abc-o'-abc", WORD, PUNCT, WORD, QSINGLE, PUNCT, WORD );
		+ }
		+
		+ @Test
		+ void test_Tokenize_PunctuationMarks_EmitPunctuationMarks() {
		+ testType( "!", PUNCT );
		+ testType( ";", PUNCT );
		+ testType( ".", PERIOD );
		+ }
		+
		+ @Test
		+ void test_Tokenize_Quotes_EmitQuotes() {
		+ testType( "'", QSINGLE );
		+ testType( "\"", QDOUBLE );
		+ testType( "3 o'clock", NUMBER, SPACE, WORD, QSINGLE, WORD );
		+ }
		+
		+ @Test
		+ void test_Tokenize_Newlines_EmitNewlines() {
		+ testType( "\r", NEWLINE );
		+ testType( "\n", NEWLINE );
		+ testType( "\r\n", NEWLINE );
		+ testType( "\r\n\r\n", NEWLINE, NEWLINE );
		+ testType( "\r\n\n\r", NEWLINE, NEWLINE, NEWLINE );
		+ testType( "abc \r\nabc\n", WORD, SPACE, NEWLINE, WORD, NEWLINE );
		+ }
		+
		+ private void testType(
		+ final String actual, final TokenType... expected ) {
		+ final var tokenizer = new Tokenizer();
		+ final var counter = new AtomicInteger();
		+
		+ tokenizer.tokenize( actual, ( token ) -> {
		+ final var expectedType = expected[ counter.getAndIncrement() ];
		+ final var actualType = token.getType();
		+ assertEquals( expectedType, actualType );
		+ } );
		+
		+ // Ensure all expected tokens are matched (verify end of text reached).
		+ assertEquals( expected.length, counter.get() );
		+ }
		+
		+ private void testText( final String actual, final String... expected ) {
		+ final var tokenizer = new Tokenizer();
		+ final var counter = new AtomicInteger();
		+
		+ tokenizer.tokenize( actual, ( token ) -> {
		+ final var expectedText = expected[ counter.getAndIncrement() ];
		+ final var actualText = token.toString( actual );
		+ assertEquals( expectedText, actualText );
		+ } );
		+
		+ // Ensure all expected tokens are matched (verify end of text reached).
		+ assertEquals( expected.length, counter.get() );
		+ }
		+}

settings.gradle

		+rootProject.name = 'quotes'
		+include('lib')
		+

Add tokenizer implementation