Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Add tokenizer implementation

Author Dave Jarvis <email>
Date 2021-05-30 09:47:16 GMT-0700
Commit 0b537930c425135782d2564fae46b58fad3715ee
Delta 411 lines added, 0 lines removed, 411-line increase
.gitattributes
+#
+# https://help.github.com/articles/dealing-with-line-endings/
+#
+# These are explicitly windows files and should use crlf
+*.bat text eol=crlf
+
.gitignore
+# Ignore Gradle project-specific cache directory
+.gradle
+
+# Ignore Gradle build output directory
+build
README.md
+# KeenQuotes
+
+A lightweight natural language parser that converts straight quotes
+to curly quotes.
+
+# Build
+
+Clone the repository and change to the root directory as usual, then run:
+
+ gradle clean build
+
lib/build.gradle
+/*
+ * This file was generated by the Gradle 'init' task.
+ *
+ * This generated file contains a sample Java library project to get you started.
+ * For more details take a look at the 'Building Java & JVM projects' chapter in the Gradle
+ * User Manual available at https://docs.gradle.org/7.0/userguide/building_java_projects.html
+ *
+ * NOTE(review): the commons-math3 and guava dependencies declared below are
+ * not imported by any of the Java sources added alongside this file; they
+ * look like entries left over from the generated sample -- confirm before
+ * keeping them.
+ */
+
+plugins {
+ // Apply the java-library plugin for API and implementation separation.
+ id 'java-library'
+}
+
+repositories {
+ // Use Maven Central for resolving dependencies.
+ mavenCentral()
+}
+
+dependencies {
+ // Use JUnit Jupiter API for testing.
+ testImplementation 'org.junit.jupiter:junit-jupiter-api:5.7.1'
+
+ // Use JUnit Jupiter Engine for testing (no explicit version is declared here).
+ testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
+
+ // This dependency is exported to consumers, that is to say found on their compile classpath.
+ api 'org.apache.commons:commons-math3:3.6.1'
+
+ // This dependency is used internally, and not exposed to consumers on their own compile classpath.
+ implementation 'com.google.guava:guava:30.0-jre'
+}
+
+tasks.named('test') {
+ // Use junit platform for unit tests.
+ useJUnitPlatform()
+}
lib/src/main/java/com/keenwrite/quotes/Parser.java
+/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.keenwrite.quotes;
+
+import java.util.function.Consumer;
+
+/**
+ * Intended to convert straight double/single quotes and apostrophes to curly
+ * equivalents. As written here, no conversion takes place: {@link #parse()}
+ * runs the text through a {@link Tokenizer} and {@link #accept(Token)} prints
+ * the type of each received token to standard output.
+ */
+public class Parser implements Consumer<Token> {
+ private final String mText; // Text handed to the tokenizer by parse().
+
+ public Parser( final String text ) { // Only stores the text; nothing is tokenized until parse() is called.
+ mText = text;
+ }
+
+ public void parse() { // Tokenizes the stored text with this instance as the token consumer.
+ final var tokenizer = new Tokenizer();
+ tokenizer.tokenize( mText, this );
+ }
+
+ @Override
+ public void accept( final Token token ) { // Invoked by the tokenizer once per emitted token.
+ System.out.print( token.getType() + " " ); // Token type name followed by one space, no newline.
+ }
+}
lib/src/main/java/com/keenwrite/quotes/Token.java
+/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.keenwrite.quotes;
+
+import static com.keenwrite.quotes.TokenType.INVALID;
+
+/**
+ * Responsible for tracking the beginning and ending offsets of a token within
+ * a text string. The text itself is not stored; pass it to
+ * {@link #toString(String)} to recover the token's characters.
+ */
+public class Token {
+ /**
+ * Denotes there are no more tokens: the end of text (EOT) has been reached.
+ * Its type is {@code INVALID} and {@link #hasNext()} returns {@code false}.
+ */
+ public static final Token EOT = new Token();
+
+ private final TokenType mType;
+ private final int mBegan; // Index of the token's first character.
+ private final int mEnded; // One past the index of the token's last character.
+
+ /**
+ * Set to {@code true} if there are more tokens to parse.
+ */
+ private final boolean mHasNext;
+
+ /**
+ * Create an end of text token: type {@code INVALID}, both offsets given
+ * as -1, and no further tokens to parse.
+ */
+ private Token() {
+ this( INVALID, -1, -1, false );
+ }
+
+ /**
+ * Create a regular token, which indicates that there are more tokens to
+ * parse ({@link #hasNext()} returns {@code true}).
+ *
+ * @param type The kind of token.
+ * @param began Index of the token's first character.
+ * @param ended Index of the token's last character (inclusive).
+ */
+ private Token( final TokenType type, final int began, final int ended ) {
+ this( type, began, ended, true );
+ }
+
+ /**
+ * Create a token, converting the inclusive ending index into the exclusive
+ * ending offset that {@link String#substring(int, int)} expects.
+ *
+ * @param type The kind of token.
+ * @param began Index of the token's first character.
+ * @param ended Index of the token's last character (inclusive).
+ * @param hasNext {@code false} only for the end of text token.
+ */
+ private Token(
+ final TokenType type, final int began, final int ended,
+ final boolean hasNext ) {
+ mType = type;
+ mBegan = began;
+ mEnded = ended + 1;
+ mHasNext = hasNext;
+ }
+
+ /**
+ * Extracts a sequence of characters from the given text at the offsets
+ * captured by this token.
+ *
+ * @param text The text that was parsed using this class.
+ * @return The character string captured by the token.
+ * @throws StringIndexOutOfBoundsException if the captured offsets do not
+ * fit the given text; this includes {@link #EOT}, whose beginning offset
+ * is -1.
+ */
+ public String toString( final String text ) {
+ return text.substring( mBegan, mEnded );
+ }
+
+ public boolean hasNext() { // False only for EOT; tokens made by createToken always report true.
+ return mHasNext;
+ }
+
+ public boolean isType( final TokenType type ) { // Identity comparison against this token's type.
+ return mType == type;
+ }
+
+ TokenType getType() { // Package-private accessor for the token's type.
+ return mType;
+ }
+
+ static Token createToken( // Package-private factory; "ended" is the inclusive index of the last character.
+ final TokenType token, final int began, final int ended ) {
+ return new Token( token, began, ended );
+ }
+}
lib/src/main/java/com/keenwrite/quotes/TokenType.java
+/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.keenwrite.quotes;
+
+public enum TokenType { // Kinds of token; the notes below describe what Tokenizer emits each one for.
+ QSINGLE, // Straight single quote or apostrophe character (').
+ QDOUBLE, // Straight double quote character (").
+ WORD, // Run of letters, which may continue into digits (e.g. "abc123").
+ NUMBER, // Digits, possibly with '.' or ',' where each is followed by a digit.
+ NEWLINE, // CR, LF, or CRLF.
+ SPACE, // Any other whitespace character.
+ PUNCT, // Any character not matched by the other types.
+ PERIOD, // A '.' that is not followed by a digit.
+ INVALID // Type carried by Token.EOT, the end of text marker.
+}
+
+/*
+ private enum TokenType {
+// QUOTE_OPENING_SINGLE,
+// QUOTE_OPENING_DOUBLE,
+// QUOTE_CLOSING_SINGLE,
+// QUOTE_CLOSING_DOUBLE,
+// QUOTE_APOSTROPHE,
+// QUOTE_PRIME_SINGLE,
+// QUOTE_PRIME_DOUBLE,
+ TEXT
+ }
+ */
lib/src/main/java/com/keenwrite/quotes/Tokenizer.java
+/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.keenwrite.quotes;
+
+import java.text.CharacterIterator;
+import java.text.StringCharacterIterator;
+import java.util.function.Consumer;
+
+import static com.keenwrite.quotes.Token.createToken;
+import static com.keenwrite.quotes.TokenType.*;
+import static java.lang.Character.*;
+import static java.text.CharacterIterator.DONE;
+
+/**
+ * Tokenizes text into words, numbers, punctuation, spaces, and more.
+ * Tokens record offsets into the text rather than copies of its characters.
+ */
+public class Tokenizer {
+ /**
+ * Default constructor, no state.
+ */
+ public Tokenizer() {
+ }
+
+ /**
+ * Emits a series of tokens that were found in the given text, in the order
+ * they appear. The end of text token is used only to stop the loop; it is
+ * never passed to the consumer.
+ *
+ * @param text The text to split into tokens.
+ * @param consumer Receives each token as a separate event.
+ */
+ public void tokenize( final String text, final Consumer<Token> consumer ) {
+ final var iterator = new StringCharacterIterator( text );
+ Token token;
+
+ while( (token = tokenize( iterator )).hasNext() ) {
+ consumer.accept( token );
+ }
+ }
+
+ private Token tokenize( final CharacterIterator i ) { // Returns the next token from the iterator's position, or Token.EOT.
+ int began = i.getIndex(); // Offset where this token starts.
+ boolean isWord = false; // Set once a letter is seen; digits that follow then stay part of the word.
+ Token token = null; // Stays null only while a word is still being accumulated.
+
+ do {
+ final var curr = i.current();
+
+ if( curr == DONE ) {
+ return Token.EOT;
+ }
+
+ if( isLetter( curr ) ) {
+ isWord = true;
+
+ if( !isLetterOrDigit( peek( i ) ) ) { // The word ends when the following character is neither letter nor digit.
+ token = createToken( WORD, began, i.getIndex() );
+ }
+ }
+ else if( isDigit( curr ) || isNumeric( curr ) && isDigit( peek( i ) ) ) { // && binds tighter than ||: a digit, or '.'/',' followed by a digit.
+ // Tokenize all consecutive number characters to prevent the main
+ // loop from switching back to word tokens.
+ char next;
+
+ do {
+ next = i.next();
+ }
+ while( isDigit( next ) || isNumeric( next ) && isDigit( peek( i ) ) );
+
+ // The loop above will overshoot the number by one character.
+ i.previous();
+
+ token = createToken( isWord ? WORD : NUMBER, began, i.getIndex() ); // Digits reached after letters are emitted as part of the WORD.
+ }
+ else if( curr == '\r' ) {
+ token = createToken( NEWLINE, began, i.getIndex() );
+
+ // Swallow the LF in CRLF; peeking won't work here. The token's offsets
+ // were captured above, so they span only the CR.
+ if( i.next() != '\n' ) {
+ // Push back the non-LF char.
+ i.previous();
+ }
+ }
+ else if( curr == '\n' ) {
+ token = createToken( NEWLINE, began, i.getIndex() );
+ }
+ else if( isWhitespace( curr ) ) {
+ token = createToken( SPACE, began, i.getIndex() );
+ }
+ else if( curr == '\'' ) {
+ token = createToken( QSINGLE, began, i.getIndex() );
+ }
+ else if( curr == '"' ) {
+ token = createToken( QDOUBLE, began, i.getIndex() );
+ }
+ else if( curr == '.' ) {
+ token = createToken( PERIOD, began, i.getIndex() );
+ }
+ else {
+ token = createToken( PUNCT, began, i.getIndex() );
+ }
+
+ i.next(); // Step past the character just examined (or past the last character of the token).
+ }
+ while( token == null );
+
+ return token;
+ }
+
+ private static boolean isNumeric( final char curr ) { // True for the separators '.' and ',' only; digits are tested separately.
+ return curr == '.' || curr == ',';
+ }
+
+ private static char peek( final CharacterIterator ci ) { // Returns the following character (or DONE), then steps back.
+ final var ch = ci.next();
+ ci.previous(); // NOTE(review): assumes the iterator was not already at its end index, where next() would not advance but previous() would still step back -- confirm.
+ return ch;
+ }
+}
lib/src/test/java/com/keenwrite/quotes/ParserTest.java
+/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.keenwrite.quotes;
+
+import org.junit.jupiter.api.Test;
+
+class ParserTest { // Exercises Parser end to end; output goes to standard output.
+ @Test
+ void test_Conversion_Straight_Curly() { // Contains no assertions: it passes as long as parsing throws nothing.
+ final var parser = new Parser( "Salut tout le monde!\nÇa va?");
+ parser.parse();
+ System.out.println(); // Ends the line of token types printed by the parser.
+ }
+}
lib/src/test/java/com/keenwrite/quotes/TokenizerTest.java
+/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.keenwrite.quotes;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static com.keenwrite.quotes.TokenType.*;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+/**
+ * Tests tokenizing words, numbers, punctuation, spaces, and newlines.
+ * Each case supplies an input text followed by the token types (or token
+ * texts) that the tokenizer is expected to emit, in order.
+ */
+class TokenizerTest {
+ @Test
+ void test_Tokenize_Words_TokenValues() {
+ testText( "abc 123", "abc", " ", "123" );
+ testText( "-123 abc", "-", "123", " ", "abc" );
+ }
+
+ @Test
+ void test_Tokenize_Numbers_EmitNumbers() {
+ testType( ".123", NUMBER );
+ testType( "-123.", PUNCT, NUMBER, PERIOD );
+ testType( " 123.123.123", SPACE, NUMBER );
+ testType( "123 123\"", NUMBER, SPACE, NUMBER, QDOUBLE );
+ testType( "-123,123.123", PUNCT, NUMBER );
+ }
+
+ @Test
+ void test_Tokenize_Words_EmitWords() {
+ testType( "abc", WORD );
+ testType( "abc abc", WORD, SPACE, WORD );
+ testType( "abc123", WORD );
+ testType( "-123abc", PUNCT, NUMBER, WORD );
+ testType( "abc-o'-abc", WORD, PUNCT, WORD, QSINGLE, PUNCT, WORD );
+ }
+
+ @Test
+ void test_Tokenize_PunctuationMarks_EmitPunctuationMarks() {
+ testType( "!", PUNCT );
+ testType( ";", PUNCT );
+ testType( ".", PERIOD );
+ }
+
+ @Test
+ void test_Tokenize_Quotes_EmitQuotes() {
+ testType( "'", QSINGLE );
+ testType( "\"", QDOUBLE );
+ testType( "3 o'clock", NUMBER, SPACE, WORD, QSINGLE, WORD );
+ }
+
+ @Test
+ void test_Tokenize_Newlines_EmitNewlines() {
+ testType( "\r", NEWLINE );
+ testType( "\n", NEWLINE );
+ testType( "\r\n", NEWLINE );
+ testType( "\r\n\r\n", NEWLINE, NEWLINE );
+ testType( "\r\n\n\r", NEWLINE, NEWLINE, NEWLINE );
+ testType( "abc \r\nabc\n", WORD, SPACE, NEWLINE, WORD, NEWLINE );
+ }
+
+ private void testType(
+ final String actual, final TokenType... expected ) { // Despite its name, "actual" is the input text to tokenize.
+ final var tokenizer = new Tokenizer();
+ final var counter = new AtomicInteger(); // Counts emitted tokens; doubles as the index into "expected".
+
+ tokenizer.tokenize( actual, ( token ) -> {
+ final var expectedType = expected[ counter.getAndIncrement() ]; // Indexing past the array end fails the test when extra tokens are emitted.
+ final var actualType = token.getType();
+ assertEquals( expectedType, actualType );
+ } );
+
+ // Ensure all expected tokens are matched (verify end of text reached).
+ assertEquals( expected.length, counter.get() );
+ }
+
+ private void testText( final String actual, final String... expected ) { // Like testType, but compares each token's extracted text.
+ final var tokenizer = new Tokenizer();
+ final var counter = new AtomicInteger();
+
+ tokenizer.tokenize( actual, ( token ) -> {
+ final var expectedText = expected[ counter.getAndIncrement() ];
+ final var actualText = token.toString( actual ); // Recovers the token's characters from the input text.
+ assertEquals( expectedText, actualText );
+ } );
+
+ // Ensure all expected tokens are matched (verify end of text reached).
+ assertEquals( expected.length, counter.get() );
+ }
+}
settings.gradle
+rootProject.name = 'quotes'
+include('lib')
+