Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Update parser to detect some contractions

Author: Dave Jarvis <email>
Date: 2021-05-31 00:04:24 GMT-0700
Commit: 5aaef2914036bc30c89d07651e24f0f860f7bbde
Parent: 3c33bc6
lib/build.gradle
plugins {
- // Apply the java-library plugin for API and implementation separation.
- id 'java-library'
+ // Apply the java-library plugin for API and implementation separation.
+ id 'java-library'
}
repositories {
- // Use Maven Central for resolving dependencies.
- mavenCentral()
+ // Use Maven Central for resolving dependencies.
+ mavenCentral()
}
dependencies {
- // Use JUnit Jupiter API for testing.
- testImplementation 'org.junit.jupiter:junit-jupiter-api:5.7.1'
-
- // Use JUnit Jupiter Engine for testing.
- testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
-
- // This dependency is exported to consumers, that is to say found on their compile classpath.
- api 'org.apache.commons:commons-math3:3.6.1'
+ // Use JUnit Jupiter API for testing.
+ testImplementation 'org.junit.jupiter:junit-jupiter-api:5.7.1'
- // This dependency is used internally, and not exposed to consumers on their own compile classpath.
- implementation 'com.google.guava:guava:30.0-jre'
+ // Use JUnit Jupiter Engine for testing.
+ testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
}
tasks.named('test') {
- // Use junit platform for unit tests.
- useJUnitPlatform()
+ // Use junit platform for unit tests.
+ useJUnitPlatform()
}
lib/src/main/java/com/keenwrite/quotes/CircularFifoQueue.java
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.keenwrite.quotes;
+
+import java.util.*;
+
+/**
+ * CircularFifoQueue is a first-in first-out queue with a fixed size that
+ * replaces its oldest element if full.
+ * <p>
+ * The removal order of a {@link CircularFifoQueue} is based on the
+ * insertion order; elements are removed in the same order in which they
+ * were added. The iteration order is the same as the removal order.
+ * </p>
+ * <p>
+ * The {@link #add(Object)}, {@link #remove()}, {@link #peek()},
+ * {@link #poll()},
+ * {@link #offer(Object)} operations all perform in constant time.
+ * All other operations perform in linear time or worse.
+ * </p>
+ * <p>
+ * This queue prevents null objects from being added.
+ * </p>
+ *
+ * @param <E> the type of elements in this collection
+ * @since 4.0
+ */
+public class CircularFifoQueue<E> extends AbstractCollection<E>
+ implements Queue<E> {
+ /**
+ * Underlying storage array.
+ */
+ private final transient E[] elements;
+
+ /**
+ * Array index of first (oldest) queue element.
+ */
+ private transient int start;
+
+ /**
+ * Index mod maxElements of the array position following the last queue
+ * element. Queue elements start at elements[start] and "wrap around"
+ * elements[maxElements-1], ending at elements[decrement(end)].
+ * For example, elements = {c,a,b}, start=1, end=1 corresponds to
+ * the queue [a,b,c].
+ */
+ private transient int end;
+
+ /**
+ * Flag to indicate if the queue is currently full.
+ */
+ private transient boolean full;
+
+ /**
+ * Capacity of the queue.
+ */
+ private final int maxElements;
+
+ /**
+ * Creates an empty queue whose capacity is fixed at the given size.
+ *
+ * @param size the maximum number of elements the queue can hold (cannot
+ * be changed after construction)
+ * @throws IllegalArgumentException if the size is &lt; 1
+ */
+ @SuppressWarnings( "unchecked" )
+ public CircularFifoQueue( final int size ) {
+ if( size < 1 ) {
+ throw new IllegalArgumentException( "The size must be greater than 0" );
+ }
+
+ // Generic arrays cannot be created directly, hence the unchecked cast.
+ maxElements = size;
+ elements = (E[]) new Object[ maxElements ];
+ }
+
+ /**
+ * Returns the number of elements stored in the queue.
+ *
+ * @return this queue's size
+ */
+ @Override
+ public int size() {
+ int size;
+
+ if( end < start ) {
+ size = maxElements - start + end;
+ }
+ else if( end == start ) {
+ size = full ? maxElements : 0;
+ }
+ else {
+ size = end - start;
+ }
+
+ return size;
+ }
+
+ /**
+ * Returns true if this queue is empty; false otherwise.
+ *
+ * @return true if this queue is empty
+ */
+ @Override
+ public boolean isEmpty() {
+ return size() == 0;
+ }
+
+ /**
+ * Returns {@code true} if the capacity limit of this queue has been reached,
+ * i.e. the number of elements stored in the queue equals its maximum size.
+ *
+ * @return {@code true} if the capacity limit has been reached, {@code
+ * false} otherwise
+ * @since 4.1
+ */
+ public boolean isAtFullCapacity() {
+ return size() == maxElements;
+ }
+
+ /**
+ * Removes every element from this queue and resets the start and end
+ * indexes to the beginning of the storage array.
+ */
+ @Override
+ public void clear() {
+ // Drop the references so that the storage array holds no stale elements.
+ Arrays.fill( elements, null );
+ start = 0;
+ end = 0;
+ full = false;
+ }
+
+ /**
+ * Appends the given element to this queue. If the queue is already at
+ * capacity, the least recently added element is discarded first so that
+ * the new element can be inserted.
+ *
+ * @param element the element to add
+ * @return true, always
+ * @throws NullPointerException if the given element is null
+ */
+ @Override
+ public boolean add( final E element ) {
+ Objects.requireNonNull( element, "element" );
+
+ // Make room by evicting the oldest element.
+ if( isAtFullCapacity() ) {
+ remove();
+ }
+
+ // Store at the end index, then advance it with wrap-around.
+ elements[ end ] = element;
+ end = increment( end );
+
+ // The end index catching up to the start index means no free slots remain.
+ if( start == end ) {
+ full = true;
+ }
+
+ return true;
+ }
+
+ /**
+ * Looks up the element at the given position, where position 0 is the
+ * oldest element in this queue.
+ *
+ * @param index the position of the element in the queue
+ * @return the element at position {@code index}
+ * @throws NoSuchElementException if the requested position is outside the
+ * range [0, size)
+ */
+ public E get( final int index ) {
+ final int count = size();
+ final boolean inRange = 0 <= index && index < count;
+
+ if( !inRange ) {
+ final String message = String.format(
+ "The specified index %1$d is outside the available range [0, %2$d)",
+ index,
+ count );
+ throw new NoSuchElementException( message );
+ }
+
+ // Translate the queue position into a storage array offset.
+ return elements[ (start + index) % maxElements ];
+ }
+
+ /**
+ * Adds the given element to this queue. If the queue is full, the least
+ * recently added
+ * element is discarded so that a new element can be inserted.
+ *
+ * @param element the element to add
+ * @return true, always
+ * @throws NullPointerException if the given element is null
+ */
+ @Override
+ public boolean offer( final E element ) {
+ return add( element );
+ }
+
+ /**
+ * Removes and returns the oldest element of this queue.
+ *
+ * @return the removed element, or {@code null} if this queue is empty
+ */
+ @Override
+ public E poll() {
+ return isEmpty() ? null : remove();
+ }
+
+ /**
+ * Returns, without removing, the oldest element of this queue.
+ *
+ * @return the oldest element
+ * @throws NoSuchElementException if this queue is empty
+ */
+ @Override
+ public E element() {
+ if( !isEmpty() ) {
+ return peek();
+ }
+
+ throw new NoSuchElementException( "queue is empty" );
+ }
+
+ /**
+ * Returns, without removing, the oldest element of this queue.
+ *
+ * @return the oldest element, or {@code null} if this queue is empty
+ */
+ @Override
+ public E peek() {
+ if( isEmpty() ) {
+ return null;
+ }
+
+ return elements[ start ];
+ }
+
+ /**
+ * Removes and returns the oldest element of this queue.
+ *
+ * @return the removed element
+ * @throws NoSuchElementException if this queue is empty
+ */
+ @Override
+ public E remove() {
+ if( isEmpty() ) {
+ throw new NoSuchElementException( "queue is empty" );
+ }
+
+ final E oldest = elements[ start ];
+
+ if( oldest == null ) {
+ return null;
+ }
+
+ // Release the slot, then advance the start index with wrap-around.
+ elements[ start ] = null;
+ start = increment( start );
+ full = false;
+
+ return oldest;
+ }
+
+ //-----------------------------------------------------------------------
+
+ /**
+ * Increments the internal index.
+ *
+ * @param index the index to increment
+ * @return the updated index
+ */
+ private int increment( int index ) {
+ index++;
+ if( index >= maxElements ) {
+ index = 0;
+ }
+ return index;
+ }
+
+ /**
+ * Decrements the internal index.
+ *
+ * @param index the index to decrement
+ * @return the updated index
+ */
+ private int decrement( int index ) {
+ index--;
+ if( index < 0 ) {
+ index = maxElements - 1;
+ }
+ return index;
+ }
+
+ /**
+ * Returns an iterator over this queue's elements.
+ * <p>
+ * Iteration begins at the oldest element (the start index) and advances,
+ * with wrap-around, until the end index is reached. The iterator supports
+ * {@code remove()}, which closes the gap by shifting the subsequent
+ * elements back by one slot. The iterator reads and writes the queue's
+ * indexes directly and performs no concurrent-modification checks.
+ * </p>
+ *
+ * @return an iterator over this queue's elements
+ */
+ @Override
+ public Iterator<E> iterator() {
+ return new Iterator<>() {
+
+ private int index = start; // array position of the next element to return
+ private int lastReturnedIndex = -1; // -1 means there is nothing to remove
+ private boolean isFirst = full; // a full queue has start == end, so force the first step
+
+ @Override
+ public boolean hasNext() {
+ return isFirst || index != end;
+ }
+
+ @Override
+ public E next() {
+ if( !hasNext() ) {
+ throw new NoSuchElementException();
+ }
+
+ isFirst = false;
+ lastReturnedIndex = index;
+ index = increment( index );
+ return elements[ lastReturnedIndex ];
+ }
+
+ @Override
+ public void remove() {
+ if( lastReturnedIndex == -1 ) {
+ throw new IllegalStateException();
+ }
+
+ // First element can be removed quickly: delegate to the queue's remove()
+ if( lastReturnedIndex == start ) {
+ CircularFifoQueue.this.remove();
+ lastReturnedIndex = -1;
+ return;
+ }
+
+ int pos = lastReturnedIndex + 1;
+ if( start < lastReturnedIndex && pos < end ) {
+ // shift in one part: the tail after the removed slot is contiguous
+ System.arraycopy(
+ elements, pos, elements, lastReturnedIndex, end - pos );
+ }
+ else {
+ // Other elements require us to shift the subsequent elements
+ while( pos != end ) {
+ if( pos >= maxElements ) {
+ elements[ pos - 1 ] = elements[ 0 ]; // carry the first slot back into the last slot
+ pos = 0;
+ }
+ else {
+ elements[ decrement( pos ) ] = elements[ pos ];
+ pos = increment( pos );
+ }
+ }
+ }
+
+ lastReturnedIndex = -1;
+ end = decrement( end ); // the queue is now one element shorter
+ elements[ end ] = null;
+ full = false;
+ index = decrement( index ); // the next element moved back by one slot
+ }
+ };
+ }
+}
lib/src/main/java/com/keenwrite/quotes/Parser.java
package com.keenwrite.quotes;
+import java.util.ArrayDeque;
+import java.util.Deque;
import java.util.function.Consumer;
+
+import static com.keenwrite.quotes.TokenType.*;
/**
* Converts straight double/single quotes and apostrophes to curly equivalents.
+ * First, handle single quotes as apostrophes, which include:
+ * <ol>
+ * <li>Inner contractions (WORD ' WORD) -- you'd've</li>
+ * <li>Inner contractions (PERIOD ' WORD) -- Ph.d.'ll</li>
+ * <li>Numeric contractions (NUMBER ' WORD) -- 70's</li>
+ * <li>Outer contractions (' WORD ') -- 'n'</li>
+ * <li>Unambiguous beginning contractions (' WORD) -- 'Twas</li>
+ * </ol>
+ * Next, handle single and double quotes as primes and double primes:
+ * <ol>
+ * <li>Single prime (NUMBER ') -- 2'</li>
+ * <li>Double prime (NUMBER ") -- 7.5"</li>
+ * </ol>
+ * Next, handle balanced double and single quotes:
+ * <ol>
+ * <li>Double quotes (" (WORD (SPACE+ WORD)? (PUNCT | PERIOD))+ ")</li>
+ * <li>Single quotes (' (WORD (SPACE+ WORD)? (PUNCT | PERIOD))+ ')</li>
+ * </ol>
*/
public class Parser implements Consumer<Token> {
private final String mText;
+ private final CircularFifoQueue<Token> mTokens = new CircularFifoQueue<>( 3 );
+
+ private final Deque<Token> mStack = new ArrayDeque<>();
public Parser( final String text ) {
mText = text;
+ mTokens.add( Token.EOT );
+ mTokens.add( Token.EOT );
+ mTokens.add( Token.EOT );
}
public void parse() {
final var tokenizer = new Tokenizer();
tokenizer.tokenize( mText, this );
}
@Override
public void accept( final Token token ) {
- System.out.print( token.getType() + " " );
+ mTokens.add( token );
+
+ final var token1 = mTokens.get( 0 );
+ final var token2 = mTokens.get( 1 );
+ final var token3 = mTokens.get( 2 );
+
+ if( token2.isType( QUOTE_SINGLE ) &&
+ token3.isType( WORD ) &&
+ token1.anyType( WORD, PERIOD, NUMBER ) ) {
+ System.out.println( "APOSTROPHE: " + token2 );
+ }
+ else if( token1.isType( QUOTE_SINGLE ) &&
+ "n".equalsIgnoreCase( token2.toString( mText ) ) &&
+ token3.isType( QUOTE_SINGLE ) ) {
+ System.out.printf( "APOSTROPHES: %s %s%n", token1, token3 );
+ }
+ else if( token1.isType( NUMBER ) && token2.isType( QUOTE_SINGLE ) ) {
+ System.out.println( "PRIME: " + token2 );
+ }
+ else if( token1.isType( NUMBER ) && token2.isType( QUOTE_DOUBLE ) ) {
+ System.out.println( "DOUBLE PRIME: " + token2 );
+ }
+ else if( token.anyType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
+ mStack.push( token );
+
+ if( mStack.isEmpty() ) {
+ System.out.println( "EMPTY STACK?!" );
+ }
+ }
}
}
+
+/*
+ private enum TokenType {
+ QUOTE_OPENING_SINGLE,
+ QUOTE_OPENING_DOUBLE,
+ QUOTE_CLOSING_SINGLE,
+ QUOTE_CLOSING_DOUBLE,
+ QUOTE_APOSTROPHE,
+ QUOTE_PRIME_SINGLE,
+ QUOTE_PRIME_DOUBLE,
+ TEXT
+ }
+*/
lib/src/main/java/com/keenwrite/quotes/Token.java
}
+ public boolean anyType( final TokenType... types ) {
+ for( final var type : types ) {
+ if( mType == type ) {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
TokenType getType() {
return mType;
+ }
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName() + "{" +
+ "mType=" + mType +
+ ", mBegan=" + mBegan +
+ ", mEnded=" + mEnded +
+ '}';
}
lib/src/main/java/com/keenwrite/quotes/TokenType.java
public enum TokenType {
- QSINGLE,
- QDOUBLE,
+ QUOTE_SINGLE,
+ QUOTE_DOUBLE,
WORD,
NUMBER,
NEWLINE,
SPACE,
PUNCT,
PERIOD,
INVALID
}
-
-/*
- private enum TokenType {
-// QUOTE_OPENING_SINGLE,
-// QUOTE_OPENING_DOUBLE,
-// QUOTE_CLOSING_SINGLE,
-// QUOTE_CLOSING_DOUBLE,
-// QUOTE_APOSTROPHE,
-// QUOTE_PRIME_SINGLE,
-// QUOTE_PRIME_DOUBLE,
- TEXT
- }
- */
lib/src/main/java/com/keenwrite/quotes/Tokenizer.java
/**
- * Emits a series of tokens that
+ * Emits a series of tokens that represent information about text that is
+ * needed to convert straight quotes to curly quotes.
*
* @param text The text to split into tokens.
}
+ /**
+ * Tokenizes a sequence of characters. The order of comparisons is optimized
+ * towards probability of the occurrence of a character in regular English
+ * prose: letters, space, quotation marks, numbers, periods, new lines,
+ * then end of text.
+ *
+ * @param i The sequence of characters to tokenize.
+ * @return The next token in the sequence.
+ */
private Token tokenize( final CharacterIterator i ) {
int began = i.getIndex();
boolean isWord = false;
Token token = null;
do {
final var curr = i.current();
-
- if( curr == DONE ) {
- return Token.EOT;
- }
if( isLetter( curr ) ) {
isWord = true;
if( !isLetterOrDigit( peek( i ) ) ) {
token = createToken( WORD, began, i.getIndex() );
}
+ }
+ else if( curr == ' ' ) {
+ token = createToken( SPACE, began, i.getIndex() );
+ }
+ else if( curr == '\'' ) {
+ token = createToken( QUOTE_SINGLE, began, i.getIndex() );
+ }
+ else if( curr == '"' ) {
+ token = createToken( QUOTE_DOUBLE, began, i.getIndex() );
}
else if( isDigit( curr ) || isNumeric( curr ) && isDigit( peek( i ) ) ) {
token = createToken( isWord ? WORD : NUMBER, began, i.getIndex() );
+ }
+ else if( curr == '.' ) {
+ token = createToken( PERIOD, began, i.getIndex() );
}
else if( curr == '\r' ) {
else if( isWhitespace( curr ) ) {
token = createToken( SPACE, began, i.getIndex() );
- }
- else if( curr == '\'' ) {
- token = createToken( QSINGLE, began, i.getIndex() );
- }
- else if( curr == '"' ) {
- token = createToken( QDOUBLE, began, i.getIndex() );
}
- else if( curr == '.' ) {
- token = createToken( PERIOD, began, i.getIndex() );
+ else if( curr != DONE ) {
+ token = createToken( PUNCT, began, i.getIndex() );
}
else {
- token = createToken( PUNCT, began, i.getIndex() );
+ token = Token.EOT;
}
lib/src/test/java/com/keenwrite/quotes/ParserTest.java
@Test
void test_Conversion_Straight_Curly() {
- final var parser = new Parser( "Salut tout le monde!\nÇa va?");
- parser.parse();
- System.out.println();
+ new Parser( "\"It's the 70's jack-o'-lantern\"").parse();
+ new Parser( "Fish-'n'-chips!").parse();
+ new Parser( "That's a 35' x 10\" yacht!").parse();
+ //new Parser( "'70s are Sams' faves.'").parse();
}
}
lib/src/test/java/com/keenwrite/quotes/TokenizerTest.java
testType( "-123.", PUNCT, NUMBER, PERIOD );
testType( " 123.123.123", SPACE, NUMBER );
- testType( "123 123\"", NUMBER, SPACE, NUMBER, QDOUBLE );
+ testType( "123 123\"", NUMBER, SPACE, NUMBER, QUOTE_DOUBLE );
testType( "-123,123.123", PUNCT, NUMBER );
}
@Test
void test_Tokenize_Words_EmitWords() {
testType( "abc", WORD );
testType( "abc abc", WORD, SPACE, WORD );
testType( "abc123", WORD );
testType( "-123abc", PUNCT, NUMBER, WORD );
- testType( "abc-o'-abc", WORD, PUNCT, WORD, QSINGLE, PUNCT, WORD );
+ testType( "abc-o'-abc", WORD, PUNCT, WORD, QUOTE_SINGLE, PUNCT, WORD );
}
@Test
void test_Tokenize_Quotes_EmitQuotes() {
- testType( "'", QSINGLE );
- testType( "\"", QDOUBLE );
- testType( "3 o'clock", NUMBER, SPACE, WORD, QSINGLE, WORD );
+ testType( "'", QUOTE_SINGLE );
+ testType( "\"", QUOTE_DOUBLE );
+ testType( "3 o'clock", NUMBER, SPACE, WORD, QUOTE_SINGLE, WORD );
}
Delta: 487 lines added, 54 lines removed, 433-line increase