Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Performance improvements, add tests, simplify lexer

Author: Dave Jarvis <email>
Date: 2022-08-13 14:50:26 GMT-0700
Commit: 769e65ed7b07c0ef1243cac9746a6a7e06ae438b
Parent: 93c7eff
build.gradle
id 'application'
id 'com.palantir.git-version' version '0.12.3'
+ id "me.champeau.jmh" version "0.6.6"
}
implementation 'info.picocli:picocli:4.6.3'
- testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
- testImplementation 'org.junit.jupiter:junit-jupiter-params:5.8.2'
+ testImplementation 'org.junit.jupiter:junit-jupiter-api:5.9.0'
+ testImplementation 'org.junit.jupiter:junit-jupiter-params:5.9.0'
testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
}
src/jmh/java/com/whitemagicsoftware/keenquotes/StringIterationBenchmark.java
+package com.whitemagicsoftware.keenquotes;
+
+import org.openjdk.jmh.annotations.Benchmark;
+
+import java.text.StringCharacterIterator;
+import java.util.StringTokenizer;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static java.lang.Math.random;
+import static java.text.CharacterIterator.DONE;
+
+/**
+ * JMH benchmarks comparing several strategies for visiting every character
+ * of a large string. Each benchmark generates its own 1 MiB random string,
+ * iterates it once, and asserts (under {@code -ea}) that all characters
+ * were visited.
+ */
+@SuppressWarnings( "unused" )
+public class StringIterationBenchmark {
+ // Characters used to build the random test string.
+ // NOTE(review): the lowercase run reads "...uvxyz" — the letter 'w' is
+ // missing. Harmless for iteration benchmarks, but confirm it's intended.
+ private final static String CHARSET =
+ "ABCDEFGHIJKLM NOPQRSTUVWXYZ abcdefghijklm nopqrstuvxyz 01234 5 6789";
+ // Number of characters in each generated test string (1 MiB).
+ public final static int STRLEN = 1024 * 1024;
+
+ /**
+ * Builds a string of {@link #STRLEN} characters drawn at random from
+ * {@link #CHARSET}.
+ *
+ * @return A randomly generated string, {@link #STRLEN} characters long.
+ */
+ private static String generateText() {
+ final var sb = new StringBuilder( STRLEN );
+ final var len = CHARSET.length();
+
+ for( var i = 0; i < STRLEN; i++ ) {
+ sb.append( CHARSET.charAt( (int) (len * random()) ) );
+ }
+
+ return sb.toString();
+ }
+
+ /**
+ * Baseline: plain index-based {@link String#charAt(int)} loop.
+ */
+ @Benchmark
+ public void test_CharAtIterator() {
+ final var s = generateText();
+ final var length = s.length();
+ var index = 0;
+
+ while( index < length ) {
+ final var ch = s.charAt( index );
+ index++;
+ }
+
+ assert index == length;
+ }
+
+ /**
+ * Iterates via {@link FastCharacterIterator} until {@code current()}
+ * reports {@code DONE}.
+ */
+ @Benchmark
+ public void test_FastCharacterIterator() {
+ final var s = generateText();
+ final var i = new FastCharacterIterator( s );
+
+ char c = ' ';
+
+ // NOTE(review): next() is called before current(), so the character at
+ // index 0 is never read — fine for a throughput benchmark, but worth
+ // confirming the loop shape is intentional.
+ while( c != DONE ) {
+ i.next();
+ c = i.current();
+ }
+
+ assert i.index() == STRLEN;
+ }
+
+ /**
+ * Iterates via {@link StringCharacterIterator#next()} for comparison.
+ */
+ @Benchmark
+ public void test_StringCharacterIterator() {
+ final var s = generateText();
+ final var i = new StringCharacterIterator( s );
+ var index = 0;
+
+ char c = ' ';
+
+ while( c != DONE ) {
+ c = i.next();
+ index++;
+ }
+
+ assert index == STRLEN;
+ }
+
+ /**
+ * Copies the string into a {@code char[]} and iterates with an enhanced
+ * for loop.
+ */
+ @Benchmark
+ public void test_CharArrayIterator() {
+ final var s = generateText();
+ final var i = s.toCharArray();
+ var index = 0;
+
+ for( final var ch : i ) {
+ index++;
+ }
+
+ assert index == STRLEN;
+ }
+
+ /**
+ * Tokenizes on spaces (returning delimiters as tokens) and sums token
+ * lengths, which must equal the full string length.
+ */
+ @Benchmark
+ public void test_StringTokenizer() {
+ final var s = generateText();
+ final var i = new StringTokenizer( s, " ", true );
+ var index = 0;
+
+ while( i.hasMoreTokens() ) {
+ final var token = i.nextToken();
+ index += token.length();
+ }
+
+ assert index == STRLEN;
+ }
+
+ /**
+ * Iterates via {@link String#chars()}, counting code points with an
+ * {@link AtomicInteger} (required because the lambda cannot capture a
+ * mutable local).
+ */
+ @Benchmark
+ public void test_StreamIterator() {
+ final var s = generateText();
+ final var index = new AtomicInteger();
+
+ s.chars().forEach( codepoint -> {
+ final var ch = Character.valueOf( (char) codepoint );
+ index.incrementAndGet();
+ } );
+
+ assert index.get() == STRLEN;
+ }
+}
src/main/java/com/whitemagicsoftware/keenquotes/Contractions.java
"i",
// of|letter o
- "o",
+ "o"
);
src/main/java/com/whitemagicsoftware/keenquotes/FastCharacterIterator.java
+/* Copyright 2022 White Magic Software, Ltd. -- All rights reserved. */
+package com.whitemagicsoftware.keenquotes;
+
+import java.text.CharacterIterator;
+import java.text.StringCharacterIterator;
+import java.util.function.Function;
+
+import static java.text.CharacterIterator.DONE;
+
+/**
+ * Iterates over a string, much like {@link CharacterIterator}, but faster.
+ * This class gets 53 ops/s vs. 49 ops/s for {@link StringCharacterIterator}.
+ * In comparison, using unconstrained {@link String#charAt(int)} calls yields
+ * 57 ops/s.
+ * <p>
+ * <strong>Caution:</strong> This class offers minimal bounds checking to eke
+ * out some efficiency.
+ * </p>
+ */
+public class FastCharacterIterator {
+
+ // The string being iterated; never null.
+ private final String mS;
+ // Cached length of mS, to avoid repeated length() calls.
+ private final int mLen;
+
+ /**
+ * Starts at 0, not guaranteed to be within bounds.
+ */
+ private int mPos;
+
+ /**
+ * Constructs a new iterator that can advance through the given string
+ * one character at a time.
+ *
+ * @param s The string to iterate.
+ */
+ public FastCharacterIterator( final String s ) {
+ mS = s;
+ mLen = s.length();
+ }
+
+ /**
+ * Returns the total number of characters in the string to iterate.
+ *
+ * @return The string length.
+ */
+ public int length() {
+ return mLen;
+ }
+
+ /**
+ * Returns the iterated index. The return value is not guaranteed to be
+ * within the string bounds.
+ *
+ * @return The iterated index.
+ */
+ public int index() {
+ return mPos;
+ }
+
+ /**
+ * Returns the character at the currently iterated position in the string.
+ * This method performs bounds checking.
+ *
+ * @return {@link CharacterIterator#DONE} if there are no more characters.
+ */
+ public char current() {
+ return hasNext() ? mS.charAt( mPos ) : DONE;
+ }
+
+ /**
+ * Advances to the next character in the string, without bounds checking.
+ */
+ public void next() {
+ mPos++;
+ }
+
+ /**
+ * Advances to the previous character in the string, without bounds checking.
+ */
+ public void prev() {
+ --mPos;
+ }
+
+ /**
+ * Returns the character at index + 1 (i.e., the character after the one
+ * {@link #current()} reports) without consuming it. Multiple consecutive
+ * calls to this method will return the same value. This method performs
+ * bounds checking.
+ *
+ * @return {@link CharacterIterator#DONE} if there are no more characters.
+ */
+ public char peek() {
+ final var pos = mPos;
+ return pos + 1 < mLen ? mS.charAt( pos + 1 ) : DONE;
+ }
+
+ /**
+ * Answers whether the internal index is within the string's bounds,
+ * meaning that {@link #current()} will return a character rather than
+ * {@link CharacterIterator#DONE}.
+ *
+ * @return {@code true} if there are more characters to iterate.
+ */
+ public boolean hasNext() {
+ return mPos < mLen;
+ }
+
+ /**
+ * Parse all characters that match a given function. The iterator always
+ * advances at least once (do/while), with the function first applied to
+ * the character following the starting position.
+ *
+ * @param f The function that determines when slurping stops.
+ * @return The number of characters parsed.
+ */
+ public int skip( final Function<Character, Boolean> f ) {
+ assert f != null;
+
+ final var began = index();
+
+ do {
+ next();
+ }
+ while( f.apply( current() ) );
+
+ // The loop always overshoots by one character.
+ prev();
+
+ return index() - began;
+ }
+
+ /**
+ * Creates a string from a subset of consecutive characters in the string
+ * being iterated. The calling class is responsible for bounds-checking.
+ * NOTE(review): the {@code ended < mLen} assertion is stricter than
+ * {@link String#substring(int, int)}, which permits {@code ended == length};
+ * as written, a substring can never end at the final character — confirm
+ * this is intended.
+ *
+ * @param began The starting index, must be greater than or equal to zero
+ * and less than or equal to {@code ended}.
+ * @param ended The ending index, must be less than the string length.
+ * @return A substring of the iterated string.
+ * @throws IndexOutOfBoundsException Either or both parameters exceed the
+ * string's boundaries.
+ */
+ public String substring( final int began, final int ended ) {
+ assert began >= 0;
+ assert began <= ended;
+ assert ended < mLen;
+
+ return mS.substring( began, ended );
+ }
+}
src/main/java/com/whitemagicsoftware/keenquotes/Lexeme.java
* @param ended Offset into the text where this instance stops (0-based).
*/
- private Lexeme( final LexemeType type, final int began, final int ended ) {
+ public Lexeme( final LexemeType type, final int began, final int ended ) {
assert type != null;
assert began >= 0 || ended == E_INDEX;
- assert ended >= began || ended == E_INDEX;
+ assert began <= ended || ended == E_INDEX;
mType = type;
mBegan = began;
- mEnded = ended + 1;
- }
-
- static Lexeme createLexeme(
- final LexemeType lexeme, final int began, final int ended ) {
- assert lexeme != null;
- return new Lexeme( lexeme, began, ended );
+ mEnded = ended;
}
src/main/java/com/whitemagicsoftware/keenquotes/LexemeType.java
QUOTE_SINGLE_CLOSING,
QUOTE_DOUBLE,
+ QUOTE_DOUBLE_OPENING,
+ QUOTE_DOUBLE_CLOSING,
ESC_SINGLE,
ESC_DOUBLE,
src/main/java/com/whitemagicsoftware/keenquotes/Lexer.java
package com.whitemagicsoftware.keenquotes;
-import java.text.CharacterIterator;
-import java.text.StringCharacterIterator;
-import java.util.function.BiFunction;
+import java.util.function.Consumer;
-import static com.whitemagicsoftware.keenquotes.Lexeme.createLexeme;
import static com.whitemagicsoftware.keenquotes.LexemeType.*;
import static java.lang.Character.isWhitespace;
import static java.text.CharacterIterator.DONE;
/**
* Turns text into words, numbers, punctuation, spaces, and more.
*/
public class Lexer {
- /**
- * Iterates over the entire string of text to help produce lexemes.
- */
- private final CharacterIterator mIterator;
-
- /**
- * Constructs a {@link Lexer} capable of turning text int {@link Lexeme}s.
- *
- * @param text The text to lex.
- */
- Lexer( final String text ) {
- mIterator = new StringCharacterIterator( text );
- }
-
- Lexeme next() {
- return parse( mIterator );
- }
-
/**
* Tokenizes a sequence of characters. The order of comparisons is optimized
* towards probability of the occurrence of a character in regular English
* prose: letters, space, quotation marks, numbers, periods, new lines,
* then end of text.
*
- * @param i The sequence of characters to tokenize.
- * @return The next token in the sequence.
+ * @param text The sequence of characters to tokenize.
*/
- private Lexeme parse( final CharacterIterator i ) {
- int began = i.getIndex();
- boolean isWord = false;
- Lexeme lexeme = null;
+ public static void lex(
+ final String text,
+ final Consumer<Lexeme> emitter ) {
+ lex( text, emitter, filter -> {} );
+ }
- do {
- // Allow subclasses to skip character sequences. This allows XML tags
- // to be skipped.
- while( skip( i ) ) {
- began = i.getIndex();
- }
+ public static void lex(
+ final String text,
+ final Consumer<Lexeme> emitter,
+ final Consumer<FastCharacterIterator> filter ) {
+ lex( new FastCharacterIterator( text ), emitter, filter );
+ }
- final var curr = i.current();
+ public static void lex(
+ final FastCharacterIterator i,
+ final Consumer<Lexeme> emitter,
+ final Consumer<FastCharacterIterator> filter ) {
+ var index = i.index();
+ var length = i.length();
+ var curr = ' ';
- if( isLetter( curr ) ) {
- isWord = true;
+ while( index < length ) {
+ // Allow filters to skip character sequences (such as XML tags).
+ filter.accept( i );
- final var next = peek( i );
+ index = i.index();
+ curr = i.current();
- if( !isLetter( next ) && !isDigit( next ) ) {
- lexeme = createLexeme( WORD, began, i.getIndex() );
- }
+ var token = PUNCT;
+
+ if( isLetter( curr ) ) {
+ // T1000 is one word, not a word and a number.
+ i.skip( next -> isLetter( next ) || isDigit( next ) );
+ token = WORD;
}
else if( curr == ' ' ) {
- slurp( i, ( next, ci ) -> next == ' ' );
- lexeme = createLexeme( SPACE, began, i.getIndex() );
- }
- else if( curr == '\'' ) {
- lexeme = createLexeme( QUOTE_SINGLE, began, i.getIndex() );
- }
- else if( curr == '"' ) {
- lexeme = createLexeme( QUOTE_DOUBLE, began, i.getIndex() );
+ i.skip( next -> next == ' ' );
+ token = SPACE;
}
- else if( curr == '‘' ) {
- lexeme = createLexeme( QUOTE_SINGLE_OPENING, began, i.getIndex() );
+ else if( curr == '\r' || curr == '\n' ) {
+ final var cr = new int[]{curr == '\r' ? 1 : 0};
+ final var lf = new int[]{curr == '\n' ? 1 : 0};
+
+ // Swallow all consecutive CR (Mac), CRLF (Windows), and/or LF (Unix).
+ i.skip( next -> {
+ cr[ 0 ] += next == '\r' ? 1 : 0;
+ lf[ 0 ] += next == '\n' ? 1 : 0;
+
+ return next == '\r' || next == '\n';
+ } );
+
+ token = cr[ 0 ] + lf[ 0 ] == 1 || cr[ 0 ] == 1 && lf[ 0 ] == 1
+ ? EOL
+ : EOP;
}
- else if( curr == '’' ) {
- lexeme = createLexeme( QUOTE_SINGLE_CLOSING, began, i.getIndex() );
+ else if( isWhitespace( curr ) ) {
+ i.skip( Character::isWhitespace );
+ token = SPACE;
}
- else if( isDigit( curr ) || isNumeric( curr ) && isDigit( peek( i ) ) ) {
+ else if( isDigit( curr ) || isNumeric( curr ) && isDigit( i.peek() ) ) {
// Parse all consecutive number characters to prevent the main loop
// from switching back to word tokens.
- slurp( i, ( next, ci ) ->
- isDigit( next ) || isNumeric( next ) && isDigit( peek( ci ) )
+ i.skip(
+ next -> isDigit( next ) || isNumeric( next ) && isDigit( i.peek() )
);
-
- lexeme = createLexeme( isWord ? WORD : NUMBER, began, i.getIndex() );
+ token = NUMBER;
}
- else if( curr == '-' && peek( i ) != '-' ) {
- lexeme = createLexeme( HYPHEN, began, i.getIndex() );
+ else if( curr == '.' ) {
+ token =
+ i.skip( next -> next == '.' || next == ' ' && i.peek() == '.' ) == 0
+ ? PERIOD
+ : ELLIPSIS;
+ }
+ else if( curr == '"' ) {
+ token = QUOTE_DOUBLE;
+ }
+ else if( curr == '\'' ) {
+ token = QUOTE_SINGLE;
+ }
+ else if( curr == '-' && i.peek() != '-' ) {
+ token = HYPHEN;
}
else if( isDash( curr ) ) {
- slurp( i, ( next, ci ) -> isDash( next ) );
-
- lexeme = createLexeme( DASH, began, i.getIndex() );
+ i.skip( Lexer::isDash );
+ token = DASH;
}
- else if( curr == '.' ) {
- lexeme = createLexeme(
- slurp( i, ( next, ci ) ->
- next == '.' || next == ' ' && peek( ci ) == '.' ) == 0
- ? PERIOD
- : ELLIPSIS,
- began, i.getIndex()
- );
+ else if( curr == '(' || curr == '{' || curr == '[' ) {
+ token = OPENING_GROUP;
}
- else if( curr == '\r' || curr == '\n' ) {
- final var cr = new int[]{curr == '\r' ? 1 : 0};
- final var lf = new int[]{curr == '\n' ? 1 : 0};
-
- // Swallow all consecutive CR (Mac), CRLF (Windows), and/or LF (Unix).
- slurp(
- i, ( next, ci ) -> {
- cr[ 0 ] += next == '\r' ? 1 : 0;
- lf[ 0 ] += next == '\n' ? 1 : 0;
- return next == '\r' || next == '\n';
- }
- );
-
- final var eol = cr[ 0 ] + lf[ 0 ] == 1 || cr[ 0 ] == 1 && lf[ 0 ] == 1;
- lexeme = createLexeme( eol ? EOL : EOP, began, i.getIndex() );
+ else if( curr == ')' || curr == '}' || curr == ']' ) {
+ token = CLOSING_GROUP;
}
- else if( isWhitespace( curr ) ) {
- lexeme = createLexeme( SPACE, began, i.getIndex() );
+ else if( curr == '“' ) {
+ token = QUOTE_DOUBLE_OPENING;
+ }
+ else if( curr == '”' ) {
+ token = QUOTE_DOUBLE_CLOSING;
+ }
+ else if( curr == '‘' ) {
+ token = QUOTE_SINGLE_OPENING;
+ }
+ else if( curr == '’' ) {
+ token = QUOTE_SINGLE_CLOSING;
}
else if( curr == '\\' ) {
- final var next = i.next();
+ i.next();
+ final var next = i.current();
if( next == '\'' ) {
- lexeme = createLexeme( ESC_SINGLE, began, i.getIndex() );
+ token = ESC_SINGLE;
}
else if( next == '\"' ) {
- lexeme = createLexeme( ESC_DOUBLE, began, i.getIndex() );
+ token = ESC_DOUBLE;
}
else {
// Push back the escaped character, which wasn't a straight quote.
- i.previous();
+ i.prev();
}
- }
- else if( curr == '(' || curr == '{' || curr == '[' ) {
- lexeme = createLexeme( OPENING_GROUP, began, i.getIndex() );
- }
- else if( curr == ')' || curr == '}' || curr == ']' ) {
- lexeme = createLexeme( CLOSING_GROUP, began, i.getIndex() );
}
else if( curr == '=' ) {
- lexeme = createLexeme( EQUALS, began, i.getIndex() );
- }
- else if( curr != DONE ) {
- lexeme = createLexeme( PUNCT, began, i.getIndex() );
+ token = EQUALS;
}
- else {
- lexeme = Lexeme.EOT;
+ else if( curr == DONE ) {
+ continue;
}
+
+ assert index >= 0;
+ assert curr != DONE;
+ emitter.accept( new Lexeme( token, index, i.index() + 1 ) );
i.next();
+ index = i.index();
}
- while( lexeme == null );
-
- return lexeme;
- }
-
- /**
- * @param i The {@link CharacterIterator} used to scan through the text, one
- * character at a time.
- * @return {@code true} if any characters were skipped.
- */
- boolean skip( final CharacterIterator i ) {
- return false;
}
curr == '.' || curr == ',' || curr == '-' || curr == '+' ||
curr == '^' || curr == '⅟' || curr == '⁄';
- }
-
- private static char peek( final CharacterIterator ci ) {
- final var ch = ci.next();
- ci.previous();
- return ch;
- }
-
- /**
- * Parse all characters that match a given function.
- *
- * @param ci The iterator containing characters to parse.
- * @param f The function that determines when slurping stops.
- * @return The number of characters parsed.
- */
- protected static int slurp(
- final CharacterIterator ci,
- final BiFunction<Character, CharacterIterator, Boolean> f ) {
- char next;
- int count = 0;
-
- do {
- next = ci.next();
- count++;
- }
- while( f.apply( next, ci ) );
-
- // The loop will have overshot the tally by one character.
- ci.previous();
-
- return --count;
}
}
src/main/java/com/whitemagicsoftware/keenquotes/LexerFilter.java
+package com.whitemagicsoftware.keenquotes;
+
+import java.text.CharacterIterator;
+import java.util.function.Predicate;
+
+/**
+ * A predicate over a {@link CharacterIterator}, intended for deciding
+ * whether the lexer should skip the character sequence at the iterator's
+ * current position.
+ * NOTE(review): the reworked {@code Lexer.lex(...)} takes filters typed as
+ * {@code Consumer<FastCharacterIterator>}, not this {@link Predicate} —
+ * confirm this interface is still referenced anywhere.
+ */
+public interface LexerFilter extends Predicate<CharacterIterator> {
+}
src/main/java/com/whitemagicsoftware/keenquotes/Parser.java
/**
- * Converts a string into an iterable list of {@link Lexeme} instances.
- */
- private final Lexer mLexer;
-
- /**
- * Sets of contractions that help disambiguate single quotes in the text.
- * These are effectively immutable while parsing.
- */
- private final Contractions sContractions;
-
- /**
- * Contains each emitted opening single quote per paragraph.
- */
- private final List<Lexeme> mOpeningSingleQuotes = new ArrayList<>();
-
- /**
- * Contains each emitted closing single quote per paragraph.
- */
- private final List<Lexeme> mClosingSingleQuotes = new ArrayList<>();
-
- /**
- * Contains each emitted opening double quote per paragraph.
- */
- private final List<Lexeme> mOpeningDoubleQuotes = new ArrayList<>();
-
- /**
- * Contains each emitted closing double quote per paragraph.
- */
- private final List<Lexeme> mClosingDoubleQuotes = new ArrayList<>();
-
- /**
- * Constructs a new {@link Parser} using the default contraction sets
- * to help resolve some ambiguous scenarios.
- *
- * @param text The prose to parse, containing zero or more quotation
- * characters.
- * @param contractions Custom sets of contractions to help resolve
- * ambiguities.
- */
- public Parser( final String text, final Contractions contractions ) {
- mText = text;
- mLexer = createLexer( mText );
- sContractions = contractions;
- }
-
- /**
- * Iterates over the entire text provided at construction, emitting
- * {@link Token}s that can be used to convert straight quotes to curly
- * quotes.
- *
- * @param tokenConsumer Receives emitted {@link Token}s.
- */
- public void parse(
- final Consumer<Token> tokenConsumer,
- final Consumer<Lexeme> lexemeConsumer ) {
- final var lexemes = new CircularFifoQueue<Lexeme>( 3 );
-
- // Allow consuming the very first token without needing a queue size check.
- flush( lexemes );
-
- final var unresolved = new ArrayList<Lexeme[]>();
- Lexeme lexeme;
-
- // Create and convert a list of all unambiguous quote characters.
- while( (lexeme = mLexer.next()) != EOT ) {
- // Reset after tokenizing a paragraph.
- if( tokenize( lexeme, lexemes, tokenConsumer, unresolved ) ) {
- // Attempt to resolve any remaining unambiguous quotes.
- resolve( unresolved, tokenConsumer );
-
- // Notify of any unambiguous quotes that could not be resolved.
- unresolved.forEach( lex -> lexemeConsumer.accept( lex[ 1 ] ) );
- unresolved.clear();
- mOpeningSingleQuotes.clear();
- mClosingSingleQuotes.clear();
- mOpeningDoubleQuotes.clear();
- mClosingDoubleQuotes.clear();
- }
- }
-
- // By loop's end, the lexemes list contains tokens for all except the
- // final two elements (from tokenizing in triplets). Tokenize the remaining
- // unprocessed lexemes.
- tokenize( EOT, lexemes, tokenConsumer, unresolved );
- tokenize( EOT, lexemes, tokenConsumer, unresolved );
-
- // Attempt to resolve any remaining unambiguous quotes.
- resolve( unresolved, tokenConsumer );
-
- // Notify of any unambiguous quotes that could not be resolved.
- unresolved.forEach( lex -> lexemeConsumer.accept( lex[ 1 ] ) );
- }
-
- /**
- * Converts {@link Lexeme}s identified as straight quotes into {@link Token}s
- * that represent the curly equivalent. The {@link Token}s are passed to
- * the given {@link Consumer} for further processing (e.g., replaced in
- * the original text being parsed).
- *
- * @param lexeme A part of the text being parsed.
- * @param lexemes A 3-element queue of lexemes that provide sufficient
- * context to identify curly quotes.
- * @param consumer Recipient of equivalent quotes.
- * @param unresolved Rolling list of potentially ambiguous {@link Lexeme}s
- * that could not be tokenized, yet.
- * @return {@code true} if an end-of-paragraph is detected.
- */
- private boolean tokenize( final Lexeme lexeme,
- final CircularFifoQueue<Lexeme> lexemes,
- final Consumer<Token> consumer,
- final List<Lexeme[]> unresolved ) {
- // Add the next lexeme to tokenize into the queue for immediate processing.
- lexemes.add( lexeme );
-
- final var lex1 = lexemes.get( 0 );
- final var lex2 = lexemes.get( 1 );
- final var lex3 = lexemes.get( 2 );
-
- if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( WORD ) &&
- lex1.isType( WORD, PERIOD, NUMBER ) ) {
- // Examples: y'all, Ph.D.'ll, 20's, she's
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
- }
- else if( lex1.isType( QUOTE_SINGLE ) && lex3.isType( QUOTE_SINGLE ) &&
- "n".equalsIgnoreCase( lex2.toString( mText ) ) ) {
- // I.e., 'n'
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
- flush( lexemes );
- truncate( unresolved );
- }
- else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) {
- if( lex3.isType( QUOTE_SINGLE ) ) {
- // E.g., 2''
- consumer.accept(
- new Token( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ) );
- flush( lexemes );
- }
- else {
- // E.g., 2'
- consumer.accept( new Token( QUOTE_PRIME_SINGLE, lex2 ) );
- }
- }
- else if( lex2.isType( QUOTE_DOUBLE ) && lex1.isType( NUMBER ) ) {
- // E.g., 2"
- consumer.accept( new Token( QUOTE_PRIME_DOUBLE, lex2 ) );
- }
- else if( lex2.isType( WORD ) && lex3.isType( QUOTE_SINGLE ) &&
- sContractions.endedUnambiguously( lex2.toString( mText ) ) ) {
- // E.g., thinkin'
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
- flush( lexemes );
- }
- else if( lex2.isType( NUMBER ) && lex1.isType( QUOTE_SINGLE ) ) {
- // Authors must re-write their sentences to avoid starting with numerals.
- if( lex3.isType( SPACE, PUNCT ) || lex3.isType( WORD ) &&
- lex3.toString( mText ).equalsIgnoreCase( "s" ) ) {
- // E.g.: '20s, '02
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
- }
- else {
- // E.g., '2''
- consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex1 ) );
- mOpeningSingleQuotes.add( lex1 );
- }
-
- truncate( unresolved );
- }
- else if( lex2.isType( QUOTE_SINGLE ) &&
- lex1.isType( PUNCT, PERIOD, ELLIPSIS, DASH ) &&
- (lex3.isType( EOL, EOP ) || lex3.isEot()) ) {
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
- mClosingSingleQuotes.add( lex2 );
- }
- else if( lex1.isType( ESC_SINGLE ) ) {
- // E.g., \'
- consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) );
- }
- else if( lex1.isType( ESC_DOUBLE ) ) {
- // E.g., \"
- consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) );
-
- if( lex2.isType( QUOTE_SINGLE ) &&
- (lex3.isEot() || lex3.isType( SPACE, DASH, EOL, EOP )) ) {
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
- mClosingSingleQuotes.add( lex2 );
- }
- }
- else if( lex2.isType( QUOTE_DOUBLE ) &&
- (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_DOUBLE )) &&
- lex3.isType( LAGGING_QUOTE_OPENING_DOUBLE ) ) {
- // E.g.: "", "..., "word, ---"word
- consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) );
- mOpeningDoubleQuotes.add( lex2 );
- }
- else if( lex2.isType( QUOTE_DOUBLE ) &&
- lex1.isType( LEADING_QUOTE_CLOSING_DOUBLE ) &&
- (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) {
- // E.g.: ..."', word"', ?"', word"?
- consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) );
- mClosingDoubleQuotes.add( lex2 );
- }
- else if( lex1.isType( WORD ) && lex2.isType( QUOTE_SINGLE ) &&
- lex3.isType( PUNCT, PERIOD ) ) {
- // E.g., word', (contraction ruled out by previous conditions)
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
- mClosingSingleQuotes.add( lex2 );
- }
- else if( lex1.isType( DASH ) &&
- lex2.isType( QUOTE_SINGLE ) &&
- lex3.isType( QUOTE_DOUBLE ) ) {
- // E.g., ---'"
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
- mClosingSingleQuotes.add( lex2 );
- }
- else if( lex2.isType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
- // After tokenizing, the parser will attempt to resolve ambiguities.
- unresolved.add( new Lexeme[]{lex1, lex2, lex3} );
- }
-
- // Suggest to the caller that resolution should be performed. This allows
- // the algorithm to reset the opening/closing quote balance before the
- // next paragraph is parsed.
- return lex3.isType( EOP );
- }
-
- private void resolve(
- final List<Lexeme[]> unresolved, final Consumer<Token> consumer ) {
- // Some non-emitted tokenized lexemes may be ambiguous.
- final var ambiguousLeadingQuotes = new ArrayList<Lexeme[]>( 16 );
- final var ambiguousLaggingQuotes = new ArrayList<Lexeme[]>( 16 );
- var resolvedLeadingQuotes = 0;
- var resolvedLaggingQuotes = 0;
-
- // Count the number of ambiguous and non-ambiguous open single quotes.
- for( var i = unresolved.iterator(); i.hasNext(); ) {
- final var quotes = i.next();
- final var lex1 = quotes[ 0 ];
- final var lex2 = quotes[ 1 ];
- final var lex3 = quotes[ 2 ];
-
- if( lex2.isType( QUOTE_SINGLE ) ) {
- final var word1 = lex1 == SOT ? "" : lex1.toString( mText );
- final var word3 = lex3 == EOT ? "" : lex3.toString( mText );
-
- if( sContractions.beganAmbiguously( word3 ) ) {
- // E.g., 'Cause
- if( lex1.isType( QUOTE_SINGLE ) ) {
- // E.g., ''Cause
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
- i.remove();
- }
- else {
- // The contraction is uncertain until a closing quote is found that
- // may balance this single quote.
- ambiguousLeadingQuotes.add( quotes );
- }
- }
- else if( sContractions.beganUnambiguously( word3 ) ) {
- // The quote mark forms a word that does not stand alone from its
- // contraction. For example, twas is not a word: it's 'twas.
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
- i.remove();
- }
- else if( lex1.isType( WORD ) &&
- (lex3.isType( PUNCT, PERIOD, ELLIPSIS, DASH, SPACE, EOP ) ||
- lex3.isEot()) && unresolved.indexOf( quotes ) == 0 &&
- resolvedLeadingQuotes == 0 ) {
- // E.g., Tyfós'
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
- i.remove();
- }
- else if( sContractions.endedAmbiguously( word1 ) ) {
- ambiguousLaggingQuotes.add( quotes );
- }
- else if( (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_SINGLE )) &&
- lex3.isType( LAGGING_QUOTE_OPENING_SINGLE ) ) {
- consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) );
- resolvedLeadingQuotes++;
- mOpeningSingleQuotes.add( lex2 );
- i.remove();
- }
- else if( lex1.isType( LEADING_QUOTE_CLOSING_SINGLE ) &&
- (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_SINGLE )) ) {
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
- resolvedLaggingQuotes++;
- mClosingSingleQuotes.add( lex2 );
- i.remove();
- }
- else if( lex3.isType( NUMBER ) ) {
- // E.g., '04
- ambiguousLeadingQuotes.add( quotes );
- }
- }
- }
-
- sort( mOpeningSingleQuotes );
- sort( mClosingSingleQuotes );
- sort( mOpeningDoubleQuotes );
- sort( mClosingDoubleQuotes );
-
- final var singleQuoteEmpty =
- mOpeningSingleQuotes.isEmpty() || mClosingSingleQuotes.isEmpty();
- final var doubleQuoteEmpty =
- mOpeningDoubleQuotes.isEmpty() || mClosingDoubleQuotes.isEmpty();
-
- final var singleQuoteDelta = abs(
- mClosingSingleQuotes.size() - mOpeningSingleQuotes.size()
- );
-
- final var doubleQuoteDelta = abs(
- mClosingDoubleQuotes.size() - mOpeningDoubleQuotes.size()
- );
-
- final var ambiguousLeadingCount = ambiguousLeadingQuotes.size();
- final var ambiguousLaggingCount = ambiguousLaggingQuotes.size();
-
- if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) {
- if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) {
- final var balanced = singleQuoteDelta == 0;
- final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE;
- final var lex = ambiguousLaggingQuotes.get( 0 );
- consumer.accept( new Token( quote, lex[ 1 ] ) );
- unresolved.remove( lex );
- }
- else if( ambiguousLeadingCount == 0 && unresolved.size() == 1 ) {
- // Must be a closing quote.
- final var closing = unresolved.get( 0 );
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
- unresolved.remove( closing );
- }
- }
- else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) {
- // If there are no ambiguous leading quotes then all ambiguous lagging
- // quotes must be contractions.
- ambiguousLaggingQuotes.forEach(
- lex -> {
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex[ 1 ] ) );
- unresolved.remove( lex );
- }
- );
- }
- else if( mOpeningSingleQuotes.size() == 0 &&
- mClosingSingleQuotes.size() == 1 && !unresolved.isEmpty() ) {
- final var opening = unresolved.get( 0 );
- consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
- unresolved.remove( opening );
- }
- else if( ambiguousLeadingCount == 0 ) {
- if( resolvedLaggingQuotes < resolvedLeadingQuotes ) {
- for( final var i = unresolved.iterator(); i.hasNext(); ) {
- final var closing = i.next()[ 1 ];
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing ) );
- i.remove();
- }
- }
- else if( singleQuoteDelta == unresolved.size() ) {
- for( final var i = unresolved.iterator(); i.hasNext(); ) {
- final var closing = i.next();
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
- i.remove();
- }
- }
- else if( unresolved.size() == 2 ) {
- final var closing = unresolved.get( 0 );
- final var opening = unresolved.get( 1 );
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
- consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
-
- // Doesn't affect the algorithm.
- unresolved.clear();
- }
- }
- else if( singleQuoteDelta == 0 && !singleQuoteEmpty ||
- doubleQuoteDelta == 0 && !doubleQuoteEmpty ) {
- // An apostrophe stands betwixt opening/closing single quotes.
- for( final var lexemes = unresolved.iterator(); lexemes.hasNext(); ) {
- final var quote = lexemes.next()[ 1 ];
-
- for( int i = 0; i < mOpeningSingleQuotes.size(); i++ ) {
- // An apostrophe must fall between an open/close pair.
- final var openingQuote = mOpeningSingleQuotes.get( i );
- final var closingQuote = mClosingSingleQuotes.get( i );
-
- if( openingQuote.before( quote ) && closingQuote.after( quote ) ) {
- consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) );
- lexemes.remove();
- }
- }
- }
-
- // An apostrophe stands betwixt opening/closing double quotes.
- final var lexemes = unresolved.iterator();
-
- while( lexemes.hasNext() ) {
- final var quote = lexemes.next()[ 1 ];
-
- // Prevent an index out of bounds exception.
- final var len = Math.min(
- mOpeningDoubleQuotes.size(),
- mClosingDoubleQuotes.size()
- );
-
- for( int i = 0; i < len; i++ ) {
- // An apostrophe must fall between an open/close pair.
- final var openingQuote = mOpeningDoubleQuotes.get( i );
- final var closingQuote = mClosingDoubleQuotes.get( i );
-
- if( openingQuote.before( quote ) && closingQuote.after( quote ) ) {
- consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) );
-
- try {
- lexemes.remove();
- } catch( final Exception ex ) {
- // Weird edge case that hasn't been tracked down. Doesn't affect
- // the unit tests.
- break;
- }
- }
- }
- }
- }
- else if( ambiguousLeadingCount == 1 && resolvedLaggingQuotes == 1 ) {
- final var opening = ambiguousLeadingQuotes.get( 0 );
- consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
- unresolved.remove( opening );
- }
- }
-
- /**
- * Allow subclasses to change the type of {@link Lexer}
- *
- * @param text The text to lex.
- * @return A {@link Lexer} that can split the text into {@link Lexeme}s.
- */
- Lexer createLexer( final String text ) {
- return new Lexer( text );
+ * Sets of contractions that help disambiguate single quotes in the text.
+ * These are effectively immutable while parsing.
+ */
+ private final Contractions sContractions;
+
+ /**
+ * Contains each emitted opening single quote per paragraph.
+ */
+ private final List<Lexeme> mOpeningSingleQuotes = new ArrayList<>();
+
+ /**
+ * Contains each emitted closing single quote per paragraph.
+ */
+ private final List<Lexeme> mClosingSingleQuotes = new ArrayList<>();
+
+ /**
+ * Contains each emitted opening double quote per paragraph.
+ */
+ private final List<Lexeme> mOpeningDoubleQuotes = new ArrayList<>();
+
+ /**
+ * Contains each emitted closing double quote per paragraph.
+ */
+ private final List<Lexeme> mClosingDoubleQuotes = new ArrayList<>();
+
+ /**
+ * Constructs a new {@link Parser} using the given contraction sets
+ * to help resolve some ambiguous scenarios.
+ *
+ * @param text The prose to parse, containing zero or more quotation
+ * characters.
+ * @param contractions Custom sets of contractions to help resolve
+ * ambiguities.
+ */
+ public Parser( final String text, final Contractions contractions ) {
+ mText = text;
+ sContractions = contractions;
+ }
+
+ /**
+ * Iterates over the entire text provided at construction, emitting
+ * {@link Token}s that can be used to convert straight quotes to curly
+ * quotes.
+ *
+ * @param tokenConsumer Receives emitted {@link Token}s.
+ * @param lexemeConsumer Receives any ambiguous {@link Lexeme}s that could
+ *                       not be resolved.
+ */
+ public void parse(
+ final Consumer<Token> tokenConsumer,
+ final Consumer<Lexeme> lexemeConsumer ) {
+ final var lexemes = new CircularFifoQueue<Lexeme>( 3 );
+
+ // Allow consuming the very first token without needing a queue size check.
+ flush( lexemes );
+
+ final var unresolved = new ArrayList<Lexeme[]>();
+
+ // Create and convert a list of all unambiguous quote characters.
+ Lexer.lex( mText, lexeme -> {
+ // Reset after tokenizing a paragraph.
+ if( tokenize( lexeme, lexemes, tokenConsumer, unresolved ) ) {
+ // Attempt to resolve any remaining ambiguous quotes.
+ resolve( unresolved, tokenConsumer );
+
+ // Notify of any ambiguous quotes that could not be resolved.
+ unresolved.forEach( lex -> lexemeConsumer.accept( lex[ 1 ] ) );
+ unresolved.clear();
+ mOpeningSingleQuotes.clear();
+ mClosingSingleQuotes.clear();
+ mOpeningDoubleQuotes.clear();
+ mClosingDoubleQuotes.clear();
+ }
+ });
+
+ // By loop's end, the lexemes list contains tokens for all except the
+ // final two elements (from tokenizing in triplets). Tokenize the remaining
+ // unprocessed lexemes.
+ tokenize( EOT, lexemes, tokenConsumer, unresolved );
+ tokenize( EOT, lexemes, tokenConsumer, unresolved );
+
+ // Attempt to resolve any remaining ambiguous quotes.
+ resolve( unresolved, tokenConsumer );
+
+ // Notify of any ambiguous quotes that could not be resolved.
+ unresolved.forEach( lex -> lexemeConsumer.accept( lex[ 1 ] ) );
+ }
+
+ /**
+ * Converts {@link Lexeme}s identified as straight quotes into {@link Token}s
+ * that represent the curly equivalent. The {@link Token}s are passed to
+ * the given {@link Consumer} for further processing (e.g., replaced in
+ * the original text being parsed).
+ *
+ * @param lexeme A part of the text being parsed.
+ * @param lexemes A 3-element queue of lexemes that provide sufficient
+ * context to identify curly quotes.
+ * @param consumer Recipient of equivalent quotes.
+ * @param unresolved Rolling list of potentially ambiguous {@link Lexeme}s
+ * that could not be tokenized, yet.
+ * @return {@code true} if an end-of-paragraph is detected.
+ */
+ private boolean tokenize( final Lexeme lexeme,
+ final CircularFifoQueue<Lexeme> lexemes,
+ final Consumer<Token> consumer,
+ final List<Lexeme[]> unresolved ) {
+ // Add the next lexeme to tokenize into the queue for immediate processing.
+ lexemes.add( lexeme );
+
+ final var lex1 = lexemes.get( 0 );
+ final var lex2 = lexemes.get( 1 );
+ final var lex3 = lexemes.get( 2 );
+
+ if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( WORD ) &&
+ lex1.isType( WORD, PERIOD, NUMBER ) ) {
+ // Examples: y'all, Ph.D.'ll, 20's, she's
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
+ }
+ else if( lex1.isType( QUOTE_SINGLE ) && lex3.isType( QUOTE_SINGLE ) &&
+ "n".equalsIgnoreCase( lex2.toString( mText ) ) ) {
+ // I.e., 'n'
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
+ flush( lexemes );
+ truncate( unresolved );
+ }
+ else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) {
+ if( lex3.isType( QUOTE_SINGLE ) ) {
+ // E.g., 2''
+ consumer.accept(
+ new Token( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ) );
+ flush( lexemes );
+ }
+ else {
+ // E.g., 2'
+ consumer.accept( new Token( QUOTE_PRIME_SINGLE, lex2 ) );
+ }
+ }
+ else if( lex2.isType( QUOTE_DOUBLE ) && lex1.isType( NUMBER ) ) {
+ // E.g., 2"
+ consumer.accept( new Token( QUOTE_PRIME_DOUBLE, lex2 ) );
+ }
+ else if( lex2.isType( WORD ) && lex3.isType( QUOTE_SINGLE ) &&
+ sContractions.endedUnambiguously( lex2.toString( mText ) ) ) {
+ // E.g., thinkin'
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
+ flush( lexemes );
+ }
+ else if( lex2.isType( NUMBER ) && lex1.isType( QUOTE_SINGLE ) ) {
+ // Authors must re-write their sentences to avoid starting with numerals.
+ if( lex3.isType( SPACE, PUNCT ) || lex3.isType( WORD ) &&
+ lex3.toString( mText ).equalsIgnoreCase( "s" ) ) {
+ // E.g.: '20s, '02
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
+ }
+ else {
+ // E.g., '2''
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex1 ) );
+ mOpeningSingleQuotes.add( lex1 );
+ }
+
+ truncate( unresolved );
+ }
+ else if( lex2.isType( QUOTE_SINGLE ) &&
+ lex1.isType( PUNCT, PERIOD, ELLIPSIS, DASH ) &&
+ (lex3.isType( EOL, EOP ) || lex3.isEot()) ) {
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
+ mClosingSingleQuotes.add( lex2 );
+ }
+ else if( lex1.isType( ESC_SINGLE ) ) {
+ // E.g., \'
+ consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) );
+ }
+ else if( lex1.isType( ESC_DOUBLE ) ) {
+ // E.g., \"
+ consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) );
+
+ if( lex2.isType( QUOTE_SINGLE ) &&
+ (lex3.isEot() || lex3.isType( SPACE, DASH, EOL, EOP )) ) {
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
+ mClosingSingleQuotes.add( lex2 );
+ }
+ }
+ else if( lex2.isType( QUOTE_DOUBLE ) &&
+ (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_DOUBLE )) &&
+ lex3.isType( LAGGING_QUOTE_OPENING_DOUBLE ) ) {
+ // E.g.: "", "..., "word, ---"word
+ consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) );
+ mOpeningDoubleQuotes.add( lex2 );
+ }
+ else if( lex2.isType( QUOTE_DOUBLE ) &&
+ lex1.isType( LEADING_QUOTE_CLOSING_DOUBLE ) &&
+ (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) {
+ // E.g.: ..."', word"', ?"', word"?
+ consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) );
+ mClosingDoubleQuotes.add( lex2 );
+ }
+ else if( lex1.isType( WORD ) && lex2.isType( QUOTE_SINGLE ) &&
+ lex3.isType( PUNCT, PERIOD ) ) {
+ // E.g., word', (contraction ruled out by previous conditions)
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
+ mClosingSingleQuotes.add( lex2 );
+ }
+ else if( lex1.isType( DASH ) &&
+ lex2.isType( QUOTE_SINGLE ) &&
+ lex3.isType( QUOTE_DOUBLE ) ) {
+ // E.g., ---'"
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
+ mClosingSingleQuotes.add( lex2 );
+ }
+ else if( lex2.isType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
+ // After tokenizing, the parser will attempt to resolve ambiguities.
+ unresolved.add( new Lexeme[]{lex1, lex2, lex3} );
+ }
+
+ // Suggest to the caller that resolution should be performed. This allows
+ // the algorithm to reset the opening/closing quote balance before the
+ // next paragraph is parsed.
+ return lex3.isType( EOP );
+ }
+
+ private void resolve(
+ final List<Lexeme[]> unresolved, final Consumer<Token> consumer ) {
+ // Some non-emitted tokenized lexemes may be ambiguous.
+ final var ambiguousLeadingQuotes = new ArrayList<Lexeme[]>( 16 );
+ final var ambiguousLaggingQuotes = new ArrayList<Lexeme[]>( 16 );
+ var resolvedLeadingQuotes = 0;
+ var resolvedLaggingQuotes = 0;
+
+ // Count the number of ambiguous and non-ambiguous open single quotes.
+ for( var i = unresolved.iterator(); i.hasNext(); ) {
+ final var quotes = i.next();
+ final var lex1 = quotes[ 0 ];
+ final var lex2 = quotes[ 1 ];
+ final var lex3 = quotes[ 2 ];
+
+ if( lex2.isType( QUOTE_SINGLE ) ) {
+ final var word1 = lex1 == SOT ? "" : lex1.toString( mText );
+ final var word3 = lex3 == EOT ? "" : lex3.toString( mText );
+
+ if( sContractions.beganAmbiguously( word3 ) ) {
+ // E.g., 'Cause
+ if( lex1.isType( QUOTE_SINGLE ) ) {
+ // E.g., ''Cause
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
+ i.remove();
+ }
+ else {
+ // The contraction is uncertain until a closing quote is found that
+ // may balance this single quote.
+ ambiguousLeadingQuotes.add( quotes );
+ }
+ }
+ else if( sContractions.beganUnambiguously( word3 ) ) {
+ // The quote mark forms a word that does not stand alone from its
+ // contraction. For example, twas is not a word: it's 'twas.
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
+ i.remove();
+ }
+ else if( lex1.isType( WORD ) &&
+ (lex3.isType( PUNCT, PERIOD, ELLIPSIS, DASH, SPACE, EOP ) ||
+ lex3.isEot()) && unresolved.indexOf( quotes ) == 0 &&
+ resolvedLeadingQuotes == 0 ) {
+ // E.g., Tyfós'
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
+ i.remove();
+ }
+ else if( sContractions.endedAmbiguously( word1 ) ) {
+ ambiguousLaggingQuotes.add( quotes );
+ }
+ else if( (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_SINGLE )) &&
+ lex3.isType( LAGGING_QUOTE_OPENING_SINGLE ) ) {
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) );
+ resolvedLeadingQuotes++;
+ mOpeningSingleQuotes.add( lex2 );
+ i.remove();
+ }
+ else if( lex1.isType( LEADING_QUOTE_CLOSING_SINGLE ) &&
+ (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_SINGLE )) ) {
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
+ resolvedLaggingQuotes++;
+ mClosingSingleQuotes.add( lex2 );
+ i.remove();
+ }
+ else if( lex3.isType( NUMBER ) ) {
+ // E.g., '04
+ ambiguousLeadingQuotes.add( quotes );
+ }
+ }
+ }
+
+ sort( mOpeningSingleQuotes );
+ sort( mClosingSingleQuotes );
+ sort( mOpeningDoubleQuotes );
+ sort( mClosingDoubleQuotes );
+
+ final var singleQuoteEmpty =
+ mOpeningSingleQuotes.isEmpty() || mClosingSingleQuotes.isEmpty();
+ final var doubleQuoteEmpty =
+ mOpeningDoubleQuotes.isEmpty() || mClosingDoubleQuotes.isEmpty();
+
+ final var singleQuoteDelta = abs(
+ mClosingSingleQuotes.size() - mOpeningSingleQuotes.size()
+ );
+
+ final var doubleQuoteDelta = abs(
+ mClosingDoubleQuotes.size() - mOpeningDoubleQuotes.size()
+ );
+
+ final var ambiguousLeadingCount = ambiguousLeadingQuotes.size();
+ final var ambiguousLaggingCount = ambiguousLaggingQuotes.size();
+
+ if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) {
+ if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) {
+ final var balanced = singleQuoteDelta == 0;
+ final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE;
+ final var lex = ambiguousLaggingQuotes.get( 0 );
+ consumer.accept( new Token( quote, lex[ 1 ] ) );
+ unresolved.remove( lex );
+ }
+ else if( ambiguousLeadingCount == 0 && unresolved.size() == 1 ) {
+ // Must be a closing quote.
+ final var closing = unresolved.get( 0 );
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
+ unresolved.remove( closing );
+ }
+ }
+ else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) {
+ // If there are no ambiguous leading quotes then all ambiguous lagging
+ // quotes must be contractions.
+ ambiguousLaggingQuotes.forEach(
+ lex -> {
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex[ 1 ] ) );
+ unresolved.remove( lex );
+ }
+ );
+ }
+ else if( mOpeningSingleQuotes.size() == 0 &&
+ mClosingSingleQuotes.size() == 1 && !unresolved.isEmpty() ) {
+ final var opening = unresolved.get( 0 );
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
+ unresolved.remove( opening );
+ }
+ else if( ambiguousLeadingCount == 0 ) {
+ if( resolvedLaggingQuotes < resolvedLeadingQuotes ) {
+ for( final var i = unresolved.iterator(); i.hasNext(); ) {
+ final var closing = i.next()[ 1 ];
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing ) );
+ i.remove();
+ }
+ }
+ else if( singleQuoteDelta == unresolved.size() ) {
+ for( final var i = unresolved.iterator(); i.hasNext(); ) {
+ final var closing = i.next();
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
+ i.remove();
+ }
+ }
+ else if( unresolved.size() == 2 ) {
+ final var closing = unresolved.get( 0 );
+ final var opening = unresolved.get( 1 );
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
+
+ // Doesn't affect the algorithm.
+ unresolved.clear();
+ }
+ }
+ else if( singleQuoteDelta == 0 && !singleQuoteEmpty ||
+ doubleQuoteDelta == 0 && !doubleQuoteEmpty ) {
+ // An apostrophe stands betwixt opening/closing single quotes.
+ for( final var lexemes = unresolved.iterator(); lexemes.hasNext(); ) {
+ final var quote = lexemes.next()[ 1 ];
+
+ for( int i = 0; i < mOpeningSingleQuotes.size(); i++ ) {
+ // An apostrophe must fall between an open/close pair.
+ final var openingQuote = mOpeningSingleQuotes.get( i );
+ final var closingQuote = mClosingSingleQuotes.get( i );
+
+ if( openingQuote.before( quote ) && closingQuote.after( quote ) ) {
+ consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) );
+ lexemes.remove();
+ }
+ }
+ }
+
+ // An apostrophe stands betwixt opening/closing double quotes.
+ final var lexemes = unresolved.iterator();
+
+ while( lexemes.hasNext() ) {
+ final var quote = lexemes.next()[ 1 ];
+
+ // Prevent an index out of bounds exception.
+ final var len = Math.min(
+ mOpeningDoubleQuotes.size(),
+ mClosingDoubleQuotes.size()
+ );
+
+ for( int i = 0; i < len; i++ ) {
+ // An apostrophe must fall between an open/close pair.
+ final var openingQuote = mOpeningDoubleQuotes.get( i );
+ final var closingQuote = mClosingDoubleQuotes.get( i );
+
+ if( openingQuote.before( quote ) && closingQuote.after( quote ) ) {
+ consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) );
+
+ try {
+ lexemes.remove();
+ } catch( final Exception ex ) {
+ // Weird edge case that hasn't been tracked down. Doesn't affect
+ // the unit tests.
+ break;
+ }
+ }
+ }
+ }
+ }
+ else if( ambiguousLeadingCount == 1 && resolvedLaggingQuotes == 1 ) {
+ final var opening = ambiguousLeadingQuotes.get( 0 );
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
+ unresolved.remove( opening );
+ }
}
src/main/java/com/whitemagicsoftware/keenquotes/XmlFilter.java
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.whitemagicsoftware.keenquotes;
+
+import java.text.CharacterIterator;
+import java.util.Set;
+import java.util.function.Consumer;
+
+/**
+ * Responsible for lexing text while ignoring XML elements. The document must
+ * be both sane and well-formed. This is not intended to lex documents in the
+ * wild where the user has injected custom HTML. The lexer will fail if the
+ * angle brackets are not balanced. Additionally, any less than or greater than
+ * symbols must be encoded as {@code &lt;} or {@code &gt;}, respectively.
+ */
+final class XmlFilter implements Consumer<FastCharacterIterator> {
+
+ /**
+ * Elements that indicate preformatted text, intentional straight quotes.
+ */
+ private final Set<String> UNTOUCHABLE = Set.of(
+ "pre",
+ "code",
+ "tt",
+ "tex",
+ "kbd",
+ "samp",
+ "var",
+ "l",
+ "blockcode"
+ );
+
+ public XmlFilter() {}
+
+ /**
+ * Skip XML tags found within the prose, which hides the elements.
+ */
+ @Override
+ public void accept( final FastCharacterIterator i ) {
+ if( i.current() == '<' ) {
+ final var openingTag = nextTag( i );
+
+ if( UNTOUCHABLE.contains( openingTag.toLowerCase() ) ) {
+ String closingTag;
+
+ do {
+ closingTag = nextTag( i );
+ }
+ while( !closingTag.endsWith( openingTag ) );
+ }
+ }
+ }
+
+ /**
+ * Skips to the next greater than or less than symbol.
+ *
+ * @param i The {@link FastCharacterIterator} used to scan through the text,
+ *          one
+ * character at a time.
+ * @return An opening/closing tag name, or the content within the element.
+ */
+ private String nextTag( final FastCharacterIterator i ) {
+ final var begin = i.index();
+
+ i.skip( next -> next != '>' && next != '<' );
+
+ // Swallow the trailing greater than symbol.
+ i.next();
+
+ // Skip to the character following the greater than symbol.
+ i.next();
+
+ return i.substring( begin + 1, i.index() - 1 );
+ }
+}
src/main/java/com/whitemagicsoftware/keenquotes/XmlLexer.java
-/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
-package com.whitemagicsoftware.keenquotes;
-
-import java.text.CharacterIterator;
-import java.util.Set;
-
-import static java.text.CharacterIterator.DONE;
-
-/**
- * Responsible for lexing text while ignoring XML elements. The document must
- * be both sane and well-formed. This is not intended to lex documents in the
- * wild where the user has injected custom HTML. The lexer will fail if the
- * angle brackets are not balanced. Additionally, any less than or greater than
- * symbols must be encoded as {@code &lt;} or {@code &gt;}, respectively.
- */
-final class XmlLexer extends Lexer {
-
- /**
- * Referenced when skipping text, to determine whether lexing has found
- * an untouchable element, such as preformatted text.
- */
- private final String mText;
-
- /**
- * Elements that indicate preformatted text, intentional straight quotes.
- */
- private final Set<String> UNTOUCHABLE = Set.of(
- "pre",
- "code",
- "tt",
- "tex",
- "kbd",
- "samp",
- "var",
- "l",
- "blockcode"
- );
-
- /**
- * Constructs a {@link Lexer} capable of turning text into {@link Lexeme}s.
- *
- * @param text The text to lex.
- */
- XmlLexer( final String text ) {
- super( text );
-
- mText = text;
- }
-
- /**
- * Skip (do not emit) XML tags found within the prose. This effectively hides
- * the element.
- */
- @Override
- boolean skip( final CharacterIterator i ) {
- final var match = i.current() == '<';
-
- if( match ) {
- final var openingTag = nextTag( i );
-
- if( UNTOUCHABLE.contains( openingTag.toLowerCase() ) ) {
- String closingTag;
-
- do {
- closingTag = nextTag( i );
- }
- while( !closingTag.endsWith( openingTag ) && i.current() != DONE );
- }
- }
-
- return match;
- }
-
- /**
- * Skips to the next greater than or less than symbol.
- *
- * @param i The {@link CharacterIterator} used to scan through the text, one
- * character at a time.
- * @return An opening/closing tag name, or the content within the element.
- */
- private String nextTag( final CharacterIterator i ) {
- final var begin = i.getIndex();
-
- slurp( i, ( next, ci ) -> next != '>' && next != '<' );
-
- // Swallow the trailing greater than symbol.
- i.next();
-
- // Skip to the character following the greater than symbol.
- i.next();
-
- return mText.substring( begin + 1, i.getIndex() - 1 );
- }
-}
src/main/java/com/whitemagicsoftware/keenquotes/XmlParser.java
super( text, contractions );
}
-
- @Override
- public Lexer createLexer( final String text ) {
- return new XmlLexer( text );
- }
}
src/test/java/com/whitemagicsoftware/keenquotes/LexerTest.java
import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.BiFunction;
+import java.util.function.Consumer;
-import static com.whitemagicsoftware.keenquotes.Lexeme.EOT;
import static com.whitemagicsoftware.keenquotes.LexemeType.*;
import static java.util.Arrays.asList;
testType( "!", PUNCT );
testType( ";", PUNCT );
+ testType( "\\", PUNCT );
testType( ".", PERIOD );
testType( "-", HYPHEN );
testType( "'", QUOTE_SINGLE );
testType( "\"", QUOTE_DOUBLE );
+ testType( "‘", QUOTE_SINGLE_OPENING );
+ testType( "’", QUOTE_SINGLE_CLOSING );
+ testType( "“", QUOTE_DOUBLE_OPENING );
+ testType( "”", QUOTE_DOUBLE_CLOSING );
testType( "3 o'clock", NUMBER, SPACE, WORD, QUOTE_SINGLE, WORD );
}
final String actual, final LexemeType... expected ) {
final var list = asList( expected );
- final var lexer = createLexer( actual );
- testType( lexer, actual, ( lexeme, text ) -> lexeme.getType(), list );
+ testType( actual, ( lexeme, text ) -> lexeme.getType(), list );
+ }
+
+ static void testType(
+ final String actual,
+ final Consumer<FastCharacterIterator> filter,
+ final LexemeType... expected ) {
+ final var list = asList( expected );
+ testType( actual, ( lexeme, text ) -> lexeme.getType(), filter, list );
}
static void testText(
final String actual, final String... expected ) {
- final var lexer = createLexer( actual );
- testType( lexer, actual, Lexeme::toString, asList( expected ) );
+ testType( actual, Lexeme::toString, asList( expected ) );
}
- static void testType(
- final Lexer lexer, final String actual, final LexemeType... expected ) {
- final var list = asList( expected );
- testType( lexer, actual, ( lexeme, text ) -> lexeme.getType(), list );
+ private static <A, E> void testType(
+ final String text,
+ final BiFunction<Lexeme, String, A> f,
+ final List<E> elements ) {
+ testType( text, f, filter -> {}, elements );
}
private static <A, E> void testType(
- final Lexer lexer,
final String text,
final BiFunction<Lexeme, String, A> f,
+ final Consumer<FastCharacterIterator> filter,
final List<E> elements ) {
- var counter = 0;
-
- Lexeme lexeme;
+ var counter = new AtomicInteger();
- while( (lexeme = lexer.next()) != EOT ) {
- final var expected = elements.get( counter++ );
+ Lexer.lex( text, lexeme -> {
+ final var expected = elements.get( counter.getAndIncrement() );
final var actual = f.apply( lexeme, text );
assertEquals( expected, actual );
- }
+ }, filter );
// Ensure all expected values are matched (verify end of text reached).
- assertEquals( elements.size(), counter );
- }
-
- static Lexer createLexer( final String text ) {
- return new Lexer( text );
+ assertEquals( elements.size(), counter.get() );
}
}
src/test/java/com/whitemagicsoftware/keenquotes/XmlFilterTest.java
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.whitemagicsoftware.keenquotes;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.function.Consumer;
+
+import static com.whitemagicsoftware.keenquotes.LexemeType.*;
+import static com.whitemagicsoftware.keenquotes.LexerTest.testType;
+
+/**
+ * Test that parsing XML documents ignores elements.
+ */
+final class XmlFilterTest {
+
+ @Test
+ void test_Lexing_Xml_EmitTags() {
+ final var actual =
+ "A <em>world's</em> aflame <pre><code>ch = '\\''</code></pre>.";
+ testType(
+ actual,
+ createXmlFilter(),
+ // A world ' s aflame .
+ WORD, SPACE, WORD, QUOTE_SINGLE, WORD, SPACE, WORD, SPACE, PERIOD
+ );
+ }
+
+ @Test
+ void test_Lexing_XmlAttribute_EmitTags() {
+ final var actual = "<a href=\"http://x.org\">X11</a>";
+ testType( actual, createXmlFilter(), WORD );
+ }
+
+ static Consumer<FastCharacterIterator> createXmlFilter() {
+ return new XmlFilter();
+ }
+}
src/test/java/com/whitemagicsoftware/keenquotes/XmlLexerTest.java
-/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
-package com.whitemagicsoftware.keenquotes;
-
-import org.junit.jupiter.api.Test;
-
-import static com.whitemagicsoftware.keenquotes.LexemeType.*;
-import static com.whitemagicsoftware.keenquotes.LexerTest.testType;
-
-/**
- * Test that parsing XML documents ignores elements.
- */
-final class XmlLexerTest {
-
- @Test
- void test_Lexing_Xml_EmitTags() {
- final var actual =
- "A <em>world's</em> aflame <pre><code>ch = '\\''</code></pre>.";
- testType(
- createXmlLexer( actual ), actual,
- WORD, SPACE, WORD, QUOTE_SINGLE, WORD, SPACE, WORD, SPACE, PERIOD
- );
- }
-
- @Test
- void test_Lexing_XmlAttribute_EmitTags() {
- final var actual = "<a href=\"http://x.org\">X11</a>";
- testType( createXmlLexer( actual ), actual, WORD );
- }
-
- static Lexer createXmlLexer( final String text ) {
- return new XmlLexer( text );
- }
-}
src/test/resources/com/whitemagicsoftware/keenquotes/smartypants.txt
Sam&apos;s Sams&apos; and the Ross&apos;s roses&apos; thorns were prickly.
-"You 'cause---" all fifteen years' worth.
-&ldquo;You &apos;cause---&rdquo; all fifteen years&apos; worth.
-
# ########################################################################
# Inside contractions (no leading/trailing apostrophes)
"Jarvis, sir? Why, him as 'listed some years ago, and fought under
&ldquo;Jarvis, sir? Why, him as 'listed some years ago, and fought under
+
+"You 'cause---" all fifteen years' worth.
+&ldquo;You &apos;cause---&rdquo; all fifteen years' worth.
Delta936 lines added, 752 lines removed, 184-line increase