Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Performance improvements, add tests, simplify lexer

Author: Dave Jarvis <email>
Date: 2022-08-13 14:50:26 GMT-0700
Commit: 769e65ed7b07c0ef1243cac9746a6a7e06ae438b
Parent: 93c7eff
build.gradle
id 'application'
id 'com.palantir.git-version' version '0.12.3'
+ id "me.champeau.jmh" version "0.6.6"
}
implementation 'info.picocli:picocli:4.6.3'
- testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
- testImplementation 'org.junit.jupiter:junit-jupiter-params:5.8.2'
+ testImplementation 'org.junit.jupiter:junit-jupiter-api:5.9.0'
+ testImplementation 'org.junit.jupiter:junit-jupiter-params:5.9.0'
testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
}
src/jmh/java/com/whitemagicsoftware/keenquotes/StringIterationBenchmark.java
+package com.whitemagicsoftware.keenquotes;
+
+import org.openjdk.jmh.annotations.Benchmark;
+
+import java.text.StringCharacterIterator;
+import java.util.StringTokenizer;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static java.lang.Math.random;
+import static java.text.CharacterIterator.DONE;
+
+/**
+ * JMH benchmarks comparing several strategies for visiting every character
+ * of a large string. Each benchmark generates its own 1 MiB random string,
+ * iterates it once, and asserts (under {@code -ea}) that all characters
+ * were visited.
+ */
+@SuppressWarnings( "unused" )
+public class StringIterationBenchmark {
+ // Characters used to build the random test string.
+ // NOTE(review): the lowercase run reads "...uvxyz" — the letter 'w' is
+ // missing. Harmless for iteration benchmarks, but confirm it's intended.
+ private final static String CHARSET =
+ "ABCDEFGHIJKLM NOPQRSTUVWXYZ abcdefghijklm nopqrstuvxyz 01234 5 6789";
+ // Number of characters in each generated test string (1 MiB).
+ public final static int STRLEN = 1024 * 1024;
+
+ /**
+ * Builds a string of {@link #STRLEN} characters drawn at random from
+ * {@link #CHARSET}.
+ *
+ * @return A randomly generated string, {@link #STRLEN} characters long.
+ */
+ private static String generateText() {
+ final var sb = new StringBuilder( STRLEN );
+ final var len = CHARSET.length();
+
+ for( var i = 0; i < STRLEN; i++ ) {
+ sb.append( CHARSET.charAt( (int) (len * random()) ) );
+ }
+
+ return sb.toString();
+ }
+
+ /**
+ * Baseline: plain index-based {@link String#charAt(int)} loop.
+ */
+ @Benchmark
+ public void test_CharAtIterator() {
+ final var s = generateText();
+ final var length = s.length();
+ var index = 0;
+
+ while( index < length ) {
+ final var ch = s.charAt( index );
+ index++;
+ }
+
+ assert index == length;
+ }
+
+ /**
+ * Iterates via {@link FastCharacterIterator} until {@code current()}
+ * reports {@code DONE}.
+ */
+ @Benchmark
+ public void test_FastCharacterIterator() {
+ final var s = generateText();
+ final var i = new FastCharacterIterator( s );
+
+ char c = ' ';
+
+ // NOTE(review): next() is called before current(), so the character at
+ // index 0 is never read — fine for a throughput benchmark, but worth
+ // confirming the loop shape is intentional.
+ while( c != DONE ) {
+ i.next();
+ c = i.current();
+ }
+
+ assert i.index() == STRLEN;
+ }
+
+ /**
+ * Iterates via {@link StringCharacterIterator#next()} for comparison.
+ */
+ @Benchmark
+ public void test_StringCharacterIterator() {
+ final var s = generateText();
+ final var i = new StringCharacterIterator( s );
+ var index = 0;
+
+ char c = ' ';
+
+ while( c != DONE ) {
+ c = i.next();
+ index++;
+ }
+
+ assert index == STRLEN;
+ }
+
+ /**
+ * Copies the string into a {@code char[]} and iterates with an enhanced
+ * for loop.
+ */
+ @Benchmark
+ public void test_CharArrayIterator() {
+ final var s = generateText();
+ final var i = s.toCharArray();
+ var index = 0;
+
+ for( final var ch : i ) {
+ index++;
+ }
+
+ assert index == STRLEN;
+ }
+
+ /**
+ * Tokenizes on spaces (returning delimiters as tokens) and sums token
+ * lengths, which must equal the full string length.
+ */
+ @Benchmark
+ public void test_StringTokenizer() {
+ final var s = generateText();
+ final var i = new StringTokenizer( s, " ", true );
+ var index = 0;
+
+ while( i.hasMoreTokens() ) {
+ final var token = i.nextToken();
+ index += token.length();
+ }
+
+ assert index == STRLEN;
+ }
+
+ /**
+ * Iterates via {@link String#chars()}, counting code points with an
+ * {@link AtomicInteger} (required because the lambda cannot capture a
+ * mutable local).
+ */
+ @Benchmark
+ public void test_StreamIterator() {
+ final var s = generateText();
+ final var index = new AtomicInteger();
+
+ s.chars().forEach( codepoint -> {
+ final var ch = Character.valueOf( (char) codepoint );
+ index.incrementAndGet();
+ } );
+
+ assert index.get() == STRLEN;
+ }
+}
src/main/java/com/whitemagicsoftware/keenquotes/Contractions.java
"i",
// of|letter o
- "o",
+ "o"
);
src/main/java/com/whitemagicsoftware/keenquotes/FastCharacterIterator.java
+/* Copyright 2022 White Magic Software, Ltd. -- All rights reserved. */
+package com.whitemagicsoftware.keenquotes;
+
+import java.text.CharacterIterator;
+import java.text.StringCharacterIterator;
+import java.util.function.Function;
+
+import static java.text.CharacterIterator.DONE;
+
+/**
+ * Iterates over a string, much like {@link CharacterIterator}, but faster.
+ * This class gets 53 ops/s vs. 49 ops/s for {@link StringCharacterIterator}.
+ * In comparison, using unconstrained {@link String#charAt(int)} calls yields
+ * 57 ops/s.
+ * <p>
+ * <strong>Caution:</strong> This class offers minimal bounds checking to eke
+ * out some efficiency.
+ * </p>
+ */
+public class FastCharacterIterator {
+
+ // The string being iterated; never null.
+ private final String mS;
+ // Cached length of mS, to avoid repeated length() calls.
+ private final int mLen;
+
+ /**
+ * Starts at 0, not guaranteed to be within bounds.
+ */
+ private int mPos;
+
+ /**
+ * Constructs a new iterator that can advance through the given string
+ * one character at a time.
+ *
+ * @param s The string to iterate.
+ */
+ public FastCharacterIterator( final String s ) {
+ mS = s;
+ mLen = s.length();
+ }
+
+ /**
+ * Returns the total number of characters in the string to iterate.
+ *
+ * @return The string length.
+ */
+ public int length() {
+ return mLen;
+ }
+
+ /**
+ * Returns the iterated index. The return value is not guaranteed to be
+ * within the string bounds.
+ *
+ * @return The iterated index.
+ */
+ public int index() {
+ return mPos;
+ }
+
+ /**
+ * Returns the character at the currently iterated position in the string.
+ * This method performs bounds checking.
+ *
+ * @return {@link CharacterIterator#DONE} if there are no more characters.
+ */
+ public char current() {
+ return hasNext() ? mS.charAt( mPos ) : DONE;
+ }
+
+ /**
+ * Advances to the next character in the string, without bounds checking.
+ */
+ public void next() {
+ mPos++;
+ }
+
+ /**
+ * Advances to the previous character in the string, without bounds checking.
+ */
+ public void prev() {
+ --mPos;
+ }
+
+ /**
+ * Returns the character at index + 1 (i.e., the character after the one
+ * {@link #current()} reports) without consuming it. Multiple consecutive
+ * calls to this method will return the same value. This method performs
+ * bounds checking.
+ *
+ * @return {@link CharacterIterator#DONE} if there are no more characters.
+ */
+ public char peek() {
+ final var pos = mPos;
+ return pos + 1 < mLen ? mS.charAt( pos + 1 ) : DONE;
+ }
+
+ /**
+ * Answers whether the internal index is within the string's bounds,
+ * meaning that {@link #current()} will return a character rather than
+ * {@link CharacterIterator#DONE}.
+ *
+ * @return {@code true} if there are more characters to iterate.
+ */
+ public boolean hasNext() {
+ return mPos < mLen;
+ }
+
+ /**
+ * Parse all characters that match a given function. The iterator always
+ * advances at least once (do/while), with the function first applied to
+ * the character following the starting position.
+ *
+ * @param f The function that determines when slurping stops.
+ * @return The number of characters parsed.
+ */
+ public int skip( final Function<Character, Boolean> f ) {
+ assert f != null;
+
+ final var began = index();
+
+ do {
+ next();
+ }
+ while( f.apply( current() ) );
+
+ // The loop always overshoots by one character.
+ prev();
+
+ return index() - began;
+ }
+
+ /**
+ * Creates a string from a subset of consecutive characters in the string
+ * being iterated. The calling class is responsible for bounds-checking.
+ * NOTE(review): the {@code ended < mLen} assertion is stricter than
+ * {@link String#substring(int, int)}, which permits {@code ended == length};
+ * as written, a substring can never end at the final character — confirm
+ * this is intended.
+ *
+ * @param began The starting index, must be greater than or equal to zero
+ * and less than or equal to {@code ended}.
+ * @param ended The ending index, must be less than the string length.
+ * @return A substring of the iterated string.
+ * @throws IndexOutOfBoundsException Either or both parameters exceed the
+ * string's boundaries.
+ */
+ public String substring( final int began, final int ended ) {
+ assert began >= 0;
+ assert began <= ended;
+ assert ended < mLen;
+
+ return mS.substring( began, ended );
+ }
+}
src/main/java/com/whitemagicsoftware/keenquotes/Lexeme.java
* @param ended Offset into the text where this instance stops (0-based).
*/
- private Lexeme( final LexemeType type, final int began, final int ended ) {
+ public Lexeme( final LexemeType type, final int began, final int ended ) {
assert type != null;
assert began >= 0 || ended == E_INDEX;
- assert ended >= began || ended == E_INDEX;
+ assert began <= ended || ended == E_INDEX;
mType = type;
mBegan = began;
- mEnded = ended + 1;
- }
-
- static Lexeme createLexeme(
- final LexemeType lexeme, final int began, final int ended ) {
- assert lexeme != null;
- return new Lexeme( lexeme, began, ended );
+ mEnded = ended;
}
src/main/java/com/whitemagicsoftware/keenquotes/LexemeType.java
QUOTE_SINGLE_CLOSING,
QUOTE_DOUBLE,
+ QUOTE_DOUBLE_OPENING,
+ QUOTE_DOUBLE_CLOSING,
ESC_SINGLE,
ESC_DOUBLE,
src/main/java/com/whitemagicsoftware/keenquotes/Lexer.java
package com.whitemagicsoftware.keenquotes;
-import java.text.CharacterIterator;
-import java.text.StringCharacterIterator;
-import java.util.function.BiFunction;
+import java.util.function.Consumer;
-import static com.whitemagicsoftware.keenquotes.Lexeme.createLexeme;
import static com.whitemagicsoftware.keenquotes.LexemeType.*;
import static java.lang.Character.isWhitespace;
import static java.text.CharacterIterator.DONE;
/**
* Turns text into words, numbers, punctuation, spaces, and more.
*/
public class Lexer {
- /**
- * Iterates over the entire string of text to help produce lexemes.
- */
- private final CharacterIterator mIterator;
-
- /**
- * Constructs a {@link Lexer} capable of turning text int {@link Lexeme}s.
- *
- * @param text The text to lex.
- */
- Lexer( final String text ) {
- mIterator = new StringCharacterIterator( text );
- }
-
- Lexeme next() {
- return parse( mIterator );
- }
-
/**
* Tokenizes a sequence of characters. The order of comparisons is optimized
* towards probability of the occurrence of a character in regular English
* prose: letters, space, quotation marks, numbers, periods, new lines,
* then end of text.
*
- * @param i The sequence of characters to tokenize.
- * @return The next token in the sequence.
+ * @param text The sequence of characters to tokenize.
*/
- private Lexeme parse( final CharacterIterator i ) {
- int began = i.getIndex();
- boolean isWord = false;
- Lexeme lexeme = null;
+ public static void lex(
+ final String text,
+ final Consumer<Lexeme> emitter ) {
+ lex( text, emitter, filter -> {} );
+ }
- do {
- // Allow subclasses to skip character sequences. This allows XML tags
- // to be skipped.
- while( skip( i ) ) {
- began = i.getIndex();
- }
+ public static void lex(
+ final String text,
+ final Consumer<Lexeme> emitter,
+ final Consumer<FastCharacterIterator> filter ) {
+ lex( new FastCharacterIterator( text ), emitter, filter );
+ }
- final var curr = i.current();
+ public static void lex(
+ final FastCharacterIterator i,
+ final Consumer<Lexeme> emitter,
+ final Consumer<FastCharacterIterator> filter ) {
+ var index = i.index();
+ var length = i.length();
+ var curr = ' ';
- if( isLetter( curr ) ) {
- isWord = true;
+ while( index < length ) {
+ // Allow filters to skip character sequences (such as XML tags).
+ filter.accept( i );
- final var next = peek( i );
+ index = i.index();
+ curr = i.current();
- if( !isLetter( next ) && !isDigit( next ) ) {
- lexeme = createLexeme( WORD, began, i.getIndex() );
- }
+ var token = PUNCT;
+
+ if( isLetter( curr ) ) {
+ // T1000 is one word, not a word and a number.
+ i.skip( next -> isLetter( next ) || isDigit( next ) );
+ token = WORD;
}
else if( curr == ' ' ) {
- slurp( i, ( next, ci ) -> next == ' ' );
- lexeme = createLexeme( SPACE, began, i.getIndex() );
- }
- else if( curr == '\'' ) {
- lexeme = createLexeme( QUOTE_SINGLE, began, i.getIndex() );
- }
- else if( curr == '"' ) {
- lexeme = createLexeme( QUOTE_DOUBLE, began, i.getIndex() );
+ i.skip( next -> next == ' ' );
+ token = SPACE;
}
- else if( curr == '‘' ) {
- lexeme = createLexeme( QUOTE_SINGLE_OPENING, began, i.getIndex() );
+ else if( curr == '\r' || curr == '\n' ) {
+ final var cr = new int[]{curr == '\r' ? 1 : 0};
+ final var lf = new int[]{curr == '\n' ? 1 : 0};
+
+ // Swallow all consecutive CR (Mac), CRLF (Windows), and/or LF (Unix).
+ i.skip( next -> {
+ cr[ 0 ] += next == '\r' ? 1 : 0;
+ lf[ 0 ] += next == '\n' ? 1 : 0;
+
+ return next == '\r' || next == '\n';
+ } );
+
+ token = cr[ 0 ] + lf[ 0 ] == 1 || cr[ 0 ] == 1 && lf[ 0 ] == 1
+ ? EOL
+ : EOP;
}
- else if( curr == '’' ) {
- lexeme = createLexeme( QUOTE_SINGLE_CLOSING, began, i.getIndex() );
+ else if( isWhitespace( curr ) ) {
+ i.skip( Character::isWhitespace );
+ token = SPACE;
}
- else if( isDigit( curr ) || isNumeric( curr ) && isDigit( peek( i ) ) ) {
+ else if( isDigit( curr ) || isNumeric( curr ) && isDigit( i.peek() ) ) {
// Parse all consecutive number characters to prevent the main loop
// from switching back to word tokens.
- slurp( i, ( next, ci ) ->
- isDigit( next ) || isNumeric( next ) && isDigit( peek( ci ) )
+ i.skip(
+ next -> isDigit( next ) || isNumeric( next ) && isDigit( i.peek() )
);
-
- lexeme = createLexeme( isWord ? WORD : NUMBER, began, i.getIndex() );
+ token = NUMBER;
}
- else if( curr == '-' && peek( i ) != '-' ) {
- lexeme = createLexeme( HYPHEN, began, i.getIndex() );
+ else if( curr == '.' ) {
+ token =
+ i.skip( next -> next == '.' || next == ' ' && i.peek() == '.' ) == 0
+ ? PERIOD
+ : ELLIPSIS;
+ }
+ else if( curr == '"' ) {
+ token = QUOTE_DOUBLE;
+ }
+ else if( curr == '\'' ) {
+ token = QUOTE_SINGLE;
+ }
+ else if( curr == '-' && i.peek() != '-' ) {
+ token = HYPHEN;
}
else if( isDash( curr ) ) {
- slurp( i, ( next, ci ) -> isDash( next ) );
-
- lexeme = createLexeme( DASH, began, i.getIndex() );
+ i.skip( Lexer::isDash );
+ token = DASH;
}
- else if( curr == '.' ) {
- lexeme = createLexeme(
- slurp( i, ( next, ci ) ->
- next == '.' || next == ' ' && peek( ci ) == '.' ) == 0
- ? PERIOD
- : ELLIPSIS,
- began, i.getIndex()
- );
+ else if( curr == '(' || curr == '{' || curr == '[' ) {
+ token = OPENING_GROUP;
}
- else if( curr == '\r' || curr == '\n' ) {
- final var cr = new int[]{curr == '\r' ? 1 : 0};
- final var lf = new int[]{curr == '\n' ? 1 : 0};
-
- // Swallow all consecutive CR (Mac), CRLF (Windows), and/or LF (Unix).
- slurp(
- i, ( next, ci ) -> {
- cr[ 0 ] += next == '\r' ? 1 : 0;
- lf[ 0 ] += next == '\n' ? 1 : 0;
- return next == '\r' || next == '\n';
- }
- );
-
- final var eol = cr[ 0 ] + lf[ 0 ] == 1 || cr[ 0 ] == 1 && lf[ 0 ] == 1;
- lexeme = createLexeme( eol ? EOL : EOP, began, i.getIndex() );
+ else if( curr == ')' || curr == '}' || curr == ']' ) {
+ token = CLOSING_GROUP;
}
- else if( isWhitespace( curr ) ) {
- lexeme = createLexeme( SPACE, began, i.getIndex() );
+ else if( curr == '“' ) {
+ token = QUOTE_DOUBLE_OPENING;
+ }
+ else if( curr == '”' ) {
+ token = QUOTE_DOUBLE_CLOSING;
+ }
+ else if( curr == '‘' ) {
+ token = QUOTE_SINGLE_OPENING;
+ }
+ else if( curr == '’' ) {
+ token = QUOTE_SINGLE_CLOSING;
}
else if( curr == '\\' ) {
- final var next = i.next();
+ i.next();
+ final var next = i.current();
if( next == '\'' ) {
- lexeme = createLexeme( ESC_SINGLE, began, i.getIndex() );
+ token = ESC_SINGLE;
}
else if( next == '\"' ) {
- lexeme = createLexeme( ESC_DOUBLE, began, i.getIndex() );
+ token = ESC_DOUBLE;
}
else {
// Push back the escaped character, which wasn't a straight quote.
- i.previous();
+ i.prev();
}
- }
- else if( curr == '(' || curr == '{' || curr == '[' ) {
- lexeme = createLexeme( OPENING_GROUP, began, i.getIndex() );
- }
- else if( curr == ')' || curr == '}' || curr == ']' ) {
- lexeme = createLexeme( CLOSING_GROUP, began, i.getIndex() );
}
else if( curr == '=' ) {
- lexeme = createLexeme( EQUALS, began, i.getIndex() );
- }
- else if( curr != DONE ) {
- lexeme = createLexeme( PUNCT, began, i.getIndex() );
+ token = EQUALS;
}
- else {
- lexeme = Lexeme.EOT;
+ else if( curr == DONE ) {
+ continue;
}
+
+ assert index >= 0;
+ assert curr != DONE;
+ emitter.accept( new Lexeme( token, index, i.index() + 1 ) );
i.next();
+ index = i.index();
}
- while( lexeme == null );
-
- return lexeme;
- }
-
- /**
- * @param i The {@link CharacterIterator} used to scan through the text, one
- * character at a time.
- * @return {@code true} if any characters were skipped.
- */
- boolean skip( final CharacterIterator i ) {
- return false;
}
curr == '.' || curr == ',' || curr == '-' || curr == '+' ||
curr == '^' || curr == '⅟' || curr == '⁄';
- }
-
- private static char peek( final CharacterIterator ci ) {
- final var ch = ci.next();
- ci.previous();
- return ch;
- }
-
- /**
- * Parse all characters that match a given function.
- *
- * @param ci The iterator containing characters to parse.
- * @param f The function that determines when slurping stops.
- * @return The number of characters parsed.
- */
- protected static int slurp(
- final CharacterIterator ci,
- final BiFunction<Character, CharacterIterator, Boolean> f ) {
- char next;
- int count = 0;
-
- do {
- next = ci.next();
- count++;
- }
- while( f.apply( next, ci ) );
-
- // The loop will have overshot the tally by one character.
- ci.previous();
-
- return --count;
}
}
src/main/java/com/whitemagicsoftware/keenquotes/LexerFilter.java
+package com.whitemagicsoftware.keenquotes;
+
+import java.text.CharacterIterator;
+import java.util.function.Predicate;
+
+/**
+ * A predicate over a {@link CharacterIterator}, intended for deciding
+ * whether the lexer should skip the character sequence at the iterator's
+ * current position.
+ * NOTE(review): the reworked {@code Lexer.lex(...)} takes filters typed as
+ * {@code Consumer<FastCharacterIterator>}, not this {@link Predicate} —
+ * confirm this interface is still referenced anywhere.
+ */
+public interface LexerFilter extends Predicate<CharacterIterator> {
+}
src/main/java/com/whitemagicsoftware/keenquotes/Parser.java
/**
- * Converts a string into an iterable list of {@link Lexeme} instances.
- */
- private final Lexer mLexer;
-
- /**
- * Sets of contractions that help disambiguate single quotes in the text.
- * These are effectively immutable while parsing.
- */
- private final Contractions sContractions;
-
- /**
- * Contains each emitted opening single quote per paragraph.
- */
- private final List<Lexeme> mOpeningSingleQuotes = new ArrayList<>();
-
- /**
- * Contains each emitted closing single quote per paragraph.
- */
- private final List<Lexeme> mClosingSingleQuotes = new ArrayList<>();
-
- /**
- * Contains each emitted opening double quote per paragraph.
- */
- private final List<Lexeme> mOpeningDoubleQuotes = new ArrayList<>();
-
- /**
- * Contains each emitted closing double quote per paragraph.
- */
- private final List<Lexeme> mClosingDoubleQuotes = new ArrayList<>();
-
- /**
- * Constructs a new {@link Parser} using the default contraction sets
- * to help resolve some ambiguous scenarios.
- *
- * @param text The prose to parse, containing zero or more quotation
- * characters.
- * @param contractions Custom sets of contractions to help resolve
- * ambiguities.
- */
- public Parser( final String text, final Contractions contractions ) {
- mText = text;
- mLexer = createLexer( mText );
- sContractions = contractions;
- }
-
- /**
- * Iterates over the entire text provided at construction, emitting
- * {@link Token}s that can be used to convert straight quotes to curly
- * quotes.
- *
- * @param tokenConsumer Receives emitted {@link Token}s.
- */
- public void parse(
- final Consumer<Token> tokenConsumer,
- final Consumer<Lexeme> lexemeConsumer ) {
- final var lexemes = new CircularFifoQueue<Lexeme>( 3 );
-
- // Allow consuming the very first token without needing a queue size check.
- flush( lexemes );
-
- final var unresolved = new ArrayList<Lexeme[]>();
- Lexeme lexeme;
-
- // Create and convert a list of all unambiguous quote characters.
- while( (lexeme = mLexer.next()) != EOT ) {
- // Reset after tokenizing a paragraph.
- if( tokenize( lexeme, lexemes, tokenConsumer, unresolved ) ) {
- // Attempt to resolve any remaining unambiguous quotes.
- resolve( unresolved, tokenConsumer );
-
- // Notify of any unambiguous quotes that could not be resolved.
- unresolved.forEach( lex -> lexemeConsumer.accept( lex[ 1 ] ) );
- unresolved.clear();
- mOpeningSingleQuotes.clear();
- mClosingSingleQuotes.clear();
- mOpeningDoubleQuotes.clear();
- mClosingDoubleQuotes.clear();
- }
- }
-
- // By loop's end, the lexemes list contains tokens for all except the
- // final two elements (from tokenizing in triplets). Tokenize the remaining
- // unprocessed lexemes.
- tokenize( EOT, lexemes, tokenConsumer, unresolved );
- tokenize( EOT, lexemes, tokenConsumer, unresolved );
-
- // Attempt to resolve any remaining unambiguous quotes.
- resolve( unresolved, tokenConsumer );
-
- // Notify of any unambiguous quotes that could not be resolved.
- unresolved.forEach( lex -> lexemeConsumer.accept( lex[ 1 ] ) );
- }
-
- /**
- * Converts {@link Lexeme}s identified as straight quotes into {@link Token}s
- * that represent the curly equivalent. The {@link Token}s are passed to
- * the given {@link Consumer} for further processing (e.g., replaced in
- * the original text being parsed).
- *
- * @param lexeme A part of the text being parsed.
- * @param lexemes A 3-element queue of lexemes that provide sufficient
- * context to identify curly quotes.
- * @param consumer Recipient of equivalent quotes.
- * @param unresolved Rolling list of potentially ambiguous {@link Lexeme}s
- * that could not be tokenized, yet.
- * @return {@code true} if an end-of-paragraph is detected.
- */
- private boolean tokenize( final Lexeme lexeme,
- final CircularFifoQueue<Lexeme> lexemes,
- final Consumer<Token> consumer,
- final List<Lexeme[]> unresolved ) {
- // Add the next lexeme to tokenize into the queue for immediate processing.
- lexemes.add( lexeme );
-
- final var lex1 = lexemes.get( 0 );
- final var lex2 = lexemes.get( 1 );
- final var lex3 = lexemes.get( 2 );
-
- if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( WORD ) &&
- lex1.isType( WORD, PERIOD, NUMBER ) ) {
- // Examples: y'all, Ph.D.'ll, 20's, she's
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
- }
- else if( lex1.isType( QUOTE_SINGLE ) && lex3.isType( QUOTE_SINGLE ) &&
- "n".equalsIgnoreCase( lex2.toString( mText ) ) ) {
- // I.e., 'n'
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
- flush( lexemes );
- truncate( unresolved );
- }
- else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) {
- if( lex3.isType( QUOTE_SINGLE ) ) {
- // E.g., 2''
- consumer.accept(
- new Token( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ) );
- flush( lexemes );
- }
- else {
- // E.g., 2'
- consumer.accept( new Token( QUOTE_PRIME_SINGLE, lex2 ) );
- }
- }
- else if( lex2.isType( QUOTE_DOUBLE ) && lex1.isType( NUMBER ) ) {
- // E.g., 2"
- consumer.accept( new Token( QUOTE_PRIME_DOUBLE, lex2 ) );
- }
- else if( lex2.isType( WORD ) && lex3.isType( QUOTE_SINGLE ) &&
- sContractions.endedUnambiguously( lex2.toString( mText ) ) ) {
- // E.g., thinkin'
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
- flush( lexemes );
- }
- else if( lex2.isType( NUMBER ) && lex1.isType( QUOTE_SINGLE ) ) {
- // Authors must re-write their sentences to avoid starting with numerals.
- if( lex3.isType( SPACE, PUNCT ) || lex3.isType( WORD ) &&
- lex3.toString( mText ).equalsIgnoreCase( "s" ) ) {
- // E.g.: '20s, '02
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
- }
- else {
- // E.g., '2''
- consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex1 ) );
- mOpeningSingleQuotes.add( lex1 );
- }
-
- truncate( unresolved );
- }
- else if( lex2.isType( QUOTE_SINGLE ) &&
- lex1.isType( PUNCT, PERIOD, ELLIPSIS, DASH ) &&
- (lex3.isType( EOL, EOP ) || lex3.isEot()) ) {
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
- mClosingSingleQuotes.add( lex2 );
- }
- else if( lex1.isType( ESC_SINGLE ) ) {
- // E.g., \'
- consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) );
- }
- else if( lex1.isType( ESC_DOUBLE ) ) {
- // E.g., \"
- consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) );
-
- if( lex2.isType( QUOTE_SINGLE ) &&
- (lex3.isEot() || lex3.isType( SPACE, DASH, EOL, EOP )) ) {
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
- mClosingSingleQuotes.add( lex2 );
- }
- }
- else if( lex2.isType( QUOTE_DOUBLE ) &&
- (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_DOUBLE )) &&
- lex3.isType( LAGGING_QUOTE_OPENING_DOUBLE ) ) {
- // E.g.: "", "..., "word, ---"word
- consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) );
- mOpeningDoubleQuotes.add( lex2 );
- }
- else if( lex2.isType( QUOTE_DOUBLE ) &&
- lex1.isType( LEADING_QUOTE_CLOSING_DOUBLE ) &&
- (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) {
- // E.g.: ..."', word"', ?"', word"?
- consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) );
- mClosingDoubleQuotes.add( lex2 );
- }
- else if( lex1.isType( WORD ) && lex2.isType( QUOTE_SINGLE ) &&
- lex3.isType( PUNCT, PERIOD ) ) {
- // E.g., word', (contraction ruled out by previous conditions)
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
- mClosingSingleQuotes.add( lex2 );
- }
- else if( lex1.isType( DASH ) &&
- lex2.isType( QUOTE_SINGLE ) &&
- lex3.isType( QUOTE_DOUBLE ) ) {
- // E.g., ---'"
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
- mClosingSingleQuotes.add( lex2 );
- }
- else if( lex2.isType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
- // After tokenizing, the parser will attempt to resolve ambiguities.
- unresolved.add( new Lexeme[]{lex1, lex2, lex3} );
- }
-
- // Suggest to the caller that resolution should be performed. This allows
- // the algorithm to reset the opening/closing quote balance before the
- // next paragraph is parsed.
- return lex3.isType( EOP );
- }
-
- private void resolve(
- final List<Lexeme[]> unresolved, final Consumer<Token> consumer ) {
- // Some non-emitted tokenized lexemes may be ambiguous.
- final var ambiguousLeadingQuotes = new ArrayList<Lexeme[]>( 16 );
- final var ambiguousLaggingQuotes = new ArrayList<Lexeme[]>( 16 );
- var resolvedLeadingQuotes = 0;
- var resolvedLaggingQuotes = 0;
-
- // Count the number of ambiguous and non-ambiguous open single quotes.
- for( var i = unresolved.iterator(); i.hasNext(); ) {
- final var quotes = i.next();
- final var lex1 = quotes[ 0 ];
- final var lex2 = quotes[ 1 ];
- final var lex3 = quotes[ 2 ];
-
- if( lex2.isType( QUOTE_SINGLE ) ) {
- final var word1 = lex1 == SOT ? "" : lex1.toString( mText );
- final var word3 = lex3 == EOT ? "" : lex3.toString( mText );
-
- if( sContractions.beganAmbiguously( word3 ) ) {
- // E.g., 'Cause
- if( lex1.isType( QUOTE_SINGLE ) ) {
- // E.g., ''Cause
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
- i.remove();
- }
- else {
- // The contraction is uncertain until a closing quote is found that
- // may balance this single quote.
- ambiguousLeadingQuotes.add( quotes );
- }
- }
- else if( sContractions.beganUnambiguously( word3 ) ) {
- // The quote mark forms a word that does not stand alone from its
- // contraction. For example, twas is not a word: it's 'twas.
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
- i.remove();
- }
- else if( lex1.isType( WORD ) &&
- (lex3.isType( PUNCT, PERIOD, ELLIPSIS, DASH, SPACE, EOP ) ||
- lex3.isEot()) && unresolved.indexOf( quotes ) == 0 &&
- resolvedLeadingQuotes == 0 ) {
- // E.g., Tyfós'
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
- i.remove();
- }
- else if( sContractions.endedAmbiguously( word1 ) ) {
- ambiguousLaggingQuotes.add( quotes );
- }
- else if( (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_SINGLE )) &&
- lex3.isType( LAGGING_QUOTE_OPENING_SINGLE ) ) {
- consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) );
- resolvedLeadingQuotes++;
- mOpeningSingleQuotes.add( lex2 );
- i.remove();
- }
- else if( lex1.isType( LEADING_QUOTE_CLOSING_SINGLE ) &&
- (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_SINGLE )) ) {
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
- resolvedLaggingQuotes++;
- mClosingSingleQuotes.add( lex2 );
- i.remove();
- }
- else if( lex3.isType( NUMBER ) ) {
- // E.g., '04
- ambiguousLeadingQuotes.add( quotes );
- }
- }
- }
-
- sort( mOpeningSingleQuotes );
- sort( mClosingSingleQuotes );
- sort( mOpeningDoubleQuotes );
- sort( mClosingDoubleQuotes );
-
- final var singleQuoteEmpty =
- mOpeningSingleQuotes.isEmpty() || mClosingSingleQuotes.isEmpty();
- final var doubleQuoteEmpty =
- mOpeningDoubleQuotes.isEmpty() || mClosingDoubleQuotes.isEmpty();
-
- final var singleQuoteDelta = abs(
- mClosingSingleQuotes.size() - mOpeningSingleQuotes.size()
- );
-
- final var doubleQuoteDelta = abs(
- mClosingDoubleQuotes.size() - mOpeningDoubleQuotes.size()
- );
-
- final var ambiguousLeadingCount = ambiguousLeadingQuotes.size();
- final var ambiguousLaggingCount = ambiguousLaggingQuotes.size();
-
- if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) {
- if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) {
- final var balanced = singleQuoteDelta == 0;
- final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE;
- final var lex = ambiguousLaggingQuotes.get( 0 );
- consumer.accept( new Token( quote, lex[ 1 ] ) );
- unresolved.remove( lex );
- }
- else if( ambiguousLeadingCount == 0 && unresolved.size() == 1 ) {
- // Must be a closing quote.
- final var closing = unresolved.get( 0 );
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
- unresolved.remove( closing );
- }
- }
- else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) {
- // If there are no ambiguous leading quotes then all ambiguous lagging
- // quotes must be contractions.
- ambiguousLaggingQuotes.forEach(
- lex -> {
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex[ 1 ] ) );
- unresolved.remove( lex );
- }
- );
- }
- else if( mOpeningSingleQuotes.size() == 0 &&
- mClosingSingleQuotes.size() == 1 && !unresolved.isEmpty() ) {
- final var opening = unresolved.get( 0 );
- consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
- unresolved.remove( opening );
- }
- else if( ambiguousLeadingCount == 0 ) {
- if( resolvedLaggingQuotes < resolvedLeadingQuotes ) {
- for( final var i = unresolved.iterator(); i.hasNext(); ) {
- final var closing = i.next()[ 1 ];
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing ) );
- i.remove();
- }
- }
- else if( singleQuoteDelta == unresolved.size() ) {
- for( final var i = unresolved.iterator(); i.hasNext(); ) {
- final var closing = i.next();
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
- i.remove();
- }
- }
- else if( unresolved.size() == 2 ) {
- final var closing = unresolved.get( 0 );
- final var opening = unresolved.get( 1 );
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
- consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
-
- // Doesn't affect the algorithm.
- unresolved.clear();
- }
- }
- else if( singleQuoteDelta == 0 && !singleQuoteEmpty ||
- doubleQuoteDelta == 0 && !doubleQuoteEmpty ) {
- // An apostrophe stands betwixt opening/closing single quotes.
- for( final var lexemes = unresolved.iterator(); lexemes.hasNext(); ) {
- final var quote = lexemes.next()[ 1 ];
-
- for( int i = 0; i < mOpeningSingleQuotes.size(); i++ ) {
- // An apostrophe must fall between an open/close pair.
- final var openingQuote = mOpeningSingleQuotes.get( i );
- final var closingQuote = mClosingSingleQuotes.get( i );
-
- if( openingQuote.before( quote ) && closingQuote.after( quote ) ) {
- consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) );
- lexemes.remove();
- }
- }
- }
-
- // An apostrophe stands betwixt opening/closing double quotes.
- final var lexemes = unresolved.iterator();
-
- while( lexemes.hasNext() ) {
- final var quote = lexemes.next()[ 1 ];
-
- // Prevent an index out of bounds exception.
- final var len = Math.min(
- mOpeningDoubleQuotes.size(),
- mClosingDoubleQuotes.size()
- );
-
- for( int i = 0; i < len; i++ ) {
- // An apostrophe must fall between an open/close pair.
- final var openingQuote = mOpeningDoubleQuotes.get( i );
- final var closingQuote = mClosingDoubleQuotes.get( i );
-
- if( openingQuote.before( quote ) && closingQuote.after( quote ) ) {
- consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) );
-
- try {
- lexemes.remove();
- } catch( final Exception ex ) {
- // Weird edge case that hasn't been tracked down. Doesn't affect
- // the unit tests.
- break;
- }
- }
- }
- }
- }
- else if( ambiguousLeadingCount == 1 && resolvedLaggingQuotes == 1 ) {
- final var opening = ambiguousLeadingQuotes.get( 0 );
- consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
- unresolved.remove( opening );
- }
- }
-
- /**
- * Allow subclasses to change the type of {@link Lexer}
- *
- * @param text The text to lex.
- * @return A {@link Lexer} that can split the text into {@link Lexeme}s.
- */
- Lexer createLexer( final String text ) {
- return new Lexer( text );
+ * Sets of contractions that help disambiguate single quotes in the text.
+ * These are effectively immutable while parsing.
+ */
+ private final Contractions sContractions;
+
+ /**
+ * Contains each emitted opening single quote per paragraph.
+ */
+ private final List<Lexeme> mOpeningSingleQuotes = new ArrayList<>();
+
+ /**
+ * Contains each emitted closing single quote per paragraph.
+ */
+ private final List<Lexeme> mClosingSingleQuotes = new ArrayList<>();
+
+ /**
+ * Contains each emitted opening double quote per paragraph.
+ */
+ private final List<Lexeme> mOpeningDoubleQuotes = new ArrayList<>();
+
+ /**
+ * Contains each emitted closing double quote per paragraph.
+ */
+ private final List<Lexeme> mClosingDoubleQuotes = new ArrayList<>();
+
+ /**
+ * Constructs a new {@link Parser} using the given contraction sets
+ * to help resolve some ambiguous scenarios.
+ *
+ * @param text The prose to parse, containing zero or more quotation
+ * characters.
+ * @param contractions Custom sets of contractions to help resolve
+ * ambiguities.
+ */
+ public Parser( final String text, final Contractions contractions ) {
+ mText = text;
+ sContractions = contractions;
+ }
+
+ /**
+ * Iterates over the entire text provided at construction, emitting
+ * {@link Token}s that can be used to convert straight quotes to curly
+ * quotes.
+ *
+ * @param tokenConsumer Receives emitted {@link Token}s.
+ * @param lexemeConsumer Receives any ambiguous {@link Lexeme}s that could
+ *                       not be resolved.
+ */
+ public void parse(
+ final Consumer<Token> tokenConsumer,
+ final Consumer<Lexeme> lexemeConsumer ) {
+ final var lexemes = new CircularFifoQueue<Lexeme>( 3 );
+
+ // Allow consuming the very first token without needing a queue size check.
+ flush( lexemes );
+
+ final var unresolved = new ArrayList<Lexeme[]>();
+
+ // Create and convert a list of all unambiguous quote characters.
+ Lexer.lex( mText, lexeme -> {
+ // Reset after tokenizing a paragraph.
+ if( tokenize( lexeme, lexemes, tokenConsumer, unresolved ) ) {
+ // Attempt to resolve any remaining ambiguous quotes.
+ resolve( unresolved, tokenConsumer );
+
+ // Notify of any ambiguous quotes that could not be resolved.
+ unresolved.forEach( lex -> lexemeConsumer.accept( lex[ 1 ] ) );
+ unresolved.clear();
+ mOpeningSingleQuotes.clear();
+ mClosingSingleQuotes.clear();
+ mOpeningDoubleQuotes.clear();
+ mClosingDoubleQuotes.clear();
+ }
+ });
+
+ // By loop's end, the lexemes list contains tokens for all except the
+ // final two elements (from tokenizing in triplets). Tokenize the remaining
+ // unprocessed lexemes.
+ tokenize( EOT, lexemes, tokenConsumer, unresolved );
+ tokenize( EOT, lexemes, tokenConsumer, unresolved );
+
+ // Attempt to resolve any remaining ambiguous quotes.
+ resolve( unresolved, tokenConsumer );
+
+ // Notify of any ambiguous quotes that could not be resolved.
+ unresolved.forEach( lex -> lexemeConsumer.accept( lex[ 1 ] ) );
+ }
+
+ /**
+ * Converts {@link Lexeme}s identified as straight quotes into {@link Token}s
+ * that represent the curly equivalent. The {@link Token}s are passed to
+ * the given {@link Consumer} for further processing (e.g., replaced in
+ * the original text being parsed).
+ *
+ * @param lexeme A part of the text being parsed.
+ * @param lexemes A 3-element queue of lexemes that provide sufficient
+ * context to identify curly quotes.
+ * @param consumer Recipient of equivalent quotes.
+ * @param unresolved Rolling list of potentially ambiguous {@link Lexeme}s
+ * that could not be tokenized, yet.
+ * @return {@code true} if an end-of-paragraph is detected.
+ */
+ private boolean tokenize( final Lexeme lexeme,
+ final CircularFifoQueue<Lexeme> lexemes,
+ final Consumer<Token> consumer,
+ final List<Lexeme[]> unresolved ) {
+ // Add the next lexeme to tokenize into the queue for immediate processing.
+ lexemes.add( lexeme );
+
+ final var lex1 = lexemes.get( 0 );
+ final var lex2 = lexemes.get( 1 );
+ final var lex3 = lexemes.get( 2 );
+
+ if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( WORD ) &&
+ lex1.isType( WORD, PERIOD, NUMBER ) ) {
+ // Examples: y'all, Ph.D.'ll, 20's, she's
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
+ }
+ else if( lex1.isType( QUOTE_SINGLE ) && lex3.isType( QUOTE_SINGLE ) &&
+ "n".equalsIgnoreCase( lex2.toString( mText ) ) ) {
+ // I.e., 'n'
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
+ flush( lexemes );
+ truncate( unresolved );
+ }
+ else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) {
+ if( lex3.isType( QUOTE_SINGLE ) ) {
+ // E.g., 2''
+ consumer.accept(
+ new Token( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ) );
+ flush( lexemes );
+ }
+ else {
+ // E.g., 2'
+ consumer.accept( new Token( QUOTE_PRIME_SINGLE, lex2 ) );
+ }
+ }
+ else if( lex2.isType( QUOTE_DOUBLE ) && lex1.isType( NUMBER ) ) {
+ // E.g., 2"
+ consumer.accept( new Token( QUOTE_PRIME_DOUBLE, lex2 ) );
+ }
+ else if( lex2.isType( WORD ) && lex3.isType( QUOTE_SINGLE ) &&
+ sContractions.endedUnambiguously( lex2.toString( mText ) ) ) {
+ // E.g., thinkin'
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
+ flush( lexemes );
+ }
+ else if( lex2.isType( NUMBER ) && lex1.isType( QUOTE_SINGLE ) ) {
+ // Authors must re-write their sentences to avoid starting with numerals.
+ if( lex3.isType( SPACE, PUNCT ) || lex3.isType( WORD ) &&
+ lex3.toString( mText ).equalsIgnoreCase( "s" ) ) {
+ // E.g.: '20s, '02
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
+ }
+ else {
+ // E.g., '2''
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex1 ) );
+ mOpeningSingleQuotes.add( lex1 );
+ }
+
+ truncate( unresolved );
+ }
+ else if( lex2.isType( QUOTE_SINGLE ) &&
+ lex1.isType( PUNCT, PERIOD, ELLIPSIS, DASH ) &&
+ (lex3.isType( EOL, EOP ) || lex3.isEot()) ) {
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
+ mClosingSingleQuotes.add( lex2 );
+ }
+ else if( lex1.isType( ESC_SINGLE ) ) {
+ // E.g., \'
+ consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) );
+ }
+ else if( lex1.isType( ESC_DOUBLE ) ) {
+ // E.g., \"
+ consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) );
+
+ if( lex2.isType( QUOTE_SINGLE ) &&
+ (lex3.isEot() || lex3.isType( SPACE, DASH, EOL, EOP )) ) {
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
+ mClosingSingleQuotes.add( lex2 );
+ }
+ }
+ else if( lex2.isType( QUOTE_DOUBLE ) &&
+ (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_DOUBLE )) &&
+ lex3.isType( LAGGING_QUOTE_OPENING_DOUBLE ) ) {
+ // E.g.: "", "..., "word, ---"word
+ consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) );
+ mOpeningDoubleQuotes.add( lex2 );
+ }
+ else if( lex2.isType( QUOTE_DOUBLE ) &&
+ lex1.isType( LEADING_QUOTE_CLOSING_DOUBLE ) &&
+ (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) {
+ // E.g.: ..."', word"', ?"', word"?
+ consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) );
+ mClosingDoubleQuotes.add( lex2 );
+ }
+ else if( lex1.isType( WORD ) && lex2.isType( QUOTE_SINGLE ) &&
+ lex3.isType( PUNCT, PERIOD ) ) {
+ // E.g., word', (contraction ruled out by previous conditions)
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
+ mClosingSingleQuotes.add( lex2 );
+ }
+ else if( lex1.isType( DASH ) &&
+ lex2.isType( QUOTE_SINGLE ) &&
+ lex3.isType( QUOTE_DOUBLE ) ) {
+ // E.g., ---'"
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
+ mClosingSingleQuotes.add( lex2 );
+ }
+ else if( lex2.isType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
+ // After tokenizing, the parser will attempt to resolve ambiguities.
+ unresolved.add( new Lexeme[]{lex1, lex2, lex3} );
+ }
+
+ // Suggest to the caller that resolution should be performed. This allows
+ // the algorithm to reset the opening/closing quote balance before the
+ // next paragraph is parsed.
+ return lex3.isType( EOP );
+ }
+
+ private void resolve(
+ final List<Lexeme[]> unresolved, final Consumer<Token> consumer ) {
+ // Some non-emitted tokenized lexemes may be ambiguous.
+ final var ambiguousLeadingQuotes = new ArrayList<Lexeme[]>( 16 );
+ final var ambiguousLaggingQuotes = new ArrayList<Lexeme[]>( 16 );
+ var resolvedLeadingQuotes = 0;
+ var resolvedLaggingQuotes = 0;
+
+ // Count the number of ambiguous and non-ambiguous open single quotes.
+ for( var i = unresolved.iterator(); i.hasNext(); ) {
+ final var quotes = i.next();
+ final var lex1 = quotes[ 0 ];
+ final var lex2 = quotes[ 1 ];
+ final var lex3 = quotes[ 2 ];
+
+ if( lex2.isType( QUOTE_SINGLE ) ) {
+ final var word1 = lex1 == SOT ? "" : lex1.toString( mText );
+ final var word3 = lex3 == EOT ? "" : lex3.toString( mText );
+
+ if( sContractions.beganAmbiguously( word3 ) ) {
+ // E.g., 'Cause
+ if( lex1.isType( QUOTE_SINGLE ) ) {
+ // E.g., ''Cause
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
+ i.remove();
+ }
+ else {
+ // The contraction is uncertain until a closing quote is found that
+ // may balance this single quote.
+ ambiguousLeadingQuotes.add( quotes );
+ }
+ }
+ else if( sContractions.beganUnambiguously( word3 ) ) {
+ // The quote mark forms a word that does not stand alone from its
+ // contraction. For example, twas is not a word: it's 'twas.
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
+ i.remove();
+ }
+ else if( lex1.isType( WORD ) &&
+ (lex3.isType( PUNCT, PERIOD, ELLIPSIS, DASH, SPACE, EOP ) ||
+ lex3.isEot()) && unresolved.indexOf( quotes ) == 0 &&
+ resolvedLeadingQuotes == 0 ) {
+ // E.g., Tyfós'
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
+ i.remove();
+ }
+ else if( sContractions.endedAmbiguously( word1 ) ) {
+ ambiguousLaggingQuotes.add( quotes );
+ }
+ else if( (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_SINGLE )) &&
+ lex3.isType( LAGGING_QUOTE_OPENING_SINGLE ) ) {
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) );
+ resolvedLeadingQuotes++;
+ mOpeningSingleQuotes.add( lex2 );
+ i.remove();
+ }
+ else if( lex1.isType( LEADING_QUOTE_CLOSING_SINGLE ) &&
+ (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_SINGLE )) ) {
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
+ resolvedLaggingQuotes++;
+ mClosingSingleQuotes.add( lex2 );
+ i.remove();
+ }
+ else if( lex3.isType( NUMBER ) ) {
+ // E.g., '04
+ ambiguousLeadingQuotes.add( quotes );
+ }
+ }
+ }
+
+ sort( mOpeningSingleQuotes );
+ sort( mClosingSingleQuotes );
+ sort( mOpeningDoubleQuotes );
+ sort( mClosingDoubleQuotes );
+
+ final var singleQuoteEmpty =
+ mOpeningSingleQuotes.isEmpty() || mClosingSingleQuotes.isEmpty();
+ final var doubleQuoteEmpty =
+ mOpeningDoubleQuotes.isEmpty() || mClosingDoubleQuotes.isEmpty();
+
+ final var singleQuoteDelta = abs(
+ mClosingSingleQuotes.size() - mOpeningSingleQuotes.size()
+ );
+
+ final var doubleQuoteDelta = abs(
+ mClosingDoubleQuotes.size() - mOpeningDoubleQuotes.size()
+ );
+
+ final var ambiguousLeadingCount = ambiguousLeadingQuotes.size();
+ final var ambiguousLaggingCount = ambiguousLaggingQuotes.size();
+
+ if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) {
+ if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) {
+ final var balanced = singleQuoteDelta == 0;
+ final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE;
+ final var lex = ambiguousLaggingQuotes.get( 0 );
+ consumer.accept( new Token( quote, lex[ 1 ] ) );
+ unresolved.remove( lex );
+ }
+ else if( ambiguousLeadingCount == 0 && unresolved.size() == 1 ) {
+ // Must be a closing quote.
+ final var closing = unresolved.get( 0 );
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
+ unresolved.remove( closing );
+ }
+ }
+ else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) {
+ // If there are no ambiguous leading quotes then all ambiguous lagging
+ // quotes must be contractions.
+ ambiguousLaggingQuotes.forEach(
+ lex -> {
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex[ 1 ] ) );
+ unresolved.remove( lex );
+ }
+ );
+ }
+ else if( mOpeningSingleQuotes.size() == 0 &&
+ mClosingSingleQuotes.size() == 1 && !unresolved.isEmpty() ) {
+ final var opening = unresolved.get( 0 );
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
+ unresolved.remove( opening );
+ }
+ else if( ambiguousLeadingCount == 0 ) {
+ if( resolvedLaggingQuotes < resolvedLeadingQuotes ) {
+ for( final var i = unresolved.iterator(); i.hasNext(); ) {
+ final var closing = i.next()[ 1 ];
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing ) );
+ i.remove();
+ }
+ }
+ else if( singleQuoteDelta == unresolved.size() ) {
+ for( final var i = unresolved.iterator(); i.hasNext(); ) {
+ final var closing = i.next();
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
+ i.remove();
+ }
+ }
+ else if( unresolved.size() == 2 ) {
+ final var closing = unresolved.get( 0 );
+ final var opening = unresolved.get( 1 );
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
+
+ // Doesn't affect the algorithm.
+ unresolved.clear();
+ }
+ }
+ else if( singleQuoteDelta == 0 && !singleQuoteEmpty ||
+ doubleQuoteDelta == 0 && !doubleQuoteEmpty ) {
+ // An apostrophe stands betwixt opening/closing single quotes.
+ for( final var lexemes = unresolved.iterator(); lexemes.hasNext(); ) {
+ final var quote = lexemes.next()[ 1 ];
+
+ for( int i = 0; i < mOpeningSingleQuotes.size(); i++ ) {
+ // An apostrophe must fall between an open/close pair.
+ final var openingQuote = mOpeningSingleQuotes.get( i );
+ final var closingQuote = mClosingSingleQuotes.get( i );
+
+ if( openingQuote.before( quote ) && closingQuote.after( quote ) ) {
+ consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) );
+ lexemes.remove();
+ }
+ }
+ }
+
+ // An apostrophe stands betwixt opening/closing double quotes.
+ final var lexemes = unresolved.iterator();
+
+ while( lexemes.hasNext() ) {
+ final var quote = lexemes.next()[ 1 ];
+
+ // Prevent an index out of bounds exception.
+ final var len = Math.min(
+ mOpeningDoubleQuotes.size(),
+ mClosingDoubleQuotes.size()
+ );
+
+ for( int i = 0; i < len; i++ ) {
+ // An apostrophe must fall between an open/close pair.
+ final var openingQuote = mOpeningDoubleQuotes.get( i );
+ final var closingQuote = mClosingDoubleQuotes.get( i );
+
+ if( openingQuote.before( quote ) && closingQuote.after( quote ) ) {
+ consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) );
+
+ try {
+ lexemes.remove();
+ } catch( final Exception ex ) {
+ // Weird edge case that hasn't been tracked down. Doesn't affect
+ // the unit tests.
+ break;
+ }
+ }
+ }
+ }
+ }
+ else if( ambiguousLeadingCount == 1 && resolvedLaggingQuotes == 1 ) {
+ final var opening = ambiguousLeadingQuotes.get( 0 );
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
+ unresolved.remove( opening );
+ }
}
src/main/java/com/whitemagicsoftware/keenquotes/XmlFilter.java
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.whitemagicsoftware.keenquotes;
+
+import java.text.CharacterIterator;
+import java.util.Set;
+import java.util.function.Consumer;
+
+/**
+ * Responsible for lexing text while ignoring XML elements. The document must
+ * be both sane and well-formed. This is not intended to lex documents in the
+ * wild where the user has injected custom HTML. The lexer will fail if the
+ * angle brackets are not balanced. Additionally, any less than or greater than
+ * symbols must be encoded as {@code &lt;} or {@code &gt;}, respectively.
+ */
+final class XmlFilter implements Consumer<FastCharacterIterator> {
+
+ /**
+ * Elements that indicate preformatted text, intentional straight quotes.
+ */
+ private final Set<String> UNTOUCHABLE = Set.of(
+ "pre",
+ "code",
+ "tt",
+ "tex",
+ "kbd",
+ "samp",
+ "var",
+ "l",
+ "blockcode"
+ );
+
+ public XmlFilter() {}
+
+ /**
+ * Skip XML tags found within the prose, which hides the elements.
+ */
+ @Override
+ public void accept( final FastCharacterIterator i ) {
+ if( i.current() == '<' ) {
+ final var openingTag = nextTag( i );
+
+ if( UNTOUCHABLE.contains( openingTag.toLowerCase() ) ) {
+ String closingTag;
+
+ do {
+ closingTag = nextTag( i );
+ }
+ while( !closingTag.endsWith( openingTag ) );
+ }
+ }
+ }
+
+ /**
+ * Skips to the next greater than or less than symbol.
+ *
+ * @param i The {@link FastCharacterIterator} used to scan through the text,
+ *          one
+ * character at a time.
+ * @return An opening/closing tag name, or the content within the element.
+ */
+ private String nextTag( final FastCharacterIterator i ) {
+ final var begin = i.index();
+
+ i.skip( next -> next != '>' && next != '<' );
+
+ // Swallow the trailing greater than symbol.
+ i.next();
+
+ // Skip to the character following the greater than symbol.
+ i.next();
+
+ return i.substring( begin + 1, i.index() - 1 );
+ }
+}
src/main/java/com/whitemagicsoftware/keenquotes/XmlLexer.java
-/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
-package com.whitemagicsoftware.keenquotes;
-
-import java.text.CharacterIterator;
-import java.util.Set;
-
-import static java.text.CharacterIterator.DONE;
-
-/**
- * Responsible for lexing text while ignoring XML elements. The document must
- * be both sane and well-formed. This is not intended to lex documents in the
- * wild where the user has injected custom HTML. The lexer will fail if the
- * angle brackets are not balanced. Additionally, any less than or greater than
- * symbols must be encoded as {@code &lt;} or {@code &gt;}, respectively.
- */
-final class XmlLexer extends Lexer {
-
- /**
- * Referenced when skipping text, to determine whether lexing has found
- * an untouchable element, such as preformatted text.
- */
- private final String mText;
-
- /**
- * Elements that indicate preformatted text, intentional straight quotes.
- */
- private final Set<String> UNTOUCHABLE = Set.of(
- "pre",
- "code",
- "tt",
- "tex",
- "kbd",
- "samp",
- "var",
- "l",
- "blockcode"
- );
-
- /**
- * Constructs a {@link Lexer} capable of turning text into {@link Lexeme}s.
- *
- * @param text The text to lex.
- */
- XmlLexer( final String text ) {
- super( text );
-
- mText = text;
- }
-
- /**
- * Skip (do not emit) XML tags found within the prose. This effectively hides
- * the element.
- */
- @Override
- boolean skip( final CharacterIterator i ) {
- final var match = i.current() == '<';
-
- if( match ) {
- final var openingTag = nextTag( i );
-
- if( UNTOUCHABLE.contains( openingTag.toLowerCase() ) ) {
- String closingTag;
-
- do {
- closingTag = nextTag( i );
- }
- while( !closingTag.endsWith( openingTag ) && i.current() != DONE );
- }
- }
-
- return match;
- }
-
- /**
- * Skips to the next greater than or less than symbol.
- *
- * @param i The {@link CharacterIterator} used to scan through the text, one
- * character at a time.
- * @return An opening/closing tag name, or the content within the element.
- */
- private String nextTag( final CharacterIterator i ) {
- final var begin = i.getIndex();
-
- slurp( i, ( next, ci ) -> next != '>' && next != '<' );
-
- // Swallow the trailing greater than symbol.
- i.next();
-
- // Skip to the character following the greater than symbol.
- i.next();
-
- return mText.substring( begin + 1, i.getIndex() - 1 );
- }
-}
src/main/java/com/whitemagicsoftware/keenquotes/XmlParser.java
super( text, contractions );
}
-
- @Override
- public Lexer createLexer( final String text ) {
- return new XmlLexer( text );
- }
}
src/test/java/com/whitemagicsoftware/keenquotes/LexerTest.java
import java.util.List;
+import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.BiFunction;
+import java.util.function.Consumer;
-import static com.whitemagicsoftware.keenquotes.Lexeme.EOT;
import static com.whitemagicsoftware.keenquotes.LexemeType.*;
import static java.util.Arrays.asList;
testType( "!", PUNCT );
testType( ";", PUNCT );
+ testType( "\\", PUNCT );
testType( ".", PERIOD );
testType( "-", HYPHEN );
testType( "'", QUOTE_SINGLE );
testType( "\"", QUOTE_DOUBLE );
+ testType( "‘", QUOTE_SINGLE_OPENING );
+ testType( "’", QUOTE_SINGLE_CLOSING );
+ testType( "“", QUOTE_DOUBLE_OPENING );
+ testType( "”", QUOTE_DOUBLE_CLOSING );
testType( "3 o'clock", NUMBER, SPACE, WORD, QUOTE_SINGLE, WORD );
}
final String actual, final LexemeType... expected ) {
final var list = asList( expected );
- final var lexer = createLexer( actual );
- testType( lexer, actual, ( lexeme, text ) -> lexeme.getType(), list );
+ testType( actual, ( lexeme, text ) -> lexeme.getType(), list );
+ }
+
+ static void testType(
+ final String actual,
+ final Consumer<FastCharacterIterator> filter,
+ final LexemeType... expected ) {
+ final var list = asList( expected );
+ testType( actual, ( lexeme, text ) -> lexeme.getType(), filter, list );
}
static void testText(
final String actual, final String... expected ) {
- final var lexer = createLexer( actual );
- testType( lexer, actual, Lexeme::toString, asList( expected ) );
+ testType( actual, Lexeme::toString, asList( expected ) );
}
- static void testType(
- final Lexer lexer, final String actual, final LexemeType... expected ) {
- final var list = asList( expected );
- testType( lexer, actual, ( lexeme, text ) -> lexeme.getType(), list );
+ private static <A, E> void testType(
+ final String text,
+ final BiFunction<Lexeme, String, A> f,
+ final List<E> elements ) {
+ testType( text, f, filter -> {}, elements );
}
private static <A, E> void testType(
- final Lexer lexer,
final String text,
final BiFunction<Lexeme, String, A> f,
+ final Consumer<FastCharacterIterator> filter,
final List<E> elements ) {
- var counter = 0;
-
- Lexeme lexeme;
+ var counter = new AtomicInteger();
- while( (lexeme = lexer.next()) != EOT ) {
- final var expected = elements.get( counter++ );
+ Lexer.lex( text, lexeme -> {
+ final var expected = elements.get( counter.getAndIncrement() );
final var actual = f.apply( lexeme, text );
assertEquals( expected, actual );
- }
+ }, filter );
// Ensure all expected values are matched (verify end of text reached).
- assertEquals( elements.size(), counter );
- }
-
- static Lexer createLexer( final String text ) {
- return new Lexer( text );
+ assertEquals( elements.size(), counter.get() );
}
}
src/test/java/com/whitemagicsoftware/keenquotes/XmlFilterTest.java
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.whitemagicsoftware.keenquotes;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.function.Consumer;
+
+import static com.whitemagicsoftware.keenquotes.LexemeType.*;
+import static com.whitemagicsoftware.keenquotes.LexerTest.testType;
+
+/**
+ * Test that parsing XML documents ignores elements.
+ */
+final class XmlFilterTest {
+
+ @Test
+ void test_Lexing_Xml_EmitTags() {
+ final var actual =
+ "A <em>world's</em> aflame <pre><code>ch = '\\''</code></pre>.";
+ testType(
+ actual,
+ createXmlFilter(),
+ // A world ' s aflame .
+ WORD, SPACE, WORD, QUOTE_SINGLE, WORD, SPACE, WORD, SPACE, PERIOD
+ );
+ }
+
+ @Test
+ void test_Lexing_XmlAttribute_EmitTags() {
+ final var actual = "<a href=\"http://x.org\">X11</a>";
+ testType( actual, createXmlFilter(), WORD );
+ }
+
+ static Consumer<FastCharacterIterator> createXmlFilter() {
+ return new XmlFilter();
+ }
+}
src/test/java/com/whitemagicsoftware/keenquotes/XmlLexerTest.java
-/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
-package com.whitemagicsoftware.keenquotes;
-
-import org.junit.jupiter.api.Test;
-
-import static com.whitemagicsoftware.keenquotes.LexemeType.*;
-import static com.whitemagicsoftware.keenquotes.LexerTest.testType;
-
-/**
- * Test that parsing XML documents ignores elements.
- */
-final class XmlLexerTest {
-
- @Test
- void test_Lexing_Xml_EmitTags() {
- final var actual =
- "A <em>world's</em> aflame <pre><code>ch = '\\''</code></pre>.";
- testType(
- createXmlLexer( actual ), actual,
- WORD, SPACE, WORD, QUOTE_SINGLE, WORD, SPACE, WORD, SPACE, PERIOD
- );
- }
-
- @Test
- void test_Lexing_XmlAttribute_EmitTags() {
- final var actual = "<a href=\"http://x.org\">X11</a>";
- testType( createXmlLexer( actual ), actual, WORD );
- }
-
- static Lexer createXmlLexer( final String text ) {
- return new XmlLexer( text );
- }
-}
src/test/resources/com/whitemagicsoftware/keenquotes/smartypants.txt
Sam&apos;s Sams&apos; and the Ross&apos;s roses&apos; thorns were prickly.
-"You 'cause---" all fifteen years' worth.
-&ldquo;You &apos;cause---&rdquo; all fifteen years&apos; worth.
-
# ########################################################################
# Inside contractions (no leading/trailing apostrophes)
"Jarvis, sir? Why, him as 'listed some years ago, and fought under
&ldquo;Jarvis, sir? Why, him as 'listed some years ago, and fought under
+
+"You 'cause---" all fifteen years' worth.
+&ldquo;You &apos;cause---&rdquo; all fifteen years' worth.
Delta936 lines added, 752 lines removed, 184-line increase