| Author | Dave Jarvis <email> |
|---|---|
| Date | 2021-05-31 18:28:39 GMT-0700 |
| Commit | f40e32cbe72ac0af42e55bde67c9204ab88cf840 |
| Parent | 5aaef29 |
| +package com.keenwrite.quotes; | ||
| + | ||
| +import java.util.Set; | ||
| + | ||
| +public class Contractions { | ||
| + /** | ||
| + * Words having a straight apostrophe that cannot be mistaken for an | ||
| + * opening single quote. | ||
| + */ | ||
| + private static final Set<String> BEGAN_UNAMBIGUOUS = Set.of( | ||
| + "aporth", "boutcha", "boutchu", "cept", "dillo", "e'll", "fraid", | ||
| + "gainst", "n", "neath", "nother", "onna", "onna'", "pon", "s", "sblood", | ||
| + "scuse", "sfar", "sfoot", "t", "taint", "tain't", "til", "tis", "tisn't", | ||
| + "tshall", "twas", "twasn't", "tween", "twere", "tweren't", "twixt", | ||
| + "twon't", "twou'd", "twou'dn't", "twould", "twouldn't", "ve" | ||
| + ); | ||
| + | ||
| + /** | ||
| + * Words having a straight apostrophe that may be either part of a | ||
| + * contraction or a word that stands alone beside an opening single quote. | ||
| + */ | ||
| + private static final Set<String> BEGAN_AMBIGUOUS = Set.of( | ||
| + "bout", "choo", "ere", "e", "e's", "fro", "ho", "kay", "lo", | ||
| + "re", "sup", "twill", "um", "zat" | ||
| + ); | ||
| + | ||
| + /** | ||
| + * Answers whether the given word is a contraction that always starts | ||
| + * with an apostrophe. The comparison is case insensitive. This must | ||
| + * only be called when a straight quote is followed by a word. | ||
| + * | ||
| + * @param word The word to compare against the list of known unambiguous | ||
| + * contractions. | ||
| + * @return {@code true} when the given word is in the set of unambiguous | ||
| + * contractions. | ||
| + */ | ||
| + public static boolean beginsUnambiguously( final String word ) { | ||
| + assert word != null; | ||
| + return BEGAN_UNAMBIGUOUS.contains( word.toLowerCase() ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Answers whether the given word could be a contraction but is also a | ||
| + * valid word in non-contracted form. | ||
| + * | ||
| + * @param word The word to compare against the list of known ambiguous | ||
| + * contractions. | ||
| + * @return {@code true} when the given word is in the set of ambiguous | ||
| + * contractions. | ||
| + */ | ||
| + public static boolean beginsAmbiguously( final String word ) { | ||
| + assert word != null; | ||
| + return BEGAN_AMBIGUOUS.contains( word.toLowerCase() ); | ||
| + } | ||
| +} | ||
| +/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.keenwrite.quotes; | ||
| + | ||
| +import static com.keenwrite.quotes.LexemeType.INVALID; | ||
| + | ||
| +/** | ||
| + * Responsible for tracking the beginning and ending offsets of a token within | ||
| + * a text string. Tracking the beginning and ending indices should use less | ||
| + * memory than duplicating the entire text of Unicode characters (i.e., using | ||
| + * a similar approach to run-length encoding). | ||
| + */ | ||
| +public class Lexeme { | ||
| + /** | ||
| + * Denotes there are no more tokens: the end of text (EOT) has been reached. | ||
| + */ | ||
| + public static final Lexeme EOT = new Lexeme(); | ||
| + | ||
| + private final LexemeType mType; | ||
| + private final int mBegan; | ||
| + private final int mEnded; | ||
| + | ||
| + /** | ||
| + * Set to {@code true} if there are more tokens to parse. | ||
| + */ | ||
| + private final boolean mHasNext; | ||
| + | ||
| + /** | ||
| + * Create an end of text token. | ||
| + */ | ||
| + private Lexeme() { | ||
| + this( INVALID, -1, -1, false ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Create a token that indicates there are more tokens to parse. | ||
| + */ | ||
| + private Lexeme( final LexemeType type, final int began, final int ended ) { | ||
| + this( type, began, ended, true ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Create a token that represents a section of the text. | ||
| + */ | ||
| + private Lexeme( | ||
| + final LexemeType type, final int began, final int ended, | ||
| + final boolean hasNext ) { | ||
| + mType = type; | ||
| + mBegan = began; | ||
| + mEnded = ended + 1; | ||
| + mHasNext = hasNext; | ||
| + } | ||
| + | ||
| + /** | ||
| + * Extracts a sequence of characters from the given text at the offsets | ||
| + * captured by this token. | ||
| + * | ||
| + * @param text The text that was parsed using this class. | ||
| + * @return The character string captured by the token. | ||
| + */ | ||
| + public String toString( final String text ) { | ||
| + return text.substring( mBegan, mEnded ); | ||
| + } | ||
| + | ||
| + public boolean hasNext() { | ||
| + return mHasNext; | ||
| + } | ||
| + | ||
| + public boolean isType( final LexemeType type ) { | ||
| + return mType == type; | ||
| + } | ||
| + | ||
| + public boolean anyType( final LexemeType... types ) { | ||
| + for( final var type : types ) { | ||
| + if( mType == type ) { | ||
| + return true; | ||
| + } | ||
| + } | ||
| + | ||
| + return false; | ||
| + } | ||
| + | ||
| + LexemeType getType() { | ||
| + return mType; | ||
| + } | ||
| + | ||
| + @Override | ||
| + public String toString() { | ||
| + return getClass().getSimpleName() + "{" + | ||
| + "mType=" + mType + | ||
| + ", mBegan=" + mBegan + | ||
| + ", mEnded=" + mEnded + | ||
| + '}'; | ||
| + } | ||
| + | ||
| + static Lexeme createLexeme( | ||
| + final LexemeType token, final int began, final int ended ) { | ||
| + return new Lexeme( token, began, ended ); | ||
| + } | ||
| +} | ||
| +/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.keenwrite.quotes; | ||
| + | ||
| +public enum LexemeType { | ||
| + QUOTE_SINGLE, | ||
| + QUOTE_DOUBLE, | ||
| + WORD, | ||
| + NUMBER, | ||
| + NEWLINE, | ||
| + SPACE, | ||
| + PUNCT, | ||
| + PERIOD, | ||
| + INVALID | ||
| +} | ||
| +/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.keenwrite.quotes; | ||
| + | ||
| +import java.text.CharacterIterator; | ||
| +import java.text.StringCharacterIterator; | ||
| +import java.util.function.Consumer; | ||
| + | ||
| +import static com.keenwrite.quotes.Lexeme.createLexeme; | ||
| +import static com.keenwrite.quotes.LexemeType.*; | ||
| +import static java.lang.Character.*; | ||
| +import static java.text.CharacterIterator.DONE; | ||
| + | ||
| +/** | ||
| + * Turns text into words, numbers, punctuation, spaces, and more. | ||
| + */ | ||
| +public class Lexer { | ||
| + /** | ||
| + * Default constructor, no state. | ||
| + */ | ||
| + public Lexer() { | ||
| + } | ||
| + | ||
| + /** | ||
| + * Emits a series of tokens that represent information about text that is | ||
| + * needed to convert straight quotes to curly quotes. | ||
| + * | ||
| + * @param text The text to split into tokens. | ||
| + * @param consumer Receives each token as a separate event. | ||
| + */ | ||
| + public void parse( final String text, final Consumer<Lexeme> consumer ) { | ||
| + final var iterator = new StringCharacterIterator( text ); | ||
| + Lexeme lex; | ||
| + | ||
| + while( (lex = parse( iterator )).hasNext() ) { | ||
| + consumer.accept( lex ); | ||
| + } | ||
| + } | ||
| + | ||
| + /** | ||
| + * Tokenizes a sequence of characters. The order of comparisons is optimized | ||
| + * towards probability of the occurrence of a character in regular English | ||
| + * prose: letters, space, quotation marks, numbers, periods, new lines, | ||
| + * then end of text. | ||
| + * | ||
| + * @param i The sequence of characters to tokenize. | ||
| + * @return The next token in the sequence. | ||
| + */ | ||
| + private Lexeme parse( final CharacterIterator i ) { | ||
| + int began = i.getIndex(); | ||
| + boolean isWord = false; | ||
| + Lexeme lexeme = null; | ||
| + | ||
| + do { | ||
| + final var curr = i.current(); | ||
| + | ||
| + if( isLetter( curr ) ) { | ||
| + isWord = true; | ||
| + | ||
| + if( !isLetterOrDigit( peek( i ) ) ) { | ||
| + lexeme = createLexeme( WORD, began, i.getIndex() ); | ||
| + } | ||
| + } | ||
| + else if( curr == ' ' ) { | ||
| + lexeme = createLexeme( SPACE, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '\'' ) { | ||
| + lexeme = createLexeme( QUOTE_SINGLE, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '"' ) { | ||
| + lexeme = createLexeme( QUOTE_DOUBLE, began, i.getIndex() ); | ||
| + } | ||
| + else if( isDigit( curr ) || isNumeric( curr ) && isDigit( peek( i ) ) ) { | ||
| + // Tokenize all consecutive number characters to prevent the main | ||
| + // loop from switching back to word tokens. | ||
| + char next; | ||
| + | ||
| + do { | ||
| + next = i.next(); | ||
| + } | ||
| + while( isDigit( next ) || isNumeric( next ) && isDigit( peek( i ) ) ); | ||
| + | ||
| + // The loop above will overshoot the number by one character. | ||
| + i.previous(); | ||
| + | ||
| + lexeme = createLexeme( isWord ? WORD : NUMBER, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '.' ) { | ||
| + lexeme = createLexeme( PERIOD, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '\r' ) { | ||
| + lexeme = createLexeme( NEWLINE, began, i.getIndex() ); | ||
| + | ||
| + // Swallow the LF in CRLF; peeking won't work here. | ||
| + if( i.next() != '\n' ) { | ||
| + // Push back the non-LF char. | ||
| + i.previous(); | ||
| + } | ||
| + } | ||
| + else if( curr == '\n' ) { | ||
| + lexeme = createLexeme( NEWLINE, began, i.getIndex() ); | ||
| + } | ||
| + else if( isWhitespace( curr ) ) { | ||
| + lexeme = createLexeme( SPACE, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr != DONE ) { | ||
| + lexeme = createLexeme( PUNCT, began, i.getIndex() ); | ||
| + } | ||
| + else { | ||
| + lexeme = Lexeme.EOT; | ||
| + } | ||
| + | ||
| + i.next(); | ||
| + } | ||
| + while( lexeme == null ); | ||
| + | ||
| + return lexeme; | ||
| + } | ||
| + | ||
| + private static boolean isNumeric( final char curr ) { | ||
| + return curr == '.' || curr == ','; | ||
| + } | ||
| + | ||
| + private static char peek( final CharacterIterator ci ) { | ||
| + final var ch = ci.next(); | ||
| + ci.previous(); | ||
| + return ch; | ||
| + } | ||
| +} | ||
| import java.util.function.Consumer; | ||
| +import static com.keenwrite.quotes.Contractions.beginsUnambiguously; | ||
| +import static com.keenwrite.quotes.LexemeType.*; | ||
| import static com.keenwrite.quotes.TokenType.*; | ||
| * </ol> | ||
| */ | ||
| -public class Parser implements Consumer<Token> { | ||
| +public class Parser implements Consumer<Lexeme> { | ||
| private final String mText; | ||
| - private final CircularFifoQueue<Token> mTokens = new CircularFifoQueue<>( 3 ); | ||
| + private final CircularFifoQueue<Lexeme> mTokens = | ||
| + new CircularFifoQueue<>( 3 ); | ||
| - private final Deque<Token> mStack = new ArrayDeque<>(); | ||
| + private final Deque<Lexeme> mStack = new ArrayDeque<>(); | ||
| + private final Consumer<Token> mConsumer; | ||
| - public Parser( final String text ) { | ||
| + public Parser( final String text, final Consumer<Token> consumer ) { | ||
| mText = text; | ||
| - mTokens.add( Token.EOT ); | ||
| - mTokens.add( Token.EOT ); | ||
| - mTokens.add( Token.EOT ); | ||
| + | ||
| + // Allow consuming the very first token without checking the queue size. | ||
| + mTokens.add( Lexeme.EOT ); | ||
| + mTokens.add( Lexeme.EOT ); | ||
| + mTokens.add( Lexeme.EOT ); | ||
| + | ||
| + mConsumer = consumer; | ||
| } | ||
| public void parse() { | ||
| - final var tokenizer = new Tokenizer(); | ||
| - tokenizer.tokenize( mText, this ); | ||
| + final var tokenizer = new Lexer(); | ||
| + tokenizer.parse( mText, this ); | ||
| + System.out.println(" DUH DONE!" ); | ||
| } | ||
| @Override | ||
| - public void accept( final Token token ) { | ||
| + public void accept( final Lexeme token ) { | ||
| mTokens.add( token ); | ||
| final var token1 = mTokens.get( 0 ); | ||
| final var token2 = mTokens.get( 1 ); | ||
| final var token3 = mTokens.get( 2 ); | ||
| - if( token2.isType( QUOTE_SINGLE ) && | ||
| - token3.isType( WORD ) && | ||
| + if( token2.isType( QUOTE_SINGLE ) && token3.isType( WORD ) && | ||
| token1.anyType( WORD, PERIOD, NUMBER ) ) { | ||
| - System.out.println( "APOSTROPHE: " + token2 ); | ||
| + mConsumer.accept( new Token( QUOTE_APOSTROPHE, token2 ) ); | ||
| } | ||
| - else if( token1.isType( QUOTE_SINGLE ) && | ||
| - "n".equalsIgnoreCase( token2.toString( mText ) ) && | ||
| - token3.isType( QUOTE_SINGLE ) ) { | ||
| - System.out.printf( "APOSTROPHES: %s %s%n", token1, token3 ); | ||
| + else if( token1.isType( QUOTE_SINGLE ) && token3.isType( QUOTE_SINGLE ) && | ||
| + "n".equalsIgnoreCase( token2.toString( mText ) ) ) { | ||
| + mConsumer.accept( new Token( QUOTE_APOSTROPHE, token1 ) ); | ||
| + mConsumer.accept( new Token( QUOTE_APOSTROPHE, token3 ) ); | ||
| } | ||
| else if( token1.isType( NUMBER ) && token2.isType( QUOTE_SINGLE ) ) { | ||
| - System.out.println( "PRIME: " + token2 ); | ||
| + mConsumer.accept( new Token( QUOTE_PRIME_SINGLE, token2 ) ); | ||
| } | ||
| else if( token1.isType( NUMBER ) && token2.isType( QUOTE_DOUBLE ) ) { | ||
| - System.out.println( "DOUBLE PRIME: " + token2 ); | ||
| + mConsumer.accept( new Token( QUOTE_PRIME_DOUBLE, token2 ) ); | ||
| + } | ||
| + else if( token1.isType( QUOTE_SINGLE ) && token2.isType( WORD ) && | ||
| + beginsUnambiguously( token2.toString( mText ) ) ) { | ||
| + mConsumer.accept( new Token( QUOTE_APOSTROPHE, token1 ) ); | ||
| } | ||
| else if( token.anyType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) { | ||
| } | ||
| } | ||
| - | ||
| -/* | ||
| - private enum TokenType { | ||
| - QUOTE_OPENING_SINGLE, | ||
| - QUOTE_OPENING_DOUBLE, | ||
| - QUOTE_CLOSING_SINGLE, | ||
| - QUOTE_CLOSING_DOUBLE, | ||
| - QUOTE_APOSTROPHE, | ||
| - QUOTE_PRIME_SINGLE, | ||
| - QUOTE_PRIME_DOUBLE, | ||
| - TEXT | ||
| - } | ||
| -*/ | ||
| package com.keenwrite.quotes; | ||
| -import static com.keenwrite.quotes.TokenType.INVALID; | ||
| - | ||
| /** | ||
| - * Responsible for tracking the beginning and ending offsets of a token within | ||
| - * a text string. | ||
| + * Represents a high-level token read from the text. | ||
| */ | ||
| public class Token { | ||
| - /** | ||
| - * Denotes there are no more tokens: the end of text (EOT) has been reached. | ||
| - */ | ||
| - public static final Token EOT = new Token(); | ||
| - | ||
| - private final TokenType mType; | ||
| - private final int mBegan; | ||
| - private final int mEnded; | ||
| - | ||
| - /** | ||
| - * Set to {@code true} if there are more tokens to parse. | ||
| - */ | ||
| - private final boolean mHasNext; | ||
| - | ||
| - /** | ||
| - * Create an end of text token. | ||
| - */ | ||
| - private Token() { | ||
| - this( INVALID, -1, -1, false ); | ||
| - } | ||
| + private TokenType mType; | ||
| + private Lexeme mLexeme; | ||
| - /** | ||
| - * Create a token that indicates there are no more tokens. | ||
| - */ | ||
| - private Token( final TokenType type, final int began, final int ended ) { | ||
| - this( type, began, ended, true ); | ||
| - } | ||
| + public Token( final TokenType type, final Lexeme lexeme ) { | ||
| + assert type != null; | ||
| + assert lexeme != null; | ||
| - /** | ||
| - * Create a token | ||
| - */ | ||
| - private Token( | ||
| - final TokenType type, final int began, final int ended, | ||
| - final boolean hasNext ) { | ||
| mType = type; | ||
| - mBegan = began; | ||
| - mEnded = ended + 1; | ||
| - mHasNext = hasNext; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Extracts a sequence of characters from the given text at the offsets | ||
| - * captured by this token. | ||
| - * | ||
| - * @param text The text that was parsed using this class. | ||
| - * @return The character string captured by the token. | ||
| - */ | ||
| - public String toString( final String text ) { | ||
| - return text.substring( mBegan, mEnded ); | ||
| - } | ||
| - | ||
| - public boolean hasNext() { | ||
| - return mHasNext; | ||
| - } | ||
| - | ||
| - public boolean isType( final TokenType type ) { | ||
| - return mType == type; | ||
| - } | ||
| - | ||
| - public boolean anyType( final TokenType... types ) { | ||
| - for( final var type : types ) { | ||
| - if( mType == type ) { | ||
| - return true; | ||
| - } | ||
| - } | ||
| - | ||
| - return false; | ||
| - } | ||
| - | ||
| - TokenType getType() { | ||
| - return mType; | ||
| + mLexeme = lexeme; | ||
| } | ||
| @Override | ||
| public String toString() { | ||
| return getClass().getSimpleName() + "{" + | ||
| "mType=" + mType + | ||
| - ", mBegan=" + mBegan + | ||
| - ", mEnded=" + mEnded + | ||
| + ", mLexeme=" + mLexeme + | ||
| '}'; | ||
| - } | ||
| - | ||
| - static Token createToken( | ||
| - final TokenType token, final int began, final int ended ) { | ||
| - return new Token( token, began, ended ); | ||
| } | ||
| } |
| package com.keenwrite.quotes; | ||
| +/** | ||
| + * The {@link Parser} emits these token types. | ||
| + */ | ||
| public enum TokenType { | ||
| - QUOTE_SINGLE, | ||
| - QUOTE_DOUBLE, | ||
| - WORD, | ||
| - NUMBER, | ||
| - NEWLINE, | ||
| - SPACE, | ||
| - PUNCT, | ||
| - PERIOD, | ||
| - INVALID | ||
| + QUOTE_OPENING_SINGLE, | ||
| + QUOTE_OPENING_DOUBLE, | ||
| + QUOTE_CLOSING_SINGLE, | ||
| + QUOTE_CLOSING_DOUBLE, | ||
| + QUOTE_APOSTROPHE, | ||
| + QUOTE_PRIME_SINGLE, | ||
| + QUOTE_PRIME_DOUBLE, | ||
| + TEXT, | ||
| } | ||
| -/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| -package com.keenwrite.quotes; | ||
| - | ||
| -import java.text.CharacterIterator; | ||
| -import java.text.StringCharacterIterator; | ||
| -import java.util.function.Consumer; | ||
| - | ||
| -import static com.keenwrite.quotes.Token.createToken; | ||
| -import static com.keenwrite.quotes.TokenType.*; | ||
| -import static java.lang.Character.*; | ||
| -import static java.text.CharacterIterator.DONE; | ||
| - | ||
| -/** | ||
| - * Tokenizes text into words, numbers, punctuation, spaces, and more. | ||
| - */ | ||
| -public class Tokenizer { | ||
| - /** | ||
| - * Default constructor, no state. | ||
| - */ | ||
| - public Tokenizer() { | ||
| - } | ||
| - | ||
| - /** | ||
| - * Emits a series of tokens that represent information about text that is | ||
| - * needed to convert straight quotes to curly quotes. | ||
| - * | ||
| - * @param text The text to split into tokens. | ||
| - * @param consumer Receives each token as a separate event. | ||
| - */ | ||
| - public void tokenize( final String text, final Consumer<Token> consumer ) { | ||
| - final var iterator = new StringCharacterIterator( text ); | ||
| - Token token; | ||
| - | ||
| - while( (token = tokenize( iterator )).hasNext() ) { | ||
| - consumer.accept( token ); | ||
| - } | ||
| - } | ||
| - | ||
| - /** | ||
| - * Tokenizes a sequence of characters. The order of comparisons is optimized | ||
| - * towards probability of the occurrence of a character in regular English | ||
| - * prose: letters, space, quotation marks, numbers, periods, new lines, | ||
| - * then end of text. | ||
| - * | ||
| - * @param i The sequence of characters to tokenize. | ||
| - * @return The next token in the sequence. | ||
| - */ | ||
| - private Token tokenize( final CharacterIterator i ) { | ||
| - int began = i.getIndex(); | ||
| - boolean isWord = false; | ||
| - Token token = null; | ||
| - | ||
| - do { | ||
| - final var curr = i.current(); | ||
| - | ||
| - if( isLetter( curr ) ) { | ||
| - isWord = true; | ||
| - | ||
| - if( !isLetterOrDigit( peek( i ) ) ) { | ||
| - token = createToken( WORD, began, i.getIndex() ); | ||
| - } | ||
| - } | ||
| - else if( curr == ' ' ) { | ||
| - token = createToken( SPACE, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr == '\'' ) { | ||
| - token = createToken( QUOTE_SINGLE, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr == '"' ) { | ||
| - token = createToken( QUOTE_DOUBLE, began, i.getIndex() ); | ||
| - } | ||
| - else if( isDigit( curr ) || isNumeric( curr ) && isDigit( peek( i ) ) ) { | ||
| - // Tokenize all consecutive number characters at prevent the main | ||
| - // loop from switching back to word tokens. | ||
| - char next; | ||
| - | ||
| - do { | ||
| - next = i.next(); | ||
| - } | ||
| - while( isDigit( next ) || isNumeric( next ) && isDigit( peek( i ) ) ); | ||
| - | ||
| - // The loop above will overshoot the number by one character. | ||
| - i.previous(); | ||
| - | ||
| - token = createToken( isWord ? WORD : NUMBER, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr == '.' ) { | ||
| - token = createToken( PERIOD, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr == '\r' ) { | ||
| - token = createToken( NEWLINE, began, i.getIndex() ); | ||
| - | ||
| - // Swallow the LF in CRLF; peeking won't work here. | ||
| - if( i.next() != '\n' ) { | ||
| - // Push back the non-LF char. | ||
| - i.previous(); | ||
| - } | ||
| - } | ||
| - else if( curr == '\n' ) { | ||
| - token = createToken( NEWLINE, began, i.getIndex() ); | ||
| - } | ||
| - else if( isWhitespace( curr ) ) { | ||
| - token = createToken( SPACE, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr != DONE ) { | ||
| - token = createToken( PUNCT, began, i.getIndex() ); | ||
| - } | ||
| - else { | ||
| - token = Token.EOT; | ||
| - } | ||
| - | ||
| - i.next(); | ||
| - } | ||
| - while( token == null ); | ||
| - | ||
| - return token; | ||
| - } | ||
| - | ||
| - private static boolean isNumeric( final char curr ) { | ||
| - return curr == '.' || curr == ','; | ||
| - } | ||
| - | ||
| - private static char peek( final CharacterIterator ci ) { | ||
| - final var ch = ci.next(); | ||
| - ci.previous(); | ||
| - return ch; | ||
| - } | ||
| -} | ||
| +/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.keenwrite.quotes; | ||
| + | ||
| +import org.junit.jupiter.api.Test; | ||
| + | ||
| +import java.util.concurrent.atomic.AtomicInteger; | ||
| + | ||
| +import static com.keenwrite.quotes.LexemeType.*; | ||
| +import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| + | ||
| +/** | ||
| + * Tests lexing words, numbers, punctuation, spaces, and newlines. | ||
| + */ | ||
| +class LexerTest { | ||
| + @Test | ||
| + void test_Lexing_Words_TokenValues() { | ||
| + testText( "abc 123", "abc", " ", "123" ); | ||
| + testText( "-123 abc", "-", "123", " ", "abc" ); | ||
| + } | ||
| + | ||
| + @Test | ||
| + void test_Lexing_Numbers_EmitNumbers() { | ||
| + testType( ".123", NUMBER ); | ||
| + testType( "-123.", PUNCT, NUMBER, PERIOD ); | ||
| + testType( " 123.123.123", SPACE, NUMBER ); | ||
| + testType( "123 123\"", NUMBER, SPACE, NUMBER, QUOTE_DOUBLE ); | ||
| + testType( "-123,123.123", PUNCT, NUMBER ); | ||
| + } | ||
| + | ||
| + @Test | ||
| + void test_Lexing_Words_EmitWords() { | ||
| + testType( "abc", WORD ); | ||
| + testType( "abc abc", WORD, SPACE, WORD ); | ||
| + testType( "abc123", WORD ); | ||
| + testType( "-123abc", PUNCT, NUMBER, WORD ); | ||
| + testType( "abc-o'-abc", WORD, PUNCT, WORD, QUOTE_SINGLE, PUNCT, WORD ); | ||
| + } | ||
| + | ||
| + @Test | ||
| + void test_Lexing_PunctuationMarks_EmitPunctuationMarks() { | ||
| + testType( "!", PUNCT ); | ||
| + testType( ";", PUNCT ); | ||
| + testType( ".", PERIOD ); | ||
| + } | ||
| + | ||
| + @Test | ||
| + void test_Lexing_Quotes_EmitQuotes() { | ||
| + testType( "'", QUOTE_SINGLE ); | ||
| + testType( "\"", QUOTE_DOUBLE ); | ||
| + testType( "3 o'clock", NUMBER, SPACE, WORD, QUOTE_SINGLE, WORD ); | ||
| + } | ||
| + | ||
| + @Test | ||
| + void test_Lexing_Newlines_EmitNewlines() { | ||
| + testType( "\r", NEWLINE ); | ||
| + testType( "\n", NEWLINE ); | ||
| + testType( "\r\n", NEWLINE ); | ||
| + testType( "\r\n\r\n", NEWLINE, NEWLINE ); | ||
| + testType( "\r\n\n\r", NEWLINE, NEWLINE, NEWLINE ); | ||
| + testType( "abc \r\nabc\n", WORD, SPACE, NEWLINE, WORD, NEWLINE ); | ||
| + } | ||
| + | ||
| + private void testType( | ||
| + final String actual, final LexemeType... expected ) { | ||
| + final var tokenizer = new Lexer(); | ||
| + final var counter = new AtomicInteger(); | ||
| + | ||
| + tokenizer.parse( actual, ( token ) -> { | ||
| + final var expectedType = expected[ counter.getAndIncrement() ]; | ||
| + final var actualType = token.getType(); | ||
| + assertEquals( expectedType, actualType ); | ||
| + } ); | ||
| + | ||
| + // Ensure all expected tokens are matched (verify end of text reached). | ||
| + assertEquals( expected.length, counter.get() ); | ||
| + } | ||
| + | ||
| + private void testText( final String actual, final String... expected ) { | ||
| + final var tokenizer = new Lexer(); | ||
| + final var counter = new AtomicInteger(); | ||
| + | ||
| + tokenizer.parse( actual, ( token ) -> { | ||
| + final var expectedText = expected[ counter.getAndIncrement() ]; | ||
| + final var actualText = token.toString( actual ); | ||
| + assertEquals( expectedText, actualText ); | ||
| + } ); | ||
| + | ||
| + // Ensure all expected tokens are matched (verify end of text reached). | ||
| + assertEquals( expected.length, counter.get() ); | ||
| + } | ||
| +} | ||
| @Test | ||
| void test_Conversion_Straight_Curly() { | ||
| - new Parser( "\"It's the 70's jack-o'-lantern\"").parse(); | ||
| - new Parser( "Fish-'n'-chips!").parse(); | ||
| - new Parser( "That's a 35' x 10\" yacht!").parse(); | ||
| - //new Parser( "'70s are Sams' faves.'").parse(); | ||
| + parse( "\"It's the 70's jack-o'-lantern\"" ); | ||
| + parse( "Fish-'n'-chips!" ); | ||
| + parse( "That's a 35' x 10\" yacht!" ); | ||
| + parse( "He's a kinda' cat ya'll couldn't've known!" ); | ||
| + parse( "'70s are Sams' faves.'" ); | ||
| + parse( "'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx." ); | ||
| + parse( """ | ||
| + But I must leave the proofs to those who 've seen 'em; | ||
| + But this I heard her say, and can't be wrong | ||
| + And all may think which way their judgments lean 'em, | ||
| + ''T is strange—the Hebrew noun which means "I am," | ||
| + The English always use to govern d--n.' | ||
| + """ ); | ||
| + } | ||
| + | ||
| + private void parse( final String text ) { | ||
| + final var parser = new Parser( text, System.out::println ); | ||
| + parser.parse(); | ||
| } | ||
| } |
| -/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| -package com.keenwrite.quotes; | ||
| - | ||
| -import org.junit.jupiter.api.Test; | ||
| - | ||
| -import java.util.concurrent.atomic.AtomicInteger; | ||
| - | ||
| -import static com.keenwrite.quotes.TokenType.*; | ||
| -import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| - | ||
| -/** | ||
| - * Tests tokenizing words, numbers, punctuation, spaces, and newlines. | ||
| - */ | ||
| -class TokenizerTest { | ||
| - @Test | ||
| - void test_Tokenize_Words_TokenValues() { | ||
| - testText( "abc 123", "abc", " ", "123" ); | ||
| - testText( "-123 abc", "-", "123", " ", "abc" ); | ||
| - } | ||
| - | ||
| - @Test | ||
| - void test_Tokenize_Numbers_EmitNumbers() { | ||
| - testType( ".123", NUMBER ); | ||
| - testType( "-123.", PUNCT, NUMBER, PERIOD ); | ||
| - testType( " 123.123.123", SPACE, NUMBER ); | ||
| - testType( "123 123\"", NUMBER, SPACE, NUMBER, QUOTE_DOUBLE ); | ||
| - testType( "-123,123.123", PUNCT, NUMBER ); | ||
| - } | ||
| - | ||
| - @Test | ||
| - void test_Tokenize_Words_EmitWords() { | ||
| - testType( "abc", WORD ); | ||
| - testType( "abc abc", WORD, SPACE, WORD ); | ||
| - testType( "abc123", WORD ); | ||
| - testType( "-123abc", PUNCT, NUMBER, WORD ); | ||
| - testType( "abc-o'-abc", WORD, PUNCT, WORD, QUOTE_SINGLE, PUNCT, WORD ); | ||
| - } | ||
| - | ||
| - @Test | ||
| - void test_Tokenize_PunctuationMarks_EmitPunctuationMarks() { | ||
| - testType( "!", PUNCT ); | ||
| - testType( ";", PUNCT ); | ||
| - testType( ".", PERIOD ); | ||
| - } | ||
| - | ||
| - @Test | ||
| - void test_Tokenize_Quotes_EmitQuotes() { | ||
| - testType( "'", QUOTE_SINGLE ); | ||
| - testType( "\"", QUOTE_DOUBLE ); | ||
| - testType( "3 o'clock", NUMBER, SPACE, WORD, QUOTE_SINGLE, WORD ); | ||
| - } | ||
| - | ||
| - @Test | ||
| - void test_Tokenize_Newlines_EmitNewlines() { | ||
| - testType( "\r", NEWLINE ); | ||
| - testType( "\n", NEWLINE ); | ||
| - testType( "\r\n", NEWLINE ); | ||
| - testType( "\r\n\r\n", NEWLINE, NEWLINE ); | ||
| - testType( "\r\n\n\r", NEWLINE, NEWLINE, NEWLINE ); | ||
| - testType( "abc \r\nabc\n", WORD, SPACE, NEWLINE, WORD, NEWLINE ); | ||
| - } | ||
| - | ||
| - private void testType( | ||
| - final String actual, final TokenType... expected ) { | ||
| - final var tokenizer = new Tokenizer(); | ||
| - final var counter = new AtomicInteger(); | ||
| - | ||
| - tokenizer.tokenize( actual, ( token ) -> { | ||
| - final var expectedType = expected[ counter.getAndIncrement() ]; | ||
| - final var actualType = token.getType(); | ||
| - assertEquals( expectedType, actualType ); | ||
| - } ); | ||
| - | ||
| - // Ensure all expected tokens are matched (verify end of text reached). | ||
| - assertEquals( expected.length, counter.get() ); | ||
| - } | ||
| - | ||
| - private void testText( final String actual, final String... expected ) { | ||
| - final var tokenizer = new Tokenizer(); | ||
| - final var counter = new AtomicInteger(); | ||
| - | ||
| - tokenizer.tokenize( actual, ( token ) -> { | ||
| - final var expectedText = expected[ counter.getAndIncrement() ]; | ||
| - final var actualText = token.toString( actual ); | ||
| - assertEquals( expectedText, actualText ); | ||
| - } ); | ||
| - | ||
| - // Ensure all expected tokens are matched (verify end of text reached). | ||
| - assertEquals( expected.length, counter.get() ); | ||
| - } | ||
| -} | ||
| Delta | 455 lines added, 343 lines removed, 112-line increase |
|---|