lib/src/main/java/com/keenwrite/quotes/Parser.java

		private final String mText;

		+ /**
		+ * Converts a string into an iterable list of {@link Lexeme} instances.
		+ */
		private final Lexer mLexer;
		-
		- private final List<Lexeme[]> mQuotationMarks = new ArrayList<>();
		- private final CircularFifoQueue<Lexeme> mLexemes =
		- new CircularFifoQueue<>( 3 );

		/**
		* Sets of contractions that help disambiguate single quotes in the text.
		+ * These are effectively immutable while parsing.
		*/
		- private final Contractions mContractions;
		+ private final Contractions sContractions;

		/**

		private int mClosingSingleQuote;

		+ /**
		+ * Constructs a new {@link Parser} using the default contraction sets
		+ * to help resolve some ambiguous scenarios.
		+ *
		+ * @param text The prose to parse, containing zero or more quotation
		+ * characters.
		+ */
		public Parser( final String text ) {
		this( text, new Contractions.Builder().build() );
		}

		+ /**
		+ * Constructs a new {@link Parser} using the default contraction sets
		+ * to help resolve some ambiguous scenarios.
		+ *
		+ * @param text The prose to parse, containing zero or more quotation
		+ * characters.
		+ * @param contractions Custom sets of contractions to help resolve
		+ * ambiguities.
		+ */
		public Parser( final String text, final Contractions contractions ) {
		mText = text;
		mLexer = new Lexer( mText );
		- mContractions = contractions;
		-
		- // Allow consuming the very first token without checking the queue size.
		- flush( mLexemes );
		+ sContractions = contractions;
		}


		*/
		public void parse( final Consumer<Token> consumer ) {
		- // Create/convert a list of all unambiguous quote characters.
		+ final var lexemes = new CircularFifoQueue<Lexeme>( 3 );
		+
		+ // Allow consuming the very first token without checking the queue size.
		+ flush( lexemes );
		+
		+ final var unresolved = new ArrayList<Lexeme[]>();
		Lexeme lexeme;

		+ // Create and convert a list of all unambiguous quote characters.
		while( (lexeme = mLexer.next()) != EOT ) {
		- tokenize( lexeme, consumer );
		+ tokenize( lexeme, lexemes, consumer, unresolved );
		}

		// By loop's end, the lexemes list contains tokens for all except the
		// final two elements (from tokenizing in triplets). Tokenize the remaining
		// unprocessed lexemes.
		- tokenize( EOT, consumer );
		- tokenize( EOT, consumer );
		+ tokenize( EOT, lexemes, consumer, unresolved );
		+ tokenize( EOT, lexemes, consumer, unresolved );
		+
		+ // Attempt to resolve any remaining unambiguous quotes.
		+ resolve( unresolved, consumer );
		+
		+ System.out.println( "UNRESOLVED: " + unresolved.size() );
		+ }

		+ private void resolve(
		+ final List<Lexeme[]> lexemes, final Consumer<Token> consumer ) {
		// Some non-emitted tokenized lexemes may be ambiguous.
		final var ambiguousLeadingQuotes = new ArrayList<Lexeme>( 16 );
		final var ambiguousLaggingQuotes = new ArrayList<Lexeme>( 16 );
		var resolvedLeadingQuotes = 0;
		var resolvedLaggingQuotes = 0;

		// Count the number of ambiguous and non-ambiguous open single quotes.
		- for( var i = mQuotationMarks.iterator(); i.hasNext(); ) {
		+ for( var i = lexemes.iterator(); i.hasNext(); ) {
		final var quotes = i.next();
		final var lex1 = quotes[ 0 ];
		final var lex2 = quotes[ 1 ];
		final var lex3 = quotes[ 2 ];

		if( lex2.isType( QUOTE_SINGLE ) ) {
		final var word1 = lex1 == SOT ? "" : lex1.toString( mText );
		final var word3 = lex3 == EOT ? "" : lex3.toString( mText );

		- if( mContractions.beganAmbiguously( word3 ) ) {
		+ if( sContractions.beganAmbiguously( word3 ) ) {
		// E.g., 'Cause
		if( lex1.isType( QUOTE_SINGLE ) ) {

		}
		}
		- else if( mContractions.beganUnambiguously( word3 ) ) {
		+ else if( sContractions.beganUnambiguously( word3 ) ) {
		// The quote mark forms a word that does not stand alone from its
		// contraction. For example, twas is not a word: it's 'twas.
		consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
		i.remove();
		}
		- else if( mContractions.endedAmbiguously( word1 ) ) {
		+ else if( sContractions.endedAmbiguously( word1 ) ) {
		ambiguousLaggingQuotes.add( lex2 );
		}

		}
		}
		-
		- System.out.println( "--------------------" );
		- System.out.println( "ambig leading: " + ambiguousLeadingQuotes.size() );
		- System.out.println( "ambig lagging: " + ambiguousLaggingQuotes.size() );
		- System.out.println( "resolv leading: " + resolvedLeadingQuotes );
		- System.out.println( "resolv lagging: " + resolvedLaggingQuotes );

		final var ambiguousLeadingCount = ambiguousLeadingQuotes.size();

		else if( ambiguousLeadingCount == 0 ) {
		if( resolvedLaggingQuotes < resolvedLeadingQuotes ) {
		- for( final var mark : mQuotationMarks ) {
		+ for( final var mark : lexemes ) {
		consumer.accept( new Token( QUOTE_CLOSING_SINGLE, mark[ 1 ] ) );
		}

		}

		- private void tokenize( final Lexeme lexeme, final Consumer<Token> consumer ) {
		- mLexemes.add( lexeme );
		- tokenize( consumer );
		- }
		+ private void tokenize( final Lexeme lexeme,
		+ final CircularFifoQueue<Lexeme> lexemes,
		+ final Consumer<Token> consumer,
		+ final List<Lexeme[]> unresolved ) {
		+ // Add the next lexeme to tokenize into the queue for immediate processing.
		+ lexemes.add( lexeme );

		- private void tokenize( final Consumer<Token> consumer ) {
		- final var lex1 = mLexemes.get( 0 );
		- final var lex2 = mLexemes.get( 1 );
		- final var lex3 = mLexemes.get( 2 );
		+ final var lex1 = lexemes.get( 0 );
		+ final var lex2 = lexemes.get( 1 );
		+ final var lex3 = lexemes.get( 2 );

		if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( WORD ) &&
		lex1.anyType( WORD, PERIOD, NUMBER ) ) {
		// Examples: y'all, Ph.D.'ll, 20's, she's
		consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
		- flush( mLexemes );
		+ flush( lexemes );
		}
		else if( lex1.isType( QUOTE_SINGLE ) && lex3.isType( QUOTE_SINGLE ) &&
		"n".equalsIgnoreCase( lex2.toString( mText ) ) ) {
		// I.e., 'n'
		consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
		consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
		- flush( mLexemes );
		+ flush( lexemes );

		// Remove the first apostrophe so that it isn't emitted twice.
		- mQuotationMarks.remove( mQuotationMarks.size() - 1 );
		+ unresolved.remove( unresolved.size() - 1 );
		}
		else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) {

		}
		else if( lex2.isType( WORD ) && lex3.isType( QUOTE_SINGLE ) &&
		- mContractions.endedUnambiguously( lex2.toString( mText ) ) ) {
		+ sContractions.endedUnambiguously( lex2.toString( mText ) ) ) {
		// E.g., thinkin'
		consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );

		}
		else if( lex2.anyType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
		- mQuotationMarks.add( new Lexeme[]{lex1, lex2, lex3} );
		+ // After tokenizing, the parser will attempt to resolve ambiguities.
		+ unresolved.add( new Lexeme[]{lex1, lex2, lex3} );
		}
		}

		+ /**
		+ * Overwrites the {@link CircularFifoQueue}'s contents with start-of-text
		+ * indicators.
		+ *
		+ * @param lexemes The queue to overflow.
		+ */
		private void flush( final CircularFifoQueue<Lexeme> lexemes ) {
		lexemes.add( SOT );

lib/src/test/java/com/keenwrite/quotes/ParserTest.java

		import static org.junit.jupiter.api.Assertions.assertEquals;

		+/**
		+ * Test that all unambiguous apostrophes are emitted once.
		+ */
		class ParserTest {
		private final static Map<String, Map<TokenType, Integer>> TEST_CASES =

		But this I heard her say, and can't be wrong
		And all may think which way their judgments lean 'em,
		- ''T is strange—the Hebrew noun which means "I am,"
		+ ''T is strange---the Hebrew noun which means "I am,"
		The English always use to govern d--n.'
		""",

lib/src/test/java/com/keenwrite/quotes/StoryTest.java

		+package com.keenwrite.quotes;
		+
		+public class StoryTest {
		+
		+}

lib/src/test/resources/com/keenwrite/quotes/smartypants.txt

		‘It's a beautiful day!’

		+Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season.
		+Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season.
		+
		# ########################################################################
		# Possessives

		# Ambiguous (no solution)
		# ########################################################################
		-Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season.
		-Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season.
		-
		She said, 'Cause of death?\n\n'Old age, there's no doubt.'
		She said, 'Cause of death?\n\n‘Old age, there's no doubt.’

Finish parser, eliminate side-effects

Author	Dave Jarvis <email>
Date	2021-06-12 18:56:13 GMT-0700
Commit	abfd26bad7d2fef1fa863b2cf4b27722a180cffd
Parent	07f9f01
Delta	78 lines added, 41 lines removed, 37-line increase