Replace ambiguous closing quotes when no ambiguous opening ones

Author	Dave Jarvis <email>
Date	2021-06-05 18:46:22 GMT-0700
Commit	5749c9871072dc62920d35478ffc5b0bbbfbeeef
Parent	6880f3c

lib/src/main/java/com/keenwrite/quotes/Contractions.java

		*/
		private static final Set<String> BEGAN_UNAMBIGUOUS = Set.of(
		- "aporth", "boutcha", "boutchu", "cept", "dillo", "e'll", "fraid",
		- "gainst", "n", "neath", "nother", "onna", "onna'", "pon", "s", "sblood",
		- "scuse", "sfar", "sfoot", "t", "taint", "tain't", "til", "tis", "tisn't",
		- "tshall", "twas", "twasn't", "tween", "twere", "tweren't", "twixt",
		- "twon't", "twou'd", "twou'dn't", "twould", "twouldn't", "ve"
		+// "aporth", "boutcha", "boutchu", "cept", "dillo", "e", "e'll", "e's",
		+// "fraid", "gainst", "n", "neath", "nother", "onna", "onna'", "pon", "s",
		+// "sblood", "scuse", "sfar", "sfoot", "t", "taint", "tain't", "til", "tis",
		+// "tisn't", "tshall", "twas", "twasn't", "tween", "twere", "tweren't",
		+// "twixt", "twon't", "twou'd", "twou'dn't", "twould", "twouldn't", "ve"
		+
		+ "aporth", "boutcha", "boutchu", "cept", "dillo", "fraid", "gainst",
		+ "n", "neath", "nother", "onna", "onna'", "pon", "s", "sblood", "scuse",
		+ "sfar", "sfoot", "t", "taint", "tain", "til", "tis", "tisn", "tshall",
		+ "twas", "twasn", "tween", "twere", "tweren", "twixt", "twon", "twou",
		+ "twould", "twouldn", "ve"
		);

		/**
		* Words having a straight apostrophe that may be either part of a
		* contraction or a word that stands alone beside an opening single quote.
		*/
		private static final Set<String> BEGAN_AMBIGUOUS = Set.of(
		- "bout", "choo", "ere", "e", "e's", "fro", "ho", "kay", "lo",
		- "re", "sup", "twill", "um", "zat"
		+ // about\|boxing match
		+ "bout",
		+ // what you\|choo choo train
		+ "choo",
		+ // he\|e pluribus unum
		+ "e",
		+ // them\|emily
		+ "em",
		+ // here\|earlier
		+ "ere",
		+ // afro\|to and fro
		+ "fro",
		+ // whore\|stop
		+ "ho",
		+ // okay\|letter K
		+ "kay",
		+ // hello\|lo and behold
		+ "lo",
		+ // are\|regarding
		+ "re",
		+ // what's up\|to sup
		+ "sup",
		+ // it will\|twill fabric
		+ "twill",
		+ // them\|utterance
		+ "um",
		+ // is that\|Iranian village
		+ "zat"
		+ );
		+
		+ private static final Set<String> ENDED_AMBIGUOUS = Set.of(
		+ // he\|a
		+ "a",
		+ // and\|an
		+ "an",
		+ // give\|martial arts garment
		+ "gi",
		+ // in\|I
		+ "i",
		+ // of\|letter o
		+ "o"
		+ );
		+
		+ private static final Set<String> ENDED_UNAMBIGUOUS = Set.of(
		+ // old
		+ "ol",
		+ // the
		+ "th"
		);


		* contractions.
		*/
		- public static boolean beginsUnambiguously( final String word ) {
		+ public static boolean contractionBeganUnambiguously( final String word ) {
		assert word != null;
		return BEGAN_UNAMBIGUOUS.contains( word.toLowerCase() );

		* contractions.
		*/
		- public static boolean beginsAmbiguously( final String word ) {
		+ public static boolean contractionBeganAmbiguously( final String word ) {
		assert word != null;
		return BEGAN_AMBIGUOUS.contains( word.toLowerCase() );
		+ }
		+
		+ public static boolean contractionEndedAmbiguously( final String word ) {
		+ return ENDED_AMBIGUOUS.contains( word ) \|\| word.endsWith( "s" ) \|\|
		+ word.endsWith( "n" ) \|\| word.endsWith( "z" ) \|\|
		+ word.endsWith( "x" ) \|\| word.endsWith( "ch" );
		+ }
		+
		+ public static boolean contractionEndedUnambiguously( final String word ) {
		+ return ENDED_UNAMBIGUOUS.contains( word );
		}
		}

lib/src/main/java/com/keenwrite/quotes/Lexeme.java

		*/
		private Lexeme( final LexemeType type, final int began, final int ended ) {
		+ assert type != null;
		+ assert began >= 0 \|\| ended == -2;
		+ assert ended >= began \|\| ended == -2;
		+
		mType = type;
		mBegan = began;

lib/src/main/java/com/keenwrite/quotes/Parser.java

		import java.util.function.Consumer;

		-import static com.keenwrite.quotes.Contractions.beginsUnambiguously;
		+import static com.keenwrite.quotes.Contractions.*;
		import static com.keenwrite.quotes.Lexeme.EOT;
		import static com.keenwrite.quotes.Lexeme.SOT;

		* <ol>
		* <li>Single prime (NUMBER ') -- 2'</li>
		- * <li>Double prime (NUMBER ") -- 7.5"</li>
		+ * <li>Double prime (NUMBER ") -- 2"</li>
		+ * <li>Double prime (NUMBER '') -- 2''</li>
		* </ol>
		* Next, handle balanced double quotes:
		* <ol>
		* <li>Double quotes (" (WORD (SPACE+ WORD)? (PUNCT \| PERIOD))+ ")</li>
		* <li>Single quotes (' (WORD (SPACE+ WORD)? (PUNCT \| PERIOD))+ ')</li>
		* </ol>
		*/
		public class Parser {
		+ /**
		+ * Single quotes preceded by these {@link LexemeType}s may be opening quotes.
		+ */
		+ private static final LexemeType[] LEADING_OPENING_QUOTE_SINGLE =
		+ new LexemeType[]{SPACE, HYPHEN, QUOTE_DOUBLE};
		+
		+ /**
		+ * Single quotes succeeded by these {@link LexemeType}s may be opening quotes.
		+ */
		+ private static final LexemeType[] TRAILING_OPENING_QUOTE_SINGLE =
		+ new LexemeType[]{WORD, NUMBER, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE};
		+
		+ /**
		+ * Single quotes preceded by these {@link LexemeType}s may be closing quotes.
		+ */
		+ private static final LexemeType[] LEADING_CLOSING_QUOTE_SINGLE =
		+ new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE};
		+
		+ /**
		+ * Single quotes succeeded by these {@link LexemeType}s may be closing quotes.
		+ */
		+ private static final LexemeType[] TRAILING_CLOSING_QUOTE_SINGLE =
		+ new LexemeType[]{SPACE, HYPHEN, EOL};
		+
		+ /**
		+ * Double quotes preceded by these {@link LexemeType}s may be opening quotes.
		+ */
		+ private static final LexemeType[] LEADING_OPENING_QUOTE_DOUBLE =
		+ new LexemeType[]{SPACE, HYPHEN, QUOTE_SINGLE};
		+
		+ /**
		+ * Double quotes succeeded by these {@link LexemeType}s may be opening quotes.
		+ */
		+ private static final LexemeType[] TRAILING_OPENING_QUOTE_DOUBLE =
		+ new LexemeType[]{WORD, NUMBER, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE};
		+
		+ /**
		+ * Double quotes preceded by these {@link LexemeType}s may be closing quotes.
		+ */
		+ private static final LexemeType[] LEADING_CLOSING_QUOTE_DOUBLE =
		+ new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_SINGLE};
		+
		+ /**
		+ * Double quotes succeeded by these {@link LexemeType}s may be closing quotes.
		+ */
		+ private static final LexemeType[] TRAILING_CLOSING_QUOTE_DOUBLE =
		+ new LexemeType[]{SPACE, HYPHEN, QUOTE_SINGLE, EOL};
		+
		/**
		* The text to parse. A reference is required as a minor optimization in


		// By loop's end, the lexemes list contains tokens for all except the
		- // final two elements (from tokenizing in triplets). Try emitting the last
		- // two unprocessed lexemes as tokens.
		+ // final two elements (from tokenizing in triplets). Tokenize the remaining
		+ // unprocessed lexemes.
		tokenize( EOT, consumer );
		tokenize( EOT, consumer );
		-
		- // Lexemes not emitted as tokens may be ambiguous.

		- //mQuotationMarks.sort( comparingInt( lexemes -> lexemes[ 0 ].began() ) );
		+ // Some non-emitted tokenized lexemes may be ambiguous.
		+ final var ambiguousOpeningQuotes = new ArrayList<Lexeme>( 16 );
		+ final var ambiguousClosingQuotes = new ArrayList<Lexeme>( 16 );

		- //
		+ // Emit unambiguous single quotes; collect the ambiguous opening/closing ones.
		+ for( var i = mQuotationMarks.iterator(); i.hasNext(); ) {
		+ final var quotes = i.next();
		+ final var lex1 = quotes[ 0 ];
		+ final var lex2 = quotes[ 1 ];
		+ final var lex3 = quotes[ 2 ];

		+ if( lex2.isType( QUOTE_SINGLE ) ) {
		+ final var word1 = lex1 == SOT ? "" : lex1.toString( mText );
		+ final var word3 = lex3 == EOT ? "" : lex3.toString( mText );

		- for( final var unparsed : mQuotationMarks ) {
		- System.out.println( "unparsed: " + unparsed[ 0 ] + " " + unparsed[ 1 ] + " " + unparsed[ 2 ] );
		+ if( contractionBeganAmbiguously( word3 ) ) {
		+ // The contraction is uncertain until a closing quote is found that
		+ // balances this single quote.
		+ ambiguousOpeningQuotes.add( lex2 );
		+ }
		+ else if( contractionBeganUnambiguously( word3 ) ) {
		+ // The quote mark forms a word that does not stand alone from its
		+ // contraction. For example, twas is not a word: it's 'twas.
		+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
		+ i.remove();
		+ }
		+ else if( contractionEndedAmbiguously( word1 ) ) {
		+ ambiguousClosingQuotes.add( lex2 );
		+ }
		+ else if( contractionEndedUnambiguously( word1 ) ) {
		+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
		+ i.remove();
		+ }
		+ else if( (lex1.isSot() \|\| lex1.anyType( LEADING_OPENING_QUOTE_SINGLE ))
		+ && lex3.anyType( TRAILING_OPENING_QUOTE_SINGLE ) ) {
		+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) );
		+ }
		+ else if( lex1.anyType( LEADING_CLOSING_QUOTE_SINGLE ) &&
		+ (lex3.isEot() \|\| lex3.anyType( TRAILING_CLOSING_QUOTE_SINGLE )) ) {
		+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
		+ }
		+ }
		}

		- // Create/convert a list of all unambiguous quotations.
		- // Let TERM ::= (, \| ; \| ! \| ? \| .)
		- // Find unambiguous quotations by searching for:
		- // ' WORD ('* SPACE+ WORD)* TERM '
		- // In other words, when a ' WORD is encountered, push the ' onto a stack.
		- // If ' WORD is encountered, pop the stack and push the new ' onto it.
		- // If TERM ' is encountered, push the new ' onto it.
		- // This algorithm may have to push " WORD and " TERM as well, to account
		- // for nested sentences.
		+ if( ambiguousOpeningQuotes.size() == 0 ) {
		+ ambiguousClosingQuotes.forEach(
		+ lex -> consumer.accept( new Token( QUOTE_APOSTROPHE, lex ) )
		+ );
		+ }
		+ else {
		+ System.out.println( "ambig opening: " + ambiguousOpeningQuotes.size() );
		+ System.out.println( "ambig closing: " + ambiguousClosingQuotes.size() );

		- // Convert remaining single quotes to apostrophes.
		+ // No opening quotes means no closing quotes, so the remaining single quotes
		+ // are apostrophes (handled above). NOTE(review): == compares references.
		+ if( ambiguousOpeningQuotes == ambiguousClosingQuotes ) {
		+ System.out.println( "MATCHING OPENING/CLOSING SINGLE QUOTES" );
		+ }
		+ }
		}


		else if( lex1.isType( QUOTE_SINGLE ) && lex2.isType( WORD ) &&
		// E.g., 'twas
		- beginsUnambiguously( lex2.toString( mText ) ) ) {
		+ contractionBeganUnambiguously( lex2.toString( mText ) ) ) {
		consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
		}

		}
		else if( lex2.isType( QUOTE_DOUBLE ) &&
		- (lex1.isSot() \|\| lex1.anyType( SPACE, HYPHEN, QUOTE_SINGLE )) &&
		- lex3.anyType( WORD, NUMBER, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
		+ (lex1.isSot() \|\| lex1.anyType( LEADING_OPENING_QUOTE_DOUBLE )) &&
		+ lex3.anyType( TRAILING_OPENING_QUOTE_DOUBLE ) ) {
		// Examples: "'Twas, "", "...
		consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) );
		}
		else if( lex2.isType( QUOTE_DOUBLE ) &&
		- lex1.anyType( WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_SINGLE ) &&
		- (lex3.anyType( SPACE, HYPHEN, QUOTE_SINGLE, EOL ) \|\| lex3.isEot()) ) {
		+ lex1.anyType( LEADING_CLOSING_QUOTE_DOUBLE ) &&
		+ (lex3.isEot() \|\| lex3.anyType( TRAILING_CLOSING_QUOTE_DOUBLE )) ) {
		consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) );
		}

lib/src/main/java/com/keenwrite/quotes/SmartQuotes.java


		for( final var token : tokens ) {
		+ assert position <= token.began();
		+
		result.append( text, position, token.began() );
		result.append( REPLACEMENTS.get( token.getType() ) );

lib/src/test/java/com/keenwrite/quotes/SmartQuotesTest.java

		final var fixer = new SmartQuotes();
		out.println( fixer.replace( "\"Didn' get th' message.\"" ) );
		+
		+ out.println( "-------" );
		+
		+ out.println( fixer.replace( "'Bout that time I says, 'Boys! I been " +
		+ "thinkin' 'bout th' Universe.'"));
		+
		+ out.println( "-------" );
		+
		out.println( fixer.replace( "\"John asked, 'What are you, beyond " +
		"\"somethin' shiny?\"'\" said Fred.\n" ) );

		expected = line;
		}
		+
		+ System.out.println( "EXPECT:" + expected );

		final var actual = parser.apply( testLine );

lib/src/test/resources/com/keenwrite/quotes/smartypants.txt

		With--“air quotes”--and dashes.

		-"Not "quite" what you expected?"
		-“Not “quite” what you expected?”
		-
		"Not all open quotes are closed...
		“Not all open quotes are closed...

		With--'imaginary'--dashes.
		With--‘imaginary’--dashes.
		-
		-'Not 'quite' what you expected?'
		-‘Not ‘quite’ what you expected?’

		''Cause I don't like it, 's why,' said Pat.
		‘'Cause I don't like it, 's why,’ said Pat.

		'It's a beautiful day!'
		‘It's a beautiful day!’
		-
		-'He said, "Thinkin'."'
		-‘He said, “Thinkin’.”’
		-
		-"She said, 'Llamas'll languish, they'll--
		-“She said, ‘Llamas'll languish, they'll--

		# ########################################################################

		“I heard she said, ‘That's Sam's’,” said the Sams' cat.

		-"'Janes' said, ''E'll be spooky, Sam's son with the jack-o'-lantern!'" said the O'Mally twins'---y'know---ghosts in unison.
		-“‘Janes' said, ‘'E'll be spooky, Sam's son with the jack-o'-lantern!’” said the O'Mally twins'---y'know---ghosts in unison.
		+"'Janes said, ''E'll be spooky, Sam's son with the jack-o'-lantern!'" said the O'Mally twins'---y'know---ghosts in unison.
		+“‘Janes said, ‘'E'll be spooky, Sam's son with the jack-o'-lantern!’” said the O'Mally twins'---y'know---ghosts in unison.

		'He's at Sams'

		‘A’ ‘B’ and ‘C’ are letters.

		-'Oak,' 'elm,' and 'beech' are names of trees. So is 'pine.'
		+'Oak,' 'elm,'and 'beech' are names of trees. So is 'pine.'
		‘Oak,’ ‘elm,’ and ‘beech’ are names of trees. So is ‘pine.’

		'He said, \"I want to go.\"' Were you alive in the 70's?
		‘He said, "I want to go."’ Were you alive in the 70's?

		"That's a 'magic' sock."
		“That's a ‘magic’ sock.”
		+
		+'That's a "magic" sock.'
		+‘That's a “magic” sock.’

		Website! Company Name, Inc. ("Company Name" or "Company") recommends reading the following terms and conditions, carefully:
		Website! Company Name, Inc. (“Company Name” or “Company”) recommends reading the following terms and conditions, carefully:

		Website! Company Name, Inc. ('Company Name' or 'Company') recommends reading the following terms and conditions, carefully:
		Website! Company Name, Inc. (‘Company Name’ or ‘Company’) recommends reading the following terms and conditions, carefully:

		Workin' hard
		Workin' hard
		+
		+'He said, "Thinkin'."'
		+‘He said, “Thinkin’.”’
		+
		+"She said, 'Llamas'll languish, they'll--
		+“She said, ‘Llamas'll languish, they'll--

		'70s are my favorite numbers,' she said.

Delta	210 lines added, 49 lines removed, 161-line increase