Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Replace ambiguous closing quotes when no ambiguous opening ones

Author: Dave Jarvis <email>
Date: 2021-06-05 18:46:22 GMT-0700
Commit: 5749c9871072dc62920d35478ffc5b0bbbfbeeef
Parent: 6880f3c
lib/src/main/java/com/keenwrite/quotes/Contractions.java
*/
private static final Set<String> BEGAN_UNAMBIGUOUS = Set.of(
- "aporth", "boutcha", "boutchu", "cept", "dillo", "e'll", "fraid",
- "gainst", "n", "neath", "nother", "onna", "onna'", "pon", "s", "sblood",
- "scuse", "sfar", "sfoot", "t", "taint", "tain't", "til", "tis", "tisn't",
- "tshall", "twas", "twasn't", "tween", "twere", "tweren't", "twixt",
- "twon't", "twou'd", "twou'dn't", "twould", "twouldn't", "ve"
+// "aporth", "boutcha", "boutchu", "cept", "dillo", "e", "e'll", "e's",
+// "fraid", "gainst", "n", "neath", "nother", "onna", "onna'", "pon", "s",
+// "sblood", "scuse", "sfar", "sfoot", "t", "taint", "tain't", "til", "tis",
+// "tisn't", "tshall", "twas", "twasn't", "tween", "twere", "tweren't",
+// "twixt", "twon't", "twou'd", "twou'dn't", "twould", "twouldn't", "ve"
+
+ "aporth", "boutcha", "boutchu", "cept", "dillo", "fraid", "gainst",
+ "n", "neath", "nother", "onna", "onna'", "pon", "s", "sblood", "scuse",
+ "sfar", "sfoot", "t", "taint", "tain", "til", "tis", "tisn", "tshall",
+ "twas", "twasn", "tween", "twere", "tweren", "twixt", "twon", "twou",
+ "twould", "twouldn", "ve"
);
/**
* Words having a straight apostrophe that may be either part of a
* contraction or a word that stands alone beside an opening single quote.
*/
private static final Set<String> BEGAN_AMBIGUOUS = Set.of(
- "bout", "choo", "ere", "e", "e's", "fro", "ho", "kay", "lo",
- "re", "sup", "twill", "um", "zat"
+ // about|boxing match
+ "bout",
+ // what you|choo choo train
+ "choo",
+ // he|e pluribus unum
+ "e",
+ // them|emily
+ "em",
+ // here|earlier
+ "ere",
+ // afro|to and fro
+ "fro",
+ // whore|stop
+ "ho",
+ // okay|letter K
+ "kay",
+ // lo|lo and behold
+ "lo",
+ // are|regarding
+ "re",
+ // what's up|to sup
+ "sup",
+ // it will|twill fabric
+ "twill",
+ // them|utterance
+ "um",
+ // is that|Iranian village
+ "zat"
+ );
+
+ private static final Set<String> ENDED_AMBIGUOUS = Set.of(
+ // he|a
+ "a",
+ // and|an
+ "an",
+ // give|martial arts garment
+ "gi",
+ // in|I
+ "i",
+ // of|letter o
+ "o"
+ );
+
+ private static final Set<String> ENDED_UNAMBIGUOUS = Set.of(
+ // old
+ "ol",
+ // the
+ "th"
);
* contractions.
*/
- public static boolean beginsUnambiguously( final String word ) {
+ public static boolean contractionBeganUnambiguously( final String word ) {
assert word != null;
return BEGAN_UNAMBIGUOUS.contains( word.toLowerCase() );
* contractions.
*/
- public static boolean beginsAmbiguously( final String word ) {
+ public static boolean contractionBeganAmbiguously( final String word ) {
assert word != null;
return BEGAN_AMBIGUOUS.contains( word.toLowerCase() );
+ }
+
+ public static boolean contractionEndedAmbiguously( final String word ) {
+ return ENDED_AMBIGUOUS.contains( word ) || word.endsWith( "s" ) ||
+ word.endsWith( "n" ) || word.endsWith( "z" ) ||
+ word.endsWith( "x" ) || word.endsWith( "ch" );
+ }
+
+ public static boolean contractionEndedUnambiguously( final String word ) {
+ return ENDED_UNAMBIGUOUS.contains( word );
}
}
lib/src/main/java/com/keenwrite/quotes/Lexeme.java
*/
private Lexeme( final LexemeType type, final int began, final int ended ) {
+ assert type != null;
+ assert began >= 0 || ended == -2;
+ assert ended >= began || ended == -2;
+
mType = type;
mBegan = began;
lib/src/main/java/com/keenwrite/quotes/Parser.java
import java.util.function.Consumer;
-import static com.keenwrite.quotes.Contractions.beginsUnambiguously;
+import static com.keenwrite.quotes.Contractions.*;
import static com.keenwrite.quotes.Lexeme.EOT;
import static com.keenwrite.quotes.Lexeme.SOT;
* <ol>
* <li>Single prime (NUMBER ') -- 2'</li>
- * <li>Double prime (NUMBER ") -- 7.5"</li>
+ * <li>Double prime (NUMBER ") -- 2"</li>
+ * <li>Double prime (NUMBER '') -- 2''</li>
* </ol>
* Next, handle balanced double quotes:
* <ol>
* <li>Double quotes (" (WORD (SPACE+ WORD)? (PUNCT | PERIOD))+ ")</li>
* <li>Single quotes (' (WORD (SPACE+ WORD)? (PUNCT | PERIOD))+ ')</li>
* </ol>
*/
public class Parser {
+ /**
+ * Single quotes preceded by these {@link LexemeType}s may be opening quotes.
+ */
+ private static final LexemeType[] LEADING_OPENING_QUOTE_SINGLE =
+ new LexemeType[]{SPACE, HYPHEN, QUOTE_DOUBLE};
+
+ /**
+ * Single quotes succeeded by these {@link LexemeType}s may be opening quotes.
+ */
+ private static final LexemeType[] TRAILING_OPENING_QUOTE_SINGLE =
+ new LexemeType[]{WORD, NUMBER, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE};
+
+ /**
+ * Single quotes preceded by these {@link LexemeType}s may be closing quotes.
+ */
+ private static final LexemeType[] LEADING_CLOSING_QUOTE_SINGLE =
+ new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE};
+
+ /**
+ * Single quotes succeeded by these {@link LexemeType}s may be closing quotes.
+ */
+ private static final LexemeType[] TRAILING_CLOSING_QUOTE_SINGLE =
+ new LexemeType[]{SPACE, HYPHEN, EOL};
+
+ /**
+ * Double quotes preceded by these {@link LexemeType}s may be opening quotes.
+ */
+ private static final LexemeType[] LEADING_OPENING_QUOTE_DOUBLE =
+ new LexemeType[]{SPACE, HYPHEN, QUOTE_SINGLE};
+
+ /**
+ * Double quotes succeeded by these {@link LexemeType}s may be opening quotes.
+ */
+ private static final LexemeType[] TRAILING_OPENING_QUOTE_DOUBLE =
+ new LexemeType[]{WORD, NUMBER, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE};
+
+ /**
+ * Double quotes preceded by these {@link LexemeType}s may be closing quotes.
+ */
+ private static final LexemeType[] LEADING_CLOSING_QUOTE_DOUBLE =
+ new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_SINGLE};
+
+ /**
+ * Double quotes succeeded by these {@link LexemeType}s may be closing quotes.
+ */
+ private static final LexemeType[] TRAILING_CLOSING_QUOTE_DOUBLE =
+ new LexemeType[]{SPACE, HYPHEN, QUOTE_SINGLE, EOL};
+
/**
* The text to parse. A reference is required as a minor optimization in
// By loop's end, the lexemes list contains tokens for all except the
- // final two elements (from tokenizing in triplets). Try emitting the last
- // two unprocessed lexemes as tokens.
+ // final two elements (from tokenizing in triplets). Tokenize the remaining
+ // unprocessed lexemes.
tokenize( EOT, consumer );
tokenize( EOT, consumer );
-
- // Lexemes not emitted as tokens may be ambiguous.
- //mQuotationMarks.sort( comparingInt( lexemes -> lexemes[ 0 ].began() ) );
+ // Some non-emitted tokenized lexemes may be ambiguous.
+ final var ambiguousOpeningQuotes = new ArrayList<Lexeme>( 16 );
+ final var ambiguousClosingQuotes = new ArrayList<Lexeme>( 16 );
- //
+ // Count the number of ambiguous and non-ambiguous open single quotes.
+ for( var i = mQuotationMarks.iterator(); i.hasNext(); ) {
+ final var quotes = i.next();
+ final var lex1 = quotes[ 0 ];
+ final var lex2 = quotes[ 1 ];
+ final var lex3 = quotes[ 2 ];
+ if( lex2.isType( QUOTE_SINGLE ) ) {
+ final var word1 = lex1 == SOT ? "" : lex1.toString( mText );
+ final var word3 = lex3 == EOT ? "" : lex3.toString( mText );
- for( final var unparsed : mQuotationMarks ) {
- System.out.println( "unparsed: " + unparsed[ 0 ] + " " + unparsed[ 1 ] + " " + unparsed[ 2 ] );
+ if( contractionBeganAmbiguously( word3 ) ) {
+ // The contraction is uncertain until closing quote is found that
+ // balances this single quote.
+ ambiguousOpeningQuotes.add( lex2 );
+ }
+ else if( contractionBeganUnambiguously( word3 ) ) {
+ // The quote mark forms a word that does not stand alone from its
+ // contraction. For example, twas is not a word: it's 'twas.
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
+ i.remove();
+ }
+ else if( contractionEndedAmbiguously( word1 ) ) {
+ ambiguousClosingQuotes.add( lex2 );
+ }
+ else if( contractionEndedUnambiguously( word1 ) ) {
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
+ i.remove();
+ }
+ else if( (lex1.isSot() || lex1.anyType( LEADING_OPENING_QUOTE_SINGLE ))
+ && lex3.anyType( TRAILING_OPENING_QUOTE_SINGLE ) ) {
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) );
+ }
+ else if( lex1.anyType( LEADING_CLOSING_QUOTE_SINGLE ) &&
+ (lex3.isEot() || lex3.anyType( TRAILING_CLOSING_QUOTE_SINGLE )) ) {
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
+ }
+ }
}
- // Create/convert a list of all unambiguous quotations.
- // Let TERM ::= (, | ; | ! | ? | .)
- // Find unambiguous quotations by searching for:
- // ' WORD ('* SPACE+ WORD)* TERM '
- // In other words, when a ' WORD is encountered, push the ' onto a stack.
- // If ' WORD is encountered, pop the stack and push the new ' onto it.
- // If TERM ' is encountered, push the new ' onto it.
- // This algorithm may have to push " WORD and " TERM as well, to account
- // for nested sentences.
+ if( ambiguousOpeningQuotes.size() == 0 ) {
+ ambiguousClosingQuotes.forEach(
+ lex -> consumer.accept( new Token( QUOTE_APOSTROPHE, lex ) )
+ );
+ }
+ else {
+ System.out.println( "ambig opening: " + ambiguousOpeningQuotes.size() );
+ System.out.println( "ambig closing: " + ambiguousClosingQuotes.size() );
- // Convert remaining single quotes to apostrophes.
+ // No opening quotes, then there are no closing quotes, which
+ // means the remaining single quotes are all apostrophes.
+ if( ambiguousOpeningQuotes == ambiguousClosingQuotes ) {
+ System.out.println( "MATCHING OPENING/CLOSING SINGLE QUOTES" );
+ }
+ }
}
else if( lex1.isType( QUOTE_SINGLE ) && lex2.isType( WORD ) &&
// E.g., 'twas
- beginsUnambiguously( lex2.toString( mText ) ) ) {
+ contractionBeganUnambiguously( lex2.toString( mText ) ) ) {
consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
}
}
else if( lex2.isType( QUOTE_DOUBLE ) &&
- (lex1.isSot() || lex1.anyType( SPACE, HYPHEN, QUOTE_SINGLE )) &&
- lex3.anyType( WORD, NUMBER, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
+ (lex1.isSot() || lex1.anyType( LEADING_OPENING_QUOTE_DOUBLE )) &&
+ lex3.anyType( TRAILING_OPENING_QUOTE_DOUBLE ) ) {
// Examples: "'Twas, "", "...
consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) );
}
else if( lex2.isType( QUOTE_DOUBLE ) &&
- lex1.anyType( WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_SINGLE ) &&
- (lex3.anyType( SPACE, HYPHEN, QUOTE_SINGLE, EOL ) || lex3.isEot()) ) {
+ lex1.anyType( LEADING_CLOSING_QUOTE_DOUBLE ) &&
+ (lex3.isEot() || lex3.anyType( TRAILING_CLOSING_QUOTE_DOUBLE )) ) {
consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) );
}
lib/src/main/java/com/keenwrite/quotes/SmartQuotes.java
for( final var token : tokens ) {
+ assert position <= token.began();
+
result.append( text, position, token.began() );
result.append( REPLACEMENTS.get( token.getType() ) );
lib/src/test/java/com/keenwrite/quotes/SmartQuotesTest.java
final var fixer = new SmartQuotes();
out.println( fixer.replace( "\"Didn' get th' message.\"" ) );
+
+ out.println( "-------" );
+
+ out.println( fixer.replace( "'Bout that time I says, 'Boys! I been " +
+ "thinkin' 'bout th' Universe.'"));
+
+ out.println( "-------" );
+
out.println( fixer.replace( "\"John asked, 'What are you, beyond " +
"\"somethin' shiny?\"'\" said Fred.\n" ) );
expected = line;
}
+
+ System.out.println( "EXPECT:" + expected );
final var actual = parser.apply( testLine );
lib/src/test/resources/com/keenwrite/quotes/smartypants.txt
With--&ldquo;air quotes&rdquo;--and dashes.
-"Not "quite" what you expected?"
-&ldquo;Not &ldquo;quite&rdquo; what you expected?&rdquo;
-
"Not all open quotes are closed...
&ldquo;Not all open quotes are closed...
With--'imaginary'--dashes.
With--&lsquo;imaginary&rsquo;--dashes.
-
-'Not 'quite' what you expected?'
-&lsquo;Not &lsquo;quite&rsquo; what you expected?&rsquo;
''Cause I don't like it, 's why,' said Pat.
&lsquo;&apos;Cause I don't like it, &apos;s why,&rsquo; said Pat.
'It's a beautiful day!'
&lsquo;It&apos;s a beautiful day!&rsquo;
-
-'He said, "Thinkin'."'
-&lsquo;He said, &ldquo;Thinkin&rsquo;.&rdquo;&rsquo;
-
-"She said, 'Llamas'll languish, they'll--
-&ldquo;She said, &lsquo;Llamas&apos;ll languish, they&apos;ll--
# ########################################################################
&ldquo;I heard she said, &lsquo;That&apos;s Sam&apos;s&rsquo;,&rdquo; said the Sams&apos; cat.
-"'Janes' said, ''E'll be spooky, Sam's son with the jack-o'-lantern!'" said the O'Mally twins'---y'know---ghosts in unison.
-&ldquo;&lsquo;Janes&apos; said, &lsquo;&apos;E&apos;ll be spooky, Sam&apos;s son with the jack-o&apos;-lantern!&rsquo;&rdquo; said the O&apos;Mally twins&apos;---y&apos;know---ghosts in unison.
+"'Janes said, ''E'll be spooky, Sam's son with the jack-o'-lantern!'" said the O'Mally twins'---y'know---ghosts in unison.
+&ldquo;&lsquo;Janes said, &lsquo;&apos;E&apos;ll be spooky, Sam&apos;s son with the jack-o&apos;-lantern!&rsquo;&rdquo; said the O&apos;Mally twins&apos;---y&apos;know---ghosts in unison.
'He's at Sams'
&lsquo;A&rsquo; &lsquo;B&rsquo; and &lsquo;C&rsquo; are letters.
-'Oak,' 'elm,' and 'beech' are names of trees. So is 'pine.'
+'Oak,' 'elm,'and 'beech' are names of trees. So is 'pine.'
&lsquo;Oak,&rsquo; &lsquo;elm,&rsquo; and &lsquo;beech&rsquo; are names of trees. So is &lsquo;pine.&rsquo;
'He said, \"I want to go.\"' Were you alive in the 70's?
&lsquo;He said, "I want to go."&rsquo; Were you alive in the 70&apos;s?
"That's a 'magic' sock."
&ldquo;That&apos;s a &lsquo;magic&rsquo; sock.&rdquo;
+
+'That's a "magic" sock.'
+&lsquo;That&apos;s a &ldquo;magic&rdquo; sock.&rsquo;
Website! Company Name, Inc. ("Company Name" or "Company") recommends reading the following terms and conditions, carefully:
Website! Company Name, Inc. (&ldquo;Company Name&rdquo; or &ldquo;Company&rdquo;) recommends reading the following terms and conditions, carefully:
Website! Company Name, Inc. ('Company Name' or 'Company') recommends reading the following terms and conditions, carefully:
Website! Company Name, Inc. (&lsquo;Company Name&rsquo; or &lsquo;Company&rsquo;) recommends reading the following terms and conditions, carefully:
Workin' hard
Workin&apos; hard
+
+'He said, "Thinkin'."'
+&lsquo;He said, &ldquo;Thinkin&rsquo;.&rdquo;&rsquo;
+
+"She said, 'Llamas'll languish, they'll--
+&ldquo;She said, &lsquo;Llamas&apos;ll languish, they&apos;ll--
'70s are my favorite numbers,' she said.
Delta: 210 lines added, 49 lines removed, 161-line increase