| Author | Dave Jarvis <email> |
|---|---|
| Date | 2021-06-05 18:46:22 GMT-0700 |
| Commit | 5749c9871072dc62920d35478ffc5b0bbbfbeeef |
| Parent | 6880f3c |
| */ | ||
| private static final Set<String> BEGAN_UNAMBIGUOUS = Set.of( | ||
| - "aporth", "boutcha", "boutchu", "cept", "dillo", "e'll", "fraid", | ||
| - "gainst", "n", "neath", "nother", "onna", "onna'", "pon", "s", "sblood", | ||
| - "scuse", "sfar", "sfoot", "t", "taint", "tain't", "til", "tis", "tisn't", | ||
| - "tshall", "twas", "twasn't", "tween", "twere", "tweren't", "twixt", | ||
| - "twon't", "twou'd", "twou'dn't", "twould", "twouldn't", "ve" | ||
| +// "aporth", "boutcha", "boutchu", "cept", "dillo", "e", "e'll", "e's", | ||
| +// "fraid", "gainst", "n", "neath", "nother", "onna", "onna'", "pon", "s", | ||
| +// "sblood", "scuse", "sfar", "sfoot", "t", "taint", "tain't", "til", "tis", | ||
| +// "tisn't", "tshall", "twas", "twasn't", "tween", "twere", "tweren't", | ||
| +// "twixt", "twon't", "twou'd", "twou'dn't", "twould", "twouldn't", "ve" | ||
| + | ||
| + "aporth", "boutcha", "boutchu", "cept", "dillo", "fraid", "gainst", | ||
| + "n", "neath", "nother", "onna", "onna'", "pon", "s", "sblood", "scuse", | ||
| + "sfar", "sfoot", "t", "taint", "tain", "til", "tis", "tisn", "tshall", | ||
| + "twas", "twasn", "tween", "twere", "tweren", "twixt", "twon", "twou", | ||
| + "twould", "twouldn", "ve" | ||
| ); | ||
| /** | ||
| * Words having a straight apostrophe that may be either part of a | ||
| * contraction or a word that stands alone beside an opening single quote. | ||
| */ | ||
| private static final Set<String> BEGAN_AMBIGUOUS = Set.of( | ||
| - "bout", "choo", "ere", "e", "e's", "fro", "ho", "kay", "lo", | ||
| - "re", "sup", "twill", "um", "zat" | ||
| + // about|boxing match | ||
| + "bout", | ||
| + // what you|choo choo train | ||
| + "choo", | ||
| + // he|e pluribus unum | ||
| + "e", | ||
| + // them|emily | ||
| + "em", | ||
| + // here|earlier | ||
| + "ere", | ||
| + // afro|to and fro | ||
| + "fro", | ||
| + // whore|stop | ||
| + "ho", | ||
| + // okay|letter K | ||
| + "kay", | ||
| + // hello|lo and behold | ||
| + "lo", | ||
| + // are|regarding | ||
| + "re", | ||
| + // what's up|to sup | ||
| + "sup", | ||
| + // it will|twill fabric | ||
| + "twill", | ||
| + // them|utterance | ||
| + "um", | ||
| + // is that|Iranian village | ||
| + "zat" | ||
| + ); | ||
| + | ||
| + private static final Set<String> ENDED_AMBIGUOUS = Set.of( | ||
| + // he|a | ||
| + "a", | ||
| + // and|an | ||
| + "an", | ||
| + // give|martial arts garment | ||
| + "gi", | ||
| + // in|I | ||
| + "i", | ||
| + // of|letter o | ||
| + "o" | ||
| + ); | ||
| + | ||
| + private static final Set<String> ENDED_UNAMBIGUOUS = Set.of( | ||
| + // old | ||
| + "ol", | ||
| + // the | ||
| + "th" | ||
| ); | ||
| * contractions. | ||
| */ | ||
| - public static boolean beginsUnambiguously( final String word ) { | ||
| + public static boolean contractionBeganUnambiguously( final String word ) { | ||
| assert word != null; | ||
| return BEGAN_UNAMBIGUOUS.contains( word.toLowerCase() ); | ||
| * contractions. | ||
| */ | ||
| - public static boolean beginsAmbiguously( final String word ) { | ||
| + public static boolean contractionBeganAmbiguously( final String word ) { | ||
| assert word != null; | ||
| return BEGAN_AMBIGUOUS.contains( word.toLowerCase() ); | ||
| + } | ||
| + | ||
| + public static boolean contractionEndedAmbiguously( final String word ) { | ||
| + return ENDED_AMBIGUOUS.contains( word ) || word.endsWith( "s" ) || | ||
| + word.endsWith( "n" ) || word.endsWith( "z" ) || | ||
| + word.endsWith( "x" ) || word.endsWith( "ch" ); | ||
| + } | ||
| + | ||
| + public static boolean contractionEndedUnambiguously( final String word ) { | ||
| + return ENDED_UNAMBIGUOUS.contains( word ); | ||
| } | ||
| } | ||
| */ | ||
| private Lexeme( final LexemeType type, final int began, final int ended ) { | ||
| + assert type != null; | ||
| + assert began >= 0 || ended == -2; | ||
| + assert ended >= began || ended == -2; | ||
| + | ||
| mType = type; | ||
| mBegan = began; |
| import java.util.function.Consumer; | ||
| -import static com.keenwrite.quotes.Contractions.beginsUnambiguously; | ||
| +import static com.keenwrite.quotes.Contractions.*; | ||
| import static com.keenwrite.quotes.Lexeme.EOT; | ||
| import static com.keenwrite.quotes.Lexeme.SOT; | ||
| * <ol> | ||
| * <li>Single prime (NUMBER ') -- 2'</li> | ||
| - * <li>Double prime (NUMBER ") -- 7.5"</li> | ||
| + * <li>Double prime (NUMBER ") -- 2"</li> | ||
| + * <li>Double prime (NUMBER '') -- 2''</li> | ||
| * </ol> | ||
| * Next, handle balanced double quotes: | ||
| * <ol> | ||
| * <li>Double quotes (" (WORD (SPACE+ WORD)? (PUNCT | PERIOD))+ ")</li> | ||
| * <li>Single quotes (' (WORD (SPACE+ WORD)? (PUNCT | PERIOD))+ ')</li> | ||
| * </ol> | ||
| */ | ||
| public class Parser { | ||
| + /** | ||
| + * Single quotes preceded by these {@link LexemeType}s may be opening quotes. | ||
| + */ | ||
| + private static final LexemeType[] LEADING_OPENING_QUOTE_SINGLE = | ||
| + new LexemeType[]{SPACE, HYPHEN, QUOTE_DOUBLE}; | ||
| + | ||
| + /** | ||
| + * Single quotes succeeded by these {@link LexemeType}s may be opening quotes. | ||
| + */ | ||
| + private static final LexemeType[] TRAILING_OPENING_QUOTE_SINGLE = | ||
| + new LexemeType[]{WORD, NUMBER, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE}; | ||
| + | ||
| + /** | ||
| + * Single quotes preceded by these {@link LexemeType}s may be closing quotes. | ||
| + */ | ||
| + private static final LexemeType[] LEADING_CLOSING_QUOTE_SINGLE = | ||
| + new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE}; | ||
| + | ||
| + /** | ||
| + * Single quotes succeeded by these {@link LexemeType}s may be closing quotes. | ||
| + */ | ||
| + private static final LexemeType[] TRAILING_CLOSING_QUOTE_SINGLE = | ||
| + new LexemeType[]{SPACE, HYPHEN, EOL}; | ||
| + | ||
| + /** | ||
| + * Double quotes preceded by these {@link LexemeType}s may be opening quotes. | ||
| + */ | ||
| + private static final LexemeType[] LEADING_OPENING_QUOTE_DOUBLE = | ||
| + new LexemeType[]{SPACE, HYPHEN, QUOTE_SINGLE}; | ||
| + | ||
| + /** | ||
| + * Double quotes succeeded by these {@link LexemeType}s may be opening quotes. | ||
| + */ | ||
| + private static final LexemeType[] TRAILING_OPENING_QUOTE_DOUBLE = | ||
| + new LexemeType[]{WORD, NUMBER, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE}; | ||
| + | ||
| + /** | ||
| + * Double quotes preceded by these {@link LexemeType}s may be closing quotes. | ||
| + */ | ||
| + private static final LexemeType[] LEADING_CLOSING_QUOTE_DOUBLE = | ||
| + new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_SINGLE}; | ||
| + | ||
| + /** | ||
| + * Double quotes succeeded by these {@link LexemeType}s may be closing quotes. | ||
| + */ | ||
| + private static final LexemeType[] TRAILING_CLOSING_QUOTE_DOUBLE = | ||
| + new LexemeType[]{SPACE, HYPHEN, QUOTE_SINGLE, EOL}; | ||
| + | ||
| /** | ||
| * The text to parse. A reference is required as a minor optimization in | ||
| // By loop's end, the lexemes list contains tokens for all except the | ||
| - // final two elements (from tokenizing in triplets). Try emitting the last | ||
| - // two unprocessed lexemes as tokens. | ||
| + // final two elements (from tokenizing in triplets). Tokenize the remaining | ||
| + // unprocessed lexemes. | ||
| tokenize( EOT, consumer ); | ||
| tokenize( EOT, consumer ); | ||
| - | ||
| - // Lexemes not emitted as tokens may be ambiguous. | ||
| - //mQuotationMarks.sort( comparingInt( lexemes -> lexemes[ 0 ].began() ) ); | ||
| + // Some non-emitted tokenized lexemes may be ambiguous. | ||
| + final var ambiguousOpeningQuotes = new ArrayList<Lexeme>( 16 ); | ||
| + final var ambiguousClosingQuotes = new ArrayList<Lexeme>( 16 ); | ||
| - // | ||
| + // Count the number of ambiguous and non-ambiguous open single quotes. | ||
| + for( var i = mQuotationMarks.iterator(); i.hasNext(); ) { | ||
| + final var quotes = i.next(); | ||
| + final var lex1 = quotes[ 0 ]; | ||
| + final var lex2 = quotes[ 1 ]; | ||
| + final var lex3 = quotes[ 2 ]; | ||
| + if( lex2.isType( QUOTE_SINGLE ) ) { | ||
| + final var word1 = lex1 == SOT ? "" : lex1.toString( mText ); | ||
| + final var word3 = lex3 == EOT ? "" : lex3.toString( mText ); | ||
| - for( final var unparsed : mQuotationMarks ) { | ||
| - System.out.println( "unparsed: " + unparsed[ 0 ] + " " + unparsed[ 1 ] + " " + unparsed[ 2 ] ); | ||
| + if( contractionBeganAmbiguously( word3 ) ) { | ||
| + // The contraction is uncertain until closing quote is found that | ||
| + // balances this single quote. | ||
| + ambiguousOpeningQuotes.add( lex2 ); | ||
| + } | ||
| + else if( contractionBeganUnambiguously( word3 ) ) { | ||
| + // The quote mark forms a word that does not stand alone from its | ||
| + // contraction. For example, twas is not a word: it's 'twas. | ||
| + consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); | ||
| + i.remove(); | ||
| + } | ||
| + else if( contractionEndedAmbiguously( word1 ) ) { | ||
| + ambiguousClosingQuotes.add( lex2 ); | ||
| + } | ||
| + else if( contractionEndedUnambiguously( word1 ) ) { | ||
| + consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); | ||
| + i.remove(); | ||
| + } | ||
| + else if( (lex1.isSot() || lex1.anyType( LEADING_OPENING_QUOTE_SINGLE )) | ||
| + && lex3.anyType( TRAILING_OPENING_QUOTE_SINGLE ) ) { | ||
| + consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) ); | ||
| + } | ||
| + else if( lex1.anyType( LEADING_CLOSING_QUOTE_SINGLE ) && | ||
| + (lex3.isEot() || lex3.anyType( TRAILING_CLOSING_QUOTE_SINGLE )) ) { | ||
| + consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); | ||
| + } | ||
| + } | ||
| } | ||
| - // Create/convert a list of all unambiguous quotations. | ||
| - // Let TERM ::= (, | ; | ! | ? | .) | ||
| - // Find unambiguous quotations by searching for: | ||
| - // ' WORD ('* SPACE+ WORD)* TERM ' | ||
| - // In other words, when a ' WORD is encountered, push the ' onto a stack. | ||
| - // If ' WORD is encountered, pop the stack and push the new ' onto it. | ||
| - // If TERM ' is encountered, push the new ' onto it. | ||
| - // This algorithm may have to push " WORD and " TERM as well, to account | ||
| - // for nested sentences. | ||
| + // Without any ambiguous opening quotes there can be no closing quotes, | ||
| + // which means the remaining ambiguous single quotes are all apostrophes. | ||
| + if( ambiguousOpeningQuotes.isEmpty() ) { | ||
| + ambiguousClosingQuotes.forEach( | ||
| + lex -> consumer.accept( new Token( QUOTE_APOSTROPHE, lex ) ) | ||
| + ); | ||
| + } | ||
| + else { | ||
| + System.out.println( "ambig opening: " + ambiguousOpeningQuotes.size() ); | ||
| + System.out.println( "ambig closing: " + ambiguousClosingQuotes.size() ); | ||
| - // Convert remaining single quotes to apostrophes. | ||
| + // Compare the counts: the two lists are distinct objects, so comparing | ||
| + // their references could never succeed. | ||
| + if( ambiguousOpeningQuotes.size() == ambiguousClosingQuotes.size() ) { | ||
| + System.out.println( "MATCHING OPENING/CLOSING SINGLE QUOTES" ); | ||
| + } | ||
| + } | ||
| } | ||
| else if( lex1.isType( QUOTE_SINGLE ) && lex2.isType( WORD ) && | ||
| // E.g., 'twas | ||
| - beginsUnambiguously( lex2.toString( mText ) ) ) { | ||
| + contractionBeganUnambiguously( lex2.toString( mText ) ) ) { | ||
| consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) ); | ||
| } | ||
| } | ||
| else if( lex2.isType( QUOTE_DOUBLE ) && | ||
| - (lex1.isSot() || lex1.anyType( SPACE, HYPHEN, QUOTE_SINGLE )) && | ||
| - lex3.anyType( WORD, NUMBER, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE ) ) { | ||
| + (lex1.isSot() || lex1.anyType( LEADING_OPENING_QUOTE_DOUBLE )) && | ||
| + lex3.anyType( TRAILING_OPENING_QUOTE_DOUBLE ) ) { | ||
| // Examples: "'Twas, "", "... | ||
| consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) ); | ||
| } | ||
| else if( lex2.isType( QUOTE_DOUBLE ) && | ||
| - lex1.anyType( WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_SINGLE ) && | ||
| - (lex3.anyType( SPACE, HYPHEN, QUOTE_SINGLE, EOL ) || lex3.isEot()) ) { | ||
| + lex1.anyType( LEADING_CLOSING_QUOTE_DOUBLE ) && | ||
| + (lex3.isEot() || lex3.anyType( TRAILING_CLOSING_QUOTE_DOUBLE )) ) { | ||
| consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) ); | ||
| } | ||
| for( final var token : tokens ) { | ||
| + assert position <= token.began(); | ||
| + | ||
| result.append( text, position, token.began() ); | ||
| result.append( REPLACEMENTS.get( token.getType() ) ); |
| final var fixer = new SmartQuotes(); | ||
| out.println( fixer.replace( "\"Didn' get th' message.\"" ) ); | ||
| + | ||
| + out.println( "-------" ); | ||
| + | ||
| + out.println( fixer.replace( "'Bout that time I says, 'Boys! I been " + | ||
| + "thinkin' 'bout th' Universe.'")); | ||
| + | ||
| + out.println( "-------" ); | ||
| + | ||
| out.println( fixer.replace( "\"John asked, 'What are you, beyond " + | ||
| "\"somethin' shiny?\"'\" said Fred.\n" ) ); | ||
| expected = line; | ||
| } | ||
| + | ||
| + System.out.println( "EXPECT:" + expected ); | ||
| final var actual = parser.apply( testLine ); | ||
| With--“air quotes”--and dashes. | ||
| -"Not "quite" what you expected?" | ||
| -“Not “quite” what you expected?” | ||
| - | ||
| "Not all open quotes are closed... | ||
| “Not all open quotes are closed... | ||
| With--'imaginary'--dashes. | ||
| With--‘imaginary’--dashes. | ||
| - | ||
| -'Not 'quite' what you expected?' | ||
| -‘Not ‘quite’ what you expected?’ | ||
| ''Cause I don't like it, 's why,' said Pat. | ||
| ‘'Cause I don't like it, 's why,’ said Pat. | ||
| 'It's a beautiful day!' | ||
| ‘It's a beautiful day!’ | ||
| - | ||
| -'He said, "Thinkin'."' | ||
| -‘He said, “Thinkin’.”’ | ||
| - | ||
| -"She said, 'Llamas'll languish, they'll-- | ||
| -“She said, ‘Llamas'll languish, they'll-- | ||
| # ######################################################################## | ||
| “I heard she said, ‘That's Sam's’,” said the Sams' cat. | ||
| -"'Janes' said, ''E'll be spooky, Sam's son with the jack-o'-lantern!'" said the O'Mally twins'---y'know---ghosts in unison. | ||
| -“‘Janes' said, ‘'E'll be spooky, Sam's son with the jack-o'-lantern!’” said the O'Mally twins'---y'know---ghosts in unison. | ||
| +"'Janes said, ''E'll be spooky, Sam's son with the jack-o'-lantern!'" said the O'Mally twins'---y'know---ghosts in unison. | ||
| +“‘Janes said, ‘'E'll be spooky, Sam's son with the jack-o'-lantern!’” said the O'Mally twins'---y'know---ghosts in unison. | ||
| 'He's at Sams' | ||
| ‘A’ ‘B’ and ‘C’ are letters. | ||
| -'Oak,' 'elm,' and 'beech' are names of trees. So is 'pine.' | ||
| +'Oak,' 'elm,' and 'beech' are names of trees. So is 'pine.' | ||
| ‘Oak,’ ‘elm,’ and ‘beech’ are names of trees. So is ‘pine.’ | ||
| 'He said, \"I want to go.\"' Were you alive in the 70's? | ||
| ‘He said, "I want to go."’ Were you alive in the 70's? | ||
| "That's a 'magic' sock." | ||
| “That's a ‘magic’ sock.” | ||
| + | ||
| +'That's a "magic" sock.' | ||
| +‘That's a “magic” sock.’ | ||
| Website! Company Name, Inc. ("Company Name" or "Company") recommends reading the following terms and conditions, carefully: | ||
| Website! Company Name, Inc. (“Company Name” or “Company”) recommends reading the following terms and conditions, carefully: | ||
| Website! Company Name, Inc. ('Company Name' or 'Company') recommends reading the following terms and conditions, carefully: | ||
| Website! Company Name, Inc. (‘Company Name’ or ‘Company’) recommends reading the following terms and conditions, carefully: | ||
| Workin' hard | ||
| Workin' hard | ||
| + | ||
| +'He said, "Thinkin'."' | ||
| +‘He said, “Thinkin’.”’ | ||
| + | ||
| +"She said, 'Llamas'll languish, they'll-- | ||
| +“She said, ‘Llamas'll languish, they'll-- | ||
| '70s are my favorite numbers,' she said. | ||
| Delta | 210 lines added, 49 lines removed, 161-line increase |
|---|