| Author | Dave Jarvis <email> |
|---|---|
| Date | 2021-06-06 22:24:31 GMT-0700 |
| Commit | 72021e74fd62d4add95acfce9f30aad629fef18b |
| Parent | a74d89e |
| /** | ||
| * Answers whether the given word, compared without regard to letter case, | ||
| * is a member of {@code BEGAN_UNAMBIGUOUS}. | ||
| * | ||
| * @param word The word to look up; must not be {@code null}. | ||
| * @return {@code true} when the lowercase form of the word is in the set. | ||
| */ | ||
| public static boolean contractionBeganUnambiguously( final String word ) { | ||
| assert word != null; | ||
| - | ||
| // Lowercase with a fixed locale so the lookup does not vary with the | ||
| // JVM default locale (e.g., Turkish maps "I" to a dotless "i"). | ||
| return BEGAN_UNAMBIGUOUS.contains( word.toLowerCase( java.util.Locale.ROOT ) ); | ||
| } | ||
| /** | ||
| * Answers whether the given word, compared without regard to letter case, | ||
| * is a member of {@code BEGAN_AMBIGUOUS}. | ||
| * | ||
| * @param word The word to look up; must not be {@code null}. | ||
| * @return {@code true} when the lowercase form of the word is in the set. | ||
| */ | ||
| public static boolean contractionBeganAmbiguously( final String word ) { | ||
| assert word != null; | ||
| - | ||
| // Lowercase with a fixed locale so the lookup does not vary with the | ||
| // JVM default locale (e.g., Turkish maps "I" to a dotless "i"). | ||
| return BEGAN_AMBIGUOUS.contains( word.toLowerCase( java.util.Locale.ROOT ) ); | ||
| } | ||
| /** | ||
| * Answers whether the given word, compared without regard to letter case, | ||
| * is a member of {@code ENDED_AMBIGUOUS} or ends with one of the suffixes | ||
| * "s", "n", "z", "x", or "ch". | ||
| * | ||
| * @param word The word to check; must not be {@code null}. | ||
| * @return {@code true} when the lowercase form of the word is in the set | ||
| * or ends with one of the listed suffixes. | ||
| */ | ||
| public static boolean contractionEndedAmbiguously( final String word ) { | ||
| assert word != null; | ||
| - | ||
| // Lowercase with a fixed locale so neither the lookup nor the suffix | ||
| // checks vary with the JVM default locale. | ||
| final var check = word.toLowerCase( java.util.Locale.ROOT ); | ||
| return ENDED_AMBIGUOUS.contains( check ) || check.endsWith( "s" ) || | ||
| check.endsWith( "n" ) || check.endsWith( "z" ) || | ||
| check.endsWith( "x" ) || check.endsWith( "ch" ); | ||
| } | ||
| /** | ||
| * Answers whether the given word, compared without regard to letter case, | ||
| * is a member of {@code ENDED_UNAMBIGUOUS}. | ||
| * | ||
| * @param word The word to look up; must not be {@code null}. | ||
| * @return {@code true} when the lowercase form of the word is in the set. | ||
| */ | ||
| public static boolean contractionEndedUnambiguously( final String word ) { | ||
| assert word != null; | ||
| - | ||
| // Lowercase with a fixed locale so the lookup does not vary with the | ||
| // JVM default locale (e.g., Turkish maps "I" to a dotless "i"). | ||
| return ENDED_UNAMBIGUOUS.contains( word.toLowerCase( java.util.Locale.ROOT ) ); | ||
| } | ||
| NUMBER, | ||
| PUNCT, | ||
| + OPENING_GROUP, | ||
| + CLOSING_GROUP, | ||
| HYPHEN, | ||
| PERIOD, |
| } | ||
| } | ||
| + else if( curr == '(' || curr == '{' || curr == '[' ) { | ||
| + lexeme = createLexeme( OPENING_GROUP, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == ')' || curr == '}' || curr == ']' ) { | ||
| + lexeme = createLexeme( CLOSING_GROUP, began, i.getIndex() ); | ||
| + } | ||
| else if( curr != DONE ) { | ||
| lexeme = createLexeme( PUNCT, began, i.getIndex() ); |
| */ | ||
| private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE = | ||
| - new LexemeType[]{SPACE, HYPHEN, QUOTE_DOUBLE}; | ||
| + new LexemeType[]{SPACE, HYPHEN, QUOTE_DOUBLE, OPENING_GROUP}; | ||
| /** | ||
| */ | ||
| private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE = | ||
| - new LexemeType[]{SPACE, HYPHEN, QUOTE_DOUBLE, EOL}; | ||
| + new LexemeType[]{SPACE, HYPHEN, QUOTE_DOUBLE, CLOSING_GROUP, EOL}; | ||
| /** | ||
| * Double quotes preceded by these {@link LexemeType}s may be opening quotes. | ||
| */ | ||
| private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE = | ||
| - new LexemeType[]{SPACE, HYPHEN, QUOTE_SINGLE}; | ||
| + new LexemeType[]{SPACE, HYPHEN, QUOTE_SINGLE, OPENING_GROUP}; | ||
| /** | ||
| */ | ||
| private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE = | ||
| - new LexemeType[]{SPACE, HYPHEN, QUOTE_SINGLE, EOL}; | ||
| + new LexemeType[]{SPACE, HYPHEN, QUOTE_SINGLE, CLOSING_GROUP, EOL}; | ||
| /** | ||
| private final CircularFifoQueue<Lexeme> mLexemes = | ||
| new CircularFifoQueue<>( 3 ); | ||
| + | ||
| + /** | ||
| + * Incremented for each opening single quote emitted. Used to help resolve | ||
| + * ambiguities when single quote marks are balanced. | ||
| + */ | ||
| + private int mOpeningSingleQuote; | ||
| + | ||
| + /** | ||
| + * Incremented for each closing single quote emitted. Used to help resolve | ||
| + * ambiguities when single quote marks are balanced. | ||
| + */ | ||
| + private int mClosingSingleQuote; | ||
| public Parser( final String text ) { | ||
| else if( (lex1.isSot() || lex1.anyType( LEADING_QUOTE_OPENING_SINGLE )) | ||
| && lex3.anyType( LAGGING_QUOTE_OPENING_SINGLE ) ) { | ||
| - resolvedLeadingQuotes++; | ||
| consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) ); | ||
| i.remove(); | ||
| + resolvedLeadingQuotes++; | ||
| + mOpeningSingleQuote++; | ||
| } | ||
| else if( lex1.anyType( LEADING_QUOTE_CLOSING_SINGLE ) && | ||
| (lex3.isEot() || lex3.anyType( LAGGING_QUOTE_CLOSING_SINGLE )) ) { | ||
| - resolvedLaggingQuotes++; | ||
| consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); | ||
| i.remove(); | ||
| + resolvedLaggingQuotes++; | ||
| + mClosingSingleQuote++; | ||
| } | ||
| else if( lex3.isType( NUMBER ) ) { | ||
| System.out.println( "ambig leading: " + ambiguousLeadingQuotes.size() ); | ||
| System.out.println( "ambig lagging: " + ambiguousLaggingQuotes.size() ); | ||
| - System.out.println( "unambig leading: " + resolvedLeadingQuotes ); | ||
| - System.out.println( "unambig lagging: " + resolvedLaggingQuotes ); | ||
| + System.out.println( "resolv leading: " + resolvedLeadingQuotes ); | ||
| + System.out.println( "resolv lagging: " + resolvedLaggingQuotes ); | ||
| if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) { | ||
| if( ambiguousLeadingQuotes.size() == 0 && ambiguousLaggingQuotes.size() == 1 ) { | ||
| - consumer.accept( | ||
| - new Token( QUOTE_CLOSING_SINGLE, ambiguousLaggingQuotes.get( 0 ) ) | ||
| - ); | ||
| + final var balanced = mClosingSingleQuote - mOpeningSingleQuote == 0; | ||
| + final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE; | ||
| + consumer.accept( new Token( quote, ambiguousLaggingQuotes.get( 0 ) ) ); | ||
| } | ||
| } | ||
| // E.g., \" | ||
| consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) ); | ||
| + | ||
| + if( lex2.isType( QUOTE_SINGLE ) && | ||
| + (lex3.isEot() || lex3.anyType( SPACE, HYPHEN, EOL )) ) { | ||
| + consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); | ||
| + mClosingSingleQuote++; | ||
| + } | ||
| } | ||
| else if( lex2.isType( QUOTE_DOUBLE ) && | ||
| (lex1.isSot() || lex1.anyType( LEADING_QUOTE_OPENING_DOUBLE )) && | ||
| lex3.anyType( LAGGING_QUOTE_OPENING_DOUBLE ) ) { | ||
| - // Examples: "'Twas, "", "... | ||
| + // Examples: "", "..., "word, ---"word | ||
| consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) ); | ||
| } | ||
| else if( lex2.isType( QUOTE_DOUBLE ) && | ||
| lex1.anyType( LEADING_QUOTE_CLOSING_DOUBLE ) && | ||
| (lex3.isEot() || lex3.anyType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) { | ||
| + // E.g., ..."', word"', ?"' | ||
| consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) ); | ||
| + } | ||
| + else if( lex1.isType( QUOTE_SINGLE ) && | ||
| + lex2.anyType( PUNCT, PERIOD ) && lex3.isType( QUOTE_DOUBLE ) ) { | ||
| + // E.g., '," (contraction ruled out from previous conditionals) | ||
| + consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex1 ) ); | ||
| + mClosingSingleQuote++; | ||
| } | ||
| else if( lex2.anyType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) { | ||
| @Test | ||
| public void test_parse_SingleLine_Parsed() { | ||
| - out.println( SmartQuotes.replace( "Took place in '04, yes'm!")); | ||
| - | ||
| - out.println( "-------" ); | ||
| - out.println( SmartQuotes.replace( "'Bout that time I says, 'Boys! I been " + | ||
| - "thinkin' 'bout th' Universe.'")); | ||
| - | ||
| - out.println( "-------" ); | ||
| - | ||
| - out.println( SmartQuotes.replace( "\"John asked, 'What are you, beyond " + | ||
| - "\"somethin' shiny?\"'\" said Fred.\n" ) ); | ||
| + out.println( SmartQuotes.replace( | ||
| + "'Bout that time I says, 'Boys! I been thinkin' 'bout th' Universe.'" ) | ||
| + ); | ||
| } | ||
| “I heard she said, ‘That's Sam's’,” said the Sams' cat. | ||
| -"'Janes said, ''E'll be spooky, Sam's son with the jack-o'-lantern!'" said the O'Mally twins'---y'know---ghosts in unison. | ||
| -“‘Janes said, ‘'E'll be spooky, Sam's son with the jack-o'-lantern!’” said the O'Mally twins'---y'know---ghosts in unison. | ||
| +"'Jane said, ''E'll be spooky, Sam's son with the jack-o'-lantern!'" said the O'Mally twins'---y'know---ghosts in unison. | ||
| +“‘Jane said, ‘'E'll be spooky, Sam's son with the jack-o'-lantern!’” said the O'Mally twins'---y'know---ghosts in unison. | ||
| 'He's at Sams' | ||
| -‘He' at Sams’ | ||
| +‘He's at Sams’ | ||
| ma'am | ||
| ma'am | ||
| 'Twas midnight | ||
| 'Twas midnight | ||
| "Hello," said the spider. "'Shelob' is my name." | ||
| “Hello,” said the spider. “‘Shelob’ is my name.” | ||
| - | ||
| -'A', 'B', and 'C' are letters. | ||
| -‘A’ ‘B’ and ‘C’ are letters. | ||
| - | ||
| -'Oak,' 'elm,'and 'beech' are names of trees. So is 'pine.' | ||
| -‘Oak,’ ‘elm,’ and ‘beech’ are names of trees. So is ‘pine.’ | ||
| 'He said, \"I want to go.\"' Were you alive in the 70's? | ||
| Workin' hard | ||
| Workin' hard | ||
| - | ||
| -'He said, "Thinkin'."' | ||
| -‘He said, “Thinkin’.”’ | ||
| "She said, 'Llamas'll languish, they'll-- | ||
| iPad 3's battery life is not great. | ||
| iPad 3's battery life is not great. | ||
| + | ||
| +'Oak,' 'elm,' and 'beech' are names of trees. So is 'pine.' | ||
| +‘Oak,’ ‘elm,’ and ‘beech’ are names of trees. So is ‘pine.’ | ||
| Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season. | ||
| Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season. | ||
| + | ||
| +'He said, "Thinkin'."' | ||
| +‘He said, “Thinkin'.”’ | ||
| + | ||
| +'A', 'B', and 'C' are letters. | ||
| +‘A’ ‘B’ and ‘C’ are letters. | ||
| Delta | 62 lines added, 38 lines removed, 24-line increase |
|---|---|