| Author | Dave Jarvis <email> |
|---|---|
| Date | 2021-06-08 21:13:54 GMT-0700 |
| Commit | ca8bd102a691e2716b90e1b6661d103ff2dbd32e |
| Parent | 72021e7 |
| Delta | 58 lines added, 32 lines removed, 26-line increase |
|---|
| private static final Set<String> ENDED_AMBIGUOUS = Set.of( | ||
| - // he|a | ||
| - "a", | ||
| // and|an | ||
| "an", | ||
| return ENDED_AMBIGUOUS.contains( check ) || check.endsWith( "s" ) || | ||
| - check.endsWith( "n" ) || check.endsWith( "z" ) || | ||
| - check.endsWith( "x" ) || check.endsWith( "ch" ); | ||
| + check.endsWith( "n" ) || check.endsWith( "z" ) || check.endsWith( "x" ); | ||
| } | ||
| ESC_DOUBLE, | ||
| EOL, | ||
| + EOP, | ||
| SPACE, | ||
| WORD, |
| */ | ||
| public class Lexer { | ||
| - | ||
| + /** | ||
| + * Iterates over the entire string of text to help produce lexemes. | ||
| + */ | ||
| private final CharacterIterator mIterator; | ||
| ); | ||
| } | ||
| - else if( curr == '\r' ) { | ||
| - lexeme = createLexeme( EOL, began, i.getIndex() ); | ||
| + else if( curr == '\r' || curr == '\n' ) { | ||
| + final var cr = new int[]{curr == '\r' ? 1 : 0}; | ||
| + final var lf = new int[]{curr == '\n' ? 1 : 0}; | ||
| - // Swallow the LF in CRLF; peeking won't work here. | ||
| - if( i.next() != '\n' ) { | ||
| - // Push back the non-LF char. | ||
| - i.previous(); | ||
| - } | ||
| - } | ||
| - else if( curr == '\n' ) { | ||
| - lexeme = createLexeme( EOL, began, i.getIndex() ); | ||
| + // Swallow all consecutive CR (Mac), CRLF (Windows), and/or LF (Unix). | ||
| + slurp( | ||
| + i, ( next, ci ) -> { | ||
| + cr[ 0 ] += next == '\r' ? 1 : 0; | ||
| + lf[ 0 ] += next == '\n' ? 1 : 0; | ||
| + return next == '\r' || next == '\n'; | ||
| + } | ||
| + ); | ||
| + | ||
| + final var eol = cr[ 0 ] + lf[ 0 ] == 1 || cr[ 0 ] == 1 && lf[ 0 ] == 1; | ||
| + lexeme = createLexeme( eol ? EOL : EOP, began, i.getIndex() ); | ||
| } | ||
| else if( isWhitespace( curr ) ) { | ||
| ci.previous(); | ||
| - return count; | ||
| + return --count; | ||
| } | ||
| } | ||
| */ | ||
| private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE = | ||
| - new LexemeType[]{WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE}; | ||
| + new LexemeType[]{WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE, EOP}; | ||
| /** | ||
| */ | ||
| private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE = | ||
| - new LexemeType[]{SPACE, HYPHEN, QUOTE_SINGLE, OPENING_GROUP}; | ||
| + new LexemeType[]{SPACE, HYPHEN, QUOTE_SINGLE, OPENING_GROUP, EOP}; | ||
| /** | ||
| } | ||
| + System.out.println( "--------------------" ); | ||
| System.out.println( "ambig leading: " + ambiguousLeadingQuotes.size() ); | ||
| System.out.println( "ambig lagging: " + ambiguousLaggingQuotes.size() ); | ||
| System.out.println( "resolv leading: " + resolvedLeadingQuotes ); | ||
| System.out.println( "resolv lagging: " + resolvedLaggingQuotes ); | ||
| + | ||
| + final var ambiguousLeadingCount = ambiguousLeadingQuotes.size(); | ||
| + final var ambiguousLaggingCount = ambiguousLaggingQuotes.size(); | ||
| if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) { | ||
| - if( ambiguousLeadingQuotes.size() == 0 && ambiguousLaggingQuotes.size() == 1 ) { | ||
| + if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) { | ||
| final var balanced = mClosingSingleQuote - mOpeningSingleQuote == 0; | ||
| final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE; | ||
| consumer.accept( new Token( quote, ambiguousLaggingQuotes.get( 0 ) ) ); | ||
| } | ||
| } | ||
| - else if( ambiguousLeadingQuotes.size() > 0 && ambiguousLaggingQuotes.size() == 0 ) { | ||
| + else if( ambiguousLeadingCount > 0 && ambiguousLaggingCount == 0 ) { | ||
| // If there are no ambiguous lagging quotes then all ambiguous leading | ||
| // quotes must be contractions. | ||
| ambiguousLeadingQuotes.forEach( | ||
| lex -> consumer.accept( new Token( QUOTE_APOSTROPHE, lex ) ) | ||
| ); | ||
| } | ||
| - else if( ambiguousLeadingQuotes.size() == 0 && ambiguousLaggingQuotes.size() > 0 ) { | ||
| + else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) { | ||
| // If there are no ambiguous leading quotes then all ambiguous lagging | ||
| // quotes must be contractions. | ||
| ambiguousLaggingQuotes.forEach( | ||
| lex -> consumer.accept( new Token( QUOTE_APOSTROPHE, lex ) ) | ||
| ); | ||
| } | ||
| - else if( resolvedLeadingQuotes == 1 && ambiguousLaggingQuotes.size() == 1 ) { | ||
| + else if( ambiguousLeadingCount == 0 ) { | ||
| + if( resolvedLaggingQuotes < resolvedLeadingQuotes ) { | ||
| + for( final var mark : mQuotationMarks ) { | ||
| + consumer.accept( new Token( QUOTE_CLOSING_SINGLE, mark[ 1 ] ) ); | ||
| + } | ||
| + } | ||
| + } | ||
| + else if( resolvedLeadingQuotes == 1 && ambiguousLaggingCount == 1 ) { | ||
| consumer.accept( | ||
| new Token( QUOTE_CLOSING_SINGLE, ambiguousLaggingQuotes.get( 0 ) ) | ||
| if( lex2.isType( QUOTE_SINGLE ) && | ||
| - (lex3.isEot() || lex3.anyType( SPACE, HYPHEN, EOL )) ) { | ||
| + (lex3.isEot() || lex3.anyType( SPACE, HYPHEN, EOL, EOP )) ) { | ||
| consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); | ||
| mClosingSingleQuote++; | ||
| public void test_parse_SingleLine_Parsed() { | ||
| out.println( SmartQuotes.replace( | ||
| - "'Bout that time I says, 'Boys! I been thinkin' 'bout th' Universe.'" ) | ||
| - ); | ||
| + "'Oak,' 'elm,' and 'beech' are names of trees. So is 'pine.'" | ||
| + ) ); | ||
| + | ||
| + System.out.println(" ------------------------ "); | ||
| + | ||
| + out.println( SmartQuotes.replace( | ||
| + "'Bout that time I says, 'Boys! I been thinkin' 'bout th' Universe.'" | ||
| + ) ); | ||
| } | ||
| } | ||
| - System.out.println( "EXPECT:" + expected ); | ||
| + testLine = unescapeEol( testLine ); | ||
| + expected = unescapeEol( expected ); | ||
| + System.out.println( testLine ); | ||
| final var actual = parser.apply( testLine ); | ||
| - System.out.println( "ACTUAL:" + actual ); | ||
| assertEquals( expected, actual ); | ||
| testLine = ""; | ||
| expected = ""; | ||
| } | ||
| } | ||
| + } | ||
| + | ||
| + private static String unescapeEol( final String s) { | ||
| + return String.join( "\n", s.split( "\\\\n" ) ); | ||
| } | ||
| Model "T2000" | ||
| -iPad 3's battery life is not great. | ||
| -iPad 3's battery life is not great. | ||
| +The 2's complement.\n\n"Yar, binary!" | ||
| +The 2's complement.\n\n“Yar, binary!” | ||
| 'Oak,' 'elm,' and 'beech' are names of trees. So is 'pine.' | ||
| ‘Oak,’ ‘elm,’ and ‘beech’ are names of trees. So is ‘pine.’ | ||
| + | ||
| +'A', 'B', and 'C' are letters. | ||
| +‘A’, ‘B’, and ‘C’ are letters. | ||
| Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season. | ||
| Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season. | ||
| 'He said, "Thinkin'."' | ||
| ‘He said, “Thinkin'.”’ | ||
| - | ||
| -'A', 'B', and 'C' are letters. | ||
| -‘A’ ‘B’ and ‘C’ are letters. | ||