| Author | Dave Jarvis <email> |
|---|---|
| Date | 2021-06-02 22:02:37 GMT-0700 |
| Commit | 45f5d443fbe94dab1f5d613fc94d26970947860d |
| Parent | 1238740 |
| /** | ||
| * Denotes there are no more lexemes: the end of text (EOT) has been reached. | ||
| + * The beginning index differentiates between EOT and SOT. | ||
| */ | ||
| - public static final Lexeme EOT = new Lexeme(); | ||
| + public static final Lexeme EOT = new Lexeme( FLAG, -1, -2 ); | ||
| /** | ||
| - * Denotes parsing at the start of text. This is useful to avoid branching | ||
| - * conditions while iterating. | ||
| + * Denotes parsing at the start of text (SOT). This is useful to avoid | ||
| + * branching conditions while iterating. The beginning index differentiates | ||
| + * between EOT and SOT. | ||
| */ | ||
| - public static final Lexeme SOT = new Lexeme(); | ||
| + public static final Lexeme SOT = new Lexeme( FLAG, 0, -2 ); | ||
| private final LexemeType mType; | ||
| private final int mBegan; | ||
| private final int mEnded; | ||
| - | ||
| - /** | ||
| - * Create a flag that indicates a stream marker (start or end). | ||
| - */ | ||
| - private Lexeme() { | ||
| - this( FLAG, -1, -1 ); | ||
| - } | ||
| /** | ||
| int ended() { | ||
| return mEnded; | ||
| + } | ||
| + | ||
| + boolean isEot() { | ||
| + return mBegan == -1; | ||
| + } | ||
| + | ||
| + boolean isSot() { | ||
| + return mBegan == 0; | ||
| } | ||
| ESC_SINGLE, | ||
| ESC_DOUBLE, | ||
| + EOL, | ||
| + SPACE, | ||
| WORD, | ||
| NUMBER, | ||
| - NEWLINE, | ||
| - SPACE, | ||
| PUNCT, | ||
| + HYPHEN, | ||
| PERIOD, | ||
| + ELLIPSIS, | ||
| FLAG | ||
| } |
| import java.text.CharacterIterator; | ||
| import java.text.StringCharacterIterator; | ||
| +import java.util.function.BiFunction; | ||
| import static com.keenwrite.quotes.Lexeme.createLexeme; | ||
| import static com.keenwrite.quotes.LexemeType.*; | ||
| import static java.lang.Character.*; | ||
| import static java.text.CharacterIterator.DONE; | ||
| /** | ||
| * Turns text into words, numbers, punctuation, spaces, and more. | ||
| */ | ||
| -public class Lexer { | ||
| +public class Lexer { | ||
| private final CharacterIterator mIterator; | ||
| else if( curr == '"' ) { | ||
| lexeme = createLexeme( QUOTE_DOUBLE, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '-' ) { | ||
| + lexeme = createLexeme( HYPHEN, began, i.getIndex() ); | ||
| } | ||
| else if( isDigit( curr ) || isNumeric( curr ) && isDigit( peek( i ) ) ) { | ||
| - // Tokenize all consecutive number characters at prevent the main | ||
| - // loop from switching back to word tokens. | ||
| - char next; | ||
| - | ||
| - do { | ||
| - next = i.next(); | ||
| - } | ||
| - while( isDigit( next ) || isNumeric( next ) && isDigit( peek( i ) ) ); | ||
| - | ||
| - // The loop above will overshoot the number by one character. | ||
| - i.previous(); | ||
| + // Parse all consecutive number characters to prevent the main loop | ||
| + // from switching back to word tokens. | ||
| + slurp( i, ( next, ci ) -> | ||
| + isDigit( next ) || isNumeric( next ) && isDigit( peek( ci ) ) | ||
| + ); | ||
| lexeme = createLexeme( isWord ? WORD : NUMBER, began, i.getIndex() ); | ||
| } | ||
| else if( curr == '.' ) { | ||
| - lexeme = createLexeme( PERIOD, began, i.getIndex() ); | ||
| + // Parse all consecutive periods into an ellipsis lexeme. This will | ||
| + // not capture space-separated ellipses (such as ". . ."). | ||
| + lexeme = createLexeme( | ||
| + slurp( i, ( next, ci ) -> next == '.' ) == 1 ? PERIOD : ELLIPSIS, | ||
| + began, i.getIndex() | ||
| + ); | ||
| } | ||
| else if( curr == '\r' ) { | ||
| - lexeme = createLexeme( NEWLINE, began, i.getIndex() ); | ||
| + lexeme = createLexeme( EOL, began, i.getIndex() ); | ||
| // Swallow the LF in CRLF; peeking won't work here. | ||
| if( i.next() != '\n' ) { | ||
| // Push back the non-LF char. | ||
| i.previous(); | ||
| } | ||
| } | ||
| else if( curr == '\n' ) { | ||
| - lexeme = createLexeme( NEWLINE, began, i.getIndex() ); | ||
| + lexeme = createLexeme( EOL, began, i.getIndex() ); | ||
| } | ||
| else if( isWhitespace( curr ) ) { | ||
| ci.previous(); | ||
| return ch; | ||
| + } | ||
| + | ||
| + /** | ||
| + * Parse all characters that match a given function. | ||
| + * | ||
| + * @param ci The iterator containing characters to parse. | ||
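| + * @param f The function that returns true while slurping should continue; | ||
| + * slurping stops at the first character for which it returns false. | ||
| + * @return The number of times the iterator was advanced (at least one). | ||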
| + */ | ||
| + private static int slurp( | ||
| + final CharacterIterator ci, | ||
| + final BiFunction<Character, CharacterIterator, Boolean> f ) { | ||
| + char next; | ||
| + int count = 0; | ||
| + | ||
| + do { | ||
| + next = ci.next(); | ||
| + count++; | ||
| + } | ||
| + while( f.apply( next, ci ) ); | ||
| + | ||
| + // The loop above will overshoot the last match by one character. | ||
| + ci.previous(); | ||
| + | ||
| + return count; | ||
| } | ||
| } | ||
| import static com.keenwrite.quotes.Contractions.beginsUnambiguously; | ||
| import static com.keenwrite.quotes.Lexeme.EOT; | ||
| +import static com.keenwrite.quotes.Lexeme.SOT; | ||
| import static com.keenwrite.quotes.LexemeType.*; | ||
| import static com.keenwrite.quotes.TokenType.*; | ||
| -import static java.util.Comparator.comparingInt; | ||
| /** | ||
| while( (lexeme = mLexer.next()) != EOT ) { | ||
| - parse( lexeme, consumer ); | ||
| + tokenize( lexeme, consumer ); | ||
| } | ||
| - // Parse the remaining lexemes because the EOT lexeme will terminate the | ||
| - // loop above without having examined the last lexemes. | ||
| - for( int i = 0; i < mLexemes.size(); i++ ) { | ||
| - parse( mLexemes.get( i ), consumer ); | ||
| - } | ||
| + // By loop's end, every lexeme has been tokenized except the final two | ||
| + // (lexemes are examined in triplets). Feed EOT through twice so that the | ||
| + // last two unprocessed lexemes may be emitted as tokens. | ||
| + tokenize( EOT, consumer ); | ||
| + tokenize( EOT, consumer ); | ||
| - mQuotationMarks.sort( comparingInt( lexemes -> lexemes[ 0 ].began() ) ); | ||
| + // Lexemes not emitted as tokens may be ambiguous. | ||
| + | ||
| + //mQuotationMarks.sort( comparingInt( lexemes -> lexemes[ 0 ].began() ) ); | ||
| + | ||
| + // | ||
| + | ||
| for( final var unparsed : mQuotationMarks ) { | ||
| } | ||
| - private void parse( final Lexeme lexeme, final Consumer<Token> consumer ) { | ||
| + private void tokenize( final Lexeme lexeme, final Consumer<Token> consumer ) { | ||
| mLexemes.add( lexeme ); | ||
| + tokenize( consumer ); | ||
| + } | ||
| + private void tokenize( final Consumer<Token> consumer ) { | ||
| final var lex1 = mLexemes.get( 0 ); | ||
| final var lex2 = mLexemes.get( 1 ); | ||
| // E.g., \" | ||
| consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) ); | ||
| + } | ||
| + else if( lex2.isType( QUOTE_DOUBLE ) && | ||
| + (lex1.isSot() || lex1.anyType( SPACE, HYPHEN, QUOTE_SINGLE )) && | ||
| + lex3.anyType( WORD, NUMBER, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE ) ) { | ||
| + // Examples: "'Twas, "", "... | ||
| + consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) ); | ||
| + } | ||
| + else if( lex2.isType( QUOTE_DOUBLE ) && | ||
| + lex1.anyType( WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_SINGLE ) && | ||
| + (lex3.anyType( SPACE, HYPHEN, QUOTE_SINGLE, EOL ) || lex3.isEot()) ) { | ||
| + consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) ); | ||
| } | ||
| else if( lex2.anyType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) { | ||
| mQuotationMarks.add( new Lexeme[]{lex1, lex2, lex3} ); | ||
| } | ||
| } | ||
| private void flush( final CircularFifoQueue<Lexeme> lexemes ) { | ||
| - lexemes.add( Lexeme.SOT ); | ||
| - lexemes.add( Lexeme.SOT ); | ||
| - lexemes.add( Lexeme.SOT ); | ||
| + lexemes.add( SOT ); | ||
| + lexemes.add( SOT ); | ||
| + lexemes.add( SOT ); | ||
| } | ||
| } | ||
| /** | ||
| - * Responsible for converting straight quotes into smart quotes. | ||
| + * Responsible for replacing {@link Token} instances with equivalent smart | ||
| + * quotes (or straight quotes). | ||
| */ | ||
| public class SmartQuotes { | ||
| private static final Map<TokenType, String> REPLACEMENTS = Map.of( | ||
| + QUOTE_OPENING_SINGLE, "‘", | ||
| + QUOTE_CLOSING_SINGLE, "’", | ||
| + QUOTE_OPENING_DOUBLE, "“", | ||
| + QUOTE_CLOSING_DOUBLE, "”", | ||
| QUOTE_STRAIGHT_SINGLE, "'", | ||
| QUOTE_STRAIGHT_DOUBLE, "\"", | ||
| final var tokens = new ArrayList<Token>(); | ||
| - // Store all parsed quotation marks; the parser may emit them in any order. | ||
| + // Store all parsed quotation marks. | ||
| parser.parse( tokens::add ); | ||
| + | ||
| + // The parser may emit tokens in any order. | ||
| sort( tokens ); | ||
| void test_Lexing_Numbers_EmitNumbers() { | ||
| testType( ".123", NUMBER ); | ||
| - testType( "-123.", PUNCT, NUMBER, PERIOD ); | ||
| + testType( "-123.", HYPHEN, NUMBER, PERIOD ); | ||
| testType( " 123.123.123", SPACE, NUMBER ); | ||
| testType( "123 123\"", NUMBER, SPACE, NUMBER, QUOTE_DOUBLE ); | ||
| - testType( "-123,123.123", PUNCT, NUMBER ); | ||
| + testType( "-123,123.123", HYPHEN, NUMBER ); | ||
| + testType( "...1,023...", ELLIPSIS, NUMBER, ELLIPSIS ); | ||
| } | ||
| @Test | ||
| void test_Lexing_Words_EmitWords() { | ||
| testType( "abc", WORD ); | ||
| testType( "abc abc", WORD, SPACE, WORD ); | ||
| + testType( "abc...", WORD, ELLIPSIS ); | ||
| testType( "abc123", WORD, NUMBER ); | ||
| - testType( "-123abc", PUNCT, NUMBER, WORD ); | ||
| - testType( "abc-o'-abc", WORD, PUNCT, WORD, QUOTE_SINGLE, PUNCT, WORD ); | ||
| + testType( "-123abc", HYPHEN, NUMBER, WORD ); | ||
| + testType( "abc-o'-abc", WORD, HYPHEN, WORD, QUOTE_SINGLE, HYPHEN, WORD ); | ||
| } | ||
| @Test | ||
| void test_Lexing_PunctuationMarks_EmitPunctuationMarks() { | ||
| testType( "!", PUNCT ); | ||
| testType( ";", PUNCT ); | ||
| testType( ".", PERIOD ); | ||
| + testType( "-", HYPHEN ); | ||
| + testType( "...", ELLIPSIS ); | ||
| } | ||
| @Test | ||
| void test_Lexing_Newlines_EmitNewlines() { | ||
| - testType( "\r", NEWLINE ); | ||
| - testType( "\n", NEWLINE ); | ||
| - testType( "\r\n", NEWLINE ); | ||
| - testType( "\r\n\r\n", NEWLINE, NEWLINE ); | ||
| - testType( "\r\n\n\r", NEWLINE, NEWLINE, NEWLINE ); | ||
| - testType( "abc \r\nabc\n", WORD, SPACE, NEWLINE, WORD, NEWLINE ); | ||
| + testType( "\r", EOL ); | ||
| + testType( "\n", EOL ); | ||
| + testType( "\r\n", EOL ); | ||
| + testType( "\r\n\r\n", EOL, EOL ); | ||
| + testType( "\r\n\n\r", EOL, EOL, EOL ); | ||
| + testType( "abc \r\nabc\n", WORD, SPACE, EOL, WORD, EOL ); | ||
| } | ||
| import java.util.function.Function; | ||
| +import static java.lang.System.out; | ||
| import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| import static org.junit.jupiter.api.Assertions.assertNotNull; | ||
| public void test_parse_SingleLine_Parsed() { | ||
| final var fixer = new SmartQuotes(); | ||
| - final var output = fixer.replace( "Didn' get th' message." ); | ||
| - System.out.println( output ); | ||
| + out.println( fixer.replace( "\"Didn' get th' message.\"" ) ); | ||
| + out.println( fixer.replace( "\"John asked, 'What are you, beyond " + | ||
| + "\"somethin' shiny?\"'\" said Fred.\n" ) ); | ||
| } | ||
| # Primes (single, double) | ||
| # ######################################################################## | ||
| - | ||
| -# No space after the feet sign. | ||
| -It's 4'11" away. | ||
| -It's 4′11″ away. | ||
| - | ||
| Alice's friend is 6'3" tall. | ||
| Alice's friend is 6′3″ tall. | ||
| Angular measure 3° 5' 30" means 3 degs, 5 arcmins, & 30 arcsecs. | ||
| Angular measure 3° 5′ 30″ means 3 degs, 5 arcmins, & 30 arcsecs. | ||
| + | ||
| +# ######################################################################## | ||
| +# Double quotes | ||
| +# ######################################################################## | ||
| +"I am Sam" | ||
| +“I am Sam” | ||
| + | ||
| +"...even better!" | ||
| +“...even better!” | ||
| + | ||
| +"It was so," said he. | ||
| +“It was so,” said he. | ||
| + | ||
| +With "air quotes" in the middle. | ||
| +With “air quotes” in the middle. | ||
| + | ||
| +With--"air quotes"--and dashes. | ||
| +With--“air quotes”--and dashes. | ||
| + | ||
| +"Not "quite" what you expected?" | ||
| +“Not “quite” what you expected?” | ||
| + | ||
| +"Not all open quotes are closed... | ||
| +“Not all open quotes are closed... | ||
| + | ||
| +# ######################################################################## | ||
| +# Single quotes | ||
| +# ######################################################################## | ||
| +'I am Sam' | ||
| +‘I am Sam’ | ||
| + | ||
| +'It was so,' said he. | ||
| +‘It was so,’ said he. | ||
| + | ||
| +'...even better!' | ||
| +‘...even better!’ | ||
| + | ||
| +With 'quotes' in the middle. | ||
| +With ‘quotes’ in the middle. | ||
| + | ||
| +With--'imaginary'--dashes. | ||
| +With--‘imaginary’--dashes. | ||
| + | ||
| +'Not 'quite' what you expected?' | ||
| +‘Not ‘quite’ what you expected?’ | ||
| + | ||
| +''Cause I don't like it, 's why,' said Pat. | ||
| +‘'Cause I don't like it, 's why,’ said Pat. | ||
| + | ||
| +'It's a beautiful day!' | ||
| +‘It's a beautiful day!’ | ||
| + | ||
| +'He said, "Thinkin'."' | ||
| +‘He said, “Thinkin’.”’ | ||
| + | ||
| +"She said, 'Llamas'll languish, they'll-- | ||
| +“She said, ‘Llamas'll languish, they'll-- | ||
| + | ||
| +# ######################################################################## | ||
| +# Possessives | ||
| +# ######################################################################## | ||
| +Sam's Sams' and the Ross's roses' thorns were prickly. | ||
| +Sam's Sams' and the Ross's roses' thorns were prickly. | ||
| # ######################################################################## | ||
| # Outside contractions (leading and trailing, no middle) | ||
| # ######################################################################## | ||
| -Salt 'n' vinegar, fish-'n'-chips, sugar 'n' spice! | ||
| -Salt 'n' vinegar, fish-'n'-chips, sugar 'n' spice! | ||
| +Salt 'n' vinegar, fish-'n'-chips, sugar 'n spice! | ||
| +Salt 'n' vinegar, fish-'n'-chips, sugar 'n spice! | ||
| # ######################################################################## | ||
| '"Trouble's my name."' | ||
| ‘“Trouble's my name.”’ | ||
| - | ||
| -# ######################################################################## | ||
| -# Double quotes | ||
| -# ######################################################################## | ||
| -"I am Sam" | ||
| -“I am Sam” | ||
| - | ||
| -"...even better!" | ||
| -“...even better!” | ||
| - | ||
| -"It was so," said he. | ||
| -“It was so,” said he. | ||
| - | ||
| -"She said, 'Llamas'll languish, they'll-- | ||
| -“She said, ‘Llamas'll languish, they'll-- | ||
| - | ||
| -With "air quotes" in the middle. | ||
| -With “air quotes” in the middle. | ||
| - | ||
| -With--"air quotes"--and dashes. | ||
| -With--“air quotes”--and dashes. | ||
| - | ||
| -"Not "quite" what you expected?" | ||
| -“Not “quite” what you expected?” | ||
| # ######################################################################## | ||
| "'Twas, t'wasn't thy name, 'twas it?" said Jim "the Barber" Brown. | ||
| “'Twas, t'wasn't thy name, 'twas it?” said Jim “the Barber” Brown. | ||
| - | ||
| -# ######################################################################## | ||
| -# Single quotes | ||
| -# ######################################################################## | ||
| -'I am Sam' | ||
| -‘I am Sam’ | ||
| - | ||
| -'It was so,' said he. | ||
| -‘It was so,’ said he. | ||
| - | ||
| -'...even better!' | ||
| -‘...even better!’ | ||
| - | ||
| -With 'quotes' in the middle. | ||
| -With ‘quotes’ in the middle. | ||
| - | ||
| -With--'imaginary'--dashes. | ||
| -With--‘imaginary’--dashes. | ||
| - | ||
| -'Not 'quite' what you expected?' | ||
| -‘Not ‘quite’ what you expected?’ | ||
| - | ||
| -''Cause I don't like it, 's why,' said Pat. | ||
| -‘'Cause I don't like it, 's why,’ said Pat. | ||
| - | ||
| -'It's a beautiful day!' | ||
| -‘It's a beautiful day!’ | ||
| - | ||
| -'He said, "Thinkin'."' | ||
| -‘He said, “Thinkin’.”’ | ||
| - | ||
| -# ######################################################################## | ||
| -# Possessives | ||
| -# ######################################################################## | ||
| -Sam's Sams' and the Ross's roses' thorns were prickly. | ||
| -Sam's Sams' and the Ross's roses' thorns were prickly. | ||
| # ######################################################################## | ||
| Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season. | ||
| -'85 was a good year. (The entire '80s were.) | ||
| -'85 was a good year. (The entire '80s were.) | ||
| +'27 was a good year. (The entire '20s were.) | ||
| +'27 was a good year. (The entire '20s were.) | ||
| Delta | 186 lines added, 123 lines removed, 63-line increase |
|---|