| Author | Dave Jarvis <email> |
|---|---|
| Date | 2021-05-31 18:28:39 GMT-0700 |
| Commit | f40e32cbe72ac0af42e55bde67c9204ab88cf840 |
| Parent | 5aaef29 |
| +package com.keenwrite.quotes; | ||
| + | ||
| +import java.util.Set; | ||
| + | ||
| +public class Contractions { | ||
| + /** | ||
| + * Words having a straight apostrophe that cannot be mistaken for an | ||
| + * opening single quote. | ||
| + */ | ||
| + private static final Set<String> BEGAN_UNAMBIGUOUS = Set.of( | ||
| + "aporth", "boutcha", "boutchu", "cept", "dillo", "e'll", "fraid", | ||
| + "gainst", "n", "neath", "nother", "onna", "onna'", "pon", "s", "sblood", | ||
| + "scuse", "sfar", "sfoot", "t", "taint", "tain't", "til", "tis", "tisn't", | ||
| + "tshall", "twas", "twasn't", "tween", "twere", "tweren't", "twixt", | ||
| + "twon't", "twou'd", "twou'dn't", "twould", "twouldn't", "ve" | ||
| + ); | ||
| + | ||
| + /** | ||
| + * Words having a straight apostrophe that may be either part of a | ||
| + * contraction or a word that stands alone beside an opening single quote. | ||
| + */ | ||
| + private static final Set<String> BEGAN_AMBIGUOUS = Set.of( | ||
| + "bout", "choo", "ere", "e", "e's", "fro", "ho", "kay", "lo", | ||
| + "re", "sup", "twill", "um", "zat" | ||
| + ); | ||
| + | ||
| + /** | ||
| + * Answers whether the given word is a contraction that always starts | ||
| + * with an apostrophe. The comparison is case insensitive. This must | ||
| + * only be called when a straight quote is followed by a word. | ||
| + * | ||
| + * @param word The word to compare against the list of known unambiguous | ||
| + * contractions. | ||
| + * @return {@code true} when the given word is in the set of unambiguous | ||
| + * contractions. | ||
| + */ | ||
| + public static boolean beginsUnambiguously( final String word ) { | ||
| + assert word != null; | ||
| + return BEGAN_UNAMBIGUOUS.contains( word.toLowerCase() ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Answers whether the given word could be a contraction but is also a | ||
| + * valid word in non-contracted form. | ||
| + * | ||
| + * @param word The word to compare against the list of known ambiguous | ||
| + * contractions. | ||
| + * @return {@code true} when the given word is in the set of ambiguous | ||
| + * contractions. | ||
| + */ | ||
| + public static boolean beginsAmbiguously( final String word ) { | ||
| + assert word != null; | ||
| + return BEGAN_AMBIGUOUS.contains( word.toLowerCase() ); | ||
| + } | ||
| +} | ||
| +/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.keenwrite.quotes; | ||
| + | ||
| +import static com.keenwrite.quotes.LexemeType.INVALID; | ||
| + | ||
| +/** | ||
| + * Responsible for tracking the beginning and ending offsets of a token within | ||
| + * a text string. Tracking the beginning and ending indices should use less | ||
| + * memory than duplicating the entire text of Unicode characters (i.e., using | ||
| + * a similar approach to run-length encoding). | ||
| + */ | ||
| +public class Lexeme { | ||
| + /** | ||
| + * Denotes there are no more tokens: the end of text (EOT) has been reached. | ||
| + */ | ||
| + public static final Lexeme EOT = new Lexeme(); | ||
| + | ||
| + private final LexemeType mType; | ||
| + private final int mBegan; | ||
| + private final int mEnded; | ||
| + | ||
| + /** | ||
| + * Set to {@code true} if there are more tokens to parse. | ||
| + */ | ||
| + private final boolean mHasNext; | ||
| + | ||
| + /** | ||
| + * Create an end of text token. | ||
| + */ | ||
| + private Lexeme() { | ||
| + this( INVALID, -1, -1, false ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Create a token that indicates there are more tokens to parse. | ||
| + */ | ||
| + private Lexeme( final LexemeType type, final int began, final int ended ) { | ||
| + this( type, began, ended, true ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Create a token that represents a section of the text. | ||
| + */ | ||
| + private Lexeme( | ||
| + final LexemeType type, final int began, final int ended, | ||
| + final boolean hasNext ) { | ||
| + mType = type; | ||
| + mBegan = began; | ||
| + mEnded = ended + 1; | ||
| + mHasNext = hasNext; | ||
| + } | ||
| + | ||
| + /** | ||
| + * Extracts a sequence of characters from the given text at the offsets | ||
| + * captured by this token. | ||
| + * | ||
| + * @param text The text that was parsed using this class. | ||
| + * @return The character string captured by the token. | ||
| + */ | ||
| + public String toString( final String text ) { | ||
| + return text.substring( mBegan, mEnded ); | ||
| + } | ||
| + | ||
| + public boolean hasNext() { | ||
| + return mHasNext; | ||
| + } | ||
| + | ||
| + public boolean isType( final LexemeType type ) { | ||
| + return mType == type; | ||
| + } | ||
| + | ||
| + public boolean anyType( final LexemeType... types ) { | ||
| + for( final var type : types ) { | ||
| + if( mType == type ) { | ||
| + return true; | ||
| + } | ||
| + } | ||
| + | ||
| + return false; | ||
| + } | ||
| + | ||
| + LexemeType getType() { | ||
| + return mType; | ||
| + } | ||
| + | ||
| + @Override | ||
| + public String toString() { | ||
| + return getClass().getSimpleName() + "{" + | ||
| + "mType=" + mType + | ||
| + ", mBegan=" + mBegan + | ||
| + ", mEnded=" + mEnded + | ||
| + '}'; | ||
| + } | ||
| + | ||
| + static Lexeme createLexeme( | ||
| + final LexemeType token, final int began, final int ended ) { | ||
| + return new Lexeme( token, began, ended ); | ||
| + } | ||
| +} | ||
| +/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.keenwrite.quotes; | ||
| + | ||
| +public enum LexemeType { | ||
| + QUOTE_SINGLE, | ||
| + QUOTE_DOUBLE, | ||
| + WORD, | ||
| + NUMBER, | ||
| + NEWLINE, | ||
| + SPACE, | ||
| + PUNCT, | ||
| + PERIOD, | ||
| + INVALID | ||
| +} | ||
| +/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.keenwrite.quotes; | ||
| + | ||
| +import java.text.CharacterIterator; | ||
| +import java.text.StringCharacterIterator; | ||
| +import java.util.function.Consumer; | ||
| + | ||
| +import static com.keenwrite.quotes.Lexeme.createLexeme; | ||
| +import static com.keenwrite.quotes.LexemeType.*; | ||
| +import static java.lang.Character.*; | ||
| +import static java.text.CharacterIterator.DONE; | ||
| + | ||
| +/** | ||
| + * Turns text into words, numbers, punctuation, spaces, and more. | ||
| + */ | ||
| +public class Lexer { | ||
| + /** | ||
| + * Default constructor, no state. | ||
| + */ | ||
| + public Lexer() { | ||
| + } | ||
| + | ||
| + /** | ||
| + * Emits a series of tokens that represent information about text that is | ||
| + * needed to convert straight quotes to curly quotes. | ||
| + * | ||
| + * @param text The text to split into tokens. | ||
| + * @param consumer Receives each token as a separate event. | ||
| + */ | ||
| + public void parse( final String text, final Consumer<Lexeme> consumer ) { | ||
| + final var iterator = new StringCharacterIterator( text ); | ||
| + Lexeme lex; | ||
| + | ||
| + while( (lex = parse( iterator )).hasNext() ) { | ||
| + consumer.accept( lex ); | ||
| + } | ||
| + } | ||
| + | ||
| + /** | ||
| + * Tokenizes a sequence of characters. The order of comparisons is optimized | ||
| + * towards probability of the occurrence of a character in regular English | ||
| + * prose: letters, space, quotation marks, numbers, periods, new lines, | ||
| + * then end of text. | ||
| + * | ||
| + * @param i The sequence of characters to tokenize. | ||
| + * @return The next token in the sequence. | ||
| + */ | ||
| + private Lexeme parse( final CharacterIterator i ) { | ||
| + int began = i.getIndex(); | ||
| + boolean isWord = false; | ||
| + Lexeme lexeme = null; | ||
| + | ||
| + do { | ||
| + final var curr = i.current(); | ||
| + | ||
| + if( isLetter( curr ) ) { | ||
| + isWord = true; | ||
| + | ||
| + if( !isLetterOrDigit( peek( i ) ) ) { | ||
| + lexeme = createLexeme( WORD, began, i.getIndex() ); | ||
| + } | ||
| + } | ||
| + else if( curr == ' ' ) { | ||
| + lexeme = createLexeme( SPACE, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '\'' ) { | ||
| + lexeme = createLexeme( QUOTE_SINGLE, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '"' ) { | ||
| + lexeme = createLexeme( QUOTE_DOUBLE, began, i.getIndex() ); | ||
| + } | ||
| + else if( isDigit( curr ) || isNumeric( curr ) && isDigit( peek( i ) ) ) { | ||
| + // Tokenize all consecutive number characters to prevent the main | ||
| + // loop from switching back to word tokens. | ||
| + char next; | ||
| + | ||
| + do { | ||
| + next = i.next(); | ||
| + } | ||
| + while( isDigit( next ) || isNumeric( next ) && isDigit( peek( i ) ) ); | ||
| + | ||
| + // The loop above will overshoot the number by one character. | ||
| + i.previous(); | ||
| + | ||
| + lexeme = createLexeme( isWord ? WORD : NUMBER, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '.' ) { | ||
| + lexeme = createLexeme( PERIOD, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '\r' ) { | ||
| + lexeme = createLexeme( NEWLINE, began, i.getIndex() ); | ||
| + | ||
| + // Swallow the LF in CRLF; peeking won't work here. | ||
| + if( i.next() != '\n' ) { | ||
| + // Push back the non-LF char. | ||
| + i.previous(); | ||
| + } | ||
| + } | ||
| + else if( curr == '\n' ) { | ||
| + lexeme = createLexeme( NEWLINE, began, i.getIndex() ); | ||
| + } | ||
| + else if( isWhitespace( curr ) ) { | ||
| + lexeme = createLexeme( SPACE, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr != DONE ) { | ||
| + lexeme = createLexeme( PUNCT, began, i.getIndex() ); | ||
| + } | ||
| + else { | ||
| + lexeme = Lexeme.EOT; | ||
| + } | ||
| + | ||
| + i.next(); | ||
| + } | ||
| + while( lexeme == null ); | ||
| + | ||
| + return lexeme; | ||
| + } | ||
| + | ||
| + private static boolean isNumeric( final char curr ) { | ||
| + return curr == '.' || curr == ','; | ||
| + } | ||
| + | ||
| + private static char peek( final CharacterIterator ci ) { | ||
| + final var ch = ci.next(); | ||
| + ci.previous(); | ||
| + return ch; | ||
| + } | ||
| +} | ||
| import java.util.function.Consumer; | ||
| +import static com.keenwrite.quotes.Contractions.beginsUnambiguously; | ||
| +import static com.keenwrite.quotes.LexemeType.*; | ||
| import static com.keenwrite.quotes.TokenType.*; | ||
| * </ol> | ||
| */ | ||
| -public class Parser implements Consumer<Token> { | ||
| +public class Parser implements Consumer<Lexeme> { | ||
| private final String mText; | ||
| - private final CircularFifoQueue<Token> mTokens = new CircularFifoQueue<>( 3 ); | ||
| + private final CircularFifoQueue<Lexeme> mTokens = | ||
| + new CircularFifoQueue<>( 3 ); | ||
| - private final Deque<Token> mStack = new ArrayDeque<>(); | ||
| + private final Deque<Lexeme> mStack = new ArrayDeque<>(); | ||
| + private final Consumer<Token> mConsumer; | ||
| - public Parser( final String text ) { | ||
| + public Parser( final String text, final Consumer<Token> consumer ) { | ||
| mText = text; | ||
| - mTokens.add( Token.EOT ); | ||
| - mTokens.add( Token.EOT ); | ||
| - mTokens.add( Token.EOT ); | ||
| + | ||
| + // Allow consuming the very first token without checking the queue size. | ||
| + mTokens.add( Lexeme.EOT ); | ||
| + mTokens.add( Lexeme.EOT ); | ||
| + mTokens.add( Lexeme.EOT ); | ||
| + | ||
| + mConsumer = consumer; | ||
| } | ||
| public void parse() { | ||
| - final var tokenizer = new Tokenizer(); | ||
| - tokenizer.tokenize( mText, this ); | ||
| + final var tokenizer = new Lexer(); | ||
| + tokenizer.parse( mText, this ); | ||
| + System.out.println(" DUH DONE!" ); | ||
| } | ||
| @Override | ||
| - public void accept( final Token token ) { | ||
| + public void accept( final Lexeme token ) { | ||
| mTokens.add( token ); | ||
| final var token1 = mTokens.get( 0 ); | ||
| final var token2 = mTokens.get( 1 ); | ||
| final var token3 = mTokens.get( 2 ); | ||
| - if( token2.isType( QUOTE_SINGLE ) && | ||
| - token3.isType( WORD ) && | ||
| + if( token2.isType( QUOTE_SINGLE ) && token3.isType( WORD ) && | ||
| token1.anyType( WORD, PERIOD, NUMBER ) ) { | ||
| - System.out.println( "APOSTROPHE: " + token2 ); | ||
| + mConsumer.accept( new Token( QUOTE_APOSTROPHE, token2 ) ); | ||
| } | ||
| - else if( token1.isType( QUOTE_SINGLE ) && | ||
| - "n".equalsIgnoreCase( token2.toString( mText ) ) && | ||
| - token3.isType( QUOTE_SINGLE ) ) { | ||
| - System.out.printf( "APOSTROPHES: %s %s%n", token1, token3 ); | ||
| + else if( token1.isType( QUOTE_SINGLE ) && token3.isType( QUOTE_SINGLE ) && | ||
| + "n".equalsIgnoreCase( token2.toString( mText ) ) ) { | ||
| + mConsumer.accept( new Token( QUOTE_APOSTROPHE, token1 ) ); | ||
| + mConsumer.accept( new Token( QUOTE_APOSTROPHE, token3 ) ); | ||
| } | ||
| else if( token1.isType( NUMBER ) && token2.isType( QUOTE_SINGLE ) ) { | ||
| - System.out.println( "PRIME: " + token2 ); | ||
| + mConsumer.accept( new Token( QUOTE_PRIME_SINGLE, token2 ) ); | ||
| } | ||
| else if( token1.isType( NUMBER ) && token2.isType( QUOTE_DOUBLE ) ) { | ||
| - System.out.println( "DOUBLE PRIME: " + token2 ); | ||
| + mConsumer.accept( new Token( QUOTE_PRIME_DOUBLE, token2 ) ); | ||
| + } | ||
| + else if( token1.isType( QUOTE_SINGLE ) && token2.isType( WORD ) && | ||
| + beginsUnambiguously( token2.toString( mText ) ) ) { | ||
| + mConsumer.accept( new Token( QUOTE_APOSTROPHE, token1 ) ); | ||
| } | ||
| else if( token.anyType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) { | ||
| } | ||
| } | ||
| - | ||
| -/* | ||
| - private enum TokenType { | ||
| - QUOTE_OPENING_SINGLE, | ||
| - QUOTE_OPENING_DOUBLE, | ||
| - QUOTE_CLOSING_SINGLE, | ||
| - QUOTE_CLOSING_DOUBLE, | ||
| - QUOTE_APOSTROPHE, | ||
| - QUOTE_PRIME_SINGLE, | ||
| - QUOTE_PRIME_DOUBLE, | ||
| - TEXT | ||
| - } | ||
| -*/ | ||
| package com.keenwrite.quotes; | ||
| -import static com.keenwrite.quotes.TokenType.INVALID; | ||
| - | ||
| /** | ||
| - * Responsible for tracking the beginning and ending offsets of a token within | ||
| - * a text string. | ||
| + * Represents a high-level token read from the text. | ||
| */ | ||
| public class Token { | ||
| - /** | ||
| - * Denotes there are no more tokens: the end of text (EOT) has been reached. | ||
| - */ | ||
| - public static final Token EOT = new Token(); | ||
| - | ||
| - private final TokenType mType; | ||
| - private final int mBegan; | ||
| - private final int mEnded; | ||
| - | ||
| - /** | ||
| - * Set to {@code true} if there are more tokens to parse. | ||
| - */ | ||
| - private final boolean mHasNext; | ||
| - | ||
| - /** | ||
| - * Create an end of text token. | ||
| - */ | ||
| - private Token() { | ||
| - this( INVALID, -1, -1, false ); | ||
| - } | ||
| + private TokenType mType; | ||
| + private Lexeme mLexeme; | ||
| - /** | ||
| - * Create a token that indicates there are no more tokens. | ||
| - */ | ||
| - private Token( final TokenType type, final int began, final int ended ) { | ||
| - this( type, began, ended, true ); | ||
| - } | ||
| + public Token( final TokenType type, final Lexeme lexeme ) { | ||
| + assert type != null; | ||
| + assert lexeme != null; | ||
| - /** | ||
| - * Create a token | ||
| - */ | ||
| - private Token( | ||
| - final TokenType type, final int began, final int ended, | ||
| - final boolean hasNext ) { | ||
| mType = type; | ||
| - mBegan = began; | ||
| - mEnded = ended + 1; | ||
| - mHasNext = hasNext; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Extracts a sequence of characters from the given text at the offsets | ||
| - * captured by this token. | ||
| - * | ||
| - * @param text The text that was parsed using this class. | ||
| - * @return The character string captured by the token. | ||
| - */ | ||
| - public String toString( final String text ) { | ||
| - return text.substring( mBegan, mEnded ); | ||
| - } | ||
| - | ||
| - public boolean hasNext() { | ||
| - return mHasNext; | ||
| - } | ||
| - | ||
| - public boolean isType( final TokenType type ) { | ||
| - return mType == type; | ||
| - } | ||
| - | ||
| - public boolean anyType( final TokenType... types ) { | ||
| - for( final var type : types ) { | ||
| - if( mType == type ) { | ||
| - return true; | ||
| - } | ||
| - } | ||
| - | ||
| - return false; | ||
| - } | ||
| - | ||
| - TokenType getType() { | ||
| - return mType; | ||
| + mLexeme = lexeme; | ||
| } | ||
| @Override | ||
| public String toString() { | ||
| return getClass().getSimpleName() + "{" + | ||
| "mType=" + mType + | ||
| - ", mBegan=" + mBegan + | ||
| - ", mEnded=" + mEnded + | ||
| + ", mLexeme=" + mLexeme + | ||
| '}'; | ||
| - } | ||
| - | ||
| - static Token createToken( | ||
| - final TokenType token, final int began, final int ended ) { | ||
| - return new Token( token, began, ended ); | ||
| } | ||
| } |
| package com.keenwrite.quotes; | ||
| +/** | ||
| + * The {@link Parser} emits these token types. | ||
| + */ | ||
| public enum TokenType { | ||
| - QUOTE_SINGLE, | ||
| - QUOTE_DOUBLE, | ||
| - WORD, | ||
| - NUMBER, | ||
| - NEWLINE, | ||
| - SPACE, | ||
| - PUNCT, | ||
| - PERIOD, | ||
| - INVALID | ||
| + QUOTE_OPENING_SINGLE, | ||
| + QUOTE_OPENING_DOUBLE, | ||
| + QUOTE_CLOSING_SINGLE, | ||
| + QUOTE_CLOSING_DOUBLE, | ||
| + QUOTE_APOSTROPHE, | ||
| + QUOTE_PRIME_SINGLE, | ||
| + QUOTE_PRIME_DOUBLE, | ||
| + TEXT, | ||
| } | ||
| -/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| -package com.keenwrite.quotes; | ||
| - | ||
| -import java.text.CharacterIterator; | ||
| -import java.text.StringCharacterIterator; | ||
| -import java.util.function.Consumer; | ||
| - | ||
| -import static com.keenwrite.quotes.Token.createToken; | ||
| -import static com.keenwrite.quotes.TokenType.*; | ||
| -import static java.lang.Character.*; | ||
| -import static java.text.CharacterIterator.DONE; | ||
| - | ||
| -/** | ||
| - * Tokenizes text into words, numbers, punctuation, spaces, and more. | ||
| - */ | ||
| -public class Tokenizer { | ||
| - /** | ||
| - * Default constructor, no state. | ||
| - */ | ||
| - public Tokenizer() { | ||
| - } | ||
| - | ||
| - /** | ||
| - * Emits a series of tokens that represent information about text that is | ||
| - * needed to convert straight quotes to curly quotes. | ||
| - * | ||
| - * @param text The text to split into tokens. | ||
| - * @param consumer Receives each token as a separate event. | ||
| - */ | ||
| - public void tokenize( final String text, final Consumer<Token> consumer ) { | ||
| - final var iterator = new StringCharacterIterator( text ); | ||
| - Token token; | ||
| - | ||
| - while( (token = tokenize( iterator )).hasNext() ) { | ||
| - consumer.accept( token ); | ||
| - } | ||
| - } | ||
| - | ||
| - /** | ||
| - * Tokenizes a sequence of characters. The order of comparisons is optimized | ||
| - * towards probability of the occurrence of a character in regular English | ||
| - * prose: letters, space, quotation marks, numbers, periods, new lines, | ||
| - * then end of text. | ||
| - * | ||
| - * @param i The sequence of characters to tokenize. | ||
| - * @return The next token in the sequence. | ||
| - */ | ||
| - private Token tokenize( final CharacterIterator i ) { | ||
| - int began = i.getIndex(); | ||
| - boolean isWord = false; | ||
| - Token token = null; | ||
| - | ||
| - do { | ||
| - final var curr = i.current(); | ||
| - | ||
| - if( isLetter( curr ) ) { | ||
| - isWord = true; | ||
| - | ||
| - if( !isLetterOrDigit( peek( i ) ) ) { | ||
| - token = createToken( WORD, began, i.getIndex() ); | ||
| - } | ||
| - } | ||
| - else if( curr == ' ' ) { | ||
| - token = createToken( SPACE, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr == '\'' ) { | ||
| - token = createToken( QUOTE_SINGLE, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr == '"' ) { | ||
| - token = createToken( QUOTE_DOUBLE, began, i.getIndex() ); | ||
| - } | ||
| - else if( isDigit( curr ) || isNumeric( curr ) && isDigit( peek( i ) ) ) { | ||
| - // Tokenize all consecutive number characters at prevent the main | ||
| - // loop from switching back to word tokens. | ||
| - char next; | ||
| - | ||
| - do { | ||
| - next = i.next(); | ||
| - } | ||
| - while( isDigit( next ) || isNumeric( next ) && isDigit( peek( i ) ) ); | ||
| - | ||
| - // The loop above will overshoot the number by one character. | ||
| - i.previous(); | ||
| - | ||
| - token = createToken( isWord ? WORD : NUMBER, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr == '.' ) { | ||
| - token = createToken( PERIOD, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr == '\r' ) { | ||
| - token = createToken( NEWLINE, began, i.getIndex() ); | ||
| - | ||
| - // Swallow the LF in CRLF; peeking won't work here. | ||
| - if( i.next() != '\n' ) { | ||
| - // Push back the non-LF char. | ||
| - i.previous(); | ||
| - } | ||
| - } | ||
| - else if( curr == '\n' ) { | ||
| - token = createToken( NEWLINE, began, i.getIndex() ); | ||
| - } | ||
| - else if( isWhitespace( curr ) ) { | ||
| - token = createToken( SPACE, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr != DONE ) { | ||
| - token = createToken( PUNCT, began, i.getIndex() ); | ||
| - } | ||
| - else { | ||
| - token = Token.EOT; | ||
| - } | ||
| - | ||
| - i.next(); | ||
| - } | ||
| - while( token == null ); | ||
| - | ||
| - return token; | ||
| - } | ||
| - | ||
| - private static boolean isNumeric( final char curr ) { | ||
| - return curr == '.' || curr == ','; | ||
| - } | ||
| - | ||
| - private static char peek( final CharacterIterator ci ) { | ||
| - final var ch = ci.next(); | ||
| - ci.previous(); | ||
| - return ch; | ||
| - } | ||
| -} | ||
| +/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.keenwrite.quotes; | ||
| + | ||
| +import org.junit.jupiter.api.Test; | ||
| + | ||
| +import java.util.concurrent.atomic.AtomicInteger; | ||
| + | ||
| +import static com.keenwrite.quotes.LexemeType.*; | ||
| +import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| + | ||
| +/** | ||
| + * Tests lexing words, numbers, punctuation, spaces, and newlines. | ||
| + */ | ||
| +class LexerTest { | ||
| + @Test | ||
| + void test_Lexing_Words_TokenValues() { | ||
| + testText( "abc 123", "abc", " ", "123" ); | ||
| + testText( "-123 abc", "-", "123", " ", "abc" ); | ||
| + } | ||
| + | ||
| + @Test | ||
| + void test_Lexing_Numbers_EmitNumbers() { | ||
| + testType( ".123", NUMBER ); | ||
| + testType( "-123.", PUNCT, NUMBER, PERIOD ); | ||
| + testType( " 123.123.123", SPACE, NUMBER ); | ||
| + testType( "123 123\"", NUMBER, SPACE, NUMBER, QUOTE_DOUBLE ); | ||
| + testType( "-123,123.123", PUNCT, NUMBER ); | ||
| + } | ||
| + | ||
| + @Test | ||
| + void test_Lexing_Words_EmitWords() { | ||
| + testType( "abc", WORD ); | ||
| + testType( "abc abc", WORD, SPACE, WORD ); | ||
| + testType( "abc123", WORD ); | ||
| + testType( "-123abc", PUNCT, NUMBER, WORD ); | ||
| + testType( "abc-o'-abc", WORD, PUNCT, WORD, QUOTE_SINGLE, PUNCT, WORD ); | ||
| + } | ||
| + | ||
| + @Test | ||
| + void test_Lexing_PunctuationMarks_EmitPunctuationMarks() { | ||
| + testType( "!", PUNCT ); | ||
| + testType( ";", PUNCT ); | ||
| + testType( ".", PERIOD ); | ||
| + } | ||
| + | ||
| + @Test | ||
| + void test_Lexing_Quotes_EmitQuotes() { | ||
| + testType( "'", QUOTE_SINGLE ); | ||
| + testType( "\"", QUOTE_DOUBLE ); | ||
| + testType( "3 o'clock", NUMBER, SPACE, WORD, QUOTE_SINGLE, WORD ); | ||
| + } | ||
| + | ||
| + @Test | ||
| + void test_Lexing_Newlines_EmitNewlines() { | ||
| + testType( "\r", NEWLINE ); | ||
| + testType( "\n", NEWLINE ); | ||
| + testType( "\r\n", NEWLINE ); | ||
| + testType( "\r\n\r\n", NEWLINE, NEWLINE ); | ||
| + testType( "\r\n\n\r", NEWLINE, NEWLINE, NEWLINE ); | ||
| + testType( "abc \r\nabc\n", WORD, SPACE, NEWLINE, WORD, NEWLINE ); | ||
| + } | ||
| + | ||
| + private void testType( | ||
| + final String actual, final LexemeType... expected ) { | ||
| + final var tokenizer = new Lexer(); | ||
| + final var counter = new AtomicInteger(); | ||
| + | ||
| + tokenizer.parse( actual, ( token ) -> { | ||
| + final var expectedType = expected[ counter.getAndIncrement() ]; | ||
| + final var actualType = token.getType(); | ||
| + assertEquals( expectedType, actualType ); | ||
| + } ); | ||
| + | ||
| + // Ensure all expected tokens are matched (verify end of text reached). | ||
| + assertEquals( expected.length, counter.get() ); | ||
| + } | ||
| + | ||
| + private void testText( final String actual, final String... expected ) { | ||
| + final var tokenizer = new Lexer(); | ||
| + final var counter = new AtomicInteger(); | ||
| + | ||
| + tokenizer.parse( actual, ( token ) -> { | ||
| + final var expectedText = expected[ counter.getAndIncrement() ]; | ||
| + final var actualText = token.toString( actual ); | ||
| + assertEquals( expectedText, actualText ); | ||
| + } ); | ||
| + | ||
| + // Ensure all expected tokens are matched (verify end of text reached). | ||
| + assertEquals( expected.length, counter.get() ); | ||
| + } | ||
| +} | ||
| @Test | ||
| void test_Conversion_Straight_Curly() { | ||
| - new Parser( "\"It's the 70's jack-o'-lantern\"").parse(); | ||
| - new Parser( "Fish-'n'-chips!").parse(); | ||
| - new Parser( "That's a 35' x 10\" yacht!").parse(); | ||
| - //new Parser( "'70s are Sams' faves.'").parse(); | ||
| + parse( "\"It's the 70's jack-o'-lantern\"" ); | ||
| + parse( "Fish-'n'-chips!" ); | ||
| + parse( "That's a 35' x 10\" yacht!" ); | ||
| + parse( "He's a kinda' cat ya'll couldn't've known!" ); | ||
| + parse( "'70s are Sams' faves.'" ); | ||
| + parse( "'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx." ); | ||
| + parse( """ | ||
| + But I must leave the proofs to those who 've seen 'em; | ||
| + But this I heard her say, and can't be wrong | ||
| + And all may think which way their judgments lean 'em, | ||
| + ''T is strange—the Hebrew noun which means "I am," | ||
| + The English always use to govern d--n.' | ||
| + """ ); | ||
| + } | ||
| + | ||
| + private void parse( final String text ) { | ||
| + final var parser = new Parser( text, System.out::println ); | ||
| + parser.parse(); | ||
| } | ||
| } |
| -/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| -package com.keenwrite.quotes; | ||
| - | ||
| -import org.junit.jupiter.api.Test; | ||
| - | ||
| -import java.util.concurrent.atomic.AtomicInteger; | ||
| - | ||
| -import static com.keenwrite.quotes.TokenType.*; | ||
| -import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| - | ||
| -/** | ||
| - * Tests tokenizing words, numbers, punctuation, spaces, and newlines. | ||
| - */ | ||
| -class TokenizerTest { | ||
| - @Test | ||
| - void test_Tokenize_Words_TokenValues() { | ||
| - testText( "abc 123", "abc", " ", "123" ); | ||
| - testText( "-123 abc", "-", "123", " ", "abc" ); | ||
| - } | ||
| - | ||
| - @Test | ||
| - void test_Tokenize_Numbers_EmitNumbers() { | ||
| - testType( ".123", NUMBER ); | ||
| - testType( "-123.", PUNCT, NUMBER, PERIOD ); | ||
| - testType( " 123.123.123", SPACE, NUMBER ); | ||
| - testType( "123 123\"", NUMBER, SPACE, NUMBER, QUOTE_DOUBLE ); | ||
| - testType( "-123,123.123", PUNCT, NUMBER ); | ||
| - } | ||
| - | ||
| - @Test | ||
| - void test_Tokenize_Words_EmitWords() { | ||
| - testType( "abc", WORD ); | ||
| - testType( "abc abc", WORD, SPACE, WORD ); | ||
| - testType( "abc123", WORD ); | ||
| - testType( "-123abc", PUNCT, NUMBER, WORD ); | ||
| - testType( "abc-o'-abc", WORD, PUNCT, WORD, QUOTE_SINGLE, PUNCT, WORD ); | ||
| - } | ||
| - | ||
| - @Test | ||
| - void test_Tokenize_PunctuationMarks_EmitPunctuationMarks() { | ||
| - testType( "!", PUNCT ); | ||
| - testType( ";", PUNCT ); | ||
| - testType( ".", PERIOD ); | ||
| - } | ||
| - | ||
| - @Test | ||
| - void test_Tokenize_Quotes_EmitQuotes() { | ||
| - testType( "'", QUOTE_SINGLE ); | ||
| - testType( "\"", QUOTE_DOUBLE ); | ||
| - testType( "3 o'clock", NUMBER, SPACE, WORD, QUOTE_SINGLE, WORD ); | ||
| - } | ||
| - | ||
| - @Test | ||
| - void test_Tokenize_Newlines_EmitNewlines() { | ||
| - testType( "\r", NEWLINE ); | ||
| - testType( "\n", NEWLINE ); | ||
| - testType( "\r\n", NEWLINE ); | ||
| - testType( "\r\n\r\n", NEWLINE, NEWLINE ); | ||
| - testType( "\r\n\n\r", NEWLINE, NEWLINE, NEWLINE ); | ||
| - testType( "abc \r\nabc\n", WORD, SPACE, NEWLINE, WORD, NEWLINE ); | ||
| - } | ||
| - | ||
| - private void testType( | ||
| - final String actual, final TokenType... expected ) { | ||
| - final var tokenizer = new Tokenizer(); | ||
| - final var counter = new AtomicInteger(); | ||
| - | ||
| - tokenizer.tokenize( actual, ( token ) -> { | ||
| - final var expectedType = expected[ counter.getAndIncrement() ]; | ||
| - final var actualType = token.getType(); | ||
| - assertEquals( expectedType, actualType ); | ||
| - } ); | ||
| - | ||
| - // Ensure all expected tokens are matched (verify end of text reached). | ||
| - assertEquals( expected.length, counter.get() ); | ||
| - } | ||
| - | ||
| - private void testText( final String actual, final String... expected ) { | ||
| - final var tokenizer = new Tokenizer(); | ||
| - final var counter = new AtomicInteger(); | ||
| - | ||
| - tokenizer.tokenize( actual, ( token ) -> { | ||
| - final var expectedText = expected[ counter.getAndIncrement() ]; | ||
| - final var actualText = token.toString( actual ); | ||
| - assertEquals( expectedText, actualText ); | ||
| - } ); | ||
| - | ||
| - // Ensure all expected tokens are matched (verify end of text reached). | ||
| - assertEquals( expected.length, counter.get() ); | ||
| - } | ||
| -} | ||
| Delta | 455 lines added, 343 lines removed, 112-line increase |
|---|