| | import com.whitemagicsoftware.keenquotes.util.FastCharacterIterator; |
| | |
| | +import java.util.Arrays; |
| | +import java.util.HashSet; |
| | +import java.util.Set; |
| | import java.util.function.Consumer; |
| | |
| | import static com.whitemagicsoftware.keenquotes.lex.LexemeGlyph.*; |
| | import static com.whitemagicsoftware.keenquotes.lex.LexemeType.*; |
| | import static java.lang.Character.isWhitespace; |
| | import static java.text.CharacterIterator.DONE; |
| | |
| | /** |
| | - * Turns text into words, numbers, punctuation, spaces, and more. |
| | + * Turns text into words, numbers, punctuation, spaces, and more. Using sets |
| | + * (i.e., {@link Set#contains(Object)}) provides a constant-time O(1) lookup |
| | + * and is theoretically faster than either {@link String#indexOf(int)} |
| | + * or a series of linear-time {@code if} statements. |
| | */ |
| | public final class Lexer { |
| | + private static final Set<Character> DIGIT_CHARS = new HashSet<>( |
| | + Arrays.asList( |
| | + '¼', '½', '¾', '⅐', '⅑', '⅒', |
| | + '⅓', '⅔', '⅕', '⅖', '⅗', '⅘', |
| | + '⅙', '⅚', '⅛', '⅜', '⅝', '⅞' |
| | + ) |
| | + ); |
| | + |
| | + private static final Set<Character> NUMERIC_CHARS = new HashSet<>( |
| | + Arrays.asList( '.', ',', '-', '+', '^', '⅟', '⁄' ) |
| | + ); |
| | + |
| | + private static final Set<Character> DASH_CHARS = new HashSet<>( |
| | + Arrays.asList( '-', '–', '—', '―' ) |
| | + ); |
| | + |
| | + /** |
| | + * Consider {@code _} and {@code *} as letters (in a word) because plain |
| | + * text formats often use those characters to emphasize a word. |
| | + */ |
| | + private static final Set<Character> LETTER_CHARS = new HashSet<>( |
| | + Arrays.asList( '_', '*' ) |
| | + ); |
| | + |
| | /** |
| | * Tokenizes a sequence of characters. The order of comparisons is optimized |
 |
| | } |
| | else if( curr == DONE ) { |
| | - continue; |
| | + // Having a single exit point to a loop simplifies the control flow for |
| | + // human readers, but breaking removes a superfluous iteration: a minor |
| | + // optimization. |
| | + break; |
| | } |
| | |
| | assert index >= 0; |
| | - assert curr != DONE; |
| | |
| | consumer.accept( new Lexeme( token, index, i.index() + 1 ) ); |
 |
| | /** |
| | * Answers whether the given character can be considered part of a word |
| | - * or not. This will include {@code _} and {@code *} because plain text |
| | - * formats often use those characters to emphasize a word. |
| | + * or not. |
| | * |
| | * @param curr The character to check as being part of a word. |
| | * @return {@code true} if the given character is a letter or a formatting |
| | * indicator. |
| | */ |
| | private static boolean isLetter( final char curr ) { |
| | - return Character.isLetter( curr ) || curr == '_' || curr == '*'; |
| | + return Character.isLetter( curr ) || LETTER_CHARS.contains( curr ); |
| | } |
| | |
| | private static boolean isDigit( final char curr ) { |
| | - return Character.isDigit( curr ) || |
| | - "¼½¾⅐⅑⅒⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞".indexOf( curr ) >= 0; |
| | + return Character.isDigit( curr ) || DIGIT_CHARS.contains( curr ); |
| | } |
| | |
 |
| | */ |
| | private static boolean isDash( final char curr ) { |
| | - return curr == '-' || curr == '–' || curr == '—' || curr == '―'; |
| | + return DASH_CHARS.contains( curr ); |
| | } |
| | |
 |
| | */ |
| | private static boolean isNumeric( final char curr ) { |
| | - return curr == '.' || curr == ',' || curr == '-' || curr == '+' || |
| | - curr == '^' || curr == '⅟' || curr == '⁄'; |
| | + return NUMERIC_CHARS.contains( curr ); |
| | } |
| | } |