Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Optimizes lexer, in theory

Author: Dave Jarvis <email>
Date: 2023-07-22 13:00:22 GMT-0700
Commit: ee98e47acd208766060f2367aada9bdaffcb1136
Parent: 5c97a90
src/main/java/com/whitemagicsoftware/keenquotes/lex/Lexer.java
import com.whitemagicsoftware.keenquotes.util.FastCharacterIterator;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
import java.util.function.Consumer;
import static com.whitemagicsoftware.keenquotes.lex.LexemeGlyph.*;
import static com.whitemagicsoftware.keenquotes.lex.LexemeType.*;
import static java.lang.Character.isWhitespace;
import static java.text.CharacterIterator.DONE;
/**
- * Turns text into words, numbers, punctuation, spaces, and more.
+ * Turns text into words, numbers, punctuation, spaces, and more. Using sets
+ * (i.e., {@link Set#contains(Object)}) provides a constant-time O(1) lookup
+ * and is theoretically faster than either {@link String#indexOf(int)}
+ * or a series of linear-time {@code if} statements.
*/
public final class Lexer {
+ private static final Set<Character> DIGIT_CHARS = new HashSet<>(
+ Arrays.asList(
+ '¼', '½', '¾', '⅐', '⅑', '⅒',
+ '⅓', '⅔', '⅕', '⅖', '⅗', '⅘',
+ '⅙', '⅚', '⅛', '⅜', '⅝', '⅞'
+ )
+ );
+
+ private static final Set<Character> NUMERIC_CHARS = new HashSet<>(
+ Arrays.asList( '.', ',', '-', '+', '^', '⅟', '⁄' )
+ );
+
+ private static final Set<Character> DASH_CHARS = new HashSet<>(
+ Arrays.asList( '-', '–', '—', '―' )
+ );
+
+ /**
+ * Consider {@code _} and {@code *} as letters (in a word) because plain
+ * text formats often use those characters to emphasize a word.
+ */
+ private static final Set<Character> LETTER_CHARS = new HashSet<>(
+ Arrays.asList( '_', '*' )
+ );
+
/**
* Tokenizes a sequence of characters. The order of comparisons is optimized
}
else if( curr == DONE ) {
- continue;
+ // Having a single exit point to a loop simplifies the control flow for
+ // human readers, but breaking removes a superfluous iteration: a minor
+ // optimization.
+ break;
}
assert index >= 0;
- assert curr != DONE;
consumer.accept( new Lexeme( token, index, i.index() + 1 ) );
/**
* Answers whether the given character can be considered part of a word
- * or not. This will include {@code _} and {@code *} because plain text
- * formats often use those characters to emphasize a word.
+ * or not.
*
* @param curr The character to check as being part of a word.
* @return {@code true} if the given character is a letter or a formatting
* indicator.
*/
private static boolean isLetter( final char curr ) {
- return Character.isLetter( curr ) || curr == '_' || curr == '*';
+ return Character.isLetter( curr ) || LETTER_CHARS.contains( curr );
}
private static boolean isDigit( final char curr ) {
- return Character.isDigit( curr ) ||
- "¼½¾⅐⅑⅒⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞".indexOf( curr ) >= 0;
+ return Character.isDigit( curr ) || DIGIT_CHARS.contains( curr );
}
*/
private static boolean isDash( final char curr ) {
- return curr == '-' || curr == '–' || curr == '—' || curr == '―';
+ return DASH_CHARS.contains( curr );
}
*/
private static boolean isNumeric( final char curr ) {
- return curr == '.' || curr == ',' || curr == '-' || curr == '+' ||
- curr == '^' || curr == '⅟' || curr == '⁄';
+ return NUMERIC_CHARS.contains( curr );
}
}
Delta: 40 lines added, 11 lines removed, 29-line increase