Optimizes lexer, in theory

Author	Dave Jarvis <email>
Date	2023-07-22 13:00:22 GMT-0700
Commit	ee98e47acd208766060f2367aada9bdaffcb1136
Parent	5c97a90

src/main/java/com/whitemagicsoftware/keenquotes/lex/Lexer.java

		import com.whitemagicsoftware.keenquotes.util.FastCharacterIterator;

		+import java.util.Arrays;
		+import java.util.HashSet;
		+import java.util.Set;
		import java.util.function.Consumer;

		import static com.whitemagicsoftware.keenquotes.lex.LexemeGlyph.*;
		import static com.whitemagicsoftware.keenquotes.lex.LexemeType.*;
		import static java.lang.Character.isWhitespace;
		import static java.text.CharacterIterator.DONE;

		/**
		- * Turns text into words, numbers, punctuation, spaces, and more.
		+ * Turns text into words, numbers, punctuation, spaces, and more. Using sets
		+ * (i.e., {@link Set#contains(Object)}) provides a constant-time, O(1) lookup
		+ * and is theoretically faster than either {@link String#indexOf(int, int)}
		+ * or a series of linear-time {@code if} statements.
		*/
		public final class Lexer {
		+ private static final Set<Character> DIGIT_CHARS = new HashSet<>(
		+ Arrays.asList(
		+ '¼', '½', '¾', '⅐', '⅑', '⅒',
		+ '⅓', '⅔', '⅕', '⅖', '⅗', '⅘',
		+ '⅙', '⅚', '⅛', '⅜', '⅝', '⅞'
		+ )
		+ );
		+
		+ private static final Set<Character> NUMERIC_CHARS = new HashSet<>(
		+ Arrays.asList( '.', ',', '-', '+', '^', '⅟', '⁄' )
		+ );
		+
		+ private static final Set<Character> DASH_CHARS = new HashSet<>(
		+ Arrays.asList( '-', '–', '—', '―' )
		+ );
		+
		+ /**
		+ * Consider {@code _} and {@code *} as letters (in a word) because plain
		+ * text formats often use those characters to emphasize a word.
		+ */
		+ private static final Set<Character> LETTER_CHARS = new HashSet<>(
		+ Arrays.asList( '_', '*' )
		+ );
		+
		/**
		* Tokenizes a sequence of characters. The order of comparisons is optimized

		}
		else if( curr == DONE ) {
		- continue;
		+ // Having a single exit point to a loop simplifies the control flow for
		+ // human readers, but breaking removes a superfluous iteration: a minor
		+ // optimization.
		+ break;
		}

		assert index >= 0;
		- assert curr != DONE;

		consumer.accept( new Lexeme( token, index, i.index() + 1 ) );

		/**
		* Answers whether the given character can be considered part of a word
		- * or not. This will include {@code _} and {@code *} because plain text
		- * formats often use those characters to emphasize a word.
		+ * or not.
		*
		* @param curr The character to check as being part of a word.
		* @return {@code true} if the given character is a letter or a formatting
		* indicator.
		*/
		private static boolean isLetter( final char curr ) {
		- return Character.isLetter( curr ) || curr == '_' || curr == '*';
		+ return Character.isLetter( curr ) || LETTER_CHARS.contains( curr );
		}

		private static boolean isDigit( final char curr ) {
		- return Character.isDigit( curr ) ||
		- "¼½¾⅐⅑⅒⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞".indexOf( curr ) >= 0;
		+ return Character.isDigit( curr ) || DIGIT_CHARS.contains( curr );
		}


		*/
		private static boolean isDash( final char curr ) {
		- return curr == '-' || curr == '–' || curr == '—' || curr == '―';
		+ return DASH_CHARS.contains( curr );
		}


		*/
		private static boolean isNumeric( final char curr ) {
		- return curr == '.' || curr == ',' || curr == '-' || curr == '+' ||
		- curr == '^' || curr == '⅟' || curr == '⁄';
		+ return NUMERIC_CHARS.contains( curr );
		}
		}

Delta	40 lines added, 11 lines removed, 29-line increase