| 1 | # Release Notes |
|
| 2 | ||
| 3 | * **Version:** 2.5.5 |
|
| 4 | * **Released:** August 25, 2025 |
|
| 5 | ||
| 6 | ## What's New |
|
| 7 | ||
| 8 | * **Enhancement**: Added protection for script and style elements |
|
| 9 | * **Output Enhancement**: Converts three consecutive periods (`...`) into a single ellipsis character (`…`) |
|
| 10 | * **Code Formatting**: Reformatted codebase for improved consistency |
|
| 11 | ||
| 1 | 12 |
#!/usr/bin/env bash
#
# Publishes the most recent tagged release to GitLab and uploads the
# distributable JAR as a release asset.
#
# Fix: authentication must run BEFORE any `glab release` command; the
# original script created the release prior to logging in.

set -euo pipefail

# Derive the release name from the most recent annotated tag.
RELEASE=$(git describe --abbrev=0 --tags)

# Authenticate first; subsequent glab commands require a valid session.
# Redirect the token file directly rather than piping through cat.
# SECURITY NOTE(review): the token file lives inside the repository tree;
# confirm it is excluded from version control (e.g. via .gitignore).
glab auth login --hostname gitlab.com --stdin < tokens/keenquotes.pat

glab release create "${RELEASE}"

glab release upload "${RELEASE}" build/libs/keenquotes.jar
| 11 | 1 |
| 27 | 27 | LEX_DOUBLE_CHEVRON_RIGHT( '»' ), |
| 28 | 28 | LEX_SINGLE_CHEVRON_LEFT( '‹' ), |
| 29 | LEX_SINGLE_CHEVRON_RIGHT( '›' ); |
|
| 29 | LEX_SINGLE_CHEVRON_RIGHT( '›' ), |
|
| 30 | ||
| 31 | LEX_SPACE( ' ' ), |
|
| 32 | ||
| 33 | LEX_ELLIPSES( '…' ), |
|
| 34 | LEX_DASH_EN( '–' ), |
|
| 35 | LEX_DASH_EM( '—' ), |
|
| 36 | LEX_HYPHEN( '-' ); |
|
| 30 | 37 | |
| 31 | 38 | private final char mGlyph; |
| 23 | 23 | public static final LexemeType EOP = new LexemeType(); |
| 24 | 24 | public static final LexemeType EOT = new LexemeType(); |
| 25 | public static final LexemeType SPACE = new LexemeType(); |
|
| 25 | public static final LexemeType SPACE = new LexemeType( LEX_SPACE ); |
|
| 26 | 26 | public static final LexemeType WORD = new LexemeType(); |
| 27 | 27 | public static final LexemeType NUMBER = new LexemeType(); |
| 28 | 28 | public static final LexemeType PUNCT = new LexemeType(); |
| 29 | 29 | public static final LexemeType OPENING_GROUP = new LexemeType(); |
| 30 | 30 | public static final LexemeType CLOSING_GROUP = new LexemeType(); |
| 31 | public static final LexemeType HYPHEN = new LexemeType(); |
|
| 32 | public static final LexemeType DASH = new LexemeType(); |
|
| 31 | public static final LexemeType DASH_EN = new LexemeType( LEX_DASH_EN ); |
|
| 32 | public static final LexemeType DASH_EM = new LexemeType( LEX_DASH_EM ); |
|
| 33 | public static final LexemeType DASH_LINE = new LexemeType(); |
|
| 34 | public static final LexemeType HYPHEN = new LexemeType( LEX_HYPHEN ); |
|
| 33 | 35 | public static final LexemeType EQUALS = new LexemeType(); |
| 34 | 36 | public static final LexemeType PERIOD = new LexemeType(); |
| 35 | public static final LexemeType ELLIPSIS = new LexemeType(); |
|
| 37 | public static final LexemeType ELLIPSIS = new LexemeType( LEX_ELLIPSES ); |
|
| 36 | 38 | public static final LexemeType ENDING = new LexemeType(); |
| 37 | 39 | public static final LexemeType ANY = new LexemeType(); |
| 33 | 33 | ); |
| 34 | 34 | |
| 35 | private static final Set<Character> DASH_CHARS = new HashSet<>( |
|
| 36 | Arrays.asList( '-', '–', '—', '―' ) |
|
| 37 | ); |
|
| 38 | ||
| 39 | 35 | /** |
| 40 | 36 | * Consider {@code _} and {@code *} as letters (in a word) because plain |
| ... | ||
| 94 | 90 | } |
| 95 | 91 | else if( curr == '\r' || curr == '\n' ) { |
| 96 | final var cr = new int[]{curr == '\r' ? 1 : 0}; |
|
| 97 | final var lf = new int[]{curr == '\n' ? 1 : 0}; |
|
| 92 | final var cr = new int[]{ curr == '\r' ? 1 : 0 }; |
|
| 93 | final var lf = new int[]{ curr == '\n' ? 1 : 0 }; |
|
| 98 | 94 | |
| 99 | 95 | // Swallow all consecutive CR (Mac), CRLF (Windows), and/or LF (Unix). |
| ... | ||
| 132 | 128 | token = QUOTE_SINGLE; |
| 133 | 129 | } |
| 134 | else if( curr == '-' && i.peek() != '-' ) { |
|
| 135 | token = HYPHEN; |
|
| 130 | else if( isHyphen( curr ) ) { |
|
| 131 | final var began = i.index(); |
|
| 132 | i.skip( Lexer::isHyphen ); |
|
| 133 | final var count = i.index() - began; |
|
| 134 | ||
| 135 | token = switch( count ) { |
|
| 136 | case 0 -> HYPHEN; |
|
| 137 | case 1 -> DASH_EN; |
|
| 138 | case 2 -> DASH_EM; |
|
| 139 | default -> DASH_LINE; |
|
| 140 | }; |
|
| 136 | 141 | } |
| 137 | else if( isDash( curr ) ) { |
|
| 138 | i.skip( Lexer::isDash ); |
|
| 139 | token = DASH; |
|
| 142 | else if( curr == '–' ) { |
|
| 143 | token = DASH_EN; |
|
| 144 | } |
|
| 145 | else if( curr == '—' ) { |
|
| 146 | token = DASH_EM; |
|
| 140 | 147 | } |
| 141 | 148 | else if( curr == '(' || curr == '{' || curr == '[' ) { |
| ... | ||
| 231 | 238 | |
| 232 | 239 | /** |
| 233 | * Answers whether the given character may be part of an en- or em-dash. |
|
| 234 | * This must be called after it is known that the character isn't a lone |
|
| 235 | * hyphen. |
|
| 240 | * Answers whether the given character is a hyphen. |
|
| 236 | 241 | * |
| 237 | * @param curr The character to check as being a dash. |
|
| 238 | * @return {@code true} if the given character is part of a dash. |
|
| 242 | * @param curr The character to check as being a hyphen. |
|
| 243 | * @return {@code true} if the given character is a hyphen. |
|
| 239 | 244 | */ |
| 240 | private static boolean isDash( final char curr ) { |
|
| 241 | return DASH_CHARS.contains( curr ); |
|
| 245 | private static boolean isHyphen( final char curr ) { |
|
| 246 | return curr == '-'; |
|
| 242 | 247 | } |
| 243 | 248 | |
| 30 | 30 | }; |
| 31 | 31 | |
| 32 | private static final LexemeType[] PUNCT_PERIOD_ELLIPSIS_DASH = { |
|
| 33 | PUNCT, PERIOD, ELLIPSIS, DASH |
|
| 34 | }; |
|
| 35 | ||
| 36 | private static final LexemeType[] PUNCT_PERIOD = { |
|
| 37 | PUNCT, PERIOD |
|
| 38 | }; |
|
| 39 | ||
| 40 | private static final LexemeType[] SPACE_DASH_ENDING = { |
|
| 41 | SPACE, DASH, ENDING |
|
| 42 | }; |
|
| 43 | ||
| 44 | private static final LexemeType[] SPACE_ENDING = { |
|
| 45 | SPACE, ENDING |
|
| 46 | }; |
|
| 47 | ||
| 48 | private static final LexemeType[] SPACE_HYPHEN = { |
|
| 49 | SPACE, HYPHEN |
|
| 50 | }; |
|
| 51 | ||
| 52 | private static final LexemeType[] SPACE_PUNCT = { |
|
| 53 | SPACE, PUNCT |
|
| 54 | }; |
|
| 55 | ||
| 56 | private static final LexemeType[] SPACE_SOT = { |
|
| 57 | SPACE, SOT |
|
| 58 | }; |
|
| 59 | ||
| 60 | /** |
|
| 61 | * Single quotes preceded by these {@link LexemeType}s may be opening quotes. |
|
| 62 | */ |
|
| 63 | private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE = |
|
| 64 | new LexemeType[]{ |
|
| 65 | LexemeType.SOT, SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP |
|
| 66 | }; |
|
| 67 | ||
| 68 | /** |
|
| 69 | * Single quotes succeeded by these {@link LexemeType}s may be opening quotes. |
|
| 70 | */ |
|
| 71 | private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE = |
|
| 72 | new LexemeType[]{ |
|
| 73 | WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE |
|
| 74 | }; |
|
| 75 | ||
| 76 | /** |
|
| 77 | * Single quotes preceded by these {@link LexemeType}s may be closing quotes. |
|
| 78 | */ |
|
| 79 | private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE = |
|
| 80 | new LexemeType[]{ |
|
| 81 | WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE |
|
| 82 | }; |
|
| 83 | ||
| 84 | /** |
|
| 85 | * Single quotes succeeded by these {@link LexemeType}s may be closing quotes. |
|
| 86 | */ |
|
| 87 | private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE = |
|
| 88 | new LexemeType[]{ |
|
| 89 | SPACE, HYPHEN, DASH, PUNCT, PERIOD, ELLIPSIS, QUOTE_DOUBLE, CLOSING_GROUP, |
|
| 90 | ENDING |
|
| 91 | }; |
|
| 92 | ||
| 93 | /** |
|
| 94 | * Double quotes preceded by these {@link LexemeType}s may be opening quotes. |
|
| 95 | */ |
|
| 96 | private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE = |
|
| 97 | new LexemeType[]{ |
|
| 98 | LexemeType.SOT, SPACE, DASH, EQUALS, OPENING_GROUP, EOL, EOP |
|
| 99 | }; |
|
| 100 | ||
| 101 | /** |
|
| 102 | * Double quotes succeeded by these {@link LexemeType}s may be opening quotes. |
|
| 103 | */ |
|
| 104 | private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE = |
|
| 105 | new LexemeType[]{ |
|
| 106 | WORD, PUNCT, NUMBER, DASH, ELLIPSIS, OPENING_GROUP, QUOTE_SINGLE, |
|
| 107 | QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING, QUOTE_DOUBLE |
|
| 108 | }; |
|
| 109 | ||
| 110 | /** |
|
| 111 | * Double quotes preceded by these {@link LexemeType}s may be closing quotes. |
|
| 112 | */ |
|
| 113 | private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE = |
|
| 114 | new LexemeType[]{ |
|
| 115 | WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS, CLOSING_GROUP, QUOTE_SINGLE, |
|
| 116 | QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING |
|
| 117 | }; |
|
| 118 | ||
| 119 | /** |
|
| 120 | * Double quotes succeeded by these {@link LexemeType}s may be closing quotes. |
|
| 121 | */ |
|
| 122 | private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE = |
|
| 123 | new LexemeType[]{ |
|
| 124 | SPACE, PUNCT, PERIOD, EQUALS, HYPHEN, DASH, QUOTE_SINGLE, CLOSING_GROUP, |
|
| 125 | ENDING |
|
| 126 | }; |
|
| 127 | ||
| 128 | private final CircularFifoQueue<Lexeme> mQ = new CircularFifoQueue<>( 4 ); |
|
| 129 | private final String mText; |
|
| 130 | private final Contractions mContractions; |
|
| 131 | private final Consumer<Token> mConsumer; |
|
| 132 | ||
| 133 | public QuoteEmitter( |
|
| 134 | final String text, |
|
| 135 | final Contractions contractions, |
|
| 136 | final Consumer<Token> consumer |
|
| 137 | ) { |
|
| 138 | assert text != null; |
|
| 139 | assert contractions != null; |
|
| 140 | ||
| 141 | mText = text; |
|
| 142 | mContractions = contractions; |
|
| 143 | mConsumer = consumer; |
|
| 144 | } |
|
| 145 | ||
| 146 | /** |
|
| 147 | * Scans the given text document for quotation marks and passes them to the |
|
| 148 | * given {@link Token} {@link Consumer}. |
|
| 149 | * |
|
| 150 | * @param text The prose to lex. |
|
| 151 | * @param contractions List of ambiguous and unambiguous contractions. |
|
| 152 | * @param consumer Receives each quotation {@link Token} as it is emitted. |
|
| 153 | */ |
|
| 154 | public static void analyze( |
|
| 155 | final String text, |
|
| 156 | final Contractions contractions, |
|
| 157 | final Consumer<Token> consumer, |
|
| 158 | final LexerFilter filter |
|
| 159 | ) { |
|
| 160 | final var emitter = new QuoteEmitter( text, contractions, consumer ); |
|
| 161 | Lexer.lex( text, emitter, filter ); |
|
| 162 | } |
|
| 163 | ||
| 164 | /** |
|
| 165 | * @param lexeme the input argument |
|
| 166 | */ |
|
| 167 | @Override |
|
| 168 | public void accept( final Lexeme lexeme ) { |
|
| 169 | mQ.add( lexeme ); |
|
| 170 | ||
| 171 | if( mQ.size() == 4 ) { |
|
| 172 | parse(); |
|
| 173 | } |
|
| 174 | } |
|
| 175 | ||
| 176 | private void parse() { |
|
| 177 | final var lex1 = mQ.get( 0 ); |
|
| 178 | final var lex2 = mQ.get( 1 ); |
|
| 179 | final var lex3 = mQ.get( 2 ); |
|
| 180 | final var lex4 = mQ.get( 3 ); |
|
| 181 | ||
| 182 | // <y'all>, <Ph.D.'ll>, <20's>, <she's> |
|
| 183 | if( match( WORD_PERIOD_NUMBER, QUOTE_SINGLE, WORD, ANY ) ) { |
|
| 184 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 185 | } |
|
| 186 | // <'n'>, <'N'>, <'owlin'> |
|
| 187 | else if( |
|
| 188 | match( ANY, QUOTE_SINGLE, WORD, QUOTE_SINGLE ) && |
|
| 189 | mContractions.beganEndedUnambiguously( lex3.toString( mText ) ) |
|
| 190 | ) { |
|
| 191 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 192 | emit( QUOTE_APOSTROPHE, lex4 ); |
|
| 193 | mQ.set( Lexeme.NONE, 3 ); |
|
| 194 | } |
|
| 195 | // <2''> |
|
| 196 | else if( match( NUMBER, QUOTE_SINGLE, QUOTE_SINGLE, ANY ) ) { |
|
| 197 | // Force double primes to conform to the same constructor usage. This |
|
| 198 | // simplifies the tokens and reduces some memory usage. |
|
| 199 | final var lex = new Lexeme( PRIME_DOUBLE, lex2.began(), lex3.ended() ); |
|
| 200 | ||
| 201 | emit( QUOTE_PRIME_DOUBLE, lex ); |
|
| 202 | mQ.set( Lexeme.NONE, 2 ); |
|
| 203 | } |
|
| 204 | // <2'> |
|
| 205 | else if( match( NUMBER, QUOTE_SINGLE, ANY, ANY ) ) { |
|
| 206 | emit( QUOTE_PRIME_SINGLE, lex2 ); |
|
| 207 | } |
|
| 208 | // <2"> |
|
| 209 | else if( match( NUMBER, QUOTE_DOUBLE, ANY, ANY ) ) { |
|
| 210 | emit( QUOTE_PRIME_DOUBLE, lex2 ); |
|
| 211 | } |
|
| 212 | // <thinkin'> |
|
| 213 | else if( |
|
| 214 | match( WORD, QUOTE_SINGLE, ANY, ANY ) && |
|
| 215 | mContractions.endedUnambiguously( lex1.toString( mText ) ) |
|
| 216 | ) { |
|
| 217 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 218 | } |
|
| 219 | // <'02> |
|
| 220 | else if( match( ANY, QUOTE_SINGLE, NUMBER, SPACE_PUNCT ) ) { |
|
| 221 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 222 | } |
|
| 223 | // <'20s> |
|
| 224 | else if( |
|
| 225 | match( ANY, QUOTE_SINGLE, NUMBER, WORD ) && |
|
| 226 | "s".equalsIgnoreCase( lex4.toString( mText ) ) |
|
| 227 | ) { |
|
| 228 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 229 | } |
|
| 230 | // <.'\n> |
|
| 231 | else if( match( PUNCT_PERIOD_ELLIPSIS_DASH, QUOTE_SINGLE, ENDING, ANY ) ) { |
|
| 232 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 233 | } |
|
| 234 | // <\'> |
|
| 235 | else if( match( ESC_SINGLE, ANY, ANY, ANY ) ) { |
|
| 236 | emit( QUOTE_STRAIGHT_SINGLE, lex1 ); |
|
| 237 | } |
|
| 238 | // <\"> |
|
| 239 | else if( match( ESC_DOUBLE, ANY, ANY, ANY ) ) { |
|
| 240 | emit( QUOTE_STRAIGHT_DOUBLE, lex1 ); |
|
| 241 | ||
| 242 | // <\"'---> |
|
| 243 | if( match( ESC_DOUBLE, QUOTE_SINGLE, SPACE_DASH_ENDING, ANY ) ) { |
|
| 244 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 245 | } |
|
| 246 | } |
|
| 247 | // <---'" > |
|
| 248 | else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, SPACE_ENDING ) ) { |
|
| 249 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 250 | } |
|
| 251 | // <o’-lantern>, <o' fellow>, <O'-the> |
|
| 252 | else if( |
|
| 253 | match( WORD, QUOTE_SINGLE, SPACE_HYPHEN, WORD ) && |
|
| 254 | "o".equalsIgnoreCase( lex1.toString( mText ) ) |
|
| 255 | ) { |
|
| 256 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 257 | } |
|
| 258 | // <"">, <"...>, <"word>, <---"word> |
|
| 259 | else if( |
|
| 260 | match( |
|
| 261 | LEADING_QUOTE_OPENING_DOUBLE, QUOTE_DOUBLE, |
|
| 262 | LAGGING_QUOTE_OPENING_DOUBLE, ANY |
|
| 263 | ) |
|
| 264 | ) { |
|
| 265 | emit( QUOTE_OPENING_DOUBLE, lex2 ); |
|
| 266 | } |
|
| 267 | // <..."'>, <word"'>, <?"'>, <word"?> |
|
| 268 | else if( |
|
| 269 | match( |
|
| 270 | LEADING_QUOTE_CLOSING_DOUBLE, QUOTE_DOUBLE, |
|
| 271 | LAGGING_QUOTE_CLOSING_DOUBLE, ANY |
|
| 272 | ) |
|
| 273 | ) { |
|
| 274 | emit( QUOTE_CLOSING_DOUBLE, lex2 ); |
|
| 275 | } |
|
| 276 | // < ''E> |
|
| 277 | else if( match( SPACE_SOT, QUOTE_SINGLE, QUOTE_SINGLE, WORD ) ) { |
|
| 278 | // Consume both immediately to avoid the false ambiguity <'e>. |
|
| 279 | emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
| 280 | emit( QUOTE_APOSTROPHE, lex3 ); |
|
| 281 | mQ.set( Lexeme.NONE, 1 ); |
|
| 282 | mQ.set( Lexeme.NONE, 2 ); |
|
| 283 | } |
|
| 284 | // <'...>, <'word>, <---'word>, < 'nation> |
|
| 285 | else if( |
|
| 286 | match( |
|
| 287 | LEADING_QUOTE_OPENING_SINGLE, QUOTE_SINGLE, |
|
| 288 | LAGGING_QUOTE_OPENING_SINGLE, ANY ) |
|
| 289 | ) { |
|
| 290 | final var word = lex3.toString( mText ); |
|
| 291 | ||
| 292 | if( mContractions.beganAmbiguously( word ) ) { |
|
| 293 | emit( QUOTE_AMBIGUOUS_LEADING, lex2 ); |
|
| 294 | } |
|
| 295 | else if( mContractions.beganUnambiguously( word ) ) { |
|
| 296 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 297 | } |
|
| 298 | // <"'"nested> |
|
| 299 | else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, WORD ) ) { |
|
| 300 | emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
| 301 | } |
|
| 302 | // <"'" > |
|
| 303 | else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) { |
|
| 304 | emit( QUOTE_AMBIGUOUS_SINGLE, lex2 ); |
|
| 305 | } |
|
| 306 | // < '" > |
|
| 307 | else if( match( ANY, QUOTE_SINGLE, LAGGING_QUOTE_OPENING_SINGLE, ANY ) ) { |
|
| 308 | emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
| 309 | } |
|
| 310 | // Ambiguous |
|
| 311 | else { |
|
| 312 | emit( QUOTE_AMBIGUOUS_LEADING, lex2 ); |
|
| 313 | } |
|
| 314 | } |
|
| 315 | // <"'--- > |
|
| 316 | else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, DASH, ANY ) ) { |
|
| 317 | emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
| 318 | } |
|
| 319 | // <word'">, <...'--->, <"' > |
|
| 320 | else if( |
|
| 321 | match( |
|
| 322 | LEADING_QUOTE_CLOSING_SINGLE, QUOTE_SINGLE, |
|
| 323 | LAGGING_QUOTE_CLOSING_SINGLE, ANY |
|
| 324 | ) |
|
| 325 | ) { |
|
| 326 | final var word = lex1.toString( mText ); |
|
| 327 | ||
| 328 | if( mContractions.endedAmbiguously( word ) ) { |
|
| 329 | emit( QUOTE_AMBIGUOUS_LAGGING, lex2 ); |
|
| 330 | } |
|
| 331 | else { |
|
| 332 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 333 | } |
|
| 334 | } |
|
| 335 | // <word';> (contraction inferred by previous matches) |
|
| 336 | else if( match( WORD, QUOTE_SINGLE, PUNCT_PERIOD, ANY ) ) { |
|
| 337 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 338 | } |
|
| 339 | // <---'"> |
|
| 340 | else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) { |
|
| 341 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 342 | } |
|
| 343 | // <'42>, <'-3.14> |
|
| 344 | else if( match( ANY, QUOTE_SINGLE, NUMBER, ANY ) ) { |
|
| 345 | emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
| 346 | } |
|
| 347 | // <PRE-PARSED><'---.> |
|
| 348 | else if( match( LexemeType.NONE, QUOTE_SINGLE, ANY, ANY ) ) { |
|
| 349 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 350 | } |
|
| 351 | // <''Cause > |
|
| 352 | else if( match( QUOTE_SINGLE, QUOTE_SINGLE, WORD, ANY ) ) { |
|
| 353 | final var word = lex3.toString( mText ); |
|
| 354 | ||
| 355 | if( mContractions.beganAmbiguously( word ) ) { |
|
| 356 | emit( QUOTE_AMBIGUOUS_LEADING, lex2 ); |
|
| 357 | } |
|
| 358 | else if( mContractions.beganUnambiguously( word ) ) { |
|
| 359 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 360 | } |
|
| 361 | else { |
|
| 362 | emit( QUOTE_AMBIGUOUS_SINGLE, lex2 ); |
|
| 363 | } |
|
| 364 | } |
|
| 365 | // <'"Trouble> |
|
| 366 | else if( match( QUOTE_SINGLE, QUOTE_DOUBLE, WORD, ANY ) ) { |
|
| 367 | emit( QUOTE_OPENING_DOUBLE, lex2 ); |
|
| 368 | } |
|
| 369 | // International quotation marks. |
|
| 370 | else if( match( ANY, QUOTE_DOUBLE_OPENING, ANY, ANY ) ) { |
|
| 371 | emit( QUOTE_OPENING_DOUBLE, lex2 ); |
|
| 372 | } |
|
| 373 | else if( match( ANY, QUOTE_SINGLE_OPENING, ANY, ANY ) ) { |
|
| 374 | emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
| 375 | } |
|
| 376 | else if( match( ANY, QUOTE_DOUBLE_CLOSING, ANY, ANY ) ) { |
|
| 377 | emit( QUOTE_CLOSING_DOUBLE, lex2 ); |
|
| 378 | } |
|
| 379 | else if( match( ANY, QUOTE_SINGLE_CLOSING, ANY, ANY ) ) { |
|
| 380 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 381 | } |
|
| 382 | // Ambiguous (no match) |
|
| 383 | else if( match( ANY, QUOTE_SINGLE, ANY, ANY ) ) { |
|
| 384 | emit( QUOTE_AMBIGUOUS_SINGLE, lex2 ); |
|
| 385 | } |
|
| 386 | else if( match( ANY, QUOTE_DOUBLE, ANY, ANY ) ) { |
|
| 387 | emit( QUOTE_AMBIGUOUS_DOUBLE, lex2 ); |
|
| 388 | } |
|
| 389 | else if( match( ANY, ELLIPSIS, ANY, ANY ) ) { |
|
| 390 | emit( QUOTE_ELLIPSIS, lex2 ); |
|
| 391 | } |
|
| 392 | } |
|
| 393 | ||
| 394 | private void emit( final TokenType tokenType, final Lexeme lexeme ) { |
|
| 395 | mConsumer.accept( new Token( tokenType, lexeme ) ); |
|
| 396 | } |
|
| 397 | ||
| 398 | private boolean match( |
|
| 399 | final LexemeType l1, |
|
| 400 | final LexemeType l2, |
|
| 401 | final LexemeType l3, |
|
| 402 | final LexemeType l4 ) { |
|
| 403 | return mQ.get( 0 ).isType( l1 ) && |
|
| 404 | mQ.get( 1 ).isType( l2 ) && |
|
| 405 | mQ.get( 2 ).isType( l3 ) && |
|
| 406 | mQ.get( 3 ).isType( l4 ); |
|
| 407 | } |
|
| 408 | ||
| 409 | private boolean match( |
|
| 410 | final LexemeType[] l1, |
|
| 411 | final LexemeType l2, |
|
| 412 | final LexemeType l3, |
|
| 413 | final LexemeType l4 ) { |
|
| 414 | return mQ.get( 0 ).isType( l1 ) && |
|
| 415 | mQ.get( 1 ).isType( l2 ) && |
|
| 416 | mQ.get( 2 ).isType( l3 ) && |
|
| 417 | mQ.get( 3 ).isType( l4 ); |
|
| 418 | } |
|
| 419 | ||
| 420 | private boolean match( |
|
| 421 | final LexemeType l1, |
|
| 422 | final LexemeType l2, |
|
| 423 | final LexemeType[] l3, |
|
| 424 | final LexemeType l4 ) { |
|
| 425 | return mQ.get( 0 ).isType( l1 ) && |
|
| 426 | mQ.get( 1 ).isType( l2 ) && |
|
| 427 | mQ.get( 2 ).isType( l3 ) && |
|
| 428 | mQ.get( 3 ).isType( l4 ); |
|
| 429 | } |
|
| 430 | ||
| 431 | private boolean match( |
|
| 432 | final LexemeType l1, |
|
| 433 | final LexemeType l2, |
|
| 434 | final LexemeType l3, |
|
| 435 | final LexemeType[] l4 ) { |
|
| 436 | return mQ.get( 0 ).isType( l1 ) && |
|
| 437 | mQ.get( 1 ).isType( l2 ) && |
|
| 438 | mQ.get( 2 ).isType( l3 ) && |
|
| 439 | mQ.get( 3 ).isType( l4 ); |
|
| 440 | } |
|
| 441 | ||
| 442 | private boolean match( |
|
| 443 | final LexemeType[] l1, |
|
| 444 | final LexemeType l2, |
|
| 445 | final LexemeType[] l3, |
|
| 446 | final LexemeType l4 ) { |
|
| 32 | private static final LexemeType[] DASHES = { |
|
| 33 | DASH_EN, DASH_EM, DASH_LINE, HYPHEN |
|
| 34 | }; |
|
| 35 | ||
| 36 | private static final LexemeType[] PUNCT_PERIOD_ELLIPSIS_DASH = { |
|
| 37 | PUNCT, PERIOD, ELLIPSIS, DASH_EN, DASH_EM, DASH_LINE, HYPHEN |
|
| 38 | }; |
|
| 39 | ||
| 40 | private static final LexemeType[] PUNCT_PERIOD = { |
|
| 41 | PUNCT, PERIOD |
|
| 42 | }; |
|
| 43 | ||
| 44 | private static final LexemeType[] SPACE_DASH_ENDING = { |
|
| 45 | SPACE, DASH_EN, DASH_EM, DASH_LINE, HYPHEN, ENDING |
|
| 46 | }; |
|
| 47 | ||
| 48 | private static final LexemeType[] SPACE_ENDING = { |
|
| 49 | SPACE, ENDING |
|
| 50 | }; |
|
| 51 | ||
| 52 | private static final LexemeType[] SPACE_HYPHEN = { |
|
| 53 | SPACE, HYPHEN |
|
| 54 | }; |
|
| 55 | ||
| 56 | private static final LexemeType[] SPACE_PUNCT = { |
|
| 57 | SPACE, PUNCT |
|
| 58 | }; |
|
| 59 | ||
| 60 | private static final LexemeType[] SPACE_SOT = { |
|
| 61 | SPACE, SOT |
|
| 62 | }; |
|
| 63 | ||
| 64 | /** |
|
| 65 | * Single quotes preceded by these {@link LexemeType}s may be opening quotes. |
|
| 66 | */ |
|
| 67 | private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE = |
|
| 68 | new LexemeType[]{ |
|
| 69 | LexemeType.SOT, SPACE, |
|
| 70 | DASH_EN, DASH_EM, DASH_LINE, HYPHEN, |
|
| 71 | QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP |
|
| 72 | }; |
|
| 73 | ||
| 74 | /** |
|
| 75 | * Single quotes succeeded by these {@link LexemeType}s may be opening quotes. |
|
| 76 | */ |
|
| 77 | private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE = |
|
| 78 | new LexemeType[]{ |
|
| 79 | WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE |
|
| 80 | }; |
|
| 81 | ||
| 82 | /** |
|
| 83 | * Single quotes preceded by these {@link LexemeType}s may be closing quotes. |
|
| 84 | */ |
|
| 85 | private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE = |
|
| 86 | new LexemeType[]{ |
|
| 87 | WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE |
|
| 88 | }; |
|
| 89 | ||
| 90 | /** |
|
| 91 | * Single quotes succeeded by these {@link LexemeType}s may be closing quotes. |
|
| 92 | */ |
|
| 93 | private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE = |
|
| 94 | new LexemeType[]{ |
|
| 95 | SPACE, |
|
| 96 | DASH_EN, DASH_EM, DASH_LINE, HYPHEN, |
|
| 97 | PUNCT, PERIOD, ELLIPSIS, QUOTE_DOUBLE, CLOSING_GROUP, ENDING |
|
| 98 | }; |
|
| 99 | ||
| 100 | /** |
|
| 101 | * Double quotes preceded by these {@link LexemeType}s may be opening quotes. |
|
| 102 | */ |
|
| 103 | private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE = |
|
| 104 | new LexemeType[]{ |
|
| 105 | LexemeType.SOT, SPACE, |
|
| 106 | DASH_EN, DASH_EM, DASH_LINE, HYPHEN, |
|
| 107 | EQUALS, OPENING_GROUP, EOL, EOP |
|
| 108 | }; |
|
| 109 | ||
| 110 | /** |
|
| 111 | * Double quotes succeeded by these {@link LexemeType}s may be opening quotes. |
|
| 112 | */ |
|
| 113 | private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE = |
|
| 114 | new LexemeType[]{ |
|
| 115 | WORD, PUNCT, NUMBER, |
|
| 116 | DASH_EN, DASH_EM, DASH_LINE, HYPHEN, |
|
| 117 | ELLIPSIS, OPENING_GROUP, |
|
| 118 | QUOTE_SINGLE, QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING, |
|
| 119 | QUOTE_DOUBLE |
|
| 120 | }; |
|
| 121 | ||
| 122 | /** |
|
| 123 | * Double quotes preceded by these {@link LexemeType}s may be closing quotes. |
|
| 124 | */ |
|
| 125 | private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE = |
|
| 126 | new LexemeType[]{ |
|
| 127 | WORD, NUMBER, PERIOD, PUNCT, |
|
| 128 | DASH_EN, DASH_EM, DASH_LINE, HYPHEN, |
|
| 129 | ELLIPSIS, CLOSING_GROUP, |
|
| 130 | QUOTE_SINGLE, QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING |
|
| 131 | }; |
|
| 132 | ||
| 133 | /** |
|
| 134 | * Double quotes succeeded by these {@link LexemeType}s may be closing quotes. |
|
| 135 | */ |
|
| 136 | private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE = |
|
| 137 | new LexemeType[]{ |
|
| 138 | SPACE, PUNCT, PERIOD, EQUALS, |
|
| 139 | DASH_EN, DASH_EM, DASH_LINE, HYPHEN, |
|
| 140 | QUOTE_SINGLE, CLOSING_GROUP, ENDING |
|
| 141 | }; |
|
| 142 | ||
| 143 | private final CircularFifoQueue<Lexeme> mQ = new CircularFifoQueue<>( 4 ); |
|
| 144 | private final String mText; |
|
| 145 | private final Contractions mContractions; |
|
| 146 | private final Consumer<Token> mConsumer; |
|
| 147 | ||
| 148 | public QuoteEmitter( |
|
| 149 | final String text, |
|
| 150 | final Contractions contractions, |
|
| 151 | final Consumer<Token> consumer |
|
| 152 | ) { |
|
| 153 | assert text != null; |
|
| 154 | assert contractions != null; |
|
| 155 | ||
| 156 | mText = text; |
|
| 157 | mContractions = contractions; |
|
| 158 | mConsumer = consumer; |
|
| 159 | } |
|
| 160 | ||
| 161 | /** |
|
| 162 | * Scans the given text document for quotation marks and passes them to the |
|
| 163 | * given {@link Token} {@link Consumer}. |
|
| 164 | * |
|
| 165 | * @param text The prose to lex. |
|
| 166 | * @param contractions List of ambiguous and unambiguous contractions. |
|
| 167 | * @param consumer Receives each quotation {@link Token} as it is emitted. |
|
| 168 | */ |
|
| 169 | public static void analyze( |
|
| 170 | final String text, |
|
| 171 | final Contractions contractions, |
|
| 172 | final Consumer<Token> consumer, |
|
| 173 | final LexerFilter filter |
|
| 174 | ) { |
|
| 175 | final var emitter = new QuoteEmitter( text, contractions, consumer ); |
|
| 176 | Lexer.lex( text, emitter, filter ); |
|
| 177 | } |
|
| 178 | ||
| 179 | /** |
|
| 180 | * @param lexeme the input argument |
|
| 181 | */ |
|
| 182 | @Override |
|
| 183 | public void accept( final Lexeme lexeme ) { |
|
| 184 | mQ.add( lexeme ); |
|
| 185 | ||
| 186 | if( mQ.size() == 4 ) { |
|
| 187 | parse(); |
|
| 188 | } |
|
| 189 | } |
|
| 190 | ||
| 191 | private void parse() { |
|
| 192 | final var lex1 = mQ.get( 0 ); |
|
| 193 | final var lex2 = mQ.get( 1 ); |
|
| 194 | final var lex3 = mQ.get( 2 ); |
|
| 195 | final var lex4 = mQ.get( 3 ); |
|
| 196 | ||
| 197 | // <y'all>, <Ph.D.'ll>, <20's>, <she's> |
|
| 198 | if( match( WORD_PERIOD_NUMBER, QUOTE_SINGLE, WORD, ANY ) ) { |
|
| 199 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 200 | } |
|
| 201 | // <'n'>, <'N'>, <'owlin'> |
|
| 202 | else if( |
|
| 203 | match( ANY, QUOTE_SINGLE, WORD, QUOTE_SINGLE ) && |
|
| 204 | mContractions.beganEndedUnambiguously( lex3.toString( mText ) ) |
|
| 205 | ) { |
|
| 206 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 207 | emit( QUOTE_APOSTROPHE, lex4 ); |
|
| 208 | mQ.set( Lexeme.NONE, 3 ); |
|
| 209 | } |
|
| 210 | // <2''> |
|
| 211 | else if( match( NUMBER, QUOTE_SINGLE, QUOTE_SINGLE, ANY ) ) { |
|
| 212 | // Force double primes to conform to the same constructor usage. This |
|
| 213 | // simplifies the tokens and reduces some memory usage. |
|
| 214 | final var lex = new Lexeme( PRIME_DOUBLE, lex2.began(), lex3.ended() ); |
|
| 215 | ||
| 216 | emit( QUOTE_PRIME_DOUBLE, lex ); |
|
| 217 | mQ.set( Lexeme.NONE, 2 ); |
|
| 218 | } |
|
| 219 | // <2'> |
|
| 220 | else if( match( NUMBER, QUOTE_SINGLE, ANY, ANY ) ) { |
|
| 221 | emit( QUOTE_PRIME_SINGLE, lex2 ); |
|
| 222 | } |
|
| 223 | // <2"> |
|
| 224 | else if( match( NUMBER, QUOTE_DOUBLE, ANY, ANY ) ) { |
|
| 225 | emit( QUOTE_PRIME_DOUBLE, lex2 ); |
|
| 226 | } |
|
| 227 | // <thinkin'> |
|
| 228 | else if( |
|
| 229 | match( WORD, QUOTE_SINGLE, ANY, ANY ) && |
|
| 230 | mContractions.endedUnambiguously( lex1.toString( mText ) ) |
|
| 231 | ) { |
|
| 232 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 233 | } |
|
| 234 | // <'02> |
|
| 235 | else if( match( ANY, QUOTE_SINGLE, NUMBER, SPACE_PUNCT ) ) { |
|
| 236 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 237 | } |
|
| 238 | // <'20s> |
|
| 239 | else if( |
|
| 240 | match( ANY, QUOTE_SINGLE, NUMBER, WORD ) && |
|
| 241 | "s".equalsIgnoreCase( lex4.toString( mText ) ) |
|
| 242 | ) { |
|
| 243 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 244 | } |
|
| 245 | // <.'\n> |
|
| 246 | else if( match( PUNCT_PERIOD_ELLIPSIS_DASH, QUOTE_SINGLE, ENDING, ANY ) ) { |
|
| 247 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 248 | } |
|
| 249 | // <\'> |
|
| 250 | else if( match( ESC_SINGLE, ANY, ANY, ANY ) ) { |
|
| 251 | emit( QUOTE_STRAIGHT_SINGLE, lex1 ); |
|
| 252 | } |
|
| 253 | // <\"> |
|
| 254 | else if( match( ESC_DOUBLE, ANY, ANY, ANY ) ) { |
|
| 255 | emit( QUOTE_STRAIGHT_DOUBLE, lex1 ); |
|
| 256 | ||
| 257 | // <\"'---> |
|
| 258 | if( match( ESC_DOUBLE, QUOTE_SINGLE, SPACE_DASH_ENDING, ANY ) ) { |
|
| 259 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 260 | } |
|
| 261 | } |
|
| 262 | // <---'" > |
|
| 263 | else if( match( DASHES, QUOTE_SINGLE, QUOTE_DOUBLE, SPACE_ENDING ) ) { |
|
| 264 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 265 | } |
|
| 266 | // <o’-lantern>, <o' fellow>, <O'-the> |
|
| 267 | else if( |
|
| 268 | match( WORD, QUOTE_SINGLE, SPACE_HYPHEN, WORD ) && |
|
| 269 | "o".equalsIgnoreCase( lex1.toString( mText ) ) |
|
| 270 | ) { |
|
| 271 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 272 | } |
|
| 273 | // <"">, <"...>, <"word>, <---"word> |
|
| 274 | else if( |
|
| 275 | match( |
|
| 276 | LEADING_QUOTE_OPENING_DOUBLE, QUOTE_DOUBLE, |
|
| 277 | LAGGING_QUOTE_OPENING_DOUBLE, ANY |
|
| 278 | ) |
|
| 279 | ) { |
|
| 280 | emit( QUOTE_OPENING_DOUBLE, lex2 ); |
|
| 281 | } |
|
| 282 | // <..."'>, <word"'>, <?"'>, <word"?> |
|
| 283 | else if( |
|
| 284 | match( |
|
| 285 | LEADING_QUOTE_CLOSING_DOUBLE, QUOTE_DOUBLE, |
|
| 286 | LAGGING_QUOTE_CLOSING_DOUBLE, ANY |
|
| 287 | ) |
|
| 288 | ) { |
|
| 289 | emit( QUOTE_CLOSING_DOUBLE, lex2 ); |
|
| 290 | } |
|
| 291 | // < ''E> |
|
| 292 | else if( match( SPACE_SOT, QUOTE_SINGLE, QUOTE_SINGLE, WORD ) ) { |
|
| 293 | // Consume both immediately to avoid the false ambiguity <'e>. |
|
| 294 | emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
| 295 | emit( QUOTE_APOSTROPHE, lex3 ); |
|
| 296 | mQ.set( Lexeme.NONE, 1 ); |
|
| 297 | mQ.set( Lexeme.NONE, 2 ); |
|
| 298 | } |
|
| 299 | // <'...>, <'word>, <---'word>, < 'nation> |
|
| 300 | else if( |
|
| 301 | match( |
|
| 302 | LEADING_QUOTE_OPENING_SINGLE, QUOTE_SINGLE, |
|
| 303 | LAGGING_QUOTE_OPENING_SINGLE, ANY ) |
|
| 304 | ) { |
|
| 305 | final var word = lex3.toString( mText ); |
|
| 306 | ||
| 307 | if( mContractions.beganAmbiguously( word ) ) { |
|
| 308 | emit( QUOTE_AMBIGUOUS_LEADING, lex2 ); |
|
| 309 | } |
|
| 310 | else if( mContractions.beganUnambiguously( word ) ) { |
|
| 311 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 312 | } |
|
| 313 | // <"'"nested> |
|
| 314 | else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, WORD ) ) { |
|
| 315 | emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
| 316 | } |
|
| 317 | // <"'" > |
|
| 318 | else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) { |
|
| 319 | emit( QUOTE_AMBIGUOUS_SINGLE, lex2 ); |
|
| 320 | } |
|
| 321 | // < '" > |
|
| 322 | else if( match( ANY, QUOTE_SINGLE, LAGGING_QUOTE_OPENING_SINGLE, ANY ) ) { |
|
| 323 | emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
| 324 | } |
|
| 325 | // Ambiguous |
|
| 326 | else { |
|
| 327 | emit( QUOTE_AMBIGUOUS_LEADING, lex2 ); |
|
| 328 | } |
|
| 329 | } |
|
| 330 | // <"'--- > |
|
| 331 | else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, DASHES, ANY ) ) { |
|
| 332 | emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
| 333 | } |
|
| 334 | // <word'">, <...'--->, <"' > |
|
| 335 | else if( |
|
| 336 | match( |
|
| 337 | LEADING_QUOTE_CLOSING_SINGLE, QUOTE_SINGLE, |
|
| 338 | LAGGING_QUOTE_CLOSING_SINGLE, ANY |
|
| 339 | ) |
|
| 340 | ) { |
|
| 341 | final var word = lex1.toString( mText ); |
|
| 342 | ||
| 343 | if( mContractions.endedAmbiguously( word ) ) { |
|
| 344 | emit( QUOTE_AMBIGUOUS_LAGGING, lex2 ); |
|
| 345 | } |
|
| 346 | else { |
|
| 347 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 348 | } |
|
| 349 | } |
|
| 350 | // <word';> (contraction inferred by previous matches) |
|
| 351 | else if( match( WORD, QUOTE_SINGLE, PUNCT_PERIOD, ANY ) ) { |
|
| 352 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 353 | } |
|
| 354 | // <---'"> |
|
| 355 | else if( match( DASHES, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) { |
|
| 356 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 357 | } |
|
| 358 | // <'42>, <'-3.14> |
|
| 359 | else if( match( ANY, QUOTE_SINGLE, NUMBER, ANY ) ) { |
|
| 360 | emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
| 361 | } |
|
| 362 | // <PRE-PARSED><'---.> |
|
| 363 | else if( match( LexemeType.NONE, QUOTE_SINGLE, ANY, ANY ) ) { |
|
| 364 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 365 | } |
|
| 366 | // <''Cause > |
|
| 367 | else if( match( QUOTE_SINGLE, QUOTE_SINGLE, WORD, ANY ) ) { |
|
| 368 | final var word = lex3.toString( mText ); |
|
| 369 | ||
| 370 | if( mContractions.beganAmbiguously( word ) ) { |
|
| 371 | emit( QUOTE_AMBIGUOUS_LEADING, lex2 ); |
|
| 372 | } |
|
| 373 | else if( mContractions.beganUnambiguously( word ) ) { |
|
| 374 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 375 | } |
|
| 376 | else { |
|
| 377 | emit( QUOTE_AMBIGUOUS_SINGLE, lex2 ); |
|
| 378 | } |
|
| 379 | } |
|
| 380 | // <'"Trouble> |
|
| 381 | else if( match( QUOTE_SINGLE, QUOTE_DOUBLE, WORD, ANY ) ) { |
|
| 382 | emit( QUOTE_OPENING_DOUBLE, lex2 ); |
|
| 383 | } |
|
| 384 | // International quotation marks. |
|
| 385 | else if( match( ANY, QUOTE_DOUBLE_OPENING, ANY, ANY ) ) { |
|
| 386 | emit( QUOTE_OPENING_DOUBLE, lex2 ); |
|
| 387 | } |
|
| 388 | else if( match( ANY, QUOTE_SINGLE_OPENING, ANY, ANY ) ) { |
|
| 389 | emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
| 390 | } |
|
| 391 | else if( match( ANY, QUOTE_DOUBLE_CLOSING, ANY, ANY ) ) { |
|
| 392 | emit( QUOTE_CLOSING_DOUBLE, lex2 ); |
|
| 393 | } |
|
| 394 | else if( match( ANY, QUOTE_SINGLE_CLOSING, ANY, ANY ) ) { |
|
| 395 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 396 | } |
|
| 397 | // Ambiguous (no match) |
|
| 398 | else if( match( ANY, QUOTE_SINGLE, ANY, ANY ) ) { |
|
| 399 | emit( QUOTE_AMBIGUOUS_SINGLE, lex2 ); |
|
| 400 | } |
|
| 401 | else if( match( ANY, QUOTE_DOUBLE, ANY, ANY ) ) { |
|
| 402 | emit( QUOTE_AMBIGUOUS_DOUBLE, lex2 ); |
|
| 403 | } |
|
| 404 | else if( match( ANY, ELLIPSIS, ANY, ANY ) ) { |
|
| 405 | emit( QUOTE_ELLIPSIS, lex2 ); |
|
| 406 | } |
|
| 407 | else if( match( ANY, DASH_EN, ANY, ANY ) ) { |
|
| 408 | emit( QUOTE_DASH_EN, lex2 ); |
|
| 409 | } |
|
| 410 | else if( match( ANY, DASH_EM, ANY, ANY ) ) { |
|
| 411 | emit( QUOTE_DASH_EM, lex2 ); |
|
| 412 | } |
|
| 413 | } |
|
| 414 | ||
| 415 | private void emit( final TokenType tokenType, final Lexeme lexeme ) { |
|
| 416 | mConsumer.accept( new Token( tokenType, lexeme ) ); |
|
| 417 | } |
|
| 418 | ||
| 419 | private boolean match( |
|
| 420 | final LexemeType l1, |
|
| 421 | final LexemeType l2, |
|
| 422 | final LexemeType l3, |
|
| 423 | final LexemeType l4 ) { |
|
| 424 | return mQ.get( 0 ).isType( l1 ) && |
|
| 425 | mQ.get( 1 ).isType( l2 ) && |
|
| 426 | mQ.get( 2 ).isType( l3 ) && |
|
| 427 | mQ.get( 3 ).isType( l4 ); |
|
| 428 | } |
|
| 429 | ||
| 430 | private boolean match( |
|
| 431 | final LexemeType[] l1, |
|
| 432 | final LexemeType l2, |
|
| 433 | final LexemeType l3, |
|
| 434 | final LexemeType l4 ) { |
|
| 435 | return mQ.get( 0 ).isType( l1 ) && |
|
| 436 | mQ.get( 1 ).isType( l2 ) && |
|
| 437 | mQ.get( 2 ).isType( l3 ) && |
|
| 438 | mQ.get( 3 ).isType( l4 ); |
|
| 439 | } |
|
| 440 | ||
| 441 | private boolean match( |
|
| 442 | final LexemeType l1, |
|
| 443 | final LexemeType l2, |
|
| 444 | final LexemeType[] l3, |
|
| 445 | final LexemeType l4 ) { |
|
| 446 | return mQ.get( 0 ).isType( l1 ) && |
|
| 447 | mQ.get( 1 ).isType( l2 ) && |
|
| 448 | mQ.get( 2 ).isType( l3 ) && |
|
| 449 | mQ.get( 3 ).isType( l4 ); |
|
| 450 | } |
|
| 451 | ||
| 452 | private boolean match( |
|
| 453 | final LexemeType l1, |
|
| 454 | final LexemeType l2, |
|
| 455 | final LexemeType l3, |
|
| 456 | final LexemeType[] l4 ) { |
|
| 457 | return mQ.get( 0 ).isType( l1 ) && |
|
| 458 | mQ.get( 1 ).isType( l2 ) && |
|
| 459 | mQ.get( 2 ).isType( l3 ) && |
|
| 460 | mQ.get( 3 ).isType( l4 ); |
|
| 461 | } |
|
| 462 | ||
| 463 | private boolean match( |
|
| 464 | final LexemeType[] l1, |
|
| 465 | final LexemeType l2, |
|
| 466 | final LexemeType[] l3, |
|
| 467 | final LexemeType l4 ) { |
|
| 468 | return mQ.get( 0 ).isType( l1 ) && |
|
| 469 | mQ.get( 1 ).isType( l2 ) && |
|
| 470 | mQ.get( 2 ).isType( l3 ) && |
|
| 471 | mQ.get( 3 ).isType( l4 ); |
|
| 472 | } |
|
| 473 | ||
| 474 | private boolean match( |
|
| 475 | final LexemeType[] l1, |
|
| 476 | final LexemeType l2, |
|
| 477 | final LexemeType l3, |
|
| 478 | final LexemeType[] l4 ) { |
|
| 447 | 479 | return mQ.get( 0 ).isType( l1 ) && |
| 448 | 480 | mQ.get( 1 ).isType( l2 ) && |
| 37 | 37 | ENTITIES.put( QUOTE_PRIME_QUADRUPLE, "⁗" ); |
| 38 | 38 | ENTITIES.put( QUOTE_ELLIPSIS, "…" ); |
| 39 | ENTITIES.put( QUOTE_DASH_EN, "–" ); |
|
| 40 | ENTITIES.put( QUOTE_DASH_EM, "—" ); |
|
| 41 | ENTITIES.put( QUOTE_DASH_MINUS, "−" ); |
|
| 39 | 42 | } |
| 40 | 43 | |
| ... | ||
| 58 | 61 | CHARS.put( QUOTE_PRIME_QUADRUPLE, "⁗" ); |
| 59 | 62 | CHARS.put( QUOTE_ELLIPSIS, "…" ); |
| 63 | CHARS.put( QUOTE_DASH_MINUS, "−" ); |
|
| 64 | CHARS.put( QUOTE_DASH_EN, "–" ); |
|
| 65 | CHARS.put( QUOTE_DASH_EM, "—" ); |
|
| 60 | 66 | } |
| 61 | 67 | |
| 22 | 22 | QUOTE_AMBIGUOUS_SINGLE( "single-ambiguous" ), |
| 23 | 23 | QUOTE_ELLIPSIS, |
| 24 | QUOTE_DASH_MINUS, |
|
| 25 | QUOTE_DASH_EN, |
|
| 26 | QUOTE_DASH_EM, |
|
| 24 | 27 | NONE; |
| 25 | 28 |
| 18 | 18 | */ |
| 19 | 19 | public final class FastCharacterIterator { |
| 20 | ||
| 21 | 20 | private final String mS; |
| 22 | 21 | private final int mLen; |
| 49 | 49 | @Test |
| 50 | 50 | void test_Lexing_PunctuationMarks_EmitPunctuationMarks() { |
| 51 | testType( "-", HYPHEN ); |
|
| 52 | testType( "--", DASH_EN ); |
|
| 53 | testType( "–", DASH_EN ); |
|
| 54 | testType( "---", DASH_EM ); |
|
| 55 | testType( "--- -- -", DASH_EM, SPACE, DASH_EN, SPACE, HYPHEN ); |
|
| 56 | testType( "—", DASH_EM ); |
|
| 57 | testType( "----", DASH_LINE ); |
|
| 58 | testType( "-----", DASH_LINE ); |
|
| 59 | testType( "------", DASH_LINE ); |
|
| 60 | testType( "—-—", DASH_EM, HYPHEN, DASH_EM ); |
|
| 51 | 61 | testType( "!", PUNCT ); |
| 52 | 62 | testType( ";", PUNCT ); |
| 53 | 63 | testType( "\\", PUNCT ); |
| 54 | 64 | testType( ".", PERIOD ); |
| 55 | testType( "-", HYPHEN ); |
|
| 56 | testType( "--", DASH ); |
|
| 57 | testType( "---", DASH ); |
|
| 58 | testType( "–", DASH ); |
|
| 59 | testType( "―", DASH ); |
|
| 60 | testType( "—", DASH ); |
|
| 61 | testType( "—-—", DASH ); |
|
| 62 | 65 | testType( "...", ELLIPSIS ); |
| 63 | 66 | testType( ". .", ELLIPSIS ); |
| ... | ||
| 144 | 147 | var counter = new AtomicInteger(); |
| 145 | 148 | |
| 146 | Lexer.lex( text, lexeme -> { |
|
| 147 | // Ignore the SOT and EOT lexemes (avoids duplication). Each test will |
|
| 148 | // include EOL, EOP, and EOT tokens due to the lexer's algorithm. |
|
| 149 | if( !lexeme.isType( SOT, EOT ) && counter.get() < elements.size() ) { |
|
| 150 | final var expected = elements.get( counter.getAndIncrement() ); |
|
| 151 | final var actual = f.apply( lexeme, text ); |
|
| 149 | Lexer.lex( |
|
| 150 | text, lexeme -> { |
|
| 151 | // Ignore the SOT and EOT lexemes (avoids duplication). Each test will |
|
| 152 | // include EOL, EOP, and EOT tokens due to the lexer's algorithm. |
|
| 153 | if( !lexeme.isType( SOT, EOT ) && counter.get() < elements.size() ) { |
|
| 154 | final var expected = elements.get( counter.getAndIncrement() ); |
|
| 155 | final var actual = f.apply( lexeme, text ); |
|
| 152 | 156 | |
| 153 | assertEquals( expected, actual ); |
|
| 154 | } |
|
| 155 | }, filter ); |
|
| 157 | assertEquals( expected, actual ); |
|
| 158 | } |
|
| 159 | }, filter |
|
| 160 | ); |
|
| 156 | 161 | |
| 157 | 162 | // Ensure all expected values are matched (verify end of text reached). |
| 63 | 63 | |
| 64 | 64 | With--"air quotes"--and dashes. |
| 65 | With--“air quotes”--and dashes. |
|
| 65 | With–“air quotes”–and dashes. |
|
| 66 | 66 | |
| 67 | 67 | "Not all open quotes are closed... |
| 68 | 68 | “Not all open quotes are closed… |
| 69 | 69 | |
| 70 | 70 | "---retroactively!" |
| 71 | “---retroactively!” |
|
| 71 | “—retroactively!” |
|
| 72 | 72 | |
| 73 | 73 | "And this" and "and this" and "and this" and "and another." |
| 74 | 74 | “And this” and “and this” and “and this” and “and another.” |
| 75 | 75 | |
| 76 | 76 | "Will'll invite?" Mrs. Thorne asked;\n"because he's coming--he's at the gate." |
| 77 | “Will'll invite?” Mrs. Thorne asked;\n“because he's coming--he's at the gate.” |
|
| 77 | “Will'll invite?” Mrs. Thorne asked;\n“because he's coming–he's at the gate.” |
|
| 78 | 78 | |
| 79 | 79 | Model "T2000" |
| ... | ||
| 94 | 94 | '...even better!' |
| 95 | 95 | ‘…even better!’ |
| 96 | ||
| 97 | With-'imaginary'-dashes. |
|
| 98 | With-‘imaginary’-dashes. |
|
| 96 | 99 | |
| 97 | 100 | With--'imaginary'--dashes. |
| 98 | With--‘imaginary’--dashes. |
|
| 101 | With–‘imaginary’–dashes. |
|
| 102 | ||
| 103 | With---'imaginary'---dashes. |
|
| 104 | With—‘imaginary’—dashes. |
|
| 99 | 105 | |
| 100 | 106 | 'It's a beautiful day!' |
| ... | ||
| 171 | 177 | |
| 172 | 178 | "'I'm in danger. Help? Take me to'--an address on Van Ness Avenue. |
| 173 | “‘I'm in danger. Help? Take me to’--an address on Van Ness Avenue. |
|
| 179 | “‘I'm in danger. Help? Take me to’–an address on Van Ness Avenue. |
|
| 174 | 180 | |
| 175 | 181 | "Ah," she said, "you knew! He told you--and you said 'my dear'! How could you?!" |
| 176 | “Ah,” she said, “you knew! He told you--and you said ‘my dear’! How could you?!” |
|
| 182 | “Ah,” she said, “you knew! He told you–and you said ‘my dear’! How could you?!” |
|
| 177 | 183 | |
| 178 | 184 | shouldn't drop letters, nor say "ain't" or "hain't." |
| 179 | 185 | shouldn't drop letters, nor say “ain't” or “hain't.” |
| 180 | 186 | |
| 181 | 187 | "’Kearney lives on the banks of Killarney—’ |
| 182 | “’Kearney lives on the banks of Killarney—’ |
|
| 188 | “’Kearney lives on the banks of Killarney—’ |
|
| 183 | 189 | |
| 184 | 190 | # ######################################################################## |
| ... | ||
| 216 | 222 | |
| 217 | 223 | "She said, 'Llamas'll languish, they'll-- |
| 218 | “She said, ‘Llamas'll languish, they'll-- |
|
| 224 | “She said, ‘Llamas'll languish, they'll– |
|
| 219 | 225 | |
| 220 | 226 | 'The 70s are my favorite numbers,' she said. |
| ... | ||
| 240 | 246 | |
| 241 | 247 | Computer says, "'It is mysteries---'" |
| 242 | Computer says, “‘It is mysteries---’” |
|
| 248 | Computer says, “‘It is mysteries—’” |
|
| 243 | 249 | |
| 105 | 105 | |
| 106 | 106 | "Jane said, ''E'll be spooky, Sam's son with the jack-o'-lantern!'" said the O'Mally twins'---y'know---ghosts in unison. |
| 107 | “Jane said, ‘'E'll be spooky, Sam's son with the jack-o'-lantern!’” said the O'Mally twins'---y'know---ghosts in unison. |
|
| 107 | “Jane said, ‘'E'll be spooky, Sam's son with the jack-o'-lantern!’” said the O'Mally twins'—y'know—ghosts in unison. |
|
| 108 | 108 | |
| 109 | 109 | ''Sup, Doc?' |
| ... | ||
| 123 | 123 | |
| 124 | 124 | "Not much o' fellow-creaturs, I think, Miss; all I know—my old master, as war a knowin' man, used to say, says he, 'If e'er I sow my wheat wi'out brinin', I'm a Dutchman,' says he; an' that war as much as to say as a Dutchman war a fool, or next door. Nay, nay, I aren't goin' to bother mysen about Dutchmen. There's fools enoo, an' rogues enoo, wi'out lookin' i' books for 'em." |
| 125 | “Not much o' fellow-creaturs, I think, Miss; all I know—my old master, as war a knowin' man, used to say, says he, ‘If e'er I sow my wheat wi'out brinin', I'm a Dutchman,’ says he; an' that war as much as to say as a Dutchman war a fool, or next door. Nay, nay, I aren't goin' to bother mysen about Dutchmen. There's fools enoo, an' rogues enoo, wi'out lookin' i' books for 'em.” |
|
| 125 | “Not much o' fellow-creaturs, I think, Miss; all I know—my old master, as war a knowin' man, used to say, says he, ‘If e'er I sow my wheat wi'out brinin', I'm a Dutchman,’ says he; an' that war as much as to say as a Dutchman war a fool, or next door. Nay, nay, I aren't goin' to bother mysen about Dutchmen. There's fools enoo, an' rogues enoo, wi'out lookin' i' books for 'em.” |
|
| 126 | 126 | |
| 127 | 127 | I leave proofs to those who 've seen 'em;\n\nBut this I heard her say, and can't be wrong\n\nAnd all may think which way their judgments lean 'em,\n\n''T is strange---the Hebrew noun which means "I am,"\n\nThe English always use to govern d--n.' |
| 128 | I leave proofs to those who 've seen 'em;\n\nBut this I heard her say, and can't be wrong\n\nAnd all may think which way their judgments lean 'em,\n\n‘'T is strange---the Hebrew noun which means “I am,”\n\nThe English always use to govern d--n.’ |
|
| 128 | I leave proofs to those who 've seen 'em;\n\nBut this I heard her say, and can't be wrong\n\nAnd all may think which way their judgments lean 'em,\n\n‘'T is strange—the Hebrew noun which means “I am,”\n\nThe English always use to govern d–n.’ |
|
| 129 | 129 | |
| 130 | 130 | ''E's got a 'ittle box 'n a big 'un,' she said, 'wit' th' 'ittle 'un 'bout 2'×6". An' no, y'ain't cryin' on th' "soap box" to me no mo, y'hear. 'Cause it 'tweren't ever a spec o' fun!' I says to my frien'. |
| ... | ||
| 144 | 144 | |
| 145 | 145 | "You 'cause---" all fifteen years' worth. |
| 146 | “You 'cause---” all fifteen years' worth. |
|
| 146 | “You 'cause—” all fifteen years' worth. |
|
| 147 | 147 | |
| 148 | 148 | "'---has a prison.'" |
| 149 | “‘---has a prison.’” |
|
| 149 | “‘—has a prison.’” |
|
| 150 | 150 | |
| 151 | 151 | "Teach 'em t'be more 'an we ever been!" |
| 11 | 11 | <bold>'twas</bold> redeemed for the <em>cat</em>'s eye |
| 12 | 12 | |
| 13 | <a href="https://x.org" title="X's Homepage">X11's bomb</a> |
|
| 14 | <a href="https://x.org" title="X's Homepage">X11's bomb</a> |
|
| 13 | <a href="https://dj.ca" title="X's Homepage">X11's bomb</a> |
|
| 14 | <a href="https://dj.ca" title="X's Homepage">X11's bomb</a> |
|
| 15 | ||
| 16 | <a href="https://d--j.ca">D--J</a> |
|
| 17 | <a href="https://d--j.ca">D–J</a> |
|
| 15 | 18 | |
| 16 | 19 | <body><p>Test lungs' capacity; shovel's handle hit his head's electronics.</p><p>Hoped to bring him 'round.</p></body> |