| 22 |
22 |
* respect to curling. Each {@link Lexeme} is marked as such. |
| 23 |
23 |
*/ |
| 24 |
|
@SuppressWarnings( "SameParameterValue" ) |
| 25 |
|
public final class QuoteEmitter implements Consumer<Lexeme> { |
| 26 |
|
private static final LexemeType[] WORD_PERIOD_NUMBER = { |
| 27 |
|
WORD, PERIOD, NUMBER |
| 28 |
|
}; |
| 29 |
|
|
| 30 |
|
private static final LexemeType[] PUNCT_PERIOD_ELLIPSIS_DASH = { |
| 31 |
|
PUNCT, PERIOD, ELLIPSIS, DASH |
| 32 |
|
}; |
| 33 |
|
|
| 34 |
|
private static final LexemeType[] PUNCT_PERIOD = { |
| 35 |
|
PUNCT, PERIOD |
| 36 |
|
}; |
| 37 |
|
|
| 38 |
|
private static final LexemeType[] SPACE_DASH_ENDING = { |
| 39 |
|
SPACE, DASH, ENDING |
| 40 |
|
}; |
| 41 |
|
|
| 42 |
|
private static final LexemeType[] SPACE_ENDING = { |
| 43 |
|
SPACE, ENDING |
| 44 |
|
}; |
| 45 |
|
|
| 46 |
|
private static final LexemeType[] SPACE_HYPHEN = { |
| 47 |
|
SPACE, HYPHEN |
| 48 |
|
}; |
| 49 |
|
|
| 50 |
|
private static final LexemeType[] SPACE_PUNCT = { |
| 51 |
|
SPACE, PUNCT |
| 52 |
|
}; |
| 53 |
|
|
| 54 |
|
private static final LexemeType[] SPACE_SOT = { |
| 55 |
|
SPACE, SOT |
| 56 |
|
}; |
| 57 |
|
|
| 58 |
|
/** |
| 59 |
|
* Single quotes preceded by these {@link LexemeType}s may be opening quotes. |
| 60 |
|
*/ |
| 61 |
|
private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE = |
| 62 |
|
new LexemeType[]{ |
| 63 |
|
LexemeType.SOT, SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP |
| 64 |
|
}; |
| 65 |
|
|
| 66 |
|
/** |
| 67 |
|
* Single quotes succeeded by these {@link LexemeType}s may be opening quotes. |
| 68 |
|
*/ |
| 69 |
|
private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE = |
| 70 |
|
new LexemeType[]{ |
| 71 |
|
WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE |
| 72 |
|
}; |
| 73 |
|
|
| 74 |
|
/** |
| 75 |
|
* Single quotes preceded by these {@link LexemeType}s may be closing quotes. |
| 76 |
|
*/ |
| 77 |
|
private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE = |
| 78 |
|
new LexemeType[]{ |
| 79 |
|
WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE |
| 80 |
|
}; |
| 81 |
|
|
| 82 |
|
/** |
| 83 |
|
* Single quotes succeeded by these {@link LexemeType}s may be closing quotes. |
| 84 |
|
*/ |
| 85 |
|
private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE = |
| 86 |
|
new LexemeType[]{ |
| 87 |
|
SPACE, HYPHEN, DASH, PUNCT, PERIOD, ELLIPSIS, QUOTE_DOUBLE, CLOSING_GROUP, |
| 88 |
|
ENDING |
| 89 |
|
}; |
| 90 |
|
|
| 91 |
|
/** |
| 92 |
|
* Double quotes preceded by these {@link LexemeType}s may be opening quotes. |
| 93 |
|
*/ |
| 94 |
|
private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE = |
| 95 |
|
new LexemeType[]{ |
| 96 |
|
LexemeType.SOT, SPACE, DASH, EQUALS, OPENING_GROUP, EOL, EOP |
| 97 |
|
}; |
| 98 |
|
|
| 99 |
|
/** |
| 100 |
|
* Double quotes succeeded by these {@link LexemeType}s may be opening quotes. |
| 101 |
|
*/ |
| 102 |
|
private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE = |
| 103 |
|
new LexemeType[]{ |
| 104 |
|
WORD, PUNCT, NUMBER, DASH, ELLIPSIS, OPENING_GROUP, QUOTE_SINGLE, |
| 105 |
|
QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING, QUOTE_DOUBLE |
| 106 |
|
}; |
| 107 |
|
|
| 108 |
|
/** |
| 109 |
|
* Double quotes preceded by these {@link LexemeType}s may be closing quotes. |
| 110 |
|
*/ |
| 111 |
|
private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE = |
| 112 |
|
new LexemeType[]{ |
| 113 |
|
WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS, CLOSING_GROUP, QUOTE_SINGLE, |
| 114 |
|
QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING |
| 115 |
|
}; |
| 116 |
|
|
| 117 |
|
/** |
| 118 |
|
* Double quotes succeeded by these {@link LexemeType}s may be closing quotes. |
| 119 |
|
*/ |
| 120 |
|
private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE = |
| 121 |
|
new LexemeType[]{ |
| 122 |
|
SPACE, PUNCT, PERIOD, EQUALS, HYPHEN, DASH, QUOTE_SINGLE, CLOSING_GROUP, |
| 123 |
|
ENDING |
| 124 |
|
}; |
| 125 |
|
|
| 126 |
|
private final CircularFifoQueue<Lexeme> mQ = new CircularFifoQueue<>( 4 ); |
| 127 |
|
private final String mText; |
| 128 |
|
private final Contractions mContractions; |
| 129 |
|
private final Consumer<Token> mConsumer; |
| 130 |
|
|
| 131 |
|
public QuoteEmitter( |
| 132 |
|
final String text, |
| 133 |
|
final Contractions contractions, |
| 134 |
|
final Consumer<Token> consumer |
| 135 |
|
) { |
| 136 |
|
assert text != null; |
| 137 |
|
assert contractions != null; |
| 138 |
|
|
| 139 |
|
mText = text; |
| 140 |
|
mContractions = contractions; |
| 141 |
|
mConsumer = consumer; |
| 142 |
|
} |
| 143 |
|
|
| 144 |
|
/** |
| 145 |
|
* Scans the given text document for quotation marks and passes them to the |
| 146 |
|
* given {@link Token} {@link Consumer}. |
| 147 |
|
* |
| 148 |
|
* @param text The prose to lex. |
| 149 |
|
* @param contractions List of ambiguous and unambiguous contractions. |
| 150 |
|
* @param consumer Receives |
| 151 |
|
*/ |
| 152 |
|
public static void analyze( |
| 153 |
|
final String text, |
| 154 |
|
final Contractions contractions, |
| 155 |
|
final Consumer<Token> consumer, |
| 156 |
|
final LexerFilter filter |
| 157 |
|
) { |
| 158 |
|
final var emitter = new QuoteEmitter( text, contractions, consumer ); |
| 159 |
|
Lexer.lex( text, emitter, filter ); |
| 160 |
|
} |
| 161 |
|
|
| 162 |
|
/** |
| 163 |
|
* @param lexeme the input argument |
| 164 |
|
*/ |
| 165 |
|
@Override |
| 166 |
|
public void accept( final Lexeme lexeme ) { |
| 167 |
|
mQ.add( lexeme ); |
| 168 |
|
|
| 169 |
|
if( mQ.size() == 4 ) { |
| 170 |
|
parse(); |
| 171 |
|
} |
| 172 |
|
} |
| 173 |
|
|
| 174 |
|
private void parse() { |
| 175 |
|
final var lex1 = mQ.get( 0 ); |
| 176 |
|
final var lex2 = mQ.get( 1 ); |
| 177 |
|
final var lex3 = mQ.get( 2 ); |
| 178 |
|
final var lex4 = mQ.get( 3 ); |
| 179 |
|
|
| 180 |
|
// <y'all>, <Ph.D.'ll>, <20's>, <she's> |
| 181 |
|
if( match( WORD_PERIOD_NUMBER, QUOTE_SINGLE, WORD, ANY ) ) { |
| 182 |
|
emit( QUOTE_APOSTROPHE, lex2 ); |
| 183 |
|
} |
| 184 |
|
// <'n'>, <'N'>, <'owlin'> |
| 185 |
|
else if( |
| 186 |
|
match( ANY, QUOTE_SINGLE, WORD, QUOTE_SINGLE ) && |
| 187 |
|
mContractions.beganEndedUnambiguously( lex3.toString( mText ) ) |
| 188 |
|
) { |
| 189 |
|
emit( QUOTE_APOSTROPHE, lex2 ); |
| 190 |
|
emit( QUOTE_APOSTROPHE, lex4 ); |
| 191 |
|
mQ.set( Lexeme.NONE, 3 ); |
| 192 |
|
} |
| 193 |
|
// <2''> |
| 194 |
|
else if( match( NUMBER, QUOTE_SINGLE, QUOTE_SINGLE, ANY ) ) { |
| 195 |
|
// Force double primes to conform to the same constructor usage. This |
| 196 |
|
// simplifies the tokens and reduces some memory usage. |
| 197 |
|
final var lex = new Lexeme( PRIME_DOUBLE, lex2.began(), lex3.ended() ); |
| 198 |
|
|
| 199 |
|
emit( QUOTE_PRIME_DOUBLE, lex ); |
| 200 |
|
mQ.set( Lexeme.NONE, 2 ); |
| 201 |
|
} |
| 202 |
|
// <2'> |
| 203 |
|
else if( match( NUMBER, QUOTE_SINGLE, ANY, ANY ) ) { |
| 204 |
|
emit( QUOTE_PRIME_SINGLE, lex2 ); |
| 205 |
|
} |
| 206 |
|
// <2"> |
| 207 |
|
else if( match( NUMBER, QUOTE_DOUBLE, ANY, ANY ) ) { |
| 208 |
|
emit( QUOTE_PRIME_DOUBLE, lex2 ); |
| 209 |
|
} |
| 210 |
|
// <thinkin'> |
| 211 |
|
else if( |
| 212 |
|
match( WORD, QUOTE_SINGLE, ANY, ANY ) && |
| 213 |
|
mContractions.endedUnambiguously( lex1.toString( mText ) ) |
| 214 |
|
) { |
| 215 |
|
emit( QUOTE_APOSTROPHE, lex2 ); |
| 216 |
|
} |
| 217 |
|
// <'02> |
| 218 |
|
else if( match( ANY, QUOTE_SINGLE, NUMBER, SPACE_PUNCT ) ) { |
| 219 |
|
emit( QUOTE_APOSTROPHE, lex2 ); |
| 220 |
|
} |
| 221 |
|
// <'20s> |
| 222 |
|
else if( |
| 223 |
|
match( ANY, QUOTE_SINGLE, NUMBER, WORD ) && |
| 224 |
|
"s".equalsIgnoreCase( lex4.toString( mText ) ) |
| 225 |
|
) { |
| 226 |
|
emit( QUOTE_APOSTROPHE, lex2 ); |
| 227 |
|
} |
| 228 |
|
// <.'\n> |
| 229 |
|
else if( match( PUNCT_PERIOD_ELLIPSIS_DASH, QUOTE_SINGLE, ENDING, ANY ) ) { |
| 230 |
|
emit( QUOTE_CLOSING_SINGLE, lex2 ); |
| 231 |
|
} |
| 232 |
|
// <\'> |
| 233 |
|
else if( match( ESC_SINGLE, ANY, ANY, ANY ) ) { |
| 234 |
|
emit( QUOTE_STRAIGHT_SINGLE, lex1 ); |
| 235 |
|
} |
| 236 |
|
// <\"> |
| 237 |
|
else if( match( ESC_DOUBLE, ANY, ANY, ANY ) ) { |
| 238 |
|
emit( QUOTE_STRAIGHT_DOUBLE, lex1 ); |
| 239 |
|
|
| 240 |
|
// <\"'---> |
| 241 |
|
if( match( ESC_DOUBLE, QUOTE_SINGLE, SPACE_DASH_ENDING, ANY ) ) { |
| 242 |
|
emit( QUOTE_CLOSING_SINGLE, lex2 ); |
| 243 |
|
} |
| 244 |
|
} |
| 245 |
|
// <---'" > |
| 246 |
|
else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, SPACE_ENDING ) ) { |
| 247 |
|
emit( QUOTE_CLOSING_SINGLE, lex2 ); |
| 248 |
|
} |
| 249 |
|
// <o’-lantern>, <o' fellow>, <O'-the> |
| 250 |
|
else if( |
| 251 |
|
match( WORD, QUOTE_SINGLE, SPACE_HYPHEN, WORD ) && |
| 252 |
|
"o".equalsIgnoreCase( lex1.toString( mText ) ) |
| 253 |
|
) { |
| 254 |
|
emit( QUOTE_APOSTROPHE, lex2 ); |
| 255 |
|
} |
| 256 |
|
// <"">, <"...>, <"word>, <---"word> |
| 257 |
|
else if( |
| 258 |
|
match( |
| 259 |
|
LEADING_QUOTE_OPENING_DOUBLE, QUOTE_DOUBLE, |
| 260 |
|
LAGGING_QUOTE_OPENING_DOUBLE, ANY |
| 261 |
|
) |
| 262 |
|
) { |
| 263 |
|
emit( QUOTE_OPENING_DOUBLE, lex2 ); |
| 264 |
|
} |
| 265 |
|
// <..."'>, <word"'>, <?"'>, <word"?> |
| 266 |
|
else if( |
| 267 |
|
match( |
| 268 |
|
LEADING_QUOTE_CLOSING_DOUBLE, QUOTE_DOUBLE, |
| 269 |
|
LAGGING_QUOTE_CLOSING_DOUBLE, ANY |
| 270 |
|
) |
| 271 |
|
) { |
| 272 |
|
emit( QUOTE_CLOSING_DOUBLE, lex2 ); |
| 273 |
|
} |
| 274 |
|
// < ''E> |
| 275 |
|
else if( match( SPACE_SOT, QUOTE_SINGLE, QUOTE_SINGLE, WORD ) ) { |
| 276 |
|
// Consume both immediately to avoid the false ambiguity <'e>. |
| 277 |
|
emit( QUOTE_OPENING_SINGLE, lex2 ); |
| 278 |
|
emit( QUOTE_APOSTROPHE, lex3 ); |
| 279 |
|
mQ.set( Lexeme.NONE, 1 ); |
| 280 |
|
mQ.set( Lexeme.NONE, 2 ); |
| 281 |
|
} |
| 282 |
|
// <'...>, <'word>, <---'word>, < 'nation> |
| 283 |
|
else if( |
| 284 |
|
match( |
| 285 |
|
LEADING_QUOTE_OPENING_SINGLE, QUOTE_SINGLE, |
| 286 |
|
LAGGING_QUOTE_OPENING_SINGLE, ANY ) |
| 287 |
|
) { |
| 288 |
|
final var word = lex3.toString( mText ); |
| 289 |
|
|
| 290 |
|
if( mContractions.beganAmbiguously( word ) ) { |
| 291 |
|
emit( QUOTE_AMBIGUOUS_LEADING, lex2 ); |
| 292 |
|
} |
| 293 |
|
else if( mContractions.beganUnambiguously( word ) ) { |
| 294 |
|
emit( QUOTE_APOSTROPHE, lex2 ); |
| 295 |
|
} |
| 296 |
|
// <"'"nested> |
| 297 |
|
else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, WORD ) ) { |
| 298 |
|
emit( QUOTE_OPENING_SINGLE, lex2 ); |
| 299 |
|
} |
| 300 |
|
// <"'" > |
| 301 |
|
else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) { |
| 302 |
|
emit( QUOTE_AMBIGUOUS_SINGLE, lex2 ); |
| 303 |
|
} |
| 304 |
|
// < '" > |
| 305 |
|
else if( match( ANY, QUOTE_SINGLE, LAGGING_QUOTE_OPENING_SINGLE, ANY ) ) { |
| 306 |
|
emit( QUOTE_OPENING_SINGLE, lex2 ); |
| 307 |
|
} |
| 308 |
|
// Ambiguous |
| 309 |
|
else { |
| 310 |
|
emit( QUOTE_AMBIGUOUS_LEADING, lex2 ); |
| 311 |
|
} |
| 312 |
|
} |
| 313 |
|
// <word'">, <...'--->, <"' > |
| 314 |
|
else if( |
| 315 |
|
match( |
| 316 |
|
LEADING_QUOTE_CLOSING_SINGLE, QUOTE_SINGLE, |
| 317 |
|
LAGGING_QUOTE_CLOSING_SINGLE, ANY |
| 318 |
|
) |
| 319 |
|
) { |
| 320 |
|
final var word = lex1.toString( mText ); |
| 321 |
|
|
| 322 |
|
if( mContractions.endedAmbiguously( word ) ) { |
| 323 |
|
emit( QUOTE_AMBIGUOUS_LAGGING, lex2 ); |
| 324 |
|
} |
| 325 |
|
else { |
| 326 |
|
emit( QUOTE_CLOSING_SINGLE, lex2 ); |
| 327 |
|
} |
| 328 |
|
} |
| 329 |
|
// <word';> (contraction inferred by previous matches) |
| 330 |
|
else if( match( WORD, QUOTE_SINGLE, PUNCT_PERIOD, ANY ) ) { |
| 331 |
|
emit( QUOTE_APOSTROPHE, lex2 ); |
| 332 |
|
} |
| 333 |
|
// <---'"> |
| 334 |
|
else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) { |
| 335 |
|
emit( QUOTE_CLOSING_SINGLE, lex2 ); |
| 336 |
|
} |
| 337 |
|
// <'42>, <'-3.14> |
| 338 |
|
else if( match( ANY, QUOTE_SINGLE, NUMBER, ANY ) ) { |
| 339 |
|
emit( QUOTE_OPENING_SINGLE, lex2 ); |
| 340 |
|
} |
| 341 |
|
// <PRE-PARSED><'---.> |
| 342 |
|
else if( match( LexemeType.NONE, QUOTE_SINGLE, ANY, ANY ) ) { |
| 343 |
|
emit( QUOTE_CLOSING_SINGLE, lex2 ); |
| 344 |
|
} |
| 345 |
|
// <''Cause > |
| 346 |
|
else if( match( QUOTE_SINGLE, QUOTE_SINGLE, WORD, ANY ) ) { |
| 347 |
|
final var word = lex3.toString( mText ); |
| 348 |
|
|
| 349 |
|
if( mContractions.beganAmbiguously( word ) ) { |
| 350 |
|
emit( QUOTE_AMBIGUOUS_LEADING, lex2 ); |
| 351 |
|
} |
| 352 |
|
else if( mContractions.beganUnambiguously( word ) ) { |
| 353 |
|
emit( QUOTE_APOSTROPHE, lex2 ); |
| 354 |
|
} |
| 355 |
|
else { |
| 356 |
|
emit( QUOTE_AMBIGUOUS_SINGLE, lex2 ); |
| 357 |
|
} |
| 358 |
|
} |
| 359 |
|
// <'"Trouble> |
| 360 |
|
else if( match( QUOTE_SINGLE, QUOTE_DOUBLE, WORD, ANY ) ) { |
| 361 |
|
emit( QUOTE_OPENING_DOUBLE, lex2 ); |
| 362 |
|
} |
| 363 |
|
// International quotation marks. |
| 364 |
|
else if( match( ANY, QUOTE_DOUBLE_OPENING, ANY, ANY ) ) { |
| 365 |
|
emit( QUOTE_OPENING_DOUBLE, lex2 ); |
| 366 |
|
} |
| 367 |
|
else if( match( ANY, QUOTE_SINGLE_OPENING, ANY, ANY ) ) { |
| 368 |
|
emit( QUOTE_OPENING_SINGLE, lex2 ); |
| 369 |
|
} |
| 370 |
|
else if( match( ANY, QUOTE_DOUBLE_CLOSING, ANY, ANY ) ) { |
|
24 |
@SuppressWarnings( |
|
25 |
{"SameParameterValue", "SpellCheckingInspection", "GrazieInspection"} |
|
26 |
) |
|
27 |
public final class QuoteEmitter implements Consumer<Lexeme> { |
|
28 |
private static final LexemeType[] WORD_PERIOD_NUMBER = { |
|
29 |
WORD, PERIOD, NUMBER |
|
30 |
}; |
|
31 |
|
|
32 |
private static final LexemeType[] PUNCT_PERIOD_ELLIPSIS_DASH = { |
|
33 |
PUNCT, PERIOD, ELLIPSIS, DASH |
|
34 |
}; |
|
35 |
|
|
36 |
private static final LexemeType[] PUNCT_PERIOD = { |
|
37 |
PUNCT, PERIOD |
|
38 |
}; |
|
39 |
|
|
40 |
private static final LexemeType[] SPACE_DASH_ENDING = { |
|
41 |
SPACE, DASH, ENDING |
|
42 |
}; |
|
43 |
|
|
44 |
private static final LexemeType[] SPACE_ENDING = { |
|
45 |
SPACE, ENDING |
|
46 |
}; |
|
47 |
|
|
48 |
private static final LexemeType[] SPACE_HYPHEN = { |
|
49 |
SPACE, HYPHEN |
|
50 |
}; |
|
51 |
|
|
52 |
private static final LexemeType[] SPACE_PUNCT = { |
|
53 |
SPACE, PUNCT |
|
54 |
}; |
|
55 |
|
|
56 |
private static final LexemeType[] SPACE_SOT = { |
|
57 |
SPACE, SOT |
|
58 |
}; |
|
59 |
|
|
60 |
/** |
|
61 |
* Single quotes preceded by these {@link LexemeType}s may be opening quotes. |
|
62 |
*/ |
|
63 |
private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE = |
|
64 |
new LexemeType[]{ |
|
65 |
LexemeType.SOT, SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP |
|
66 |
}; |
|
67 |
|
|
68 |
/** |
|
69 |
* Single quotes succeeded by these {@link LexemeType}s may be opening quotes. |
|
70 |
*/ |
|
71 |
private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE = |
|
72 |
new LexemeType[]{ |
|
73 |
WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE |
|
74 |
}; |
|
75 |
|
|
76 |
/** |
|
77 |
* Single quotes preceded by these {@link LexemeType}s may be closing quotes. |
|
78 |
*/ |
|
79 |
private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE = |
|
80 |
new LexemeType[]{ |
|
81 |
WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE |
|
82 |
}; |
|
83 |
|
|
84 |
/** |
|
85 |
* Single quotes succeeded by these {@link LexemeType}s may be closing quotes. |
|
86 |
*/ |
|
87 |
private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE = |
|
88 |
new LexemeType[]{ |
|
89 |
SPACE, HYPHEN, DASH, PUNCT, PERIOD, ELLIPSIS, QUOTE_DOUBLE, CLOSING_GROUP, |
|
90 |
ENDING |
|
91 |
}; |
|
92 |
|
|
93 |
/** |
|
94 |
* Double quotes preceded by these {@link LexemeType}s may be opening quotes. |
|
95 |
*/ |
|
96 |
private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE = |
|
97 |
new LexemeType[]{ |
|
98 |
LexemeType.SOT, SPACE, DASH, EQUALS, OPENING_GROUP, EOL, EOP |
|
99 |
}; |
|
100 |
|
|
101 |
/** |
|
102 |
* Double quotes succeeded by these {@link LexemeType}s may be opening quotes. |
|
103 |
*/ |
|
104 |
private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE = |
|
105 |
new LexemeType[]{ |
|
106 |
WORD, PUNCT, NUMBER, DASH, ELLIPSIS, OPENING_GROUP, QUOTE_SINGLE, |
|
107 |
QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING, QUOTE_DOUBLE |
|
108 |
}; |
|
109 |
|
|
110 |
/** |
|
111 |
* Double quotes preceded by these {@link LexemeType}s may be closing quotes. |
|
112 |
*/ |
|
113 |
private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE = |
|
114 |
new LexemeType[]{ |
|
115 |
WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS, CLOSING_GROUP, QUOTE_SINGLE, |
|
116 |
QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING |
|
117 |
}; |
|
118 |
|
|
119 |
/** |
|
120 |
* Double quotes succeeded by these {@link LexemeType}s may be closing quotes. |
|
121 |
*/ |
|
122 |
private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE = |
|
123 |
new LexemeType[]{ |
|
124 |
SPACE, PUNCT, PERIOD, EQUALS, HYPHEN, DASH, QUOTE_SINGLE, CLOSING_GROUP, |
|
125 |
ENDING |
|
126 |
}; |
|
127 |
|
|
128 |
private final CircularFifoQueue<Lexeme> mQ = new CircularFifoQueue<>( 4 ); |
|
129 |
private final String mText; |
|
130 |
private final Contractions mContractions; |
|
131 |
private final Consumer<Token> mConsumer; |
|
132 |
|
|
133 |
public QuoteEmitter( |
|
134 |
final String text, |
|
135 |
final Contractions contractions, |
|
136 |
final Consumer<Token> consumer |
|
137 |
) { |
|
138 |
assert text != null; |
|
139 |
assert contractions != null; |
|
140 |
|
|
141 |
mText = text; |
|
142 |
mContractions = contractions; |
|
143 |
mConsumer = consumer; |
|
144 |
} |
|
145 |
|
|
146 |
/** |
|
147 |
* Scans the given text document for quotation marks and passes them to the |
|
148 |
* given {@link Token} {@link Consumer}. |
|
149 |
* |
|
150 |
* @param text The prose to lex. |
|
151 |
* @param contractions List of ambiguous and unambiguous contractions. |
|
152 |
* @param consumer Receives |
|
153 |
*/ |
|
154 |
public static void analyze( |
|
155 |
final String text, |
|
156 |
final Contractions contractions, |
|
157 |
final Consumer<Token> consumer, |
|
158 |
final LexerFilter filter |
|
159 |
) { |
|
160 |
final var emitter = new QuoteEmitter( text, contractions, consumer ); |
|
161 |
Lexer.lex( text, emitter, filter ); |
|
162 |
} |
|
163 |
|
|
164 |
/** |
|
165 |
* @param lexeme the input argument |
|
166 |
*/ |
|
167 |
@Override |
|
168 |
public void accept( final Lexeme lexeme ) { |
|
169 |
mQ.add( lexeme ); |
|
170 |
|
|
171 |
if( mQ.size() == 4 ) { |
|
172 |
parse(); |
|
173 |
} |
|
174 |
} |
|
175 |
|
|
176 |
private void parse() { |
|
177 |
final var lex1 = mQ.get( 0 ); |
|
178 |
final var lex2 = mQ.get( 1 ); |
|
179 |
final var lex3 = mQ.get( 2 ); |
|
180 |
final var lex4 = mQ.get( 3 ); |
|
181 |
|
|
182 |
// <y'all>, <Ph.D.'ll>, <20's>, <she's> |
|
183 |
if( match( WORD_PERIOD_NUMBER, QUOTE_SINGLE, WORD, ANY ) ) { |
|
184 |
emit( QUOTE_APOSTROPHE, lex2 ); |
|
185 |
} |
|
186 |
// <'n'>, <'N'>, <'owlin'> |
|
187 |
else if( |
|
188 |
match( ANY, QUOTE_SINGLE, WORD, QUOTE_SINGLE ) && |
|
189 |
mContractions.beganEndedUnambiguously( lex3.toString( mText ) ) |
|
190 |
) { |
|
191 |
emit( QUOTE_APOSTROPHE, lex2 ); |
|
192 |
emit( QUOTE_APOSTROPHE, lex4 ); |
|
193 |
mQ.set( Lexeme.NONE, 3 ); |
|
194 |
} |
|
195 |
// <2''> |
|
196 |
else if( match( NUMBER, QUOTE_SINGLE, QUOTE_SINGLE, ANY ) ) { |
|
197 |
// Force double primes to conform to the same constructor usage. This |
|
198 |
// simplifies the tokens and reduces some memory usage.
|
199 |
final var lex = new Lexeme( PRIME_DOUBLE, lex2.began(), lex3.ended() ); |
|
200 |
|
|
201 |
emit( QUOTE_PRIME_DOUBLE, lex ); |
|
202 |
mQ.set( Lexeme.NONE, 2 ); |
|
203 |
} |
|
204 |
// <2'> |
|
205 |
else if( match( NUMBER, QUOTE_SINGLE, ANY, ANY ) ) { |
|
206 |
emit( QUOTE_PRIME_SINGLE, lex2 ); |
|
207 |
} |
|
208 |
// <2"> |
|
209 |
else if( match( NUMBER, QUOTE_DOUBLE, ANY, ANY ) ) { |
|
210 |
emit( QUOTE_PRIME_DOUBLE, lex2 ); |
|
211 |
} |
|
212 |
// <thinkin'> |
|
213 |
else if( |
|
214 |
match( WORD, QUOTE_SINGLE, ANY, ANY ) && |
|
215 |
mContractions.endedUnambiguously( lex1.toString( mText ) ) |
|
216 |
) { |
|
217 |
emit( QUOTE_APOSTROPHE, lex2 ); |
|
218 |
} |
|
219 |
// <'02> |
|
220 |
else if( match( ANY, QUOTE_SINGLE, NUMBER, SPACE_PUNCT ) ) { |
|
221 |
emit( QUOTE_APOSTROPHE, lex2 ); |
|
222 |
} |
|
223 |
// <'20s> |
|
224 |
else if( |
|
225 |
match( ANY, QUOTE_SINGLE, NUMBER, WORD ) && |
|
226 |
"s".equalsIgnoreCase( lex4.toString( mText ) ) |
|
227 |
) { |
|
228 |
emit( QUOTE_APOSTROPHE, lex2 ); |
|
229 |
} |
|
230 |
// <.'\n> |
|
231 |
else if( match( PUNCT_PERIOD_ELLIPSIS_DASH, QUOTE_SINGLE, ENDING, ANY ) ) { |
|
232 |
emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
233 |
} |
|
234 |
// <\'> |
|
235 |
else if( match( ESC_SINGLE, ANY, ANY, ANY ) ) { |
|
236 |
emit( QUOTE_STRAIGHT_SINGLE, lex1 ); |
|
237 |
} |
|
238 |
// <\"> |
|
239 |
else if( match( ESC_DOUBLE, ANY, ANY, ANY ) ) { |
|
240 |
emit( QUOTE_STRAIGHT_DOUBLE, lex1 ); |
|
241 |
|
|
242 |
// <\"'---> |
|
243 |
if( match( ESC_DOUBLE, QUOTE_SINGLE, SPACE_DASH_ENDING, ANY ) ) { |
|
244 |
emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
245 |
} |
|
246 |
} |
|
247 |
// <---'" > |
|
248 |
else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, SPACE_ENDING ) ) { |
|
249 |
emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
250 |
} |
|
251 |
// <o’-lantern>, <o' fellow>, <O'-the> |
|
252 |
else if( |
|
253 |
match( WORD, QUOTE_SINGLE, SPACE_HYPHEN, WORD ) && |
|
254 |
"o".equalsIgnoreCase( lex1.toString( mText ) ) |
|
255 |
) { |
|
256 |
emit( QUOTE_APOSTROPHE, lex2 ); |
|
257 |
} |
|
258 |
// <"">, <"...>, <"word>, <---"word> |
|
259 |
else if( |
|
260 |
match( |
|
261 |
LEADING_QUOTE_OPENING_DOUBLE, QUOTE_DOUBLE, |
|
262 |
LAGGING_QUOTE_OPENING_DOUBLE, ANY |
|
263 |
) |
|
264 |
) { |
|
265 |
emit( QUOTE_OPENING_DOUBLE, lex2 ); |
|
266 |
} |
|
267 |
// <..."'>, <word"'>, <?"'>, <word"?> |
|
268 |
else if( |
|
269 |
match( |
|
270 |
LEADING_QUOTE_CLOSING_DOUBLE, QUOTE_DOUBLE, |
|
271 |
LAGGING_QUOTE_CLOSING_DOUBLE, ANY |
|
272 |
) |
|
273 |
) { |
|
274 |
emit( QUOTE_CLOSING_DOUBLE, lex2 ); |
|
275 |
} |
|
276 |
// < ''E> |
|
277 |
else if( match( SPACE_SOT, QUOTE_SINGLE, QUOTE_SINGLE, WORD ) ) { |
|
278 |
// Consume both immediately to avoid the false ambiguity <'e>. |
|
279 |
emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
280 |
emit( QUOTE_APOSTROPHE, lex3 ); |
|
281 |
mQ.set( Lexeme.NONE, 1 ); |
|
282 |
mQ.set( Lexeme.NONE, 2 ); |
|
283 |
} |
|
284 |
// <'...>, <'word>, <---'word>, < 'nation> |
|
285 |
else if( |
|
286 |
match( |
|
287 |
LEADING_QUOTE_OPENING_SINGLE, QUOTE_SINGLE, |
|
288 |
LAGGING_QUOTE_OPENING_SINGLE, ANY ) |
|
289 |
) { |
|
290 |
final var word = lex3.toString( mText ); |
|
291 |
|
|
292 |
if( mContractions.beganAmbiguously( word ) ) { |
|
293 |
emit( QUOTE_AMBIGUOUS_LEADING, lex2 ); |
|
294 |
} |
|
295 |
else if( mContractions.beganUnambiguously( word ) ) { |
|
296 |
emit( QUOTE_APOSTROPHE, lex2 ); |
|
297 |
} |
|
298 |
// <"'"nested> |
|
299 |
else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, WORD ) ) { |
|
300 |
emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
301 |
} |
|
302 |
// <"'" > |
|
303 |
else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) { |
|
304 |
emit( QUOTE_AMBIGUOUS_SINGLE, lex2 ); |
|
305 |
} |
|
306 |
// < '" > |
|
307 |
else if( match( ANY, QUOTE_SINGLE, LAGGING_QUOTE_OPENING_SINGLE, ANY ) ) { |
|
308 |
emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
309 |
} |
|
310 |
// Ambiguous |
|
311 |
else { |
|
312 |
emit( QUOTE_AMBIGUOUS_LEADING, lex2 ); |
|
313 |
} |
|
314 |
} |
|
315 |
// <"'--- > |
|
316 |
else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, DASH, ANY ) ) { |
|
317 |
emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
318 |
} |
|
319 |
// <word'">, <...'--->, <"' > |
|
320 |
else if( |
|
321 |
match( |
|
322 |
LEADING_QUOTE_CLOSING_SINGLE, QUOTE_SINGLE, |
|
323 |
LAGGING_QUOTE_CLOSING_SINGLE, ANY |
|
324 |
) |
|
325 |
) { |
|
326 |
final var word = lex1.toString( mText ); |
|
327 |
|
|
328 |
if( mContractions.endedAmbiguously( word ) ) { |
|
329 |
emit( QUOTE_AMBIGUOUS_LAGGING, lex2 ); |
|
330 |
} |
|
331 |
else { |
|
332 |
emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
333 |
} |
|
334 |
} |
|
335 |
// <word';> (contraction inferred by previous matches) |
|
336 |
else if( match( WORD, QUOTE_SINGLE, PUNCT_PERIOD, ANY ) ) { |
|
337 |
emit( QUOTE_APOSTROPHE, lex2 ); |
|
338 |
} |
|
339 |
// <---'"> |
|
340 |
else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) { |
|
341 |
emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
342 |
} |
|
343 |
// <'42>, <'-3.14> |
|
344 |
else if( match( ANY, QUOTE_SINGLE, NUMBER, ANY ) ) { |
|
345 |
emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
346 |
} |
|
347 |
// <PRE-PARSED><'---.> |
|
348 |
else if( match( LexemeType.NONE, QUOTE_SINGLE, ANY, ANY ) ) { |
|
349 |
emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
350 |
} |
|
351 |
// <''Cause > |
|
352 |
else if( match( QUOTE_SINGLE, QUOTE_SINGLE, WORD, ANY ) ) { |
|
353 |
final var word = lex3.toString( mText ); |
|
354 |
|
|
355 |
if( mContractions.beganAmbiguously( word ) ) { |
|
356 |
emit( QUOTE_AMBIGUOUS_LEADING, lex2 ); |
|
357 |
} |
|
358 |
else if( mContractions.beganUnambiguously( word ) ) { |
|
359 |
emit( QUOTE_APOSTROPHE, lex2 ); |
|
360 |
} |
|
361 |
else { |
|
362 |
emit( QUOTE_AMBIGUOUS_SINGLE, lex2 ); |
|
363 |
} |
|
364 |
} |
|
365 |
// <'"Trouble> |
|
366 |
else if( match( QUOTE_SINGLE, QUOTE_DOUBLE, WORD, ANY ) ) { |
|
367 |
emit( QUOTE_OPENING_DOUBLE, lex2 ); |
|
368 |
} |
|
369 |
// International quotation marks. |
|
370 |
else if( match( ANY, QUOTE_DOUBLE_OPENING, ANY, ANY ) ) { |
|
371 |
emit( QUOTE_OPENING_DOUBLE, lex2 ); |
|
372 |
} |
|
373 |
else if( match( ANY, QUOTE_SINGLE_OPENING, ANY, ANY ) ) { |
|
374 |
emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
375 |
} |
|
376 |
else if( match( ANY, QUOTE_DOUBLE_CLOSING, ANY, ANY ) ) { |
| 371 |
377 |
emit( QUOTE_CLOSING_DOUBLE, lex2 ); |
| 372 |
378 |
} |