| 149 |
149 |
|
| 150 |
150 |
// Notify of any unambiguous quotes that could not be resolved. |
| 151 |
|
unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) ); |
| 152 |
|
unresolved.clear(); |
| 153 |
|
mOpeningSingleQuotes.clear(); |
| 154 |
|
mClosingSingleQuotes.clear(); |
| 155 |
|
mOpeningDoubleQuotes.clear(); |
| 156 |
|
mClosingDoubleQuotes.clear(); |
| 157 |
|
} |
| 158 |
|
} |
| 159 |
|
|
| 160 |
|
// By loop's end, the lexemes list contains tokens for all except the |
| 161 |
|
// final two elements (from tokenizing in triplets). Tokenize the remaining |
| 162 |
|
// unprocessed lexemes. |
| 163 |
|
tokenize( EOT, lexemes, tokenConsumer, unresolved ); |
| 164 |
|
tokenize( EOT, lexemes, tokenConsumer, unresolved ); |
| 165 |
|
|
| 166 |
|
// Attempt to resolve any remaining unambiguous quotes. |
| 167 |
|
resolve( unresolved, tokenConsumer ); |
| 168 |
|
|
| 169 |
|
// Notify of any unambiguous quotes that could not be resolved. |
| 170 |
|
unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) ); |
| 171 |
|
} |
| 172 |
|
|
| 173 |
|
/**
 * Converts {@link Lexeme}s identified as straight quotes into {@link Token}s
 * that represent the curly equivalent. The {@link Token}s are passed to
 * the given {@link Consumer} for further processing (e.g., replaced in
 * the original text being parsed).
 * <p>
 * Side effects: unambiguous opening/closing single and double quotes are
 * also recorded in the corresponding member lists (mOpeningSingleQuotes,
 * mClosingSingleQuotes, mOpeningDoubleQuotes, mClosingDoubleQuotes) so the
 * caller can later balance them; quotes that remain ambiguous are appended
 * to {@code unresolved}.
 *
 * @param lexeme     A part of the text being parsed.
 * @param lexemes    A 3-element queue of lexemes that provide sufficient
 *                   context to identify curly quotes.
 * @param consumer   Recipient of equivalent quotes.
 * @param unresolved Rolling list of potentially ambiguous {@link Lexeme}s
 *                   that could not be tokenized, yet.
 * @return {@code true} if an end-of-paragraph is detected.
 */
private boolean tokenize( final Lexeme lexeme,
                          final CircularFifoQueue<Lexeme> lexemes,
                          final Consumer<Token> consumer,
                          final List<Lexeme[]> unresolved ) {
  // Add the next lexeme to tokenize into the queue for immediate processing.
  // The queue holds three elements, so this evicts the oldest entry and
  // slides the window forward by one lexeme.
  lexemes.add( lexeme );

  // Sliding triplet: lex1 precedes lex2 (the candidate quote position) and
  // lex3 is the lookahead. Branch ORDER below is significant: earlier,
  // more specific patterns must win before later, more general ones.
  final var lex1 = lexemes.get( 0 );
  final var lex2 = lexemes.get( 1 );
  final var lex3 = lexemes.get( 2 );

  if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( WORD ) &&
      lex1.isType( WORD, PERIOD, NUMBER ) ) {
    // Straight quote between word-like lexemes is a contraction/possessive.
    // Examples: y'all, Ph.D.'ll, 20's, she's
    consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
  }
  else if( lex1.isType( QUOTE_SINGLE ) && lex3.isType( QUOTE_SINGLE ) &&
           "n".equalsIgnoreCase( lex2.toString( mText ) ) ) {
    // I.e., 'n' (as in "rock 'n' roll"): both quotes are apostrophes.
    consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
    consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
    // Both queue entries were consumed; restart the window and drop any
    // pending ambiguity that involved them.
    flush( lexemes );
    truncate( unresolved );
  }
  else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) {
    if( lex3.isType( QUOTE_SINGLE ) ) {
      // E.g., 2'' — two straight singles after a number form a double
      // prime (seconds/inches); the token spans both quote lexemes.
      consumer.accept(
        new Token( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ) );
      flush( lexemes );
    }
    else {
      // E.g., 2' — single prime (minutes/feet).
      consumer.accept( new Token( QUOTE_PRIME_SINGLE, lex2 ) );
    }
  }
  else if( lex2.isType( QUOTE_DOUBLE ) && lex1.isType( NUMBER ) ) {
    // E.g., 2" — straight double after a number is a double prime.
    consumer.accept( new Token( QUOTE_PRIME_DOUBLE, lex2 ) );
  }
  else if( lex2.isType( WORD ) && lex3.isType( QUOTE_SINGLE ) &&
           sContractions.endedUnambiguously( lex2.toString( mText ) ) ) {
    // E.g., thinkin' — word is a known unambiguous trailing contraction.
    consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
    flush( lexemes );
  }
  else if( lex2.isType( NUMBER ) && lex1.isType( QUOTE_SINGLE ) ) {
    // Sentences must be re-written to avoid starting with numerals.
    if( lex3.isType( SPACE, PUNCT ) || (lex3.isType( WORD ) &&
        lex3.toString( mText ).equalsIgnoreCase( "s" )) ) {
      // Examples: '20s, '02 — elided-year apostrophe.
      consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
    }
    else {
      // E.g., '2'' — treat as an opening single quote instead.
      consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex1 ) );
      mOpeningSingleQuotes.add( lex1 );
    }

    truncate( unresolved );
  }
  else if( lex2.isType( QUOTE_SINGLE ) &&
           lex1.isType( PUNCT, PERIOD, ELLIPSIS, DASH ) &&
           (lex3.isType( EOL, EOP ) || lex3.isEot()) ) {
    // Quote after terminal punctuation at end of line/paragraph/text
    // closes a quotation.
    consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
    mClosingSingleQuotes.add( lex2 );
  }
  else if( lex1.isType( ESC_SINGLE ) ) {
    // E.g., \' — escaped quotes stay straight (author opted out).
    consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) );
  }
  else if( lex1.isType( ESC_DOUBLE ) ) {
    // E.g., \"
    consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) );

    // A single quote following the escaped double at a boundary still
    // closes a quotation.
    if( lex2.isType( QUOTE_SINGLE ) &&
        (lex3.isEot() || lex3.isType( SPACE, DASH, EOL, EOP )) ) {
      consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
      mClosingSingleQuotes.add( lex2 );
    }
  }
  else if( lex2.isType( QUOTE_DOUBLE ) &&
           (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_DOUBLE )) &&
           lex3.isType( LAGGING_QUOTE_OPENING_DOUBLE ) ) {
    // Examples: "", "..., "word, ---"word
    consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) );
    mOpeningDoubleQuotes.add( lex2 );
  }
  else if( lex2.isType( QUOTE_DOUBLE ) &&
           lex1.isType( LEADING_QUOTE_CLOSING_DOUBLE ) &&
           (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) {
    // Examples: ..."', word"', ?"', word"?
    consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) );
    mClosingDoubleQuotes.add( lex2 );
  }
  else if( lex1.isType( WORD ) && lex2.isType( QUOTE_SINGLE ) &&
           lex3.isType( PUNCT, PERIOD ) ) {
    // E.g., word', (contraction ruled out by previous conditions)
    consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
    mClosingSingleQuotes.add( lex2 );
  }
  else if( lex1.isType( DASH ) &&
           lex2.isType( QUOTE_SINGLE ) &&
           lex3.isType( QUOTE_DOUBLE ) ) {
    // Example: ---'"
    consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
    mClosingSingleQuotes.add( lex2 );
  }
  else if( lex2.isType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
    // No pattern matched: defer. After tokenizing, the parser will attempt
    // to resolve ambiguities using the recorded triplet of context.
    unresolved.add( new Lexeme[]{lex1, lex2, lex3} );
  }

  // Suggest to the caller that resolution should be performed. This allows
  // the algorithm to reset the opening/closing quote balance before the
  // next paragraph is parsed.
  return lex3.isType( EOP );
}
| 305 |
|
|
| 306 |
|
private void resolve( |
| 307 |
|
final List<Lexeme[]> unresolved, final Consumer<Token> consumer ) { |
| 308 |
|
// Some non-emitted tokenized lexemes may be ambiguous. |
| 309 |
|
final var ambiguousLeadingQuotes = new ArrayList<Lexeme[]>( 16 ); |
| 310 |
|
final var ambiguousLaggingQuotes = new ArrayList<Lexeme[]>( 16 ); |
| 311 |
|
var resolvedLeadingQuotes = 0; |
| 312 |
|
var resolvedLaggingQuotes = 0; |
| 313 |
|
|
| 314 |
|
// Count the number of ambiguous and non-ambiguous open single quotes. |
| 315 |
|
for( var i = unresolved.iterator(); i.hasNext(); ) { |
| 316 |
|
final var quotes = i.next(); |
| 317 |
|
final var lex1 = quotes[ 0 ]; |
| 318 |
|
final var lex2 = quotes[ 1 ]; |
| 319 |
|
final var lex3 = quotes[ 2 ]; |
| 320 |
|
|
| 321 |
|
if( lex2.isType( QUOTE_SINGLE ) ) { |
| 322 |
|
final var word1 = lex1 == SOT ? "" : lex1.toString( mText ); |
| 323 |
|
final var word3 = lex3 == EOT ? "" : lex3.toString( mText ); |
| 324 |
|
|
| 325 |
|
if( sContractions.beganAmbiguously( word3 ) ) { |
| 326 |
|
// E.g., 'Cause |
| 327 |
|
if( lex1.isType( QUOTE_SINGLE ) ) { |
| 328 |
|
// E.g., ''Cause |
| 329 |
|
consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); |
| 330 |
|
i.remove(); |
| 331 |
|
} |
| 332 |
|
else { |
| 333 |
|
// The contraction is uncertain until a closing quote is found that |
| 334 |
|
// may balance this single quote. |
| 335 |
|
ambiguousLeadingQuotes.add( quotes ); |
| 336 |
|
} |
| 337 |
|
} |
| 338 |
|
else if( sContractions.beganUnambiguously( word3 ) ) { |
| 339 |
|
// The quote mark forms a word that does not stand alone from its |
| 340 |
|
// contraction. For example, twas is not a word: it's 'twas. |
| 341 |
|
consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); |
| 342 |
|
i.remove(); |
| 343 |
|
} |
| 344 |
|
else if( sContractions.endedAmbiguously( word1 ) ) { |
| 345 |
|
ambiguousLaggingQuotes.add( quotes ); |
| 346 |
|
} |
| 347 |
|
else if( (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_SINGLE )) && |
| 348 |
|
lex3.isType( LAGGING_QUOTE_OPENING_SINGLE ) ) { |
| 349 |
|
consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) ); |
| 350 |
|
resolvedLeadingQuotes++; |
| 351 |
|
mOpeningSingleQuotes.add( lex2 ); |
| 352 |
|
i.remove(); |
| 353 |
|
} |
| 354 |
|
else if( lex1.isType( LEADING_QUOTE_CLOSING_SINGLE ) && |
| 355 |
|
(lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_SINGLE )) ) { |
| 356 |
|
consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); |
| 357 |
|
resolvedLaggingQuotes++; |
| 358 |
|
mClosingSingleQuotes.add( lex2 ); |
| 359 |
|
i.remove(); |
| 360 |
|
} |
| 361 |
|
else if( lex3.isType( NUMBER ) ) { |
| 362 |
|
// E.g., '04 |
| 363 |
|
ambiguousLeadingQuotes.add( quotes ); |
| 364 |
|
} |
| 365 |
|
} |
| 366 |
|
} |
| 367 |
|
|
| 368 |
|
sort( mOpeningSingleQuotes ); |
| 369 |
|
sort( mClosingSingleQuotes ); |
| 370 |
|
sort( mOpeningDoubleQuotes ); |
| 371 |
|
sort( mClosingDoubleQuotes ); |
| 372 |
|
|
| 373 |
|
final var singleQuoteEmpty = |
| 374 |
|
mOpeningSingleQuotes.isEmpty() || mClosingSingleQuotes.isEmpty(); |
| 375 |
|
final var doubleQuoteEmpty = |
| 376 |
|
mOpeningDoubleQuotes.isEmpty() || mClosingDoubleQuotes.isEmpty(); |
| 377 |
|
|
| 378 |
|
final var singleQuoteDelta = abs( |
| 379 |
|
mClosingSingleQuotes.size() - mOpeningSingleQuotes.size() |
| 380 |
|
); |
| 381 |
|
|
| 382 |
|
final var doubleQuoteDelta = abs( |
| 383 |
|
mClosingDoubleQuotes.size() - mOpeningDoubleQuotes.size() |
| 384 |
|
); |
| 385 |
|
|
| 386 |
|
final var ambiguousLeadingCount = ambiguousLeadingQuotes.size(); |
| 387 |
|
final var ambiguousLaggingCount = ambiguousLaggingQuotes.size(); |
| 388 |
|
|
| 389 |
|
if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) { |
| 390 |
|
if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) { |
| 391 |
|
final var balanced = singleQuoteDelta == 0; |
| 392 |
|
final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE; |
| 393 |
|
final var lex = ambiguousLaggingQuotes.get( 0 ); |
| 394 |
|
consumer.accept( new Token( quote, lex[ 1 ] ) ); |
| 395 |
|
unresolved.remove( lex ); |
| 396 |
|
} |
| 397 |
|
else if( ambiguousLeadingCount == 0 && unresolved.size() == 1 ) { |
| 398 |
|
// Must be a closing quote. |
| 399 |
|
final var closing = unresolved.get( 0 ); |
| 400 |
|
consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); |
| 401 |
|
unresolved.remove( closing ); |
| 402 |
|
} |
| 403 |
|
} |
| 404 |
|
else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) { |
| 405 |
|
// If there are no ambiguous leading quotes then all ambiguous lagging |
| 406 |
|
// quotes must be contractions. |
| 407 |
|
ambiguousLaggingQuotes.forEach( |
| 408 |
|
lex -> { |
| 409 |
|
consumer.accept( new Token( QUOTE_APOSTROPHE, lex[ 1 ] ) ); |
| 410 |
|
unresolved.remove( lex ); |
| 411 |
|
} |
| 412 |
|
); |
| 413 |
|
} |
| 414 |
|
else if( mOpeningSingleQuotes.size() == 0 && |
| 415 |
|
mClosingSingleQuotes.size() == 1 && !unresolved.isEmpty() ) { |
| 416 |
|
final var opening = unresolved.get( 0 ); |
| 417 |
|
consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) ); |
| 418 |
|
unresolved.remove( opening ); |
| 419 |
|
} |
| 420 |
|
else if( ambiguousLeadingCount == 0 ) { |
| 421 |
|
if( resolvedLaggingQuotes < resolvedLeadingQuotes ) { |
| 422 |
|
for( final var i = unresolved.iterator(); i.hasNext(); ) { |
| 423 |
|
final var closing = i.next()[ 1 ]; |
| 424 |
|
consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing ) ); |
| 425 |
|
i.remove(); |
| 426 |
|
} |
| 427 |
|
} |
| 428 |
|
else if( singleQuoteDelta == unresolved.size() ) { |
| 429 |
|
for( final var i = unresolved.iterator(); i.hasNext(); ) { |
| 430 |
|
final var closing = i.next(); |
| 431 |
|
consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); |
| 432 |
|
i.remove(); |
| 433 |
|
} |
| 434 |
|
} |
| 435 |
|
else if( unresolved.size() == 2 ) { |
| 436 |
|
final var closing = unresolved.get( 0 ); |
| 437 |
|
final var opening = unresolved.get( 1 ); |
| 438 |
|
consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); |
| 439 |
|
consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) ); |
| 440 |
|
|
| 441 |
|
// Doesn't affect the algorithm. |
| 442 |
|
unresolved.clear(); |
| 443 |
|
} |
| 444 |
|
} |
| 445 |
|
else if( (singleQuoteDelta == 0 && !singleQuoteEmpty) || |
| 446 |
|
(doubleQuoteDelta == 0 && !doubleQuoteEmpty) ) { |
| 447 |
|
// An apostrophe stands betwixt opening/closing single quotes. |
| 448 |
|
for( final var lexemes = unresolved.iterator(); lexemes.hasNext(); ) { |
| 449 |
|
final var quote = lexemes.next()[ 1 ]; |
| 450 |
|
|
| 451 |
|
for( int i = 0; i < mOpeningSingleQuotes.size(); i++ ) { |
| 452 |
|
// An apostrophe must fall between an open/close pair. |
| 453 |
|
final var openingQuote = mOpeningSingleQuotes.get( i ); |
| 454 |
|
final var closingQuote = mClosingSingleQuotes.get( i ); |
| 455 |
|
|
| 456 |
|
if( openingQuote.before( quote ) && closingQuote.after( quote ) ) { |
| 457 |
|
consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) ); |
| 458 |
|
lexemes.remove(); |
| 459 |
|
} |
| 460 |
|
} |
| 461 |
|
} |
| 462 |
|
|
| 463 |
|
// An apostrophe stands betwixt opening/closing double quotes. |
| 464 |
|
for( final var lexemes = unresolved.iterator(); lexemes.hasNext(); ) { |
| 465 |
|
final var quote = lexemes.next()[ 1 ]; |
| 466 |
|
|
| 467 |
|
for( int i = 0; i < mOpeningDoubleQuotes.size(); i++ ) { |
| 468 |
|
// An apostrophe must fall between an open/close pair. |
| 469 |
|
final var openingQuote = mOpeningDoubleQuotes.get( i ); |
| 470 |
|
final var closingQuote = mClosingDoubleQuotes.get( i ); |
| 471 |
|
|
| 472 |
|
if( openingQuote.before( quote ) && closingQuote.after( quote ) ) { |
| 473 |
|
consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) ); |
| 474 |
|
lexemes.remove(); |
|
151 |
unresolved.forEach( lex -> lexemeConsumer.accept( lex[ 1 ] ) ); |
|
152 |
unresolved.clear(); |
|
153 |
mOpeningSingleQuotes.clear(); |
|
154 |
mClosingSingleQuotes.clear(); |
|
155 |
mOpeningDoubleQuotes.clear(); |
|
156 |
mClosingDoubleQuotes.clear(); |
|
157 |
} |
|
158 |
} |
|
159 |
|
|
160 |
// By loop's end, the lexemes list contains tokens for all except the |
|
161 |
// final two elements (from tokenizing in triplets). Tokenize the remaining |
|
162 |
// unprocessed lexemes. |
|
163 |
tokenize( EOT, lexemes, tokenConsumer, unresolved ); |
|
164 |
tokenize( EOT, lexemes, tokenConsumer, unresolved ); |
|
165 |
|
|
166 |
// Attempt to resolve any remaining unambiguous quotes. |
|
167 |
resolve( unresolved, tokenConsumer ); |
|
168 |
|
|
169 |
// Notify of any unambiguous quotes that could not be resolved. |
|
170 |
unresolved.forEach( lex -> lexemeConsumer.accept( lex[ 1 ] ) ); |
|
171 |
} |
|
172 |
|
|
173 |
/**
 * Converts {@link Lexeme}s identified as straight quotes into {@link Token}s
 * that represent the curly equivalent. The {@link Token}s are passed to
 * the given {@link Consumer} for further processing (e.g., replaced in
 * the original text being parsed).
 * <p>
 * Side effects: unambiguous opening/closing single and double quotes are
 * also recorded in the corresponding member lists (mOpeningSingleQuotes,
 * mClosingSingleQuotes, mOpeningDoubleQuotes, mClosingDoubleQuotes) so the
 * caller can later balance them; quotes that remain ambiguous are appended
 * to {@code unresolved}.
 *
 * @param lexeme     A part of the text being parsed.
 * @param lexemes    A 3-element queue of lexemes that provide sufficient
 *                   context to identify curly quotes.
 * @param consumer   Recipient of equivalent quotes.
 * @param unresolved Rolling list of potentially ambiguous {@link Lexeme}s
 *                   that could not be tokenized, yet.
 * @return {@code true} if an end-of-paragraph is detected.
 */
private boolean tokenize( final Lexeme lexeme,
                          final CircularFifoQueue<Lexeme> lexemes,
                          final Consumer<Token> consumer,
                          final List<Lexeme[]> unresolved ) {
  // Add the next lexeme to tokenize into the queue for immediate processing.
  // The queue holds three elements, so this evicts the oldest entry and
  // slides the window forward by one lexeme.
  lexemes.add( lexeme );

  // Sliding triplet: lex1 precedes lex2 (the candidate quote position) and
  // lex3 is the lookahead. Branch ORDER below is significant: earlier,
  // more specific patterns must win before later, more general ones.
  final var lex1 = lexemes.get( 0 );
  final var lex2 = lexemes.get( 1 );
  final var lex3 = lexemes.get( 2 );

  if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( WORD ) &&
      lex1.isType( WORD, PERIOD, NUMBER ) ) {
    // Straight quote between word-like lexemes is a contraction/possessive.
    // Examples: y'all, Ph.D.'ll, 20's, she's
    consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
  }
  else if( lex1.isType( QUOTE_SINGLE ) && lex3.isType( QUOTE_SINGLE ) &&
           "n".equalsIgnoreCase( lex2.toString( mText ) ) ) {
    // I.e., 'n' (as in "rock 'n' roll"): both quotes are apostrophes.
    consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
    consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
    // Both queue entries were consumed; restart the window and drop any
    // pending ambiguity that involved them.
    flush( lexemes );
    truncate( unresolved );
  }
  else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) {
    if( lex3.isType( QUOTE_SINGLE ) ) {
      // E.g., 2'' — two straight singles after a number form a double
      // prime (seconds/inches); the token spans both quote lexemes.
      consumer.accept(
        new Token( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ) );
      flush( lexemes );
    }
    else {
      // E.g., 2' — single prime (minutes/feet).
      consumer.accept( new Token( QUOTE_PRIME_SINGLE, lex2 ) );
    }
  }
  else if( lex2.isType( QUOTE_DOUBLE ) && lex1.isType( NUMBER ) ) {
    // E.g., 2" — straight double after a number is a double prime.
    consumer.accept( new Token( QUOTE_PRIME_DOUBLE, lex2 ) );
  }
  else if( lex2.isType( WORD ) && lex3.isType( QUOTE_SINGLE ) &&
           sContractions.endedUnambiguously( lex2.toString( mText ) ) ) {
    // E.g., thinkin' — word is a known unambiguous trailing contraction.
    consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
    flush( lexemes );
  }
  else if( lex2.isType( NUMBER ) && lex1.isType( QUOTE_SINGLE ) ) {
    // Sentences must be re-written to avoid starting with numerals.
    // Note: relies on && binding tighter than ||, i.e. this reads as
    // SPACE/PUNCT, or (WORD and that word is "s").
    if( lex3.isType( SPACE, PUNCT ) || lex3.isType( WORD ) &&
        lex3.toString( mText ).equalsIgnoreCase( "s" ) ) {
      // Examples: '20s, '02 — elided-year apostrophe.
      consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
    }
    else {
      // E.g., '2'' — treat as an opening single quote instead.
      consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex1 ) );
      mOpeningSingleQuotes.add( lex1 );
    }

    truncate( unresolved );
  }
  else if( lex2.isType( QUOTE_SINGLE ) &&
           lex1.isType( PUNCT, PERIOD, ELLIPSIS, DASH ) &&
           (lex3.isType( EOL, EOP ) || lex3.isEot()) ) {
    // Quote after terminal punctuation at end of line/paragraph/text
    // closes a quotation.
    consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
    mClosingSingleQuotes.add( lex2 );
  }
  else if( lex1.isType( ESC_SINGLE ) ) {
    // E.g., \' — escaped quotes stay straight (author opted out).
    consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) );
  }
  else if( lex1.isType( ESC_DOUBLE ) ) {
    // E.g., \"
    consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) );

    // A single quote following the escaped double at a boundary still
    // closes a quotation.
    if( lex2.isType( QUOTE_SINGLE ) &&
        (lex3.isEot() || lex3.isType( SPACE, DASH, EOL, EOP )) ) {
      consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
      mClosingSingleQuotes.add( lex2 );
    }
  }
  else if( lex2.isType( QUOTE_DOUBLE ) &&
           (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_DOUBLE )) &&
           lex3.isType( LAGGING_QUOTE_OPENING_DOUBLE ) ) {
    // Examples: "", "..., "word, ---"word
    consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) );
    mOpeningDoubleQuotes.add( lex2 );
  }
  else if( lex2.isType( QUOTE_DOUBLE ) &&
           lex1.isType( LEADING_QUOTE_CLOSING_DOUBLE ) &&
           (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) {
    // Examples: ..."', word"', ?"', word"?
    consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) );
    mClosingDoubleQuotes.add( lex2 );
  }
  else if( lex1.isType( WORD ) && lex2.isType( QUOTE_SINGLE ) &&
           lex3.isType( PUNCT, PERIOD ) ) {
    // E.g., word', (contraction ruled out by previous conditions)
    consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
    mClosingSingleQuotes.add( lex2 );
  }
  else if( lex1.isType( DASH ) &&
           lex2.isType( QUOTE_SINGLE ) &&
           lex3.isType( QUOTE_DOUBLE ) ) {
    // Example: ---'"
    consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
    mClosingSingleQuotes.add( lex2 );
  }
  else if( lex2.isType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
    // No pattern matched: defer. After tokenizing, the parser will attempt
    // to resolve ambiguities using the recorded triplet of context.
    unresolved.add( new Lexeme[]{lex1, lex2, lex3} );
  }

  // Suggest to the caller that resolution should be performed. This allows
  // the algorithm to reset the opening/closing quote balance before the
  // next paragraph is parsed.
  return lex3.isType( EOP );
}
|
305 |
|
|
306 |
private void resolve( |
|
307 |
final List<Lexeme[]> unresolved, final Consumer<Token> consumer ) { |
|
308 |
// Some non-emitted tokenized lexemes may be ambiguous. |
|
309 |
final var ambiguousLeadingQuotes = new ArrayList<Lexeme[]>( 16 ); |
|
310 |
final var ambiguousLaggingQuotes = new ArrayList<Lexeme[]>( 16 ); |
|
311 |
var resolvedLeadingQuotes = 0; |
|
312 |
var resolvedLaggingQuotes = 0; |
|
313 |
|
|
314 |
// Count the number of ambiguous and non-ambiguous open single quotes. |
|
315 |
for( var i = unresolved.iterator(); i.hasNext(); ) { |
|
316 |
final var quotes = i.next(); |
|
317 |
final var lex1 = quotes[ 0 ]; |
|
318 |
final var lex2 = quotes[ 1 ]; |
|
319 |
final var lex3 = quotes[ 2 ]; |
|
320 |
|
|
321 |
if( lex2.isType( QUOTE_SINGLE ) ) { |
|
322 |
final var word1 = lex1 == SOT ? "" : lex1.toString( mText ); |
|
323 |
final var word3 = lex3 == EOT ? "" : lex3.toString( mText ); |
|
324 |
|
|
325 |
if( sContractions.beganAmbiguously( word3 ) ) { |
|
326 |
// E.g., 'Cause |
|
327 |
if( lex1.isType( QUOTE_SINGLE ) ) { |
|
328 |
// E.g., ''Cause |
|
329 |
consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); |
|
330 |
i.remove(); |
|
331 |
} |
|
332 |
else { |
|
333 |
// The contraction is uncertain until a closing quote is found that |
|
334 |
// may balance this single quote. |
|
335 |
ambiguousLeadingQuotes.add( quotes ); |
|
336 |
} |
|
337 |
} |
|
338 |
else if( sContractions.beganUnambiguously( word3 ) ) { |
|
339 |
// The quote mark forms a word that does not stand alone from its |
|
340 |
// contraction. For example, twas is not a word: it's 'twas. |
|
341 |
consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); |
|
342 |
i.remove(); |
|
343 |
} |
|
344 |
else if( sContractions.endedAmbiguously( word1 ) ) { |
|
345 |
ambiguousLaggingQuotes.add( quotes ); |
|
346 |
} |
|
347 |
else if( (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_SINGLE )) && |
|
348 |
lex3.isType( LAGGING_QUOTE_OPENING_SINGLE ) ) { |
|
349 |
consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) ); |
|
350 |
resolvedLeadingQuotes++; |
|
351 |
mOpeningSingleQuotes.add( lex2 ); |
|
352 |
i.remove(); |
|
353 |
} |
|
354 |
else if( lex1.isType( LEADING_QUOTE_CLOSING_SINGLE ) && |
|
355 |
(lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_SINGLE )) ) { |
|
356 |
consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); |
|
357 |
resolvedLaggingQuotes++; |
|
358 |
mClosingSingleQuotes.add( lex2 ); |
|
359 |
i.remove(); |
|
360 |
} |
|
361 |
else if( lex3.isType( NUMBER ) ) { |
|
362 |
// E.g., '04 |
|
363 |
ambiguousLeadingQuotes.add( quotes ); |
|
364 |
} |
|
365 |
} |
|
366 |
} |
|
367 |
|
|
368 |
sort( mOpeningSingleQuotes ); |
|
369 |
sort( mClosingSingleQuotes ); |
|
370 |
sort( mOpeningDoubleQuotes ); |
|
371 |
sort( mClosingDoubleQuotes ); |
|
372 |
|
|
373 |
final var singleQuoteEmpty = |
|
374 |
mOpeningSingleQuotes.isEmpty() || mClosingSingleQuotes.isEmpty(); |
|
375 |
final var doubleQuoteEmpty = |
|
376 |
mOpeningDoubleQuotes.isEmpty() || mClosingDoubleQuotes.isEmpty(); |
|
377 |
|
|
378 |
final var singleQuoteDelta = abs( |
|
379 |
mClosingSingleQuotes.size() - mOpeningSingleQuotes.size() |
|
380 |
); |
|
381 |
|
|
382 |
final var doubleQuoteDelta = abs( |
|
383 |
mClosingDoubleQuotes.size() - mOpeningDoubleQuotes.size() |
|
384 |
); |
|
385 |
|
|
386 |
final var ambiguousLeadingCount = ambiguousLeadingQuotes.size(); |
|
387 |
final var ambiguousLaggingCount = ambiguousLaggingQuotes.size(); |
|
388 |
|
|
389 |
if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) { |
|
390 |
if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) { |
|
391 |
final var balanced = singleQuoteDelta == 0; |
|
392 |
final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE; |
|
393 |
final var lex = ambiguousLaggingQuotes.get( 0 ); |
|
394 |
consumer.accept( new Token( quote, lex[ 1 ] ) ); |
|
395 |
unresolved.remove( lex ); |
|
396 |
} |
|
397 |
else if( ambiguousLeadingCount == 0 && unresolved.size() == 1 ) { |
|
398 |
// Must be a closing quote. |
|
399 |
final var closing = unresolved.get( 0 ); |
|
400 |
consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); |
|
401 |
unresolved.remove( closing ); |
|
402 |
} |
|
403 |
} |
|
404 |
else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) { |
|
405 |
// If there are no ambiguous leading quotes then all ambiguous lagging |
|
406 |
// quotes must be contractions. |
|
407 |
ambiguousLaggingQuotes.forEach( |
|
408 |
lex -> { |
|
409 |
consumer.accept( new Token( QUOTE_APOSTROPHE, lex[ 1 ] ) ); |
|
410 |
unresolved.remove( lex ); |
|
411 |
} |
|
412 |
); |
|
413 |
} |
|
414 |
else if( mOpeningSingleQuotes.size() == 0 && |
|
415 |
mClosingSingleQuotes.size() == 1 && !unresolved.isEmpty() ) { |
|
416 |
final var opening = unresolved.get( 0 ); |
|
417 |
consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) ); |
|
418 |
unresolved.remove( opening ); |
|
419 |
} |
|
420 |
else if( ambiguousLeadingCount == 0 ) { |
|
421 |
if( resolvedLaggingQuotes < resolvedLeadingQuotes ) { |
|
422 |
for( final var i = unresolved.iterator(); i.hasNext(); ) { |
|
423 |
final var closing = i.next()[ 1 ]; |
|
424 |
consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing ) ); |
|
425 |
i.remove(); |
|
426 |
} |
|
427 |
} |
|
428 |
else if( singleQuoteDelta == unresolved.size() ) { |
|
429 |
for( final var i = unresolved.iterator(); i.hasNext(); ) { |
|
430 |
final var closing = i.next(); |
|
431 |
consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); |
|
432 |
i.remove(); |
|
433 |
} |
|
434 |
} |
|
435 |
else if( unresolved.size() == 2 ) { |
|
436 |
final var closing = unresolved.get( 0 ); |
|
437 |
final var opening = unresolved.get( 1 ); |
|
438 |
consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); |
|
439 |
consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) ); |
|
440 |
|
|
441 |
// Doesn't affect the algorithm. |
|
442 |
unresolved.clear(); |
|
443 |
} |
|
444 |
} |
|
445 |
else if( singleQuoteDelta == 0 && !singleQuoteEmpty || |
|
446 |
doubleQuoteDelta == 0 && !doubleQuoteEmpty ) { |
|
447 |
// An apostrophe stands betwixt opening/closing single quotes. |
|
448 |
for( final var lexemes = unresolved.iterator(); lexemes.hasNext(); ) { |
|
449 |
final var quote = lexemes.next()[ 1 ]; |
|
450 |
|
|
451 |
for( int i = 0; i < mOpeningSingleQuotes.size(); i++ ) { |
|
452 |
// An apostrophe must fall between an open/close pair. |
|
453 |
final var openingQuote = mOpeningSingleQuotes.get( i ); |
|
454 |
final var closingQuote = mClosingSingleQuotes.get( i ); |
|
455 |
|
|
456 |
if( openingQuote.before( quote ) && closingQuote.after( quote ) ) { |
|
457 |
consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) ); |
|
458 |
lexemes.remove(); |
|
459 |
} |
|
460 |
} |
|
461 |
} |
|
462 |
|
|
463 |
// An apostrophe stands betwixt opening/closing double quotes. |
|
464 |
final var lexemes = unresolved.iterator(); |
|
465 |
|
|
466 |
while( lexemes.hasNext() ) { |
|
467 |
final var quote = lexemes.next()[ 1 ]; |
|
468 |
|
|
469 |
// Prevent an index out of bounds exception. |
|
470 |
final var len = Math.min( |
|
471 |
mOpeningDoubleQuotes.size(), |
|
472 |
mClosingDoubleQuotes.size() |
|
473 |
); |
|
474 |
|
|
475 |
for( int i = 0; i < len; i++ ) { |
|
476 |
// An apostrophe must fall between an open/close pair. |
|
477 |
final var openingQuote = mOpeningDoubleQuotes.get( i ); |
|
478 |
final var closingQuote = mClosingDoubleQuotes.get( i ); |
|
479 |
|
|
480 |
if( openingQuote.before( quote ) && closingQuote.after( quote ) ) { |
|
481 |
consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) ); |
|
482 |
|
|
483 |
try { |
|
484 |
lexemes.remove(); |
|
485 |
} catch( final Exception ex ) { |
|
486 |
// Weird edge case that hasn't been tracked down. Doesn't affect |
|
487 |
// the unit tests. |
|
488 |
break; |
|
489 |
} |
| 475 |
490 |
} |
| 476 |
491 |
} |