Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git
M src/main/java/com/whitemagicsoftware/keenquotes/Contractions.java
2836 2836
    "regrettin",
2837 2837
    "regulatin",
2838
    "rehashin",
2838 2839
    "rehearin",
2839 2840
    "rehearsin",
...
3087 3088
    "shapin",
3088 3089
    "sharin",
3090
    "sharkin",
3089 3091
    "sharpenin",
3090 3092
    "shatterin",
M src/main/java/com/whitemagicsoftware/keenquotes/Lexer.java
8 8
import static com.whitemagicsoftware.keenquotes.Lexeme.createLexeme;
9 9
import static com.whitemagicsoftware.keenquotes.LexemeType.*;
10
import static java.lang.Character.isDigit;
11 10
import static java.lang.Character.isWhitespace;
12 11
import static java.text.CharacterIterator.DONE;
...
60 59
        isWord = true;
61 60
62
        if( !isLetter( peek( i ) ) ) {
61
        final var next = peek(i);
62
63
        if( !isLetter( next ) && !isDigit( next ) ) {
63 64
          lexeme = createLexeme( WORD, began, i.getIndex() );
64 65
        }
...
177 178
  /**
   * Answers whether the given character counts as a letter for this lexer:
   * anything {@link Character#isLetter(char)} accepts, plus the underscore
   * and the asterisk.
   *
   * @param curr The character to classify.
   * @return {@code true} if the character is a letter, an underscore, or
   * an asterisk.
   */
  private static boolean isLetter( final char curr ) {
178 179
    // NOTE(review): '_' and '*' are presumably accepted so that emphasis
    // markup stays attached to its word; confirm against the lexer's callers.
    return Character.isLetter( curr ) || curr == '_' || curr == '*';
180
  }
181
182
  /**
   * Answers whether the given character counts as a digit for this lexer:
   * anything {@link Character#isDigit(char)} accepts, plus any of the
   * single-character fractions listed in the method body.
   *
   * @param curr The character to classify.
   * @return {@code true} if the character is a digit or one of the listed
   * fraction characters.
   */
  private static boolean isDigit( final char curr ) {
183
    return Character.isDigit( curr ) ||
184
      // Single-character fractions are treated as digits, too.
      "¼½¾⅐⅑⅒⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞".indexOf( curr ) > -1;
179 185
  }
180 186
181 187
  /**
182 188
   * Answers whether the given character can be considered part of a number
183 189
   * or not. This does not include digits, which are checked independently
184
   * from this method.
190
   * of this method.
185 191
   *
186
   * @param curr The character to check as being related to numbers.
192
   * @param curr The character to check as being related to a number.
187 193
   * @return {@code true} if the given character can be considered part of
188 194
   * a number (e.g., -2,000.2^2 is considered a single number).
189 195
   */
190 196
  private static boolean isNumeric( final char curr ) {
191 197
    return
192
      curr == '.' || curr == ',' || curr == '-' || curr == '+' || curr == '^';
198
      // Besides the separators, signs, and caret, the fraction characters
      // '⅟' and '⁄' are also accepted as parts of a number.
      curr == '.' || curr == ',' || curr == '-' || curr == '+' ||
199
        curr == '^' || curr == '⅟' || curr == '⁄';
193 200
  }
194 201
M src/main/java/com/whitemagicsoftware/keenquotes/Parser.java
149 149
150 150
        // Notify of any ambiguous quotes that could not be resolved.
151
        unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) );
152
        unresolved.clear();
153
        mOpeningSingleQuotes.clear();
154
        mClosingSingleQuotes.clear();
155
        mOpeningDoubleQuotes.clear();
156
        mClosingDoubleQuotes.clear();
157
      }
158
    }
159
160
    // By loop's end, the lexemes list contains tokens for all except the
161
    // final two elements (from tokenizing in triplets). Tokenize the remaining
162
    // unprocessed lexemes.
163
    tokenize( EOT, lexemes, tokenConsumer, unresolved );
164
    tokenize( EOT, lexemes, tokenConsumer, unresolved );
165
166
    // Attempt to resolve any remaining ambiguous quotes.
167
    resolve( unresolved, tokenConsumer );
168
169
    // Notify of any ambiguous quotes that could not be resolved.
170
    unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) );
171
  }
172
173
  /**
174
   * Converts {@link Lexeme}s identified as straight quotes into {@link Token}s
175
   * that represent the curly equivalent. The {@link Token}s are passed to
176
   * the given {@link Consumer} for further processing (e.g., replaced in
177
   * the original text being parsed).
178
   *
179
   * @param lexeme     A part of the text being parsed.
180
   * @param lexemes    A 3-element queue of lexemes that provide sufficient
181
   *                   context to identify curly quotes.
182
   * @param consumer   Recipient of equivalent quotes.
183
   * @param unresolved Rolling list of potentially ambiguous {@link Lexeme}s
184
   *                   that could not be tokenized, yet.
185
   * @return {@code true} if an end-of-paragraph is detected.
186
   */
187
  private boolean tokenize( final Lexeme lexeme,
188
                            final CircularFifoQueue<Lexeme> lexemes,
189
                            final Consumer<Token> consumer,
190
                            final List<Lexeme[]> unresolved ) {
191
    // Add the next lexeme to tokenize into the queue for immediate processing.
192
    lexemes.add( lexeme );
193
194
    // View the queue as a triplet: most branches below treat lex2 as the
    // quote under consideration, with lex1 and lex3 as its neighbours.
    final var lex1 = lexemes.get( 0 );
195
    final var lex2 = lexemes.get( 1 );
196
    final var lex3 = lexemes.get( 2 );
197
198
    if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( WORD ) &&
199
      lex1.isType( WORD, PERIOD, NUMBER ) ) {
200
      // Examples: y'all, Ph.D.'ll, 20's, she's
201
      consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
202
    }
203
    else if( lex1.isType( QUOTE_SINGLE ) && lex3.isType( QUOTE_SINGLE ) &&
204
      "n".equalsIgnoreCase( lex2.toString( mText ) ) ) {
205
      // I.e., 'n'
206
      consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
207
      consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
208
      flush( lexemes );
209
      truncate( unresolved );
210
    }
211
    else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) {
212
      if( lex3.isType( QUOTE_SINGLE ) ) {
213
        // E.g., 2''
214
        consumer.accept(
215
          new Token( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ) );
216
        flush( lexemes );
217
      }
218
      else {
219
        // E.g., 2'
220
        consumer.accept( new Token( QUOTE_PRIME_SINGLE, lex2 ) );
221
      }
222
    }
223
    else if( lex2.isType( QUOTE_DOUBLE ) && lex1.isType( NUMBER ) ) {
224
      // E.g., 2"
225
      consumer.accept( new Token( QUOTE_PRIME_DOUBLE, lex2 ) );
226
    }
227
    else if( lex2.isType( WORD ) && lex3.isType( QUOTE_SINGLE ) &&
228
      sContractions.endedUnambiguously( lex2.toString( mText ) ) ) {
229
      // E.g., thinkin'
230
      consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
231
      flush( lexemes );
232
    }
233
    else if( lex2.isType( NUMBER ) && lex1.isType( QUOTE_SINGLE ) ) {
234
      // Sentences must be rewritten to avoid starting with numerals.
235
      if( lex3.isType( SPACE, PUNCT ) || (lex3.isType( WORD ) &&
236
        lex3.toString( mText ).equalsIgnoreCase( "s" )) ) {
237
        // Examples: '20s, '02
238
        consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
239
      }
240
      else {
241
        // E.g., '2''
242
        consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex1 ) );
243
        mOpeningSingleQuotes.add( lex1 );
244
      }
245
246
      truncate( unresolved );
247
    }
248
    else if( lex2.isType( QUOTE_SINGLE ) &&
249
      lex1.isType( PUNCT, PERIOD, ELLIPSIS, DASH ) &&
250
      (lex3.isType( EOL, EOP ) || lex3.isEot()) ) {
251
      // A single quote that trails punctuation at the end of a line,
      // paragraph, or text is emitted as a closing single quote.
      consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
252
      mClosingSingleQuotes.add( lex2 );
253
    }
254
    else if( lex1.isType( ESC_SINGLE ) ) {
255
      // E.g., \'
256
      consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) );
257
    }
258
    else if( lex1.isType( ESC_DOUBLE ) ) {
259
      // E.g., \"
260
      consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) );
261
262
      if( lex2.isType( QUOTE_SINGLE ) &&
263
        (lex3.isEot() || lex3.isType( SPACE, DASH, EOL, EOP )) ) {
264
        // A single quote after the escaped double quote that precedes a
        // space, dash, or end of line/paragraph/text is emitted as a closing
        // single quote.
        consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
265
        mClosingSingleQuotes.add( lex2 );
266
      }
267
    }
268
    else if( lex2.isType( QUOTE_DOUBLE ) &&
269
      (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_DOUBLE )) &&
270
      lex3.isType( LAGGING_QUOTE_OPENING_DOUBLE ) ) {
271
      // Examples: "", "..., "word, ---"word
272
      consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) );
273
      mOpeningDoubleQuotes.add( lex2 );
274
    }
275
    else if( lex2.isType( QUOTE_DOUBLE ) &&
276
      lex1.isType( LEADING_QUOTE_CLOSING_DOUBLE ) &&
277
      (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) {
278
      // Examples: ..."', word"', ?"', word"?
279
      consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) );
280
      mClosingDoubleQuotes.add( lex2 );
281
    }
282
    else if( lex1.isType( WORD ) && lex2.isType( QUOTE_SINGLE ) &&
283
      lex3.isType( PUNCT, PERIOD ) ) {
284
      // E.g., word', (contraction ruled out by previous conditions)
285
      consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
286
      mClosingSingleQuotes.add( lex2 );
287
    }
288
    else if( lex1.isType( DASH ) &&
289
      lex2.isType( QUOTE_SINGLE ) &&
290
      lex3.isType( QUOTE_DOUBLE ) ) {
291
      // Example: ---'"
292
      consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
293
      mClosingSingleQuotes.add( lex2 );
294
    }
295
    else if( lex2.isType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
296
      // After tokenizing, the parser will attempt to resolve ambiguities.
297
      unresolved.add( new Lexeme[]{lex1, lex2, lex3} );
298
    }
299
300
    // Suggest to the caller that resolution should be performed. This allows
301
    // the algorithm to reset the opening/closing quote balance before the
302
    // next paragraph is parsed.
303
    return lex3.isType( EOP );
304
  }
305
306
  private void resolve(
307
    final List<Lexeme[]> unresolved, final Consumer<Token> consumer ) {
308
    // Some non-emitted tokenized lexemes may be ambiguous.
309
    final var ambiguousLeadingQuotes = new ArrayList<Lexeme[]>( 16 );
310
    final var ambiguousLaggingQuotes = new ArrayList<Lexeme[]>( 16 );
311
    var resolvedLeadingQuotes = 0;
312
    var resolvedLaggingQuotes = 0;
313
314
    // Count the number of ambiguous and non-ambiguous open single quotes.
315
    for( var i = unresolved.iterator(); i.hasNext(); ) {
316
      final var quotes = i.next();
317
      final var lex1 = quotes[ 0 ];
318
      final var lex2 = quotes[ 1 ];
319
      final var lex3 = quotes[ 2 ];
320
321
      if( lex2.isType( QUOTE_SINGLE ) ) {
322
        final var word1 = lex1 == SOT ? "" : lex1.toString( mText );
323
        final var word3 = lex3 == EOT ? "" : lex3.toString( mText );
324
325
        if( sContractions.beganAmbiguously( word3 ) ) {
326
          // E.g., 'Cause
327
          if( lex1.isType( QUOTE_SINGLE ) ) {
328
            // E.g., ''Cause
329
            consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
330
            i.remove();
331
          }
332
          else {
333
            // The contraction is uncertain until a closing quote is found that
334
            // may balance this single quote.
335
            ambiguousLeadingQuotes.add( quotes );
336
          }
337
        }
338
        else if( sContractions.beganUnambiguously( word3 ) ) {
339
          // The quote mark forms a word that does not stand alone from its
340
          // contraction. For example, twas is not a word: it's 'twas.
341
          consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
342
          i.remove();
343
        }
344
        else if( sContractions.endedAmbiguously( word1 ) ) {
345
          ambiguousLaggingQuotes.add( quotes );
346
        }
347
        else if( (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_SINGLE )) &&
348
          lex3.isType( LAGGING_QUOTE_OPENING_SINGLE ) ) {
349
          consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) );
350
          resolvedLeadingQuotes++;
351
          mOpeningSingleQuotes.add( lex2 );
352
          i.remove();
353
        }
354
        else if( lex1.isType( LEADING_QUOTE_CLOSING_SINGLE ) &&
355
          (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_SINGLE )) ) {
356
          consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
357
          resolvedLaggingQuotes++;
358
          mClosingSingleQuotes.add( lex2 );
359
          i.remove();
360
        }
361
        else if( lex3.isType( NUMBER ) ) {
362
          // E.g., '04
363
          ambiguousLeadingQuotes.add( quotes );
364
        }
365
      }
366
    }
367
368
    sort( mOpeningSingleQuotes );
369
    sort( mClosingSingleQuotes );
370
    sort( mOpeningDoubleQuotes );
371
    sort( mClosingDoubleQuotes );
372
373
    final var singleQuoteEmpty =
374
      mOpeningSingleQuotes.isEmpty() || mClosingSingleQuotes.isEmpty();
375
    final var doubleQuoteEmpty =
376
      mOpeningDoubleQuotes.isEmpty() || mClosingDoubleQuotes.isEmpty();
377
378
    final var singleQuoteDelta = abs(
379
      mClosingSingleQuotes.size() - mOpeningSingleQuotes.size()
380
    );
381
382
    final var doubleQuoteDelta = abs(
383
      mClosingDoubleQuotes.size() - mOpeningDoubleQuotes.size()
384
    );
385
386
    final var ambiguousLeadingCount = ambiguousLeadingQuotes.size();
387
    final var ambiguousLaggingCount = ambiguousLaggingQuotes.size();
388
389
    if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) {
390
      if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) {
391
        final var balanced = singleQuoteDelta == 0;
392
        final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE;
393
        final var lex = ambiguousLaggingQuotes.get( 0 );
394
        consumer.accept( new Token( quote, lex[ 1 ] ) );
395
        unresolved.remove( lex );
396
      }
397
      else if( ambiguousLeadingCount == 0 && unresolved.size() == 1 ) {
398
        // Must be a closing quote.
399
        final var closing = unresolved.get( 0 );
400
        consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
401
        unresolved.remove( closing );
402
      }
403
    }
404
    else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) {
405
      // If there are no ambiguous leading quotes then all ambiguous lagging
406
      // quotes must be contractions.
407
      ambiguousLaggingQuotes.forEach(
408
        lex -> {
409
          consumer.accept( new Token( QUOTE_APOSTROPHE, lex[ 1 ] ) );
410
          unresolved.remove( lex );
411
        }
412
      );
413
    }
414
    else if( mOpeningSingleQuotes.size() == 0 &&
415
      mClosingSingleQuotes.size() == 1 && !unresolved.isEmpty() ) {
416
      final var opening = unresolved.get( 0 );
417
      consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
418
      unresolved.remove( opening );
419
    }
420
    else if( ambiguousLeadingCount == 0 ) {
421
      if( resolvedLaggingQuotes < resolvedLeadingQuotes ) {
422
        for( final var i = unresolved.iterator(); i.hasNext(); ) {
423
          final var closing = i.next()[ 1 ];
424
          consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing ) );
425
          i.remove();
426
        }
427
      }
428
      else if( singleQuoteDelta == unresolved.size() ) {
429
        for( final var i = unresolved.iterator(); i.hasNext(); ) {
430
          final var closing = i.next();
431
          consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
432
          i.remove();
433
        }
434
      }
435
      else if( unresolved.size() == 2 ) {
436
        final var closing = unresolved.get( 0 );
437
        final var opening = unresolved.get( 1 );
438
        consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
439
        consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
440
441
        // Doesn't affect the algorithm.
442
        unresolved.clear();
443
      }
444
    }
445
    else if( (singleQuoteDelta == 0 && !singleQuoteEmpty) ||
446
      (doubleQuoteDelta == 0 && !doubleQuoteEmpty) ) {
447
      // An apostrophe stands betwixt opening/closing single quotes.
448
      for( final var lexemes = unresolved.iterator(); lexemes.hasNext(); ) {
449
        final var quote = lexemes.next()[ 1 ];
450
451
        for( int i = 0; i < mOpeningSingleQuotes.size(); i++ ) {
452
          // An apostrophe must fall between an open/close pair.
453
          final var openingQuote = mOpeningSingleQuotes.get( i );
454
          final var closingQuote = mClosingSingleQuotes.get( i );
455
456
          if( openingQuote.before( quote ) && closingQuote.after( quote ) ) {
457
            consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) );
458
            lexemes.remove();
459
          }
460
        }
461
      }
462
463
      // An apostrophe stands betwixt opening/closing double quotes.
464
      for( final var lexemes = unresolved.iterator(); lexemes.hasNext(); ) {
465
        final var quote = lexemes.next()[ 1 ];
466
467
        for( int i = 0; i < mOpeningDoubleQuotes.size(); i++ ) {
468
          // An apostrophe must fall between an open/close pair.
469
          final var openingQuote = mOpeningDoubleQuotes.get( i );
470
          final var closingQuote = mClosingDoubleQuotes.get( i );
471
472
          if( openingQuote.before( quote ) && closingQuote.after( quote ) ) {
473
            consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) );
474
            lexemes.remove();
151
        unresolved.forEach( lex -> lexemeConsumer.accept( lex[ 1 ] ) );
152
        unresolved.clear();
153
        mOpeningSingleQuotes.clear();
154
        mClosingSingleQuotes.clear();
155
        mOpeningDoubleQuotes.clear();
156
        mClosingDoubleQuotes.clear();
157
      }
158
    }
159
160
    // By loop's end, the lexemes list contains tokens for all except the
161
    // final two elements (from tokenizing in triplets). Tokenize the remaining
162
    // unprocessed lexemes.
163
    tokenize( EOT, lexemes, tokenConsumer, unresolved );
164
    tokenize( EOT, lexemes, tokenConsumer, unresolved );
165
166
    // Attempt to resolve any remaining ambiguous quotes.
167
    resolve( unresolved, tokenConsumer );
168
169
    // Notify of any ambiguous quotes that could not be resolved.
170
    unresolved.forEach( lex -> lexemeConsumer.accept( lex[ 1 ] ) );
171
  }
172
173
  /**
174
   * Converts {@link Lexeme}s identified as straight quotes into {@link Token}s
175
   * that represent the curly equivalent. The {@link Token}s are passed to
176
   * the given {@link Consumer} for further processing (e.g., replaced in
177
   * the original text being parsed).
178
   *
179
   * @param lexeme     A part of the text being parsed.
180
   * @param lexemes    A 3-element queue of lexemes that provide sufficient
181
   *                   context to identify curly quotes.
182
   * @param consumer   Recipient of equivalent quotes.
183
   * @param unresolved Rolling list of potentially ambiguous {@link Lexeme}s
184
   *                   that could not be tokenized, yet.
185
   * @return {@code true} if an end-of-paragraph is detected.
186
   */
187
  private boolean tokenize( final Lexeme lexeme,
188
                            final CircularFifoQueue<Lexeme> lexemes,
189
                            final Consumer<Token> consumer,
190
                            final List<Lexeme[]> unresolved ) {
191
    // Add the next lexeme to tokenize into the queue for immediate processing.
192
    lexemes.add( lexeme );
193
194
    // View the queue as a triplet: most branches below treat lex2 as the
    // quote under consideration, with lex1 and lex3 as its neighbours.
    final var lex1 = lexemes.get( 0 );
195
    final var lex2 = lexemes.get( 1 );
196
    final var lex3 = lexemes.get( 2 );
197
198
    if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( WORD ) &&
199
      lex1.isType( WORD, PERIOD, NUMBER ) ) {
200
      // Examples: y'all, Ph.D.'ll, 20's, she's
201
      consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
202
    }
203
    else if( lex1.isType( QUOTE_SINGLE ) && lex3.isType( QUOTE_SINGLE ) &&
204
      "n".equalsIgnoreCase( lex2.toString( mText ) ) ) {
205
      // I.e., 'n'
206
      consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
207
      consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
208
      flush( lexemes );
209
      truncate( unresolved );
210
    }
211
    else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) {
212
      if( lex3.isType( QUOTE_SINGLE ) ) {
213
        // E.g., 2''
214
        consumer.accept(
215
          new Token( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ) );
216
        flush( lexemes );
217
      }
218
      else {
219
        // E.g., 2'
220
        consumer.accept( new Token( QUOTE_PRIME_SINGLE, lex2 ) );
221
      }
222
    }
223
    else if( lex2.isType( QUOTE_DOUBLE ) && lex1.isType( NUMBER ) ) {
224
      // E.g., 2"
225
      consumer.accept( new Token( QUOTE_PRIME_DOUBLE, lex2 ) );
226
    }
227
    else if( lex2.isType( WORD ) && lex3.isType( QUOTE_SINGLE ) &&
228
      sContractions.endedUnambiguously( lex2.toString( mText ) ) ) {
229
      // E.g., thinkin'
230
      consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
231
      flush( lexemes );
232
    }
233
    else if( lex2.isType( NUMBER ) && lex1.isType( QUOTE_SINGLE ) ) {
234
      // Sentences must be rewritten to avoid starting with numerals.
235
      if( lex3.isType( SPACE, PUNCT ) || lex3.isType( WORD ) &&
236
        lex3.toString( mText ).equalsIgnoreCase( "s" ) ) {
237
        // Examples: '20s, '02
238
        consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
239
      }
240
      else {
241
        // E.g., '2''
242
        consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex1 ) );
243
        mOpeningSingleQuotes.add( lex1 );
244
      }
245
246
      truncate( unresolved );
247
    }
248
    else if( lex2.isType( QUOTE_SINGLE ) &&
249
      lex1.isType( PUNCT, PERIOD, ELLIPSIS, DASH ) &&
250
      (lex3.isType( EOL, EOP ) || lex3.isEot()) ) {
251
      // A single quote that trails punctuation at the end of a line,
      // paragraph, or text is emitted as a closing single quote.
      consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
252
      mClosingSingleQuotes.add( lex2 );
253
    }
254
    else if( lex1.isType( ESC_SINGLE ) ) {
255
      // E.g., \'
256
      consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) );
257
    }
258
    else if( lex1.isType( ESC_DOUBLE ) ) {
259
      // E.g., \"
260
      consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) );
261
262
      if( lex2.isType( QUOTE_SINGLE ) &&
263
        (lex3.isEot() || lex3.isType( SPACE, DASH, EOL, EOP )) ) {
264
        // A single quote after the escaped double quote that precedes a
        // space, dash, or end of line/paragraph/text is emitted as a closing
        // single quote.
        consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
265
        mClosingSingleQuotes.add( lex2 );
266
      }
267
    }
268
    else if( lex2.isType( QUOTE_DOUBLE ) &&
269
      (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_DOUBLE )) &&
270
      lex3.isType( LAGGING_QUOTE_OPENING_DOUBLE ) ) {
271
      // Examples: "", "..., "word, ---"word
272
      consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) );
273
      mOpeningDoubleQuotes.add( lex2 );
274
    }
275
    else if( lex2.isType( QUOTE_DOUBLE ) &&
276
      lex1.isType( LEADING_QUOTE_CLOSING_DOUBLE ) &&
277
      (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) {
278
      // Examples: ..."', word"', ?"', word"?
279
      consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) );
280
      mClosingDoubleQuotes.add( lex2 );
281
    }
282
    else if( lex1.isType( WORD ) && lex2.isType( QUOTE_SINGLE ) &&
283
      lex3.isType( PUNCT, PERIOD ) ) {
284
      // E.g., word', (contraction ruled out by previous conditions)
285
      consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
286
      mClosingSingleQuotes.add( lex2 );
287
    }
288
    else if( lex1.isType( DASH ) &&
289
      lex2.isType( QUOTE_SINGLE ) &&
290
      lex3.isType( QUOTE_DOUBLE ) ) {
291
      // Example: ---'"
292
      consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
293
      mClosingSingleQuotes.add( lex2 );
294
    }
295
    else if( lex2.isType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
296
      // After tokenizing, the parser will attempt to resolve ambiguities.
297
      unresolved.add( new Lexeme[]{lex1, lex2, lex3} );
298
    }
299
300
    // Suggest to the caller that resolution should be performed. This allows
301
    // the algorithm to reset the opening/closing quote balance before the
302
    // next paragraph is parsed.
303
    return lex3.isType( EOP );
304
  }
305
306
  private void resolve(
307
    final List<Lexeme[]> unresolved, final Consumer<Token> consumer ) {
308
    // Some non-emitted tokenized lexemes may be ambiguous.
309
    final var ambiguousLeadingQuotes = new ArrayList<Lexeme[]>( 16 );
310
    final var ambiguousLaggingQuotes = new ArrayList<Lexeme[]>( 16 );
311
    var resolvedLeadingQuotes = 0;
312
    var resolvedLaggingQuotes = 0;
313
314
    // Count the number of ambiguous and non-ambiguous open single quotes.
315
    for( var i = unresolved.iterator(); i.hasNext(); ) {
316
      final var quotes = i.next();
317
      final var lex1 = quotes[ 0 ];
318
      final var lex2 = quotes[ 1 ];
319
      final var lex3 = quotes[ 2 ];
320
321
      if( lex2.isType( QUOTE_SINGLE ) ) {
322
        final var word1 = lex1 == SOT ? "" : lex1.toString( mText );
323
        final var word3 = lex3 == EOT ? "" : lex3.toString( mText );
324
325
        if( sContractions.beganAmbiguously( word3 ) ) {
326
          // E.g., 'Cause
327
          if( lex1.isType( QUOTE_SINGLE ) ) {
328
            // E.g., ''Cause
329
            consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
330
            i.remove();
331
          }
332
          else {
333
            // The contraction is uncertain until a closing quote is found that
334
            // may balance this single quote.
335
            ambiguousLeadingQuotes.add( quotes );
336
          }
337
        }
338
        else if( sContractions.beganUnambiguously( word3 ) ) {
339
          // The quote mark forms a word that does not stand alone from its
340
          // contraction. For example, twas is not a word: it's 'twas.
341
          consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
342
          i.remove();
343
        }
344
        else if( sContractions.endedAmbiguously( word1 ) ) {
345
          ambiguousLaggingQuotes.add( quotes );
346
        }
347
        else if( (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_SINGLE )) &&
348
          lex3.isType( LAGGING_QUOTE_OPENING_SINGLE ) ) {
349
          consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) );
350
          resolvedLeadingQuotes++;
351
          mOpeningSingleQuotes.add( lex2 );
352
          i.remove();
353
        }
354
        else if( lex1.isType( LEADING_QUOTE_CLOSING_SINGLE ) &&
355
          (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_SINGLE )) ) {
356
          consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
357
          resolvedLaggingQuotes++;
358
          mClosingSingleQuotes.add( lex2 );
359
          i.remove();
360
        }
361
        else if( lex3.isType( NUMBER ) ) {
362
          // E.g., '04
363
          ambiguousLeadingQuotes.add( quotes );
364
        }
365
      }
366
    }
367
368
    sort( mOpeningSingleQuotes );
369
    sort( mClosingSingleQuotes );
370
    sort( mOpeningDoubleQuotes );
371
    sort( mClosingDoubleQuotes );
372
373
    final var singleQuoteEmpty =
374
      mOpeningSingleQuotes.isEmpty() || mClosingSingleQuotes.isEmpty();
375
    final var doubleQuoteEmpty =
376
      mOpeningDoubleQuotes.isEmpty() || mClosingDoubleQuotes.isEmpty();
377
378
    final var singleQuoteDelta = abs(
379
      mClosingSingleQuotes.size() - mOpeningSingleQuotes.size()
380
    );
381
382
    final var doubleQuoteDelta = abs(
383
      mClosingDoubleQuotes.size() - mOpeningDoubleQuotes.size()
384
    );
385
386
    final var ambiguousLeadingCount = ambiguousLeadingQuotes.size();
387
    final var ambiguousLaggingCount = ambiguousLaggingQuotes.size();
388
389
    if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) {
390
      if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) {
391
        final var balanced = singleQuoteDelta == 0;
392
        final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE;
393
        final var lex = ambiguousLaggingQuotes.get( 0 );
394
        consumer.accept( new Token( quote, lex[ 1 ] ) );
395
        unresolved.remove( lex );
396
      }
397
      else if( ambiguousLeadingCount == 0 && unresolved.size() == 1 ) {
398
        // Must be a closing quote.
399
        final var closing = unresolved.get( 0 );
400
        consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
401
        unresolved.remove( closing );
402
      }
403
    }
404
    else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) {
405
      // If there are no ambiguous leading quotes then all ambiguous lagging
406
      // quotes must be contractions.
407
      ambiguousLaggingQuotes.forEach(
408
        lex -> {
409
          consumer.accept( new Token( QUOTE_APOSTROPHE, lex[ 1 ] ) );
410
          unresolved.remove( lex );
411
        }
412
      );
413
    }
414
    else if( mOpeningSingleQuotes.size() == 0 &&
415
      mClosingSingleQuotes.size() == 1 && !unresolved.isEmpty() ) {
416
      final var opening = unresolved.get( 0 );
417
      consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
418
      unresolved.remove( opening );
419
    }
420
    else if( ambiguousLeadingCount == 0 ) {
421
      if( resolvedLaggingQuotes < resolvedLeadingQuotes ) {
422
        for( final var i = unresolved.iterator(); i.hasNext(); ) {
423
          final var closing = i.next()[ 1 ];
424
          consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing ) );
425
          i.remove();
426
        }
427
      }
428
      else if( singleQuoteDelta == unresolved.size() ) {
429
        for( final var i = unresolved.iterator(); i.hasNext(); ) {
430
          final var closing = i.next();
431
          consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
432
          i.remove();
433
        }
434
      }
435
      else if( unresolved.size() == 2 ) {
436
        final var closing = unresolved.get( 0 );
437
        final var opening = unresolved.get( 1 );
438
        consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
439
        consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
440
441
        // Doesn't affect the algorithm.
442
        unresolved.clear();
443
      }
444
    }
445
    else if( singleQuoteDelta == 0 && !singleQuoteEmpty ||
446
      doubleQuoteDelta == 0 && !doubleQuoteEmpty ) {
447
      // An apostrophe stands betwixt opening/closing single quotes.
448
      for( final var lexemes = unresolved.iterator(); lexemes.hasNext(); ) {
449
        final var quote = lexemes.next()[ 1 ];
450
451
        for( int i = 0; i < mOpeningSingleQuotes.size(); i++ ) {
452
          // An apostrophe must fall between an open/close pair.
453
          final var openingQuote = mOpeningSingleQuotes.get( i );
454
          final var closingQuote = mClosingSingleQuotes.get( i );
455
456
          if( openingQuote.before( quote ) && closingQuote.after( quote ) ) {
457
            consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) );
458
            lexemes.remove();
459
          }
460
        }
461
      }
462
463
      // An apostrophe stands betwixt opening/closing double quotes.
464
      final var lexemes = unresolved.iterator();
465
466
      while( lexemes.hasNext() ) {
467
        final var quote = lexemes.next()[ 1 ];
468
469
        // Prevent an index out of bounds exception.
470
        final var len = Math.min(
471
          mOpeningDoubleQuotes.size(),
472
          mClosingDoubleQuotes.size()
473
        );
474
475
        for( int i = 0; i < len; i++ ) {
476
          // An apostrophe must fall between an open/close pair.
477
          final var openingQuote = mOpeningDoubleQuotes.get( i );
478
          final var closingQuote = mClosingDoubleQuotes.get( i );
479
480
          if( openingQuote.before( quote ) && closingQuote.after( quote ) ) {
481
            consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) );
482
483
            try {
484
              lexemes.remove();
485
            } catch( final Exception ex ) {
486
              // Weird edge case that hasn't been tracked down. Doesn't affect
487
              // the unit tests.
488
              break;
489
            }
475 490
          }
476 491
        }
M src/test/java/com/whitemagicsoftware/keenquotes/KeenQuotesTest.java
85 85
  }
86 86
87
  @SuppressWarnings( "unused" )
88 87
  static Stream<Arguments> param_XmlParse_StraightQuotes_CurlyQuotes() {
89 88
    return Stream.of(
M src/test/resources/com/whitemagicsoftware/keenquotes/smartypants.txt
32 32
12&Prime; record, 5&prime;10&Prime; height
33 33
34
Use 11.5"x14.25" virtual paper!
35
Use 11.5&Prime;x14.25&Prime; virtual paper!
34
Use 11.5"×14.25" virtual paper!
35
Use 11.5&Prime;×14.25&Prime; virtual paper!
36 36
37 37
Angular measure 3° 5' 30" means 3 degs, 5 arcmins, & 30 arcsecs.
38 38
Angular measure 3° 5&prime; 30&Prime; means 3 degs, 5 arcmins, & 30 arcsecs.
39
40
Play 12½" records.
41
Play 12½&Prime; records.
39 42
40 43
# ########################################################################
...
70 73
"Will'll invite?" Mrs. Thorne asked;\n"because he's coming--he's at the gate."
71 74
&ldquo;Will&apos;ll invite?&rdquo; Mrs. Thorne asked;\n&ldquo;because he&apos;s coming--he&apos;s at the gate.&rdquo;
75
76
Model "T2000"
77
Model &ldquo;T2000&rdquo;
72 78
73 79
# ########################################################################