Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git
M README.md
48 48
    gradle clean build
49 49
50
## Library
51
52
To build a library for use with other software applications, run:
53
54
    gradle clean lib
55
56
Find the library at:
57
58
    build/lib/keenquotes.jar
59
50 60
M src/main/java/com/whitemagicsoftware/keenquotes/Converter.java
1
/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
1 2
package com.whitemagicsoftware.keenquotes;
3
4
import com.whitemagicsoftware.keenquotes.ParserFactory.ParserType;
2 5
3 6
import java.util.ArrayList;
...
45 48
  private final Contractions mContractions;
46 49
  private final Map<TokenType, String> mReplacements;
50
  private final ParserFactory mFactory;
47 51
48 52
  /**
49 53
   * Maps quotes to HTML entities.
50 54
   *
51 55
   * @param unresolved Consumes {@link Lexeme}s that could not be converted
52 56
   *                   into HTML entities.
57
   * @param parserType Creates a parser based on document content structure.
53 58
   */
54
  public Converter( final Consumer<Lexeme> unresolved ) {
55
    this( unresolved, new Contractions.Builder().build() );
59
  public Converter(
60
    final Consumer<Lexeme> unresolved, final ParserType parserType ) {
61
    this( unresolved, new Contractions.Builder().build(), parserType );
56 62
  }
57 63
58 64
  /**
59 65
   * Maps quotes to HTML entities.
60 66
   *
61 67
   * @param unresolved Consumes {@link Lexeme}s that could not be converted
62 68
   *                   into HTML entities.
69
   * @param parserType Creates a parser based on document content structure.
63 70
   */
64 71
  public Converter(
65 72
    final Consumer<Lexeme> unresolved,
66
    final Map<TokenType, String> replacements ) {
67
    this( unresolved, new Contractions.Builder().build(), replacements );
73
    final Map<TokenType, String> replacements,
74
    final ParserType parserType ) {
75
    this(
76
      unresolved, new Contractions.Builder().build(), replacements, parserType
77
    );
68 78
  }
69 79
70 80
  /**
71 81
   * Maps quotes to HTML entities.
72 82
   *
73 83
   * @param unresolved Consumes {@link Lexeme}s that could not be converted
74 84
   *                   into HTML entities.
75 85
   * @param c          Contractions listings.
86
   * @param parserType Creates a parser based on document content structure.
76 87
   */
77
  public Converter( final Consumer<Lexeme> unresolved, final Contractions c ) {
78
    this( unresolved, c, ENTITIES );
88
  public Converter(
89
    final Consumer<Lexeme> unresolved,
90
    final Contractions c,
91
    final ParserType parserType ) {
92
    this( unresolved, c, ENTITIES, parserType );
79 93
  }
80 94
...
91 105
    final Consumer<Lexeme> unresolved,
92 106
    final Contractions c,
93
    final Map<TokenType, String> replacements ) {
107
    final Map<TokenType, String> replacements,
108
    final ParserType parserType ) {
94 109
    mUnresolved = unresolved;
95 110
    mContractions = c;
96 111
    mReplacements = replacements;
112
    mFactory = new ParserFactory( parserType );
97 113
  }
98 114
...
108 124
  @Override
109 125
  public String apply( final String text ) {
110
    final var parser = new Parser( text, mContractions );
126
    final var parser = mFactory.createParser( text, mContractions );
111 127
    final var tokens = new ArrayList<Token>();
112 128
M src/main/java/com/whitemagicsoftware/keenquotes/KeenQuotes.java
8 8
import java.util.Properties;
9 9
10
import static com.whitemagicsoftware.keenquotes.ParserFactory.ParserType.PARSER_PLAIN;
10 11
import static java.lang.String.format;
11 12
import static java.lang.System.*;
...
41 42
    else {
42 43
      try {
43
        out.print( convert( new Converter( err::println, contractions ) ) );
44
        final var c = new Converter( err::println, contractions, PARSER_PLAIN );
45
        out.print( convert( c ) );
44 46
      } catch( final Exception ex ) {
45 47
        ex.printStackTrace( err );
M src/main/java/com/whitemagicsoftware/keenquotes/LexemeType.java
25 25
  PERIOD,
26 26
  ELLIPSIS,
27
  FLAG
27
  FLAG,
28
  TAG
28 29
}
29 30
M src/main/java/com/whitemagicsoftware/keenquotes/Lexer.java
22 22
23 23
  /**
24
   * Default constructor, no state.
24
   * Constructs a {@link Lexer} capable of turning text into {@link Lexeme}s.
25
   *
26
   * @param text The text to lex.
25 27
   */
26
  public Lexer( final String text ) {
28
  Lexer( final String text ) {
27 29
    mIterator = new StringCharacterIterator( text );
28 30
  }
29 31
30
  public Lexeme next() {
32
  Lexeme next() {
31 33
    return parse( mIterator );
32 34
  }
...
41 43
   * @return The next token in the sequence.
42 44
   */
43
  private Lexeme parse( final CharacterIterator i ) {
45
  Lexeme parse( final CharacterIterator i ) {
44 46
    int began = i.getIndex();
45 47
    boolean isWord = false;
46 48
    Lexeme lexeme = null;
47 49
48 50
    do {
51
      // Allow subclasses to skip character sequences. This allows XML tags
52
      // to be skipped.
53
      if( skip( i ) ) {
54
        began = i.getIndex();
55
      }
56
49 57
      final var curr = i.current();
50 58
...
152 160
153 161
    return lexeme;
162
  }
163
164
  boolean skip( final CharacterIterator i ) {
165
    return false;
154 166
  }
155 167
...
193 205
  }
194 206
195
  private static char peek( final CharacterIterator ci ) {
207
  static char peek( final CharacterIterator ci ) {
196 208
    final var ch = ci.next();
197 209
    ci.previous();
...
206 218
   * @return The number of characters parsed.
207 219
   */
208
  private static int slurp(
220
  static int slurp(
209 221
    final CharacterIterator ci,
210 222
    final BiFunction<Character, CharacterIterator, Boolean> f ) {
M src/main/java/com/whitemagicsoftware/keenquotes/Parser.java
16 16
 * Converts straight double/single quotes and apostrophes to curly equivalents.
17 17
 */
18
public final class Parser {
19
  /**
20
   * Single quotes preceded by these {@link LexemeType}s may be opening quotes.
21
   */
22
  private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE =
23
    new LexemeType[]{SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP};
24
25
  /**
26
   * Single quotes succeeded by these {@link LexemeType}s may be opening quotes.
27
   */
28
  private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE =
29
    new LexemeType[]{WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE};
30
31
  /**
32
   * Single quotes preceded by these {@link LexemeType}s may be closing quotes.
33
   */
34
  private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE =
35
    new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE};
36
37
  /**
38
   * Single quotes succeeded by these {@link LexemeType}s may be closing quotes.
39
   */
40
  private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE =
41
    new LexemeType[]{SPACE, HYPHEN, DASH,
42
      QUOTE_DOUBLE, CLOSING_GROUP, EOL, EOP};
43
44
  /**
45
   * Double quotes preceded by these {@link LexemeType}s may be opening quotes.
46
   */
47
  private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE =
48
    new LexemeType[]{SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL,
49
      EOP};
50
51
  /**
52
   * Double quotes succeeded by these {@link LexemeType}s may be opening quotes.
53
   */
54
  private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE =
55
    new LexemeType[]{WORD, NUMBER, ELLIPSIS, OPENING_GROUP,
56
      QUOTE_SINGLE, QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING, QUOTE_DOUBLE};
57
58
  /**
59
   * Double quotes preceded by these {@link LexemeType}s may be closing quotes.
60
   */
61
  private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE =
62
    new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS, CLOSING_GROUP,
63
      QUOTE_SINGLE, QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING};
64
65
  /**
66
   * Double quotes succeeded by these {@link LexemeType}s may be closing quotes.
67
   */
68
  private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE =
69
    new LexemeType[]{SPACE, PUNCT, PERIOD, EQUALS, HYPHEN, DASH,
70
      QUOTE_SINGLE, CLOSING_GROUP, EOL, EOP};
71
72
  /**
73
   * The text to parse. A reference is required as a minor optimization in
74
   * memory and speed: the lexer records integer offsets, rather than new
75
   * {@link String} instances, to track parsed lexemes.
76
   */
77
  private final String mText;
78
79
  /**
80
   * Converts a string into an iterable list of {@link Lexeme} instances.
81
   */
82
  private final Lexer mLexer;
83
84
  /**
85
   * Sets of contractions that help disambiguate single quotes in the text.
86
   * These are effectively immutable while parsing.
87
   */
88
  private final Contractions sContractions;
89
90
  /**
91
   * Contains each emitted opening single quote per paragraph.
92
   */
93
  private final List<Lexeme> mOpeningSingleQuotes = new ArrayList<>();
94
95
  /**
96
   * Contains each emitted closing single quote per paragraph.
97
   */
98
  private final List<Lexeme> mClosingSingleQuotes = new ArrayList<>();
99
100
  /**
101
   * Contains each emitted opening double quote per paragraph.
102
   */
103
  private final List<Lexeme> mOpeningDoubleQuotes = new ArrayList<>();
104
105
  /**
106
   * Contains each emitted closing double quote per paragraph.
107
   */
108
  private final List<Lexeme> mClosingDoubleQuotes = new ArrayList<>();
109
110
  /**
111
   * Constructs a new {@link Parser} using the default contraction sets
112
   * to help resolve some ambiguous scenarios.
113
   *
114
   * @param text         The prose to parse, containing zero or more quotation
115
   *                     characters.
116
   * @param contractions Custom sets of contractions to help resolve
117
   *                     ambiguities.
118
   */
119
  public Parser( final String text, final Contractions contractions ) {
120
    mText = text;
121
    mLexer = new Lexer( mText );
122
    sContractions = contractions;
123
  }
124
125
  /**
126
   * Iterates over the entire text provided at construction, emitting
127
   * {@link Token}s that can be used to convert straight quotes to curly
128
   * quotes.
129
   *
130
   * @param tokenConsumer Receives emitted {@link Token}s.
131
   */
132
  public void parse(
133
    final Consumer<Token> tokenConsumer,
134
    final Consumer<Lexeme> lexemeConsumer ) {
135
    final var lexemes = new CircularFifoQueue<Lexeme>( 3 );
136
137
    // Allow consuming the very first token without needing a queue size check.
138
    flush( lexemes );
139
140
    final var unresolved = new ArrayList<Lexeme[]>();
141
    Lexeme lexeme;
142
143
    // Create and convert a list of all unambiguous quote characters.
144
    while( (lexeme = mLexer.next()) != EOT ) {
145
      // Reset after tokenizing a paragraph.
146
      if( tokenize( lexeme, lexemes, tokenConsumer, unresolved ) ) {
147
        // Attempt to resolve any remaining ambiguous quotes.
148
        resolve( unresolved, tokenConsumer );
149
150
        // Notify of any ambiguous quotes that could not be resolved.
151
        unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) );
152
        unresolved.clear();
153
        mOpeningSingleQuotes.clear();
154
        mClosingSingleQuotes.clear();
155
        mOpeningDoubleQuotes.clear();
156
        mClosingDoubleQuotes.clear();
157
      }
158
    }
159
160
    // By loop's end, the lexemes list contains tokens for all except the
161
    // final two elements (from tokenizing in triplets). Tokenize the remaining
162
    // unprocessed lexemes.
163
    tokenize( EOT, lexemes, tokenConsumer, unresolved );
164
    tokenize( EOT, lexemes, tokenConsumer, unresolved );
165
166
    // Attempt to resolve any remaining ambiguous quotes.
167
    resolve( unresolved, tokenConsumer );
168
169
    // Notify of any ambiguous quotes that could not be resolved.
170
    unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) );
171
  }
172
173
  /**
174
   * Converts {@link Lexeme}s identified as straight quotes into {@link Token}s
175
   * that represent the curly equivalent. The {@link Token}s are passed to
176
   * the given {@link Consumer} for further processing (e.g., replaced in
177
   * the original text being parsed).
178
   *
179
   * @param lexeme     A part of the text being parsed.
180
   * @param lexemes    A 3-element queue of lexemes that provide sufficient
181
   *                   context to identify curly quotes.
182
   * @param consumer   Recipient of equivalent quotes.
183
   * @param unresolved Rolling list of potentially ambiguous {@link Lexeme}s
184
   *                   that could not be tokenized, yet.
185
   * @return {@code true} if an end-of-paragraph is detected.
186
   */
187
  private boolean tokenize( final Lexeme lexeme,
188
                            final CircularFifoQueue<Lexeme> lexemes,
189
                            final Consumer<Token> consumer,
190
                            final List<Lexeme[]> unresolved ) {
191
    // Add the next lexeme to tokenize into the queue for immediate processing.
192
    lexemes.add( lexeme );
193
194
    final var lex1 = lexemes.get( 0 );
195
    final var lex2 = lexemes.get( 1 );
196
    final var lex3 = lexemes.get( 2 );
197
198
    if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( WORD ) &&
199
      lex1.isType( WORD, PERIOD, NUMBER ) ) {
200
      // Examples: y'all, Ph.D.'ll, 20's, she's
201
      consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
202
    }
203
    else if( lex1.isType( QUOTE_SINGLE ) && lex3.isType( QUOTE_SINGLE ) &&
204
      "n".equalsIgnoreCase( lex2.toString( mText ) ) ) {
205
      // I.e., 'n'
206
      consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
207
      consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
208
      flush( lexemes );
209
      truncate( unresolved );
210
    }
211
    else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) {
212
      if( lex3.isType( QUOTE_SINGLE ) ) {
213
        // E.g., 2''
214
        consumer.accept(
215
          new Token( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ) );
216
        flush( lexemes );
217
      }
218
      else {
219
        // E.g., 2'
220
        consumer.accept( new Token( QUOTE_PRIME_SINGLE, lex2 ) );
221
      }
222
    }
223
    else if( lex2.isType( QUOTE_DOUBLE ) && lex1.isType( NUMBER ) ) {
224
      // E.g., 2"
225
      consumer.accept( new Token( QUOTE_PRIME_DOUBLE, lex2 ) );
226
    }
227
    else if( lex2.isType( WORD ) && lex3.isType( QUOTE_SINGLE ) &&
228
      sContractions.endedUnambiguously( lex2.toString( mText ) ) ) {
229
      // E.g., thinkin'
230
      consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
231
      flush( lexemes );
232
    }
233
    else if( lex2.isType( NUMBER ) && lex1.isType( QUOTE_SINGLE ) ) {
234
      // Sentences must be re-written to avoid starting with numerals.
235
      if( lex3.isType( SPACE, PUNCT ) || (lex3.isType( WORD ) &&
236
        lex3.toString( mText ).equalsIgnoreCase( "s" )) ) {
237
        // Examples: '20s, '02
238
        consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
239
      }
240
      else {
241
        // E.g., '2''
242
        consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex1 ) );
243
        mOpeningSingleQuotes.add( lex1 );
244
      }
245
246
      truncate( unresolved );
247
    }
248
    else if( lex2.isType( QUOTE_SINGLE ) &&
249
      lex1.isType( PUNCT, PERIOD, ELLIPSIS, DASH ) &&
250
      (lex3.isType( EOL, EOP ) || lex3.isEot()) ) {
251
      consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
252
      mClosingSingleQuotes.add( lex2 );
253
    }
254
    else if( lex1.isType( ESC_SINGLE ) ) {
255
      // E.g., \'
256
      consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) );
257
    }
258
    else if( lex1.isType( ESC_DOUBLE ) ) {
259
      // E.g., \"
260
      consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) );
261
262
      if( lex2.isType( QUOTE_SINGLE ) &&
263
        (lex3.isEot() || lex3.isType( SPACE, DASH, EOL, EOP )) ) {
264
        consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
265
        mClosingSingleQuotes.add( lex2 );
266
      }
267
    }
268
    else if( lex2.isType( QUOTE_DOUBLE ) &&
269
      (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_DOUBLE )) &&
270
      lex3.isType( LAGGING_QUOTE_OPENING_DOUBLE ) ) {
271
      // Examples: "", "..., "word, ---"word
272
      consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) );
273
      mOpeningDoubleQuotes.add( lex2 );
274
    }
275
    else if( lex2.isType( QUOTE_DOUBLE ) &&
276
      lex1.isType( LEADING_QUOTE_CLOSING_DOUBLE ) &&
277
      (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) {
278
      // Examples: ..."', word"', ?"', word"?
279
      consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) );
280
      mClosingDoubleQuotes.add( lex2 );
281
    }
282
    else if( lex1.isType( WORD ) && lex2.isType( QUOTE_SINGLE ) &&
283
      lex3.isType( PUNCT, PERIOD ) ) {
284
      // E.g., word', (contraction ruled out by previous conditions)
285
      consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
286
      mClosingSingleQuotes.add( lex2 );
287
    }
288
    else if( lex2.isType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
289
      // After tokenizing, the parser will attempt to resolve ambiguities.
290
      unresolved.add( new Lexeme[]{lex1, lex2, lex3} );
291
    }
292
293
    // Suggest to the caller that resolution should be performed. This allows
294
    // the algorithm to reset the opening/closing quote balance before the
295
    // next paragraph is parsed.
296
    return lex3.isType( EOP );
297
  }
298
299
  private void resolve(
300
    final List<Lexeme[]> unresolved, final Consumer<Token> consumer ) {
301
    // Some non-emitted tokenized lexemes may be ambiguous.
302
    final var ambiguousLeadingQuotes = new ArrayList<Lexeme[]>( 16 );
303
    final var ambiguousLaggingQuotes = new ArrayList<Lexeme[]>( 16 );
304
    var resolvedLeadingQuotes = 0;
305
    var resolvedLaggingQuotes = 0;
306
307
    // Count the number of ambiguous and non-ambiguous open single quotes.
308
    for( var i = unresolved.iterator(); i.hasNext(); ) {
309
      final var quotes = i.next();
310
      final var lex1 = quotes[ 0 ];
311
      final var lex2 = quotes[ 1 ];
312
      final var lex3 = quotes[ 2 ];
313
314
      if( lex2.isType( QUOTE_SINGLE ) ) {
315
        final var word1 = lex1 == SOT ? "" : lex1.toString( mText );
316
        final var word3 = lex3 == EOT ? "" : lex3.toString( mText );
317
318
        if( sContractions.beganAmbiguously( word3 ) ) {
319
          // E.g., 'Cause
320
          if( lex1.isType( QUOTE_SINGLE ) ) {
321
            // E.g., ''Cause
322
            consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
323
            i.remove();
324
          }
325
          else {
326
            // The contraction is uncertain until a closing quote is found that
327
            // may balance this single quote.
328
            ambiguousLeadingQuotes.add( quotes );
329
          }
330
        }
331
        else if( sContractions.beganUnambiguously( word3 ) ) {
332
          // The quote mark forms a word that does not stand alone from its
333
          // contraction. For example, twas is not a word: it's 'twas.
334
          consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
335
          i.remove();
336
        }
337
        else if( sContractions.endedAmbiguously( word1 ) ) {
338
          ambiguousLaggingQuotes.add( quotes );
339
        }
340
        else if( (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_SINGLE )) &&
341
          lex3.isType( LAGGING_QUOTE_OPENING_SINGLE ) ) {
342
          consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) );
343
          resolvedLeadingQuotes++;
344
          mOpeningSingleQuotes.add( lex2 );
345
          i.remove();
346
        }
347
        else if( lex1.isType( LEADING_QUOTE_CLOSING_SINGLE ) &&
348
          (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_SINGLE )) ) {
349
          consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
350
          resolvedLaggingQuotes++;
351
          mClosingSingleQuotes.add( lex2 );
352
          i.remove();
353
        }
354
        else if( lex3.isType( NUMBER ) ) {
355
          // E.g., '04
356
          ambiguousLeadingQuotes.add( quotes );
357
        }
358
      }
359
    }
360
361
    sort( mOpeningSingleQuotes );
362
    sort( mClosingSingleQuotes );
363
    sort( mOpeningDoubleQuotes );
364
    sort( mClosingDoubleQuotes );
365
366
    final var singleQuoteEmpty =
367
      mOpeningSingleQuotes.isEmpty() || mClosingSingleQuotes.isEmpty();
368
    final var doubleQuoteEmpty =
369
      mOpeningDoubleQuotes.isEmpty() || mClosingDoubleQuotes.isEmpty();
370
371
    final var singleQuoteDelta = abs(
372
      mClosingSingleQuotes.size() - mOpeningSingleQuotes.size()
373
    );
374
375
    final var doubleQuoteDelta = abs(
376
      mClosingDoubleQuotes.size() - mOpeningDoubleQuotes.size()
377
    );
378
379
    final var ambiguousLeadingCount = ambiguousLeadingQuotes.size();
380
    final var ambiguousLaggingCount = ambiguousLaggingQuotes.size();
381
382
    if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) {
383
      if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) {
384
        final var balanced = singleQuoteDelta == 0;
385
        final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE;
386
        final var lex = ambiguousLaggingQuotes.get( 0 );
387
        consumer.accept( new Token( quote, lex[ 1 ] ) );
388
        unresolved.remove( lex );
389
      }
390
      else if( ambiguousLeadingCount == 0 && unresolved.size() == 1 ) {
391
        // Must be a closing quote.
392
        final var closing = unresolved.get( 0 );
393
        consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
394
        unresolved.remove( closing );
395
      }
396
    }
397
    else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) {
398
      // If there are no ambiguous leading quotes then all ambiguous lagging
399
      // quotes must be contractions.
400
      ambiguousLaggingQuotes.forEach(
401
        lex -> {
402
          consumer.accept( new Token( QUOTE_APOSTROPHE, lex[ 1 ] ) );
403
          unresolved.remove( lex );
404
        }
405
      );
406
    }
407
    else if( mOpeningSingleQuotes.size() == 0 &&
408
      mClosingSingleQuotes.size() == 1 && !unresolved.isEmpty() ) {
409
      final var opening = unresolved.get( 0 );
410
      consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
411
      unresolved.remove( opening );
412
    }
413
    else if( ambiguousLeadingCount == 0 ) {
414
      if( resolvedLaggingQuotes < resolvedLeadingQuotes ) {
415
        for( final var i = unresolved.iterator(); i.hasNext(); ) {
416
          final var closing = i.next()[ 1 ];
417
          consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing ) );
418
          i.remove();
419
        }
420
      }
421
      else if( singleQuoteDelta == unresolved.size() ) {
422
        for( final var i = unresolved.iterator(); i.hasNext(); ) {
423
          final var closing = i.next();
424
          consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
425
          i.remove();
426
        }
427
      }
428
      else if( unresolved.size() == 2 ) {
429
        final var closing = unresolved.get( 0 );
430
        final var opening = unresolved.get( 1 );
431
        consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
432
        consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
433
434
        // Doesn't affect the algorithm.
435
        unresolved.clear();
436
      }
437
    }
438
    else if( (singleQuoteDelta == 0 && !singleQuoteEmpty) ||
439
      (doubleQuoteDelta == 0 && !doubleQuoteEmpty) ) {
440
      // An apostrophe stands betwixt opening/closing single quotes.
441
      for( final var lexemes = unresolved.iterator(); lexemes.hasNext(); ) {
442
        final var quote = lexemes.next()[ 1 ];
443
444
        for( int i = 0; i < mOpeningSingleQuotes.size(); i++ ) {
445
          // An apostrophe must fall between an open/close pair.
446
          final var openingQuote = mOpeningSingleQuotes.get( i );
447
          final var closingQuote = mClosingSingleQuotes.get( i );
448
449
          if( openingQuote.before( quote ) && closingQuote.after( quote ) ) {
450
            consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) );
451
            lexemes.remove();
452
          }
453
        }
454
      }
455
456
      // An apostrophe stands betwixt opening/closing double quotes.
457
      for( final var lexemes = unresolved.iterator(); lexemes.hasNext(); ) {
458
        final var quote = lexemes.next()[ 1 ];
459
460
        for( int i = 0; i < mOpeningDoubleQuotes.size(); i++ ) {
461
          // An apostrophe must fall between an open/close pair.
462
          final var openingQuote = mOpeningDoubleQuotes.get( i );
463
          final var closingQuote = mClosingDoubleQuotes.get( i );
464
465
          if( openingQuote.before( quote ) && closingQuote.after( quote ) ) {
466
            consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) );
467
            lexemes.remove();
468
          }
469
        }
470
      }
471
    }
472
    else if( ambiguousLeadingCount == 1 && resolvedLaggingQuotes == 1 ) {
473
      final var opening = ambiguousLeadingQuotes.get( 0 );
474
      consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
475
      unresolved.remove( opening );
476
    }
18
public class Parser {
19
  /**
20
   * Single quotes preceded by these {@link LexemeType}s may be opening quotes.
21
   */
22
  private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE =
23
    new LexemeType[]{SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP};
24
25
  /**
26
   * Single quotes succeeded by these {@link LexemeType}s may be opening quotes.
27
   */
28
  private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE =
29
    new LexemeType[]{WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE};
30
31
  /**
32
   * Single quotes preceded by these {@link LexemeType}s may be closing quotes.
33
   */
34
  private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE =
35
    new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE};
36
37
  /**
38
   * Single quotes succeeded by these {@link LexemeType}s may be closing quotes.
39
   */
40
  private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE =
41
    new LexemeType[]{SPACE, HYPHEN, DASH,
42
      QUOTE_DOUBLE, CLOSING_GROUP, EOL, EOP};
43
44
  /**
45
   * Double quotes preceded by these {@link LexemeType}s may be opening quotes.
46
   */
47
  private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE =
48
    new LexemeType[]{SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL,
49
      EOP};
50
51
  /**
52
   * Double quotes succeeded by these {@link LexemeType}s may be opening quotes.
53
   */
54
  private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE =
55
    new LexemeType[]{WORD, NUMBER, ELLIPSIS, OPENING_GROUP,
56
      QUOTE_SINGLE, QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING, QUOTE_DOUBLE};
57
58
  /**
59
   * Double quotes preceded by these {@link LexemeType}s may be closing quotes.
60
   */
61
  private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE =
62
    new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS, CLOSING_GROUP,
63
      QUOTE_SINGLE, QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING};
64
65
  /**
66
   * Double quotes succeeded by these {@link LexemeType}s may be closing quotes.
67
   */
68
  private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE =
69
    new LexemeType[]{SPACE, PUNCT, PERIOD, EQUALS, HYPHEN, DASH,
70
      QUOTE_SINGLE, CLOSING_GROUP, EOL, EOP};
71
72
  /**
73
   * The text to parse. A reference is required as a minor optimization in
74
   * memory and speed: the lexer records integer offsets, rather than new
75
   * {@link String} instances, to track parsed lexemes.
76
   */
77
  private final String mText;
78
79
  /**
80
   * Converts a string into an iterable list of {@link Lexeme} instances.
81
   */
82
  private final Lexer mLexer;
83
84
  /**
85
   * Sets of contractions that help disambiguate single quotes in the text.
86
   * These are effectively immutable while parsing.
87
   */
88
  private final Contractions sContractions;
89
90
  /**
91
   * Contains each emitted opening single quote per paragraph.
92
   */
93
  private final List<Lexeme> mOpeningSingleQuotes = new ArrayList<>();
94
95
  /**
96
   * Contains each emitted closing single quote per paragraph.
97
   */
98
  private final List<Lexeme> mClosingSingleQuotes = new ArrayList<>();
99
100
  /**
101
   * Contains each emitted opening double quote per paragraph.
102
   */
103
  private final List<Lexeme> mOpeningDoubleQuotes = new ArrayList<>();
104
105
  /**
106
   * Contains each emitted closing double quote per paragraph.
107
   */
108
  private final List<Lexeme> mClosingDoubleQuotes = new ArrayList<>();
109
110
  /**
   * Constructs a new {@link Parser} that uses the given contraction sets
   * to help resolve some ambiguous scenarios.
   *
   * @param text         The prose to parse, containing zero or more quotation
   *                     characters.
   * @param contractions Custom sets of contractions to help resolve
   *                     ambiguities.
   */
  public Parser( final String text, final Contractions contractions ) {
    mText = text;

    // The lexer comes from an overridable factory method so that subclasses
    // can substitute their own Lexer. The call runs before any subclass
    // constructor body, so overrides must depend only on the given argument.
    mLexer = createLexer( mText );
    sContractions = contractions;
  }
124
125
  /**
126
   * Iterates over the entire text provided at construction, emitting
127
   * {@link Token}s that can be used to convert straight quotes to curly
128
   * quotes.
129
   *
130
   * @param tokenConsumer Receives emitted {@link Token}s.
131
   */
132
  public void parse(
133
    final Consumer<Token> tokenConsumer,
134
    final Consumer<Lexeme> lexemeConsumer ) {
135
    final var lexemes = new CircularFifoQueue<Lexeme>( 3 );
136
137
    // Allow consuming the very first token without needing a queue size check.
138
    flush( lexemes );
139
140
    final var unresolved = new ArrayList<Lexeme[]>();
141
    Lexeme lexeme;
142
143
    // Create and convert a list of all unambiguous quote characters.
144
    while( (lexeme = mLexer.next()) != EOT ) {
145
      // Reset after tokenizing a paragraph.
146
      if( tokenize( lexeme, lexemes, tokenConsumer, unresolved ) ) {
147
        // Attempt to resolve any remaining unambiguous quotes.
148
        resolve( unresolved, tokenConsumer );
149
150
        // Notify of any unambiguous quotes that could not be resolved.
151
        unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) );
152
        unresolved.clear();
153
        mOpeningSingleQuotes.clear();
154
        mClosingSingleQuotes.clear();
155
        mOpeningDoubleQuotes.clear();
156
        mClosingDoubleQuotes.clear();
157
      }
158
    }
159
160
    // By loop's end, the lexemes list contains tokens for all except the
161
    // final two elements (from tokenizing in triplets). Tokenize the remaining
162
    // unprocessed lexemes.
163
    tokenize( EOT, lexemes, tokenConsumer, unresolved );
164
    tokenize( EOT, lexemes, tokenConsumer, unresolved );
165
166
    // Attempt to resolve any remaining unambiguous quotes.
167
    resolve( unresolved, tokenConsumer );
168
169
    // Notify of any unambiguous quotes that could not be resolved.
170
    unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) );
171
  }
172
173
  /**
174
   * Converts {@link Lexeme}s identified as straight quotes into {@link Token}s
175
   * that represent the curly equivalent. The {@link Token}s are passed to
176
   * the given {@link Consumer} for further processing (e.g., replaced in
177
   * the original text being parsed).
178
   *
179
   * @param lexeme     A part of the text being parsed.
180
   * @param lexemes    A 3-element queue of lexemes that provide sufficient
181
   *                   context to identify curly quotes.
182
   * @param consumer   Recipient of equivalent quotes.
183
   * @param unresolved Rolling list of potentially ambiguous {@link Lexeme}s
184
   *                   that could not be tokenized, yet.
185
   * @return {@code true} if an end-of-paragraph is detected.
186
   */
187
  private boolean tokenize( final Lexeme lexeme,
188
                            final CircularFifoQueue<Lexeme> lexemes,
189
                            final Consumer<Token> consumer,
190
                            final List<Lexeme[]> unresolved ) {
191
    // Add the next lexeme to tokenize into the queue for immediate processing.
192
    lexemes.add( lexeme );
193
194
    final var lex1 = lexemes.get( 0 );
195
    final var lex2 = lexemes.get( 1 );
196
    final var lex3 = lexemes.get( 2 );
197
198
    if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( WORD ) &&
199
      lex1.isType( WORD, PERIOD, NUMBER ) ) {
200
      // Examples: y'all, Ph.D.'ll, 20's, she's
201
      consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
202
    }
203
    else if( lex1.isType( QUOTE_SINGLE ) && lex3.isType( QUOTE_SINGLE ) &&
204
      "n".equalsIgnoreCase( lex2.toString( mText ) ) ) {
205
      // I.e., 'n'
206
      consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
207
      consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
208
      flush( lexemes );
209
      truncate( unresolved );
210
    }
211
    else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) {
212
      if( lex3.isType( QUOTE_SINGLE ) ) {
213
        // E.g., 2''
214
        consumer.accept(
215
          new Token( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ) );
216
        flush( lexemes );
217
      }
218
      else {
219
        // E.g., 2'
220
        consumer.accept( new Token( QUOTE_PRIME_SINGLE, lex2 ) );
221
      }
222
    }
223
    else if( lex2.isType( QUOTE_DOUBLE ) && lex1.isType( NUMBER ) ) {
224
      // E.g., 2"
225
      consumer.accept( new Token( QUOTE_PRIME_DOUBLE, lex2 ) );
226
    }
227
    else if( lex2.isType( WORD ) && lex3.isType( QUOTE_SINGLE ) &&
228
      sContractions.endedUnambiguously( lex2.toString( mText ) ) ) {
229
      // E.g., thinkin'
230
      consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
231
      flush( lexemes );
232
    }
233
    else if( lex2.isType( NUMBER ) && lex1.isType( QUOTE_SINGLE ) ) {
234
      // Sentences must re-written to avoid starting with numerals.
235
      if( lex3.isType( SPACE, PUNCT ) || (lex3.isType( WORD ) &&
236
        lex3.toString( mText ).equalsIgnoreCase( "s" )) ) {
237
        // Examples: '20s, '02
238
        consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
239
      }
240
      else {
241
        // E.g., '2''
242
        consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex1 ) );
243
        mOpeningSingleQuotes.add( lex1 );
244
      }
245
246
      truncate( unresolved );
247
    }
248
    else if( lex2.isType( QUOTE_SINGLE ) &&
249
      lex1.isType( PUNCT, PERIOD, ELLIPSIS, DASH ) &&
250
      (lex3.isType( EOL, EOP ) || lex3.isEot()) ) {
251
      consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
252
      mClosingSingleQuotes.add( lex2 );
253
    }
254
    else if( lex1.isType( ESC_SINGLE ) ) {
255
      // E.g., \'
256
      consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) );
257
    }
258
    else if( lex1.isType( ESC_DOUBLE ) ) {
259
      // E.g., \"
260
      consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) );
261
262
      if( lex2.isType( QUOTE_SINGLE ) &&
263
        (lex3.isEot() || lex3.isType( SPACE, DASH, EOL, EOP )) ) {
264
        consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
265
        mClosingSingleQuotes.add( lex2 );
266
      }
267
    }
268
    else if( lex2.isType( QUOTE_DOUBLE ) &&
269
      (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_DOUBLE )) &&
270
      lex3.isType( LAGGING_QUOTE_OPENING_DOUBLE ) ) {
271
      // Examples: "", "..., "word, ---"word
272
      consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) );
273
      mOpeningDoubleQuotes.add( lex2 );
274
    }
275
    else if( lex2.isType( QUOTE_DOUBLE ) &&
276
      lex1.isType( LEADING_QUOTE_CLOSING_DOUBLE ) &&
277
      (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) {
278
      // Examples: ..."', word"', ?"', word"?
279
      consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) );
280
      mClosingDoubleQuotes.add( lex2 );
281
    }
282
    else if( lex1.isType( WORD ) && lex2.isType( QUOTE_SINGLE ) &&
283
      lex3.isType( PUNCT, PERIOD ) ) {
284
      // E.g., word', (contraction ruled out by previous conditions)
285
      consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
286
      mClosingSingleQuotes.add( lex2 );
287
    }
288
    else if( lex2.isType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
289
      // After tokenizing, the parser will attempt to resolve ambiguities.
290
      unresolved.add( new Lexeme[]{lex1, lex2, lex3} );
291
    }
292
293
    // Suggest to the caller that resolution should be performed. This allows
294
    // the algorithm to reset the opening/closing quote balance before the
295
    // next paragraph is parsed.
296
    return lex3.isType( EOP );
297
  }
298
299
  private void resolve(
300
    final List<Lexeme[]> unresolved, final Consumer<Token> consumer ) {
301
    // Some non-emitted tokenized lexemes may be ambiguous.
302
    final var ambiguousLeadingQuotes = new ArrayList<Lexeme[]>( 16 );
303
    final var ambiguousLaggingQuotes = new ArrayList<Lexeme[]>( 16 );
304
    var resolvedLeadingQuotes = 0;
305
    var resolvedLaggingQuotes = 0;
306
307
    // Count the number of ambiguous and non-ambiguous open single quotes.
308
    for( var i = unresolved.iterator(); i.hasNext(); ) {
309
      final var quotes = i.next();
310
      final var lex1 = quotes[ 0 ];
311
      final var lex2 = quotes[ 1 ];
312
      final var lex3 = quotes[ 2 ];
313
314
      if( lex2.isType( QUOTE_SINGLE ) ) {
315
        final var word1 = lex1 == SOT ? "" : lex1.toString( mText );
316
        final var word3 = lex3 == EOT ? "" : lex3.toString( mText );
317
318
        if( sContractions.beganAmbiguously( word3 ) ) {
319
          // E.g., 'Cause
320
          if( lex1.isType( QUOTE_SINGLE ) ) {
321
            // E.g., ''Cause
322
            consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
323
            i.remove();
324
          }
325
          else {
326
            // The contraction is uncertain until a closing quote is found that
327
            // may balance this single quote.
328
            ambiguousLeadingQuotes.add( quotes );
329
          }
330
        }
331
        else if( sContractions.beganUnambiguously( word3 ) ) {
332
          // The quote mark forms a word that does not stand alone from its
333
          // contraction. For example, twas is not a word: it's 'twas.
334
          consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
335
          i.remove();
336
        }
337
        else if( sContractions.endedAmbiguously( word1 ) ) {
338
          ambiguousLaggingQuotes.add( quotes );
339
        }
340
        else if( (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_SINGLE )) &&
341
          lex3.isType( LAGGING_QUOTE_OPENING_SINGLE ) ) {
342
          consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) );
343
          resolvedLeadingQuotes++;
344
          mOpeningSingleQuotes.add( lex2 );
345
          i.remove();
346
        }
347
        else if( lex1.isType( LEADING_QUOTE_CLOSING_SINGLE ) &&
348
          (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_SINGLE )) ) {
349
          consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
350
          resolvedLaggingQuotes++;
351
          mClosingSingleQuotes.add( lex2 );
352
          i.remove();
353
        }
354
        else if( lex3.isType( NUMBER ) ) {
355
          // E.g., '04
356
          ambiguousLeadingQuotes.add( quotes );
357
        }
358
      }
359
    }
360
361
    sort( mOpeningSingleQuotes );
362
    sort( mClosingSingleQuotes );
363
    sort( mOpeningDoubleQuotes );
364
    sort( mClosingDoubleQuotes );
365
366
    final var singleQuoteEmpty =
367
      mOpeningSingleQuotes.isEmpty() || mClosingSingleQuotes.isEmpty();
368
    final var doubleQuoteEmpty =
369
      mOpeningDoubleQuotes.isEmpty() || mClosingDoubleQuotes.isEmpty();
370
371
    final var singleQuoteDelta = abs(
372
      mClosingSingleQuotes.size() - mOpeningSingleQuotes.size()
373
    );
374
375
    final var doubleQuoteDelta = abs(
376
      mClosingDoubleQuotes.size() - mOpeningDoubleQuotes.size()
377
    );
378
379
    final var ambiguousLeadingCount = ambiguousLeadingQuotes.size();
380
    final var ambiguousLaggingCount = ambiguousLaggingQuotes.size();
381
382
    if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) {
383
      if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) {
384
        final var balanced = singleQuoteDelta == 0;
385
        final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE;
386
        final var lex = ambiguousLaggingQuotes.get( 0 );
387
        consumer.accept( new Token( quote, lex[ 1 ] ) );
388
        unresolved.remove( lex );
389
      }
390
      else if( ambiguousLeadingCount == 0 && unresolved.size() == 1 ) {
391
        // Must be a closing quote.
392
        final var closing = unresolved.get( 0 );
393
        consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
394
        unresolved.remove( closing );
395
      }
396
    }
397
    else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) {
398
      // If there are no ambiguous leading quotes then all ambiguous lagging
399
      // quotes must be contractions.
400
      ambiguousLaggingQuotes.forEach(
401
        lex -> {
402
          consumer.accept( new Token( QUOTE_APOSTROPHE, lex[ 1 ] ) );
403
          unresolved.remove( lex );
404
        }
405
      );
406
    }
407
    else if( mOpeningSingleQuotes.size() == 0 &&
408
      mClosingSingleQuotes.size() == 1 && !unresolved.isEmpty() ) {
409
      final var opening = unresolved.get( 0 );
410
      consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
411
      unresolved.remove( opening );
412
    }
413
    else if( ambiguousLeadingCount == 0 ) {
414
      if( resolvedLaggingQuotes < resolvedLeadingQuotes ) {
415
        for( final var i = unresolved.iterator(); i.hasNext(); ) {
416
          final var closing = i.next()[ 1 ];
417
          consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing ) );
418
          i.remove();
419
        }
420
      }
421
      else if( singleQuoteDelta == unresolved.size() ) {
422
        for( final var i = unresolved.iterator(); i.hasNext(); ) {
423
          final var closing = i.next();
424
          consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
425
          i.remove();
426
        }
427
      }
428
      else if( unresolved.size() == 2 ) {
429
        final var closing = unresolved.get( 0 );
430
        final var opening = unresolved.get( 1 );
431
        consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
432
        consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
433
434
        // Doesn't affect the algorithm.
435
        unresolved.clear();
436
      }
437
    }
438
    else if( (singleQuoteDelta == 0 && !singleQuoteEmpty) ||
439
      (doubleQuoteDelta == 0 && !doubleQuoteEmpty) ) {
440
      // An apostrophe stands betwixt opening/closing single quotes.
441
      for( final var lexemes = unresolved.iterator(); lexemes.hasNext(); ) {
442
        final var quote = lexemes.next()[ 1 ];
443
444
        for( int i = 0; i < mOpeningSingleQuotes.size(); i++ ) {
445
          // An apostrophe must fall between an open/close pair.
446
          final var openingQuote = mOpeningSingleQuotes.get( i );
447
          final var closingQuote = mClosingSingleQuotes.get( i );
448
449
          if( openingQuote.before( quote ) && closingQuote.after( quote ) ) {
450
            consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) );
451
            lexemes.remove();
452
          }
453
        }
454
      }
455
456
      // An apostrophe stands betwixt opening/closing double quotes.
457
      for( final var lexemes = unresolved.iterator(); lexemes.hasNext(); ) {
458
        final var quote = lexemes.next()[ 1 ];
459
460
        for( int i = 0; i < mOpeningDoubleQuotes.size(); i++ ) {
461
          // An apostrophe must fall between an open/close pair.
462
          final var openingQuote = mOpeningDoubleQuotes.get( i );
463
          final var closingQuote = mClosingDoubleQuotes.get( i );
464
465
          if( openingQuote.before( quote ) && closingQuote.after( quote ) ) {
466
            consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) );
467
            lexemes.remove();
468
          }
469
        }
470
      }
471
    }
472
    else if( ambiguousLeadingCount == 1 && resolvedLaggingQuotes == 1 ) {
473
      final var opening = ambiguousLeadingQuotes.get( 0 );
474
      consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
475
      unresolved.remove( opening );
476
    }
477
  }
478
479
  /**
480
   * Allow subclasses to change the type of {@link Lexer}
481
   *
482
   * @param text The text to lex.
483
   * @return A {@link Lexer} that can split the text into {@link Lexeme}s.
484
   */
485
  Lexer createLexer( final String text ) {
486
    return new Lexer( text );
477 487
  }
478 488
A src/main/java/com/whitemagicsoftware/keenquotes/ParserFactory.java
1
/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
2
package com.whitemagicsoftware.keenquotes;
3
4
/**
5
 * Responsible for creating new {@link Parser} instances based on the
6
 * {@link ParserType}. The document content format must be known in advance.
7
 */
8
public class ParserFactory {
9
  public enum ParserType {
10
    PARSER_PLAIN,
11
    PARSER_XML
12
  }
13
14
  private final ParserType mParserType;
15
16
  public ParserFactory( final ParserType parserType ) {
17
    mParserType = parserType;
18
  }
19
20
  public Parser createParser(
21
    final String text, final Contractions contractions ) {
22
23
    return mParserType == ParserType.PARSER_PLAIN
24
      ? new Parser( text, contractions )
25
      : new XmlParser( text, contractions );
26
  }
27
}
1 28
A src/main/java/com/whitemagicsoftware/keenquotes/XmlLexer.java
1
/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
2
package com.whitemagicsoftware.keenquotes;
3
4
import java.text.CharacterIterator;
5
6
/**
7
 * Responsible for lexing text while ignoring XML elements. The document must
8
 * be both sane and well-formed. This is not intended to lex documents in the
9
 * wild where the user has injected custom HTML. The lexer will fail if the
10
 * angle brackets are not balanced. Additionally, any less than or greater than
11
 * symbols must be encoded as {@code &lt;} or {@code &gt;}, respectively.
12
 */
13
final class XmlLexer extends Lexer {
14
  /**
15
   * Constructs a {@link Lexer} capable of turning text int {@link Lexeme}s.
16
   *
17
   * @param text The text to lex.
18
   */
19
  XmlLexer( final String text ) {
20
    super( text );
21
  }
22
23
  /**
24
   * Skip (do not emit) XML tags found within the prose. This effectively hides
25
   * the element.
26
   *
27
   * @param i The {@link CharacterIterator} used to scan through the text, one
28
   *          character at a time.
29
   */
30
  @Override
31
  boolean skip( final CharacterIterator i ) {
32
    final boolean match = i.current() == '<';
33
34
    if( match ) {
35
      slurp( i, ( next, ci ) -> next != '>' );
36
37
      // Swallow the trailing greater than symbol.
38
      i.next();
39
40
      // Skip to the character following the greater than symbol.
41
      i.next();
42
    }
43
44
    return match;
45
  }
46
}
1 47
A src/main/java/com/whitemagicsoftware/keenquotes/XmlParser.java
1
/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
2
package com.whitemagicsoftware.keenquotes;
3
4
/**
5
 * Turns text into {@link Lexeme}s, allowing XML elements to be ignored.
6
 */
7
public final class XmlParser extends Parser {
8
  /**
9
   * Constructs a new {@link Parser} using the default contraction sets
10
   * to help resolve some ambiguous scenarios.
11
   *
12
   * @param text         The prose to parse, containing zero or more quotation
13
   *                     characters.
14
   * @param contractions Custom sets of contractions to help resolve
15
   */
16
  public XmlParser(
17
    final String text, final Contractions contractions ) {
18
    super( text, contractions );
19
  }
20
21
  @Override
22
  public Lexer createLexer( final String text ) {
23
    return new XmlLexer( text );
24
  }
25
}
1 26
M src/test/java/com/whitemagicsoftware/keenquotes/KeenQuotesTest.java
2 2
package com.whitemagicsoftware.keenquotes;
3 3
4
import com.whitemagicsoftware.keenquotes.ParserFactory.ParserType;
4 5
import org.junit.jupiter.api.Disabled;
5 6
import org.junit.jupiter.api.Test;
6 7
import org.junit.jupiter.params.ParameterizedTest;
8
import org.junit.jupiter.params.provider.Arguments;
9
import org.junit.jupiter.params.provider.MethodSource;
7 10
import org.junit.jupiter.params.provider.ValueSource;
8 11
9 12
import java.io.BufferedReader;
10 13
import java.io.IOException;
11 14
import java.io.InputStreamReader;
15
import java.util.function.Consumer;
12 16
import java.util.function.Function;
17
import java.util.stream.Stream;
13 18
19
import static com.whitemagicsoftware.keenquotes.ParserFactory.ParserType.PARSER_PLAIN;
20
import static com.whitemagicsoftware.keenquotes.ParserFactory.ParserType.PARSER_XML;
14 21
import static java.lang.System.out;
15 22
import static org.junit.jupiter.api.Assertions.assertEquals;
16 23
import static org.junit.jupiter.api.Assertions.assertNotNull;
24
import static org.junit.jupiter.params.provider.Arguments.arguments;
17 25
18 26
/**
...
27 35
  @Disabled
28 36
  public void test_parse_SingleLine_Parsed() {
29
    final var converter = new Converter( out::println );
37
    final var converter = createConverter( out::println );
30 38
    out.println( converter.apply(
31 39
      "'A', 'B', and 'C' are letters."
...
40 48
  @Test
41 49
  public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException {
42
    testConverter( new Converter( ( lex ) -> {} ) );
50
    testConverter( createConverter( ( lex ) -> {} ) );
51
  }
52
53
  @ParameterizedTest
54
  @MethodSource( "param_XmlParse_StraightQuotes_CurlyQuotes" )
55
  public void test_XmlParse_StraightQuotes_CurlyQuotes(
56
    final String input, final String expected ) {
57
    final var converter = createConverter( out::println, PARSER_XML );
58
    final var actual = converter.apply( input );
59
    assertEquals( expected, actual );
43 60
  }
44 61
...
65 82
    }
66 83
67
    final var converter = new Converter( out::println );
84
    final var converter = createConverter( out::println );
68 85
    System.out.println( converter.apply( sb.toString() ) );
86
  }
87
88
  @SuppressWarnings( "unused" )
89
  static Stream<Arguments> param_XmlParse_StraightQuotes_CurlyQuotes() {
90
    return Stream.of(
91
      arguments(
92
        "<em>'twas</em>",
93
        "<em>&apos;twas</em>"
94
      ),
95
      arguments(
96
        "<bold>'twas</bold> redeemed for the <em>cat</em>'s eye",
97
        "<bold>&apos;twas</bold> redeemed for the <em>cat</em>&apos;s eye"
98
      ),
99
      arguments(
100
        "<a href=\"https://x.org\" title=\"X's Homepage\">X11's bomb</a>",
101
        "<a href=\"https://x.org\" title=\"X's Homepage\">X11&apos;s bomb</a>"
102
      ),
103
      arguments(
104
        "''<em>Twas</em> happening!'",
105
        "&lsquo;&apos;<em>Twas</em> happening!&rsquo;"
106
      )
107
    );
69 108
  }
70 109
...
127 166
128 167
    return new BufferedReader( new InputStreamReader( is ) );
168
  }
169
170
  private Function<String, String> createConverter(
171
    final Consumer<Lexeme> unresolved ) {
172
    return createConverter( unresolved, PARSER_PLAIN );
173
  }
174
175
  private Function<String, String> createConverter(
176
    final Consumer<Lexeme> unresolved, final ParserType parserType ) {
177
    return new Converter( unresolved, parserType );
129 178
  }
130 179
}
M src/test/java/com/whitemagicsoftware/keenquotes/LexerTest.java
4 4
import org.junit.jupiter.api.Test;
5 5
6
import java.util.Arrays;
7 6
import java.util.List;
8 7
import java.util.function.BiFunction;
9 8
10 9
import static com.whitemagicsoftware.keenquotes.Lexeme.EOT;
11 10
import static com.whitemagicsoftware.keenquotes.LexemeType.*;
11
import static java.util.Arrays.asList;
12 12
import static org.junit.jupiter.api.Assertions.assertEquals;
13 13
14 14
/**
15 15
 * Tests lexing words, numbers, punctuation, spaces, newlines, etc.
16 16
 */
17
class LexerTest {
17
final class LexerTest {
18
18 19
  @Test
19
  void test_Lexing_Words_TokenValues() {
20
  void test_Lexing_Words_LexemeValues() {
20 21
    testText( "abc 123", "abc", " ", "123" );
21 22
    testText( "-123 abc", "-123", " ", "abc" );
...
89 90
  }
90 91
91
  private void testType( final String actual, final LexemeType... expected ) {
92
    final var list = Arrays.asList( expected );
93
    testType( actual, ( lexeme, text ) -> lexeme.getType(), list );
92
  static void testType(
93
    final String actual, final LexemeType... expected ) {
94
    final var list = asList( expected );
95
    final var lexer = createLexer( actual );
96
    testType( lexer, actual, ( lexeme, text ) -> lexeme.getType(), list );
94 97
  }
95 98
96
  private void testText( final String actual, final String... expected ) {
97
    testType( actual, Lexeme::toString, Arrays.asList( expected ) );
99
  static void testText(
100
    final String actual, final String... expected ) {
101
    final var lexer = createLexer( actual );
102
    testType( lexer, actual, Lexeme::toString, asList( expected ) );
98 103
  }
99 104
100
  private <A, E> void testType(
105
  static void testType(
106
    final Lexer lexer, final String actual, final LexemeType... expected ) {
107
    final var list = asList( expected );
108
    testType( lexer, actual, ( lexeme, text ) -> lexeme.getType(), list );
109
  }
110
111
  private static <A, E> void testType(
112
    final Lexer lexer,
101 113
    final String text,
102 114
    final BiFunction<Lexeme, String, A> f,
103 115
    final List<E> elements ) {
104
    final var lexer = new Lexer( text );
105 116
    var counter = 0;
106 117
...
116 127
    // Ensure all expected values are matched (verify end of text reached).
117 128
    assertEquals( elements.size(), counter );
129
  }
130
131
  static Lexer createLexer( final String text ) {
132
    return new Lexer( text );
118 133
  }
119 134
}
M src/test/java/com/whitemagicsoftware/keenquotes/ParserTest.java
13 13
 * Test that all unambiguous apostrophes are emitted once.
14 14
 */
15
class ParserTest {
15
final class ParserTest {
16
16 17
  @SuppressWarnings( "TextBlockMigration" )
17 18
  private final static Map<String, Map<TokenType, Integer>> TEST_CASES =
A src/test/java/com/whitemagicsoftware/keenquotes/XmlLexerTest.java
1
/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
2
package com.whitemagicsoftware.keenquotes;
3
4
import org.junit.jupiter.api.Test;
5
6
import static com.whitemagicsoftware.keenquotes.LexemeType.*;
7
import static com.whitemagicsoftware.keenquotes.LexerTest.testType;
8
9
/**
10
 * Test that parsing XML documents ignores elements.
11
 */
12
final class XmlLexerTest {
13
14
  @Test
15
  void test_Lexing_Xml_EmitTags() {
16
    final var actual = "The <em>world's</em> aflame.";
17
    testType(
18
      createXmlLexer( actual ), actual,
19
      WORD, SPACE, WORD, QUOTE_SINGLE, WORD, SPACE, WORD, PERIOD
20
    );
21
  }
22
23
  @Test
24
  void test_Lexing_XmlAttribute_EmitTags() {
25
    final var actual = "<a href=\"http://x.org\">X11</a>";
26
    testType( createXmlLexer( actual ), actual, WORD, NUMBER );
27
  }
28
29
  static Lexer createXmlLexer( final String text ) {
30
    return new XmlLexer( text );
31
  }
32
}
1 33