Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git
M src/main/java/com/whitemagicsoftware/keenquotes/app/KeenQuotes.java
13 13
import static java.lang.String.format;
14 14
import static java.lang.System.*;
15
import static java.nio.charset.StandardCharsets.UTF_8;
15 16
import static picocli.CommandLine.Help.Ansi.Style.*;
16 17
import static picocli.CommandLine.Help.ColorScheme;
...
62 63
63 64
  private String convert( final Curler curler ) throws IOException {
64
    return curler.apply( new String( in.readAllBytes() ) );
65
    return curler.apply( new String( in.readAllBytes(), UTF_8 ) );
65 66
  }
66 67
M src/main/java/com/whitemagicsoftware/keenquotes/parser/AmbiguityResolver.java
64 64
      mTree = mTree.closing( token );
65 65
    }
66
    else if( token.isType( QUOTE_AMBIGUOUS_DOUBLE ) ) {
67
      // Create subtrees for: <" ... ">, <" ">, <"">, etc.
68
      if( mTree.hasOpeningDoubleQuote() ) {
69
        token.setTokenType( QUOTE_CLOSING_DOUBLE );
70
        mTree = mTree.closing( token );
71
      }
72
      else {
73
        token.setTokenType( QUOTE_OPENING_DOUBLE );
74
        mTree = mTree.opening( token );
75
      }
76
    }
66 77
    // Add ambiguous tokens to be resolved; add apostrophes for later emitting.
67 78
    else {
...
107 118
  private void resolve( final List<Token> tokens ) {
108 119
    assert tokens != null;
120
121
    boolean leader = false;
109 122
110 123
    for( final var token : tokens ) {
111 124
      if( token.isType( QUOTE_AMBIGUOUS_LEADING ) ) {
112 125
        // Once a leader quote is found, any laggard could be a closing quote.
113
        break;
126
        leader = true;
114 127
      }
115
      else if( token.isType( QUOTE_AMBIGUOUS_LAGGING ) ) {
128
      else if( !leader && token.isType( QUOTE_AMBIGUOUS_LAGGING ) ) {
116 129
        token.setTokenType( QUOTE_APOSTROPHE );
130
      }
131
      else if( token.isType( QUOTE_AMBIGUOUS_SINGLE ) ) {
132
        // Create subtrees for: <' ... '>, <' '>, <''>, etc.
133
        if( mTree.hasOpeningSingleQuote() ) {
134
          token.setTokenType( QUOTE_CLOSING_SINGLE );
135
          mTree = mTree.closing( token );
136
        }
137
        else {
138
          token.setTokenType( QUOTE_OPENING_SINGLE );
139
          mTree = mTree.opening( token );
140
        }
117 141
      }
118 142
    }
...
129 153
    final var countLeading = tree.count( QUOTE_AMBIGUOUS_LEADING );
130 154
    final var countLagging = tree.count( QUOTE_AMBIGUOUS_LAGGING );
131
    final var countUnknown = tree.count( AMBIGUOUS );
155
    final var countSingles = tree.count( QUOTE_AMBIGUOUS_SINGLE );
132 156
133 157
    if( tree.hasOpeningSingleQuote() && !tree.hasClosingSingleQuote() ) {
134
      if( countUnknown == 0 && countLeading == 0 && countLagging == 1 ) {
158
      if( countSingles == 0 && countLeading == 0 && countLagging == 1 ) {
135 159
        tree.replaceAll( QUOTE_AMBIGUOUS_LAGGING, QUOTE_CLOSING_SINGLE );
136 160
      }
137
      else if( countUnknown == 1 && countLagging == 0 ) {
138
        tree.replaceAll( AMBIGUOUS, QUOTE_CLOSING_SINGLE );
161
      else if( countSingles == 1 && countLagging == 0 ) {
162
        tree.replaceAll( QUOTE_AMBIGUOUS_SINGLE, QUOTE_CLOSING_SINGLE );
139 163
      }
140 164
    }
141 165
142
    if( countUnknown == 0 && countLeading == 1 && countLagging == 0 &&
166
    if( countSingles == 0 && countLeading == 1 && countLagging == 0 &&
143 167
      !tree.hasOpeningSingleQuote() && tree.hasClosingSingleQuote() ) {
144 168
      tree.replaceAll( QUOTE_AMBIGUOUS_LEADING, QUOTE_OPENING_SINGLE );
145 169
    }
146 170
147 171
    if( !tree.hasOpeningSingleQuote() && !tree.hasClosingSingleQuote() ||
148 172
      tree.isBalanced() ) {
149
      if( countUnknown == 0 && countLeading > 0 && countLagging == 0 ) {
173
      if( countSingles == 0 && countLeading > 0 && countLagging == 0 ) {
150 174
        tree.replaceAll( QUOTE_AMBIGUOUS_LEADING, QUOTE_APOSTROPHE );
151 175
      }
152 176
153
      if( countUnknown == 0 && countLeading == 0 && countLagging > 0 ) {
177
      if( countSingles == 0 && countLeading == 0 && countLagging > 0 ) {
154 178
        tree.replaceAll( QUOTE_AMBIGUOUS_LAGGING, QUOTE_APOSTROPHE );
155 179
      }
M src/main/java/com/whitemagicsoftware/keenquotes/parser/Contractions.java
97 97
  }
98 98
99
  public boolean beganEndedUmambiguously( final String word ) {
99
  public boolean beganEndedUnambiguously( final String word ) {
100 100
    assert word != null;
101 101
    return getBeganEndedUnambiguous().contains( word );
...
170 170
  public String toString() {
171 171
    return
172
      toString( getBeganAmbiguous(), "Ambiguous Began", "'%s" ) +
173
        toString( getEndedAmbiguous(), "Ambiguous Ended", "%s'" ) +
172
      toString( getBeganEndedUnambiguous(), "Unambiguous Began/Ended", "'%s" ) +
174 173
        toString( getBeganUnambiguous(), "Unambiguous Began", "'%s" ) +
175
        toString( getEndedUnambiguous(), "Unambiguous Ended", "%s'" );
174
        toString( getEndedUnambiguous(), "Unambiguous Ended", "%s'" ) +
175
        toString( getBeganAmbiguous(), "Ambiguous Began", "'%s" ) +
176
        toString( getEndedAmbiguous(), "Ambiguous Ended", "%s'" );
176 177
  }
177 178
178 179
  private String toString(
179
    final Set<String> words, final String category, final String fmt ) {
180
    final Set<String> words,
181
    final String category,
182
    final String fmt
183
  ) {
180 184
    final var sb = new StringBuilder( 16384 );
181 185
    final var newline = System.lineSeparator();
...
193 197
   */
194 198
  private static final Set<String> BEGAN_ENDED_UNAMBIGUOUS = Set.of(
199
    // hacking
195 200
    "ackin",
201
    // hammering
196 202
    "ammerin",
197 203
    // hankering
198 204
    "ankerin",
199 205
    // having
200 206
    "avin",
207
    // hawking
201 208
    "awkin",
202 209
    "cepin",
203 210
    "ceppin",
204 211
    "ceptin",
205 212
    "cordin",
206 213
    "eadin",
207 214
    "eavin",
208 215
    "elpin",
216
    // hindering
209 217
    "inderin",
210 218
    "lectioneerin",
219
    // amazing
211 220
    "mazin",
221
    // remembering
212 222
    "memberin",
213 223
    // fish 'n' chips
214 224
    "n",
225
    // hobbling
215 226
    "obblin",
227
    // holding
216 228
    "oldin",
229
    // hollering
217 230
    "ollerin",
231
    // hopping
218 232
    "oppin",
219 233
    "ousekeepin",
234
    // howling
220 235
    "owlin",
221 236
    "sceptin",
222 237
    "spectin",
223 238
    "splainin",
239
    // supposing
224 240
    "sposin",
225 241
    "sputin",
226 242
    "stonishin",
243
    // destroying
227 244
    "stroyin",
245
    // persuading
228 246
    "suadin",
229 247
    "titivatin",
248
    // introducing
230 249
    "troducin",
250
    // hugging
231 251
    "uggin",
252
    // hulking
232 253
    "ulkin",
254
    // humbugging
233 255
    "umbuggin",
256
    // humiliating
234 257
    "umiliatin",
258
    // humming
235 259
    "ummin",
260
    // humping
236 261
    "umpin",
262
    // hurrying
237 263
    "urryin",
264
    // hurting
238 265
    "urtin",
266
    // hustling
239 267
    "ustlin",
268
    // investigating
240 269
    "vestigatin",
270
    // inviting
241 271
    "vitin",
242 272
    "xceptin",
...
345 375
    "twould",
346 376
    "twouldn",
377
    // one
378
    "un",
347 379
    // have
348 380
    "ve",
M src/main/java/com/whitemagicsoftware/keenquotes/parser/QuoteEmitter.java
53 53
  };
54 54
55
  private static final LexemeType[] QUOTE_SINGLE_QUOTE_DOUBLE = {
56
    QUOTE_SINGLE, QUOTE_DOUBLE
57
  };
58
59
  /**
60
   * Single quotes preceded by these {@link LexemeType}s may be opening quotes.
61
   */
62
  private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE =
63
    new LexemeType[]{
64
      LexemeType.SOT, SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP
65
    };
66
67
  /**
68
   * Single quotes succeeded by these {@link LexemeType}s may be opening quotes.
69
   */
70
  private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE =
71
    new LexemeType[]{
72
      WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE
73
    };
74
75
  /**
76
   * Single quotes preceded by these {@link LexemeType}s may be closing quotes.
77
   */
78
  private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE =
79
    new LexemeType[]{
80
      WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE
81
    };
82
83
  /**
84
   * Single quotes succeeded by these {@link LexemeType}s may be closing quotes.
85
   */
86
  private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE =
87
    new LexemeType[]{
88
      SPACE, HYPHEN, DASH, PUNCT, PERIOD, ELLIPSIS, QUOTE_DOUBLE, CLOSING_GROUP,
89
      ENDING
90
    };
91
92
  /**
93
   * Double quotes preceded by these {@link LexemeType}s may be opening quotes.
94
   */
95
  private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE =
96
    new LexemeType[]{
97
      LexemeType.SOT, SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL, EOP
98
    };
99
100
  /**
101
   * Double quotes succeeded by these {@link LexemeType}s may be opening quotes.
102
   */
103
  private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE =
104
    new LexemeType[]{
105
      WORD, NUMBER, DASH, ELLIPSIS, OPENING_GROUP, QUOTE_SINGLE,
106
      QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING, QUOTE_DOUBLE
107
    };
108
109
  /**
110
   * Double quotes preceded by these {@link LexemeType}s may be closing quotes.
111
   */
112
  private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE =
113
    new LexemeType[]{
114
      WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS, CLOSING_GROUP, QUOTE_SINGLE,
115
      QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING
116
    };
117
118
  /**
119
   * Double quotes succeeded by these {@link LexemeType}s may be closing quotes.
120
   */
121
  private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE =
122
    new LexemeType[]{
123
      SPACE, PUNCT, PERIOD, EQUALS, HYPHEN, DASH, QUOTE_SINGLE, CLOSING_GROUP,
124
      ENDING
125
    };
126
127
  private final CircularFifoQueue<Lexeme> mQ = new CircularFifoQueue<>( 4 );
128
  private final String mText;
129
  private final Contractions mContractions;
130
  private final Consumer<Token> mConsumer;
131
132
  public QuoteEmitter(
133
    final String text,
134
    final Contractions contractions,
135
    final Consumer<Token> consumer
136
  ) {
137
    assert text != null;
138
    assert contractions != null;
139
140
    mText = text;
141
    mContractions = contractions;
142
    mConsumer = consumer;
143
  }
144
145
  /**
146
   * Scans the given text document for quotation marks and passes them to the
147
   * given {@link Token} {@link Consumer}.
148
   *
149
   * @param text         The prose to lex.
150
   * @param contractions List of ambiguous and unambiguous contractions.
151
   * @param consumer     Receives
152
   */
153
  public static void analyze(
154
    final String text,
155
    final Contractions contractions,
156
    final Consumer<Token> consumer,
157
    final LexerFilter filter
158
  ) {
159
    final var emitter = new QuoteEmitter( text, contractions, consumer );
160
    Lexer.lex( text, emitter, filter );
161
  }
162
163
  /**
164
   * @param lexeme the input argument
165
   */
166
  @Override
167
  public void accept( final Lexeme lexeme ) {
168
    mQ.add( lexeme );
169
170
    if( mQ.size() == 4 ) {
171
      parse();
172
    }
173
  }
174
175
  private void parse() {
176
    final var lex1 = mQ.get( 0 );
177
    final var lex2 = mQ.get( 1 );
178
    final var lex3 = mQ.get( 2 );
179
    final var lex4 = mQ.get( 3 );
180
181
    // <y'all>, <Ph.D.'ll>, <20's>, <she's>
182
    if( match( WORD_PERIOD_NUMBER, QUOTE_SINGLE, WORD, ANY ) ) {
183
      emit( QUOTE_APOSTROPHE, lex2 );
184
    }
185
    // <'n'>, <'N'>, <'owlin'>
186
    else if(
187
      match( ANY, QUOTE_SINGLE, WORD, QUOTE_SINGLE ) &&
188
        mContractions.beganEndedUmambiguously(lex3.toString(mText))
189
    ) {
190
      emit( QUOTE_APOSTROPHE, lex2 );
191
      emit( QUOTE_APOSTROPHE, lex4 );
192
      mQ.set( Lexeme.NONE, 3 );
193
    }
194
    // <2''>
195
    else if( match( NUMBER, QUOTE_SINGLE, QUOTE_SINGLE, ANY ) ) {
196
      emit( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() );
197
      mQ.set( Lexeme.NONE, 2 );
198
    }
199
    // <2'>
200
    else if( match( NUMBER, QUOTE_SINGLE, ANY, ANY ) ) {
201
      emit( QUOTE_PRIME_SINGLE, lex2 );
202
    }
203
    // <2">
204
    else if( match( NUMBER, QUOTE_DOUBLE, ANY, ANY ) ) {
205
      emit( QUOTE_PRIME_DOUBLE, lex2 );
206
    }
207
    // <thinkin'>
208
    else if(
209
      match( WORD, QUOTE_SINGLE, ANY, ANY ) &&
210
        mContractions.endedUnambiguously( lex1.toString( mText ) )
211
    ) {
212
      emit( QUOTE_APOSTROPHE, lex2 );
213
    }
214
    // <'02>
215
    else if( match( ANY, QUOTE_SINGLE, NUMBER, SPACE_PUNCT ) ) {
216
      emit( QUOTE_APOSTROPHE, lex2 );
217
    }
218
    // <'20s>
219
    else if(
220
      match( ANY, QUOTE_SINGLE, NUMBER, WORD ) &&
221
        "s".equalsIgnoreCase( lex4.toString( mText ) )
222
    ) {
223
      emit( QUOTE_APOSTROPHE, lex2 );
224
    }
225
    // <.'\n>
226
    else if( match( PUNCT_PERIOD_ELLIPSIS_DASH, QUOTE_SINGLE, ENDING, ANY ) ) {
227
      emit( QUOTE_CLOSING_SINGLE, lex2 );
228
    }
229
    // <\'>
230
    else if( match( ESC_SINGLE, ANY, ANY, ANY ) ) {
231
      emit( QUOTE_STRAIGHT_SINGLE, lex1 );
232
    }
233
    // <\">
234
    else if( match( ESC_DOUBLE, ANY, ANY, ANY ) ) {
235
      emit( QUOTE_STRAIGHT_DOUBLE, lex1 );
236
237
      // <\"'--->
238
      if( match( ESC_DOUBLE, QUOTE_SINGLE, SPACE_DASH_ENDING, ANY ) ) {
239
        emit( QUOTE_CLOSING_SINGLE, lex2 );
240
      }
241
    }
242
    // <---'" >
243
    else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, SPACE_ENDING ) ) {
244
      emit( QUOTE_CLOSING_SINGLE, lex2 );
245
    }
246
    // <o’-lantern>, <o' fellow>, <O'-the>
247
    else if(
248
      match( WORD, QUOTE_SINGLE, SPACE_HYPHEN, WORD ) &&
249
        "o".equalsIgnoreCase( lex1.toString( mText ) )
250
    ) {
251
      emit( QUOTE_APOSTROPHE, lex2 );
252
    }
253
    // <"">, <"...>, <"word>, <---"word>
254
    else if(
255
      match(
256
        LEADING_QUOTE_OPENING_DOUBLE, QUOTE_DOUBLE,
257
        LAGGING_QUOTE_OPENING_DOUBLE, ANY
258
      )
259
    ) {
260
      emit( QUOTE_OPENING_DOUBLE, lex2 );
261
    }
262
    // <..."'>, <word"'>, <?"'>, <word"?>
263
    else if(
264
      match(
265
        LEADING_QUOTE_CLOSING_DOUBLE, QUOTE_DOUBLE,
266
        LAGGING_QUOTE_CLOSING_DOUBLE, ANY
267
      )
268
    ) {
269
      emit( QUOTE_CLOSING_DOUBLE, lex2 );
270
    }
271
    // < ''E>
272
    else if( match( SPACE_SOT, QUOTE_SINGLE, QUOTE_SINGLE, WORD ) ) {
273
      // Consume both immediately to avoid the false ambiguity <'e>.
274
      emit( QUOTE_OPENING_SINGLE, lex2 );
275
      emit( QUOTE_APOSTROPHE, lex3 );
276
      mQ.set( Lexeme.NONE, 1 );
277
      mQ.set( Lexeme.NONE, 2 );
278
    }
279
    // <'...>, <'word>, <---'word>, < 'nation>
280
    else if(
281
      match(
282
        LEADING_QUOTE_OPENING_SINGLE, QUOTE_SINGLE,
283
        LAGGING_QUOTE_OPENING_SINGLE, ANY )
284
    ) {
285
      final var word = lex3.toString( mText );
286
287
      if( mContractions.beganAmbiguously( word ) ) {
288
        emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
289
      }
290
      else if( mContractions.beganUnambiguously( word ) ) {
291
        emit( QUOTE_APOSTROPHE, lex2 );
292
      }
293
      // <"'"nested>
294
      else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, WORD ) ) {
295
        emit( QUOTE_OPENING_SINGLE, lex2 );
296
      }
297
      // <"'" >
298
      else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) {
299
        emit( lex2 );
300
      }
301
      // < '" >
302
      else if( match( ANY, QUOTE_SINGLE, LAGGING_QUOTE_OPENING_SINGLE, ANY ) ) {
303
        emit( QUOTE_OPENING_SINGLE, lex2 );
304
      }
305
      // Ambiguous
306
      else {
307
        emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
308
      }
309
    }
310
    // <word'">, <...'--->, <"' >
311
    else if(
312
      match(
313
        LEADING_QUOTE_CLOSING_SINGLE, QUOTE_SINGLE,
314
        LAGGING_QUOTE_CLOSING_SINGLE, ANY
315
      )
316
    ) {
317
      final var word = lex1.toString( mText );
318
319
      if( mContractions.endedAmbiguously( word ) ) {
320
        emit( QUOTE_AMBIGUOUS_LAGGING, lex2 );
321
      }
322
      else {
323
        emit( QUOTE_CLOSING_SINGLE, lex2 );
324
      }
325
    }
326
    // <word';> (contraction inferred by previous matches)
327
    else if( match( WORD, QUOTE_SINGLE, PUNCT_PERIOD, ANY ) ) {
328
      emit( QUOTE_APOSTROPHE, lex2 );
329
    }
330
    // <---'">
331
    else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) {
332
      emit( QUOTE_CLOSING_SINGLE, lex2 );
333
    }
334
    // <'42>, <'-3.14>
335
    else if( match( ANY, QUOTE_SINGLE, NUMBER, ANY ) ) {
336
      emit( QUOTE_OPENING_SINGLE, lex2 );
337
    }
338
    // <PRE-PARSED><'---.>
339
    else if( match( LexemeType.NONE, QUOTE_SINGLE, ANY, ANY ) ) {
340
      emit( QUOTE_CLOSING_SINGLE, lex2 );
341
    }
342
    // <''Cause >
343
    else if( match( QUOTE_SINGLE, QUOTE_SINGLE, WORD, ANY ) ) {
344
      final var word = lex3.toString( mText );
345
346
      if( mContractions.beganAmbiguously( word ) ) {
347
        emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
348
      }
349
      else if( mContractions.beganUnambiguously( word ) ) {
350
        emit( QUOTE_APOSTROPHE, lex2 );
351
      }
352
      else {
353
        emit( lex2 );
354
      }
355
    }
356
    // Ambiguous (no match)
357
    else if( match( ANY, QUOTE_SINGLE_QUOTE_DOUBLE, ANY, ANY ) ) {
358
      emit( lex2 );
359
    }
360
  }
361
362
  private void emit( final TokenType tokenType, final Lexeme lexeme ) {
363
    mConsumer.accept( new Token( tokenType, lexeme ) );
364
  }
365
366
  private void emit(
367
    final TokenType tokenType,
368
    final int began,
369
    final int ended ) {
370
    mConsumer.accept( new Token( tokenType, began, ended ) );
371
  }
372
373
  /**
374
   * Emits a token that represents an ambiguous quotation mark.
375
   *
376
   * @param lexeme A quotation mark that could not be curled.
377
   */
378
  private void emit( final Lexeme lexeme ) {
379
    mConsumer.accept( new Token( AMBIGUOUS, lexeme ) );
380
  }
381
382
  private boolean match(
383
    final LexemeType l1,
384
    final LexemeType l2,
385
    final LexemeType l3,
386
    final LexemeType l4 ) {
387
    return mQ.get( 0 ).isType( l1 ) &&
388
      mQ.get( 1 ).isType( l2 ) &&
389
      mQ.get( 2 ).isType( l3 ) &&
390
      mQ.get( 3 ).isType( l4 );
391
  }
392
393
  private boolean match(
394
    final LexemeType[] l1,
395
    final LexemeType l2,
396
    final LexemeType l3,
397
    final LexemeType l4 ) {
398
    return mQ.get( 0 ).isType( l1 ) &&
399
      mQ.get( 1 ).isType( l2 ) &&
400
      mQ.get( 2 ).isType( l3 ) &&
401
      mQ.get( 3 ).isType( l4 );
402
  }
403
404
  private boolean match(
405
    final LexemeType l1,
406
    final LexemeType[] l2,
55
  /**
56
   * Single quotes preceded by these {@link LexemeType}s may be opening quotes.
57
   */
58
  private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE =
59
    new LexemeType[]{
60
      LexemeType.SOT, SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP
61
    };
62
63
  /**
64
   * Single quotes succeeded by these {@link LexemeType}s may be opening quotes.
65
   */
66
  private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE =
67
    new LexemeType[]{
68
      WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE
69
    };
70
71
  /**
72
   * Single quotes preceded by these {@link LexemeType}s may be closing quotes.
73
   */
74
  private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE =
75
    new LexemeType[]{
76
      WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE
77
    };
78
79
  /**
80
   * Single quotes succeeded by these {@link LexemeType}s may be closing quotes.
81
   */
82
  private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE =
83
    new LexemeType[]{
84
      SPACE, HYPHEN, DASH, PUNCT, PERIOD, ELLIPSIS, QUOTE_DOUBLE, CLOSING_GROUP,
85
      ENDING
86
    };
87
88
  /**
89
   * Double quotes preceded by these {@link LexemeType}s may be opening quotes.
90
   */
91
  private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE =
92
    new LexemeType[]{
93
      LexemeType.SOT, SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL, EOP
94
    };
95
96
  /**
97
   * Double quotes succeeded by these {@link LexemeType}s may be opening quotes.
98
   */
99
  private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE =
100
    new LexemeType[]{
101
      WORD, PUNCT, NUMBER, DASH, ELLIPSIS, OPENING_GROUP, QUOTE_SINGLE,
102
      QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING, QUOTE_DOUBLE
103
    };
104
105
  /**
106
   * Double quotes preceded by these {@link LexemeType}s may be closing quotes.
107
   */
108
  private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE =
109
    new LexemeType[]{
110
      WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS, CLOSING_GROUP, QUOTE_SINGLE,
111
      QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING
112
    };
113
114
  /**
115
   * Double quotes succeeded by these {@link LexemeType}s may be closing quotes.
116
   */
117
  private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE =
118
    new LexemeType[]{
119
      SPACE, PUNCT, PERIOD, EQUALS, HYPHEN, DASH, QUOTE_SINGLE, CLOSING_GROUP,
120
      ENDING
121
    };
122
123
  private final CircularFifoQueue<Lexeme> mQ = new CircularFifoQueue<>( 4 );
124
  private final String mText;
125
  private final Contractions mContractions;
126
  private final Consumer<Token> mConsumer;
127
128
  public QuoteEmitter(
129
    final String text,
130
    final Contractions contractions,
131
    final Consumer<Token> consumer
132
  ) {
133
    assert text != null;
134
    assert contractions != null;
135
136
    mText = text;
137
    mContractions = contractions;
138
    mConsumer = consumer;
139
  }
140
141
  /**
142
   * Scans the given text document for quotation marks and passes them to the
143
   * given {@link Token} {@link Consumer}.
144
   *
145
   * @param text         The prose to lex.
146
   * @param contractions List of ambiguous and unambiguous contractions.
147
   * @param consumer     Receives each emitted quotation mark {@link Token}.
148
   */
149
  public static void analyze(
150
    final String text,
151
    final Contractions contractions,
152
    final Consumer<Token> consumer,
153
    final LexerFilter filter
154
  ) {
155
    final var emitter = new QuoteEmitter( text, contractions, consumer );
156
    Lexer.lex( text, emitter, filter );
157
  }
158
159
  /**
160
   * @param lexeme The lexeme to queue; parsing runs when the queue holds four.
161
   */
162
  @Override
163
  public void accept( final Lexeme lexeme ) {
164
    mQ.add( lexeme );
165
166
    if( mQ.size() == 4 ) {
167
      parse();
168
    }
169
  }
170
171
  private void parse() {
172
    final var lex1 = mQ.get( 0 );
173
    final var lex2 = mQ.get( 1 );
174
    final var lex3 = mQ.get( 2 );
175
    final var lex4 = mQ.get( 3 );
176
177
    // <y'all>, <Ph.D.'ll>, <20's>, <she's>
178
    if( match( WORD_PERIOD_NUMBER, QUOTE_SINGLE, WORD, ANY ) ) {
179
      emit( QUOTE_APOSTROPHE, lex2 );
180
    }
181
    // <'n'>, <'N'>, <'owlin'>
182
    else if(
183
      match( ANY, QUOTE_SINGLE, WORD, QUOTE_SINGLE ) &&
184
        mContractions.beganEndedUnambiguously( lex3.toString( mText ) )
185
    ) {
186
      emit( QUOTE_APOSTROPHE, lex2 );
187
      emit( QUOTE_APOSTROPHE, lex4 );
188
      mQ.set( Lexeme.NONE, 3 );
189
    }
190
    // <2''>
191
    else if( match( NUMBER, QUOTE_SINGLE, QUOTE_SINGLE, ANY ) ) {
192
      emit( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() );
193
      mQ.set( Lexeme.NONE, 2 );
194
    }
195
    // <2'>
196
    else if( match( NUMBER, QUOTE_SINGLE, ANY, ANY ) ) {
197
      emit( QUOTE_PRIME_SINGLE, lex2 );
198
    }
199
    // <2">
200
    else if( match( NUMBER, QUOTE_DOUBLE, ANY, ANY ) ) {
201
      emit( QUOTE_PRIME_DOUBLE, lex2 );
202
    }
203
    // <thinkin'>
204
    else if(
205
      match( WORD, QUOTE_SINGLE, ANY, ANY ) &&
206
        mContractions.endedUnambiguously( lex1.toString( mText ) )
207
    ) {
208
      emit( QUOTE_APOSTROPHE, lex2 );
209
    }
210
    // <'02>
211
    else if( match( ANY, QUOTE_SINGLE, NUMBER, SPACE_PUNCT ) ) {
212
      emit( QUOTE_APOSTROPHE, lex2 );
213
    }
214
    // <'20s>
215
    else if(
216
      match( ANY, QUOTE_SINGLE, NUMBER, WORD ) &&
217
        "s".equalsIgnoreCase( lex4.toString( mText ) )
218
    ) {
219
      emit( QUOTE_APOSTROPHE, lex2 );
220
    }
221
    // <.'\n>
222
    else if( match( PUNCT_PERIOD_ELLIPSIS_DASH, QUOTE_SINGLE, ENDING, ANY ) ) {
223
      emit( QUOTE_CLOSING_SINGLE, lex2 );
224
    }
225
    // <\'>
226
    else if( match( ESC_SINGLE, ANY, ANY, ANY ) ) {
227
      emit( QUOTE_STRAIGHT_SINGLE, lex1 );
228
    }
229
    // <\">
230
    else if( match( ESC_DOUBLE, ANY, ANY, ANY ) ) {
231
      emit( QUOTE_STRAIGHT_DOUBLE, lex1 );
232
233
      // <\"'--->
234
      if( match( ESC_DOUBLE, QUOTE_SINGLE, SPACE_DASH_ENDING, ANY ) ) {
235
        emit( QUOTE_CLOSING_SINGLE, lex2 );
236
      }
237
    }
238
    // <---'" >
239
    else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, SPACE_ENDING ) ) {
240
      emit( QUOTE_CLOSING_SINGLE, lex2 );
241
    }
242
    // <o’-lantern>, <o' fellow>, <O'-the>
243
    else if(
244
      match( WORD, QUOTE_SINGLE, SPACE_HYPHEN, WORD ) &&
245
        "o".equalsIgnoreCase( lex1.toString( mText ) )
246
    ) {
247
      emit( QUOTE_APOSTROPHE, lex2 );
248
    }
249
    // <"">, <"...>, <"word>, <---"word>
250
    else if(
251
      match(
252
        LEADING_QUOTE_OPENING_DOUBLE, QUOTE_DOUBLE,
253
        LAGGING_QUOTE_OPENING_DOUBLE, ANY
254
      )
255
    ) {
256
      emit( QUOTE_OPENING_DOUBLE, lex2 );
257
    }
258
    // <..."'>, <word"'>, <?"'>, <word"?>
259
    else if(
260
      match(
261
        LEADING_QUOTE_CLOSING_DOUBLE, QUOTE_DOUBLE,
262
        LAGGING_QUOTE_CLOSING_DOUBLE, ANY
263
      )
264
    ) {
265
      emit( QUOTE_CLOSING_DOUBLE, lex2 );
266
    }
267
    // < ''E>
268
    else if( match( SPACE_SOT, QUOTE_SINGLE, QUOTE_SINGLE, WORD ) ) {
269
      // Consume both immediately to avoid the false ambiguity <'e>.
270
      emit( QUOTE_OPENING_SINGLE, lex2 );
271
      emit( QUOTE_APOSTROPHE, lex3 );
272
      mQ.set( Lexeme.NONE, 1 );
273
      mQ.set( Lexeme.NONE, 2 );
274
    }
275
    // <'...>, <'word>, <---'word>, < 'nation>
276
    else if(
277
      match(
278
        LEADING_QUOTE_OPENING_SINGLE, QUOTE_SINGLE,
279
        LAGGING_QUOTE_OPENING_SINGLE, ANY )
280
    ) {
281
      final var word = lex3.toString( mText );
282
283
      if( mContractions.beganAmbiguously( word ) ) {
284
        emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
285
      }
286
      else if( mContractions.beganUnambiguously( word ) ) {
287
        emit( QUOTE_APOSTROPHE, lex2 );
288
      }
289
      // <"'"nested>
290
      else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, WORD ) ) {
291
        emit( QUOTE_OPENING_SINGLE, lex2 );
292
      }
293
      // <"'" >
294
      else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) {
295
        emit( QUOTE_AMBIGUOUS_SINGLE, lex2 );
296
      }
297
      // < '" >
298
      else if( match( ANY, QUOTE_SINGLE, LAGGING_QUOTE_OPENING_SINGLE, ANY ) ) {
299
        emit( QUOTE_OPENING_SINGLE, lex2 );
300
      }
301
      // Ambiguous
302
      else {
303
        emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
304
      }
305
    }
306
    // <word'">, <...'--->, <"' >
307
    else if(
308
      match(
309
        LEADING_QUOTE_CLOSING_SINGLE, QUOTE_SINGLE,
310
        LAGGING_QUOTE_CLOSING_SINGLE, ANY
311
      )
312
    ) {
313
      final var word = lex1.toString( mText );
314
315
      if( mContractions.endedAmbiguously( word ) ) {
316
        emit( QUOTE_AMBIGUOUS_LAGGING, lex2 );
317
      }
318
      else {
319
        emit( QUOTE_CLOSING_SINGLE, lex2 );
320
      }
321
    }
322
    // <word';> (contraction inferred by previous matches)
323
    else if( match( WORD, QUOTE_SINGLE, PUNCT_PERIOD, ANY ) ) {
324
      emit( QUOTE_APOSTROPHE, lex2 );
325
    }
326
    // <---'">
327
    else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) {
328
      emit( QUOTE_CLOSING_SINGLE, lex2 );
329
    }
330
    // <'42>, <'-3.14>
331
    else if( match( ANY, QUOTE_SINGLE, NUMBER, ANY ) ) {
332
      emit( QUOTE_OPENING_SINGLE, lex2 );
333
    }
334
    // <PRE-PARSED><'---.>
335
    else if( match( LexemeType.NONE, QUOTE_SINGLE, ANY, ANY ) ) {
336
      emit( QUOTE_CLOSING_SINGLE, lex2 );
337
    }
338
    // <''Cause >
339
    else if( match( QUOTE_SINGLE, QUOTE_SINGLE, WORD, ANY ) ) {
340
      final var word = lex3.toString( mText );
341
342
      if( mContractions.beganAmbiguously( word ) ) {
343
        emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
344
      }
345
      else if( mContractions.beganUnambiguously( word ) ) {
346
        emit( QUOTE_APOSTROPHE, lex2 );
347
      }
348
      else {
349
        emit( QUOTE_AMBIGUOUS_SINGLE, lex2 );
350
      }
351
    }
352
    else if( match( ANY, QUOTE_DOUBLE, ANY, ANY ) ) {
353
      emit( QUOTE_AMBIGUOUS_DOUBLE, lex2 );
354
    }
355
    // Ambiguous (no match)
356
    else if( match( ANY, QUOTE_SINGLE, ANY, ANY ) ) {
357
      emit( QUOTE_AMBIGUOUS_SINGLE, lex2 );
358
    }
359
  }
360
361
  private void emit( final TokenType tokenType, final Lexeme lexeme ) {
362
    mConsumer.accept( new Token( tokenType, lexeme ) );
363
  }
364
365
  private void emit(
366
    final TokenType tokenType,
367
    final int began,
368
    final int ended ) {
369
    mConsumer.accept( new Token( tokenType, began, ended ) );
370
  }
371
372
  private boolean match(
373
    final LexemeType l1,
374
    final LexemeType l2,
375
    final LexemeType l3,
376
    final LexemeType l4 ) {
377
    return mQ.get( 0 ).isType( l1 ) &&
378
      mQ.get( 1 ).isType( l2 ) &&
379
      mQ.get( 2 ).isType( l3 ) &&
380
      mQ.get( 3 ).isType( l4 );
381
  }
382
383
  private boolean match(
384
    final LexemeType[] l1,
385
    final LexemeType l2,
407 386
    final LexemeType l3,
408 387
    final LexemeType l4 ) {
M src/main/java/com/whitemagicsoftware/keenquotes/parser/Token.java
101 101
102 102
  boolean isAmbiguous() {
103
    return mTokenType == AMBIGUOUS ||
103
    return mTokenType == QUOTE_AMBIGUOUS_SINGLE ||
104
      mTokenType == QUOTE_AMBIGUOUS_DOUBLE ||
104 105
      mTokenType == QUOTE_AMBIGUOUS_LEADING ||
105 106
      mTokenType == QUOTE_AMBIGUOUS_LAGGING;
M src/main/java/com/whitemagicsoftware/keenquotes/parser/TokenType.java
19 19
  QUOTE_AMBIGUOUS_LEADING( "leading-ambiguous" ),
20 20
  QUOTE_AMBIGUOUS_LAGGING( "lagging-ambiguous" ),
21
  AMBIGUOUS( "ambiguous" ),
21
  QUOTE_AMBIGUOUS_DOUBLE( "double-ambiguous" ),
22
  QUOTE_AMBIGUOUS_SINGLE( "single-ambiguous" ),
22 23
  NONE;
23 24
M src/main/java/com/whitemagicsoftware/keenquotes/parser/Tree.java
72 72
   */
73 73
  public boolean hasOpeningSingleQuote() {
74
    return mOpening.isType( QUOTE_OPENING_SINGLE );
74
    return isOpening( QUOTE_OPENING_SINGLE );
75
  }
76
77
  public boolean hasOpeningDoubleQuote() {
78
    return isOpening( QUOTE_OPENING_DOUBLE );
79
  }
80
81
  private boolean isOpening( final TokenType tokenType ) {
82
    return mOpening.isType( tokenType );
75 83
  }
76 84
M src/test/java/com/whitemagicsoftware/keenquotes/parser/CurlerTest.java
26 26
  /**
27 27
   * Tests that straight quotes are converted to curly quotes.
28
   *
29
   * @throws IOException Error opening file full of fun.
30 28
   */
31 29
  @Test
32
  public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException {
30
  public void test_Parse_UncurledQuotes1_CurlyQuotes() throws IOException {
33 31
    testCurler( createCurler( FILTER_PLAIN ), "unambiguous-1-pass.txt" );
34 32
  }
35 33
36 34
  @Test
37
  public void test_XmlParse_StraightQuotes_CurlyQuotes() throws IOException {
35
  public void test_Parse_UncurledQuotes2_CurlyQuotes() throws IOException {
36
    testCurler( createCurler( FILTER_PLAIN ), "unambiguous-2-pass.txt" );
37
  }
38
39
  @Test
40
  public void test_Parse_UncurledQuotesXml_CurlyQuotes() throws IOException {
38 41
    testCurler( createCurler( FILTER_XML ), "xml.txt" );
39 42
  }
M src/test/resources/com/whitemagicsoftware/keenquotes/texts/unambiguous-2-pass.txt
1 1
# ########################################################################
2
# Double quotes
3
# ########################################################################
4
5
""
6
&ldquo;&rdquo;
7
8
"" ""
9
&ldquo;&rdquo; &ldquo;&rdquo;
10
11
""""
12
&ldquo;&rdquo;&ldquo;&rdquo;
13
14
" "
15
&ldquo; &rdquo;
16
17
" " " "
18
&ldquo; &rdquo; &ldquo; &rdquo;
19
20
"?"
21
&ldquo;?&rdquo;
22
23
"!"
24
&ldquo;!&rdquo;
25
26
"‽"
27
&ldquo;‽&rdquo;
28
29
"...?!"
30
&ldquo;...?!&rdquo;
31
32
" ... "
33
&ldquo; ... &rdquo;
34
35
# ########################################################################
2 36
# Single quotes
3 37
# ########################################################################
38
39
' '
40
&lsquo; &rsquo;
41
42
' ' ' '
43
&lsquo; &rsquo; &lsquo; &rsquo;
44
45
' ... '
46
&lsquo; ... &rsquo;
47
48
' ... ' ' ... '
49
&lsquo; ... &rsquo; &lsquo; ... &rsquo;
50
51
'?'
52
&lsquo;?&rsquo;
53
54
'!'
55
&lsquo;!&rsquo;
56
57
'?!'
58
&lsquo;?!&rsquo;
59
60
'‽'
61
&lsquo;‽&rsquo;
62
4 63
With 'quotes' in the middle.
5 64
With &lsquo;quotes&rsquo; in the middle.