Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git
A docs/releases/NOTES-2.5.5.md
1
# Release Notes
2
3
* **Version:** 2.5.5
4
* **Released:** August 25, 2025
5
6
## What's New
7
8
* **Enhancement**: Added protection for script and style elements
9
* **Output Enhancement**: Emits ellipses (`...` as `…`)
10
* **Code Formatting**: Reformatted codebase for improved consistency
11
1 12
D release.sh
1
#!/usr/bin/env bash
2
3
RELEASE=$(git describe --abbrev=0 --tags)
4
5
glab release create "${RELEASE}"
6
7
cat tokens/keenquotes.pat | glab auth login --hostname gitlab.com --stdin
8
9
glab release upload "${RELEASE}" build/libs/keenquotes.jar
10
11 1
M src/main/java/com/whitemagicsoftware/keenquotes/lex/LexemeGlyph.java
27 27
  LEX_DOUBLE_CHEVRON_RIGHT( '»' ),
28 28
  LEX_SINGLE_CHEVRON_LEFT( '‹' ),
29
  LEX_SINGLE_CHEVRON_RIGHT( '›' );
29
  LEX_SINGLE_CHEVRON_RIGHT( '›' ),
30
31
  LEX_SPACE( ' ' ),
32
33
  LEX_ELLIPSES( '…' ),
34
  LEX_DASH_EN( '–' ),
35
  LEX_DASH_EM( '—' ),
36
  LEX_HYPHEN( '-' );
30 37
31 38
  private final char mGlyph;
M src/main/java/com/whitemagicsoftware/keenquotes/lex/LexemeType.java
23 23
  public static final LexemeType EOP = new LexemeType();
24 24
  public static final LexemeType EOT = new LexemeType();
25
  public static final LexemeType SPACE = new LexemeType();
25
  public static final LexemeType SPACE = new LexemeType( LEX_SPACE );
26 26
  public static final LexemeType WORD = new LexemeType();
27 27
  public static final LexemeType NUMBER = new LexemeType();
28 28
  public static final LexemeType PUNCT = new LexemeType();
29 29
  public static final LexemeType OPENING_GROUP = new LexemeType();
30 30
  public static final LexemeType CLOSING_GROUP = new LexemeType();
31
  public static final LexemeType HYPHEN = new LexemeType();
32
  public static final LexemeType DASH = new LexemeType();
31
  public static final LexemeType DASH_EN = new LexemeType( LEX_DASH_EN );
32
  public static final LexemeType DASH_EM = new LexemeType( LEX_DASH_EM );
33
  public static final LexemeType DASH_LINE = new LexemeType();
34
  public static final LexemeType HYPHEN = new LexemeType( LEX_HYPHEN );
33 35
  public static final LexemeType EQUALS = new LexemeType();
34 36
  public static final LexemeType PERIOD = new LexemeType();
35
  public static final LexemeType ELLIPSIS = new LexemeType();
37
  public static final LexemeType ELLIPSIS = new LexemeType( LEX_ELLIPSES );
36 38
  public static final LexemeType ENDING = new LexemeType();
37 39
  public static final LexemeType ANY = new LexemeType();
M src/main/java/com/whitemagicsoftware/keenquotes/lex/Lexer.java
33 33
  );
34 34
35
  private static final Set<Character> DASH_CHARS = new HashSet<>(
36
    Arrays.asList( '-', '–', '—', '―' )
37
  );
38
39 35
  /**
40 36
   * Consider {@code _} and {@code *} as letters (in a word) because plain
...
94 90
      }
95 91
      else if( curr == '\r' || curr == '\n' ) {
96
        final var cr = new int[]{curr == '\r' ? 1 : 0};
97
        final var lf = new int[]{curr == '\n' ? 1 : 0};
92
        final var cr = new int[]{ curr == '\r' ? 1 : 0 };
93
        final var lf = new int[]{ curr == '\n' ? 1 : 0 };
98 94
99 95
        // Swallow all consecutive CR (Mac), CRLF (Windows), and/or LF (Unix).
...
132 128
        token = QUOTE_SINGLE;
133 129
      }
134
      else if( curr == '-' && i.peek() != '-' ) {
135
        token = HYPHEN;
130
      else if( isHyphen( curr ) ) {
131
        final var began = i.index();
132
        i.skip( Lexer::isHyphen );
133
        final var count = i.index() - began;
134
135
        token = switch( count ) {
136
          case 0 -> HYPHEN;
137
          case 1 -> DASH_EN;
138
          case 2 -> DASH_EM;
139
          default -> DASH_LINE;
140
        };
136 141
      }
137
      else if( isDash( curr ) ) {
138
        i.skip( Lexer::isDash );
139
        token = DASH;
142
      else if( curr == '–' ) {
143
        token = DASH_EN;
144
      }
145
      else if( curr == '—' ) {
146
        token = DASH_EM;
140 147
      }
141 148
      else if( curr == '(' || curr == '{' || curr == '[' ) {
...
231 238
232 239
  /**
233
   * Answers whether the given character may be part of an en- or em-dash.
234
   * This must be called after it is known that the character isn't a lone
235
   * hyphen.
240
   * Answers whether the given character is a hyphen.
236 241
   *
237
   * @param curr The character to check as being a dash.
238
   * @return {@code true} if the given character is part of a dash.
242
   * @param curr The character to check as being a hyphen.
243
   * @return {@code true} if the given character is a hyphen.
239 244
   */
240
  private static boolean isDash( final char curr ) {
241
    return DASH_CHARS.contains( curr );
245
  private static boolean isHyphen( final char curr ) {
246
    return curr == '-';
242 247
  }
243 248
M src/main/java/com/whitemagicsoftware/keenquotes/parser/QuoteEmitter.java
30 30
  };
31 31
32
  private static final LexemeType[] PUNCT_PERIOD_ELLIPSIS_DASH = {
33
    PUNCT, PERIOD, ELLIPSIS, DASH
34
  };
35
36
  private static final LexemeType[] PUNCT_PERIOD = {
37
    PUNCT, PERIOD
38
  };
39
40
  private static final LexemeType[] SPACE_DASH_ENDING = {
41
    SPACE, DASH, ENDING
42
  };
43
44
  private static final LexemeType[] SPACE_ENDING = {
45
    SPACE, ENDING
46
  };
47
48
  private static final LexemeType[] SPACE_HYPHEN = {
49
    SPACE, HYPHEN
50
  };
51
52
  private static final LexemeType[] SPACE_PUNCT = {
53
    SPACE, PUNCT
54
  };
55
56
  private static final LexemeType[] SPACE_SOT = {
57
    SPACE, SOT
58
  };
59
60
  /**
61
   * Single quotes preceded by these {@link LexemeType}s may be opening quotes.
62
   */
63
  private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE =
64
    new LexemeType[]{
65
      LexemeType.SOT, SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP
66
    };
67
68
  /**
69
   * Single quotes succeeded by these {@link LexemeType}s may be opening quotes.
70
   */
71
  private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE =
72
    new LexemeType[]{
73
      WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE
74
    };
75
76
  /**
77
   * Single quotes preceded by these {@link LexemeType}s may be closing quotes.
78
   */
79
  private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE =
80
    new LexemeType[]{
81
      WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE
82
    };
83
84
  /**
85
   * Single quotes succeeded by these {@link LexemeType}s may be closing quotes.
86
   */
87
  private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE =
88
    new LexemeType[]{
89
      SPACE, HYPHEN, DASH, PUNCT, PERIOD, ELLIPSIS, QUOTE_DOUBLE, CLOSING_GROUP,
90
      ENDING
91
    };
92
93
  /**
94
   * Double quotes preceded by these {@link LexemeType}s may be opening quotes.
95
   */
96
  private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE =
97
    new LexemeType[]{
98
      LexemeType.SOT, SPACE, DASH, EQUALS, OPENING_GROUP, EOL, EOP
99
    };
100
101
  /**
102
   * Double quotes succeeded by these {@link LexemeType}s may be opening quotes.
103
   */
104
  private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE =
105
    new LexemeType[]{
106
      WORD, PUNCT, NUMBER, DASH, ELLIPSIS, OPENING_GROUP, QUOTE_SINGLE,
107
      QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING, QUOTE_DOUBLE
108
    };
109
110
  /**
111
   * Double quotes preceded by these {@link LexemeType}s may be closing quotes.
112
   */
113
  private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE =
114
    new LexemeType[]{
115
      WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS, CLOSING_GROUP, QUOTE_SINGLE,
116
      QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING
117
    };
118
119
  /**
120
   * Double quotes succeeded by these {@link LexemeType}s may be closing quotes.
121
   */
122
  private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE =
123
    new LexemeType[]{
124
      SPACE, PUNCT, PERIOD, EQUALS, HYPHEN, DASH, QUOTE_SINGLE, CLOSING_GROUP,
125
      ENDING
126
    };
127
128
  private final CircularFifoQueue<Lexeme> mQ = new CircularFifoQueue<>( 4 );
129
  private final String mText;
130
  private final Contractions mContractions;
131
  private final Consumer<Token> mConsumer;
132
133
  public QuoteEmitter(
134
    final String text,
135
    final Contractions contractions,
136
    final Consumer<Token> consumer
137
  ) {
138
    assert text != null;
139
    assert contractions != null;
140
141
    mText = text;
142
    mContractions = contractions;
143
    mConsumer = consumer;
144
  }
145
146
  /**
147
   * Scans the given text document for quotation marks and passes them to the
148
   * given {@link Token} {@link Consumer}.
149
   *
150
   * @param text         The prose to lex.
151
   * @param contractions List of ambiguous and unambiguous contractions.
152
   * @param consumer     Receives
153
   */
154
  public static void analyze(
155
    final String text,
156
    final Contractions contractions,
157
    final Consumer<Token> consumer,
158
    final LexerFilter filter
159
  ) {
160
    final var emitter = new QuoteEmitter( text, contractions, consumer );
161
    Lexer.lex( text, emitter, filter );
162
  }
163
164
  /**
165
   * @param lexeme the input argument
166
   */
167
  @Override
168
  public void accept( final Lexeme lexeme ) {
169
    mQ.add( lexeme );
170
171
    if( mQ.size() == 4 ) {
172
      parse();
173
    }
174
  }
175
176
  private void parse() {
177
    final var lex1 = mQ.get( 0 );
178
    final var lex2 = mQ.get( 1 );
179
    final var lex3 = mQ.get( 2 );
180
    final var lex4 = mQ.get( 3 );
181
182
    // <y'all>, <Ph.D.'ll>, <20's>, <she's>
183
    if( match( WORD_PERIOD_NUMBER, QUOTE_SINGLE, WORD, ANY ) ) {
184
      emit( QUOTE_APOSTROPHE, lex2 );
185
    }
186
    // <'n'>, <'N'>, <'owlin'>
187
    else if(
188
      match( ANY, QUOTE_SINGLE, WORD, QUOTE_SINGLE ) &&
189
        mContractions.beganEndedUnambiguously( lex3.toString( mText ) )
190
    ) {
191
      emit( QUOTE_APOSTROPHE, lex2 );
192
      emit( QUOTE_APOSTROPHE, lex4 );
193
      mQ.set( Lexeme.NONE, 3 );
194
    }
195
    // <2''>
196
    else if( match( NUMBER, QUOTE_SINGLE, QUOTE_SINGLE, ANY ) ) {
197
      // Force double primes to conform to the same constructor usage. This
198
      // simplifies the tokens, reduces some memory usage,
199
      final var lex = new Lexeme( PRIME_DOUBLE, lex2.began(), lex3.ended() );
200
201
      emit( QUOTE_PRIME_DOUBLE, lex );
202
      mQ.set( Lexeme.NONE, 2 );
203
    }
204
    // <2'>
205
    else if( match( NUMBER, QUOTE_SINGLE, ANY, ANY ) ) {
206
      emit( QUOTE_PRIME_SINGLE, lex2 );
207
    }
208
    // <2">
209
    else if( match( NUMBER, QUOTE_DOUBLE, ANY, ANY ) ) {
210
      emit( QUOTE_PRIME_DOUBLE, lex2 );
211
    }
212
    // <thinkin'>
213
    else if(
214
      match( WORD, QUOTE_SINGLE, ANY, ANY ) &&
215
        mContractions.endedUnambiguously( lex1.toString( mText ) )
216
    ) {
217
      emit( QUOTE_APOSTROPHE, lex2 );
218
    }
219
    // <'02>
220
    else if( match( ANY, QUOTE_SINGLE, NUMBER, SPACE_PUNCT ) ) {
221
      emit( QUOTE_APOSTROPHE, lex2 );
222
    }
223
    // <'20s>
224
    else if(
225
      match( ANY, QUOTE_SINGLE, NUMBER, WORD ) &&
226
        "s".equalsIgnoreCase( lex4.toString( mText ) )
227
    ) {
228
      emit( QUOTE_APOSTROPHE, lex2 );
229
    }
230
    // <.'\n>
231
    else if( match( PUNCT_PERIOD_ELLIPSIS_DASH, QUOTE_SINGLE, ENDING, ANY ) ) {
232
      emit( QUOTE_CLOSING_SINGLE, lex2 );
233
    }
234
    // <\'>
235
    else if( match( ESC_SINGLE, ANY, ANY, ANY ) ) {
236
      emit( QUOTE_STRAIGHT_SINGLE, lex1 );
237
    }
238
    // <\">
239
    else if( match( ESC_DOUBLE, ANY, ANY, ANY ) ) {
240
      emit( QUOTE_STRAIGHT_DOUBLE, lex1 );
241
242
      // <\"'--->
243
      if( match( ESC_DOUBLE, QUOTE_SINGLE, SPACE_DASH_ENDING, ANY ) ) {
244
        emit( QUOTE_CLOSING_SINGLE, lex2 );
245
      }
246
    }
247
    // <---'" >
248
    else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, SPACE_ENDING ) ) {
249
      emit( QUOTE_CLOSING_SINGLE, lex2 );
250
    }
251
    // <o’-lantern>, <o' fellow>, <O'-the>
252
    else if(
253
      match( WORD, QUOTE_SINGLE, SPACE_HYPHEN, WORD ) &&
254
        "o".equalsIgnoreCase( lex1.toString( mText ) )
255
    ) {
256
      emit( QUOTE_APOSTROPHE, lex2 );
257
    }
258
    // <"">, <"...>, <"word>, <---"word>
259
    else if(
260
      match(
261
        LEADING_QUOTE_OPENING_DOUBLE, QUOTE_DOUBLE,
262
        LAGGING_QUOTE_OPENING_DOUBLE, ANY
263
      )
264
    ) {
265
      emit( QUOTE_OPENING_DOUBLE, lex2 );
266
    }
267
    // <..."'>, <word"'>, <?"'>, <word"?>
268
    else if(
269
      match(
270
        LEADING_QUOTE_CLOSING_DOUBLE, QUOTE_DOUBLE,
271
        LAGGING_QUOTE_CLOSING_DOUBLE, ANY
272
      )
273
    ) {
274
      emit( QUOTE_CLOSING_DOUBLE, lex2 );
275
    }
276
    // < ''E>
277
    else if( match( SPACE_SOT, QUOTE_SINGLE, QUOTE_SINGLE, WORD ) ) {
278
      // Consume both immediately to avoid the false ambiguity <'e>.
279
      emit( QUOTE_OPENING_SINGLE, lex2 );
280
      emit( QUOTE_APOSTROPHE, lex3 );
281
      mQ.set( Lexeme.NONE, 1 );
282
      mQ.set( Lexeme.NONE, 2 );
283
    }
284
    // <'...>, <'word>, <---'word>, < 'nation>
285
    else if(
286
      match(
287
        LEADING_QUOTE_OPENING_SINGLE, QUOTE_SINGLE,
288
        LAGGING_QUOTE_OPENING_SINGLE, ANY )
289
    ) {
290
      final var word = lex3.toString( mText );
291
292
      if( mContractions.beganAmbiguously( word ) ) {
293
        emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
294
      }
295
      else if( mContractions.beganUnambiguously( word ) ) {
296
        emit( QUOTE_APOSTROPHE, lex2 );
297
      }
298
      // <"'"nested>
299
      else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, WORD ) ) {
300
        emit( QUOTE_OPENING_SINGLE, lex2 );
301
      }
302
      // <"'" >
303
      else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) {
304
        emit( QUOTE_AMBIGUOUS_SINGLE, lex2 );
305
      }
306
      // < '" >
307
      else if( match( ANY, QUOTE_SINGLE, LAGGING_QUOTE_OPENING_SINGLE, ANY ) ) {
308
        emit( QUOTE_OPENING_SINGLE, lex2 );
309
      }
310
      // Ambiguous
311
      else {
312
        emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
313
      }
314
    }
315
    // <"'--- >
316
    else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, DASH, ANY ) ) {
317
      emit( QUOTE_OPENING_SINGLE, lex2 );
318
    }
319
    // <word'">, <...'--->, <"' >
320
    else if(
321
      match(
322
        LEADING_QUOTE_CLOSING_SINGLE, QUOTE_SINGLE,
323
        LAGGING_QUOTE_CLOSING_SINGLE, ANY
324
      )
325
    ) {
326
      final var word = lex1.toString( mText );
327
328
      if( mContractions.endedAmbiguously( word ) ) {
329
        emit( QUOTE_AMBIGUOUS_LAGGING, lex2 );
330
      }
331
      else {
332
        emit( QUOTE_CLOSING_SINGLE, lex2 );
333
      }
334
    }
335
    // <word';> (contraction inferred by previous matches)
336
    else if( match( WORD, QUOTE_SINGLE, PUNCT_PERIOD, ANY ) ) {
337
      emit( QUOTE_APOSTROPHE, lex2 );
338
    }
339
    // <---'">
340
    else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) {
341
      emit( QUOTE_CLOSING_SINGLE, lex2 );
342
    }
343
    // <'42>, <'-3.14>
344
    else if( match( ANY, QUOTE_SINGLE, NUMBER, ANY ) ) {
345
      emit( QUOTE_OPENING_SINGLE, lex2 );
346
    }
347
    // <PRE-PARSED><'---.>
348
    else if( match( LexemeType.NONE, QUOTE_SINGLE, ANY, ANY ) ) {
349
      emit( QUOTE_CLOSING_SINGLE, lex2 );
350
    }
351
    // <''Cause >
352
    else if( match( QUOTE_SINGLE, QUOTE_SINGLE, WORD, ANY ) ) {
353
      final var word = lex3.toString( mText );
354
355
      if( mContractions.beganAmbiguously( word ) ) {
356
        emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
357
      }
358
      else if( mContractions.beganUnambiguously( word ) ) {
359
        emit( QUOTE_APOSTROPHE, lex2 );
360
      }
361
      else {
362
        emit( QUOTE_AMBIGUOUS_SINGLE, lex2 );
363
      }
364
    }
365
    // <'"Trouble>
366
    else if( match( QUOTE_SINGLE, QUOTE_DOUBLE, WORD, ANY ) ) {
367
      emit( QUOTE_OPENING_DOUBLE, lex2 );
368
    }
369
    // International quotation marks.
370
    else if( match( ANY, QUOTE_DOUBLE_OPENING, ANY, ANY ) ) {
371
      emit( QUOTE_OPENING_DOUBLE, lex2 );
372
    }
373
    else if( match( ANY, QUOTE_SINGLE_OPENING, ANY, ANY ) ) {
374
      emit( QUOTE_OPENING_SINGLE, lex2 );
375
    }
376
    else if( match( ANY, QUOTE_DOUBLE_CLOSING, ANY, ANY ) ) {
377
      emit( QUOTE_CLOSING_DOUBLE, lex2 );
378
    }
379
    else if( match( ANY, QUOTE_SINGLE_CLOSING, ANY, ANY ) ) {
380
      emit( QUOTE_CLOSING_SINGLE, lex2 );
381
    }
382
    // Ambiguous (no match)
383
    else if( match( ANY, QUOTE_SINGLE, ANY, ANY ) ) {
384
      emit( QUOTE_AMBIGUOUS_SINGLE, lex2 );
385
    }
386
    else if( match( ANY, QUOTE_DOUBLE, ANY, ANY ) ) {
387
      emit( QUOTE_AMBIGUOUS_DOUBLE, lex2 );
388
    }
389
    else if( match( ANY, ELLIPSIS, ANY, ANY ) ) {
390
      emit( QUOTE_ELLIPSIS, lex2 );
391
    }
392
  }
393
394
  private void emit( final TokenType tokenType, final Lexeme lexeme ) {
395
    mConsumer.accept( new Token( tokenType, lexeme ) );
396
  }
397
398
  private boolean match(
399
    final LexemeType l1,
400
    final LexemeType l2,
401
    final LexemeType l3,
402
    final LexemeType l4 ) {
403
    return mQ.get( 0 ).isType( l1 ) &&
404
      mQ.get( 1 ).isType( l2 ) &&
405
      mQ.get( 2 ).isType( l3 ) &&
406
      mQ.get( 3 ).isType( l4 );
407
  }
408
409
  private boolean match(
410
    final LexemeType[] l1,
411
    final LexemeType l2,
412
    final LexemeType l3,
413
    final LexemeType l4 ) {
414
    return mQ.get( 0 ).isType( l1 ) &&
415
      mQ.get( 1 ).isType( l2 ) &&
416
      mQ.get( 2 ).isType( l3 ) &&
417
      mQ.get( 3 ).isType( l4 );
418
  }
419
420
  private boolean match(
421
    final LexemeType l1,
422
    final LexemeType l2,
423
    final LexemeType[] l3,
424
    final LexemeType l4 ) {
425
    return mQ.get( 0 ).isType( l1 ) &&
426
      mQ.get( 1 ).isType( l2 ) &&
427
      mQ.get( 2 ).isType( l3 ) &&
428
      mQ.get( 3 ).isType( l4 );
429
  }
430
431
  private boolean match(
432
    final LexemeType l1,
433
    final LexemeType l2,
434
    final LexemeType l3,
435
    final LexemeType[] l4 ) {
436
    return mQ.get( 0 ).isType( l1 ) &&
437
      mQ.get( 1 ).isType( l2 ) &&
438
      mQ.get( 2 ).isType( l3 ) &&
439
      mQ.get( 3 ).isType( l4 );
440
  }
441
442
  private boolean match(
443
    final LexemeType[] l1,
444
    final LexemeType l2,
445
    final LexemeType[] l3,
446
    final LexemeType l4 ) {
32
  private static final LexemeType[] DASHES = {
33
    DASH_EN, DASH_EM, DASH_LINE, HYPHEN
34
  };
35
36
  private static final LexemeType[] PUNCT_PERIOD_ELLIPSIS_DASH = {
37
    PUNCT, PERIOD, ELLIPSIS, DASH_EN, DASH_EM, DASH_LINE, HYPHEN
38
  };
39
40
  private static final LexemeType[] PUNCT_PERIOD = {
41
    PUNCT, PERIOD
42
  };
43
44
  private static final LexemeType[] SPACE_DASH_ENDING = {
45
    SPACE, DASH_EN, DASH_EM, DASH_LINE, HYPHEN, ENDING
46
  };
47
48
  private static final LexemeType[] SPACE_ENDING = {
49
    SPACE, ENDING
50
  };
51
52
  private static final LexemeType[] SPACE_HYPHEN = {
53
    SPACE, HYPHEN
54
  };
55
56
  private static final LexemeType[] SPACE_PUNCT = {
57
    SPACE, PUNCT
58
  };
59
60
  private static final LexemeType[] SPACE_SOT = {
61
    SPACE, SOT
62
  };
63
64
  /**
65
   * Single quotes preceded by these {@link LexemeType}s may be opening quotes.
66
   */
67
  private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE =
68
    new LexemeType[]{
69
      LexemeType.SOT, SPACE,
70
      DASH_EN, DASH_EM, DASH_LINE, HYPHEN,
71
      QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP
72
    };
73
74
  /**
75
   * Single quotes succeeded by these {@link LexemeType}s may be opening quotes.
76
   */
77
  private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE =
78
    new LexemeType[]{
79
      WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE
80
    };
81
82
  /**
83
   * Single quotes preceded by these {@link LexemeType}s may be closing quotes.
84
   */
85
  private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE =
86
    new LexemeType[]{
87
      WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE
88
    };
89
90
  /**
91
   * Single quotes succeeded by these {@link LexemeType}s may be closing quotes.
92
   */
93
  private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE =
94
    new LexemeType[]{
95
      SPACE,
96
      DASH_EN, DASH_EM, DASH_LINE, HYPHEN,
97
      PUNCT, PERIOD, ELLIPSIS, QUOTE_DOUBLE, CLOSING_GROUP, ENDING
98
    };
99
100
  /**
101
   * Double quotes preceded by these {@link LexemeType}s may be opening quotes.
102
   */
103
  private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE =
104
    new LexemeType[]{
105
      LexemeType.SOT, SPACE,
106
      DASH_EN, DASH_EM, DASH_LINE, HYPHEN,
107
      EQUALS, OPENING_GROUP, EOL, EOP
108
    };
109
110
  /**
111
   * Double quotes succeeded by these {@link LexemeType}s may be opening quotes.
112
   */
113
  private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE =
114
    new LexemeType[]{
115
      WORD, PUNCT, NUMBER,
116
      DASH_EN, DASH_EM, DASH_LINE, HYPHEN,
117
      ELLIPSIS, OPENING_GROUP,
118
      QUOTE_SINGLE, QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING,
119
      QUOTE_DOUBLE
120
    };
121
122
  /**
123
   * Double quotes preceded by these {@link LexemeType}s may be closing quotes.
124
   */
125
  private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE =
126
    new LexemeType[]{
127
      WORD, NUMBER, PERIOD, PUNCT,
128
      DASH_EN, DASH_EM, DASH_LINE, HYPHEN,
129
      ELLIPSIS, CLOSING_GROUP,
130
      QUOTE_SINGLE, QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING
131
    };
132
133
  /**
134
   * Double quotes succeeded by these {@link LexemeType}s may be closing quotes.
135
   */
136
  private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE =
137
    new LexemeType[]{
138
      SPACE, PUNCT, PERIOD, EQUALS,
139
      DASH_EN, DASH_EM, DASH_LINE, HYPHEN,
140
      QUOTE_SINGLE, CLOSING_GROUP, ENDING
141
    };
142
143
  private final CircularFifoQueue<Lexeme> mQ = new CircularFifoQueue<>( 4 );
144
  private final String mText;
145
  private final Contractions mContractions;
146
  private final Consumer<Token> mConsumer;
147
148
  public QuoteEmitter(
149
    final String text,
150
    final Contractions contractions,
151
    final Consumer<Token> consumer
152
  ) {
153
    assert text != null;
154
    assert contractions != null;
155
156
    mText = text;
157
    mContractions = contractions;
158
    mConsumer = consumer;
159
  }
160
161
  /**
162
   * Scans the given text document for quotation marks and passes them to the
163
   * given {@link Token} {@link Consumer}.
164
   *
165
   * @param text         The prose to lex.
166
   * @param contractions List of ambiguous and unambiguous contractions.
167
   * @param consumer     Receives each {@link Token} created for the text.
168
   */
169
  public static void analyze(
170
    final String text,
171
    final Contractions contractions,
172
    final Consumer<Token> consumer,
173
    final LexerFilter filter
174
  ) {
175
    final var emitter = new QuoteEmitter( text, contractions, consumer );
176
    Lexer.lex( text, emitter, filter );
177
  }
178
179
  /**
180
   * @param lexeme The lexeme to enqueue; parsing runs once four are queued.
181
   */
182
  @Override
183
  public void accept( final Lexeme lexeme ) {
184
    mQ.add( lexeme );
185
186
    if( mQ.size() == 4 ) {
187
      parse();
188
    }
189
  }
190
191
  private void parse() {
192
    final var lex1 = mQ.get( 0 );
193
    final var lex2 = mQ.get( 1 );
194
    final var lex3 = mQ.get( 2 );
195
    final var lex4 = mQ.get( 3 );
196
197
    // <y'all>, <Ph.D.'ll>, <20's>, <she's>
198
    if( match( WORD_PERIOD_NUMBER, QUOTE_SINGLE, WORD, ANY ) ) {
199
      emit( QUOTE_APOSTROPHE, lex2 );
200
    }
201
    // <'n'>, <'N'>, <'owlin'>
202
    else if(
203
      match( ANY, QUOTE_SINGLE, WORD, QUOTE_SINGLE ) &&
204
        mContractions.beganEndedUnambiguously( lex3.toString( mText ) )
205
    ) {
206
      emit( QUOTE_APOSTROPHE, lex2 );
207
      emit( QUOTE_APOSTROPHE, lex4 );
208
      mQ.set( Lexeme.NONE, 3 );
209
    }
210
    // <2''>
211
    else if( match( NUMBER, QUOTE_SINGLE, QUOTE_SINGLE, ANY ) ) {
212
      // Force double primes to conform to the same constructor usage. This
213
      // simplifies the tokens and reduces some memory usage.
214
      final var lex = new Lexeme( PRIME_DOUBLE, lex2.began(), lex3.ended() );
215
216
      emit( QUOTE_PRIME_DOUBLE, lex );
217
      mQ.set( Lexeme.NONE, 2 );
218
    }
219
    // <2'>
220
    else if( match( NUMBER, QUOTE_SINGLE, ANY, ANY ) ) {
221
      emit( QUOTE_PRIME_SINGLE, lex2 );
222
    }
223
    // <2">
224
    else if( match( NUMBER, QUOTE_DOUBLE, ANY, ANY ) ) {
225
      emit( QUOTE_PRIME_DOUBLE, lex2 );
226
    }
227
    // <thinkin'>
228
    else if(
229
      match( WORD, QUOTE_SINGLE, ANY, ANY ) &&
230
        mContractions.endedUnambiguously( lex1.toString( mText ) )
231
    ) {
232
      emit( QUOTE_APOSTROPHE, lex2 );
233
    }
234
    // <'02>
235
    else if( match( ANY, QUOTE_SINGLE, NUMBER, SPACE_PUNCT ) ) {
236
      emit( QUOTE_APOSTROPHE, lex2 );
237
    }
238
    // <'20s>
239
    else if(
240
      match( ANY, QUOTE_SINGLE, NUMBER, WORD ) &&
241
        "s".equalsIgnoreCase( lex4.toString( mText ) )
242
    ) {
243
      emit( QUOTE_APOSTROPHE, lex2 );
244
    }
245
    // <.'\n>
246
    else if( match( PUNCT_PERIOD_ELLIPSIS_DASH, QUOTE_SINGLE, ENDING, ANY ) ) {
247
      emit( QUOTE_CLOSING_SINGLE, lex2 );
248
    }
249
    // <\'>
250
    else if( match( ESC_SINGLE, ANY, ANY, ANY ) ) {
251
      emit( QUOTE_STRAIGHT_SINGLE, lex1 );
252
    }
253
    // <\">
254
    else if( match( ESC_DOUBLE, ANY, ANY, ANY ) ) {
255
      emit( QUOTE_STRAIGHT_DOUBLE, lex1 );
256
257
      // <\"'--->
258
      if( match( ESC_DOUBLE, QUOTE_SINGLE, SPACE_DASH_ENDING, ANY ) ) {
259
        emit( QUOTE_CLOSING_SINGLE, lex2 );
260
      }
261
    }
262
    // <---'" >
263
    else if( match( DASHES, QUOTE_SINGLE, QUOTE_DOUBLE, SPACE_ENDING ) ) {
264
      emit( QUOTE_CLOSING_SINGLE, lex2 );
265
    }
266
    // <o’-lantern>, <o' fellow>, <O'-the>
267
    else if(
268
      match( WORD, QUOTE_SINGLE, SPACE_HYPHEN, WORD ) &&
269
        "o".equalsIgnoreCase( lex1.toString( mText ) )
270
    ) {
271
      emit( QUOTE_APOSTROPHE, lex2 );
272
    }
273
    // <"">, <"...>, <"word>, <---"word>
274
    else if(
275
      match(
276
        LEADING_QUOTE_OPENING_DOUBLE, QUOTE_DOUBLE,
277
        LAGGING_QUOTE_OPENING_DOUBLE, ANY
278
      )
279
    ) {
280
      emit( QUOTE_OPENING_DOUBLE, lex2 );
281
    }
282
    // <..."'>, <word"'>, <?"'>, <word"?>
283
    else if(
284
      match(
285
        LEADING_QUOTE_CLOSING_DOUBLE, QUOTE_DOUBLE,
286
        LAGGING_QUOTE_CLOSING_DOUBLE, ANY
287
      )
288
    ) {
289
      emit( QUOTE_CLOSING_DOUBLE, lex2 );
290
    }
291
    // < ''E>
292
    else if( match( SPACE_SOT, QUOTE_SINGLE, QUOTE_SINGLE, WORD ) ) {
293
      // Consume both immediately to avoid the false ambiguity <'e>.
294
      emit( QUOTE_OPENING_SINGLE, lex2 );
295
      emit( QUOTE_APOSTROPHE, lex3 );
296
      mQ.set( Lexeme.NONE, 1 );
297
      mQ.set( Lexeme.NONE, 2 );
298
    }
299
    // <'...>, <'word>, <---'word>, < 'nation>
300
    else if(
301
      match(
302
        LEADING_QUOTE_OPENING_SINGLE, QUOTE_SINGLE,
303
        LAGGING_QUOTE_OPENING_SINGLE, ANY )
304
    ) {
305
      final var word = lex3.toString( mText );
306
307
      if( mContractions.beganAmbiguously( word ) ) {
308
        emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
309
      }
310
      else if( mContractions.beganUnambiguously( word ) ) {
311
        emit( QUOTE_APOSTROPHE, lex2 );
312
      }
313
      // <"'"nested>
314
      else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, WORD ) ) {
315
        emit( QUOTE_OPENING_SINGLE, lex2 );
316
      }
317
      // <"'" >
318
      else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) {
319
        emit( QUOTE_AMBIGUOUS_SINGLE, lex2 );
320
      }
321
      // < '" >
322
      else if( match( ANY, QUOTE_SINGLE, LAGGING_QUOTE_OPENING_SINGLE, ANY ) ) {
323
        emit( QUOTE_OPENING_SINGLE, lex2 );
324
      }
325
      // Ambiguous
326
      else {
327
        emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
328
      }
329
    }
330
    // <"'--- >
331
    else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, DASHES, ANY ) ) {
332
      emit( QUOTE_OPENING_SINGLE, lex2 );
333
    }
334
    // <word'">, <...'--->, <"' >
335
    else if(
336
      match(
337
        LEADING_QUOTE_CLOSING_SINGLE, QUOTE_SINGLE,
338
        LAGGING_QUOTE_CLOSING_SINGLE, ANY
339
      )
340
    ) {
341
      final var word = lex1.toString( mText );
342
343
      if( mContractions.endedAmbiguously( word ) ) {
344
        emit( QUOTE_AMBIGUOUS_LAGGING, lex2 );
345
      }
346
      else {
347
        emit( QUOTE_CLOSING_SINGLE, lex2 );
348
      }
349
    }
350
    // <word';> (contraction inferred by previous matches)
351
    else if( match( WORD, QUOTE_SINGLE, PUNCT_PERIOD, ANY ) ) {
352
      emit( QUOTE_APOSTROPHE, lex2 );
353
    }
354
    // <---'">
355
    else if( match( DASHES, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) {
356
      emit( QUOTE_CLOSING_SINGLE, lex2 );
357
    }
358
    // <'42>, <'-3.14>
359
    else if( match( ANY, QUOTE_SINGLE, NUMBER, ANY ) ) {
360
      emit( QUOTE_OPENING_SINGLE, lex2 );
361
    }
362
    // <PRE-PARSED><'---.>
363
    else if( match( LexemeType.NONE, QUOTE_SINGLE, ANY, ANY ) ) {
364
      emit( QUOTE_CLOSING_SINGLE, lex2 );
365
    }
366
    // <''Cause >
367
    else if( match( QUOTE_SINGLE, QUOTE_SINGLE, WORD, ANY ) ) {
368
      final var word = lex3.toString( mText );
369
370
      if( mContractions.beganAmbiguously( word ) ) {
371
        emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
372
      }
373
      else if( mContractions.beganUnambiguously( word ) ) {
374
        emit( QUOTE_APOSTROPHE, lex2 );
375
      }
376
      else {
377
        emit( QUOTE_AMBIGUOUS_SINGLE, lex2 );
378
      }
379
    }
380
    // <'"Trouble>
381
    else if( match( QUOTE_SINGLE, QUOTE_DOUBLE, WORD, ANY ) ) {
382
      emit( QUOTE_OPENING_DOUBLE, lex2 );
383
    }
384
    // International quotation marks.
385
    else if( match( ANY, QUOTE_DOUBLE_OPENING, ANY, ANY ) ) {
386
      emit( QUOTE_OPENING_DOUBLE, lex2 );
387
    }
388
    else if( match( ANY, QUOTE_SINGLE_OPENING, ANY, ANY ) ) {
389
      emit( QUOTE_OPENING_SINGLE, lex2 );
390
    }
391
    else if( match( ANY, QUOTE_DOUBLE_CLOSING, ANY, ANY ) ) {
392
      emit( QUOTE_CLOSING_DOUBLE, lex2 );
393
    }
394
    else if( match( ANY, QUOTE_SINGLE_CLOSING, ANY, ANY ) ) {
395
      emit( QUOTE_CLOSING_SINGLE, lex2 );
396
    }
397
    // Ambiguous (no match)
398
    else if( match( ANY, QUOTE_SINGLE, ANY, ANY ) ) {
399
      emit( QUOTE_AMBIGUOUS_SINGLE, lex2 );
400
    }
401
    else if( match( ANY, QUOTE_DOUBLE, ANY, ANY ) ) {
402
      emit( QUOTE_AMBIGUOUS_DOUBLE, lex2 );
403
    }
404
    else if( match( ANY, ELLIPSIS, ANY, ANY ) ) {
405
      emit( QUOTE_ELLIPSIS, lex2 );
406
    }
407
    else if( match( ANY, DASH_EN, ANY, ANY ) ) {
408
      emit( QUOTE_DASH_EN, lex2 );
409
    }
410
    else if( match( ANY, DASH_EM, ANY, ANY ) ) {
411
      emit( QUOTE_DASH_EM, lex2 );
412
    }
413
  }
414
415
  private void emit( final TokenType tokenType, final Lexeme lexeme ) {
416
    mConsumer.accept( new Token( tokenType, lexeme ) );
417
  }
418
419
  private boolean match(
420
    final LexemeType l1,
421
    final LexemeType l2,
422
    final LexemeType l3,
423
    final LexemeType l4 ) {
424
    return mQ.get( 0 ).isType( l1 ) &&
425
      mQ.get( 1 ).isType( l2 ) &&
426
      mQ.get( 2 ).isType( l3 ) &&
427
      mQ.get( 3 ).isType( l4 );
428
  }
429
430
  private boolean match(
431
    final LexemeType[] l1,
432
    final LexemeType l2,
433
    final LexemeType l3,
434
    final LexemeType l4 ) {
435
    return mQ.get( 0 ).isType( l1 ) &&
436
      mQ.get( 1 ).isType( l2 ) &&
437
      mQ.get( 2 ).isType( l3 ) &&
438
      mQ.get( 3 ).isType( l4 );
439
  }
440
441
  private boolean match(
442
    final LexemeType l1,
443
    final LexemeType l2,
444
    final LexemeType[] l3,
445
    final LexemeType l4 ) {
446
    return mQ.get( 0 ).isType( l1 ) &&
447
      mQ.get( 1 ).isType( l2 ) &&
448
      mQ.get( 2 ).isType( l3 ) &&
449
      mQ.get( 3 ).isType( l4 );
450
  }
451
452
  private boolean match(
453
    final LexemeType l1,
454
    final LexemeType l2,
455
    final LexemeType l3,
456
    final LexemeType[] l4 ) {
457
    return mQ.get( 0 ).isType( l1 ) &&
458
      mQ.get( 1 ).isType( l2 ) &&
459
      mQ.get( 2 ).isType( l3 ) &&
460
      mQ.get( 3 ).isType( l4 );
461
  }
462
463
  private boolean match(
464
    final LexemeType[] l1,
465
    final LexemeType l2,
466
    final LexemeType[] l3,
467
    final LexemeType l4 ) {
468
    return mQ.get( 0 ).isType( l1 ) &&
469
      mQ.get( 1 ).isType( l2 ) &&
470
      mQ.get( 2 ).isType( l3 ) &&
471
      mQ.get( 3 ).isType( l4 );
472
  }
473
474
  private boolean match(
475
    final LexemeType[] l1,
476
    final LexemeType l2,
477
    final LexemeType l3,
478
    final LexemeType[] l4 ) {
447 479
    return mQ.get( 0 ).isType( l1 ) &&
448 480
      mQ.get( 1 ).isType( l2 ) &&
M src/main/java/com/whitemagicsoftware/keenquotes/parser/Token.java
37 37
    ENTITIES.put( QUOTE_PRIME_QUADRUPLE, "&qprime;" );
38 38
    ENTITIES.put( QUOTE_ELLIPSIS, "&hellip;" );
39
    ENTITIES.put( QUOTE_DASH_EN, "&ndash;" );
40
    ENTITIES.put( QUOTE_DASH_EM, "&mdash;" );
41
    ENTITIES.put( QUOTE_DASH_MINUS, "&minus;" );
39 42
  }
40 43
...
58 61
    CHARS.put( QUOTE_PRIME_QUADRUPLE, "⁗" );
59 62
    CHARS.put( QUOTE_ELLIPSIS, "…" );
63
    CHARS.put( QUOTE_DASH_MINUS, "−" );
64
    CHARS.put( QUOTE_DASH_EN, "–" );
65
    CHARS.put( QUOTE_DASH_EM, "—" );
60 66
  }
61 67
M src/main/java/com/whitemagicsoftware/keenquotes/parser/TokenType.java
22 22
  QUOTE_AMBIGUOUS_SINGLE( "single-ambiguous" ),
23 23
  QUOTE_ELLIPSIS,
24
  QUOTE_DASH_MINUS,
25
  QUOTE_DASH_EN,
26
  QUOTE_DASH_EM,
24 27
  NONE;
25 28
M src/main/java/com/whitemagicsoftware/keenquotes/util/FastCharacterIterator.java
18 18
 */
19 19
public final class FastCharacterIterator {
20
21 20
  private final String mS;
22 21
  private final int mLen;
M src/test/java/com/whitemagicsoftware/keenquotes/lex/LexerTest.java
49 49
  @Test
50 50
  void test_Lexing_PunctuationMarks_EmitPunctuationMarks() {
51
    testType( "-", HYPHEN );
52
    testType( "--", DASH_EN );
53
    testType( "–", DASH_EN );
54
    testType( "---", DASH_EM );
55
    testType( "--- -- -", DASH_EM, SPACE, DASH_EN, SPACE, HYPHEN );
56
    testType( "—", DASH_EM );
57
    testType( "----", DASH_LINE );
58
    testType( "-----", DASH_LINE );
59
    testType( "------", DASH_LINE );
60
    testType( "—-—", DASH_EM, HYPHEN, DASH_EM );
51 61
    testType( "!", PUNCT );
52 62
    testType( ";", PUNCT );
53 63
    testType( "\\", PUNCT );
54 64
    testType( ".", PERIOD );
55
    testType( "-", HYPHEN );
56
    testType( "--", DASH );
57
    testType( "---", DASH );
58
    testType( "–", DASH );
59
    testType( "―", DASH );
60
    testType( "—", DASH );
61
    testType( "—-—", DASH );
62 65
    testType( "...", ELLIPSIS );
63 66
    testType( ". .", ELLIPSIS );
...
144 147
    var counter = new AtomicInteger();
145 148
146
    Lexer.lex( text, lexeme -> {
147
      // Ignore the SOT and EOT lexemes (avoids duplication). Each test will
148
      // include EOL, EOP, and EOT tokens due to the lexer's algorithm.
149
      if( !lexeme.isType( SOT, EOT ) && counter.get() < elements.size() ) {
150
        final var expected = elements.get( counter.getAndIncrement() );
151
        final var actual = f.apply( lexeme, text );
149
    Lexer.lex(
150
      text, lexeme -> {
151
        // Ignore the SOT and EOT lexemes (avoids duplication). Each test will
152
        // include EOL, EOP, and EOT tokens due to the lexer's algorithm.
153
        if( !lexeme.isType( SOT, EOT ) && counter.get() < elements.size() ) {
154
          final var expected = elements.get( counter.getAndIncrement() );
155
          final var actual = f.apply( lexeme, text );
152 156
153
        assertEquals( expected, actual );
154
      }
155
    }, filter );
157
          assertEquals( expected, actual );
158
        }
159
      }, filter
160
    );
156 161
157 162
    // Ensure all expected values are matched (verify end of text reached).
M src/test/resources/com/whitemagicsoftware/keenquotes/texts/unambiguous-1-pass.txt
63 63
64 64
With--"air quotes"--and dashes.
65
With--&ldquo;air quotes&rdquo;--and dashes.
65
With&ndash;&ldquo;air quotes&rdquo;&ndash;and dashes.
66 66
67 67
"Not all open quotes are closed...
68 68
&ldquo;Not all open quotes are closed&hellip;
69 69
70 70
"---retroactively!"
71
&ldquo;---retroactively!&rdquo;
71
&ldquo;&mdash;retroactively!&rdquo;
72 72
73 73
"And this" and "and this" and "and this" and "and another."
74 74
&ldquo;And this&rdquo; and &ldquo;and this&rdquo; and &ldquo;and this&rdquo; and &ldquo;and another.&rdquo;
75 75
76 76
"Will'll invite?" Mrs. Thorne asked;\n"because he's coming--he's at the gate."
77
&ldquo;Will&apos;ll invite?&rdquo; Mrs. Thorne asked;\n&ldquo;because he&apos;s coming--he&apos;s at the gate.&rdquo;
77
&ldquo;Will&apos;ll invite?&rdquo; Mrs. Thorne asked;\n&ldquo;because he&apos;s coming&ndash;he&apos;s at the gate.&rdquo;
78 78
79 79
Model "T2000"
...
94 94
'...even better!'
95 95
&lsquo;&hellip;even better!&rsquo;
96
97
With-'imaginary'-dashes.
98
With-&lsquo;imaginary&rsquo;-dashes.
96 99
97 100
With--'imaginary'--dashes.
98
With--&lsquo;imaginary&rsquo;--dashes.
101
With&ndash;&lsquo;imaginary&rsquo;&ndash;dashes.
102
103
With---'imaginary'---dashes.
104
With&mdash;&lsquo;imaginary&rsquo;&mdash;dashes.
99 105
100 106
'It's a beautiful day!'
...
171 177
172 178
"'I'm in danger. Help? Take me to'--an address on Van Ness Avenue.
173
&ldquo;&lsquo;I&apos;m in danger. Help? Take me to&rsquo;--an address on Van Ness Avenue.
179
&ldquo;&lsquo;I&apos;m in danger. Help? Take me to&rsquo;&ndash;an address on Van Ness Avenue.
174 180
175 181
"Ah," she said, "you knew! He told you--and you said 'my dear'! How could you?!"
176
&ldquo;Ah,&rdquo; she said, &ldquo;you knew! He told you--and you said &lsquo;my dear&rsquo;! How could you?!&rdquo;
182
&ldquo;Ah,&rdquo; she said, &ldquo;you knew! He told you&ndash;and you said &lsquo;my dear&rsquo;! How could you?!&rdquo;
177 183
178 184
shouldn't drop letters, nor say "ain't" or "hain't."
179 185
shouldn&apos;t drop letters, nor say &ldquo;ain&apos;t&rdquo; or &ldquo;hain&apos;t.&rdquo;
180 186
181 187
"’Kearney lives on the banks of Killarney—’
182
&ldquo;&rsquo;Kearney lives on the banks of Killarney—&rsquo;
188
&ldquo;&rsquo;Kearney lives on the banks of Killarney&mdash;&rsquo;
183 189
184 190
# ########################################################################
...
216 222
217 223
"She said, 'Llamas'll languish, they'll--
218
&ldquo;She said, &lsquo;Llamas&apos;ll languish, they&apos;ll--
224
&ldquo;She said, &lsquo;Llamas&apos;ll languish, they&apos;ll&ndash;
219 225
220 226
'The 70s are my favorite numbers,' she said.
...
240 246
241 247
Computer says, "'It is mysteries---'"
242
Computer says, &ldquo;&lsquo;It is mysteries---&rsquo;&rdquo;
248
Computer says, &ldquo;&lsquo;It is mysteries&mdash;&rsquo;&rdquo;
243 249
M src/test/resources/com/whitemagicsoftware/keenquotes/texts/unambiguous-2-pass.txt
105 105
106 106
"Jane said, ''E'll be spooky, Sam's son with the jack-o'-lantern!'" said the O'Mally twins'---y'know---ghosts in unison.
107
&ldquo;Jane said, &lsquo;&apos;E&apos;ll be spooky, Sam&apos;s son with the jack-o&apos;-lantern!&rsquo;&rdquo; said the O&apos;Mally twins&apos;---y&apos;know---ghosts in unison.
107
&ldquo;Jane said, &lsquo;&apos;E&apos;ll be spooky, Sam&apos;s son with the jack-o&apos;-lantern!&rsquo;&rdquo; said the O&apos;Mally twins&apos;&mdash;y&apos;know&mdash;ghosts in unison.
108 108
109 109
''Sup, Doc?'
...
123 123
124 124
"Not much o' fellow-creaturs, I think, Miss; all I know—my old master, as war a knowin' man, used to say, says he, 'If e'er I sow my wheat wi'out brinin', I'm a Dutchman,' says he; an' that war as much as to say as a Dutchman war a fool, or next door. Nay, nay, I aren't goin' to bother mysen about Dutchmen. There's fools enoo, an' rogues enoo, wi'out lookin' i' books for 'em."
125
&ldquo;Not much o&apos; fellow-creaturs, I think, Miss; all I know—my old master, as war a knowin&apos; man, used to say, says he, &lsquo;If e&apos;er I sow my wheat wi&apos;out brinin&apos;, I&apos;m a Dutchman,&rsquo; says he; an&apos; that war as much as to say as a Dutchman war a fool, or next door. Nay, nay, I aren&apos;t goin&apos; to bother mysen about Dutchmen. There&apos;s fools enoo, an&apos; rogues enoo, wi&apos;out lookin&apos; i&apos; books for &apos;em.&rdquo;
125
&ldquo;Not much o&apos; fellow-creaturs, I think, Miss; all I know&mdash;my old master, as war a knowin&apos; man, used to say, says he, &lsquo;If e&apos;er I sow my wheat wi&apos;out brinin&apos;, I&apos;m a Dutchman,&rsquo; says he; an&apos; that war as much as to say as a Dutchman war a fool, or next door. Nay, nay, I aren&apos;t goin&apos; to bother mysen about Dutchmen. There&apos;s fools enoo, an&apos; rogues enoo, wi&apos;out lookin&apos; i&apos; books for &apos;em.&rdquo;
126 126
127 127
I leave proofs to those who 've seen 'em;\n\nBut this I heard her say, and can't be wrong\n\nAnd all may think which way their judgments lean 'em,\n\n''T is strange---the Hebrew noun which means "I am,"\n\nThe English always use to govern d--n.'
128
I leave proofs to those who &apos;ve seen &apos;em;\n\nBut this I heard her say, and can&apos;t be wrong\n\nAnd all may think which way their judgments lean &apos;em,\n\n&lsquo;&apos;T is strange---the Hebrew noun which means &ldquo;I am,&rdquo;\n\nThe English always use to govern d--n.&rsquo;
128
I leave proofs to those who &apos;ve seen &apos;em;\n\nBut this I heard her say, and can&apos;t be wrong\n\nAnd all may think which way their judgments lean &apos;em,\n\n&lsquo;&apos;T is strange&mdash;the Hebrew noun which means &ldquo;I am,&rdquo;\n\nThe English always use to govern d&ndash;n.&rsquo;
129 129
130 130
''E's got a 'ittle box 'n a big 'un,' she said, 'wit' th' 'ittle 'un 'bout 2'×6". An' no, y'ain't cryin' on th' "soap box" to me no mo, y'hear. 'Cause it 'tweren't ever a spec o' fun!' I says to my frien'.
...
144 144
145 145
"You 'cause---" all fifteen years' worth.
146
&ldquo;You &apos;cause---&rdquo; all fifteen years&apos; worth.
146
&ldquo;You &apos;cause&mdash;&rdquo; all fifteen years&apos; worth.
147 147
148 148
"'---has a prison.'"
149
&ldquo;&lsquo;---has a prison.&rsquo;&rdquo;
149
&ldquo;&lsquo;&mdash;has a prison.&rsquo;&rdquo;
150 150
151 151
"Teach 'em t'be more 'an we ever been!"
M src/test/resources/com/whitemagicsoftware/keenquotes/texts/xml.txt
11 11
<bold>&apos;twas</bold> redeemed for the <em>cat</em>&apos;s eye
12 12
13
<a href="https://x.org" title="X's Homepage">X11's bomb</a>
14
<a href="https://x.org" title="X's Homepage">X11&apos;s bomb</a>
13
<a href="https://dj.ca" title="X's Homepage">X11's bomb</a>
14
<a href="https://dj.ca" title="X's Homepage">X11&apos;s bomb</a>
15
16
<a href="https://d--j.ca">D--J</a>
17
<a href="https://d--j.ca">D&ndash;J</a>
15 18
16 19
<body><p>Test lungs' capacity; shovel's handle hit his head's electronics.</p><p>Hoped to bring him 'round.</p></body>