Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git
M .gitignore
1
# Ignore Gradle project-specific cache directory
2 1
.gradle
3
4
# Ignore Gradle build output directory
5
lib/build
6
7 2
.idea
8 3
gradle
9 4
gradlew*
5
build
6
10 7
M README.md
3 3
A Java library to convert straight quotes into curly quotes.
4 4
5
# Requirements
6
7
Download and install OpenJDK 16 or greater.
8
9
# Download
10
11
Download the `.jar` file from this repository.
12
13
# Run
14
15
Run the software as follows:
16
17
    java -jar keenquotes.jar < src.txt > dst.txt 2> err.txt
18
19
Where:
20
21
* `src.txt` -- Input document file that contains straight quotes.
22
* `dst.txt` -- Output document file that will contain curly quotes.
23
* `err.txt` -- Error file that will note ambiguous conversion errors.
24
25
For help, run the software as follows:
26
27
    java -jar keenquotes.jar -h
28
5 29
# Software Design
6 30
M build.gradle
1 1
plugins {
2 2
  id 'application'
3
  id 'com.palantir.git-version' version '0.12.3'
3 4
}
4 5
5 6
group 'com.whitemagicsoftware'
6
version '1.0'
7 7
8 8
repositories {
...
22 22
  main {
23 23
    java {
24
      srcDirs = ["src/main"]
24
      srcDirs = ["src/main/java"]
25 25
    }
26 26
  }
...
35 35
  mainClassName = "com.whitemagicsoftware.${applicationName}.KeenQuotes"
36 36
}
37
38
version = gitVersion()
39
40
def resourceDir = sourceSets.main.resources.srcDirs[0]
41
final File propertiesFile = file("${resourceDir}/com/whitemagicsoftware/${applicationName}/app.properties")
42
propertiesFile.write("application.version=${version}")
37 43
38 44
jar {
...
57 63
  useJUnitPlatform()
58 64
}
59
60 65
M src/main/java/com/whitemagicsoftware/keenquotes/Contractions.java
2 2
package com.whitemagicsoftware.keenquotes;
3 3
4
import java.util.HashSet;
5
import java.util.Set;
6
7
import static java.util.Collections.emptySet;
8
9
/**
10
 * Placeholder for various types of contractions.
11
 */
12
public class Contractions {
13
14
  private final Builder mBuilder;
15
16
  /**
   * Creates an instance that answers queries using the word sets held by
   * the given {@link Builder}.
   *
   * @param builder The source of contraction word sets, must not be
   *                {@code null}.
   */
  private Contractions( final Builder builder ) {
    assert builder != null;

    mBuilder = builder;
  }
20
21
  /**
22
   * Allows constructing a list of custom contractions.
23
   */
24
  @SuppressWarnings( "unused" )
25
  public static class Builder {
26
    private final Set<String> mBeganUnambiguous = new HashSet<>();
27
    private final Set<String> mEndedUnambiguous = new HashSet<>();
28
    private final Set<String> mBeganAmbiguous = new HashSet<>();
29
    private final Set<String> mEndedAmbiguous = new HashSet<>();
30
31
    public void withBeganUnambiguous( final Set<String> words ) {
32
      mBeganUnambiguous.addAll( words );
33
    }
34
35
    public void withEndedUnambiguous( final Set<String> words ) {
36
      mEndedUnambiguous.addAll( words );
37
    }
38
39
    public void withBeganAmbiguous( final Set<String> words ) {
40
      mBeganAmbiguous.addAll( words );
41
    }
42
43
    public void withEndedAmbiguous( final Set<String> words ) {
44
      mEndedAmbiguous.addAll( words );
45
    }
46
47
    /**
48
     * Constructs a new set of {@link Contractions} that can be configured
49
     * using this {@link Builder} instance.
50
     *
51
     * @return {@link Contractions} suitable for use with parsing text.
52
     */
53
    public Contractions build() {
54
      mBeganUnambiguous.addAll( from( mBeganUnambiguous, BEGAN_UNAMBIGUOUS ) );
55
      mEndedUnambiguous.addAll( from( mEndedUnambiguous, ENDED_UNAMBIGUOUS ) );
56
      mBeganAmbiguous.addAll( from( mBeganAmbiguous, BEGAN_AMBIGUOUS ) );
57
      mEndedAmbiguous.addAll( from( mEndedAmbiguous, ENDED_AMBIGUOUS ) );
58
59
      return new Contractions( this );
60
    }
61
62
    /**
63
     * This returns the {@code fallback} {@link Set} if {@code src} is empty;
64
     * otherwise, this returns the empty {@link Set}.
65
     *
66
     * @param src      A set of contractions, possibly empty.
67
     * @param fallback The default values to use if {@code src} is empty.
68
     * @param <T>      The type of data used by both {@link Set}s.
69
     * @return An empty {@link Set} if the {@code src} contains at least one
70
     * element; otherwise, this will return {@code fallback}.
71
     */
72
    private static <T> Set<T> from( final Set<T> src, final Set<T> fallback ) {
73
      assert src != null;
74
      assert fallback != null;
75
      return src.isEmpty() ? fallback : emptySet();
76
    }
77
  }
78
79
  /**
80
   * Answers whether the given word is a contraction that always starts
81
   * with an apostrophe. The comparison is case insensitive. This must
82
   * only be called when a straight quote is followed by a word.
83
   *
84
   * @param word The word to compare against the list of known unambiguous
85
   *             contractions.
86
   * @return {@code true} when the given word is in the set of unambiguous
87
   * contractions.
88
   */
89
  public boolean beganUnambiguously( final String word ) {
90
    assert word != null;
91
    return getBeganUnambiguous().contains( word.toLowerCase() );
92
  }
93
94
  /**
95
   * Answers whether the given word could be a contraction but is also a
96
   * valid word in non-contracted form.
97
   *
98
   * @param word The word to compare against the list of known ambiguous
99
   *             contractions.
100
   * @return {@code true} when the given word is in the set of ambiguous
101
   * contractions.
102
   */
103
  public boolean beganAmbiguously( final String word ) {
104
    assert word != null;
105
    return getBeganAmbiguous().contains( word.toLowerCase() );
106
  }
107
108
  public boolean endedUnambiguously( final String word ) {
109
    assert word != null;
110
    return getEndedUnambiguous().contains( word.toLowerCase() );
111
  }
112
113
  public boolean endedAmbiguously( final String word ) {
114
    assert word != null;
115
    final var check = word.toLowerCase();
116
117
    // Ensure that 'n' isn't matched for ambiguity by enforcing length, yet
118
    // allow o' to match because 'a sentence can end with the letter o'.
119
    return getEndedAmbiguous().contains( check ) ||
120
      check.endsWith( "s" ) || check.endsWith( "z" ) ||
121
      check.endsWith( "x" ) || (check.length() > 1 && check.endsWith( "n" ));
122
  }
123
124
  private Set<String> getBeganUnambiguous() {
125
    return mBuilder.mBeganUnambiguous;
126
  }
127
128
  private Set<String> getEndedUnambiguous() {
129
    return mBuilder.mEndedUnambiguous;
130
  }
131
132
  private Set<String> getBeganAmbiguous() {
133
    return mBuilder.mBeganAmbiguous;
134
  }
135
136
  private Set<String> getEndedAmbiguous() {
137
    return mBuilder.mEndedAmbiguous;
138
  }
139
140
  /**
141
   * Words having a straight apostrophe that cannot be mistaken for an
142
   * opening single quote.
143
   */
144
  private static final Set<String> BEGAN_UNAMBIGUOUS = Set.of(
145
    "aporth",
146
    "boutcha",
147
    "boutchu",
148
    "cept",
149
    "dillo",
150
    "em",
151
    "fraid",
152
    "gainst",
153
    "n",
154
    "neath",
155
    "nother",
156
    "onna",
157
    "onna'",
158
    "pon",
159
    "s",
160
    "sblood",
161
    "scuse",
162
    "sfar",
163
    "sfoot",
164
    "t",
165
    "taint",
166
    "tain",
167
    "til",
168
    "tis",
169
    "tisn",
170
    "tshall",
171
    "twas",
172
    "twasn",
173
    "tween",
174
    "twere",
175
    "tweren",
176
    "twixt",
177
    "twon",
178
    "twou",
179
    "twould",
180
    "twouldn",
181
    "ve"
182
  );
183
184
  /**
185
   * Words having a straight apostrophe that may be either part of a
186
   * contraction or a word that stands alone beside an opening single quote.
187
   */
188
  private static final Set<String> BEGAN_AMBIGUOUS = Set.of(
189
    // about|boxing match
190
    "bout",
191
    // because|causal
192
    "cause",
193
    // what you|choo choo train
194
    "choo",
195
    // he|e pluribus unum
196
    "e",
197
    // here|earlier
198
    "ere",
199
    // afro|to and fro
200
    "fro",
201
    // whore|ho ho!
202
    "ho",
203
    // okay|letter K
204
    "kay",
205
    // lo|lo and behold
206
    "lo",
207
    // are|regarding
208
    "re",
209
    // what's up|to sup
210
    "sup",
211
    // it will|twill fabric
212
    "twill",
213
    // them|utterance
214
    "um",
215
    // is that|Iranian village
216
    "zat"
217
  );
218
219
  private static final Set<String> ENDED_AMBIGUOUS = Set.of(
220
    // give|martial arts garment
221
    "gi",
222
    // in|I
223
    "i",
224
    // of|letter o
225
    "o"
226
  );
227
228
  private static final Set<String> ENDED_UNAMBIGUOUS = Set.of(
229
    // and
230
    "an",
231
    // for/before
232
    "fo",
233
    // friend
234
    "frien",
235
    // just
236
    "jus",
237
    // lord
238
    "lor",
239
    // myself
240
    "masel",
241
    // old
242
    "ol",
243
    // San (Francisco)
244
    "Sa",
245
    // shift
246
    "shif",
247
    // the
248
    "th",
249
    // what
250
    "wha",
251
    // world
252
    "worl",
253
    // Top ~500 common -ing words as English contractions.
254
    "acceptin",
255
    "accompanyin",
256
    "accordin",
257
    "accountin",
258
    "achievin",
259
    "acquirin",
260
    "actin",
261
    "addin",
262
    "addressin",
263
    "adjoinin",
264
    "adoptin",
265
    "advancin",
266
    "advertisin",
267
    "affectin",
268
    "agin",
269
    "allowin",
270
    "amazin",
271
    "analyzin",
272
    "answerin",
273
    "anythin",
274
    "appearin",
275
    "applyin",
276
    "approachin",
277
    "arguin",
278
    "arisin",
279
    "arrivin",
280
    "askin",
281
    "assessin",
282
    "assumin",
283
    "attackin",
284
    "attemptin",
285
    "attendin",
286
    "avoidin",
287
    "bankin",
288
    "bargainin",
289
    "bearin",
290
    "beatin",
291
    "becomin",
292
    "beginnin",
293
    "bein",
294
    "believin",
295
    "belongin",
296
    "bendin",
297
    "bindin",
298
    "bleedin",
299
    "blessin",
300
    "blowin",
301
    "boilin",
302
    "borrowin",
303
    "breakin",
304
    "breathin",
305
    "breedin",
306
    "bringin",
307
    "broadcastin",
308
    "buildin",
309
    "burnin",
310
    "buyin",
311
    "calculatin",
312
    "callin",
313
    "carryin",
314
    "castin",
315
    "causin",
316
    "ceilin",
317
    "challengin",
318
    "changin",
319
    "checkin",
320
    "choosin",
321
    "claimin",
322
    "cleanin",
323
    "clearin",
324
    "climbin",
325
    "closin",
326
    "clothin",
327
    "collectin",
328
    "combinin",
329
    "comin",
330
    "commandin",
331
    "comparin",
332
    "compellin",
333
    "competin",
334
    "computin",
335
    "concernin",
336
    "concludin",
337
    "conditionin",
338
    "conductin",
339
    "conflictin",
340
    "connectin",
341
    "considerin",
342
    "consistin",
343
    "constructin",
344
    "consultin",
345
    "consumin",
346
    "containin",
347
    "continuin",
348
    "contractin",
349
    "contributin",
350
    "controllin",
351
    "convincin",
352
    "cookin",
353
    "coolin",
354
    "copin",
355
    "correspondin",
356
    "counselin",
357
    "countin",
358
    "couplin",
359
    "coverin",
360
    "creatin",
361
    "crossin",
362
    "cryin",
363
    "cuttin",
364
    "dancin",
365
    "darlin",
366
    "datin",
367
    "dealin",
368
    "decidin",
369
    "declarin",
370
    "declinin",
371
    "decreasin",
372
    "definin",
373
    "demandin",
374
    "denyin",
375
    "dependin",
376
    "descendin",
377
    "describin",
378
    "designin",
379
    "destroyin",
380
    "determinin",
381
    "developin",
382
    "differin",
383
    "dinin",
384
    "directin",
385
    "discussin",
386
    "distinguishin",
387
    "disturbin",
388
    "dividin",
389
    "doin",
390
    "drawin",
391
    "dressin",
392
    "drinkin",
393
    "drivin",
394
    "droppin",
395
    "dryin",
396
    "durin",
397
    "dwellin",
398
    "dyin",
399
    "eatin",
400
    "editin",
401
    "emergin",
402
    "employin",
403
    "enablin",
404
    "encouragin",
405
    "endin",
406
    "engagin",
407
    "engineerin",
408
    "enjoyin",
409
    "enterin",
410
    "establishin",
411
    "evaluatin",
412
    "evenin",
413
    "everythin",
414
    "examinin",
415
    "exceedin",
416
    "excitin",
417
    "excludin",
418
    "existin",
419
    "expandin",
420
    "expectin",
421
    "experiencin",
422
    "explainin",
423
    "explorin",
424
    "expressin",
425
    "extendin",
426
    "facin",
427
    "failin",
428
    "fallin",
429
    "farmin",
430
    "fascinatin",
431
    "feedin",
432
    "feelin",
433
    "fightin",
434
    "filin",
435
    "fillin",
436
    "financin",
437
    "findin",
438
    "firin",
439
    "fishin",
440
    "fittin",
441
    "fixin",
442
    "floatin",
443
    "flowin",
444
    "flyin",
445
    "focusin",
446
    "followin",
447
    "forcin",
448
    "foregoin",
449
    "formin",
450
    "forthcomin",
451
    "foundin",
452
    "freezin",
453
    "fuckin",
454
    "functionin",
455
    "fundin",
456
    "gainin",
457
    "gatherin",
458
    "generatin",
459
    "gettin",
460
    "givin",
461
    "goin",
462
    "governin",
463
    "grantin",
464
    "growin",
465
    "hackin",
466
    "handlin",
467
    "hangin",
468
    "happenin",
469
    "havin",
470
    "headin",
471
    "healin",
472
    "hearin",
473
    "heatin",
474
    "helpin",
475
    "hidin",
476
    "holdin",
477
    "hopin",
478
    "housin",
479
    "huntin",
480
    "identifyin",
481
    "imagin",
482
    "implementin",
483
    "imposin",
484
    "improvin",
485
    "includin",
486
    "increasin",
487
    "indicatin",
488
    "interestin",
489
    "interpretin",
490
    "introducin",
491
    "involvin",
492
    "joinin",
493
    "judgin",
494
    "keepin",
495
    "killin",
496
    "knowin",
497
    "lackin",
498
    "landin",
499
    "lastin",
500
    "laughin",
501
    "layin",
502
    "leadin",
503
    "leanin",
504
    "learnin",
505
    "leavin",
506
    "lettin",
507
    "liftin",
508
    "lightin",
509
    "lightnin",
510
    "limitin",
511
    "listenin",
512
    "listin",
513
    "livin",
514
    "loadin",
515
    "lookin",
516
    "losin",
517
    "lovin",
518
    "lowerin",
519
    "lyin",
520
    "maintainin",
521
    "makin",
522
    "managin",
523
    "manufacturin",
524
    "mappin",
525
    "marketin",
526
    "markin",
527
    "matchin",
528
    "meanin",
529
    "measurin",
530
    "meetin",
531
    "meltin",
532
    "minin",
533
    "misleadin",
534
    "missin",
535
    "mixin",
536
    "modelin",
537
    "monitorin",
538
    "mornin",
539
    "movin",
540
    "neighborin",
4
import java.util.ArrayList;
5
import java.util.HashSet;
6
import java.util.Set;
7
8
import static java.lang.String.format;
9
import static java.util.Collections.emptySet;
10
import static java.util.Collections.sort;
11
12
/**
13
 * Placeholder for various types of contractions.
14
 */
15
public class Contractions {
16
17
  private final Builder mBuilder;
18
19
  /**
   * Creates an instance that answers queries using the word sets held by
   * the given {@link Builder}.
   *
   * @param builder The source of contraction word sets, must not be
   *                {@code null}.
   */
  private Contractions( final Builder builder ) {
    assert builder != null;

    mBuilder = builder;
  }
23
24
  /**
25
   * Allows constructing a list of custom contractions.
26
   */
27
  @SuppressWarnings( "unused" )
28
  public static class Builder {
29
    private final Set<String> mBeganUnambiguous = new HashSet<>();
30
    private final Set<String> mEndedUnambiguous = new HashSet<>();
31
    private final Set<String> mBeganAmbiguous = new HashSet<>();
32
    private final Set<String> mEndedAmbiguous = new HashSet<>();
33
34
    public void withBeganUnambiguous( final Set<String> words ) {
35
      mBeganUnambiguous.addAll( words );
36
    }
37
38
    public void withEndedUnambiguous( final Set<String> words ) {
39
      mEndedUnambiguous.addAll( words );
40
    }
41
42
    public void withBeganAmbiguous( final Set<String> words ) {
43
      mBeganAmbiguous.addAll( words );
44
    }
45
46
    public void withEndedAmbiguous( final Set<String> words ) {
47
      mEndedAmbiguous.addAll( words );
48
    }
49
50
    /**
51
     * Constructs a new set of {@link Contractions} that can be configured
52
     * using this {@link Builder} instance.
53
     *
54
     * @return {@link Contractions} suitable for use with parsing text.
55
     */
56
    public Contractions build() {
57
      mBeganUnambiguous.addAll( from( mBeganUnambiguous, BEGAN_UNAMBIGUOUS ) );
58
      mEndedUnambiguous.addAll( from( mEndedUnambiguous, ENDED_UNAMBIGUOUS ) );
59
      mBeganAmbiguous.addAll( from( mBeganAmbiguous, BEGAN_AMBIGUOUS ) );
60
      mEndedAmbiguous.addAll( from( mEndedAmbiguous, ENDED_AMBIGUOUS ) );
61
62
      return new Contractions( this );
63
    }
64
65
    /**
66
     * This returns the {@code fallback} {@link Set} if {@code src} is empty;
67
     * otherwise, this returns the empty {@link Set}.
68
     *
69
     * @param src      A set of contractions, possibly empty.
70
     * @param fallback The default values to use if {@code src} is empty.
71
     * @param <T>      The type of data used by both {@link Set}s.
72
     * @return An empty {@link Set} if the {@code src} contains at least one
73
     * element; otherwise, this will return {@code fallback}.
74
     */
75
    private static <T> Set<T> from( final Set<T> src, final Set<T> fallback ) {
76
      assert src != null;
77
      assert fallback != null;
78
      return src.isEmpty() ? fallback : emptySet();
79
    }
80
  }
81
82
  /**
83
   * Answers whether the given word is a contraction that always starts
84
   * with an apostrophe. The comparison is case insensitive. This must
85
   * only be called when a straight quote is followed by a word.
86
   *
87
   * @param word The word to compare against the list of known unambiguous
88
   *             contractions.
89
   * @return {@code true} when the given word is in the set of unambiguous
90
   * contractions.
91
   */
92
  public boolean beganUnambiguously( final String word ) {
93
    assert word != null;
94
    return getBeganUnambiguous().contains( word.toLowerCase() );
95
  }
96
97
  /**
98
   * Answers whether the given word could be a contraction but is also a
99
   * valid word in non-contracted form.
100
   *
101
   * @param word The word to compare against the list of known ambiguous
102
   *             contractions.
103
   * @return {@code true} when the given word is in the set of ambiguous
104
   * contractions.
105
   */
106
  public boolean beganAmbiguously( final String word ) {
107
    assert word != null;
108
    return getBeganAmbiguous().contains( word.toLowerCase() );
109
  }
110
111
  public boolean endedUnambiguously( final String word ) {
112
    assert word != null;
113
    return getEndedUnambiguous().contains( word.toLowerCase() );
114
  }
115
116
  public boolean endedAmbiguously( final String word ) {
117
    assert word != null;
118
    final var check = word.toLowerCase();
119
120
    // Ensure that 'n' isn't matched for ambiguity by enforcing length, yet
121
    // allow o' to match because 'a sentence can end with the letter o'.
122
    return getEndedAmbiguous().contains( check ) ||
123
      check.endsWith( "s" ) || check.endsWith( "z" ) ||
124
      check.endsWith( "x" ) || (check.length() > 1 && check.endsWith( "n" ));
125
  }
126
127
  private Set<String> getBeganUnambiguous() {
128
    return mBuilder.mBeganUnambiguous;
129
  }
130
131
  private Set<String> getEndedUnambiguous() {
132
    return mBuilder.mEndedUnambiguous;
133
  }
134
135
  private Set<String> getBeganAmbiguous() {
136
    return mBuilder.mBeganAmbiguous;
137
  }
138
139
  private Set<String> getEndedAmbiguous() {
140
    return mBuilder.mEndedAmbiguous;
141
  }
142
143
144
  @Override
145
  public String toString() {
146
    return
147
      toString( getBeganAmbiguous(), "Ambiguous Began", "'%s" ) +
148
        toString( getEndedAmbiguous(), "Ambiguous Ended", "%s'" ) +
149
        toString( getBeganUnambiguous(), "Unambiguous Began", "'%s" ) +
150
        toString( getEndedUnambiguous(), "Unambiguous Ended", "%s'" );
151
  }
152
153
  private String toString(
154
    final Set<String> words, final String category, final String fmt ) {
155
    final var sb = new StringBuilder( 16384 );
156
    final var newline = System.lineSeparator();
157
    final var list = new ArrayList<>( words );
158
159
    sort( list );
160
    sb.append( format( "%n%s%n", category ) );
161
    list.forEach( ( s ) -> sb.append( format( fmt, s ) ).append( newline ) );
162
163
    return sb.toString();
164
  }
165
166
  /**
167
   * Words having a straight apostrophe that cannot be mistaken for an
168
   * opening single quote.
169
   */
170
  private static final Set<String> BEGAN_UNAMBIGUOUS = Set.of(
171
    "aporth",
172
    "boutcha",
173
    "boutchu",
174
    "cept",
175
    "dillo",
176
    "em",
177
    "fraid",
178
    "gainst",
179
    "n",
180
    "neath",
181
    "nother",
182
    "onna",
183
    "onna'",
184
    "pon",
185
    "s",
186
    "sblood",
187
    "scuse",
188
    "sfar",
189
    "sfoot",
190
    "t",
191
    "taint",
192
    "tain",
193
    "til",
194
    "tis",
195
    "tisn",
196
    "tshall",
197
    "twas",
198
    "twasn",
199
    "tween",
200
    "twere",
201
    "tweren",
202
    "twixt",
203
    "twon",
204
    "twou",
205
    "twould",
206
    "twouldn",
207
    "ve"
208
  );
209
210
  /**
211
   * Words having a straight apostrophe that may be either part of a
212
   * contraction or a word that stands alone beside an opening single quote.
213
   */
214
  private static final Set<String> BEGAN_AMBIGUOUS = Set.of(
215
    // about|boxing match
216
    "bout",
217
    // because|causal
218
    "cause",
219
    // what you|choo choo train
220
    "choo",
221
    // he|e pluribus unum
222
    "e",
223
    // here|earlier
224
    "ere",
225
    // afro|to and fro
226
    "fro",
227
    // whore|ho ho!
228
    "ho",
229
    // okay|letter K
230
    "kay",
231
    // lo|lo and behold
232
    "lo",
233
    // are|regarding
234
    "re",
235
    // what's up|to sup
236
    "sup",
237
    // it will|twill fabric
238
    "twill",
239
    // them|utterance
240
    "um",
241
    // is that|Iranian village
242
    "zat"
243
  );
244
245
  private static final Set<String> ENDED_AMBIGUOUS = Set.of(
246
    // give|martial arts garment
247
    "gi",
248
    // in|I
249
    "i",
250
    // of|letter o
251
    "o"
252
  );
253
254
  private static final Set<String> ENDED_UNAMBIGUOUS = Set.of(
255
    // and
256
    "an",
257
    // for/before
258
    "fo",
259
    // friend
260
    "frien",
261
    // just
262
    "jus",
263
    // lord
264
    "lor",
265
    // myself
266
    "masel",
267
    // old
268
    "ol",
269
    // San (Francisco)
270
    "Sa",
271
    // shift
272
    "shif",
273
    // the
274
    "th",
275
    // what
276
    "wha",
277
    // world
278
    "worl",
279
    // Top ~500 common -ing words as English contractions.
280
    "acceptin",
281
    "accompanyin",
282
    "accordin",
283
    "accountin",
284
    "achievin",
285
    "acquirin",
286
    "actin",
287
    "addin",
288
    "addressin",
289
    "adjoinin",
290
    "adoptin",
291
    "advancin",
292
    "advertisin",
293
    "affectin",
294
    "agin",
295
    "allowin",
296
    "amazin",
297
    "analyzin",
298
    "answerin",
299
    "anythin",
300
    "appearin",
301
    "applyin",
302
    "approachin",
303
    "arguin",
304
    "arisin",
305
    "arrivin",
306
    "askin",
307
    "assessin",
308
    "assumin",
309
    "attackin",
310
    "attemptin",
311
    "attendin",
312
    "avoidin",
313
    "bankin",
314
    "bargainin",
315
    "bearin",
316
    "beatin",
317
    "becomin",
318
    "beginnin",
319
    "bein",
320
    "believin",
321
    "belongin",
322
    "bendin",
323
    "bindin",
324
    "bleedin",
325
    "blessin",
326
    "blowin",
327
    "boilin",
328
    "borrowin",
329
    "breakin",
330
    "breathin",
331
    "breedin",
332
    "bringin",
333
    "broadcastin",
334
    "buildin",
335
    "burnin",
336
    "buyin",
337
    "calculatin",
338
    "callin",
339
    "carryin",
340
    "castin",
341
    "causin",
342
    "ceilin",
343
    "challengin",
344
    "changin",
345
    "checkin",
346
    "choosin",
347
    "claimin",
348
    "cleanin",
349
    "clearin",
350
    "climbin",
351
    "closin",
352
    "clothin",
353
    "collectin",
354
    "combinin",
355
    "comin",
356
    "commandin",
357
    "comparin",
358
    "compellin",
359
    "competin",
360
    "computin",
361
    "concernin",
362
    "concludin",
363
    "conditionin",
364
    "conductin",
365
    "conflictin",
366
    "connectin",
367
    "considerin",
368
    "consistin",
369
    "constructin",
370
    "consultin",
371
    "consumin",
372
    "containin",
373
    "continuin",
374
    "contractin",
375
    "contributin",
376
    "controllin",
377
    "convincin",
378
    "cookin",
379
    "coolin",
380
    "copin",
381
    "correspondin",
382
    "counselin",
383
    "countin",
384
    "couplin",
385
    "coverin",
386
    "creatin",
387
    "crossin",
388
    "cryin",
389
    "cuttin",
390
    "dancin",
391
    "darlin",
392
    "datin",
393
    "dealin",
394
    "decidin",
395
    "declarin",
396
    "declinin",
397
    "decreasin",
398
    "definin",
399
    "demandin",
400
    "denyin",
401
    "dependin",
402
    "descendin",
403
    "describin",
404
    "designin",
405
    "destroyin",
406
    "determinin",
407
    "developin",
408
    "differin",
409
    "dinin",
410
    "directin",
411
    "discussin",
412
    "distinguishin",
413
    "disturbin",
414
    "dividin",
415
    "doin",
416
    "drawin",
417
    "dressin",
418
    "drinkin",
419
    "drivin",
420
    "droppin",
421
    "dryin",
422
    "durin",
423
    "dwellin",
424
    "dyin",
425
    "eatin",
426
    "editin",
427
    "emergin",
428
    "employin",
429
    "enablin",
430
    "encouragin",
431
    "endin",
432
    "engagin",
433
    "engineerin",
434
    "enjoyin",
435
    "enterin",
436
    "establishin",
437
    "evaluatin",
438
    "evenin",
439
    "everythin",
440
    "examinin",
441
    "exceedin",
442
    "excitin",
443
    "excludin",
444
    "existin",
445
    "expandin",
446
    "expectin",
447
    "experiencin",
448
    "explainin",
449
    "explorin",
450
    "expressin",
451
    "extendin",
452
    "facin",
453
    "failin",
454
    "fallin",
455
    "farmin",
456
    "fascinatin",
457
    "feedin",
458
    "feelin",
459
    "fightin",
460
    "filin",
461
    "fillin",
462
    "financin",
463
    "findin",
464
    "firin",
465
    "fishin",
466
    "fittin",
467
    "fixin",
468
    "floatin",
469
    "flowin",
470
    "flyin",
471
    "focusin",
472
    "followin",
473
    "forcin",
474
    "foregoin",
475
    "formin",
476
    "forthcomin",
477
    "foundin",
478
    "freezin",
479
    "fuckin",
480
    "functionin",
481
    "fundin",
482
    "gainin",
483
    "gatherin",
484
    "generatin",
485
    "gettin",
486
    "givin",
487
    "goin",
488
    "governin",
489
    "grantin",
490
    "growin",
491
    "hackin",
492
    "handlin",
493
    "hangin",
494
    "happenin",
495
    "havin",
496
    "headin",
497
    "healin",
498
    "hearin",
499
    "heatin",
500
    "helpin",
501
    "hidin",
502
    "holdin",
503
    "hopin",
504
    "housin",
505
    "huntin",
506
    "identifyin",
507
    "imagin",
508
    "implementin",
509
    "imposin",
510
    "improvin",
511
    "includin",
512
    "increasin",
513
    "indicatin",
514
    "interestin",
515
    "interpretin",
516
    "introducin",
517
    "involvin",
518
    "joinin",
519
    "judgin",
520
    "keepin",
521
    "killin",
522
    "knowin",
523
    "lackin",
524
    "landin",
525
    "lastin",
526
    "laughin",
527
    "layin",
528
    "leadin",
529
    "leanin",
530
    "learnin",
531
    "leavin",
532
    "lettin",
533
    "liftin",
534
    "lightin",
535
    "lightnin",
536
    "limitin",
537
    "listenin",
538
    "listin",
539
    "livin",
540
    "loadin",
541
    "lookin",
542
    "losin",
543
    "lovin",
544
    "lowerin",
545
    "lyin",
546
    "maintainin",
547
    "makin",
548
    "managin",
549
    "manufacturin",
550
    "mappin",
551
    "marketin",
552
    "markin",
553
    "matchin",
554
    "meanin",
555
    "measurin",
556
    "meetin",
557
    "meltin",
558
    "minin",
559
    "misleadin",
560
    "missin",
561
    "mixin",
562
    "modelin",
563
    "monitorin",
564
    "mornin",
565
    "movin",
566
    "neighborin",
567
    "neighbourin",
541 568
    "nothin",
542 569
    "notin",
A src/main/java/com/whitemagicsoftware/keenquotes/Converter.java
1
package com.whitemagicsoftware.keenquotes;
2
3
import java.util.ArrayList;
4
import java.util.Map;
5
import java.util.function.Consumer;
6
7
import static com.whitemagicsoftware.keenquotes.TokenType.*;
8
import static java.util.Collections.sort;
9
10
/**
11
 * Responsible for converting curly quotes to HTML entities throughout a
12
 * text string.
13
 */
14
public class Converter {
15
  private static final Map<TokenType, String> REPLACEMENTS = Map.of(
16
    QUOTE_OPENING_SINGLE, "&lsquo;",
17
    QUOTE_CLOSING_SINGLE, "&rsquo;",
18
    QUOTE_OPENING_DOUBLE, "&ldquo;",
19
    QUOTE_CLOSING_DOUBLE, "&rdquo;",
20
    QUOTE_STRAIGHT_SINGLE, "'",
21
    QUOTE_STRAIGHT_DOUBLE, "\"",
22
    QUOTE_APOSTROPHE, "&apos;",
23
    QUOTE_PRIME_SINGLE, "&prime;",
24
    QUOTE_PRIME_DOUBLE, "&Prime;"
25
  );
26
27
  /**
28
   * Converts straight quotes to curly quotes and primes. Any quotation marks
29
   * that cannot be converted are passed to the {@link Consumer}.
30
   *
31
   * @param text       The text to parse.
32
   * @param unresolved Recipient for ambiguous {@link Lexeme}s.
33
   * @return The given text string with as many straight quotes converted to
34
   * curly quotes as is feasible.
35
   */
36
  public static String convert(
37
    final String text, final Consumer<Lexeme> unresolved ) {
38
    final var parser = new Parser( text );
39
    final var tokens = new ArrayList<Token>();
40
41
    // Parse the tokens and consume all unresolved lexemes.
42
    parser.parse( tokens::add, unresolved );
43
44
    // The parser may emit tokens in any order.
45
    sort( tokens );
46
47
    final var result = new StringBuilder( text.length() );
48
    var position = 0;
49
50
    for( final var token : tokens ) {
51
      if( position <= token.began() ) {
52
        result.append( text, position, token.began() );
53
        result.append( REPLACEMENTS.get( token.getType() ) );
54
      }
55
56
      position = token.ended();
57
    }
58
59
    return result.append( text.substring( position ) ).toString();
60
  }
61
}
1 62
M src/main/java/com/whitemagicsoftware/keenquotes/KeenQuotes.java
2 2
package com.whitemagicsoftware.keenquotes;
3 3
4
import java.util.ArrayList;
5
import java.util.Map;
6
import java.util.function.Consumer;
4
import picocli.CommandLine;
7 5
8
import static com.whitemagicsoftware.keenquotes.TokenType.*;
9
import static java.util.Collections.sort;
6
import java.io.BufferedReader;
7
import java.io.IOException;
8
import java.io.InputStream;
9
import java.io.InputStreamReader;
10
import java.util.Properties;
11
12
import static java.lang.String.format;
13
import static picocli.CommandLine.Help.Ansi.Style.*;
14
import static picocli.CommandLine.Help.ColorScheme;
10 15
11 16
/**
12 17
 * Responsible for replacing {@link Token} instances with equivalent smart
13 18
 * quotes (or straight quotes). This will inform the caller when ambiguous
14 19
 * quotes cannot be reliably resolved.
15 20
 */
16 21
public final class KeenQuotes {
17
  private static final Map<TokenType, String> REPLACEMENTS = Map.of(
18
    QUOTE_OPENING_SINGLE, "&lsquo;",
19
    QUOTE_CLOSING_SINGLE, "&rsquo;",
20
    QUOTE_OPENING_DOUBLE, "&ldquo;",
21
    QUOTE_CLOSING_DOUBLE, "&rdquo;",
22
    QUOTE_STRAIGHT_SINGLE, "'",
23
    QUOTE_STRAIGHT_DOUBLE, "\"",
24
    QUOTE_APOSTROPHE, "&apos;",
25
    QUOTE_PRIME_SINGLE, "&prime;",
26
    QUOTE_PRIME_DOUBLE, "&Prime;"
27
  );
22
  private final Settings mSettings = new Settings( this );
23
24
  private static ColorScheme createColourScheme() {
25
    return new ColorScheme.Builder()
26
      .commands( bold )
27
      .options( fg_blue, bold )
28
      .parameters( fg_blue )
29
      .optionParams( italic )
30
      .errors( fg_red, bold )
31
      .stackTraces( italic )
32
      .build();
33
  }
34
35
  public void run() {
36
    if( getSettings().displayList() ) {
37
      displayList();
38
    }
39
    else {
40
      convert();
41
    }
42
  }
43
44
  private void displayList() {
45
    System.out.println( new Contractions.Builder().build().toString() );
46
  }
47
48
  private void convert() {
49
    final StringBuilder sb = new StringBuilder();
50
51
    try( final BufferedReader reader = open( System.in ) ) {
52
      String line;
53
      final var sep = System.lineSeparator();
54
55
      while( (line = reader.readLine()) != null ) {
56
        sb.append( line );
57
        sb.append( sep );
58
      }
59
60
      System.out.println(
61
        Converter.convert( sb.toString(), System.err::println )
62
      );
63
    } catch( final Exception ex ) {
64
      ex.printStackTrace( System.err );
65
    }
66
  }
67
68
  private Settings getSettings() {
69
    return mSettings;
70
  }
28 71
29 72
  /**
30
   * Converts straight quotes to curly quotes and primes. Any quotation marks
31
   * that cannot be converted are passed to the {@link Consumer}.
73
   * Returns the application version number retrieved from the application
74
   * properties file. The properties file is generated at build time, which
75
   * keys off the repository.
32 76
   *
33
   * @param text       The text to parse.
34
   * @param unresolved Recipient for ambiguous {@link Lexeme}s.
35
   * @return The given text string with as many straight quotes converted to
36
   * curly quotes as is feasible.
77
   * @return The application version number.
78
   * @throws RuntimeException An {@link IOException} occurred.
37 79
   */
38
  public static String convert(
39
    final String text, final Consumer<Lexeme> unresolved ) {
40
    final var parser = new Parser( text );
41
    final var tokens = new ArrayList<Token>();
80
  private static String getVersion() {
81
    try {
82
      final var properties = loadProperties( "app.properties" );
83
      return properties.getProperty( "application.version" );
84
    } catch( final Exception ex ) {
85
      throw new RuntimeException( ex );
86
    }
87
  }
42 88
43
    // Parse the tokens and consume all unresolved lexemes.
44
    parser.parse( tokens::add, unresolved );
89
  @SuppressWarnings( "SameParameterValue" )
90
  private static Properties loadProperties( final String resource )
91
    throws IOException {
92
    final var properties = new Properties();
93
    properties.load( getResourceAsStream( getResourceName( resource ) ) );
94
    return properties;
95
  }
45 96
46
    // The parser may emit tokens in any order.
47
    sort( tokens );
97
  private static String getResourceName( final String resource ) {
98
    return format( "%s/%s", getPackagePath(), resource );
99
  }
48 100
49
    final var result = new StringBuilder( text.length() );
50
    var position = 0;
101
  private static String getPackagePath() {
102
    return KeenQuotes.class.getPackageName().replace( '.', '/' );
103
  }
51 104
52
    for( final var token : tokens ) {
53
      if( position <= token.began() ) {
54
        result.append( text, position, token.began() );
55
        result.append( REPLACEMENTS.get( token.getType() ) );
56
      }
105
  private static InputStream getResourceAsStream( final String resource ) {
106
    return KeenQuotes.class.getClassLoader().getResourceAsStream( resource );
107
  }
57 108
58
      position = token.ended();
59
    }
109
  @SuppressWarnings( "SameParameterValue" )
110
  private static BufferedReader open( final InputStream in ) {
111
    return new BufferedReader( new InputStreamReader( in ) );
112
  }
60 113
61
    return result.append( text.substring( position ) ).toString();
114
  public static void main( final String[] args ) {
115
    final var app = new KeenQuotes();
116
    final var parser = new CommandLine( app.getSettings() );
117
    parser.setColorScheme( createColourScheme() );
118
119
    final var exitCode = parser.execute( args );
120
    final var parseResult = parser.getParseResult();
121
122
    if( parseResult.isUsageHelpRequested() ) {
123
      System.exit( exitCode );
124
    }
125
    else if( parseResult.isVersionHelpRequested() ) {
126
      System.out.println( getVersion() );
127
      System.exit( exitCode );
128
    }
62 129
  }
63 130
}
A src/main/java/com/whitemagicsoftware/keenquotes/Settings.java
1
/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
2
package com.whitemagicsoftware.keenquotes;
3
4
import picocli.CommandLine;
5
6
import java.util.concurrent.Callable;
7
8
@CommandLine.Command(
9
  name = "KeenQuotes",
10
  mixinStandardHelpOptions = true,
11
  description = "Converts straight quotes to curly quotes."
12
)
13
@SuppressWarnings( {"FieldMayBeFinal", "CanBeFinal"} )
14
public final class Settings implements Callable<Integer> {
15
  /**
16
   * Main executable class.
17
   */
18
  private final KeenQuotes mMain;
19
20
//  /**
21
//   * List of unambiguous contractions having leading apostrophes.
22
//   */
23
//  @CommandLine.Option(
24
//    names = {"-ub", "--unamb-began"},
25
//    description =
26
//      "Contractions to treat as unambiguous (e.g., cause,bout)",
27
//    paramLabel = "words"
28
//  )
29
//  private String[] mUnambiguousBegan;
30
//
31
//  /**
32
//   * List of unambiguous contractions having lagging apostrophes.
33
//   */
34
//  @CommandLine.Option(
35
//    names = {"-ue", "--unamb-ended"},
36
//    description =
37
//      "Contractions to treat as unambiguous (e.g., frien,thinkin)",
38
//    paramLabel = "words"
39
//  )
40
//  private String[] mUnambiguousEnded;
41
//
42
//  /**
43
//   * List of ambiguous contractions having leading apostrophes.
44
//   */
45
//  @CommandLine.Option(
46
//    names = {"-ab", "--amb-began"},
47
//    description =
48
//      "Contractions to treat as ambiguous (e.g., sup,kay)",
49
//    paramLabel = "words"
50
//  )
51
//  private String[] mAmbiguousBegan;
52
//
53
//  /**
54
//   * List of ambiguous contractions having lagging apostrophes.
55
//   */
56
//  @CommandLine.Option(
57
//    names = {"-ae", "--amb-ended"},
58
//    description =
59
//      "Contractions to treat as ambiguous (e.g., gi,o)",
60
//    paramLabel = "words"
61
//  )
62
//  private String[] mAmbiguousEnded;
63
//
64
  /**
65
   * Display default values.
66
   */
67
  @CommandLine.Option(
68
    names = {"-l", "--list"},
69
    description = "List all ambiguous and unambiguous contractions"
70
  )
71
  private boolean mDisplayList;
72
73
  public Settings( final KeenQuotes main ) {
74
    assert main != null;
75
    mMain = main;
76
  }
77
78
  /**
79
   * Answers whether the contractions listings should be displayed.
80
   *
81
   * @return {@code true} to list the contractions.
82
   */
83
  public boolean displayList() {
84
    return mDisplayList;
85
  }
86
87
  /**
88
   * Invoked after the command-line arguments are parsed to launch the
89
   * application.
90
   *
91
   * @return Exit level zero.
92
   */
93
  @Override
94
  public Integer call() {
95
    mMain.run();
96
    return 0;
97
  }
98
}
1 99
M src/test/java/com/whitemagicsoftware/keenquotes/KeenQuotesTest.java
2 2
package com.whitemagicsoftware.keenquotes;
3 3
4
import org.junit.jupiter.api.Disabled;
4 5
import org.junit.jupiter.api.Test;
5 6
import org.junit.jupiter.params.ParameterizedTest;
6 7
import org.junit.jupiter.params.provider.ValueSource;
7 8
8 9
import java.io.BufferedReader;
9 10
import java.io.IOException;
10 11
import java.io.InputStreamReader;
11 12
import java.util.function.Function;
12 13
14
import static com.whitemagicsoftware.keenquotes.Converter.convert;
13 15
import static java.lang.System.out;
14 16
import static org.junit.jupiter.api.Assertions.assertEquals;
...
24 26
   */
25 27
  @Test
26
  //@Disabled
28
  @Disabled
27 29
  public void test_parse_SingleLine_Parsed() {
28
    out.println( KeenQuotes.convert(
30
    out.println( convert(
29 31
      "\"’Kearney lives on the banks of Killarney—’",
30 32
      out::println
...
39 41
  @Test
40 42
  public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException {
41
    testConverter( text -> KeenQuotes.convert( text, ( lexeme ) -> {} ) );
43
    testConverter( text -> convert( text, ( lexeme ) -> {} ) );
42 44
  }
43 45
44 46
  @ParameterizedTest
45
  @ValueSource( strings = {"westrup"} )
47
  @ValueSource( strings = {"habberton"} )
46 48
  void test_Parse_Story_Converted( final String filename ) throws IOException {
47 49
    final var sb = new StringBuilder( 2 ^ 20 );
...
55 57
    }
56 58
57
    System.out.println( KeenQuotes.convert( sb.toString(), out::println ) );
59
    System.out.println( convert( sb.toString(), out::println ) );
58 60
  }
59 61