| 1 | # Ignore Gradle project-specific cache directory |
|
| 2 | 1 | .gradle |
| 3 | ||
| 4 | # Ignore Gradle build output directory |
|
| 5 | lib/build |
|
| 6 | ||
| 7 | 2 | .idea |
| 8 | 3 | gradle |
| 9 | 4 | gradlew* |
| 5 | build |
|
| 6 | ||
| 10 | 7 |
| 3 | 3 | A Java library to convert straight quotes into curly quotes. |
| 4 | 4 | |
| 5 | # Requirements |
|
| 6 | ||
| 7 | Download and install OpenJDK 16 or greater. |
|
| 8 | ||
| 9 | # Download |
|
| 10 | ||
| 11 | Download the `.jar` file from this repository. |
|
| 12 | ||
| 13 | # Run |
|
| 14 | ||
| 15 | Run the software as follows: |
|
| 16 | ||
| 17 | java -jar keenquotes.jar < src.txt > dst.txt 2> err.txt |
|
| 18 | ||
| 19 | Where: |
|
| 20 | ||
| 21 | * `src.txt` -- Input document file that contains straight quotes. |
|
| 22 | * `dst.txt` -- Output document file that will contain curly quotes. |
|
| 23 | * `err.txt` -- Error file that will note ambiguous conversion errors. |
|
| 24 | ||
| 25 | For help, run the software as follows: |
|
| 26 | ||
| 27 | java -jar keenquotes.jar -h |
|
| 28 | ||
| 5 | 29 | # Software Design |
| 6 | 30 |
| 1 | 1 | plugins { |
| 2 | 2 | id 'application' |
| 3 | id 'com.palantir.git-version' version '0.12.3' |
|
| 3 | 4 | } |
| 4 | 5 | |
| 5 | 6 | group 'com.whitemagicsoftware' |
| 6 | version '1.0' |
|
| 7 | 7 | |
| 8 | 8 | repositories { |
| ... | ||
| 22 | 22 | main { |
| 23 | 23 | java { |
| 24 | srcDirs = ["src/main"] |
|
| 24 | srcDirs = ["src/main/java"] |
|
| 25 | 25 | } |
| 26 | 26 | } |
| ... | ||
| 35 | 35 | mainClassName = "com.whitemagicsoftware.${applicationName}.KeenQuotes" |
| 36 | 36 | } |
| 37 | ||
| 38 | version = gitVersion() |
|
| 39 | ||
| 40 | def resourceDir = sourceSets.main.resources.srcDirs[0] |
|
| 41 | final File propertiesFile = file("${resourceDir}/com/whitemagicsoftware/${applicationName}/app.properties") |
|
| 42 | propertiesFile.write("application.version=${version}") |
|
| 37 | 43 | |
| 38 | 44 | jar { |
| ... | ||
| 57 | 63 | useJUnitPlatform() |
| 58 | 64 | } |
| 59 | ||
| 60 | 65 | |
| 2 | 2 | package com.whitemagicsoftware.keenquotes; |
| 3 | 3 | |
| 4 | import java.util.HashSet; |
|
| 5 | import java.util.Set; |
|
| 6 | ||
| 7 | import static java.util.Collections.emptySet; |
|
| 8 | ||
| 9 | /** |
|
| 10 | * Placeholder for various types of contractions. |
|
| 11 | */ |
|
| 12 | public class Contractions { |
|
| 13 | ||
| 14 | private final Builder mBuilder; |
|
| 15 | ||
/**
 * Creates an instance that reads its word sets from the given builder.
 * The builder itself is retained; its sets are not copied.
 *
 * @param builder Holder of the four contraction sets; must not be
 *                {@code null}.
 */
private Contractions( final Builder builder ) {
  assert builder != null;
  this.mBuilder = builder;
}
|
| 20 | ||
| 21 | /** |
|
| 22 | * Allows constructing a list of custom contractions. |
|
| 23 | */ |
|
| 24 | @SuppressWarnings( "unused" ) |
|
| 25 | public static class Builder { |
|
| 26 | private final Set<String> mBeganUnambiguous = new HashSet<>(); |
|
| 27 | private final Set<String> mEndedUnambiguous = new HashSet<>(); |
|
| 28 | private final Set<String> mBeganAmbiguous = new HashSet<>(); |
|
| 29 | private final Set<String> mEndedAmbiguous = new HashSet<>(); |
|
| 30 | ||
| 31 | public void withBeganUnambiguous( final Set<String> words ) { |
|
| 32 | mBeganUnambiguous.addAll( words ); |
|
| 33 | } |
|
| 34 | ||
| 35 | public void withEndedUnambiguous( final Set<String> words ) { |
|
| 36 | mEndedUnambiguous.addAll( words ); |
|
| 37 | } |
|
| 38 | ||
| 39 | public void withBeganAmbiguous( final Set<String> words ) { |
|
| 40 | mBeganAmbiguous.addAll( words ); |
|
| 41 | } |
|
| 42 | ||
| 43 | public void withEndedAmbiguous( final Set<String> words ) { |
|
| 44 | mEndedAmbiguous.addAll( words ); |
|
| 45 | } |
|
| 46 | ||
| 47 | /** |
|
| 48 | * Constructs a new set of {@link Contractions} that can be configured |
|
| 49 | * using this {@link Builder} instance. |
|
| 50 | * |
|
| 51 | * @return {@link Contractions} suitable for use with parsing text. |
|
| 52 | */ |
|
| 53 | public Contractions build() { |
|
| 54 | mBeganUnambiguous.addAll( from( mBeganUnambiguous, BEGAN_UNAMBIGUOUS ) ); |
|
| 55 | mEndedUnambiguous.addAll( from( mEndedUnambiguous, ENDED_UNAMBIGUOUS ) ); |
|
| 56 | mBeganAmbiguous.addAll( from( mBeganAmbiguous, BEGAN_AMBIGUOUS ) ); |
|
| 57 | mEndedAmbiguous.addAll( from( mEndedAmbiguous, ENDED_AMBIGUOUS ) ); |
|
| 58 | ||
| 59 | return new Contractions( this ); |
|
| 60 | } |
|
| 61 | ||
| 62 | /** |
|
| 63 | * This returns the {@code fallback} {@link Set} if {@code src} is empty; |
|
| 64 | * otherwise, this returns the empty {@link Set}. |
|
| 65 | * |
|
| 66 | * @param src A set of contractions, possibly empty. |
|
| 67 | * @param fallback The default values to use if {@code src} is empty. |
|
| 68 | * @param <T> The type of data used by both {@link Set}s. |
|
| 69 | * @return An empty {@link Set} if the {@code src} contains at least one |
|
| 70 | * element; otherwise, this will return {@code fallback}. |
|
| 71 | */ |
|
| 72 | private static <T> Set<T> from( final Set<T> src, final Set<T> fallback ) { |
|
| 73 | assert src != null; |
|
| 74 | assert fallback != null; |
|
| 75 | return src.isEmpty() ? fallback : emptySet(); |
|
| 76 | } |
|
| 77 | } |
|
| 78 | ||
| 79 | /** |
|
| 80 | * Answers whether the given word is a contraction that always starts |
|
| 81 | * with an apostrophe. The comparison is case insensitive. This must |
|
| 82 | * only be called when a straight quote is followed by a word. |
|
| 83 | * |
|
| 84 | * @param word The word to compare against the list of known unambiguous |
|
| 85 | * contractions. |
|
| 86 | * @return {@code true} when the given word is in the set of unambiguous |
|
| 87 | * contractions. |
|
| 88 | */ |
|
| 89 | public boolean beganUnambiguously( final String word ) { |
|
| 90 | assert word != null; |
|
| 91 | return getBeganUnambiguous().contains( word.toLowerCase() ); |
|
| 92 | } |
|
| 93 | ||
| 94 | /** |
|
| 95 | * Answers whether the given word could be a contraction but is also a |
|
| 96 | * valid word in non-contracted form. |
|
| 97 | * |
|
| 98 | * @param word The word to compare against the list of known ambiguous |
|
| 99 | * contractions. |
|
| 100 | * @return {@code true} when the given word is in the set of ambiguous |
|
| 101 | * contractions. |
|
| 102 | */ |
|
| 103 | public boolean beganAmbiguously( final String word ) { |
|
| 104 | assert word != null; |
|
| 105 | return getBeganAmbiguous().contains( word.toLowerCase() ); |
|
| 106 | } |
|
| 107 | ||
| 108 | public boolean endedUnambiguously( final String word ) { |
|
| 109 | assert word != null; |
|
| 110 | return getEndedUnambiguous().contains( word.toLowerCase() ); |
|
| 111 | } |
|
| 112 | ||
| 113 | public boolean endedAmbiguously( final String word ) { |
|
| 114 | assert word != null; |
|
| 115 | final var check = word.toLowerCase(); |
|
| 116 | ||
| 117 | // Ensure that 'n' isn't matched for ambiguity by enforcing length, yet |
|
| 118 | // allow o' to match because 'a sentence can end with the letter o'. |
|
| 119 | return getEndedAmbiguous().contains( check ) || |
|
| 120 | check.endsWith( "s" ) || check.endsWith( "z" ) || |
|
| 121 | check.endsWith( "x" ) || (check.length() > 1 && check.endsWith( "n" )); |
|
| 122 | } |
|
| 123 | ||
| 124 | private Set<String> getBeganUnambiguous() { |
|
| 125 | return mBuilder.mBeganUnambiguous; |
|
| 126 | } |
|
| 127 | ||
| 128 | private Set<String> getEndedUnambiguous() { |
|
| 129 | return mBuilder.mEndedUnambiguous; |
|
| 130 | } |
|
| 131 | ||
| 132 | private Set<String> getBeganAmbiguous() { |
|
| 133 | return mBuilder.mBeganAmbiguous; |
|
| 134 | } |
|
| 135 | ||
| 136 | private Set<String> getEndedAmbiguous() { |
|
| 137 | return mBuilder.mEndedAmbiguous; |
|
| 138 | } |
|
| 139 | ||
| 140 | /** |
|
| 141 | * Words having a straight apostrophe that cannot be mistaken for an |
|
| 142 | * opening single quote. |
|
| 143 | */ |
|
| 144 | private static final Set<String> BEGAN_UNAMBIGUOUS = Set.of( |
|
| 145 | "aporth", |
|
| 146 | "boutcha", |
|
| 147 | "boutchu", |
|
| 148 | "cept", |
|
| 149 | "dillo", |
|
| 150 | "em", |
|
| 151 | "fraid", |
|
| 152 | "gainst", |
|
| 153 | "n", |
|
| 154 | "neath", |
|
| 155 | "nother", |
|
| 156 | "onna", |
|
| 157 | "onna'", |
|
| 158 | "pon", |
|
| 159 | "s", |
|
| 160 | "sblood", |
|
| 161 | "scuse", |
|
| 162 | "sfar", |
|
| 163 | "sfoot", |
|
| 164 | "t", |
|
| 165 | "taint", |
|
| 166 | "tain", |
|
| 167 | "til", |
|
| 168 | "tis", |
|
| 169 | "tisn", |
|
| 170 | "tshall", |
|
| 171 | "twas", |
|
| 172 | "twasn", |
|
| 173 | "tween", |
|
| 174 | "twere", |
|
| 175 | "tweren", |
|
| 176 | "twixt", |
|
| 177 | "twon", |
|
| 178 | "twou", |
|
| 179 | "twould", |
|
| 180 | "twouldn", |
|
| 181 | "ve" |
|
| 182 | ); |
|
| 183 | ||
| 184 | /** |
|
| 185 | * Words having a straight apostrophe that may be either part of a |
|
| 186 | * contraction or a word that stands alone beside an opening single quote. |
|
| 187 | */ |
|
| 188 | private static final Set<String> BEGAN_AMBIGUOUS = Set.of( |
|
| 189 | // about|boxing match |
|
| 190 | "bout", |
|
| 191 | // because|causal |
|
| 192 | "cause", |
|
| 193 | // what you|choo choo train |
|
| 194 | "choo", |
|
| 195 | // he|e pluribus unum |
|
| 196 | "e", |
|
| 197 | // here|earlier |
|
| 198 | "ere", |
|
| 199 | // afro|to and fro |
|
| 200 | "fro", |
|
| 201 | // whore|ho ho! |
|
| 202 | "ho", |
|
| 203 | // okay|letter K |
|
| 204 | "kay", |
|
| 205 | // hello|lo and behold |
|
| 206 | "lo", |
|
| 207 | // are|regarding |
|
| 208 | "re", |
|
| 209 | // what's up|to sup |
|
| 210 | "sup", |
|
| 211 | // it will|twill fabric |
|
| 212 | "twill", |
|
| 213 | // them|utterance |
|
| 214 | "um", |
|
| 215 | // is that|Iranian village |
|
| 216 | "zat" |
|
| 217 | ); |
|
| 218 | ||
| 219 | private static final Set<String> ENDED_AMBIGUOUS = Set.of( |
|
| 220 | // give|martial arts garment |
|
| 221 | "gi", |
|
| 222 | // in|I |
|
| 223 | "i", |
|
| 224 | // of|letter o |
|
| 225 | "o" |
|
| 226 | ); |
|
| 227 | ||
| 228 | private static final Set<String> ENDED_UNAMBIGUOUS = Set.of( |
|
| 229 | // and |
|
| 230 | "an", |
|
| 231 | // for/before |
|
| 232 | "fo", |
|
| 233 | // friend |
|
| 234 | "frien", |
|
| 235 | // just |
|
| 236 | "jus", |
|
| 237 | // lord |
|
| 238 | "lor", |
|
| 239 | // myself |
|
| 240 | "masel", |
|
| 241 | // old |
|
| 242 | "ol", |
|
| 243 | // San (Francisco) |
|
| 244 | "Sa", |
|
| 245 | // shift |
|
| 246 | "shif", |
|
| 247 | // the |
|
| 248 | "th", |
|
| 249 | // what |
|
| 250 | "wha", |
|
| 251 | // world |
|
| 252 | "worl", |
|
| 253 | // Top ~500 common -ing words as English contractions. |
|
| 254 | "acceptin", |
|
| 255 | "accompanyin", |
|
| 256 | "accordin", |
|
| 257 | "accountin", |
|
| 258 | "achievin", |
|
| 259 | "acquirin", |
|
| 260 | "actin", |
|
| 261 | "addin", |
|
| 262 | "addressin", |
|
| 263 | "adjoinin", |
|
| 264 | "adoptin", |
|
| 265 | "advancin", |
|
| 266 | "advertisin", |
|
| 267 | "affectin", |
|
| 268 | "agin", |
|
| 269 | "allowin", |
|
| 270 | "amazin", |
|
| 271 | "analyzin", |
|
| 272 | "answerin", |
|
| 273 | "anythin", |
|
| 274 | "appearin", |
|
| 275 | "applyin", |
|
| 276 | "approachin", |
|
| 277 | "arguin", |
|
| 278 | "arisin", |
|
| 279 | "arrivin", |
|
| 280 | "askin", |
|
| 281 | "assessin", |
|
| 282 | "assumin", |
|
| 283 | "attackin", |
|
| 284 | "attemptin", |
|
| 285 | "attendin", |
|
| 286 | "avoidin", |
|
| 287 | "bankin", |
|
| 288 | "bargainin", |
|
| 289 | "bearin", |
|
| 290 | "beatin", |
|
| 291 | "becomin", |
|
| 292 | "beginnin", |
|
| 293 | "bein", |
|
| 294 | "believin", |
|
| 295 | "belongin", |
|
| 296 | "bendin", |
|
| 297 | "bindin", |
|
| 298 | "bleedin", |
|
| 299 | "blessin", |
|
| 300 | "blowin", |
|
| 301 | "boilin", |
|
| 302 | "borrowin", |
|
| 303 | "breakin", |
|
| 304 | "breathin", |
|
| 305 | "breedin", |
|
| 306 | "bringin", |
|
| 307 | "broadcastin", |
|
| 308 | "buildin", |
|
| 309 | "burnin", |
|
| 310 | "buyin", |
|
| 311 | "calculatin", |
|
| 312 | "callin", |
|
| 313 | "carryin", |
|
| 314 | "castin", |
|
| 315 | "causin", |
|
| 316 | "ceilin", |
|
| 317 | "challengin", |
|
| 318 | "changin", |
|
| 319 | "checkin", |
|
| 320 | "choosin", |
|
| 321 | "claimin", |
|
| 322 | "cleanin", |
|
| 323 | "clearin", |
|
| 324 | "climbin", |
|
| 325 | "closin", |
|
| 326 | "clothin", |
|
| 327 | "collectin", |
|
| 328 | "combinin", |
|
| 329 | "comin", |
|
| 330 | "commandin", |
|
| 331 | "comparin", |
|
| 332 | "compellin", |
|
| 333 | "competin", |
|
| 334 | "computin", |
|
| 335 | "concernin", |
|
| 336 | "concludin", |
|
| 337 | "conditionin", |
|
| 338 | "conductin", |
|
| 339 | "conflictin", |
|
| 340 | "connectin", |
|
| 341 | "considerin", |
|
| 342 | "consistin", |
|
| 343 | "constructin", |
|
| 344 | "consultin", |
|
| 345 | "consumin", |
|
| 346 | "containin", |
|
| 347 | "continuin", |
|
| 348 | "contractin", |
|
| 349 | "contributin", |
|
| 350 | "controllin", |
|
| 351 | "convincin", |
|
| 352 | "cookin", |
|
| 353 | "coolin", |
|
| 354 | "copin", |
|
| 355 | "correspondin", |
|
| 356 | "counselin", |
|
| 357 | "countin", |
|
| 358 | "couplin", |
|
| 359 | "coverin", |
|
| 360 | "creatin", |
|
| 361 | "crossin", |
|
| 362 | "cryin", |
|
| 363 | "cuttin", |
|
| 364 | "dancin", |
|
| 365 | "darlin", |
|
| 366 | "datin", |
|
| 367 | "dealin", |
|
| 368 | "decidin", |
|
| 369 | "declarin", |
|
| 370 | "declinin", |
|
| 371 | "decreasin", |
|
| 372 | "definin", |
|
| 373 | "demandin", |
|
| 374 | "denyin", |
|
| 375 | "dependin", |
|
| 376 | "descendin", |
|
| 377 | "describin", |
|
| 378 | "designin", |
|
| 379 | "destroyin", |
|
| 380 | "determinin", |
|
| 381 | "developin", |
|
| 382 | "differin", |
|
| 383 | "dinin", |
|
| 384 | "directin", |
|
| 385 | "discussin", |
|
| 386 | "distinguishin", |
|
| 387 | "disturbin", |
|
| 388 | "dividin", |
|
| 389 | "doin", |
|
| 390 | "drawin", |
|
| 391 | "dressin", |
|
| 392 | "drinkin", |
|
| 393 | "drivin", |
|
| 394 | "droppin", |
|
| 395 | "dryin", |
|
| 396 | "durin", |
|
| 397 | "dwellin", |
|
| 398 | "dyin", |
|
| 399 | "eatin", |
|
| 400 | "editin", |
|
| 401 | "emergin", |
|
| 402 | "employin", |
|
| 403 | "enablin", |
|
| 404 | "encouragin", |
|
| 405 | "endin", |
|
| 406 | "engagin", |
|
| 407 | "engineerin", |
|
| 408 | "enjoyin", |
|
| 409 | "enterin", |
|
| 410 | "establishin", |
|
| 411 | "evaluatin", |
|
| 412 | "evenin", |
|
| 413 | "everythin", |
|
| 414 | "examinin", |
|
| 415 | "exceedin", |
|
| 416 | "excitin", |
|
| 417 | "excludin", |
|
| 418 | "existin", |
|
| 419 | "expandin", |
|
| 420 | "expectin", |
|
| 421 | "experiencin", |
|
| 422 | "explainin", |
|
| 423 | "explorin", |
|
| 424 | "expressin", |
|
| 425 | "extendin", |
|
| 426 | "facin", |
|
| 427 | "failin", |
|
| 428 | "fallin", |
|
| 429 | "farmin", |
|
| 430 | "fascinatin", |
|
| 431 | "feedin", |
|
| 432 | "feelin", |
|
| 433 | "fightin", |
|
| 434 | "filin", |
|
| 435 | "fillin", |
|
| 436 | "financin", |
|
| 437 | "findin", |
|
| 438 | "firin", |
|
| 439 | "fishin", |
|
| 440 | "fittin", |
|
| 441 | "fixin", |
|
| 442 | "floatin", |
|
| 443 | "flowin", |
|
| 444 | "flyin", |
|
| 445 | "focusin", |
|
| 446 | "followin", |
|
| 447 | "forcin", |
|
| 448 | "foregoin", |
|
| 449 | "formin", |
|
| 450 | "forthcomin", |
|
| 451 | "foundin", |
|
| 452 | "freezin", |
|
| 453 | "fuckin", |
|
| 454 | "functionin", |
|
| 455 | "fundin", |
|
| 456 | "gainin", |
|
| 457 | "gatherin", |
|
| 458 | "generatin", |
|
| 459 | "gettin", |
|
| 460 | "givin", |
|
| 461 | "goin", |
|
| 462 | "governin", |
|
| 463 | "grantin", |
|
| 464 | "growin", |
|
| 465 | "hackin", |
|
| 466 | "handlin", |
|
| 467 | "hangin", |
|
| 468 | "happenin", |
|
| 469 | "havin", |
|
| 470 | "headin", |
|
| 471 | "healin", |
|
| 472 | "hearin", |
|
| 473 | "heatin", |
|
| 474 | "helpin", |
|
| 475 | "hidin", |
|
| 476 | "holdin", |
|
| 477 | "hopin", |
|
| 478 | "housin", |
|
| 479 | "huntin", |
|
| 480 | "identifyin", |
|
| 481 | "imagin", |
|
| 482 | "implementin", |
|
| 483 | "imposin", |
|
| 484 | "improvin", |
|
| 485 | "includin", |
|
| 486 | "increasin", |
|
| 487 | "indicatin", |
|
| 488 | "interestin", |
|
| 489 | "interpretin", |
|
| 490 | "introducin", |
|
| 491 | "involvin", |
|
| 492 | "joinin", |
|
| 493 | "judgin", |
|
| 494 | "keepin", |
|
| 495 | "killin", |
|
| 496 | "knowin", |
|
| 497 | "lackin", |
|
| 498 | "landin", |
|
| 499 | "lastin", |
|
| 500 | "laughin", |
|
| 501 | "layin", |
|
| 502 | "leadin", |
|
| 503 | "leanin", |
|
| 504 | "learnin", |
|
| 505 | "leavin", |
|
| 506 | "lettin", |
|
| 507 | "liftin", |
|
| 508 | "lightin", |
|
| 509 | "lightnin", |
|
| 510 | "limitin", |
|
| 511 | "listenin", |
|
| 512 | "listin", |
|
| 513 | "livin", |
|
| 514 | "loadin", |
|
| 515 | "lookin", |
|
| 516 | "losin", |
|
| 517 | "lovin", |
|
| 518 | "lowerin", |
|
| 519 | "lyin", |
|
| 520 | "maintainin", |
|
| 521 | "makin", |
|
| 522 | "managin", |
|
| 523 | "manufacturin", |
|
| 524 | "mappin", |
|
| 525 | "marketin", |
|
| 526 | "markin", |
|
| 527 | "matchin", |
|
| 528 | "meanin", |
|
| 529 | "measurin", |
|
| 530 | "meetin", |
|
| 531 | "meltin", |
|
| 532 | "minin", |
|
| 533 | "misleadin", |
|
| 534 | "missin", |
|
| 535 | "mixin", |
|
| 536 | "modelin", |
|
| 537 | "monitorin", |
|
| 538 | "mornin", |
|
| 539 | "movin", |
|
| 540 | "neighborin", |
|
| 4 | import java.util.ArrayList; |
|
| 5 | import java.util.HashSet; |
|
| 6 | import java.util.Set; |
|
| 7 | ||
| 8 | import static java.lang.String.format; |
|
| 9 | import static java.util.Collections.emptySet; |
|
| 10 | import static java.util.Collections.sort; |
|
| 11 | ||
| 12 | /** |
|
| 13 | * Placeholder for various types of contractions. |
|
| 14 | */ |
|
| 15 | public class Contractions { |
|
| 16 | ||
| 17 | private final Builder mBuilder; |
|
| 18 | ||
/**
 * Creates an instance that reads its word sets from the given builder.
 * The builder itself is retained; its sets are not copied.
 *
 * @param builder Holder of the four contraction sets; must not be
 *                {@code null}.
 */
private Contractions( final Builder builder ) {
  assert builder != null;
  this.mBuilder = builder;
}
|
| 23 | ||
| 24 | /** |
|
| 25 | * Allows constructing a list of custom contractions. |
|
| 26 | */ |
|
| 27 | @SuppressWarnings( "unused" ) |
|
| 28 | public static class Builder { |
|
| 29 | private final Set<String> mBeganUnambiguous = new HashSet<>(); |
|
| 30 | private final Set<String> mEndedUnambiguous = new HashSet<>(); |
|
| 31 | private final Set<String> mBeganAmbiguous = new HashSet<>(); |
|
| 32 | private final Set<String> mEndedAmbiguous = new HashSet<>(); |
|
| 33 | ||
| 34 | public void withBeganUnambiguous( final Set<String> words ) { |
|
| 35 | mBeganUnambiguous.addAll( words ); |
|
| 36 | } |
|
| 37 | ||
| 38 | public void withEndedUnambiguous( final Set<String> words ) { |
|
| 39 | mEndedUnambiguous.addAll( words ); |
|
| 40 | } |
|
| 41 | ||
| 42 | public void withBeganAmbiguous( final Set<String> words ) { |
|
| 43 | mBeganAmbiguous.addAll( words ); |
|
| 44 | } |
|
| 45 | ||
| 46 | public void withEndedAmbiguous( final Set<String> words ) { |
|
| 47 | mEndedAmbiguous.addAll( words ); |
|
| 48 | } |
|
| 49 | ||
| 50 | /** |
|
| 51 | * Constructs a new set of {@link Contractions} that can be configured |
|
| 52 | * using this {@link Builder} instance. |
|
| 53 | * |
|
| 54 | * @return {@link Contractions} suitable for use with parsing text. |
|
| 55 | */ |
|
| 56 | public Contractions build() { |
|
| 57 | mBeganUnambiguous.addAll( from( mBeganUnambiguous, BEGAN_UNAMBIGUOUS ) ); |
|
| 58 | mEndedUnambiguous.addAll( from( mEndedUnambiguous, ENDED_UNAMBIGUOUS ) ); |
|
| 59 | mBeganAmbiguous.addAll( from( mBeganAmbiguous, BEGAN_AMBIGUOUS ) ); |
|
| 60 | mEndedAmbiguous.addAll( from( mEndedAmbiguous, ENDED_AMBIGUOUS ) ); |
|
| 61 | ||
| 62 | return new Contractions( this ); |
|
| 63 | } |
|
| 64 | ||
| 65 | /** |
|
| 66 | * This returns the {@code fallback} {@link Set} if {@code src} is empty; |
|
| 67 | * otherwise, this returns the empty {@link Set}. |
|
| 68 | * |
|
| 69 | * @param src A set of contractions, possibly empty. |
|
| 70 | * @param fallback The default values to use if {@code src} is empty. |
|
| 71 | * @param <T> The type of data used by both {@link Set}s. |
|
| 72 | * @return An empty {@link Set} if the {@code src} contains at least one |
|
| 73 | * element; otherwise, this will return {@code fallback}. |
|
| 74 | */ |
|
| 75 | private static <T> Set<T> from( final Set<T> src, final Set<T> fallback ) { |
|
| 76 | assert src != null; |
|
| 77 | assert fallback != null; |
|
| 78 | return src.isEmpty() ? fallback : emptySet(); |
|
| 79 | } |
|
| 80 | } |
|
| 81 | ||
| 82 | /** |
|
| 83 | * Answers whether the given word is a contraction that always starts |
|
| 84 | * with an apostrophe. The comparison is case insensitive. This must |
|
| 85 | * only be called when a straight quote is followed by a word. |
|
| 86 | * |
|
| 87 | * @param word The word to compare against the list of known unambiguous |
|
| 88 | * contractions. |
|
| 89 | * @return {@code true} when the given word is in the set of unambiguous |
|
| 90 | * contractions. |
|
| 91 | */ |
|
| 92 | public boolean beganUnambiguously( final String word ) { |
|
| 93 | assert word != null; |
|
| 94 | return getBeganUnambiguous().contains( word.toLowerCase() ); |
|
| 95 | } |
|
| 96 | ||
| 97 | /** |
|
| 98 | * Answers whether the given word could be a contraction but is also a |
|
| 99 | * valid word in non-contracted form. |
|
| 100 | * |
|
| 101 | * @param word The word to compare against the list of known ambiguous |
|
| 102 | * contractions. |
|
| 103 | * @return {@code true} when the given word is in the set of ambiguous |
|
| 104 | * contractions. |
|
| 105 | */ |
|
| 106 | public boolean beganAmbiguously( final String word ) { |
|
| 107 | assert word != null; |
|
| 108 | return getBeganAmbiguous().contains( word.toLowerCase() ); |
|
| 109 | } |
|
| 110 | ||
| 111 | public boolean endedUnambiguously( final String word ) { |
|
| 112 | assert word != null; |
|
| 113 | return getEndedUnambiguous().contains( word.toLowerCase() ); |
|
| 114 | } |
|
| 115 | ||
| 116 | public boolean endedAmbiguously( final String word ) { |
|
| 117 | assert word != null; |
|
| 118 | final var check = word.toLowerCase(); |
|
| 119 | ||
| 120 | // Ensure that 'n' isn't matched for ambiguity by enforcing length, yet |
|
| 121 | // allow o' to match because 'a sentence can end with the letter o'. |
|
| 122 | return getEndedAmbiguous().contains( check ) || |
|
| 123 | check.endsWith( "s" ) || check.endsWith( "z" ) || |
|
| 124 | check.endsWith( "x" ) || (check.length() > 1 && check.endsWith( "n" )); |
|
| 125 | } |
|
| 126 | ||
| 127 | private Set<String> getBeganUnambiguous() { |
|
| 128 | return mBuilder.mBeganUnambiguous; |
|
| 129 | } |
|
| 130 | ||
| 131 | private Set<String> getEndedUnambiguous() { |
|
| 132 | return mBuilder.mEndedUnambiguous; |
|
| 133 | } |
|
| 134 | ||
| 135 | private Set<String> getBeganAmbiguous() { |
|
| 136 | return mBuilder.mBeganAmbiguous; |
|
| 137 | } |
|
| 138 | ||
| 139 | private Set<String> getEndedAmbiguous() { |
|
| 140 | return mBuilder.mEndedAmbiguous; |
|
| 141 | } |
|
| 142 | ||
| 143 | ||
| 144 | @Override |
|
| 145 | public String toString() { |
|
| 146 | return |
|
| 147 | toString( getBeganAmbiguous(), "Ambiguous Began", "'%s" ) + |
|
| 148 | toString( getEndedAmbiguous(), "Ambiguous Ended", "%s'" ) + |
|
| 149 | toString( getBeganUnambiguous(), "Unambiguous Began", "'%s" ) + |
|
| 150 | toString( getEndedUnambiguous(), "Unambiguous Ended", "%s'" ); |
|
| 151 | } |
|
| 152 | ||
| 153 | private String toString( |
|
| 154 | final Set<String> words, final String category, final String fmt ) { |
|
| 155 | final var sb = new StringBuilder( 16384 ); |
|
| 156 | final var newline = System.lineSeparator(); |
|
| 157 | final var list = new ArrayList<>( words ); |
|
| 158 | ||
| 159 | sort( list ); |
|
| 160 | sb.append( format( "%n%s%n", category ) ); |
|
| 161 | list.forEach( ( s ) -> sb.append( format( fmt, s ) ).append( newline ) ); |
|
| 162 | ||
| 163 | return sb.toString(); |
|
| 164 | } |
|
| 165 | ||
| 166 | /** |
|
| 167 | * Words having a straight apostrophe that cannot be mistaken for an |
|
| 168 | * opening single quote. |
|
| 169 | */ |
|
| 170 | private static final Set<String> BEGAN_UNAMBIGUOUS = Set.of( |
|
| 171 | "aporth", |
|
| 172 | "boutcha", |
|
| 173 | "boutchu", |
|
| 174 | "cept", |
|
| 175 | "dillo", |
|
| 176 | "em", |
|
| 177 | "fraid", |
|
| 178 | "gainst", |
|
| 179 | "n", |
|
| 180 | "neath", |
|
| 181 | "nother", |
|
| 182 | "onna", |
|
| 183 | "onna'", |
|
| 184 | "pon", |
|
| 185 | "s", |
|
| 186 | "sblood", |
|
| 187 | "scuse", |
|
| 188 | "sfar", |
|
| 189 | "sfoot", |
|
| 190 | "t", |
|
| 191 | "taint", |
|
| 192 | "tain", |
|
| 193 | "til", |
|
| 194 | "tis", |
|
| 195 | "tisn", |
|
| 196 | "tshall", |
|
| 197 | "twas", |
|
| 198 | "twasn", |
|
| 199 | "tween", |
|
| 200 | "twere", |
|
| 201 | "tweren", |
|
| 202 | "twixt", |
|
| 203 | "twon", |
|
| 204 | "twou", |
|
| 205 | "twould", |
|
| 206 | "twouldn", |
|
| 207 | "ve" |
|
| 208 | ); |
|
| 209 | ||
| 210 | /** |
|
| 211 | * Words having a straight apostrophe that may be either part of a |
|
| 212 | * contraction or a word that stands alone beside an opening single quote. |
|
| 213 | */ |
|
| 214 | private static final Set<String> BEGAN_AMBIGUOUS = Set.of( |
|
| 215 | // about|boxing match |
|
| 216 | "bout", |
|
| 217 | // because|causal |
|
| 218 | "cause", |
|
| 219 | // what you|choo choo train |
|
| 220 | "choo", |
|
| 221 | // he|e pluribus unum |
|
| 222 | "e", |
|
| 223 | // here|earlier |
|
| 224 | "ere", |
|
| 225 | // afro|to and fro |
|
| 226 | "fro", |
|
| 227 | // whore|ho ho! |
|
| 228 | "ho", |
|
| 229 | // okay|letter K |
|
| 230 | "kay", |
|
| 231 | // hello|lo and behold |
|
| 232 | "lo", |
|
| 233 | // are|regarding |
|
| 234 | "re", |
|
| 235 | // what's up|to sup |
|
| 236 | "sup", |
|
| 237 | // it will|twill fabric |
|
| 238 | "twill", |
|
| 239 | // them|utterance |
|
| 240 | "um", |
|
| 241 | // is that|Iranian village |
|
| 242 | "zat" |
|
| 243 | ); |
|
| 244 | ||
| 245 | private static final Set<String> ENDED_AMBIGUOUS = Set.of( |
|
| 246 | // give|martial arts garment |
|
| 247 | "gi", |
|
| 248 | // in|I |
|
| 249 | "i", |
|
| 250 | // of|letter o |
|
| 251 | "o" |
|
| 252 | ); |
|
| 253 | ||
| 254 | private static final Set<String> ENDED_UNAMBIGUOUS = Set.of( |
|
| 255 | // and |
|
| 256 | "an", |
|
| 257 | // for/before |
|
| 258 | "fo", |
|
| 259 | // friend |
|
| 260 | "frien", |
|
| 261 | // just |
|
| 262 | "jus", |
|
| 263 | // lord |
|
| 264 | "lor", |
|
| 265 | // myself |
|
| 266 | "masel", |
|
| 267 | // old |
|
| 268 | "ol", |
|
| 269 | // San (Francisco) |
|
| 270 | "Sa", |
|
| 271 | // shift |
|
| 272 | "shif", |
|
| 273 | // the |
|
| 274 | "th", |
|
| 275 | // what |
|
| 276 | "wha", |
|
| 277 | // world |
|
| 278 | "worl", |
|
| 279 | // Top ~500 common -ing words as English contractions. |
|
| 280 | "acceptin", |
|
| 281 | "accompanyin", |
|
| 282 | "accordin", |
|
| 283 | "accountin", |
|
| 284 | "achievin", |
|
| 285 | "acquirin", |
|
| 286 | "actin", |
|
| 287 | "addin", |
|
| 288 | "addressin", |
|
| 289 | "adjoinin", |
|
| 290 | "adoptin", |
|
| 291 | "advancin", |
|
| 292 | "advertisin", |
|
| 293 | "affectin", |
|
| 294 | "agin", |
|
| 295 | "allowin", |
|
| 296 | "amazin", |
|
| 297 | "analyzin", |
|
| 298 | "answerin", |
|
| 299 | "anythin", |
|
| 300 | "appearin", |
|
| 301 | "applyin", |
|
| 302 | "approachin", |
|
| 303 | "arguin", |
|
| 304 | "arisin", |
|
| 305 | "arrivin", |
|
| 306 | "askin", |
|
| 307 | "assessin", |
|
| 308 | "assumin", |
|
| 309 | "attackin", |
|
| 310 | "attemptin", |
|
| 311 | "attendin", |
|
| 312 | "avoidin", |
|
| 313 | "bankin", |
|
| 314 | "bargainin", |
|
| 315 | "bearin", |
|
| 316 | "beatin", |
|
| 317 | "becomin", |
|
| 318 | "beginnin", |
|
| 319 | "bein", |
|
| 320 | "believin", |
|
| 321 | "belongin", |
|
| 322 | "bendin", |
|
| 323 | "bindin", |
|
| 324 | "bleedin", |
|
| 325 | "blessin", |
|
| 326 | "blowin", |
|
| 327 | "boilin", |
|
| 328 | "borrowin", |
|
| 329 | "breakin", |
|
| 330 | "breathin", |
|
| 331 | "breedin", |
|
| 332 | "bringin", |
|
| 333 | "broadcastin", |
|
| 334 | "buildin", |
|
| 335 | "burnin", |
|
| 336 | "buyin", |
|
| 337 | "calculatin", |
|
| 338 | "callin", |
|
| 339 | "carryin", |
|
| 340 | "castin", |
|
| 341 | "causin", |
|
| 342 | "ceilin", |
|
| 343 | "challengin", |
|
| 344 | "changin", |
|
| 345 | "checkin", |
|
| 346 | "choosin", |
|
| 347 | "claimin", |
|
| 348 | "cleanin", |
|
| 349 | "clearin", |
|
| 350 | "climbin", |
|
| 351 | "closin", |
|
| 352 | "clothin", |
|
| 353 | "collectin", |
|
| 354 | "combinin", |
|
| 355 | "comin", |
|
| 356 | "commandin", |
|
| 357 | "comparin", |
|
| 358 | "compellin", |
|
| 359 | "competin", |
|
| 360 | "computin", |
|
| 361 | "concernin", |
|
| 362 | "concludin", |
|
| 363 | "conditionin", |
|
| 364 | "conductin", |
|
| 365 | "conflictin", |
|
| 366 | "connectin", |
|
| 367 | "considerin", |
|
| 368 | "consistin", |
|
| 369 | "constructin", |
|
| 370 | "consultin", |
|
| 371 | "consumin", |
|
| 372 | "containin", |
|
| 373 | "continuin", |
|
| 374 | "contractin", |
|
| 375 | "contributin", |
|
| 376 | "controllin", |
|
| 377 | "convincin", |
|
| 378 | "cookin", |
|
| 379 | "coolin", |
|
| 380 | "copin", |
|
| 381 | "correspondin", |
|
| 382 | "counselin", |
|
| 383 | "countin", |
|
| 384 | "couplin", |
|
| 385 | "coverin", |
|
| 386 | "creatin", |
|
| 387 | "crossin", |
|
| 388 | "cryin", |
|
| 389 | "cuttin", |
|
| 390 | "dancin", |
|
| 391 | "darlin", |
|
| 392 | "datin", |
|
| 393 | "dealin", |
|
| 394 | "decidin", |
|
| 395 | "declarin", |
|
| 396 | "declinin", |
|
| 397 | "decreasin", |
|
| 398 | "definin", |
|
| 399 | "demandin", |
|
| 400 | "denyin", |
|
| 401 | "dependin", |
|
| 402 | "descendin", |
|
| 403 | "describin", |
|
| 404 | "designin", |
|
| 405 | "destroyin", |
|
| 406 | "determinin", |
|
| 407 | "developin", |
|
| 408 | "differin", |
|
| 409 | "dinin", |
|
| 410 | "directin", |
|
| 411 | "discussin", |
|
| 412 | "distinguishin", |
|
| 413 | "disturbin", |
|
| 414 | "dividin", |
|
| 415 | "doin", |
|
| 416 | "drawin", |
|
| 417 | "dressin", |
|
| 418 | "drinkin", |
|
| 419 | "drivin", |
|
| 420 | "droppin", |
|
| 421 | "dryin", |
|
| 422 | "durin", |
|
| 423 | "dwellin", |
|
| 424 | "dyin", |
|
| 425 | "eatin", |
|
| 426 | "editin", |
|
| 427 | "emergin", |
|
| 428 | "employin", |
|
| 429 | "enablin", |
|
| 430 | "encouragin", |
|
| 431 | "endin", |
|
| 432 | "engagin", |
|
| 433 | "engineerin", |
|
| 434 | "enjoyin", |
|
| 435 | "enterin", |
|
| 436 | "establishin", |
|
| 437 | "evaluatin", |
|
| 438 | "evenin", |
|
| 439 | "everythin", |
|
| 440 | "examinin", |
|
| 441 | "exceedin", |
|
| 442 | "excitin", |
|
| 443 | "excludin", |
|
| 444 | "existin", |
|
| 445 | "expandin", |
|
| 446 | "expectin", |
|
| 447 | "experiencin", |
|
| 448 | "explainin", |
|
| 449 | "explorin", |
|
| 450 | "expressin", |
|
| 451 | "extendin", |
|
| 452 | "facin", |
|
| 453 | "failin", |
|
| 454 | "fallin", |
|
| 455 | "farmin", |
|
| 456 | "fascinatin", |
|
| 457 | "feedin", |
|
| 458 | "feelin", |
|
| 459 | "fightin", |
|
| 460 | "filin", |
|
| 461 | "fillin", |
|
| 462 | "financin", |
|
| 463 | "findin", |
|
| 464 | "firin", |
|
| 465 | "fishin", |
|
| 466 | "fittin", |
|
| 467 | "fixin", |
|
| 468 | "floatin", |
|
| 469 | "flowin", |
|
| 470 | "flyin", |
|
| 471 | "focusin", |
|
| 472 | "followin", |
|
| 473 | "forcin", |
|
| 474 | "foregoin", |
|
| 475 | "formin", |
|
| 476 | "forthcomin", |
|
| 477 | "foundin", |
|
| 478 | "freezin", |
|
| 479 | "fuckin", |
|
| 480 | "functionin", |
|
| 481 | "fundin", |
|
| 482 | "gainin", |
|
| 483 | "gatherin", |
|
| 484 | "generatin", |
|
| 485 | "gettin", |
|
| 486 | "givin", |
|
| 487 | "goin", |
|
| 488 | "governin", |
|
| 489 | "grantin", |
|
| 490 | "growin", |
|
| 491 | "hackin", |
|
| 492 | "handlin", |
|
| 493 | "hangin", |
|
| 494 | "happenin", |
|
| 495 | "havin", |
|
| 496 | "headin", |
|
| 497 | "healin", |
|
| 498 | "hearin", |
|
| 499 | "heatin", |
|
| 500 | "helpin", |
|
| 501 | "hidin", |
|
| 502 | "holdin", |
|
| 503 | "hopin", |
|
| 504 | "housin", |
|
| 505 | "huntin", |
|
| 506 | "identifyin", |
|
| 507 | "imagin", |
|
| 508 | "implementin", |
|
| 509 | "imposin", |
|
| 510 | "improvin", |
|
| 511 | "includin", |
|
| 512 | "increasin", |
|
| 513 | "indicatin", |
|
| 514 | "interestin", |
|
| 515 | "interpretin", |
|
| 516 | "introducin", |
|
| 517 | "involvin", |
|
| 518 | "joinin", |
|
| 519 | "judgin", |
|
| 520 | "keepin", |
|
| 521 | "killin", |
|
| 522 | "knowin", |
|
| 523 | "lackin", |
|
| 524 | "landin", |
|
| 525 | "lastin", |
|
| 526 | "laughin", |
|
| 527 | "layin", |
|
| 528 | "leadin", |
|
| 529 | "leanin", |
|
| 530 | "learnin", |
|
| 531 | "leavin", |
|
| 532 | "lettin", |
|
| 533 | "liftin", |
|
| 534 | "lightin", |
|
| 535 | "lightnin", |
|
| 536 | "limitin", |
|
| 537 | "listenin", |
|
| 538 | "listin", |
|
| 539 | "livin", |
|
| 540 | "loadin", |
|
| 541 | "lookin", |
|
| 542 | "losin", |
|
| 543 | "lovin", |
|
| 544 | "lowerin", |
|
| 545 | "lyin", |
|
| 546 | "maintainin", |
|
| 547 | "makin", |
|
| 548 | "managin", |
|
| 549 | "manufacturin", |
|
| 550 | "mappin", |
|
| 551 | "marketin", |
|
| 552 | "markin", |
|
| 553 | "matchin", |
|
| 554 | "meanin", |
|
| 555 | "measurin", |
|
| 556 | "meetin", |
|
| 557 | "meltin", |
|
| 558 | "minin", |
|
| 559 | "misleadin", |
|
| 560 | "missin", |
|
| 561 | "mixin", |
|
| 562 | "modelin", |
|
| 563 | "monitorin", |
|
| 564 | "mornin", |
|
| 565 | "movin", |
|
| 566 | "neighborin", |
|
| 567 | "neighbourin", |
|
| 541 | 568 | "nothin", |
| 542 | 569 | "notin", |
| 1 | package com.whitemagicsoftware.keenquotes; |
|
| 2 | ||
| 3 | import java.util.ArrayList; |
|
| 4 | import java.util.Map; |
|
| 5 | import java.util.function.Consumer; |
|
| 6 | ||
| 7 | import static com.whitemagicsoftware.keenquotes.TokenType.*; |
|
| 8 | import static java.util.Collections.sort; |
|
| 9 | ||
| 10 | /** |
|
| 11 | * Responsible for converting curly quotes to HTML entities throughout a |
|
| 12 | * text string. |
|
| 13 | */ |
|
| 14 | public class Converter { |
|
| 15 | private static final Map<TokenType, String> REPLACEMENTS = Map.of( |
|
| 16 | QUOTE_OPENING_SINGLE, "‘", |
|
| 17 | QUOTE_CLOSING_SINGLE, "’", |
|
| 18 | QUOTE_OPENING_DOUBLE, "“", |
|
| 19 | QUOTE_CLOSING_DOUBLE, "”", |
|
| 20 | QUOTE_STRAIGHT_SINGLE, "'", |
|
| 21 | QUOTE_STRAIGHT_DOUBLE, "\"", |
|
| 22 | QUOTE_APOSTROPHE, "'", |
|
| 23 | QUOTE_PRIME_SINGLE, "′", |
|
| 24 | QUOTE_PRIME_DOUBLE, "″" |
|
| 25 | ); |
|
| 26 | ||
| 27 | /** |
|
| 28 | * Converts straight quotes to curly quotes and primes. Any quotation marks |
|
| 29 | * that cannot be converted are passed to the {@link Consumer}. |
|
| 30 | * |
|
| 31 | * @param text The text to parse. |
|
| 32 | * @param unresolved Recipient for ambiguous {@link Lexeme}s. |
|
| 33 | * @return The given text string with as many straight quotes converted to |
|
| 34 | * curly quotes as is feasible. |
|
| 35 | */ |
|
| 36 | public static String convert( |
|
| 37 | final String text, final Consumer<Lexeme> unresolved ) { |
|
| 38 | final var parser = new Parser( text ); |
|
| 39 | final var tokens = new ArrayList<Token>(); |
|
| 40 | ||
| 41 | // Parse the tokens and consume all unresolved lexemes. |
|
| 42 | parser.parse( tokens::add, unresolved ); |
|
| 43 | ||
| 44 | // The parser may emit tokens in any order. |
|
| 45 | sort( tokens ); |
|
| 46 | ||
| 47 | final var result = new StringBuilder( text.length() ); |
|
| 48 | var position = 0; |
|
| 49 | ||
| 50 | for( final var token : tokens ) { |
|
| 51 | if( position <= token.began() ) { |
|
| 52 | result.append( text, position, token.began() ); |
|
| 53 | result.append( REPLACEMENTS.get( token.getType() ) ); |
|
| 54 | } |
|
| 55 | ||
| 56 | position = token.ended(); |
|
| 57 | } |
|
| 58 | ||
| 59 | return result.append( text.substring( position ) ).toString(); |
|
| 60 | } |
|
| 61 | } |
|
| 1 | 62 |
| 2 | 2 | package com.whitemagicsoftware.keenquotes; |
| 3 | 3 | |
| 4 | import java.util.ArrayList; |
|
| 5 | import java.util.Map; |
|
| 6 | import java.util.function.Consumer; |
|
| 4 | import picocli.CommandLine; |
|
| 7 | 5 | |
| 8 | import static com.whitemagicsoftware.keenquotes.TokenType.*; |
|
| 9 | import static java.util.Collections.sort; |
|
| 6 | import java.io.BufferedReader; |
|
| 7 | import java.io.IOException; |
|
| 8 | import java.io.InputStream; |
|
| 9 | import java.io.InputStreamReader; |
|
| 10 | import java.util.Properties; |
|
| 11 | ||
| 12 | import static java.lang.String.format; |
|
| 13 | import static picocli.CommandLine.Help.Ansi.Style.*; |
|
| 14 | import static picocli.CommandLine.Help.ColorScheme; |
|
| 10 | 15 | |
/**
 * Command-line entry point for the application. Reads text from standard
 * input, delegates quote conversion to {@link Converter}, and writes the
 * result to standard output; quotes that cannot be reliably resolved are
 * written to standard error. Can also list the known contractions or print
 * the application version.
 */
| 16 | 21 | public final class KeenQuotes { |
| 17 | private static final Map<TokenType, String> REPLACEMENTS = Map.of( |
|
| 18 | QUOTE_OPENING_SINGLE, "‘", |
|
| 19 | QUOTE_CLOSING_SINGLE, "’", |
|
| 20 | QUOTE_OPENING_DOUBLE, "“", |
|
| 21 | QUOTE_CLOSING_DOUBLE, "”", |
|
| 22 | QUOTE_STRAIGHT_SINGLE, "'", |
|
| 23 | QUOTE_STRAIGHT_DOUBLE, "\"", |
|
| 24 | QUOTE_APOSTROPHE, "'", |
|
| 25 | QUOTE_PRIME_SINGLE, "′", |
|
| 26 | QUOTE_PRIME_DOUBLE, "″" |
|
| 27 | ); |
|
| 22 | private final Settings mSettings = new Settings( this ); |
|
| 23 | ||
| 24 | private static ColorScheme createColourScheme() { |
|
| 25 | return new ColorScheme.Builder() |
|
| 26 | .commands( bold ) |
|
| 27 | .options( fg_blue, bold ) |
|
| 28 | .parameters( fg_blue ) |
|
| 29 | .optionParams( italic ) |
|
| 30 | .errors( fg_red, bold ) |
|
| 31 | .stackTraces( italic ) |
|
| 32 | .build(); |
|
| 33 | } |
|
| 34 | ||
| 35 | public void run() { |
|
| 36 | if( getSettings().displayList() ) { |
|
| 37 | displayList(); |
|
| 38 | } |
|
| 39 | else { |
|
| 40 | convert(); |
|
| 41 | } |
|
| 42 | } |
|
| 43 | ||
| 44 | private void displayList() { |
|
| 45 | System.out.println( new Contractions.Builder().build().toString() ); |
|
| 46 | } |
|
| 47 | ||
| 48 | private void convert() { |
|
| 49 | final StringBuilder sb = new StringBuilder(); |
|
| 50 | ||
| 51 | try( final BufferedReader reader = open( System.in ) ) { |
|
| 52 | String line; |
|
| 53 | final var sep = System.lineSeparator(); |
|
| 54 | ||
| 55 | while( (line = reader.readLine()) != null ) { |
|
| 56 | sb.append( line ); |
|
| 57 | sb.append( sep ); |
|
| 58 | } |
|
| 59 | ||
| 60 | System.out.println( |
|
| 61 | Converter.convert( sb.toString(), System.err::println ) |
|
| 62 | ); |
|
| 63 | } catch( final Exception ex ) { |
|
| 64 | ex.printStackTrace( System.err ); |
|
| 65 | } |
|
| 66 | } |
|
| 67 | ||
| 68 | private Settings getSettings() { |
|
| 69 | return mSettings; |
|
| 70 | } |
|
| 28 | 71 | |
| 29 | 72 | /** |
| 30 | * Converts straight quotes to curly quotes and primes. Any quotation marks |
|
| 31 | * that cannot be converted are passed to the {@link Consumer}. |
|
| 73 | * Returns the application version number retrieved from the application |
|
| 74 | * properties file. The properties file is generated at build time, which |
|
| 75 | * keys off the repository. |
|
| 32 | 76 | * |
| 33 | * @param text The text to parse. |
|
| 34 | * @param unresolved Recipient for ambiguous {@link Lexeme}s. |
|
| 35 | * @return The given text string with as many straight quotes converted to |
|
| 36 | * curly quotes as is feasible. |
|
| 77 | * @return The application version number. |
|
| 78 | * @throws RuntimeException An {@link IOException} occurred. |
|
| 37 | 79 | */ |
| 38 | public static String convert( |
|
| 39 | final String text, final Consumer<Lexeme> unresolved ) { |
|
| 40 | final var parser = new Parser( text ); |
|
| 41 | final var tokens = new ArrayList<Token>(); |
|
| 80 | private static String getVersion() { |
|
| 81 | try { |
|
| 82 | final var properties = loadProperties( "app.properties" ); |
|
| 83 | return properties.getProperty( "application.version" ); |
|
| 84 | } catch( final Exception ex ) { |
|
| 85 | throw new RuntimeException( ex ); |
|
| 86 | } |
|
| 87 | } |
|
| 42 | 88 | |
| 43 | // Parse the tokens and consume all unresolved lexemes. |
|
| 44 | parser.parse( tokens::add, unresolved ); |
|
| 89 | @SuppressWarnings( "SameParameterValue" ) |
|
| 90 | private static Properties loadProperties( final String resource ) |
|
| 91 | throws IOException { |
|
| 92 | final var properties = new Properties(); |
|
| 93 | properties.load( getResourceAsStream( getResourceName( resource ) ) ); |
|
| 94 | return properties; |
|
| 95 | } |
|
| 45 | 96 | |
| 46 | // The parser may emit tokens in any order. |
|
| 47 | sort( tokens ); |
|
| 97 | private static String getResourceName( final String resource ) { |
|
| 98 | return format( "%s/%s", getPackagePath(), resource ); |
|
| 99 | } |
|
| 48 | 100 | |
| 49 | final var result = new StringBuilder( text.length() ); |
|
| 50 | var position = 0; |
|
| 101 | private static String getPackagePath() { |
|
| 102 | return KeenQuotes.class.getPackageName().replace( '.', '/' ); |
|
| 103 | } |
|
| 51 | 104 | |
| 52 | for( final var token : tokens ) { |
|
| 53 | if( position <= token.began() ) { |
|
| 54 | result.append( text, position, token.began() ); |
|
| 55 | result.append( REPLACEMENTS.get( token.getType() ) ); |
|
| 56 | } |
|
| 105 | private static InputStream getResourceAsStream( final String resource ) { |
|
| 106 | return KeenQuotes.class.getClassLoader().getResourceAsStream( resource ); |
|
| 107 | } |
|
| 57 | 108 | |
| 58 | position = token.ended(); |
|
| 59 | } |
|
| 109 | @SuppressWarnings( "SameParameterValue" ) |
|
| 110 | private static BufferedReader open( final InputStream in ) { |
|
| 111 | return new BufferedReader( new InputStreamReader( in ) ); |
|
| 112 | } |
|
| 60 | 113 | |
| 61 | return result.append( text.substring( position ) ).toString(); |
|
| 114 | public static void main( final String[] args ) { |
|
| 115 | final var app = new KeenQuotes(); |
|
| 116 | final var parser = new CommandLine( app.getSettings() ); |
|
| 117 | parser.setColorScheme( createColourScheme() ); |
|
| 118 | ||
| 119 | final var exitCode = parser.execute( args ); |
|
| 120 | final var parseResult = parser.getParseResult(); |
|
| 121 | ||
| 122 | if( parseResult.isUsageHelpRequested() ) { |
|
| 123 | System.exit( exitCode ); |
|
| 124 | } |
|
| 125 | else if( parseResult.isVersionHelpRequested() ) { |
|
| 126 | System.out.println( getVersion() ); |
|
| 127 | System.exit( exitCode ); |
|
| 128 | } |
|
| 62 | 129 | } |
| 63 | 130 | } |
| 1 | /* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ |
|
| 2 | package com.whitemagicsoftware.keenquotes; |
|
| 3 | ||
| 4 | import picocli.CommandLine; |
|
| 5 | ||
| 6 | import java.util.concurrent.Callable; |
|
| 7 | ||
| 8 | @CommandLine.Command( |
|
| 9 | name = "KeenQuotes", |
|
| 10 | mixinStandardHelpOptions = true, |
|
| 11 | description = "Converts straight quotes to curly quotes." |
|
| 12 | ) |
|
| 13 | @SuppressWarnings( {"FieldMayBeFinal", "CanBeFinal"} ) |
|
| 14 | public final class Settings implements Callable<Integer> { |
|
| 15 | /** |
|
| 16 | * Main executable class. |
|
| 17 | */ |
|
| 18 | private final KeenQuotes mMain; |
|
| 19 | ||
| 20 | // /** |
|
| 21 | // * List of unambiguous contractions having leading apostrophes. |
|
| 22 | // */ |
|
| 23 | // @CommandLine.Option( |
|
| 24 | // names = {"-ub", "--unamb-began"}, |
|
| 25 | // description = |
|
| 26 | // "Contractions to treat as unambiguous (e.g., cause,bout)", |
|
| 27 | // paramLabel = "words" |
|
| 28 | // ) |
|
| 29 | // private String[] mUnambiguousBegan; |
|
| 30 | // |
|
| 31 | // /** |
|
| 32 | // * List of unambiguous contractions having lagging apostrophes. |
|
| 33 | // */ |
|
| 34 | // @CommandLine.Option( |
|
| 35 | // names = {"-ue", "--unamb-ended"}, |
|
| 36 | // description = |
|
| 37 | // "Contractions to treat as unambiguous (e.g., frien,thinkin)", |
|
| 38 | // paramLabel = "words" |
|
| 39 | // ) |
|
| 40 | // private String[] mUnambiguousEnded; |
|
| 41 | // |
|
| 42 | // /** |
|
| 43 | // * List of ambiguous contractions having leading apostrophes. |
|
| 44 | // */ |
|
| 45 | // @CommandLine.Option( |
|
| 46 | // names = {"-ab", "--amb-began"}, |
|
| 47 | // description = |
|
| 48 | // "Contractions to treat as ambiguous (e.g., sup,kay)", |
|
| 49 | // paramLabel = "words" |
|
| 50 | // ) |
|
| 51 | // private String[] mAmbiguousBegan; |
|
| 52 | // |
|
| 53 | // /** |
|
| 54 | // * List of ambiguous contractions having lagging apostrophes. |
|
| 55 | // */ |
|
| 56 | // @CommandLine.Option( |
|
| 57 | // names = {"-ae", "--amb-ended"}, |
|
| 58 | // description = |
|
| 59 | // "Contractions to treat as ambiguous (e.g., gi,o)", |
|
| 60 | // paramLabel = "words" |
|
| 61 | // ) |
|
| 62 | // private String[] mAmbiguousEnded; |
|
| 63 | // |
|
| 64 | /** |
|
| 65 | * Display default values. |
|
| 66 | */ |
|
| 67 | @CommandLine.Option( |
|
| 68 | names = {"-l", "--list"}, |
|
| 69 | description = "List all ambiguous and unambiguous contractions" |
|
| 70 | ) |
|
| 71 | private boolean mDisplayList; |
|
| 72 | ||
| 73 | public Settings( final KeenQuotes main ) { |
|
| 74 | assert main != null; |
|
| 75 | mMain = main; |
|
| 76 | } |
|
| 77 | ||
| 78 | /** |
|
| 79 | * Answers whether the contractions listings should be displayed. |
|
| 80 | * |
|
| 81 | * @return {@code true} to list the contractions. |
|
| 82 | */ |
|
| 83 | public boolean displayList() { |
|
| 84 | return mDisplayList; |
|
| 85 | } |
|
| 86 | ||
| 87 | /** |
|
| 88 | * Invoked after the command-line arguments are parsed to launch the |
|
| 89 | * application. |
|
| 90 | * |
|
| 91 | * @return Exit level zero. |
|
| 92 | */ |
|
| 93 | @Override |
|
| 94 | public Integer call() { |
|
| 95 | mMain.run(); |
|
| 96 | return 0; |
|
| 97 | } |
|
| 98 | } |
|
| 1 | 99 |
| 2 | 2 | package com.whitemagicsoftware.keenquotes; |
| 3 | 3 | |
| 4 | import org.junit.jupiter.api.Disabled; |
|
| 4 | 5 | import org.junit.jupiter.api.Test; |
| 5 | 6 | import org.junit.jupiter.params.ParameterizedTest; |
| 6 | 7 | import org.junit.jupiter.params.provider.ValueSource; |
| 7 | 8 | |
| 8 | 9 | import java.io.BufferedReader; |
| 9 | 10 | import java.io.IOException; |
| 10 | 11 | import java.io.InputStreamReader; |
| 11 | 12 | import java.util.function.Function; |
| 12 | 13 | |
| 14 | import static com.whitemagicsoftware.keenquotes.Converter.convert; |
|
| 13 | 15 | import static java.lang.System.out; |
| 14 | 16 | import static org.junit.jupiter.api.Assertions.assertEquals; |
| ... | ||
| 24 | 26 | */ |
| 25 | 27 | @Test |
| 26 | //@Disabled |
|
| 28 | @Disabled |
|
| 27 | 29 | public void test_parse_SingleLine_Parsed() { |
| 28 | out.println( KeenQuotes.convert( |
|
| 30 | out.println( convert( |
|
| 29 | 31 | "\"’Kearney lives on the banks of Killarney—’", |
| 30 | 32 | out::println |
| ... | ||
| 39 | 41 | @Test |
| 40 | 42 | public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException { |
| 41 | testConverter( text -> KeenQuotes.convert( text, ( lexeme ) -> {} ) ); |
|
| 43 | testConverter( text -> convert( text, ( lexeme ) -> {} ) ); |
|
| 42 | 44 | } |
| 43 | 45 | |
| 44 | 46 | @ParameterizedTest |
| 45 | @ValueSource( strings = {"westrup"} ) |
|
| 47 | @ValueSource( strings = {"habberton"} ) |
|
| 46 | 48 | void test_Parse_Story_Converted( final String filename ) throws IOException { |
| 47 | 49 | final var sb = new StringBuilder( 2 ^ 20 ); |
| ... | ||
| 55 | 57 | } |
| 56 | 58 | |
| 57 | System.out.println( KeenQuotes.convert( sb.toString(), out::println ) ); |
|
| 59 | System.out.println( convert( sb.toString(), out::println ) ); |
|
| 58 | 60 | } |
| 59 | 61 | |