| 61 | 61 | mEndedAmbiguous.addAll( ENDED_AMBIGUOUS ); |
| 62 | 62 | |
| 63 | return new Contractions( this ); |
|
| 64 | } |
|
| 65 | ||
| 66 | /** |
|
| 67 | * This returns the {@code fallback} {@link Set} if {@code src} is empty; |
|
| 68 | * otherwise, this returns the empty {@link Set}. |
|
| 69 | * |
|
| 70 | * @param src A set of contractions, possibly empty. |
|
| 71 | * @param fallback The default values to use if {@code src} is empty. |
|
| 72 | * @param <T> The type of data used by both {@link Set}s. |
|
| 73 | * @return An empty {@link Set} if the {@code src} contains at least one |
|
| 74 | * element; otherwise, this will return {@code fallback}. |
|
| 75 | */ |
|
| 76 | private static <T> Set<T> from( final Set<T> src, final Set<T> fallback ) { |
|
| 77 | assert src != null; |
|
| 78 | assert fallback != null; |
|
| 79 | return src.isEmpty() ? fallback : emptySet(); |
|
| 80 | } |
|
| 81 | } |
|
| 82 | ||
| 83 | /** |
|
| 84 | * Answers whether the given word is a contraction that always starts |
|
| 85 | * with an apostrophe. The comparison is case insensitive. This must |
|
| 86 | * only be called when a straight quote is followed by a word. |
|
| 87 | * |
|
| 88 | * @param word The word to compare against the list of known unambiguous |
|
| 89 | * contractions. |
|
| 90 | * @return {@code true} when the given word is in the set of unambiguous |
|
| 91 | * contractions. |
|
| 92 | */ |
|
| 93 | public boolean beganUnambiguously( final String word ) { |
|
| 94 | assert word != null; |
|
| 95 | return getBeganUnambiguous().contains( word.toLowerCase() ); |
|
| 96 | } |
|
| 97 | ||
| 98 | /** |
|
| 99 | * Answers whether the given word could be a contraction but is also a |
|
| 100 | * valid word in non-contracted form. |
|
| 101 | * |
|
| 102 | * @param word The word to compare against the list of known ambiguous |
|
| 103 | * contractions. |
|
| 104 | * @return {@code true} when the given word is in the set of ambiguous |
|
| 105 | * contractions. |
|
| 106 | */ |
|
| 107 | public boolean beganAmbiguously( final String word ) { |
|
| 108 | assert word != null; |
|
| 109 | return getBeganAmbiguous().contains( word.toLowerCase() ); |
|
| 110 | } |
|
| 111 | ||
| 112 | public boolean endedUnambiguously( final String word ) { |
|
| 113 | assert word != null; |
|
| 114 | return getEndedUnambiguous().contains( word.toLowerCase() ); |
|
| 115 | } |
|
| 116 | ||
| 117 | public boolean endedAmbiguously( final String word ) { |
|
| 118 | assert word != null; |
|
| 119 | final var check = word.toLowerCase(); |
|
| 120 | ||
| 121 | // Ensure that 'n' isn't matched for ambiguity by enforcing length, yet |
|
| 122 | // allow o' to match because 'a sentence can end with the letter o'. |
|
| 123 | return getEndedAmbiguous().contains( check ) || |
|
| 124 | check.endsWith( "s" ) || check.endsWith( "z" ) || |
|
| 125 | check.endsWith( "x" ) || (check.length() > 1 && check.endsWith( "n" )); |
|
| 126 | } |
|
| 127 | ||
| 128 | private Set<String> getBeganUnambiguous() { |
|
| 129 | return mBuilder.mBeganUnambiguous; |
|
| 130 | } |
|
| 131 | ||
| 132 | private Set<String> getEndedUnambiguous() { |
|
| 133 | return mBuilder.mEndedUnambiguous; |
|
| 134 | } |
|
| 135 | ||
| 136 | private Set<String> getBeganAmbiguous() { |
|
| 137 | return mBuilder.mBeganAmbiguous; |
|
| 138 | } |
|
| 139 | ||
| 140 | private Set<String> getEndedAmbiguous() { |
|
| 141 | return mBuilder.mEndedAmbiguous; |
|
| 142 | } |
|
| 143 | ||
| 144 | ||
| 145 | @Override |
|
| 146 | public String toString() { |
|
| 147 | return |
|
| 148 | toString( getBeganAmbiguous(), "Ambiguous Began", "'%s" ) + |
|
| 149 | toString( getEndedAmbiguous(), "Ambiguous Ended", "%s'" ) + |
|
| 150 | toString( getBeganUnambiguous(), "Unambiguous Began", "'%s" ) + |
|
| 151 | toString( getEndedUnambiguous(), "Unambiguous Ended", "%s'" ); |
|
| 152 | } |
|
| 153 | ||
| 154 | private String toString( |
|
| 155 | final Set<String> words, final String category, final String fmt ) { |
|
| 156 | final var sb = new StringBuilder( 16384 ); |
|
| 157 | final var newline = System.lineSeparator(); |
|
| 158 | final var list = new ArrayList<>( words ); |
|
| 159 | ||
| 160 | sort( list ); |
|
| 161 | sb.append( format( "%n%s%n", category ) ); |
|
| 162 | list.forEach( ( s ) -> sb.append( format( fmt, s ) ).append( newline ) ); |
|
| 163 | ||
| 164 | return sb.toString(); |
|
| 165 | } |
|
| 166 | ||
| 167 | /** |
|
| 168 | * Words having a straight apostrophe that cannot be mistaken for an |
|
| 169 | * opening single quote. |
|
| 170 | */ |
|
| 171 | private static final Set<String> BEGAN_UNAMBIGUOUS = Set.of( |
|
| 172 | "aporth", |
|
| 173 | "boutcha", |
|
| 174 | "boutchu", |
|
| 175 | "cept", |
|
| 176 | "dillo", |
|
| 177 | "em", |
|
| 178 | "fraid", |
|
| 179 | "gainst", |
|
| 180 | "n", |
|
| 181 | "neath", |
|
| 182 | "nother", |
|
| 183 | "nuff", |
|
| 184 | "onna", |
|
| 185 | "onna'", |
|
| 186 | "pon", |
|
| 187 | "s", |
|
| 188 | "sblood", |
|
| 189 | "scuse", |
|
| 190 | "sfar", |
|
| 191 | "sfoot", |
|
| 192 | "t", |
|
| 193 | "taint", |
|
| 194 | "tain", |
|
| 195 | "til", |
|
| 196 | "tis", |
|
| 197 | "tisn", |
|
| 198 | "tshall", |
|
| 199 | "twas", |
|
| 200 | "twasn", |
|
| 201 | "tween", |
|
| 202 | "twere", |
|
| 203 | "tweren", |
|
| 204 | "twixt", |
|
| 205 | "twon", |
|
| 206 | "twou", |
|
| 207 | "twould", |
|
| 208 | "twouldn", |
|
| 209 | "ve" |
|
| 210 | ); |
|
| 211 | ||
| 212 | /** |
|
| 213 | * Words having a straight apostrophe that may be either part of a |
|
| 214 | * contraction or a word that stands alone beside an opening single quote. |
|
| 215 | */ |
|
| 216 | private static final Set<String> BEGAN_AMBIGUOUS = Set.of( |
|
| 217 | // about|boxing match |
|
| 218 | "bout", |
|
| 219 | // because|causal |
|
| 220 | "cause", |
|
| 221 | // what you|choo choo train |
|
| 222 | "choo", |
|
| 223 | // he|e pluribus unum |
|
| 224 | "e", |
|
| 225 | // here|earlier |
|
| 226 | "ere", |
|
| 227 | // afro|to and fro |
|
| 228 | "fro", |
|
| 229 | // whore|ho ho! |
|
| 230 | "ho", |
|
| 231 | // okay|letter K |
|
| 232 | "kay", |
|
| 233 | // lo|lo and behold |
|
| 234 | "lo", |
|
| 235 | // are|regarding |
|
| 236 | "re", |
|
| 237 | // what's up|to sup |
|
| 238 | "sup", |
|
| 239 | // it will|twill fabric |
|
| 240 | "twill", |
|
| 241 | // them|utterance |
|
| 242 | "um", |
|
| 243 | // is that|Iranian village |
|
| 244 | "zat" |
|
| 245 | ); |
|
| 246 | ||
| 247 | private static final Set<String> ENDED_AMBIGUOUS = Set.of( |
|
| 248 | // give|martial arts garment |
|
| 249 | "gi", |
|
| 250 | // in|I |
|
| 251 | "i", |
|
| 252 | // of|letter o |
|
| 253 | "o" |
|
| 254 | ); |
|
| 255 | ||
| 256 | private static final Set<String> ENDED_UNAMBIGUOUS = Set.of( |
|
| 257 | // and |
|
| 258 | "an", |
|
| 259 | // for/before |
|
| 260 | "fo", |
|
| 261 | // friend |
|
| 262 | "frien", |
|
| 263 | // just |
|
| 264 | "jus", |
|
| 265 | // lord |
|
| 266 | "lor", |
|
| 267 | // myself |
|
| 268 | "masel", |
|
| 269 | // old |
|
| 270 | "ol", |
|
| 271 | // San (Francisco) |
|
| 272 | "Sa", |
|
| 273 | // shift |
|
| 274 | "shif", |
|
| 275 | // the |
|
| 276 | "th", |
|
| 277 | // what |
|
| 278 | "wha", |
|
| 279 | // world |
|
| 280 | "worl", |
|
| 281 | // Top ~500 common -ing words as English contractions. |
|
| 282 | "acceptin", |
|
| 283 | "accompanyin", |
|
| 284 | "accordin", |
|
| 285 | "accountin", |
|
| 286 | "achievin", |
|
| 287 | "acquirin", |
|
| 288 | "actin", |
|
| 289 | "addin", |
|
| 290 | "addressin", |
|
| 291 | "adjoinin", |
|
| 292 | "adoptin", |
|
| 293 | "advancin", |
|
| 294 | "advertisin", |
|
| 295 | "affectin", |
|
| 296 | "agin", |
|
| 297 | "allowin", |
|
| 298 | "amazin", |
|
| 299 | "analyzin", |
|
| 300 | "answerin", |
|
| 301 | "anythin", |
|
| 302 | "appearin", |
|
| 303 | "applyin", |
|
| 304 | "approachin", |
|
| 305 | "arguin", |
|
| 306 | "arisin", |
|
| 307 | "arrivin", |
|
| 308 | "askin", |
|
| 309 | "assessin", |
|
| 310 | "assumin", |
|
| 311 | "attackin", |
|
| 312 | "attemptin", |
|
| 313 | "attendin", |
|
| 314 | "avoidin", |
|
| 315 | "bankin", |
|
| 316 | "bargainin", |
|
| 317 | "bearin", |
|
| 318 | "beatin", |
|
| 319 | "becomin", |
|
| 320 | "beginnin", |
|
| 321 | "bein", |
|
| 322 | "believin", |
|
| 323 | "belongin", |
|
| 324 | "bendin", |
|
| 325 | "bindin", |
|
| 326 | "bleedin", |
|
| 327 | "blessin", |
|
| 328 | "blowin", |
|
| 329 | "boilin", |
|
| 330 | "borrowin", |
|
| 331 | "breakin", |
|
| 332 | "breathin", |
|
| 333 | "breedin", |
|
| 334 | "bringin", |
|
| 335 | "broadcastin", |
|
| 336 | "buildin", |
|
| 337 | "burnin", |
|
| 338 | "buyin", |
|
| 339 | "calculatin", |
|
| 340 | "callin", |
|
| 341 | "carryin", |
|
| 342 | "castin", |
|
| 343 | "causin", |
|
| 344 | "ceilin", |
|
| 345 | "challengin", |
|
| 346 | "changin", |
|
| 347 | "checkin", |
|
| 348 | "choosin", |
|
| 349 | "claimin", |
|
| 350 | "cleanin", |
|
| 351 | "clearin", |
|
| 352 | "climbin", |
|
| 353 | "closin", |
|
| 354 | "clothin", |
|
| 355 | "collectin", |
|
| 356 | "combinin", |
|
| 357 | "comin", |
|
| 358 | "commandin", |
|
| 359 | "comparin", |
|
| 360 | "compellin", |
|
| 361 | "competin", |
|
| 362 | "computin", |
|
| 363 | "concernin", |
|
| 364 | "concludin", |
|
| 365 | "conditionin", |
|
| 366 | "conductin", |
|
| 367 | "conflictin", |
|
| 368 | "connectin", |
|
| 369 | "considerin", |
|
| 370 | "consistin", |
|
| 371 | "constructin", |
|
| 372 | "consultin", |
|
| 373 | "consumin", |
|
| 374 | "containin", |
|
| 375 | "continuin", |
|
| 376 | "contractin", |
|
| 377 | "contributin", |
|
| 378 | "controllin", |
|
| 379 | "convincin", |
|
| 380 | "cookin", |
|
| 381 | "coolin", |
|
| 382 | "copin", |
|
| 383 | "correspondin", |
|
| 384 | "counselin", |
|
| 385 | "countin", |
|
| 386 | "couplin", |
|
| 387 | "coverin", |
|
| 388 | "creatin", |
|
| 389 | "crossin", |
|
| 390 | "cryin", |
|
| 391 | "cuttin", |
|
| 392 | "dancin", |
|
| 393 | "darlin", |
|
| 394 | "datin", |
|
| 395 | "dealin", |
|
| 396 | "decidin", |
|
| 397 | "declarin", |
|
| 398 | "declinin", |
|
| 399 | "decreasin", |
|
| 400 | "definin", |
|
| 401 | "demandin", |
|
| 402 | "denyin", |
|
| 403 | "dependin", |
|
| 404 | "descendin", |
|
| 405 | "describin", |
|
| 406 | "designin", |
|
| 407 | "destroyin", |
|
| 408 | "determinin", |
|
| 409 | "developin", |
|
| 410 | "differin", |
|
| 411 | "dinin", |
|
| 412 | "directin", |
|
| 413 | "discussin", |
|
| 414 | "distinguishin", |
|
| 415 | "disturbin", |
|
| 416 | "dividin", |
|
| 417 | "doin", |
|
| 418 | "drawin", |
|
| 419 | "dressin", |
|
| 420 | "drinkin", |
|
| 421 | "drivin", |
|
| 422 | "droppin", |
|
| 423 | "dryin", |
|
| 424 | "durin", |
|
| 425 | "dwellin", |
|
| 426 | "dyin", |
|
| 427 | "eatin", |
|
| 428 | "editin", |
|
| 429 | "emergin", |
|
| 430 | "employin", |
|
| 431 | "enablin", |
|
| 432 | "encouragin", |
|
| 433 | "endin", |
|
| 434 | "engagin", |
|
| 435 | "engineerin", |
|
| 436 | "enjoyin", |
|
| 437 | "enterin", |
|
| 438 | "establishin", |
|
| 439 | "evaluatin", |
|
| 440 | "evenin", |
|
| 441 | "everythin", |
|
| 442 | "examinin", |
|
| 443 | "exceedin", |
|
| 444 | "excitin", |
|
| 445 | "excludin", |
|
| 446 | "existin", |
|
| 447 | "expandin", |
|
| 448 | "expectin", |
|
| 449 | "experiencin", |
|
| 450 | "explainin", |
|
| 451 | "explorin", |
|
| 452 | "expressin", |
|
| 453 | "extendin", |
|
| 454 | "facin", |
|
| 455 | "failin", |
|
| 456 | "fallin", |
|
| 457 | "farmin", |
|
| 458 | "fascinatin", |
|
| 459 | "feedin", |
|
| 460 | "feelin", |
|
| 461 | "fightin", |
|
| 462 | "filin", |
|
| 463 | "fillin", |
|
| 464 | "financin", |
|
| 465 | "findin", |
|
| 466 | "firin", |
|
| 467 | "fishin", |
|
| 468 | "fittin", |
|
| 469 | "fixin", |
|
| 470 | "floatin", |
|
| 471 | "flowin", |
|
| 472 | "flyin", |
|
| 473 | "focusin", |
|
| 474 | "followin", |
|
| 475 | "forcin", |
|
| 476 | "foregoin", |
|
| 477 | "formin", |
|
| 478 | "forthcomin", |
|
| 479 | "foundin", |
|
| 480 | "freezin", |
|
| 481 | "fuckin", |
|
| 482 | "functionin", |
|
| 483 | "fundin", |
|
| 484 | "gainin", |
|
| 485 | "gatherin", |
|
| 486 | "generatin", |
|
| 487 | "gettin", |
|
| 488 | "givin", |
|
| 489 | "goin", |
|
| 490 | "governin", |
|
| 491 | "grantin", |
|
| 492 | "growin", |
|
| 493 | "hackin", |
|
| 494 | "handlin", |
|
| 495 | "hangin", |
|
| 496 | "happenin", |
|
| 497 | "havin", |
|
| 498 | "headin", |
|
| 499 | "healin", |
|
| 500 | "hearin", |
|
| 501 | "heatin", |
|
| 502 | "helpin", |
|
| 503 | "hidin", |
|
| 504 | "holdin", |
|
| 505 | "hopin", |
|
| 506 | "housin", |
|
| 507 | "huntin", |
|
| 508 | "identifyin", |
|
| 509 | "imagin", |
|
| 510 | "implementin", |
|
| 511 | "imposin", |
|
| 512 | "improvin", |
|
| 513 | "includin", |
|
| 514 | "increasin", |
|
| 515 | "indicatin", |
|
| 516 | "interestin", |
|
| 517 | "interpretin", |
|
| 518 | "introducin", |
|
| 519 | "involvin", |
|
| 520 | "joinin", |
|
| 521 | "judgin", |
|
| 522 | "keepin", |
|
| 523 | "killin", |
|
| 524 | "knowin", |
|
| 525 | "lackin", |
|
| 526 | "landin", |
|
| 527 | "lastin", |
|
| 528 | "laughin", |
|
| 529 | "layin", |
|
| 530 | "leadin", |
|
| 531 | "leanin", |
|
| 532 | "learnin", |
|
| 533 | "leavin", |
|
| 534 | "lettin", |
|
| 535 | "liftin", |
|
| 536 | "lightin", |
|
| 537 | "lightnin", |
|
| 538 | "limitin", |
|
| 539 | "listenin", |
|
| 540 | "listin", |
|
| 541 | "livin", |
|
| 542 | "loadin", |
|
| 543 | "lookin", |
|
| 544 | "losin", |
|
| 545 | "lovin", |
|
| 546 | "lowerin", |
|
| 547 | "lyin", |
|
| 548 | "maintainin", |
|
| 549 | "makin", |
|
| 550 | "managin", |
|
| 551 | "manufacturin", |
|
| 552 | "mappin", |
|
| 553 | "marketin", |
|
| 554 | "markin", |
|
| 555 | "matchin", |
|
| 556 | "meanin", |
|
| 557 | "measurin", |
|
| 558 | "meetin", |
|
| 559 | "meltin", |
|
| 560 | "minin", |
|
| 561 | "misleadin", |
|
| 562 | "missin", |
|
| 563 | "mixin", |
|
| 564 | "modelin", |
|
| 565 | "monitorin", |
|
| 566 | "mornin", |
|
| 567 | "movin", |
|
| 568 | "neighborin", |
|
| 569 | "neighbourin", |
|
| 570 | "nothin", |
|
| 571 | "notin", |
|
| 572 | "notwithstandin", |
|
| 573 | "nursin", |
|
| 574 | "observin", |
|
| 575 | "obtainin", |
|
| 576 | "occurrin", |
|
| 577 | "offerin", |
|
| 578 | "offsprin", |
|
| 579 | "ongoin", |
|
| 580 | "openin", |
|
| 581 | "operatin", |
|
| 582 | "opposin", |
|
| 583 | "orderin", |
|
| 584 | "organizin", |
|
| 585 | "outstandin", |
|
| 586 | "overwhelmin", |
|
| 587 | "packin", |
|
| 588 | "paintin", |
|
| 589 | "parkin", |
|
| 590 | "participatin", |
|
| 591 | "passin", |
|
| 592 | "payin", |
|
| 593 | "pendin", |
|
| 594 | "performin", |
|
| 595 | "pickin", |
|
| 596 | "pissin", |
|
| 597 | "placin", |
|
| 598 | "plannin", |
|
| 599 | "plantin", |
|
| 600 | "playin", |
|
| 601 | "pleasin", |
|
| 602 | "pointin", |
|
| 603 | "possessin", |
|
| 604 | "preachin", |
|
| 605 | "precedin", |
|
| 606 | "preparin", |
|
| 607 | "presentin", |
|
| 608 | "preservin", |
|
| 609 | "pressin", |
|
| 610 | "prevailin", |
|
| 611 | "preventin", |
|
| 612 | "pricin", |
|
| 613 | "printin", |
|
| 614 | "proceedin", |
|
| 615 | "processin", |
|
| 616 | "producin", |
|
| 617 | "programmin", |
|
| 618 | "promisin", |
|
| 619 | "promotin", |
|
| 620 | "protectin", |
|
| 621 | "providin", |
|
| 622 | "provin", |
|
| 623 | "publishin", |
|
| 624 | "pullin", |
|
| 625 | "purchasin", |
|
| 626 | "pursuin", |
|
| 627 | "pushin", |
|
| 628 | "puttin", |
|
| 629 | "questionin", |
|
| 630 | "rangin", |
|
| 631 | "ratin", |
|
| 632 | "reachin", |
|
| 633 | "readin", |
|
| 634 | "reasonin", |
|
| 635 | "receivin", |
|
| 636 | "recognizin", |
|
| 637 | "recordin", |
|
| 638 | "reducin", |
|
| 639 | "referrin", |
|
| 640 | "reflectin", |
|
| 641 | "refusin", |
|
| 642 | "regardin", |
|
| 643 | "regulatin", |
|
| 644 | "relatin", |
|
| 645 | "remainin", |
|
| 646 | "rememberin", |
|
| 647 | "removin", |
|
| 648 | "renderin", |
|
| 649 | "repeatin", |
|
| 650 | "replacin", |
|
| 651 | "reportin", |
|
| 652 | "representin", |
|
| 653 | "requirin", |
|
| 654 | "respectin", |
|
| 655 | "respondin", |
|
| 656 | "restin", |
|
| 657 | "resultin", |
|
| 658 | "returnin", |
|
| 659 | "revealin", |
|
| 660 | "ridin", |
|
| 661 | "risin", |
|
| 662 | "rulin", |
|
| 663 | "runnin", |
|
| 664 | "sailin", |
|
| 665 | "samplin", |
|
| 666 | "satisfyin", |
|
| 667 | "savin", |
|
| 668 | "sayin", |
|
| 669 | "scatterin", |
|
| 670 | "schoolin", |
|
| 671 | "screenin", |
|
| 672 | "searchin", |
|
| 673 | "securin", |
|
| 674 | "seein", |
|
| 675 | "seekin", |
|
| 676 | "selectin", |
|
| 677 | "sellin", |
|
| 678 | "sendin", |
|
| 679 | "separatin", |
|
| 680 | "servin", |
|
| 681 | "settin", |
|
| 682 | "settlin", |
|
| 683 | "sewin", |
|
| 684 | "shakin", |
|
| 685 | "shapin", |
|
| 686 | "sharin", |
|
| 687 | "shiftin", |
|
| 688 | "shinin", |
|
| 689 | "shippin", |
|
| 690 | "shittin", |
|
| 691 | "shootin", |
|
| 692 | "shoppin", |
|
| 693 | "showin", |
|
| 694 | "singin", |
|
| 695 | "sinkin", |
|
| 696 | "sittin", |
|
| 697 | "sleepin", |
|
| 698 | "smilin", |
|
| 699 | "smokin", |
|
| 700 | "spankin", |
|
| 701 | "solvin", |
|
| 702 | "somethin", |
|
| 703 | "speakin", |
|
| 704 | "spellin", |
|
| 705 | "spendin", |
|
| 706 | "spinnin", |
|
| 707 | "spittin", |
|
| 708 | "spreadin", |
|
| 709 | "standin", |
|
| 710 | "starin", |
|
| 711 | "startin", |
|
| 712 | "statin", |
|
| 713 | "stayin", |
|
| 714 | "stealin", |
|
| 715 | "sterlin", |
|
| 716 | "stimulatin", |
|
| 717 | "stirrin", |
|
| 718 | "stoppin", |
|
| 719 | "strengthenin", |
|
| 720 | "stretchin", |
|
| 721 | "strikin", |
|
| 722 | "strugglin", |
|
| 723 | "studyin", |
|
| 724 | "succeedin", |
|
| 725 | "sufferin", |
|
| 726 | "suggestin", |
|
| 727 | "supplyin", |
|
| 728 | "supportin", |
|
| 729 | "surprisin", |
|
| 730 | "surroundin", |
|
| 731 | "survivin", |
|
| 732 | "sweepin", |
|
| 733 | "swellin", |
|
| 734 | "swimmin", |
|
| 735 | "switchin", |
|
| 736 | "takin", |
|
| 737 | "talkin", |
|
| 738 | "teachin", |
|
| 739 | "tellin", |
|
| 740 | "testin", |
|
| 741 | "thinkin", |
|
| 742 | "threatenin", |
|
| 743 | "throwin", |
|
| 744 | "timin", |
|
| 745 | "touchin", |
|
| 746 | "tradin", |
|
| 747 | "trainin", |
|
| 748 | "travelin", |
|
| 749 | "treatin", |
|
| 750 | "tremblin", |
|
| 751 | "tryin", |
|
| 752 | "turnin", |
|
| 753 | "underlyin", |
|
| 754 | "understandin", |
|
| 755 | "undertakin", |
|
| 756 | "unwillin", |
|
| 757 | "usin", |
|
| 758 | "varyin", |
|
| 759 | "viewin", |
|
| 760 | "visitin", |
|
| 761 | "votin", |
|
| 762 | "waitin", |
|
| 763 | "walkin", |
|
| 764 | "wanderin", |
|
| 765 | "wantin", |
|
| 766 | "warnin", |
|
| 767 | "washin", |
|
| 768 | "watchin", |
|
| 769 | "wearin", |
|
| 770 | "weddin", |
|
| 771 | "whackin", |
|
| 63 | // Remove ambiguous items if they are already declared. |
|
| 64 | mBeganAmbiguous.removeAll( mBeganUnambiguous ); |
|
| 65 | mEndedAmbiguous.removeAll( mEndedUnambiguous ); |
|
| 66 | ||
| 67 | return new Contractions( this ); |
|
| 68 | } |
|
| 69 | ||
| 70 | /** |
|
| 71 | * This returns the {@code fallback} {@link Set} if {@code src} is empty; |
|
| 72 | * otherwise, this returns the empty {@link Set}. |
|
| 73 | * |
|
| 74 | * @param src A set of contractions, possibly empty. |
|
| 75 | * @param fallback The default values to use if {@code src} is empty. |
|
| 76 | * @param <T> The type of data used by both {@link Set}s. |
|
| 77 | * @return An empty {@link Set} if the {@code src} contains at least one |
|
| 78 | * element; otherwise, this will return {@code fallback}. |
|
| 79 | */ |
|
| 80 | private static <T> Set<T> from( final Set<T> src, final Set<T> fallback ) { |
|
| 81 | assert src != null; |
|
| 82 | assert fallback != null; |
|
| 83 | return src.isEmpty() ? fallback : emptySet(); |
|
| 84 | } |
|
| 85 | } |
|
| 86 | ||
| 87 | /** |
|
| 88 | * Answers whether the given word is a contraction that always starts |
|
| 89 | * with an apostrophe. The comparison is case insensitive. This must |
|
| 90 | * only be called when a straight quote is followed by a word. |
|
| 91 | * |
|
| 92 | * @param word The word to compare against the list of known unambiguous |
|
| 93 | * contractions. |
|
| 94 | * @return {@code true} when the given word is in the set of unambiguous |
|
| 95 | * contractions. |
|
| 96 | */ |
|
| 97 | public boolean beganUnambiguously( final String word ) { |
|
| 98 | assert word != null; |
|
| 99 | return getBeganUnambiguous().contains( word.toLowerCase() ); |
|
| 100 | } |
|
| 101 | ||
| 102 | /** |
|
| 103 | * Answers whether the given word could be a contraction but is also a |
|
| 104 | * valid word in non-contracted form. |
|
| 105 | * |
|
| 106 | * @param word The word to compare against the list of known ambiguous |
|
| 107 | * contractions. |
|
| 108 | * @return {@code true} when the given word is in the set of ambiguous |
|
| 109 | * contractions. |
|
| 110 | */ |
|
| 111 | public boolean beganAmbiguously( final String word ) { |
|
| 112 | assert word != null; |
|
| 113 | return getBeganAmbiguous().contains( word.toLowerCase() ); |
|
| 114 | } |
|
| 115 | ||
| 116 | public boolean endedUnambiguously( final String word ) { |
|
| 117 | assert word != null; |
|
| 118 | return getEndedUnambiguous().contains( word.toLowerCase() ); |
|
| 119 | } |
|
| 120 | ||
| 121 | public boolean endedAmbiguously( final String word ) { |
|
| 122 | assert word != null; |
|
| 123 | final var check = word.toLowerCase(); |
|
| 124 | ||
| 125 | // Ensure that 'n' isn't matched for ambiguity by enforcing length, yet |
|
| 126 | // allow o' to match because 'a sentence can end with the letter o'. |
|
| 127 | return getEndedAmbiguous().contains( check ) || |
|
| 128 | check.endsWith( "s" ) || check.endsWith( "z" ) || |
|
| 129 | check.endsWith( "x" ) || (check.length() > 1 && check.endsWith( "n" )); |
|
| 130 | } |
|
| 131 | ||
| 132 | private Set<String> getBeganUnambiguous() { |
|
| 133 | return mBuilder.mBeganUnambiguous; |
|
| 134 | } |
|
| 135 | ||
| 136 | private Set<String> getEndedUnambiguous() { |
|
| 137 | return mBuilder.mEndedUnambiguous; |
|
| 138 | } |
|
| 139 | ||
| 140 | private Set<String> getBeganAmbiguous() { |
|
| 141 | return mBuilder.mBeganAmbiguous; |
|
| 142 | } |
|
| 143 | ||
| 144 | private Set<String> getEndedAmbiguous() { |
|
| 145 | return mBuilder.mEndedAmbiguous; |
|
| 146 | } |
|
| 147 | ||
| 148 | ||
| 149 | @Override |
|
| 150 | public String toString() { |
|
| 151 | return |
|
| 152 | toString( getBeganAmbiguous(), "Ambiguous Began", "'%s" ) + |
|
| 153 | toString( getEndedAmbiguous(), "Ambiguous Ended", "%s'" ) + |
|
| 154 | toString( getBeganUnambiguous(), "Unambiguous Began", "'%s" ) + |
|
| 155 | toString( getEndedUnambiguous(), "Unambiguous Ended", "%s'" ); |
|
| 156 | } |
|
| 157 | ||
| 158 | private String toString( |
|
| 159 | final Set<String> words, final String category, final String fmt ) { |
|
| 160 | final var sb = new StringBuilder( 16384 ); |
|
| 161 | final var newline = System.lineSeparator(); |
|
| 162 | final var list = new ArrayList<>( words ); |
|
| 163 | ||
| 164 | sort( list ); |
|
| 165 | sb.append( format( "%n%s%n", category ) ); |
|
| 166 | list.forEach( ( s ) -> sb.append( format( fmt, s ) ).append( newline ) ); |
|
| 167 | ||
| 168 | return sb.toString(); |
|
| 169 | } |
|
| 170 | ||
| 171 | /** |
|
| 172 | * Words having a straight apostrophe that cannot be mistaken for an |
|
| 173 | * opening single quote. |
|
| 174 | */ |
|
| 175 | private static final Set<String> BEGAN_UNAMBIGUOUS = Set.of( |
|
| 176 | "aporth", |
|
| 177 | "boutcha", |
|
| 178 | "boutchu", |
|
| 179 | "cept", |
|
| 180 | "dillo", |
|
| 181 | "em", |
|
| 182 | "fraid", |
|
| 183 | "gainst", |
|
| 184 | "im", |
|
| 185 | "n", |
|
| 186 | "neath", |
|
| 187 | "nother", |
|
| 188 | "nuff", |
|
| 189 | "onna", |
|
| 190 | "onna'", |
|
| 191 | "pon", |
|
| 192 | "s", |
|
| 193 | "sblood", |
|
| 194 | "scuse", |
|
| 195 | "sfar", |
|
| 196 | "sfoot", |
|
| 197 | "t", |
|
| 198 | "taint", |
|
| 199 | "tain", |
|
| 200 | "til", |
|
| 201 | "tis", |
|
| 202 | "tisn", |
|
| 203 | "tshall", |
|
| 204 | "twas", |
|
| 205 | "twasn", |
|
| 206 | "tween", |
|
| 207 | "twere", |
|
| 208 | "tweren", |
|
| 209 | "twixt", |
|
| 210 | "twon", |
|
| 211 | "twou", |
|
| 212 | "twould", |
|
| 213 | "twouldn", |
|
| 214 | "ve" |
|
| 215 | ); |
|
| 216 | ||
| 217 | /** |
|
| 218 | * Words having a straight apostrophe that may be either part of a |
|
| 219 | * contraction or a word that stands alone beside an opening single quote. |
|
| 220 | */ |
|
| 221 | private static final Set<String> BEGAN_AMBIGUOUS = Set.of( |
|
| 222 | // about|boxing match |
|
| 223 | "bout", |
|
| 224 | // because|causal |
|
| 225 | "cause", |
|
| 226 | // what you|choo choo train |
|
| 227 | "choo", |
|
| 228 | // he|e pluribus unum |
|
| 229 | "e", |
|
| 230 | // here|earlier |
|
| 231 | "ere", |
|
| 232 | // afro|to and fro |
|
| 233 | "fro", |
|
| 234 | // whore|ho ho! |
|
| 235 | "ho", |
|
| 236 | // okay|letter K |
|
| 237 | "kay", |
|
| 238 | // lo|lo and behold |
|
| 239 | "lo", |
|
| 240 | // are|regarding |
|
| 241 | "re", |
|
| 242 | // what's up|to sup |
|
| 243 | "sup", |
|
| 244 | // it will|twill fabric |
|
| 245 | "twill", |
|
| 246 | // them|utterance |
|
| 247 | "um", |
|
| 248 | // is that|Iranian village |
|
| 249 | "zat" |
|
| 250 | ); |
|
| 251 | ||
| 252 | private static final Set<String> ENDED_AMBIGUOUS = Set.of( |
|
| 253 | // give|martial arts garment |
|
| 254 | "gi", |
|
| 255 | // in|I |
|
| 256 | "i", |
|
| 257 | // of|letter o |
|
| 258 | "o" |
|
| 259 | ); |
|
| 260 | ||
| 261 | private static final Set<String> ENDED_UNAMBIGUOUS = Set.of( |
|
| 262 | // and |
|
| 263 | "an", |
|
| 264 | // for/before |
|
| 265 | "fo", |
|
| 266 | // friend |
|
| 267 | "frien", |
|
| 268 | // just |
|
| 269 | "jus", |
|
| 270 | // lord |
|
| 271 | "lor", |
|
| 272 | // myself |
|
| 273 | "masel", |
|
| 274 | // old |
|
| 275 | "ol", |
|
| 276 | // San (Francisco) |
|
| 277 | "Sa", |
|
| 278 | // shift |
|
| 279 | "shif", |
|
| 280 | // the |
|
| 281 | "th", |
|
| 282 | // what |
|
| 283 | "wha", |
|
| 284 | // world |
|
| 285 | "worl", |
|
| 286 | // Top ~500 common -ing words as English contractions. |
|
| 287 | "acceptin", |
|
| 288 | "accompanyin", |
|
| 289 | "accordin", |
|
| 290 | "accountin", |
|
| 291 | "achievin", |
|
| 292 | "acquirin", |
|
| 293 | "actin", |
|
| 294 | "addin", |
|
| 295 | "addressin", |
|
| 296 | "adjoinin", |
|
| 297 | "adoptin", |
|
| 298 | "advancin", |
|
| 299 | "advertisin", |
|
| 300 | "affectin", |
|
| 301 | "agin", |
|
| 302 | "allowin", |
|
| 303 | "amazin", |
|
| 304 | "analyzin", |
|
| 305 | "answerin", |
|
| 306 | "anythin", |
|
| 307 | "appearin", |
|
| 308 | "applyin", |
|
| 309 | "approachin", |
|
| 310 | "arguin", |
|
| 311 | "arisin", |
|
| 312 | "arrivin", |
|
| 313 | "askin", |
|
| 314 | "assessin", |
|
| 315 | "assumin", |
|
| 316 | "attackin", |
|
| 317 | "attemptin", |
|
| 318 | "attendin", |
|
| 319 | "avoidin", |
|
| 320 | "bankin", |
|
| 321 | "bargainin", |
|
| 322 | "bearin", |
|
| 323 | "beatin", |
|
| 324 | "becomin", |
|
| 325 | "beginnin", |
|
| 326 | "bein", |
|
| 327 | "believin", |
|
| 328 | "belongin", |
|
| 329 | "bendin", |
|
| 330 | "bindin", |
|
| 331 | "bleedin", |
|
| 332 | "blessin", |
|
| 333 | "blowin", |
|
| 334 | "boilin", |
|
| 335 | "borrowin", |
|
| 336 | "breakin", |
|
| 337 | "breathin", |
|
| 338 | "breedin", |
|
| 339 | "bringin", |
|
| 340 | "broadcastin", |
|
| 341 | "buildin", |
|
| 342 | "burnin", |
|
| 343 | "buyin", |
|
| 344 | "calculatin", |
|
| 345 | "callin", |
|
| 346 | "carryin", |
|
| 347 | "castin", |
|
| 348 | "causin", |
|
| 349 | "ceilin", |
|
| 350 | "challengin", |
|
| 351 | "changin", |
|
| 352 | "checkin", |
|
| 353 | "choosin", |
|
| 354 | "claimin", |
|
| 355 | "cleanin", |
|
| 356 | "clearin", |
|
| 357 | "climbin", |
|
| 358 | "closin", |
|
| 359 | "clothin", |
|
| 360 | "collectin", |
|
| 361 | "combinin", |
|
| 362 | "comin", |
|
| 363 | "commandin", |
|
| 364 | "comparin", |
|
| 365 | "compellin", |
|
| 366 | "competin", |
|
| 367 | "computin", |
|
| 368 | "concernin", |
|
| 369 | "concludin", |
|
| 370 | "conditionin", |
|
| 371 | "conductin", |
|
| 372 | "conflictin", |
|
| 373 | "connectin", |
|
| 374 | "considerin", |
|
| 375 | "consistin", |
|
| 376 | "constructin", |
|
| 377 | "consultin", |
|
| 378 | "consumin", |
|
| 379 | "containin", |
|
| 380 | "continuin", |
|
| 381 | "contractin", |
|
| 382 | "contributin", |
|
| 383 | "controllin", |
|
| 384 | "convincin", |
|
| 385 | "cookin", |
|
| 386 | "coolin", |
|
| 387 | "copin", |
|
| 388 | "correspondin", |
|
| 389 | "counselin", |
|
| 390 | "countin", |
|
| 391 | "couplin", |
|
| 392 | "coverin", |
|
| 393 | "creatin", |
|
| 394 | "crossin", |
|
| 395 | "cryin", |
|
| 396 | "cuttin", |
|
| 397 | "dancin", |
|
| 398 | "darlin", |
|
| 399 | "datin", |
|
| 400 | "dealin", |
|
| 401 | "decidin", |
|
| 402 | "declarin", |
|
| 403 | "declinin", |
|
| 404 | "decreasin", |
|
| 405 | "definin", |
|
| 406 | "demandin", |
|
| 407 | "denyin", |
|
| 408 | "dependin", |
|
| 409 | "descendin", |
|
| 410 | "describin", |
|
| 411 | "designin", |
|
| 412 | "destroyin", |
|
| 413 | "determinin", |
|
| 414 | "developin", |
|
| 415 | "differin", |
|
| 416 | "dinin", |
|
| 417 | "directin", |
|
| 418 | "discussin", |
|
| 419 | "distinguishin", |
|
| 420 | "disturbin", |
|
| 421 | "dividin", |
|
| 422 | "doin", |
|
| 423 | "drawin", |
|
| 424 | "dressin", |
|
| 425 | "drinkin", |
|
| 426 | "drivin", |
|
| 427 | "droppin", |
|
| 428 | "dryin", |
|
| 429 | "durin", |
|
| 430 | "dwellin", |
|
| 431 | "dyin", |
|
| 432 | "eatin", |
|
| 433 | "editin", |
|
| 434 | "emergin", |
|
| 435 | "employin", |
|
| 436 | "enablin", |
|
| 437 | "encouragin", |
|
| 438 | "endin", |
|
| 439 | "engagin", |
|
| 440 | "engineerin", |
|
| 441 | "enjoyin", |
|
| 442 | "enterin", |
|
| 443 | "establishin", |
|
| 444 | "evaluatin", |
|
| 445 | "evenin", |
|
| 446 | "everythin", |
|
| 447 | "examinin", |
|
| 448 | "exceedin", |
|
| 449 | "excitin", |
|
| 450 | "excludin", |
|
| 451 | "existin", |
|
| 452 | "expandin", |
|
| 453 | "expectin", |
|
| 454 | "experiencin", |
|
| 455 | "explainin", |
|
| 456 | "explorin", |
|
| 457 | "expressin", |
|
| 458 | "extendin", |
|
| 459 | "facin", |
|
| 460 | "failin", |
|
| 461 | "fallin", |
|
| 462 | "farmin", |
|
| 463 | "fascinatin", |
|
| 464 | "feedin", |
|
| 465 | "feelin", |
|
| 466 | "fightin", |
|
| 467 | "filin", |
|
| 468 | "fillin", |
|
| 469 | "financin", |
|
| 470 | "findin", |
|
| 471 | "firin", |
|
| 472 | "fishin", |
|
| 473 | "fittin", |
|
| 474 | "fixin", |
|
| 475 | "floatin", |
|
| 476 | "flowin", |
|
| 477 | "flyin", |
|
| 478 | "focusin", |
|
| 479 | "followin", |
|
| 480 | "forcin", |
|
| 481 | "foregoin", |
|
| 482 | "formin", |
|
| 483 | "forthcomin", |
|
| 484 | "foundin", |
|
| 485 | "freezin", |
|
| 486 | "fuckin", |
|
| 487 | "functionin", |
|
| 488 | "fundin", |
|
| 489 | "gainin", |
|
| 490 | "gatherin", |
|
| 491 | "generatin", |
|
| 492 | "gettin", |
|
| 493 | "givin", |
|
| 494 | "goin", |
|
| 495 | "governin", |
|
| 496 | "grantin", |
|
| 497 | "growin", |
|
| 498 | "hackin", |
|
| 499 | "handlin", |
|
| 500 | "hangin", |
|
| 501 | "happenin", |
|
| 502 | "havin", |
|
| 503 | "headin", |
|
| 504 | "healin", |
|
| 505 | "hearin", |
|
| 506 | "heatin", |
|
| 507 | "helpin", |
|
| 508 | "hidin", |
|
| 509 | "holdin", |
|
| 510 | "hopin", |
|
| 511 | "housin", |
|
| 512 | "huntin", |
|
| 513 | "identifyin", |
|
| 514 | "imagin", |
|
| 515 | "implementin", |
|
| 516 | "imposin", |
|
| 517 | "improvin", |
|
| 518 | "includin", |
|
| 519 | "increasin", |
|
| 520 | "indicatin", |
|
| 521 | "interestin", |
|
| 522 | "interpretin", |
|
| 523 | "introducin", |
|
| 524 | "involvin", |
|
| 525 | "joinin", |
|
| 526 | "judgin", |
|
| 527 | "keepin", |
|
| 528 | "killin", |
|
| 529 | "knowin", |
|
| 530 | "lackin", |
|
| 531 | "landin", |
|
| 532 | "lastin", |
|
| 533 | "laughin", |
|
| 534 | "layin", |
|
| 535 | "leadin", |
|
| 536 | "leanin", |
|
| 537 | "learnin", |
|
| 538 | "leavin", |
|
| 539 | "lettin", |
|
| 540 | "liftin", |
|
| 541 | "lightin", |
|
| 542 | "lightnin", |
|
| 543 | "limitin", |
|
| 544 | "listenin", |
|
| 545 | "listin", |
|
| 546 | "livin", |
|
| 547 | "loadin", |
|
| 548 | "lookin", |
|
| 549 | "losin", |
|
| 550 | "lovin", |
|
| 551 | "lowerin", |
|
| 552 | "lyin", |
|
| 553 | "maintainin", |
|
| 554 | "makin", |
|
| 555 | "managin", |
|
| 556 | "manufacturin", |
|
| 557 | "mappin", |
|
| 558 | "marketin", |
|
| 559 | "markin", |
|
| 560 | "matchin", |
|
| 561 | "meanin", |
|
| 562 | "measurin", |
|
| 563 | "meetin", |
|
| 564 | "meltin", |
|
| 565 | "minin", |
|
| 566 | "misleadin", |
|
| 567 | "missin", |
|
| 568 | "mixin", |
|
| 569 | "modelin", |
|
| 570 | "monitorin", |
|
| 571 | "mornin", |
|
| 572 | "movin", |
|
| 573 | "neighborin", |
|
| 574 | "neighbourin", |
|
| 575 | "nothin", |
|
| 576 | "notin", |
|
| 577 | "notwithstandin", |
|
| 578 | "nursin", |
|
| 579 | "observin", |
|
| 580 | "obtainin", |
|
| 581 | "occurrin", |
|
| 582 | "offerin", |
|
| 583 | "offsprin", |
|
| 584 | "ongoin", |
|
| 585 | "openin", |
|
| 586 | "operatin", |
|
| 587 | "opposin", |
|
| 588 | "orderin", |
|
| 589 | "organizin", |
|
| 590 | "outstandin", |
|
| 591 | "overwhelmin", |
|
| 592 | "packin", |
|
| 593 | "paintin", |
|
| 594 | "parkin", |
|
| 595 | "participatin", |
|
| 596 | "passin", |
|
| 597 | "payin", |
|
| 598 | "pendin", |
|
| 599 | "performin", |
|
| 600 | "pickin", |
|
| 601 | "pissin", |
|
| 602 | "placin", |
|
| 603 | "plannin", |
|
| 604 | "plantin", |
|
| 605 | "playin", |
|
| 606 | "pleasin", |
|
| 607 | "pointin", |
|
| 608 | "poppin", |
|
| 609 | "possessin", |
|
| 610 | "preachin", |
|
| 611 | "precedin", |
|
| 612 | "preparin", |
|
| 613 | "presentin", |
|
| 614 | "preservin", |
|
| 615 | "pressin", |
|
| 616 | "prevailin", |
|
| 617 | "preventin", |
|
| 618 | "pricin", |
|
| 619 | "printin", |
|
| 620 | "proceedin", |
|
| 621 | "processin", |
|
| 622 | "producin", |
|
| 623 | "programmin", |
|
| 624 | "promisin", |
|
| 625 | "promotin", |
|
| 626 | "protectin", |
|
| 627 | "providin", |
|
| 628 | "provin", |
|
| 629 | "publishin", |
|
| 630 | "pullin", |
|
| 631 | "purchasin", |
|
| 632 | "pursuin", |
|
| 633 | "pushin", |
|
| 634 | "puttin", |
|
| 635 | "questionin", |
|
| 636 | "rangin", |
|
| 637 | "ratin", |
|
| 638 | "reachin", |
|
| 639 | "readin", |
|
| 640 | "reasonin", |
|
| 641 | "receivin", |
|
| 642 | "recognizin", |
|
| 643 | "recordin", |
|
| 644 | "reducin", |
|
| 645 | "referrin", |
|
| 646 | "reflectin", |
|
| 647 | "refusin", |
|
| 648 | "regardin", |
|
| 649 | "regulatin", |
|
| 650 | "relatin", |
|
| 651 | "remainin", |
|
| 652 | "rememberin", |
|
| 653 | "removin", |
|
| 654 | "renderin", |
|
| 655 | "repeatin", |
|
| 656 | "replacin", |
|
| 657 | "reportin", |
|
| 658 | "representin", |
|
| 659 | "requirin", |
|
| 660 | "respectin", |
|
| 661 | "respondin", |
|
| 662 | "restin", |
|
| 663 | "resultin", |
|
| 664 | "returnin", |
|
| 665 | "revealin", |
|
| 666 | "ridin", |
|
| 667 | "risin", |
|
| 668 | "rulin", |
|
| 669 | "runnin", |
|
| 670 | "rythin", |
|
| 671 | "sailin", |
|
| 672 | "samplin", |
|
| 673 | "satisfyin", |
|
| 674 | "savin", |
|
| 675 | "sayin", |
|
| 676 | "scatterin", |
|
| 677 | "schoolin", |
|
| 678 | "screenin", |
|
| 679 | "searchin", |
|
| 680 | "securin", |
|
| 681 | "seein", |
|
| 682 | "seekin", |
|
| 683 | "selectin", |
|
| 684 | "sellin", |
|
| 685 | "sendin", |
|
| 686 | "separatin", |
|
| 687 | "servin", |
|
| 688 | "settin", |
|
| 689 | "settlin", |
|
| 690 | "sewin", |
|
| 691 | "shakin", |
|
| 692 | "shapin", |
|
| 693 | "sharin", |
|
| 694 | "shiftin", |
|
| 695 | "shinin", |
|
| 696 | "shippin", |
|
| 697 | "shittin", |
|
| 698 | "shootin", |
|
| 699 | "shoppin", |
|
| 700 | "showin", |
|
| 701 | "singin", |
|
| 702 | "sinkin", |
|
| 703 | "sittin", |
|
| 704 | "sleepin", |
|
| 705 | "smilin", |
|
| 706 | "smokin", |
|
| 707 | "spankin", |
|
| 708 | "solvin", |
|
| 709 | "somethin", |
|
| 710 | "speakin", |
|
| 711 | "spellin", |
|
| 712 | "spendin", |
|
| 713 | "spinnin", |
|
| 714 | "spittin", |
|
| 715 | "spreadin", |
|
| 716 | "standin", |
|
| 717 | "starin", |
|
| 718 | "startin", |
|
| 719 | "statin", |
|
| 720 | "stayin", |
|
| 721 | "stealin", |
|
| 722 | "sterlin", |
|
| 723 | "stimulatin", |
|
| 724 | "stirrin", |
|
| 725 | "stoppin", |
|
| 726 | "strengthenin", |
|
| 727 | "stretchin", |
|
| 728 | "strikin", |
|
| 729 | "strugglin", |
|
| 730 | "studyin", |
|
| 731 | "succeedin", |
|
| 732 | "sufferin", |
|
| 733 | "suggestin", |
|
| 734 | "supplyin", |
|
| 735 | "supportin", |
|
| 736 | "surprisin", |
|
| 737 | "surroundin", |
|
| 738 | "survivin", |
|
| 739 | "sweepin", |
|
| 740 | "swellin", |
|
| 741 | "swimmin", |
|
| 742 | "switchin", |
|
| 743 | "takin", |
|
| 744 | "talkin", |
|
| 745 | "teachin", |
|
| 746 | "tellin", |
|
| 747 | "testin", |
|
| 748 | "thinkin", |
|
| 749 | "threatenin", |
|
| 750 | "throwin", |
|
| 751 | "timin", |
|
| 752 | "touchin", |
|
| 753 | "tradin", |
|
| 754 | "trainin", |
|
| 755 | "travelin", |
|
| 756 | "treatin", |
|
| 757 | "tremblin", |
|
| 758 | "tryin", |
|
| 759 | "turnin", |
|
| 760 | "underlyin", |
|
| 761 | "understandin", |
|
| 762 | "undertakin", |
|
| 763 | "unwillin", |
|
| 764 | "usin", |
|
| 765 | "varyin", |
|
| 766 | "viewin", |
|
| 767 | "visitin", |
|
| 768 | "votin", |
|
| 769 | "waitin", |
|
| 770 | "walkin", |
|
| 771 | "wanderin", |
|
| 772 | "wantin", |
|
| 773 | "warnin", |
|
| 774 | "washin", |
|
| 775 | "watchin", |
|
| 776 | "wearin", |
|
| 777 | "weddin", |
|
| 778 | "whackin", |
|
| 779 | "wi", |
|
| 772 | 780 | "willin", |
| 773 | 781 | "windin", |
| 4 | 4 | import picocli.CommandLine; |
| 5 | 5 | |
| 6 | import java.io.BufferedReader; |
|
| 7 | 6 | import java.io.IOException; |
| 8 | 7 | import java.io.InputStream; |
| 9 | import java.io.InputStreamReader; |
|
| 10 | 8 | import java.util.Properties; |
| 11 | 9 | |
| 12 | 10 | import static java.lang.String.format; |
| 11 | import static java.lang.System.*; |
|
| 13 | 12 | import static picocli.CommandLine.Help.Ansi.Style.*; |
| 14 | 13 | import static picocli.CommandLine.Help.ColorScheme; |
| ... | ||
| 38 | 37 | |
| 39 | 38 | if( settings.displayList() ) { |
| 40 | System.out.println( contractions.toString() ); |
|
| 39 | out.println( contractions.toString() ); |
|
| 41 | 40 | } |
| 42 | 41 | else { |
| 43 | convert( new Converter( System.err::println, contractions ) ); |
|
| 42 | try { |
|
| 43 | out.print( convert( new Converter( err::println, contractions ) ) ); |
|
| 44 | } catch( final Exception ex ) { |
|
| 45 | ex.printStackTrace( err ); |
|
| 46 | } |
|
| 44 | 47 | } |
| 45 | 48 | } |
| ... | ||
| 55 | 58 | return builder.build(); |
| 56 | 59 | } |
| 57 | ||
| 58 | private void convert( final Converter converter ) { |
|
| 59 | final var sb = new StringBuilder(); |
|
| 60 | ||
| 61 | try( final var reader = open( System.in ) ) { |
|
| 62 | String line; |
|
| 63 | final var sep = System.lineSeparator(); |
|
| 64 | ||
| 65 | while( (line = reader.readLine()) != null ) { |
|
| 66 | sb.append( line ); |
|
| 67 | sb.append( sep ); |
|
| 68 | } |
|
| 69 | 60 | |
| 70 | System.out.println( converter.apply( sb.toString() ) ); |
|
| 71 | } catch( final Exception ex ) { |
|
| 72 | ex.printStackTrace( System.err ); |
|
| 73 | } |
|
| 61 | private String convert( final Converter converter ) throws IOException { |
|
| 62 | return converter.apply( new String( in.readAllBytes() ) ); |
|
| 74 | 63 | } |
| 75 | 64 | |
| ... | ||
| 113 | 102 | private static InputStream getResourceAsStream( final String resource ) { |
| 114 | 103 | return KeenQuotes.class.getClassLoader().getResourceAsStream( resource ); |
| 115 | } |
|
| 116 | ||
| 117 | @SuppressWarnings( "SameParameterValue" ) |
|
| 118 | private static BufferedReader open( final InputStream in ) { |
|
| 119 | return new BufferedReader( new InputStreamReader( in ) ); |
|
| 120 | 104 | } |
| 121 | 105 | |
| ... | ||
| 134 | 118 | |
| 135 | 119 | if( parseResult.isUsageHelpRequested() ) { |
| 136 | System.exit( exitCode ); |
|
| 120 | exit( exitCode ); |
|
| 137 | 121 | } |
| 138 | 122 | else if( parseResult.isVersionHelpRequested() ) { |
| 139 | System.out.println( getVersion() ); |
|
| 140 | System.exit( exitCode ); |
|
| 123 | out.println( getVersion() ); |
|
| 124 | exit( exitCode ); |
|
| 141 | 125 | } |
| 142 | 126 | } |
| 50 | 50 | } |
| 51 | 51 | |
| 52 | static Lexeme createLexeme( |
|
| 53 | final LexemeType lexeme, final int began, final int ended ) { |
|
| 54 | assert lexeme != null; |
|
| 55 | return new Lexeme( lexeme, began, ended ); |
|
| 56 | } |
|
| 57 | ||
| 52 | 58 | /** |
| 53 | 59 | * Extracts a sequence of characters from the given text at the offsets |
| ... | ||
| 133 | 139 | assert that != null; |
| 134 | 140 | return this.mBegan - that.mBegan; |
| 135 | } |
|
| 136 | ||
| 137 | static Lexeme createLexeme( |
|
| 138 | final LexemeType lexeme, final int began, final int ended ) { |
|
| 139 | assert lexeme != null; |
|
| 140 | return new Lexeme( lexeme, began, ended ); |
|
| 141 | 141 | } |
| 142 | 142 | } |
| 91 | 91 | lexeme = createLexeme( |
| 92 | 92 | slurp( i, ( next, ci ) -> |
| 93 | next == '.' || (next == ' ' && peek( ci ) == '.') ) == 0 |
|
| 93 | next == '.' || next == ' ' && peek( ci ) == '.' ) == 0 |
|
| 94 | 94 | ? PERIOD |
| 95 | 95 | : ELLIPSIS, |
| ... | ||
| 189 | 189 | */ |
| 190 | 190 | private boolean isDash( final char curr ) { |
| 191 | return curr == '-' || curr == '–' || curr == '—'; |
|
| 191 | return curr == '-' || curr == '–' || curr == '—' || curr == '―'; |
|
| 192 | 192 | } |
| 193 | 193 | |
| 51 | 51 | */ |
| 52 | 52 | private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE = |
| 53 | new LexemeType[]{WORD, NUMBER, ELLIPSIS, |
|
| 53 | new LexemeType[]{WORD, NUMBER, ELLIPSIS, OPENING_GROUP, |
|
| 54 | 54 | QUOTE_SINGLE, QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING, QUOTE_DOUBLE}; |
| 55 | 55 | |
| 56 | 56 | /** |
| 57 | 57 | * Double quotes preceded by these {@link LexemeType}s may be closing quotes. |
| 58 | 58 | */ |
| 59 | 59 | private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE = |
| 60 | new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS, |
|
| 60 | new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS, CLOSING_GROUP, |
|
| 61 | 61 | QUOTE_SINGLE, QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING}; |
| 62 | 62 |
| 28 | 28 | names = {"-ub", "--unamb-began"}, |
| 29 | 29 | description = |
| 30 | "Contractions to treat as unambiguous (e.g., cause,bout)", |
|
| 31 | paramLabel = "words" |
|
| 30 | "Contraction to treat as unambiguous (e.g., cause, bout)", |
|
| 31 | paramLabel = "word" |
|
| 32 | 32 | ) |
| 33 | 33 | private String[] mBeganUnambiguous; |
| 34 | 34 | |
| 35 | 35 | /** |
| 36 | 36 | * List of unambiguous contractions having lagging apostrophes. |
| 37 | 37 | */ |
| 38 | 38 | @CommandLine.Option( |
| 39 | 39 | names = {"-ue", "--unamb-ended"}, |
| 40 | 40 | description = |
| 41 | "Contractions to treat as unambiguous (e.g., frien,thinkin)", |
|
| 42 | paramLabel = "words" |
|
| 41 | "Contraction to treat as unambiguous (e.g., frien, thinkin)", |
|
| 42 | paramLabel = "word" |
|
| 43 | 43 | ) |
| 44 | 44 | private String[] mEndedUnambiguous; |
| 45 | 45 | |
| 46 | 46 | /** |
| 47 | 47 | * List of ambiguous contractions having leading apostrophes. |
| 48 | 48 | */ |
| 49 | 49 | @CommandLine.Option( |
| 50 | 50 | names = {"-ab", "--amb-began"}, |
| 51 | 51 | description = |
| 52 | "Contractions to treat as ambiguous (e.g., sup,kay)", |
|
| 53 | paramLabel = "words" |
|
| 52 | "Contraction to treat as ambiguous (e.g., sup, kay)", |
|
| 53 | paramLabel = "word" |
|
| 54 | 54 | ) |
| 55 | 55 | private String[] mBeganAmbiguous; |
| 56 | 56 | |
| 57 | 57 | /** |
| 58 | 58 | * List of ambiguous contractions having lagging apostrophes. |
| 59 | 59 | */ |
| 60 | 60 | @CommandLine.Option( |
| 61 | 61 | names = {"-ae", "--amb-ended"}, |
| 62 | 62 | description = |
| 63 | "Contractions to treat as ambiguous (e.g., gi,o)", |
|
| 64 | paramLabel = "words" |
|
| 63 | "Contraction to treat as ambiguous (e.g., gi, o)", |
|
| 64 | paramLabel = "word" |
|
| 65 | 65 | ) |
| 66 | 66 | private String[] mEndedAmbiguous; |
| 58 | 58 | try( final var reader = open( filename + ".txt" ) ) { |
| 59 | 59 | String line; |
| 60 | final var sep = System.lineSeparator(); |
|
| 60 | 61 | |
| 61 | 62 | while( (line = reader.readLine()) != null ) { |
| 62 | sb.append( line ).append( '\n' ); |
|
| 63 | sb.append( line ).append( sep ); |
|
| 63 | 64 | } |
| 64 | 65 | } |
| 51 | 51 | testType( "---", DASH ); |
| 52 | 52 | testType( "–", DASH ); |
| 53 | testType( "―", DASH ); |
|
| 53 | 54 | testType( "—", DASH ); |
| 54 | 55 | testType( "—-—", DASH ); |