Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Resolve numeric contractions, add ambiguous tests, add unambiguous contractions

Author: Dave Jarvis <email>
Date: 2021-06-10 21:02:09 GMT-0700
Commit: 65349cb8c71db6fbe602179205a5bc4fc0a83c15
Parent: 4f85bab
lib/src/main/java/com/keenwrite/quotes/Contractions.java
public class Contractions {
/**
+ * Answers whether the given word is a contraction that always starts
+ * with an apostrophe. The comparison is case insensitive, except for
+ * "em", which only matches in lowercase. This must only be called when
+ * a straight quote is followed by a word.
+ *
+ * @param word The word to compare against the list of known unambiguous
+ * contractions.
+ * @return {@code true} when the given word is in the set of unambiguous
+ * contractions.
+ */
+ public static boolean contractionBeganUnambiguously( final String word ) {
+ assert word != null;
+ return BEGAN_UNAMBIGUOUS.contains( word.toLowerCase() ) ||
+ word.equals( "em" );
+ }
+
+ /**
+ * Answers whether the given word could be a contraction but is also a
+ * valid word in non-contracted form.
+ *
+ * @param word The word to compare against the list of known ambiguous
+ * contractions.
+ * @return {@code true} when the given word is in the set of ambiguous
+ * contractions.
+ */
+ public static boolean contractionBeganAmbiguously( final String word ) {
+ assert word != null;
+ return BEGAN_AMBIGUOUS.contains( word.toLowerCase() );
+ }
+
+ public static boolean contractionEndedUnambiguously( final String word ) {
+ assert word != null;
+ return ENDED_UNAMBIGUOUS.contains( word.toLowerCase() );
+ }
+
+ public static boolean contractionEndedAmbiguously( final String word ) {
+ assert word != null;
+ final var check = word.toLowerCase();
+
+ // Ensure that 'n' isn't matched for ambiguity by enforcing length, yet
+ // allow o' to match because 'a sentence can end with the letter o'.
+ return ENDED_AMBIGUOUS.contains( check ) ||
+ check.endsWith( "s" ) || check.endsWith( "z" ) ||
+ check.endsWith( "x" ) || (check.length() > 1 && check.endsWith( "n" ));
+ }
+
+ /**
* Words having a straight apostrophe that cannot be mistaken for an
* opening single quote.
*/
private static final Set<String> BEGAN_UNAMBIGUOUS = Set.of(
-// "aporth", "boutcha", "boutchu", "cept", "dillo", "e", "e'll", "e's",
-// "fraid", "gainst", "n", "neath", "nother", "onna", "onna'", "pon", "s",
-// "sblood", "scuse", "sfar", "sfoot", "t", "taint", "tain't", "til", "tis",
-// "tisn't", "tshall", "twas", "twasn't", "tween", "twere", "tweren't",
-// "twixt", "twon't", "twou'd", "twou'dn't", "twould", "twouldn't", "ve"
-
- "aporth", "boutcha", "boutchu", "cept", "dillo", "fraid", "gainst",
- "n", "neath", "nother", "onna", "onna'", "pon", "s", "sblood", "scuse",
- "sfar", "sfoot", "t", "taint", "tain", "til", "tis", "tisn", "tshall",
- "twas", "twasn", "tween", "twere", "tweren", "twixt", "twon", "twou",
- "twould", "twouldn", "ve"
+ "aporth",
+ "boutcha",
+ "boutchu",
+ "cept",
+ "dillo",
+ "fraid",
+ "gainst",
+ "n",
+ "neath",
+ "nother",
+ "onna",
+ "onna'",
+ "pon",
+ "s",
+ "sblood",
+ "scuse",
+ "sfar",
+ "sfoot",
+ "t",
+ "taint",
+ "tain",
+ "til",
+ "tis",
+ "tisn",
+ "tshall",
+ "twas",
+ "twasn",
+ "tween",
+ "twere",
+ "tweren",
+ "twixt",
+ "twon",
+ "twou",
+ "twould",
+ "twouldn",
+ "ve"
);
// he|e pluribus unum
"e",
- // them|emily
- "em",
// here|earlier
"ere",
"ol",
// the
- "th"
+ "th",
+ // Top ~500 common -ing words as English contractions.
+ "acceptin",
+ "accompanyin",
+ "accordin",
+ "accountin",
+ "achievin",
+ "acquirin",
+ "actin",
+ "addin",
+ "addressin",
+ "adjoinin",
+ "adoptin",
+ "advancin",
+ "advertisin",
+ "affectin",
+ "agin",
+ "allowin",
+ "amazin",
+ "analyzin",
+ "answerin",
+ "anythin",
+ "appearin",
+ "applyin",
+ "approachin",
+ "arguin",
+ "arisin",
+ "arrivin",
+ "askin",
+ "assessin",
+ "assumin",
+ "attackin",
+ "attemptin",
+ "attendin",
+ "avoidin",
+ "bankin",
+ "bargainin",
+ "bearin",
+ "beatin",
+ "becomin",
+ "beginnin",
+ "bein",
+ "believin",
+ "belongin",
+ "bendin",
+ "bindin",
+ "bleedin",
+ "blessin",
+ "blowin",
+ "boilin",
+ "borrowin",
+ "breakin",
+ "breathin",
+ "breedin",
+ "bringin",
+ "broadcastin",
+ "buildin",
+ "burnin",
+ "buyin",
+ "calculatin",
+ "callin",
+ "carryin",
+ "castin",
+ "causin",
+ "ceilin",
+ "challengin",
+ "changin",
+ "checkin",
+ "choosin",
+ "claimin",
+ "cleanin",
+ "clearin",
+ "climbin",
+ "closin",
+ "clothin",
+ "collectin",
+ "combinin",
+ "comin",
+ "commandin",
+ "comparin",
+ "compellin",
+ "competin",
+ "computin",
+ "concernin",
+ "concludin",
+ "conditionin",
+ "conductin",
+ "conflictin",
+ "connectin",
+ "considerin",
+ "consistin",
+ "constructin",
+ "consultin",
+ "consumin",
+ "containin",
+ "continuin",
+ "contractin",
+ "contributin",
+ "controllin",
+ "convincin",
+ "cookin",
+ "coolin",
+ "copin",
+ "correspondin",
+ "counselin",
+ "countin",
+ "couplin",
+ "coverin",
+ "creatin",
+ "crossin",
+ "cryin",
+ "cuttin",
+ "dancin",
+ "darlin",
+ "datin",
+ "dealin",
+ "decidin",
+ "declarin",
+ "declinin",
+ "decreasin",
+ "definin",
+ "demandin",
+ "denyin",
+ "dependin",
+ "descendin",
+ "describin",
+ "designin",
+ "destroyin",
+ "determinin",
+ "developin",
+ "differin",
+ "dinin",
+ "directin",
+ "discussin",
+ "distinguishin",
+ "disturbin",
+ "dividin",
+ "doin",
+ "drawin",
+ "dressin",
+ "drinkin",
+ "drivin",
+ "droppin",
+ "dryin",
+ "durin",
+ "dwellin",
+ "dyin",
+ "eatin",
+ "editin",
+ "emergin",
+ "employin",
+ "enablin",
+ "encouragin",
+ "endin",
+ "engagin",
+ "engineerin",
+ "enjoyin",
+ "enterin",
+ "establishin",
+ "evaluatin",
+ "evenin",
+ "everythin",
+ "examinin",
+ "exceedin",
+ "excitin",
+ "excludin",
+ "existin",
+ "expandin",
+ "expectin",
+ "experiencin",
+ "explainin",
+ "explorin",
+ "expressin",
+ "extendin",
+ "facin",
+ "failin",
+ "fallin",
+ "farmin",
+ "fascinatin",
+ "feedin",
+ "feelin",
+ "fightin",
+ "filin",
+ "fillin",
+ "financin",
+ "findin",
+ "firin",
+ "fishin",
+ "fittin",
+ "fixin",
+ "floatin",
+ "flowin",
+ "flyin",
+ "focusin",
+ "followin",
+ "forcin",
+ "foregoin",
+ "formin",
+ "forthcomin",
+ "foundin",
+ "freezin",
+ "functionin",
+ "fundin",
+ "gainin",
+ "gatherin",
+ "generatin",
+ "gettin",
+ "givin",
+ "goin",
+ "governin",
+ "grantin",
+ "growin",
+ "handlin",
+ "hangin",
+ "happenin",
+ "havin",
+ "headin",
+ "healin",
+ "hearin",
+ "heatin",
+ "helpin",
+ "hidin",
+ "holdin",
+ "hopin",
+ "housin",
+ "huntin",
+ "identifyin",
+ "imagin",
+ "implementin",
+ "imposin",
+ "improvin",
+ "includin",
+ "increasin",
+ "indicatin",
+ "interestin",
+ "interpretin",
+ "introducin",
+ "involvin",
+ "joinin",
+ "judgin",
+ "keepin",
+ "killin",
+ "knowin",
+ "lackin",
+ "landin",
+ "lastin",
+ "laughin",
+ "layin",
+ "leadin",
+ "leanin",
+ "learnin",
+ "leavin",
+ "lettin",
+ "liftin",
+ "lightin",
+ "lightnin",
+ "limitin",
+ "listenin",
+ "listin",
+ "livin",
+ "loadin",
+ "lookin",
+ "losin",
+ "lovin",
+ "lowerin",
+ "lyin",
+ "maintainin",
+ "makin",
+ "managin",
+ "manufacturin",
+ "mappin",
+ "marketin",
+ "markin",
+ "matchin",
+ "meanin",
+ "measurin",
+ "meetin",
+ "meltin",
+ "minin",
+ "misleadin",
+ "missin",
+ "mixin",
+ "modelin",
+ "monitorin",
+ "mornin",
+ "movin",
+ "neighborin",
+ "nothin",
+ "notin",
+ "notwithstandin",
+ "nursin",
+ "observin",
+ "obtainin",
+ "occurrin",
+ "offerin",
+ "offsprin",
+ "ongoin",
+ "openin",
+ "operatin",
+ "opposin",
+ "orderin",
+ "organizin",
+ "outstandin",
+ "overwhelmin",
+ "packin",
+ "paintin",
+ "parkin",
+ "participatin",
+ "passin",
+ "payin",
+ "pendin",
+ "performin",
+ "pickin",
+ "placin",
+ "plannin",
+ "plantin",
+ "playin",
+ "pleasin",
+ "pointin",
+ "possessin",
+ "preachin",
+ "precedin",
+ "preparin",
+ "presentin",
+ "preservin",
+ "pressin",
+ "prevailin",
+ "preventin",
+ "pricin",
+ "printin",
+ "proceedin",
+ "processin",
+ "producin",
+ "programmin",
+ "promisin",
+ "promotin",
+ "protectin",
+ "providin",
+ "provin",
+ "publishin",
+ "pullin",
+ "purchasin",
+ "pursuin",
+ "pushin",
+ "puttin",
+ "questionin",
+ "rangin",
+ "ratin",
+ "reachin",
+ "readin",
+ "reasonin",
+ "receivin",
+ "recognizin",
+ "recordin",
+ "reducin",
+ "referrin",
+ "reflectin",
+ "refusin",
+ "regardin",
+ "regulatin",
+ "relatin",
+ "remainin",
+ "rememberin",
+ "removin",
+ "renderin",
+ "repeatin",
+ "replacin",
+ "reportin",
+ "representin",
+ "requirin",
+ "respectin",
+ "respondin",
+ "restin",
+ "resultin",
+ "returnin",
+ "revealin",
+ "ridin",
+ "risin",
+ "rulin",
+ "runnin",
+ "sailin",
+ "samplin",
+ "satisfyin",
+ "savin",
+ "sayin",
+ "scatterin",
+ "schoolin",
+ "screenin",
+ "searchin",
+ "securin",
+ "seein",
+ "seekin",
+ "selectin",
+ "sellin",
+ "sendin",
+ "separatin",
+ "servin",
+ "settin",
+ "settlin",
+ "shakin",
+ "shapin",
+ "sharin",
+ "shiftin",
+ "shinin",
+ "shippin",
+ "shootin",
+ "shoppin",
+ "showin",
+ "singin",
+ "sinkin",
+ "sittin",
+ "sleepin",
+ "smilin",
+ "smokin",
+ "solvin",
+ "somethin",
+ "speakin",
+ "spellin",
+ "spendin",
+ "spinnin",
+ "spreadin",
+ "standin",
+ "starin",
+ "startin",
+ "statin",
+ "stayin",
+ "sterlin",
+ "stimulatin",
+ "stirrin",
+ "stoppin",
+ "strengthenin",
+ "stretchin",
+ "strikin",
+ "strugglin",
+ "studyin",
+ "succeedin",
+ "sufferin",
+ "suggestin",
+ "supplyin",
+ "supportin",
+ "surprisin",
+ "surroundin",
+ "survivin",
+ "sweepin",
+ "swellin",
+ "swimmin",
+ "switchin",
+ "takin",
+ "talkin",
+ "teachin",
+ "tellin",
+ "testin",
+ "thinkin",
+ "threatenin",
+ "throwin",
+ "timin",
+ "touchin",
+ "tradin",
+ "trainin",
+ "travelin",
+ "treatin",
+ "tremblin",
+ "tryin",
+ "turnin",
+ "underlyin",
+ "understandin",
+ "undertakin",
+ "unwillin",
+ "usin",
+ "varyin",
+ "viewin",
+ "visitin",
+ "votin",
+ "waitin",
+ "walkin",
+ "wanderin",
+ "wantin",
+ "warnin",
+ "washin",
+ "watchin",
+ "wearin",
+ "weddin",
+ "willin",
+ "windin",
+ "winnin",
+ "wishin",
+ "wonderin",
+ "workin",
+ "writin",
+ "yieldin"
);
-
- /**
- * Answers whether the given word is a contraction that always starts
- * with an apostrophe. The comparison is case insensitive. This must
- * only be called when a straight quote is followed by a word.
- *
- * @param word The word to compare against the list of known unambiguous
- * contractions.
- * @return {@code true} when the given word is in the set of unambiguous
- * contractions.
- */
- public static boolean contractionBeganUnambiguously( final String word ) {
- assert word != null;
- return BEGAN_UNAMBIGUOUS.contains( word.toLowerCase() );
- }
-
- /**
- * Answers whether the given word could be a contraction but is also a
- * valid word in non-contracted form.
- *
- * @param word The word to compare against the list of known ambiguous
- * contractions.
- * @return {@code true} when the given word is in the set of ambiguous
- * contractions.
- */
- public static boolean contractionBeganAmbiguously( final String word ) {
- assert word != null;
- return BEGAN_AMBIGUOUS.contains( word.toLowerCase() );
- }
-
- public static boolean contractionEndedAmbiguously( final String word ) {
- assert word != null;
- final var check = word.toLowerCase();
-
- return ENDED_AMBIGUOUS.contains( check ) || check.endsWith( "s" ) ||
- check.endsWith( "n" ) || check.endsWith( "z" ) || check.endsWith( "x" );
- }
-
- public static boolean contractionEndedUnambiguously( final String word ) {
- assert word != null;
- return ENDED_UNAMBIGUOUS.contains( word.toLowerCase() );
- }
}
lib/src/main/java/com/keenwrite/quotes/Parser.java
i.remove();
}
- else if( contractionEndedAmbiguously( word1 ) ) {
- ambiguousLaggingQuotes.add( lex2 );
- }
else if( contractionEndedUnambiguously( word1 ) ) {
consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
i.remove();
+ }
+ else if( contractionEndedAmbiguously( word1 ) ) {
+ ambiguousLaggingQuotes.add( lex2 );
}
else if( (lex1.isSot() || lex1.anyType( LEADING_QUOTE_OPENING_SINGLE ))
consumer.accept( new Token( quote, ambiguousLaggingQuotes.get( 0 ) ) );
}
- }
- else if( ambiguousLeadingCount > 0 && ambiguousLaggingCount == 0 ) {
- // If there are no ambiguous lagging quotes then all ambiguous leading
- // quotes must be contractions.
- ambiguousLeadingQuotes.forEach(
- lex -> consumer.accept( new Token( QUOTE_APOSTROPHE, lex ) )
- );
}
else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) {
}
}
- }
- else if( resolvedLeadingQuotes == 1 && ambiguousLaggingCount == 1 ) {
- consumer.accept(
- new Token( QUOTE_CLOSING_SINGLE, ambiguousLaggingQuotes.get( 0 ) )
- );
}
}
consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
}
- else if( lex1.isType( NUMBER ) && lex2.isType( QUOTE_SINGLE ) ) {
+ else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) {
if( lex3.isType( QUOTE_SINGLE ) ) {
// E.g., 2''
}
}
- else if( lex1.isType( NUMBER ) && lex2.isType( QUOTE_DOUBLE ) ) {
+ else if( lex2.isType( QUOTE_DOUBLE ) && lex1.isType( NUMBER ) ) {
// E.g., 2"
consumer.accept( new Token( QUOTE_PRIME_DOUBLE, lex2 ) );
}
- else if( lex1.isType( QUOTE_SINGLE ) && lex2.isType( WORD ) &&
+ else if( lex2.isType( WORD ) && lex1.isType( QUOTE_SINGLE ) &&
contractionBeganUnambiguously( lex2.toString( mText ) ) ) {
// E.g., 'twas
consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
}
- else if( lex1.isType( QUOTE_SINGLE ) && lex2.isType( NUMBER ) &&
+ else if( lex2.isType( NUMBER ) && lex1.isType( QUOTE_SINGLE ) &&
lex3.isType( WORD ) &&
lex3.toString( mText ).equalsIgnoreCase( "s" ) ) {
// E.g., '70s
// Sentences are re-written to avoid starting with numerals.
consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
+ }
+ else if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( NUMBER ) ) {
+ // E.g., '02
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
}
else if( lex1.isType( ESC_SINGLE ) ) {
lib/src/test/java/com/keenwrite/quotes/SmartQuotesTest.java
public void test_parse_SingleLine_Parsed() {
out.println( SmartQuotes.replace(
- "'Oak,' 'elm,' and 'beech' are names of trees. So is 'pine.'"
- ) );
-
- System.out.println(" ------------------------ ");
-
- out.println( SmartQuotes.replace(
- "'Bout that time I says, 'Boys! I been thinkin' 'bout th' Universe.'"
+ "'Em wern't so sure."
) );
}
testLine = unescapeEol( testLine );
expected = unescapeEol( expected );
- System.out.println( testLine );
final var actual = parser.apply( testLine );
lib/src/test/resources/com/keenwrite/quotes/smartypants.txt
&lsquo;That&apos;s a &ldquo;magic&rdquo; sock.&rsquo;
-Website! Company Name, Inc. ("Company Name" or "Company") recommends reading the following terms and conditions, carefully:
-Website! Company Name, Inc. (&ldquo;Company Name&rdquo; or &ldquo;Company&rdquo;) recommends reading the following terms and conditions, carefully:
+'That's a "very 'magic'" sock.'
+&lsquo;That&apos;s a &ldquo;very &lsquo;magic&rsquo;&rdquo; sock.&rsquo;
-Website! Company Name, Inc. ('Company Name' or 'Company') recommends reading the following terms and conditions, carefully:
-Website! Company Name, Inc. (&lsquo;Company Name&rsquo; or &lsquo;Company&rsquo;) recommends reading the following terms and conditions, carefully:
+Website! Company Name, Inc. ("Company Name" or "Company") recommends reading carefully:
+Website! Company Name, Inc. (&ldquo;Company Name&rdquo; or &ldquo;Company&rdquo;) recommends reading carefully:
+
+Website! Company Name, Inc. ('Company Name' or 'Company') recommends reading carefully:
+Website! Company Name, Inc. (&lsquo;Company Name&rsquo; or &lsquo;Company&rsquo;) recommends reading carefully:
Workin' hard
The 2's complement.\n\n'Yar, binary!'
The 2&apos;s complement.\n\n&lsquo;Yar, binary!&rsquo;
+
+The Who's complement.\n\n\n"Paragraph boundary!"
+The Who&apos;s complement.\n\n\n&ldquo;Paragraph boundary!&rdquo;
'Oak,' 'elm,' and 'beech' are names of trees. So is 'pine.'
&lsquo;Oak,&rsquo; &lsquo;elm,&rsquo; and &lsquo;beech&rsquo; are names of trees. So is &lsquo;pine.&rsquo;
'A', 'B', and 'C' are letters.
&lsquo;A&rsquo;, &lsquo;B&rsquo;, and &lsquo;C&rsquo; are letters.
Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season.
-Book &apos;em, Danno. Rock &apos;n&apos; roll. &apos;Cause &apos;twas the season.
+Book &apos;em, Danno. Rock &apos;n&apos; roll. 'Cause &apos;twas the season.
'He said, "Thinkin'."'
&lsquo;He said, &ldquo;Thinkin&apos;.&rdquo;&rsquo;
+
+She said, 'Cause of death?\n\n'Old age, there's no doubt.'
+She said, 'Cause of death?\n\n&lsquo;Old age, there&apos;s no doubt.&rsquo;
+
+She said, 'Cause I said so!\n\n'If we're done, I've a secret.'
+She said, 'Cause I said so!\n\n&lsquo;If we&apos;re done, I&apos;ve a secret.&rsquo;
+
+'Em wasn't so sure.'
+&lsquo;Em wasn&apos;t so sure.&rsquo;
+
+# ########################################################################
+# Ambiguous (no solution)
+# ########################################################################
+'Em wern't so sure.
+'Em wern&apos;t so sure.
+
+'Bout that time I says, 'Boys! I been thinkin' 'bout th' Universe.'
+'Bout that time I says, &lsquo;Boys! I been thinkin&apos; 'bout th&apos; Universe.&rsquo;
Delta: 613 lines added, 88 lines removed, 525-line increase