Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Adds ambiguous contraction than/an

Author Dave Jarvis <email>
Date 2024-03-25 23:35:53 GMT-0700
Commit 5d7a7b9ad9d8b1657d52b2d91dd0938c68e381f0
Parent 74e090a
BUILD.md
Building the software requires the following third-party programs:
-* [git](https://git-scm.com)
+* [git 2.44.0](https://git-scm.com)
* [OpenJDK 21](https://bell-sw.com/pages/downloads)
-* [Gradle 8.5](https://gradle.org)
+* [Gradle 8.7](https://gradle.org)
## Application
Build the application as follows:
- git clone https://github.com/DaveJarvis/keenquotes.git
+ git clone https://gitlab.com/DaveJarvis/KeenQuotes.git
cd keenquotes
gradle clean build
build/lib/keenquotes.jar
-
README.md
# Requirements
-Download and install JRE 21:
+Download and install JRE:
-* [Java 21](https://bell-sw.com/pages/downloads) or newer.
+* [Java 21](https://bell-sw.com/pages/downloads).
# Download
build.gradle
dependencies {
+ def v_junit = '5.10.2'
+ def v_picocli = '4.7.5'
+
// Command-line parsing
- implementation 'info.picocli:picocli:4.7.5'
+ implementation "info.picocli:picocli:${v_picocli}"
- testImplementation 'org.junit.jupiter:junit-jupiter-api:5.10.1'
- testImplementation 'org.junit.jupiter:junit-jupiter-params:5.10.1'
- testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine:5.10.1'
+ testImplementation "org.junit.jupiter:junit-jupiter-api:${v_junit}"
+ testImplementation "org.junit.jupiter:junit-jupiter-params:${v_junit}"
+ testRuntimeOnly "org.junit.jupiter:junit-jupiter-engine:${v_junit}"
}
compileJava {
options.compilerArgs << "-Xlint:unchecked" << "-Xlint:deprecation"
- options.compilerArgs.addAll(['--release', '21'])
+ options.compilerArgs.addAll( ['--release', '21'] )
}
def resourceDir = sourceSets.main.resources.srcDirs[0]
-final File propertiesFile = file("${resourceDir}/com/whitemagicsoftware/${applicationName}/app/version.properties")
-propertiesFile.write("application.version=${version}")
+final File propertiesFile = file( "${resourceDir}/com/whitemagicsoftware/${applicationName}/app/version.properties" )
+propertiesFile.write( "application.version=${version}" )
jar {
duplicatesStrategy = DuplicatesStrategy.EXCLUDE
manifest {
attributes 'Main-Class': mainClassName
}
from {
- (configurations.runtimeClasspath.findAll { !it.path.endsWith(".pom") }).collect {
- it.isDirectory() ? it : zipTree(it)
+ (configurations.runtimeClasspath.findAll { !it.path.endsWith( ".pom" ) }).collect {
+ it.isDirectory() ? it : zipTree( it )
}
}
tasks.register( 'lib', Jar ) {
archiveFileName = "${applicationName}.jar"
- destinationDirectory = file( "$buildDir/lib" )
+ destinationDirectory = layout.buildDirectory.dir( "lib" )
- from "$buildDir/classes/java/main"
+ from layout.buildDirectory.dir( "classes/java/main" )
exclude "**/KeenQuotes.class"
}
lib.configure {
dependsOn compileJava
}
-tasks.named('test') {
+tasks.named( 'test' ) {
useJUnitPlatform()
}
src/main/java/com/whitemagicsoftware/keenquotes/parser/Contractions.java
* Placeholder for various types of contractions.
*/
-public class Contractions {
-
- private final Builder mBuilder;
-
- private Contractions( final Builder builder ) {
- assert builder != null;
- mBuilder = builder;
- }
-
- /**
- * Allows constructing a list of custom contractions.
- */
- @SuppressWarnings( "unused" )
- public static class Builder {
- private final Set<String> mBeganEndedUnambiguous = new HashSet<>();
- private final Set<String> mBeganUnambiguous = new HashSet<>();
- private final Set<String> mEndedUnambiguous = new HashSet<>();
- private final Set<String> mBeganAmbiguous = new HashSet<>();
- private final Set<String> mEndedAmbiguous = new HashSet<>();
-
- public Builder withBeganEndedUnambiguous( final List<String> words ) {
- mBeganEndedUnambiguous.addAll( words );
- return this;
- }
-
- public Builder withBeganUnambiguous( final List<String> words ) {
- mBeganUnambiguous.addAll( words );
- return this;
- }
-
- public Builder withEndedUnambiguous( final List<String> words ) {
- mEndedUnambiguous.addAll( words );
- return this;
- }
-
- public Builder withBeganAmbiguous( final List<String> words ) {
- mBeganAmbiguous.addAll( words );
- return this;
- }
-
- public Builder withEndedAmbiguous( final List<String> words ) {
- mEndedAmbiguous.addAll( words );
- return this;
- }
-
- /**
- * Constructs a new set of {@link Contractions} that can be configured
- * using this {@link Builder} instance.
- *
- * @return {@link Contractions} suitable for use with parsing text.
- */
- public Contractions build() {
- mBeganEndedUnambiguous.addAll( BEGAN_ENDED_UNAMBIGUOUS );
- mBeganUnambiguous.addAll( BEGAN_UNAMBIGUOUS );
- mEndedUnambiguous.addAll( ENDED_UNAMBIGUOUS );
- mBeganAmbiguous.addAll( BEGAN_AMBIGUOUS );
- mEndedAmbiguous.addAll( ENDED_AMBIGUOUS );
-
- // Remove ambiguous items if they are already declared.
- mBeganAmbiguous.removeAll( mBeganUnambiguous );
- mEndedAmbiguous.removeAll( mEndedUnambiguous );
-
- return new Contractions( this );
- }
-
- /**
- * This returns the {@code fallback} {@link Set} if {@code src} is empty;
- * otherwise, this returns the empty {@link Set}.
- *
- * @param src A set of contractions, possibly empty.
- * @param fallback The default values to use if {@code src} is empty.
- * @param <T> The type of data used by both {@link Set}s.
- * @return An empty {@link Set} if the {@code src} contains at least one
- * element; otherwise, this will return {@code fallback}.
- */
- private static <T> Set<T> from( final Set<T> src, final Set<T> fallback ) {
- assert src != null;
- assert fallback != null;
- return src.isEmpty() ? fallback : emptySet();
- }
- }
-
- public boolean beganEndedUnambiguously( final String word ) {
- assert word != null;
- return getBeganEndedUnambiguous().contains( word );
- }
-
- /**
- * Answers whether the given word is a contraction that always starts
- * with an apostrophe. The comparison is case-insensitive. This must
- * only be called when a straight quote is followed by a word.
- *
- * @param word The word to compare against the list of known unambiguous
- * contractions.
- * @return {@code true} when the given word is in the set of unambiguous
- * contractions.
- */
- public boolean beganUnambiguously( final String word ) {
- assert word != null;
- return getBeganUnambiguous().contains( word.toLowerCase() );
- }
-
- /**
- * Answers whether the given word could be a contraction but is also a
- * valid word in non-contracted form.
- *
- * @param word The word to compare against the list of known ambiguous
- * contractions.
- * @return {@code true} when the given word is in the set of ambiguous
- * contractions.
- */
- public boolean beganAmbiguously( final String word ) {
- assert word != null;
- return getBeganAmbiguous().contains( word.toLowerCase() );
- }
-
- public boolean endedUnambiguously( final String word ) {
- assert word != null;
- return getEndedUnambiguous().contains( word.toLowerCase() );
- }
-
- public boolean endedAmbiguously( final String word ) {
- assert word != null;
- final var check = word.toLowerCase();
-
- // Ensure that 'n' isn't matched for ambiguity by enforcing length, yet
- // allow o' to match because 'a sentence can end with the letter o'.
- return getEndedAmbiguous().contains( check ) ||
- check.endsWith( "s" ) || check.endsWith( "z" ) ||
- check.endsWith( "x" ) || (check.length() > 1 && check.endsWith( "n" ));
- }
-
- private Set<String> getBeganEndedUnambiguous() {
- return mBuilder.mBeganEndedUnambiguous;
- }
-
- private Set<String> getBeganUnambiguous() {
- return mBuilder.mBeganUnambiguous;
- }
-
- private Set<String> getEndedUnambiguous() {
- return mBuilder.mEndedUnambiguous;
- }
-
- private Set<String> getBeganAmbiguous() {
- return mBuilder.mBeganAmbiguous;
- }
-
- private Set<String> getEndedAmbiguous() {
- return mBuilder.mEndedAmbiguous;
- }
-
- @Override
- public String toString() {
- return
- toString( getBeganEndedUnambiguous(), "Unambiguous Began/Ended", "'%s" ) +
- toString( getBeganUnambiguous(), "Unambiguous Began", "'%s" ) +
- toString( getEndedUnambiguous(), "Unambiguous Ended", "%s'" ) +
- toString( getBeganAmbiguous(), "Ambiguous Began", "'%s" ) +
- toString( getEndedAmbiguous(), "Ambiguous Ended", "%s'" );
- }
-
- private String toString(
- final Set<String> words,
- final String category,
- final String fmt
- ) {
- final var sb = new StringBuilder( 16384 );
- final var newline = System.lineSeparator();
- final var list = new ArrayList<>( words );
-
- sort( list );
- sb.append( format( "%n%s%n", category ) );
- list.forEach( ( s ) -> sb.append( format( fmt, s ) ).append( newline ) );
-
- return sb.toString();
- }
-
- /**
- * Words having a straight apostrophe at the beginning and end.
- */
- private static final Set<String> BEGAN_ENDED_UNAMBIGUOUS = Set.of(
- // hacking
- "ackin",
- // hammering
- "ammerin",
- // hankering
- "ankerin",
- // having
- "avin",
- // hawking
- "awkin",
- // excepting
- "cepin",
- // excepting
- "ceppin",
- // excepting
- "ceptin",
- // according
- "cordin",
- // heading
- "eadin",
- // leaving
- "eavin",
- // helping
- "elpin",
- // hindering
- "inderin",
- // electioneering
- "lectioneerin",
- // amazing
- "mazin",
- // remembering
- "memberin",
- // fish 'n' chips
- "n",
- // hobbling
- "obblin",
- // holding
- "oldin",
- // hollering
- "ollerin",
- // hopping
- "oppin",
- // housekeeping
- "ousekeepin",
- // howling
- "owlin",
- // excepting
- "sceptin",
- // expecting
- "spectin",
- // explaining
- "splainin",
- // supposing
- "sposin",
- "sputin",
- // astonishing
- "stonishin",
- // destroying
- "stroyin",
- // persuading
- "suadin",
- "titivatin",
- // introducing
- "troducin",
- // hugging
- "uggin",
- // hulking
- "ulkin",
- // humbugging
- "umbuggin",
- // humiliating
- "umiliatin",
- // humming
- "ummin",
- // humping
- "umpin",
- // hurrying
- "urryin",
- // hurting
- "urtin",
- // hustling
- "ustlin",
- // investigating
- "vestigatin",
- // inviting
- "vitin",
- // excepting
- "xceptin",
- // explaining
- "xplainin",
- // exploding
- "xplodin"
- );
-
- /**
- * Words having a straight apostrophe that cannot be mistaken for an
- * opening single quote.
- */
- private static final Set<String> BEGAN_UNAMBIGUOUS = Set.of(
- "aporth",
- // about you
- "boutcha",
- // about you
- "boutchu",
- // about well
- "boutwell",
- // except
- "cept",
- // decided
- "cided",
- // because
- "cos",
- // armadillo
- "dillo",
- // themselves,
- "emselves",
- // affectionate
- "fectionate",
- // before
- "fore",
- // afraid
- "fraid",
- // ???
- "funder",
- // against
- "gainst",
- // him
- "im",
- // little
- "ittle",
- // and
- "n",
- // beneath
- "neath",
- // another
- "nother",
- // another
- "nudder",
- // enough
- "nuff",
- // gonna
- "onna",
- "onna'",
- // horse
- "oss",
- // horses
- "osses",
- // splash
- "plash",
- // upon
- "pon",
- // that's|is
- "s",
- "sblood",
- // excuse
- "scuse",
- "sfar",
- "sfoot",
- // considered
- "sidered",
- // suspect|expect
- "spect",
- // suspects|expects
- "spects",
- // exploit
- "sploit",
- // exploits
- "sploits",
- // suppose
- "spose",
- // instead
- "stead",
- // it
- "t",
- "taint",
- "tain",
- "tay",
- "til",
- "tis",
- "tish",
- // it isn't
- "tisn",
- // stomach
- "tomach",
- // stormed
- "tormed",
- "tshall",
- "twas",
- "twasn",
- "tween",
- "twere",
- "tweren",
- "twixt",
- "twon",
- "twou",
- "twould",
- "twouldn",
- // one
- "un",
- // have
- "ve",
- // exactly
- "xactly"
- );
-
- /**
- * Words having a straight apostrophe that may be either part of a
- * contraction or a word that stands alone beside an opening single quote.
- */
- private static final Set<String> BEGAN_AMBIGUOUS = Set.of(
- // have
- "a",
+@SuppressWarnings( { "SpellCheckingInspection", "GrazieInspection" } )
+public class Contractions {
+
+ private final Builder mBuilder;
+
+ private Contractions( final Builder builder ) {
+ assert builder != null;
+ mBuilder = builder;
+ }
+
+ /**
+ * Allows constructing a list of custom contractions.
+ */
+ @SuppressWarnings( "unused" )
+ public static class Builder {
+ private final Set<String> mBeganEndedUnambiguous = new HashSet<>();
+ private final Set<String> mBeganUnambiguous = new HashSet<>();
+ private final Set<String> mEndedUnambiguous = new HashSet<>();
+ private final Set<String> mBeganAmbiguous = new HashSet<>();
+ private final Set<String> mEndedAmbiguous = new HashSet<>();
+
+ public Builder withBeganEndedUnambiguous( final List<String> words ) {
+ mBeganEndedUnambiguous.addAll( words );
+ return this;
+ }
+
+ public Builder withBeganUnambiguous( final List<String> words ) {
+ mBeganUnambiguous.addAll( words );
+ return this;
+ }
+
+ public Builder withEndedUnambiguous( final List<String> words ) {
+ mEndedUnambiguous.addAll( words );
+ return this;
+ }
+
+ public Builder withBeganAmbiguous( final List<String> words ) {
+ mBeganAmbiguous.addAll( words );
+ return this;
+ }
+
+ public Builder withEndedAmbiguous( final List<String> words ) {
+ mEndedAmbiguous.addAll( words );
+ return this;
+ }
+
+ /**
+ * Constructs a new set of {@link Contractions} that can be configured
+ * using this {@link Builder} instance.
+ *
+ * @return {@link Contractions} suitable for use with parsing text.
+ */
+ public Contractions build() {
+ mBeganEndedUnambiguous.addAll( BEGAN_ENDED_UNAMBIGUOUS );
+ mBeganUnambiguous.addAll( BEGAN_UNAMBIGUOUS );
+ mEndedUnambiguous.addAll( ENDED_UNAMBIGUOUS );
+ mBeganAmbiguous.addAll( BEGAN_AMBIGUOUS );
+ mEndedAmbiguous.addAll( ENDED_AMBIGUOUS );
+
+ // Remove ambiguous items if they are already declared.
+ mBeganAmbiguous.removeAll( mBeganUnambiguous );
+ mEndedAmbiguous.removeAll( mEndedUnambiguous );
+
+ return new Contractions( this );
+ }
+
+ /**
+ * This returns the {@code fallback} {@link Set} if {@code src} is empty;
+ * otherwise, this returns the empty {@link Set}.
+ *
+ * @param src A set of contractions, possibly empty.
+ * @param fallback The default values to use if {@code src} is empty.
+ * @param <T> The type of data used by both {@link Set}s.
+ * @return An empty {@link Set} if the {@code src} contains at least one
+ * element; otherwise, this will return {@code fallback}.
+ */
+ private static <T> Set<T> from( final Set<T> src, final Set<T> fallback ) {
+ assert src != null;
+ assert fallback != null;
+ return src.isEmpty() ? fallback : emptySet();
+ }
+ }
+
+ public boolean beganEndedUnambiguously( final String word ) {
+ assert word != null;
+ return getBeganEndedUnambiguous().contains( word );
+ }
+
+ /**
+ * Answers whether the given word is a contraction that always starts
+ * with an apostrophe. The comparison is case-insensitive. This must
+ * only be called when a straight quote is followed by a word.
+ *
+ * @param word The word to compare against the list of known unambiguous
+ * contractions.
+ * @return {@code true} when the given word is in the set of unambiguous
+ * contractions.
+ */
+ public boolean beganUnambiguously( final String word ) {
+ assert word != null;
+ return getBeganUnambiguous().contains( word.toLowerCase() );
+ }
+
+ /**
+ * Answers whether the given word could be a contraction but is also a
+ * valid word in non-contracted form.
+ *
+ * @param word The word to compare against the list of known ambiguous
+ * contractions.
+ * @return {@code true} when the given word is in the set of ambiguous
+ * contractions.
+ */
+ public boolean beganAmbiguously( final String word ) {
+ assert word != null;
+ return getBeganAmbiguous().contains( word.toLowerCase() );
+ }
+
+ public boolean endedUnambiguously( final String word ) {
+ assert word != null;
+ return getEndedUnambiguous().contains( word.toLowerCase() );
+ }
+
+ public boolean endedAmbiguously( final String word ) {
+ assert word != null;
+ final var check = word.toLowerCase();
+
+ // Ensure that 'n' isn't matched for ambiguity by enforcing length, yet
+ // allow o' to match because 'a sentence can end with the letter o'.
+ return getEndedAmbiguous().contains( check ) ||
+ check.endsWith( "s" ) || check.endsWith( "z" ) ||
+ check.endsWith( "x" ) || (check.length() > 1 && check.endsWith( "n" ));
+ }
+
+ private Set<String> getBeganEndedUnambiguous() {
+ return mBuilder.mBeganEndedUnambiguous;
+ }
+
+ private Set<String> getBeganUnambiguous() {
+ return mBuilder.mBeganUnambiguous;
+ }
+
+ private Set<String> getEndedUnambiguous() {
+ return mBuilder.mEndedUnambiguous;
+ }
+
+ private Set<String> getBeganAmbiguous() {
+ return mBuilder.mBeganAmbiguous;
+ }
+
+ private Set<String> getEndedAmbiguous() {
+ return mBuilder.mEndedAmbiguous;
+ }
+
+ @Override
+ public String toString() {
+ return
+ toString( getBeganEndedUnambiguous(), "Unambiguous Began/Ended", "'%s" ) +
+ toString( getBeganUnambiguous(), "Unambiguous Began", "'%s" ) +
+ toString( getEndedUnambiguous(), "Unambiguous Ended", "%s'" ) +
+ toString( getBeganAmbiguous(), "Ambiguous Began", "'%s" ) +
+ toString( getEndedAmbiguous(), "Ambiguous Ended", "%s'" );
+ }
+
+ private String toString(
+ final Set<String> words,
+ final String category,
+ final String fmt
+ ) {
+ final var sb = new StringBuilder( 16384 );
+ final var newline = System.lineSeparator();
+ final var list = new ArrayList<>( words );
+
+ sort( list );
+ sb.append( format( "%n%s%n", category ) );
+ list.forEach( ( s ) -> sb.append( format( fmt, s ) ).append( newline ) );
+
+ return sb.toString();
+ }
+
+ /**
+ * Words having a straight apostrophe at the beginning and end.
+ */
+ private static final Set<String> BEGAN_ENDED_UNAMBIGUOUS = Set.of(
+ // hacking
+ "ackin",
+ // hammering
+ "ammerin",
+ // hankering
+ "ankerin",
+ // having
+ "avin",
+ // hawking
+ "awkin",
+ // excepting
+ "cepin",
+ // excepting
+ "ceppin",
+ // excepting
+ "ceptin",
+ // according
+ "cordin",
+ // heading
+ "eadin",
+ // leaving
+ "eavin",
+ // helping
+ "elpin",
+ // hindering
+ "inderin",
+ // electioneering
+ "lectioneerin",
+ // amazing
+ "mazin",
+ // remembering
+ "memberin",
+ // fish 'n' chips
+ "n",
+ // hobbling
+ "obblin",
+ // holding
+ "oldin",
+ // hollering
+ "ollerin",
+ // hopping
+ "oppin",
+ // housekeeping
+ "ousekeepin",
+ // howling
+ "owlin",
+ // excepting
+ "sceptin",
+ // expecting
+ "spectin",
+ // explaining
+ "splainin",
+ // supposing
+ "sposin",
+ "sputin",
+ // astonishing
+ "stonishin",
+ // destroying
+ "stroyin",
+ // persuading
+ "suadin",
+ "titivatin",
+ // introducing
+ "troducin",
+ // hugging
+ "uggin",
+ // hulking
+ "ulkin",
+ // humbugging
+ "umbuggin",
+ // humiliating
+ "umiliatin",
+ // humming
+ "ummin",
+ // humping
+ "umpin",
+ // hurrying
+ "urryin",
+ // hurting
+ "urtin",
+ // hustling
+ "ustlin",
+ // investigating
+ "vestigatin",
+ // inviting
+ "vitin",
+ // excepting
+ "xceptin",
+ // explaining
+ "xplainin",
+ // exploding
+ "xplodin"
+ );
+
+ /**
+ * Words having a straight apostrophe that cannot be mistaken for an
+ * opening single quote.
+ */
+ private static final Set<String> BEGAN_UNAMBIGUOUS = Set.of(
+ "aporth",
+ // about you
+ "boutcha",
+ // about you
+ "boutchu",
+ // about well
+ "boutwell",
+ // except
+ "cept",
+ // decided
+ "cided",
+ // because
+ "cos",
+ // armadillo
+ "dillo",
+ // themselves,
+ "emselves",
+ // affectionate
+ "fectionate",
+ // before
+ "fore",
+ // afraid
+ "fraid",
+ // ???
+ "funder",
+ // against
+ "gainst",
+ // him
+ "im",
+ // little
+ "ittle",
+ // and
+ "n",
+ // beneath
+ "neath",
+ // another
+ "nother",
+ // another
+ "nudder",
+ // enough
+ "nuff",
+ // gonna
+ "onna",
+ "onna'",
+ // horse
+ "oss",
+ // horses
+ "osses",
+ // splash
+ "plash",
+ // upon
+ "pon",
+ // that's|is
+ "s",
+ "sblood",
+ // excuse
+ "scuse",
+ "sfar",
+ "sfoot",
+ // considered
+ "sidered",
+ // suspect|expect
+ "spect",
+ // suspects|expects
+ "spects",
+ // exploit
+ "sploit",
+ // exploits
+ "sploits",
+ // suppose
+ "spose",
+ // instead
+ "stead",
+ // it
+ "t",
+ "taint",
+ "tain",
+ "tay",
+ "til",
+ "tis",
+ "tish",
+ // it isn't
+ "tisn",
+ // stomach
+ "tomach",
+ // stormed
+ "tormed",
+ "tshall",
+ "twas",
+ "twasn",
+ "tween",
+ "twere",
+ "tweren",
+ "twixt",
+ "twon",
+ "twou",
+ "twould",
+ "twouldn",
+ // one
+ "un",
+ // have
+ "ve",
+ // exactly
+ "xactly"
+ );
+
+ /**
+ * Words having a straight apostrophe that may be either part of a
+ * contraction or a word that stands alone beside an opening single quote.
+ */
+ private static final Set<String> BEGAN_AMBIGUOUS = Set.of(
+ // have
+ "a",
+ // than|an
+ "an",
// about|boxing match
"bout",
src/test/resources/com/whitemagicsoftware/keenquotes/texts/unambiguous-2-pass.txt
&ldquo;&lsquo;---has a prison.&rsquo;&rdquo;
+"Teach 'em t'be more 'an we ever been!"
+&ldquo;Teach &apos;em t&apos;be more &apos;an we ever been!&rdquo;
+
Delta 420 lines added, 412 lines removed, 8-line increase