Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Rename to new package, make into application

Author: Dave Jarvis <email>
Date: 2021-06-17 19:54:10 GMT-0700
Commit: 4dda57c31a9bf096e6342b2b84ff8d036ea8eef0
Parent: 6b057a0
build.gradle
+// Build script for the keenquotes command-line application.
+plugins {
+ id 'application'
+}
+
+group 'com.whitemagicsoftware'
+version '1.0'
+
+repositories {
+ mavenCentral()
+}
+
+dependencies {
+ // Command-line parsing
+ implementation 'info.picocli:picocli:4.4.0'
+
+ // Unit tests run on JUnit 5 (see the 'test' task configuration below).
+ // NOTE(review): the engine is declared without a version; presumably it is
+ // aligned with the API version through JUnit's published metadata -- confirm.
+ testImplementation 'org.junit.jupiter:junit-jupiter-api:5.7.1'
+ testImplementation 'org.junit.jupiter:junit-jupiter-params:5.7.1'
+ testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
+}
+
+// Java sources are rooted at src/main instead of Gradle's default
+// src/main/java directory.
+sourceSets {
+ main {
+ java {
+ srcDirs = ["src/main"]
+ }
+ }
+}
+
+// Report unchecked operations and deprecated API usage while compiling.
+compileJava {
+ options.compilerArgs << "-Xlint:unchecked" << "-Xlint:deprecation"
+}
+
+// The main class name is derived from applicationName, so applicationName
+// must be assigned before mainClassName.
+application {
+ applicationName = 'keenquotes'
+ mainClassName = "com.whitemagicsoftware.${applicationName}.KeenQuotes"
+}
+
+// Builds a single self-contained JAR: the runtime classpath entries are
+// unpacked and merged into the archive alongside the project's own classes.
+jar {
+ // Keep the first copy of any file that appears in more than one source.
+ duplicatesStrategy = DuplicatesStrategy.EXCLUDE
+
+ manifest {
+ attributes 'Main-Class': mainClassName
+ }
+
+ from {
+ // Skip .pom files; copy directories as-is and expand archives in place.
+ (configurations.runtimeClasspath.findAll { !it.path.endsWith(".pom") }).collect {
+ it.isDirectory() ? it : zipTree(it)
+ }
+ }
+
+ archiveFileName = "${applicationName}.jar"
+
+ // Drop signature files copied from dependencies; they would not match the
+ // contents of the merged archive.
+ exclude 'META-INF/*.RSA', 'META-INF/*.SF', 'META-INF/*.DSA'
+}
+
+tasks.named('test') {
+ useJUnitPlatform()
+}
+
lib/build.gradle
-// Build script for the library module: a plain Java library whose unit tests
-// run on the JUnit Platform.
-plugins {
- id 'java-library'
-}
-
-repositories {
- // Use Maven Central for resolving dependencies.
- mavenCentral()
-}
-
-dependencies {
- // NOTE(review): the engine is declared without a version; presumably it is
- // aligned with the API version through JUnit's published metadata -- confirm.
- testImplementation 'org.junit.jupiter:junit-jupiter-api:5.7.1'
- testImplementation 'org.junit.jupiter:junit-jupiter-params:5.7.1'
- testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
-}
-
-tasks.named('test') {
- useJUnitPlatform()
-}
lib/src/main/java/com/keenwrite/quotes/CircularFifoQueue.java
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.keenwrite.quotes;
-
-import java.util.*;
-
-/**
- * A first-in first-out queue with a fixed capacity that evicts its oldest
- * element when a new element is added while the queue is full.
- * <p>
- * Elements are removed in the same order in which they were added, and
- * iteration follows the removal order.
- * </p>
- * <p>
- * The {@link #add(Object)}, {@link #remove()}, {@link #peek()},
- * {@link #poll()}, and {@link #offer(Object)} operations all perform in
- * constant time. All other operations perform in linear time or worse.
- * </p>
- * <p>
- * This queue prevents null objects from being added.
- * </p>
- *
- * @param <E> the type of elements in this collection
- * @since 4.0
- */
-public final class CircularFifoQueue<E> extends AbstractCollection<E>
-  implements Queue<E> {
-  /**
-   * Underlying storage array.
-   */
-  private final E[] elements;
-
-  /**
-   * Array index of first (oldest) queue element.
-   */
-  private int start;
-
-  /**
-   * Index mod maxElements of the array position following the last queue
-   * element. Queue elements start at elements[start] and "wrap around"
-   * elements[maxElements-1], ending at elements[decrement(end)].
-   * For example, elements = {c,a,b}, start=1, end=1 corresponds to
-   * the queue [a,b,c].
-   */
-  private int end;
-
-  /**
-   * Flag to indicate if the queue is currently full. Needed because
-   * {@code start == end} describes both the empty and the full queue.
-   */
-  private boolean full;
-
-  /**
-   * Capacity of the queue.
-   */
-  private final int maxElements;
-
-  /**
-   * Constructor that creates a queue with the specified size.
-   *
-   * @param size Immutable queue size, must be greater than zero.
-   * @throws IllegalArgumentException if {@code size} is not positive.
-   */
-  @SuppressWarnings( "unchecked" )
-  public CircularFifoQueue( final int size ) {
-    // An assertion is not enough: assertions are disabled by default, which
-    // would allow a zero-capacity queue that fails on the first add().
-    if( size <= 0 ) {
-      throw new IllegalArgumentException(
-        "The size must be greater than 0, given: " + size );
-    }
-
-    elements = (E[]) new Object[ size ];
-    maxElements = elements.length;
-  }
-
-  /**
-   * Returns the number of elements stored in the queue.
-   *
-   * @return this queue's size
-   */
-  @Override
-  public int size() {
-    int size;
-
-    if( end < start ) {
-      // The stored elements wrap around the end of the array.
-      size = maxElements - start + end;
-    }
-    else if( end == start ) {
-      size = full ? maxElements : 0;
-    }
-    else {
-      size = end - start;
-    }
-
-    return size;
-  }
-
-  /**
-   * Returns true if this queue is empty; false otherwise.
-   *
-   * @return true if this queue is empty
-   */
-  @Override
-  public boolean isEmpty() {
-    return size() == 0;
-  }
-
-  /**
-   * Returns {@code true} if the capacity limit of this queue has been reached,
-   * i.e. the number of elements stored in the queue equals its maximum size.
-   *
-   * @return {@code true} if the capacity limit has been reached, {@code
-   * false} otherwise
-   * @since 4.1
-   */
-  public boolean isAtFullCapacity() {
-    return size() == maxElements;
-  }
-
-  /**
-   * Clears this queue.
-   */
-  @Override
-  public void clear() {
-    full = false;
-    start = 0;
-    end = 0;
-    Arrays.fill( elements, null );
-  }
-
-  /**
-   * Adds the given element to this queue. If the queue is full, the least
-   * recently added element is discarded so that a new element can be
-   * inserted.
-   *
-   * @param element the element to add
-   * @return true, always
-   * @throws NullPointerException if the given element is null
-   */
-  @Override
-  public boolean add( final E element ) {
-    Objects.requireNonNull( element, "element" );
-
-    if( isAtFullCapacity() ) {
-      // Evict the oldest element to make room.
-      remove();
-    }
-
-    elements[ end++ ] = element;
-
-    if( end >= maxElements ) {
-      end = 0;
-    }
-
-    if( end == start ) {
-      full = true;
-    }
-
-    return true;
-  }
-
-  /**
-   * Returns the element at the specified position in this queue.
-   *
-   * @param index the position of the element in the queue
-   * @return the element at position {@code index}
-   * @throws NoSuchElementException if the requested position is outside the
-   *                                range [0, size)
-   */
-  public E get( final int index ) {
-    final int sz = size();
-
-    if( index < 0 || index >= sz ) {
-      throw new NoSuchElementException(
-        String.format(
-          "Index %1$d is outside the available range [0, %2$d)",
-          index, sz ) );
-    }
-
-    final int idx = (start + index) % maxElements;
-    return elements[ idx ];
-  }
-
-  /**
-   * Adds the given element to this queue. If the queue is full, the least
-   * recently added element is discarded so that a new element can be
-   * inserted.
-   *
-   * @param element the element to add
-   * @return true, always
-   * @throws NullPointerException if the given element is null
-   */
-  @Override
-  public boolean offer( final E element ) {
-    return add( element );
-  }
-
-  /**
-   * Removes and returns the oldest element.
-   *
-   * @return the oldest element, or {@code null} if this queue is empty
-   */
-  @Override
-  public E poll() {
-    return isEmpty() ? null : remove();
-  }
-
-  /**
-   * Returns, without removing, the oldest element.
-   *
-   * @return the oldest element
-   * @throws NoSuchElementException if this queue is empty
-   */
-  @Override
-  public E element() {
-    if( isEmpty() ) {
-      throw new NoSuchElementException( "empty queue" );
-    }
-
-    return peek();
-  }
-
-  /**
-   * Returns, without removing, the oldest element.
-   *
-   * @return the oldest element, or {@code null} if this queue is empty
-   */
-  @Override
-  public E peek() {
-    return isEmpty() ? null : elements[ start ];
-  }
-
-  /**
-   * Removes and returns the oldest element.
-   *
-   * @return the oldest element
-   * @throws NoSuchElementException if this queue is empty
-   */
-  @Override
-  public E remove() {
-    if( isEmpty() ) {
-      throw new NoSuchElementException( "empty queue" );
-    }
-
-    final E element = elements[ start ];
-    if( null != element ) {
-      elements[ start++ ] = null;
-
-      if( start >= maxElements ) {
-        start = 0;
-      }
-      full = false;
-    }
-    return element;
-  }
-
-  /**
-   * Increments the internal index, wrapping around to zero at capacity.
-   *
-   * @param index the index to increment
-   * @return the updated index
-   */
-  private int increment( int index ) {
-    if( ++index >= maxElements ) {
-      index = 0;
-    }
-    return index;
-  }
-
-  /**
-   * Decrements the internal index, wrapping around to the last array slot
-   * below zero.
-   *
-   * @param index the index to decrement
-   * @return the updated index
-   */
-  private int decrement( int index ) {
-    if( --index < 0 ) {
-      index = maxElements - 1;
-    }
-    return index;
-  }
-
-  /**
-   * Returns an iterator over this queue's elements, oldest first. The
-   * iterator supports element removal.
-   *
-   * @return an iterator over this queue's elements
-   */
-  @Override
-  public Iterator<E> iterator() {
-    return new Iterator<>() {
-
-      private int index = start;
-      private int lastReturnedIndex = -1;
-
-      // When the queue is full, index == end before anything is returned,
-      // so the first call must be told apart from exhaustion.
-      private boolean isFirst = full;
-
-      @Override
-      public boolean hasNext() {
-        return isFirst || index != end;
-      }
-
-      @Override
-      public E next() {
-        if( !hasNext() ) {
-          throw new NoSuchElementException();
-        }
-
-        isFirst = false;
-        lastReturnedIndex = index;
-        index = increment( index );
-        return elements[ lastReturnedIndex ];
-      }
-
-      @Override
-      public void remove() {
-        if( lastReturnedIndex == -1 ) {
-          throw new IllegalStateException();
-        }
-
-        // First element can be removed quickly
-        if( lastReturnedIndex == start ) {
-          CircularFifoQueue.this.remove();
-          lastReturnedIndex = -1;
-          return;
-        }
-
-        int pos = lastReturnedIndex + 1;
-        if( start < lastReturnedIndex && pos < end ) {
-          // shift in one part
-          System.arraycopy(
-            elements, pos, elements, lastReturnedIndex, end - pos );
-        }
-        else {
-          // Other elements require us to shift the subsequent elements
-          while( pos != end ) {
-            if( pos >= maxElements ) {
-              elements[ pos - 1 ] = elements[ 0 ];
-              pos = 0;
-            }
-            else {
-              elements[ decrement( pos ) ] = elements[ pos ];
-              pos = increment( pos );
-            }
-          }
-        }
-
-        lastReturnedIndex = -1;
-        end = decrement( end );
-        elements[ end ] = null;
-        full = false;
-        index = decrement( index );
-      }
-    };
-  }
-}
lib/src/main/java/com/keenwrite/quotes/Contractions.java
-/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
-package com.keenwrite.quotes;
-
-import java.util.HashSet;
-import java.util.Locale;
-import java.util.Set;
-
-import static java.util.Collections.emptySet;
-
-/**
- * Placeholder for various types of contractions. Words are matched
- * case-insensitively against four sets: contractions that begin or end with
- * an apostrophe, each split into unambiguous and ambiguous variants.
- */
-public class Contractions {
-
-  private final Builder mBuilder;
-
-  private Contractions( final Builder builder ) {
-    assert builder != null;
-    mBuilder = builder;
-  }
-
-  /**
-   * Allows constructing a list of custom contractions. Any category left
-   * empty is populated with the built-in defaults when {@link #build()} is
-   * called.
-   */
-  @SuppressWarnings( "unused" )
-  public static class Builder {
-    private final Set<String> mBeganUnambiguous = new HashSet<>();
-    private final Set<String> mEndedUnambiguous = new HashSet<>();
-    private final Set<String> mBeganAmbiguous = new HashSet<>();
-    private final Set<String> mEndedAmbiguous = new HashSet<>();
-
-    /**
-     * Adds words that always begin with an apostrophe.
-     *
-     * @param words Words to add; lookups compare against the lowercase form
-     *              of the queried word, so these should be lowercase.
-     */
-    public void withBeganUnambiguous( final Set<String> words ) {
-      mBeganUnambiguous.addAll( words );
-    }
-
-    /**
-     * Adds words that always end with an apostrophe.
-     *
-     * @param words Words to add; lookups compare against the lowercase form
-     *              of the queried word, so these should be lowercase.
-     */
-    public void withEndedUnambiguous( final Set<String> words ) {
-      mEndedUnambiguous.addAll( words );
-    }
-
-    /**
-     * Adds words that may begin with either an apostrophe or an opening
-     * single quote.
-     *
-     * @param words Words to add; lookups compare against the lowercase form
-     *              of the queried word, so these should be lowercase.
-     */
-    public void withBeganAmbiguous( final Set<String> words ) {
-      mBeganAmbiguous.addAll( words );
-    }
-
-    /**
-     * Adds words that may end with either an apostrophe or a closing
-     * single quote.
-     *
-     * @param words Words to add; lookups compare against the lowercase form
-     *              of the queried word, so these should be lowercase.
-     */
-    public void withEndedAmbiguous( final Set<String> words ) {
-      mEndedAmbiguous.addAll( words );
-    }
-
-    /**
-     * Constructs a new set of {@link Contractions} that can be configured
-     * using this {@link Builder} instance.
-     *
-     * @return {@link Contractions} suitable for use with parsing text.
-     */
-    public Contractions build() {
-      mBeganUnambiguous.addAll( from( mBeganUnambiguous, BEGAN_UNAMBIGUOUS ) );
-      mEndedUnambiguous.addAll( from( mEndedUnambiguous, ENDED_UNAMBIGUOUS ) );
-      mBeganAmbiguous.addAll( from( mBeganAmbiguous, BEGAN_AMBIGUOUS ) );
-      mEndedAmbiguous.addAll( from( mEndedAmbiguous, ENDED_AMBIGUOUS ) );
-
-      return new Contractions( this );
-    }
-
-    /**
-     * This returns the {@code fallback} {@link Set} if {@code src} is empty;
-     * otherwise, this returns the empty {@link Set}.
-     *
-     * @param src      A set of contractions, possibly empty.
-     * @param fallback The default values to use if {@code src} is empty.
-     * @param <T>      The type of data used by both {@link Set}s.
-     * @return An empty {@link Set} if the {@code src} contains at least one
-     * element; otherwise, this will return {@code fallback}.
-     */
-    private static <T> Set<T> from( final Set<T> src, final Set<T> fallback ) {
-      assert src != null;
-      assert fallback != null;
-      return src.isEmpty() ? fallback : emptySet();
-    }
-  }
-
-  /**
-   * Answers whether the given word is a contraction that always starts
-   * with an apostrophe. The comparison is case insensitive. This must
-   * only be called when a straight quote is followed by a word.
-   *
-   * @param word The word to compare against the list of known unambiguous
-   *             contractions.
-   * @return {@code true} when the given word is in the set of unambiguous
-   * contractions.
-   */
-  public boolean beganUnambiguously( final String word ) {
-    assert word != null;
-    return getBeganUnambiguous().contains( normalize( word ) );
-  }
-
-  /**
-   * Answers whether the given word could be a contraction but is also a
-   * valid word in non-contracted form.
-   *
-   * @param word The word to compare against the list of known ambiguous
-   *             contractions.
-   * @return {@code true} when the given word is in the set of ambiguous
-   * contractions.
-   */
-  public boolean beganAmbiguously( final String word ) {
-    assert word != null;
-    return getBeganAmbiguous().contains( normalize( word ) );
-  }
-
-  /**
-   * Answers whether the given word is a contraction that always ends with
-   * an apostrophe. The comparison is case insensitive.
-   *
-   * @param word The word to compare against the list of known unambiguous
-   *             contractions.
-   * @return {@code true} when the given word is in the set of unambiguous
-   * contractions.
-   */
-  public boolean endedUnambiguously( final String word ) {
-    assert word != null;
-    return getEndedUnambiguous().contains( normalize( word ) );
-  }
-
-  /**
-   * Answers whether the given word could end with an apostrophe but may
-   * also stand alone before a closing single quote. Besides the known
-   * ambiguous words, any word ending in s, z, x, or (when longer than one
-   * letter) n is considered ambiguous.
-   *
-   * @param word The word to compare against the list of known ambiguous
-   *             contractions.
-   * @return {@code true} when the given word is ambiguous.
-   */
-  public boolean endedAmbiguously( final String word ) {
-    assert word != null;
-    final var check = normalize( word );
-
-    // Ensure that 'n' isn't matched for ambiguity by enforcing length, yet
-    // allow o' to match because 'a sentence can end with the letter o'.
-    return getEndedAmbiguous().contains( check ) ||
-      check.endsWith( "s" ) || check.endsWith( "z" ) ||
-      check.endsWith( "x" ) || (check.length() > 1 && check.endsWith( "n" ));
-  }
-
-  /**
-   * Lower-cases a word independently of the default locale so that lookups
-   * behave the same everywhere (e.g., "I" must not become a dotless i).
-   *
-   * @param word The word to normalize.
-   * @return The word in lowercase.
-   */
-  private static String normalize( final String word ) {
-    return word.toLowerCase( Locale.ROOT );
-  }
-
-  private Set<String> getBeganUnambiguous() {
-    return mBuilder.mBeganUnambiguous;
-  }
-
-  private Set<String> getEndedUnambiguous() {
-    return mBuilder.mEndedUnambiguous;
-  }
-
-  private Set<String> getBeganAmbiguous() {
-    return mBuilder.mBeganAmbiguous;
-  }
-
-  private Set<String> getEndedAmbiguous() {
-    return mBuilder.mEndedAmbiguous;
-  }
-
-  /**
-   * Words having a straight apostrophe that cannot be mistaken for an
-   * opening single quote.
-   */
-  private static final Set<String> BEGAN_UNAMBIGUOUS = Set.of(
-    "aporth",
-    "boutcha",
-    "boutchu",
-    "cept",
-    "dillo",
-    "em",
-    "fraid",
-    "gainst",
-    "n",
-    "neath",
-    "nother",
-    "onna",
-    "onna'",
-    "pon",
-    "s",
-    "sblood",
-    "scuse",
-    "sfar",
-    "sfoot",
-    "t",
-    "taint",
-    "tain",
-    "til",
-    "tis",
-    "tisn",
-    "tshall",
-    "twas",
-    "twasn",
-    "tween",
-    "twere",
-    "tweren",
-    "twixt",
-    "twon",
-    "twou",
-    "twould",
-    "twouldn",
-    "ve"
-  );
-
-  /**
-   * Words having a straight apostrophe that may be either part of a
-   * contraction or a word that stands alone beside an opening single quote.
-   */
-  private static final Set<String> BEGAN_AMBIGUOUS = Set.of(
-    // about|boxing match
-    "bout",
-    // because|causal
-    "cause",
-    // what you|choo choo train
-    "choo",
-    // he|e pluribus unum
-    "e",
-    // here|earlier
-    "ere",
-    // afro|to and fro
-    "fro",
-    // whore|ho ho!
-    "ho",
-    // okay|letter K
-    "kay",
-    // lo|lo and behold
-    "lo",
-    // are|regarding
-    "re",
-    // what's up|to sup
-    "sup",
-    // it will|twill fabric
-    "twill",
-    // them|utterance
-    "um",
-    // is that|Iranian village
-    "zat"
-  );
-
-  /**
-   * Words that may end with either an apostrophe or stand alone before a
-   * closing single quote.
-   */
-  private static final Set<String> ENDED_AMBIGUOUS = Set.of(
-    // give|martial arts garment
-    "gi",
-    // in|I
-    "i",
-    // of|letter o
-    "o"
-  );
-
-  /**
-   * Words that end with a straight apostrophe that cannot be mistaken for a
-   * closing single quote. Entries must be lowercase because lookups compare
-   * against the lowercase form of the queried word.
-   */
-  private static final Set<String> ENDED_UNAMBIGUOUS = Set.of(
-    // and
-    "an",
-    // for/before
-    "fo",
-    // friend
-    "frien",
-    // just
-    "jus",
-    // lord
-    "lor",
-    // myself
-    "masel",
-    // old
-    "ol",
-    // San (Francisco)
-    "sa",
-    // shift
-    "shif",
-    // the
-    "th",
-    // what
-    "wha",
-    // world
-    "worl",
-    // Top ~500 common -ing words as English contractions.
-    "acceptin",
-    "accompanyin",
-    "accordin",
-    "accountin",
-    "achievin",
-    "acquirin",
-    "actin",
-    "addin",
-    "addressin",
-    "adjoinin",
-    "adoptin",
-    "advancin",
-    "advertisin",
-    "affectin",
-    "agin",
-    "allowin",
-    "amazin",
-    "analyzin",
-    "answerin",
-    "anythin",
-    "appearin",
-    "applyin",
-    "approachin",
-    "arguin",
-    "arisin",
-    "arrivin",
-    "askin",
-    "assessin",
-    "assumin",
-    "attackin",
-    "attemptin",
-    "attendin",
-    "avoidin",
-    "bankin",
-    "bargainin",
-    "bearin",
-    "beatin",
-    "becomin",
-    "beginnin",
-    "bein",
-    "believin",
-    "belongin",
-    "bendin",
-    "bindin",
-    "bleedin",
-    "blessin",
-    "blowin",
-    "boilin",
-    "borrowin",
-    "breakin",
-    "breathin",
-    "breedin",
-    "bringin",
-    "broadcastin",
-    "buildin",
-    "burnin",
-    "buyin",
-    "calculatin",
-    "callin",
-    "carryin",
-    "castin",
-    "causin",
-    "ceilin",
-    "challengin",
-    "changin",
-    "checkin",
-    "choosin",
-    "claimin",
-    "cleanin",
-    "clearin",
-    "climbin",
-    "closin",
-    "clothin",
-    "collectin",
-    "combinin",
-    "comin",
-    "commandin",
-    "comparin",
-    "compellin",
-    "competin",
-    "computin",
-    "concernin",
-    "concludin",
-    "conditionin",
-    "conductin",
-    "conflictin",
-    "connectin",
-    "considerin",
-    "consistin",
-    "constructin",
-    "consultin",
-    "consumin",
-    "containin",
-    "continuin",
-    "contractin",
-    "contributin",
-    "controllin",
-    "convincin",
-    "cookin",
-    "coolin",
-    "copin",
-    "correspondin",
-    "counselin",
-    "countin",
-    "couplin",
-    "coverin",
-    "creatin",
-    "crossin",
-    "cryin",
-    "cuttin",
-    "dancin",
-    "darlin",
-    "datin",
-    "dealin",
-    "decidin",
-    "declarin",
-    "declinin",
-    "decreasin",
-    "definin",
-    "demandin",
-    "denyin",
-    "dependin",
-    "descendin",
-    "describin",
-    "designin",
-    "destroyin",
-    "determinin",
-    "developin",
-    "differin",
-    "dinin",
-    "directin",
-    "discussin",
-    "distinguishin",
-    "disturbin",
-    "dividin",
-    "doin",
-    "drawin",
-    "dressin",
-    "drinkin",
-    "drivin",
-    "droppin",
-    "dryin",
-    "durin",
-    "dwellin",
-    "dyin",
-    "eatin",
-    "editin",
-    "emergin",
-    "employin",
-    "enablin",
-    "encouragin",
-    "endin",
-    "engagin",
-    "engineerin",
-    "enjoyin",
-    "enterin",
-    "establishin",
-    "evaluatin",
-    "evenin",
-    "everythin",
-    "examinin",
-    "exceedin",
-    "excitin",
-    "excludin",
-    "existin",
-    "expandin",
-    "expectin",
-    "experiencin",
-    "explainin",
-    "explorin",
-    "expressin",
-    "extendin",
-    "facin",
-    "failin",
-    "fallin",
-    "farmin",
-    "fascinatin",
-    "feedin",
-    "feelin",
-    "fightin",
-    "filin",
-    "fillin",
-    "financin",
-    "findin",
-    "firin",
-    "fishin",
-    "fittin",
-    "fixin",
-    "floatin",
-    "flowin",
-    "flyin",
-    "focusin",
-    "followin",
-    "forcin",
-    "foregoin",
-    "formin",
-    "forthcomin",
-    "foundin",
-    "freezin",
-    "fuckin",
-    "functionin",
-    "fundin",
-    "gainin",
-    "gatherin",
-    "generatin",
-    "gettin",
-    "givin",
-    "goin",
-    "governin",
-    "grantin",
-    "growin",
-    "hackin",
-    "handlin",
-    "hangin",
-    "happenin",
-    "havin",
-    "headin",
-    "healin",
-    "hearin",
-    "heatin",
-    "helpin",
-    "hidin",
-    "holdin",
-    "hopin",
-    "housin",
-    "huntin",
-    "identifyin",
-    "imagin",
-    "implementin",
-    "imposin",
-    "improvin",
-    "includin",
-    "increasin",
-    "indicatin",
-    "interestin",
-    "interpretin",
-    "introducin",
-    "involvin",
-    "joinin",
-    "judgin",
-    "keepin",
-    "killin",
-    "knowin",
-    "lackin",
-    "landin",
-    "lastin",
-    "laughin",
-    "layin",
-    "leadin",
-    "leanin",
-    "learnin",
-    "leavin",
-    "lettin",
-    "liftin",
-    "lightin",
-    "lightnin",
-    "limitin",
-    "listenin",
-    "listin",
-    "livin",
-    "loadin",
-    "lookin",
-    "losin",
-    "lovin",
-    "lowerin",
-    "lyin",
-    "maintainin",
-    "makin",
-    "managin",
-    "manufacturin",
-    "mappin",
-    "marketin",
-    "markin",
-    "matchin",
-    "meanin",
-    "measurin",
-    "meetin",
-    "meltin",
-    "minin",
-    "misleadin",
-    "missin",
-    "mixin",
-    "modelin",
-    "monitorin",
-    "mornin",
-    "movin",
-    "neighborin",
-    "nothin",
-    "notin",
-    "notwithstandin",
-    "nursin",
-    "observin",
-    "obtainin",
-    "occurrin",
-    "offerin",
-    "offsprin",
-    "ongoin",
-    "openin",
-    "operatin",
-    "opposin",
-    "orderin",
-    "organizin",
-    "outstandin",
-    "overwhelmin",
-    "packin",
-    "paintin",
-    "parkin",
-    "participatin",
-    "passin",
-    "payin",
-    "pendin",
-    "performin",
-    "pickin",
-    "pissin",
-    "placin",
-    "plannin",
-    "plantin",
-    "playin",
-    "pleasin",
-    "pointin",
-    "possessin",
-    "preachin",
-    "precedin",
-    "preparin",
-    "presentin",
-    "preservin",
-    "pressin",
-    "prevailin",
-    "preventin",
-    "pricin",
-    "printin",
-    "proceedin",
-    "processin",
-    "producin",
-    "programmin",
-    "promisin",
-    "promotin",
-    "protectin",
-    "providin",
-    "provin",
-    "publishin",
-    "pullin",
-    "purchasin",
-    "pursuin",
-    "pushin",
-    "puttin",
-    "questionin",
-    "rangin",
-    "ratin",
-    "reachin",
-    "readin",
-    "reasonin",
-    "receivin",
-    "recognizin",
-    "recordin",
-    "reducin",
-    "referrin",
-    "reflectin",
-    "refusin",
-    "regardin",
-    "regulatin",
-    "relatin",
-    "remainin",
-    "rememberin",
-    "removin",
-    "renderin",
-    "repeatin",
-    "replacin",
-    "reportin",
-    "representin",
-    "requirin",
-    "respectin",
-    "respondin",
-    "restin",
-    "resultin",
-    "returnin",
-    "revealin",
-    "ridin",
-    "risin",
-    "rulin",
-    "runnin",
-    "sailin",
-    "samplin",
-    "satisfyin",
-    "savin",
-    "sayin",
-    "scatterin",
-    "schoolin",
-    "screenin",
-    "searchin",
-    "securin",
-    "seein",
-    "seekin",
-    "selectin",
-    "sellin",
-    "sendin",
-    "separatin",
-    "servin",
-    "settin",
-    "settlin",
-    "sewin",
-    "shakin",
-    "shapin",
-    "sharin",
-    "shiftin",
-    "shinin",
-    "shippin",
-    "shittin",
-    "shootin",
-    "shoppin",
-    "showin",
-    "singin",
-    "sinkin",
-    "sittin",
-    "sleepin",
-    "smilin",
-    "smokin",
-    "spankin",
-    "solvin",
-    "somethin",
-    "speakin",
-    "spellin",
-    "spendin",
-    "spinnin",
-    "spittin",
-    "spreadin",
-    "standin",
-    "starin",
-    "startin",
-    "statin",
-    "stayin",
-    "stealin",
-    "sterlin",
-    "stimulatin",
-    "stirrin",
-    "stoppin",
-    "strengthenin",
-    "stretchin",
-    "strikin",
-    "strugglin",
-    "studyin",
-    "succeedin",
-    "sufferin",
-    "suggestin",
-    "supplyin",
-    "supportin",
-    "surprisin",
-    "surroundin",
-    "survivin",
-    "sweepin",
-    "swellin",
-    "swimmin",
-    "switchin",
-    "takin",
-    "talkin",
-    "teachin",
-    "tellin",
-    "testin",
-    "thinkin",
-    "threatenin",
-    "throwin",
-    "timin",
-    "touchin",
-    "tradin",
-    "trainin",
-    "travelin",
-    "treatin",
-    "tremblin",
-    "tryin",
-    "turnin",
-    "underlyin",
-    "understandin",
-    "undertakin",
-    "unwillin",
-    "usin",
-    "varyin",
-    "viewin",
-    "visitin",
-    "votin",
-    "waitin",
-    "walkin",
-    "wanderin",
-    "wantin",
-    "warnin",
-    "washin",
-    "watchin",
-    "wearin",
-    "weddin",
-    "whackin",
-    "willin",
-    "windin",
-    "winnin",
-    "wishin",
-    "wonderin",
-    "workin",
-    "writin",
-    "yieldin"
-  );
-}
lib/src/main/java/com/keenwrite/quotes/KeenQuotes.java
-/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
-package com.keenwrite.quotes;
-
-import java.util.ArrayList;
-import java.util.Map;
-import java.util.function.Consumer;
-
-import static com.keenwrite.quotes.TokenType.*;
-import static java.util.Collections.sort;
-
-/**
- * Swaps each {@link Token} found in a text for its smart quote (or straight
- * quote) entity. Quotation marks whose meaning cannot be determined are
- * reported back to the caller rather than converted.
- */
-public final class KeenQuotes {
-  /**
-   * Maps each kind of token onto the text that replaces it in the output.
-   */
-  private static final Map<TokenType, String> REPLACEMENTS = Map.ofEntries(
-    Map.entry( QUOTE_OPENING_SINGLE, "&lsquo;" ),
-    Map.entry( QUOTE_CLOSING_SINGLE, "&rsquo;" ),
-    Map.entry( QUOTE_OPENING_DOUBLE, "&ldquo;" ),
-    Map.entry( QUOTE_CLOSING_DOUBLE, "&rdquo;" ),
-    Map.entry( QUOTE_STRAIGHT_SINGLE, "'" ),
-    Map.entry( QUOTE_STRAIGHT_DOUBLE, "\"" ),
-    Map.entry( QUOTE_APOSTROPHE, "&apos;" ),
-    Map.entry( QUOTE_PRIME_SINGLE, "&prime;" ),
-    Map.entry( QUOTE_PRIME_DOUBLE, "&Prime;" )
-  );
-
-  /**
-   * Replaces straight quotes in the given text with curly quotes and primes
-   * wherever the parser can resolve them. Lexemes that remain ambiguous are
-   * handed to the {@link Consumer}.
-   *
-   * @param text       The text to parse.
-   * @param unresolved Recipient for ambiguous {@link Lexeme}s.
-   * @return A copy of the text having each resolved quotation mark replaced.
-   */
-  public static String convert(
-    final String text, final Consumer<Lexeme> unresolved ) {
-    final var tokens = new ArrayList<Token>();
-
-    // Collect every resolved token; ambiguous lexemes go to the caller.
-    new Parser( text ).parse( tokens::add, unresolved );
-
-    // Tokens can arrive out of order, so arrange them by position first.
-    sort( tokens );
-
-    final var sb = new StringBuilder( text.length() );
-    var offset = 0;
-
-    for( final var token : tokens ) {
-      final var began = token.began();
-
-      // Only emit tokens that start at or after the text consumed so far.
-      if( began >= offset ) {
-        sb.append( text, offset, began )
-          .append( REPLACEMENTS.get( token.getType() ) );
-      }
-
-      offset = token.ended();
-    }
-
-    // Copy whatever text follows the final token.
-    sb.append( text.substring( offset ) );
-
-    return sb.toString();
-  }
-}
lib/src/main/java/com/keenwrite/quotes/Lexeme.java
-/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
-package com.keenwrite.quotes;
-
-import static com.keenwrite.quotes.LexemeType.FLAG;
-
-/**
- * Responsible for tracking the beginning and ending offsets of a lexeme within
- * a text string. Tracking the beginning and ending indices should use less
- * memory than duplicating the entire text of Unicode characters (i.e., using
- * a similar approach to run-length encoding).
- */
-public final class Lexeme implements Comparable<Lexeme> {
-  /**
-   * Signifies an invalid index to help distinguish EOT/SOT.
-   */
-  private static final int E_INDEX = -2;
-
-  /**
-   * Denotes there are no more lexemes: the end of text (EOT) has been reached.
-   * The beginning index differentiates between EOT and SOT.
-   */
-  public static final Lexeme EOT = new Lexeme( FLAG, -1, E_INDEX );
-
-  /**
-   * Denotes parsing at the start of text (SOT). This is useful to avoid
-   * branching conditions while iterating. The beginning index differentiates
-   * between EOT and SOT.
-   */
-  public static final Lexeme SOT = new Lexeme( FLAG, 0, E_INDEX );
-
-  private final LexemeType mType;
-
-  /**
-   * Offset of the first character, inclusive.
-   */
-  private final int mBegan;
-
-  /**
-   * Offset following the last character, exclusive.
-   */
-  private final int mEnded;
-
-  /**
-   * Create a lexeme that represents a section of the text.
-   *
-   * @param type  Type of {@link Lexeme} to create.
-   * @param began Offset into the text where this instance starts (0-based).
-   * @param ended Offset into the text where this instance stops (0-based),
-   *              inclusive.
-   */
-  private Lexeme( final LexemeType type, final int began, final int ended ) {
-    assert type != null;
-    assert began >= 0 || ended == E_INDEX;
-    assert ended >= began || ended == E_INDEX;
-
-    mType = type;
-    mBegan = began;
-
-    // Store the end as an exclusive offset, which suits String.substring.
-    mEnded = ended + 1;
-  }
-
-  /**
-   * Extracts a sequence of characters from the given text at the offsets
-   * captured by this lexeme.
-   *
-   * @param text The text that was parsed using this class.
-   * @return The character string captured by the lexeme.
-   */
-  public String toString( final String text ) {
-    assert text != null;
-    return text.substring( mBegan, mEnded );
-  }
-
-  @Override
-  public String toString() {
-    return getClass().getSimpleName() + '{' +
-      "mType=" + mType +
-      ", mBegan=" + mBegan +
-      ", mEnded=" + mEnded +
-      '}';
-  }
-
-  /**
-   * Answers whether the given {@link LexemeType} is the same as this
-   * instance's internal {@link LexemeType}.
-   *
-   * @param type The {@link LexemeType} to compare.
-   * @return {@code true} if the given {@link LexemeType} is equal to the
-   * internal {@link LexemeType}.
-   */
-  public boolean isType( final LexemeType type ) {
-    assert type != null;
-    return mType == type;
-  }
-
-  /**
-   * Answers whether any of the given {@link LexemeType} matches this
-   * instance's internal {@link LexemeType}.
-   *
-   * @param types The {@link LexemeType}s to compare.
-   * @return {@code true} if the internal {@link LexemeType} matches any one
-   * of the given {@link LexemeType}s.
-   */
-  public boolean anyType( final LexemeType... types ) {
-    assert types != null;
-
-    for( final var type : types ) {
-      if( mType == type ) {
-        return true;
-      }
-    }
-
-    return false;
-  }
-
-  /**
-   * Returns the offset where this lexeme starts.
-   *
-   * @return The 0-based offset of the first character, inclusive.
-   */
-  public int began() {
-    return mBegan;
-  }
-
-  /**
-   * Returns the offset where this lexeme stops.
-   *
-   * @return The 0-based offset following the last character, exclusive.
-   */
-  public int ended() {
-    return mEnded;
-  }
-
-  /**
-   * Answers whether this lexeme begins at offset zero.
-   * NOTE(review): this is true for {@link #SOT} and also for any lexeme
-   * that starts at the first character of the text; confirm callers expect
-   * that.
-   *
-   * @return {@code true} when the beginning offset is 0.
-   */
-  boolean isSot() {
-    return mBegan == 0;
-  }
-
-  /**
-   * Answers whether this lexeme has the beginning offset used by
-   * {@link #EOT}.
-   *
-   * @return {@code true} when the beginning offset is -1.
-   */
-  boolean isEot() {
-    return mBegan == -1;
-  }
-
-  LexemeType getType() {
-    return mType;
-  }
-
-  /**
-   * Compares the starting offset of the given {@link Lexeme} to the starting
-   * offset of this {@link Lexeme} instance. This allows a list {@link Lexeme}s
-   * to be sorted by order of appearance in the parsed text.
-   *
-   * @param that The {@link Lexeme} to compare against.
-   * @return A negative, zero, or positive value when this instance begins
-   * before, at, or after the given instance, respectively.
-   */
-  @Override
-  public int compareTo( final Lexeme that ) {
-    assert that != null;
-    return Integer.compare( this.mBegan, that.mBegan );
-  }
-
-  /**
-   * Creates a new {@link Lexeme} for a section of the text.
-   *
-   * @param lexeme Type of {@link Lexeme} to create.
-   * @param began  Offset of the first character (0-based), inclusive.
-   * @param ended  Offset of the last character (0-based), inclusive.
-   * @return A new instance that spans the given offsets.
-   */
-  static Lexeme createLexeme(
-    final LexemeType lexeme, final int began, final int ended ) {
-    assert lexeme != null;
-    return new Lexeme( lexeme, began, ended );
-  }
-}
lib/src/main/java/com/keenwrite/quotes/LexemeType.java
-/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
-package com.keenwrite.quotes;
-
-/**
- * Represents the type of a {@link Lexeme} parsed by the {@link Lexer}.
- */
-public enum LexemeType {
- /** Straight single quote ({@code '}). */
- QUOTE_SINGLE,
- /** Curly opening single quote. */
- QUOTE_SINGLE_OPENING,
- /** Curly closing single quote. */
- QUOTE_SINGLE_CLOSING,
- /** Straight double quote ({@code "}). */
- QUOTE_DOUBLE,
- /** Backslash followed by a straight single quote. */
- ESC_SINGLE,
- /** Backslash followed by a straight double quote. */
- ESC_DOUBLE,
- /**
-  * Run of line terminators made of a single CR or LF, or of exactly one CR
-  * and one LF.
-  */
- EOL,
- /**
-  * Any other run of consecutive CR/LF characters (presumably the end of a
-  * paragraph).
-  */
- EOP,
- /** A space or other whitespace character that is not a CR or LF. */
- SPACE,
- /**
-  * Run of letters; underscores and asterisks count as letters. May
-  * continue into a run of digits that follows it directly.
-  */
- WORD,
- /** Run of digits and numeric characters that does not follow letters. */
- NUMBER,
- /** Any character not covered by another type. */
- PUNCT,
- /** One of {@code (}, {@code {}, or {@code [}. */
- OPENING_GROUP,
- /** One of {@code )}, {@code }}, or {@code ]}. */
- CLOSING_GROUP,
- /** A hyphen-minus that is not followed by another hyphen-minus. */
- HYPHEN,
- /** An em dash, or a hyphen-minus followed by another hyphen-minus. */
- DASH,
- /** Equals sign. */
- EQUALS,
- /** A period that is not followed by another period. */
- PERIOD,
- /** Consecutive periods. */
- ELLIPSIS,
- /** Marker type used by the start-of-text and end-of-text sentinels. */
- FLAG
-}
lib/src/main/java/com/keenwrite/quotes/Lexer.java
-/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
-package com.keenwrite.quotes;
-
-import java.text.CharacterIterator;
-import java.text.StringCharacterIterator;
-import java.util.function.BiFunction;
-
-import static com.keenwrite.quotes.Lexeme.createLexeme;
-import static com.keenwrite.quotes.LexemeType.*;
-import static java.lang.Character.isDigit;
-import static java.lang.Character.isWhitespace;
-import static java.text.CharacterIterator.DONE;
-
-/**
- * Turns text into words, numbers, punctuation, spaces, and more.
- */
-public class Lexer {
- /**
- * Iterates over the entire string of text to help produce lexemes.
- */
- private final CharacterIterator mIterator;
-
- /**
- * Default constructor, no state.
- */
- public Lexer( final String text ) {
- mIterator = new StringCharacterIterator( text );
- }
-
- public Lexeme next() {
- return parse( mIterator );
- }
-
- /**
- * Tokenizes a sequence of characters. The order of comparisons is optimized
- * towards probability of the occurrence of a character in regular English
- * prose: letters, space, quotation marks, numbers, periods, new lines,
- * then end of text.
- *
- * @param i The sequence of characters to tokenize.
- * @return The next token in the sequence.
- */
- private Lexeme parse( final CharacterIterator i ) {
- // Iterator position on entry; handed to createLexeme as the beginning of
- // whichever lexeme this call produces.
- int began = i.getIndex();
- // NOTE(review): only the letter branch sets this flag, and that branch
- // appears to always complete the word itself; confirm whether the WORD
- // alternative in the digit branch below is reachable.
- boolean isWord = false;
- Lexeme lexeme = null;
-
- // Loop until a branch creates a lexeme; branches that leave it null (more
- // letters to come, an unrecognized escape) move on to the next character.
- do {
- final var curr = i.current();
-
- if( isLetter( curr ) ) {
- isWord = true;
-
- // The word is complete once the following character is not a letter.
- if( !isLetter( peek( i ) ) ) {
- lexeme = createLexeme( WORD, began, i.getIndex() );
- }
- }
- else if( curr == ' ' ) {
- lexeme = createLexeme( SPACE, began, i.getIndex() );
- }
- else if( curr == '\'' ) {
- lexeme = createLexeme( QUOTE_SINGLE, began, i.getIndex() );
- }
- else if( curr == '"' ) {
- lexeme = createLexeme( QUOTE_DOUBLE, began, i.getIndex() );
- }
- else if( curr == '‘') {
- lexeme = createLexeme( QUOTE_SINGLE_OPENING, began, i.getIndex() );
- }
- else if( curr == '’') {
- lexeme = createLexeme( QUOTE_SINGLE_CLOSING, began, i.getIndex() );
- }
- else if( curr == '-' && peek( i ) == '-' || curr == '—' ) {
- // Two hyphens or an em-dash start a dash; swallow every hyphen and
- // em-dash that follows so the run becomes a single lexeme.
- slurp( i, ( next, ci ) -> next == '-' || next == '—' );
-
- lexeme = createLexeme( DASH, began, i.getIndex() );
- }
- else if( isDigit( curr ) || isNumeric( curr ) && isDigit( peek( i ) ) ) {
- // A digit, or a sign/separator (see isNumeric) immediately followed by
- // a digit, starts a number. This is checked before the lone-hyphen
- // branch so that "-1" is lexed as a number.
- // Parse all consecutive number characters to prevent the main loop
- // from switching back to word tokens.
- slurp( i, ( next, ci ) ->
- isDigit( next ) || isNumeric( next ) && isDigit( peek( ci ) )
- );
-
- lexeme = createLexeme( isWord ? WORD : NUMBER, began, i.getIndex() );
- }
- else if( curr == '-' ) {
- // A lone hyphen: the dash and signed-number cases were ruled out above.
- lexeme = createLexeme( HYPHEN, began, i.getIndex() );
- }
- else if( curr == '.' ) {
- // Parse all consecutive periods into an ellipsis lexeme. This will
- // not capture space-separated ellipsis (such as ". . .").
- lexeme = createLexeme(
- slurp( i, ( next, ci ) -> next == '.' ) == 0 ? PERIOD : ELLIPSIS,
- began, i.getIndex()
- );
- }
- else if( curr == '\r' || curr == '\n' ) {
- // Single-element arrays allow the lambda below to update the tallies
- // (a lambda may only capture effectively final locals).
- final var cr = new int[]{curr == '\r' ? 1 : 0};
- final var lf = new int[]{curr == '\n' ? 1 : 0};
-
- // Swallow all consecutive CR (Mac), CRLF (Windows), and/or LF (Unix).
- slurp(
- i, ( next, ci ) -> {
- cr[ 0 ] += next == '\r' ? 1 : 0;
- lf[ 0 ] += next == '\n' ? 1 : 0;
- return next == '\r' || next == '\n';
- }
- );
-
- // A single line break (one CR, one LF, or one of each) ends a line;
- // any longer run of line breaks ends a paragraph.
- final var eol = cr[ 0 ] + lf[ 0 ] == 1 || cr[ 0 ] == 1 && lf[ 0 ] == 1;
- lexeme = createLexeme( eol ? EOL : EOP, began, i.getIndex() );
- }
- else if( isWhitespace( curr ) ) {
- // Remaining whitespace (spaces, CR, and LF were handled above) is
- // lexed as a space.
- lexeme = createLexeme( SPACE, began, i.getIndex() );
- }
- else if( curr == '\\' ) {
- // Advance to the character that follows the backslash.
- final var next = i.next();
-
- if( next == '\'' ) {
- lexeme = createLexeme( ESC_SINGLE, began, i.getIndex() );
- }
- else if( next == '\"' ) {
- lexeme = createLexeme( ESC_DOUBLE, began, i.getIndex() );
- }
- else {
- // Push back the escaped character, which wasn't a straight quote.
- // No lexeme is created, so the loop continues with that character
- // while began still refers to the backslash.
- i.previous();
- }
- }
- else if( curr == '(' || curr == '{' || curr == '[' ) {
- lexeme = createLexeme( OPENING_GROUP, began, i.getIndex() );
- }
- else if( curr == ')' || curr == '}' || curr == ']' ) {
- lexeme = createLexeme( CLOSING_GROUP, began, i.getIndex() );
- }
- else if( curr == '=' ) {
- lexeme = createLexeme( EQUALS, began, i.getIndex() );
- }
- else if( curr != DONE ) {
- // Any other character is treated as punctuation.
- lexeme = createLexeme( PUNCT, began, i.getIndex() );
- }
- else {
- // The iterator reported DONE: there is no more text.
- lexeme = Lexeme.EOT;
- }
-
- // Step past the last character examined so that the next iteration (or
- // the next call) starts on a fresh character.
- i.next();
- }
- while( lexeme == null );
-
- return lexeme;
- }
-
-  /**
-   * Determines whether a character may appear inside a word. Besides
-   * alphabetic characters, {@code _} and {@code *} qualify because plain
-   * text formats commonly wrap words in them for emphasis.
-   *
-   * @param curr The character to classify.
-   * @return {@code true} when the character is a letter or one of the two
-   * emphasis markers.
-   */
-  private static boolean isLetter( final char curr ) {
-    if( curr == '_' || curr == '*' ) {
-      return true;
-    }
-
-    return Character.isLetter( curr );
-  }
-
-  /**
-   * Determines whether a character is one of the non-digit symbols that may
-   * appear within a number: a period, comma, minus sign, plus sign, or
-   * caret. Digits themselves are tested separately by the caller.
-   *
-   * @param curr The character to classify.
-   * @return {@code true} when the character may belong to a number
-   * (e.g., -2,000.2^2 is considered a single number).
-   */
-  private static boolean isNumeric( final char curr ) {
-    return ".,-+^".indexOf( curr ) >= 0;
-  }
-
-  /**
-   * Reads the character after the iterator's current position by stepping
-   * forward once and then stepping back again.
-   *
-   * @param ci The iterator to look ahead in.
-   * @return The character returned by advancing the iterator one position.
-   */
-  private static char peek( final CharacterIterator ci ) {
-    final char upcoming = ci.next();
-
-    // Undo the look-ahead step.
-    ci.previous();
-
-    return upcoming;
-  }
-
-  /**
-   * Advances the iterator for as long as the given function accepts the
-   * character that follows, leaving the iterator on the last accepted
-   * character (or where it started when none was accepted).
-   *
-   * @param ci The iterator containing characters to parse.
-   * @param f Returns {@code true} to keep consuming; the first
-   * {@code false} stops the slurp.
-   * @return The number of characters parsed.
-   */
-  private static int slurp(
-    final CharacterIterator ci,
-    final BiFunction<Character, CharacterIterator, Boolean> f ) {
-    int accepted = 0;
-
-    while( f.apply( ci.next(), ci ) ) {
-      accepted++;
-    }
-
-    // The rejected character was already stepped onto; back up over it.
-    ci.previous();
-
-    return accepted;
-  }
-}
lib/src/main/java/com/keenwrite/quotes/Parser.java
-/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
-package com.keenwrite.quotes;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.function.Consumer;
-
-import static com.keenwrite.quotes.Lexeme.EOT;
-import static com.keenwrite.quotes.Lexeme.SOT;
-import static com.keenwrite.quotes.LexemeType.*;
-import static com.keenwrite.quotes.TokenType.*;
-
-/**
- * Converts straight double/single quotes and apostrophes to curly equivalents.
- */
-public final class Parser {
- /**
- * Single quotes preceded by these {@link LexemeType}s may be opening quotes.
- */
- private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE =
- new LexemeType[]{SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP};
-
- /**
- * Single quotes succeeded by these {@link LexemeType}s may be opening quotes.
- */
- private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE =
- new LexemeType[]{WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE};
-
- /**
- * Single quotes preceded by these {@link LexemeType}s may be closing quotes.
- */
- private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE =
- new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE};
-
- /**
- * Single quotes succeeded by these {@link LexemeType}s may be closing quotes.
- */
- private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE =
- new LexemeType[]{SPACE, HYPHEN, DASH,
- QUOTE_DOUBLE, CLOSING_GROUP, EOL, EOP};
-
- /**
- * Double quotes preceded by these {@link LexemeType}s may be opening quotes.
- */
- private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE =
- new LexemeType[]{SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL, EOP};
-
- /**
- * Double quotes succeeded by these {@link LexemeType}s may be opening quotes.
- */
- private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE =
- new LexemeType[]{WORD, NUMBER, ELLIPSIS,
- QUOTE_SINGLE, QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING, QUOTE_DOUBLE};
-
- /**
- * Double quotes preceded by these {@link LexemeType}s may be closing quotes.
- */
- private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE =
- new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS,
- QUOTE_SINGLE, QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING};
-
- /**
- * Double quotes succeeded by these {@link LexemeType}s may be closing quotes.
- */
- private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE =
- new LexemeType[]{SPACE, PUNCT, PERIOD, EQUALS, HYPHEN, DASH,
- QUOTE_SINGLE, CLOSING_GROUP, EOL, EOP};
-
- /**
- * The text to parse. A reference is required as a minor optimization in
- * memory and speed: the lexer records integer offsets, rather than new
- * {@link String} instances, to track parsed lexemes.
- */
- private final String mText;
-
- /**
- * Converts a string into an iterable list of {@link Lexeme} instances.
- */
- private final Lexer mLexer;
-
- /**
- * Sets of contractions that help disambiguate single quotes in the text.
- * These are effectively immutable while parsing.
- */
- private final Contractions sContractions;
-
- /**
- * Incremented for each opening single quote emitted. Used to help resolve
- * ambiguities when single quote marks are balanced.
- */
- private int mOpeningSingleQuote;
-
- /**
- * Incremented for each closing single quote emitted. Used to help resolve
- * ambiguities when single quote marks are balanced.
- */
- private int mClosingSingleQuote;
-
- /**
- * Constructs a new {@link Parser} using the default contraction sets
- * to help resolve some ambiguous scenarios.
- *
- * @param text The prose to parse, containing zero or more quotation
- * characters.
- */
- public Parser( final String text ) {
- this( text, new Contractions.Builder().build() );
- }
-
- /**
- * Constructs a new {@link Parser} using the given contraction sets
- * to help resolve some ambiguous scenarios.
- *
- * @param text The prose to parse, containing zero or more quotation
- * characters.
- * @param contractions Custom sets of contractions to help resolve
- * ambiguities.
- */
- public Parser( final String text, final Contractions contractions ) {
- mText = text;
- mLexer = new Lexer( mText );
- sContractions = contractions;
- }
-
-  /**
-   * Walks the text supplied at construction from start to finish, handing
-   * each recognized quotation mark to {@code tokenConsumer} as a
-   * {@link Token}. Whenever an end of paragraph is reached, and once more
-   * at the end of the text, the quotes still pending are passed to
-   * {@code resolve}; any that remain afterwards are reported through
-   * {@code lexemeConsumer}.
-   *
-   * @param tokenConsumer Receives emitted {@link Token}s.
-   * @param lexemeConsumer Receives the quote {@link Lexeme}s that could not
-   * be resolved.
-   */
-  public void parse(
-    final Consumer<Token> tokenConsumer,
-    final Consumer<Lexeme> lexemeConsumer ) {
-    final var window = new CircularFifoQueue<Lexeme>( 3 );
-
-    // Prime the window so that the very first lexeme can be examined without
-    // checking how many elements the queue holds.
-    flush( window );
-
-    final var pending = new ArrayList<Lexeme[]>();
-
-    // Emit tokens for every quote whose meaning is clear from its neighbours.
-    for( Lexeme lexeme = mLexer.next();
-         lexeme != EOT;
-         lexeme = mLexer.next() ) {
-      final var paragraphEnded =
-        tokenize( lexeme, window, tokenConsumer, pending );
-
-      if( !paragraphEnded ) {
-        continue;
-      }
-
-      // Try to settle the paragraph's ambiguous quotes.
-      resolve( pending, tokenConsumer );
-
-      // Report the quotes that are still ambiguous, then start the next
-      // paragraph with a clean slate.
-      for( final var quote : pending ) {
-        lexemeConsumer.accept( quote[ 1 ] );
-      }
-
-      pending.clear();
-      mOpeningSingleQuote = 0;
-      mClosingSingleQuote = 0;
-    }
-
-    // Lexemes are examined in triplets, so the final two have not yet been
-    // tokenized when the loop ends. Feed end-of-text markers through the
-    // window to process them.
-    tokenize( EOT, window, tokenConsumer, pending );
-    tokenize( EOT, window, tokenConsumer, pending );
-
-    // Try to settle the ambiguous quotes that are left.
-    resolve( pending, tokenConsumer );
-
-    // Report the quotes that are still ambiguous.
-    for( final var quote : pending ) {
-      lexemeConsumer.accept( quote[ 1 ] );
-    }
-  }
-
- /**
- * Converts {@link Lexeme}s identified as straight quotes into {@link Token}s
- * that represent the curly equivalent. The {@link Token}s are passed to
- * the given {@link Consumer} for further processing (e.g., replaced in
- * the original text being parsed). Quotes that cannot be classified from
- * their immediate neighbours are appended to the unresolved list.
- *
- * @param lexeme A part of the text being parsed.
- * @param lexemes A 3-element queue of lexemes that provide sufficient
- * context to identify curly quotes.
- * @param consumer Recipient of equivalent quotes.
- * @param unresolved Rolling list of potentially ambiguous {@link Lexeme}s
- * that could not be tokenized, yet.
- * @return {@code true} if an end-of-paragraph is detected.
- */
- private boolean tokenize( final Lexeme lexeme,
- final CircularFifoQueue<Lexeme> lexemes,
- final Consumer<Token> consumer,
- final List<Lexeme[]> unresolved ) {
- // Add the next lexeme to tokenize into the queue for immediate processing.
- lexemes.add( lexeme );
-
- // The three lexemes in the window; per the examples below they are in
- // text order, with lex3 being the lexeme that was just added.
- final var lex1 = lexemes.get( 0 );
- final var lex2 = lexemes.get( 1 );
- final var lex3 = lexemes.get( 2 );
-
- // The branches are tried in order; later branches rely on earlier ones
- // having ruled out their cases.
- if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( WORD ) &&
- lex1.anyType( WORD, PERIOD, NUMBER ) ) {
- // Examples: y'all, Ph.D.'ll, 20's, she's
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
- }
- else if( lex1.isType( QUOTE_SINGLE ) && lex3.isType( QUOTE_SINGLE ) &&
- "n".equalsIgnoreCase( lex2.toString( mText ) ) ) {
- // I.e., 'n'
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
- // Reset the window so that lex3, already emitted, is not examined again.
- flush( lexemes );
- // Drop the newest unresolved entry; presumably the one recorded when
- // lex1 occupied the middle of the window -- TODO confirm.
- truncate( unresolved );
- }
- else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) {
- if( lex3.isType( QUOTE_SINGLE ) ) {
- // E.g., 2''
- // A single token spans both quote characters.
- consumer.accept(
- new Token( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ) );
- flush( lexemes );
- }
- else {
- // E.g., 2'
- consumer.accept( new Token( QUOTE_PRIME_SINGLE, lex2 ) );
- }
- }
- else if( lex2.isType( QUOTE_DOUBLE ) && lex1.isType( NUMBER ) ) {
- // E.g., 2"
- consumer.accept( new Token( QUOTE_PRIME_DOUBLE, lex2 ) );
- }
- else if( lex2.isType( WORD ) && lex3.isType( QUOTE_SINGLE ) &&
- sContractions.endedUnambiguously( lex2.toString( mText ) ) ) {
- // E.g., thinkin'
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
- flush( lexemes );
- }
- else if( lex2.isType( NUMBER ) && lex1.isType( QUOTE_SINGLE ) ) {
- if( lex3.anyType( SPACE, PUNCT ) || (lex3.isType( WORD ) &&
- lex3.toString( mText ).equalsIgnoreCase( "s" )) ) {
- // Sentences must be rewritten to avoid starting with numerals.
- // Examples: '20s, '02
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
- }
- else {
- // E.g., '2''
- consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex1 ) );
- mOpeningSingleQuote++;
- }
-
- truncate( unresolved );
- }
- else if( lex2.isType( QUOTE_SINGLE ) &&
- lex1.anyType( PUNCT, PERIOD, ELLIPSIS, DASH ) &&
- (lex3.anyType( EOL, EOP ) || lex3.isEot()) ) {
- // A single quote after punctuation at the end of a line, paragraph, or
- // the text closes a quotation.
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
- mClosingSingleQuote++;
- }
- else if( lex1.isType( ESC_SINGLE ) ) {
- // E.g., \'
- consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) );
- }
- else if( lex1.isType( ESC_DOUBLE ) ) {
- // E.g., \"
- consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) );
-
- if( lex2.isType( QUOTE_SINGLE ) &&
- (lex3.isEot() || lex3.anyType( SPACE, DASH, EOL, EOP )) ) {
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
- mClosingSingleQuote++;
- }
- }
- else if( lex2.isType( QUOTE_DOUBLE ) &&
- (lex1.isSot() || lex1.anyType( LEADING_QUOTE_OPENING_DOUBLE )) &&
- lex3.anyType( LAGGING_QUOTE_OPENING_DOUBLE ) ) {
- // Examples: "", "..., "word, ---"word
- consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) );
- }
- else if( lex2.isType( QUOTE_DOUBLE ) &&
- lex1.anyType( LEADING_QUOTE_CLOSING_DOUBLE ) &&
- (lex3.isEot() || lex3.anyType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) {
- // Examples: ..."', word"', ?"', word"?
- consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) );
- }
- else if( lex1.isType( QUOTE_SINGLE ) &&
- lex2.anyType( PUNCT, PERIOD, DASH ) && lex3.isType( QUOTE_DOUBLE ) ) {
- // E.g., '," (contraction ruled out from previous conditionals)
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex1 ) );
- truncate( unresolved );
- mClosingSingleQuote++;
- }
- else if( lex2.anyType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
- // After tokenizing, the parser will attempt to resolve ambiguities.
- // The whole triplet is kept so the quote's neighbours remain available.
- unresolved.add( new Lexeme[]{lex1, lex2, lex3} );
- }
-
- // Suggest to the caller that resolution should be performed. This allows
- // the algorithm to reset the opening/closing quote balance before the
- // next paragraph is parsed.
- return lex3.isType( EOP );
- }
-
- /**
- * Attempts to classify the quotes that tokenizing left undecided, using
- * the contraction sets, the types of the neighbouring lexemes, and the
- * running tallies of opening and closing single quotes. Entries that get
- * classified are emitted to the consumer and removed from the list; the
- * rest stay in the list for the caller to report.
- *
- * @param unresolved Triplets of (previous, quote, next) lexemes awaiting a
- * decision; resolved entries are removed.
- * @param consumer Recipient of the tokens for the quotes that are resolved.
- */
- private void resolve(
- final List<Lexeme[]> unresolved, final Consumer<Token> consumer ) {
- // Some non-emitted tokenized lexemes may be ambiguous.
- final var ambiguousLeadingQuotes = new ArrayList<Lexeme[]>( 16 );
- final var ambiguousLaggingQuotes = new ArrayList<Lexeme[]>( 16 );
- // Quotes resolved as opening/closing during this call only (the member
- // counters track the tallies across calls until the caller resets them).
- var resolvedLeadingQuotes = 0;
- var resolvedLaggingQuotes = 0;
-
- // Count the number of ambiguous and non-ambiguous open single quotes.
- for( var i = unresolved.iterator(); i.hasNext(); ) {
- final var quotes = i.next();
- final var lex1 = quotes[ 0 ];
- final var lex2 = quotes[ 1 ];
- final var lex3 = quotes[ 2 ];
-
- // Only single quotes are examined in this pass.
- if( lex2.isType( QUOTE_SINGLE ) ) {
- // The start/end markers are mapped to empty strings instead of being
- // converted to text.
- final var word1 = lex1 == SOT ? "" : lex1.toString( mText );
- final var word3 = lex3 == EOT ? "" : lex3.toString( mText );
-
- if( sContractions.beganAmbiguously( word3 ) ) {
- // E.g., 'Cause
- if( lex1.isType( QUOTE_SINGLE ) ) {
- // E.g., ''Cause
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
- i.remove();
- }
- else {
- // The contraction is uncertain until a closing quote is found that
- // may balance this single quote.
- ambiguousLeadingQuotes.add( quotes );
- }
- }
- else if( sContractions.beganUnambiguously( word3 ) ) {
- // The quote mark forms a word that does not stand alone from its
- // contraction. For example, twas is not a word: it's 'twas.
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
- i.remove();
- }
- else if( sContractions.endedAmbiguously( word1 ) ) {
- ambiguousLaggingQuotes.add( quotes );
- }
- else if( (lex1.isSot() || lex1.anyType( LEADING_QUOTE_OPENING_SINGLE ))
- && lex3.anyType( LAGGING_QUOTE_OPENING_SINGLE ) ) {
- consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) );
- resolvedLeadingQuotes++;
- mOpeningSingleQuote++;
- i.remove();
- }
- else if( lex1.anyType( LEADING_QUOTE_CLOSING_SINGLE ) &&
- (lex3.isEot() || lex3.anyType( LAGGING_QUOTE_CLOSING_SINGLE )) ) {
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
- resolvedLaggingQuotes++;
- mClosingSingleQuote++;
- i.remove();
- }
- else if( lex3.isType( NUMBER ) ) {
- // E.g., '04
- ambiguousLeadingQuotes.add( quotes );
- }
- }
- }
-
- final var ambiguousLeadingCount = ambiguousLeadingQuotes.size();
- final var ambiguousLaggingCount = ambiguousLaggingQuotes.size();
-
- // Second pass: use the tallies gathered above to decide the entries that
- // could not be classified individually.
- if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) {
- if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) {
- // With the overall tallies balanced, the lone ambiguous lagging quote
- // is an apostrophe; otherwise it closes the quotation.
- final var balanced = mClosingSingleQuote - mOpeningSingleQuote == 0;
- final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE;
- final var lex = ambiguousLaggingQuotes.get( 0 );
- consumer.accept( new Token( quote, lex[ 1 ] ) );
- unresolved.remove( lex );
- }
- else if( ambiguousLeadingCount == 0 && unresolved.size() == 1 ) {
- // Must be a closing quote.
- final var closing = unresolved.get( 0 );
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
- unresolved.remove( closing );
- }
- }
- else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) {
- // If there are no ambiguous leading quotes then all ambiguous lagging
- // quotes must be contractions.
- ambiguousLaggingQuotes.forEach(
- lex -> {
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex[ 1 ] ) );
- unresolved.remove( lex );
- }
- );
- }
- else if( ambiguousLeadingCount == 0 ) {
- // NOTE(review): the loops below emit a single-quote token for every
- // remaining entry; tokenize also records QUOTE_DOUBLE entries in this
- // list -- confirm that is intended.
- if( resolvedLaggingQuotes < resolvedLeadingQuotes ) {
- for( final var i = unresolved.iterator(); i.hasNext(); ) {
- final var closing = i.next()[ 1 ];
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing ) );
- i.remove();
- }
- }
- else if( mOpeningSingleQuote - mClosingSingleQuote == unresolved.size() ) {
- // Exactly as many quotes remain as there are unbalanced opening
- // quotes, so each remaining quote is treated as a closing quote.
- for( final var i = unresolved.iterator(); i.hasNext(); ) {
- final var closing = i.next();
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
- i.remove();
- }
- }
- else if( unresolved.size() == 2 ) {
- // Note the order: the first remaining quote is emitted as closing and
- // the second as opening.
- final var closing = unresolved.get( 0 );
- final var opening = unresolved.get( 1 );
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
- consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
-
- // Doesn't affect the algorithm.
- unresolved.clear();
- }
- }
- else if( ambiguousLeadingCount == 1 && resolvedLaggingQuotes == 1 ) {
- // A single ambiguous leading quote paired with a single resolved
- // closing quote is taken to be the matching opening quote.
- final var opening = ambiguousLeadingQuotes.get( 0 );
- consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
- unresolved.remove( opening );
- }
- }
-
-  /**
-   * Removes the most recently added entry from the given list; an empty
-   * list is left untouched.
-   *
-   * @param unresolved The list of {@link Lexeme}s to modify.
-   */
-  private void truncate( final List<Lexeme[]> unresolved ) {
-    final int last = unresolved.size() - 1;
-
-    if( last >= 0 ) {
-      unresolved.remove( last );
-    }
-  }
-
-  /**
-   * Fills the {@link CircularFifoQueue} with start-of-text indicators,
-   * pushing out whatever lexemes it held.
-   *
-   * @param lexemes The queue to overflow.
-   */
-  private void flush( final CircularFifoQueue<Lexeme> lexemes ) {
-    // Three additions, one per slot of the 3-element context window.
-    for( int slot = 0; slot < 3; slot++ ) {
-      lexemes.add( SOT );
-    }
-  }
-}
lib/src/main/java/com/keenwrite/quotes/Token.java
-/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
-package com.keenwrite.quotes;
-
-/**
- * Represents a high-level token read from the text: a {@link TokenType}
- * paired with the beginning and ending offsets of the characters it covers.
- * Tokens are ordered by their beginning offset.
- */
-final class Token implements Comparable<Token> {
-  private final TokenType mType;
-  final int mBegan;
-  final int mEnded;
-
-  /**
-   * Convenience constructor to create a token that uses the lexeme's
-   * beginning and ending offsets to represent a complete token.
-   *
-   * @param type The type of {@link Token} to create.
-   * @param lexeme Container for beginning and ending text offsets.
-   */
-  Token( final TokenType type, final Lexeme lexeme ) {
-    this( type, lexeme.began(), lexeme.ended() );
-  }
-
-  /**
-   * This constructor can be used to create tokens that span more than a
-   * single character. Almost all tokens represent a single character, only
-   * the double-prime sequence ({@code ''}) is more than one character.
-   *
-   * @param type The type of {@link Token} to create.
-   * @param began Beginning offset into text where token is found.
-   * @param ended Ending offset into text where token is found.
-   */
-  Token( final TokenType type, final int began, final int ended ) {
-    assert type != null;
-    assert began >= 0;
-    assert ended > began;
-
-    mType = type;
-    mBegan = began;
-    mEnded = ended;
-  }
-
-  /**
-   * @return The kind of quotation mark this token represents.
-   */
-  TokenType getType() {
-    return mType;
-  }
-
-  /**
-   * @return Beginning offset into the text where the token is found.
-   */
-  int began() {
-    return mBegan;
-  }
-
-  /**
-   * @return Ending offset into the text where the token is found.
-   */
-  int ended() {
-    return mEnded;
-  }
-
-  /**
-   * Orders tokens by their beginning offset.
-   *
-   * @param that The token to compare against.
-   * @return A negative value, zero, or a positive value when this token
-   * begins before, at, or after the given token, respectively.
-   */
-  @Override
-  public int compareTo( final Token that ) {
-    // Integer.compare cannot overflow, unlike subtraction; the offsets are
-    // only guarded by assertions, which are disabled by default.
-    return Integer.compare( this.mBegan, that.mBegan );
-  }
-}
lib/src/main/java/com/keenwrite/quotes/TokenType.java
-/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
-package com.keenwrite.quotes;
-
-/**
- * The {@link Parser} emits these token types. Each one names the character
- * that a straight quote (or escaped quote) in the text stands for.
- */
-enum TokenType {
- /** Single quote that opens a quotation. */
- QUOTE_OPENING_SINGLE,
- /** Double quote that opens a quotation (e.g., {@code "word}). */
- QUOTE_OPENING_DOUBLE,
- /** Single quote that closes a quotation. */
- QUOTE_CLOSING_SINGLE,
- /** Double quote that closes a quotation (e.g., {@code word"}). */
- QUOTE_CLOSING_DOUBLE,
- /** Apostrophe within or around a contraction (e.g., y'all, 'n', thinkin'). */
- QUOTE_APOSTROPHE,
- /** Escaped single quote ({@code \'}), emitted as a straight single quote. */
- QUOTE_STRAIGHT_SINGLE,
- /** Escaped double quote ({@code \"}), emitted as a straight double quote. */
- QUOTE_STRAIGHT_DOUBLE,
- /** Single quote that follows a number (e.g., {@code 2'}). */
- QUOTE_PRIME_SINGLE,
- /** Double quote or two single quotes after a number (e.g., {@code 2''}). */
- QUOTE_PRIME_DOUBLE,
-}
lib/src/test/java/com/keenwrite/quotes/KeenQuotesTest.java
-/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
-package com.keenwrite.quotes;
-
-import org.junit.jupiter.api.Test;
-import org.junit.jupiter.params.ParameterizedTest;
-import org.junit.jupiter.params.provider.ValueSource;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.nio.charset.StandardCharsets;
-import java.util.function.Function;
-
-import static java.lang.System.out;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNotNull;
-
-/**
- * Test that English straight quotes are converted to curly quotes and
- * apostrophes.
- */
-public class KeenQuotesTest {
- /**
- * This is a single-use test that is useful for debugging.
- */
- @Test
- //@Disabled
- public void test_parse_SingleLine_Parsed() {
- out.println( KeenQuotes.convert(
- "\"’Kearney lives on the banks of Killarney—’",
- out::println
- ) );
- }
-
- /**
- * Tests that straight quotes are converted to curly quotes.
- *
- * @throws IOException Error opening file full of fun.
- */
- @Test
- public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException {
- testConverter( text -> KeenQuotes.convert( text, ( lexeme ) -> {} ) );
- }
-
-  /**
-   * Reads an entire story from a text resource, converts it, and prints the
-   * converted text to standard output, passing {@code out::println} as the
-   * conversion callback.
-   *
-   * @param filename Resource name of the story, without the ".txt" suffix.
-   * @throws IOException Error reading the story.
-   */
-  @ParameterizedTest
-  @ValueSource( strings = {"westrup"} )
-  void test_Parse_Story_Converted( final String filename ) throws IOException {
-    // Pre-size the buffer to 2^20 characters. In Java, ^ is bitwise XOR, so
-    // the expression "2 ^ 20" evaluates to 22; a shift gives the power of two.
-    final var sb = new StringBuilder( 1 << 20 );
-
-    try( final var reader = open( filename + ".txt" ) ) {
-      String line;
-
-      while( (line = reader.readLine()) != null ) {
-        sb.append( line ).append( '\n' );
-      }
-    }
-
-    out.println( KeenQuotes.convert( sb.toString(), out::println ) );
-  }
-
-  /**
-   * Runs every couplet found in the couplets file through the given
-   * converter. A couplet is two consecutive lines: the input followed by
-   * the expected result. Either line may contain escaped newlines to denote
-   * ends of lines and ends of paragraphs. Blank lines and lines that start
-   * with {@code #} are skipped.
-   *
-   * @param parser The text processor capable of straight quote conversion.
-   * @throws IOException Error opening file full of fun.
-   */
-  private void testConverter( final Function<String, String> parser )
-    throws IOException {
-    try( final var reader = open( "smartypants.txt" ) ) {
-      // First line of the couplet being assembled; null between couplets.
-      String first = null;
-
-      for( String line = reader.readLine();
-           line != null;
-           line = reader.readLine() ) {
-        if( line.isBlank() || line.startsWith( "#" ) ) {
-          continue;
-        }
-
-        if( first == null ) {
-          // Hold on to the input until its expected result is read.
-          first = line;
-          continue;
-        }
-
-        // The current line completes the couplet.
-        final var input = unescapeEol( first );
-        final var expected = unescapeEol( line );
-
-        assertEquals( expected, parser.apply( input ) );
-
-        first = null;
-      }
-    }
-  }
-
-  /**
-   * Replaces every escaped newline (a backslash followed by {@code n}) with
-   * a line feed character.
-   *
-   * @param s The text containing zero or more escaped newlines.
-   * @return The text with each escaped newline replaced, including any at
-   * the very end of the text.
-   */
-  private static String unescapeEol( final String s ) {
-    // A literal replacement keeps trailing newlines; splitting and joining
-    // drops them because String.split discards trailing empty strings.
-    return s.replace( "\\n", "\n" );
-  }
-
-  /**
-   * Opens a text resource for reading, decoding it as UTF-8. Callers are
-   * responsible for closing.
-   *
-   * @param filename The resource to open, resolved relative to this class.
-   * @return An instance of {@link BufferedReader} that can be used to
-   * read all the lines in the file.
-   */
-  private BufferedReader open( final String filename ) {
-    final var is = getClass().getResourceAsStream( filename );
-    assertNotNull( is, "Missing test resource: " + filename );
-
-    // Decode explicitly rather than with the platform default charset: the
-    // fixtures contain non-ASCII characters (e.g., × and °).
-    // NOTE(review): assumes the resources are saved as UTF-8 -- confirm.
-    return new BufferedReader(
-      new InputStreamReader( is, StandardCharsets.UTF_8 ) );
-  }
-}
lib/src/test/java/com/keenwrite/quotes/LexerTest.java
-/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
-package com.keenwrite.quotes;
-
-import org.junit.jupiter.api.Test;
-
-import java.util.Arrays;
-import java.util.List;
-import java.util.function.BiFunction;
-
-import static com.keenwrite.quotes.Lexeme.EOT;
-import static com.keenwrite.quotes.LexemeType.*;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-/**
- * Tests lexing words, numbers, punctuation, spaces, newlines, etc.
- */
-class LexerTest {
- @Test
- void test_Lexing_Words_TokenValues() {
- testText( "abc 123", "abc", " ", "123" );
- testText( "-123 abc", "-123", " ", "abc" );
- }
-
- @Test
- void test_Lexing_Numbers_EmitNumbers() {
- testType( ".123", NUMBER );
- testType( "-123.", NUMBER, PERIOD );
- testType( " 123.123.123", SPACE, NUMBER );
- testType( "123 123\"", NUMBER, SPACE, NUMBER, QUOTE_DOUBLE );
- testType( "-123,123.123", NUMBER );
- testType( "...1,023...", ELLIPSIS, NUMBER, ELLIPSIS );
- }
-
- @Test
- void test_Lexing_Words_EmitWords() {
- testType( "abc", WORD );
- testType( "abc abc", WORD, SPACE, WORD );
- testType( "abc...", WORD, ELLIPSIS );
- testType( "abc123", WORD, NUMBER );
- testType( "-123abc", NUMBER, WORD );
- testType( "abc-o'-abc", WORD, HYPHEN, WORD, QUOTE_SINGLE, HYPHEN, WORD );
- }
-
- @Test
- void test_Lexing_PunctuationMarks_EmitPunctuationMarks() {
- testType( "!", PUNCT );
- testType( ";", PUNCT );
- testType( ".", PERIOD );
- testType( "-", HYPHEN );
- testType( "--", DASH );
- testType( "---", DASH );
- testType( "...", ELLIPSIS );
- }
-
- @Test
- void test_Lexing_Quotes_EmitQuotes() {
- testType( "'", QUOTE_SINGLE );
- testType( "\"", QUOTE_DOUBLE );
- testType( "3 o'clock", NUMBER, SPACE, WORD, QUOTE_SINGLE, WORD );
- }
-
- @Test
- void test_Lexing_Escapes_EmitEscapedQuotes() {
- testType( "123\\'456\\\"", NUMBER, ESC_SINGLE, NUMBER, ESC_DOUBLE );
- testText( "123\\'456\\\"", "123", "\\'", "456", "\\\"" );
- }
-
- @Test
- void test_Lexing_Newlines_EmitNewlines() {
- testType( "\r", EOL );
- testType( "\n", EOL );
- testType( "\r\n", EOL );
- testType( "\r\n\r\n", EOP );
- testType( "\r\n\n\r", EOP );
- testType( "abc \r\nabc\n", WORD, SPACE, EOL, WORD, EOL );
- }
-
- private void testType( final String actual, final LexemeType... expected ) {
- final var list = Arrays.asList( expected );
- testType( actual, ( lexeme, text ) -> lexeme.getType(), list );
- }
-
- private void testText( final String actual, final String... expected ) {
- testType( actual, Lexeme::toString, Arrays.asList( expected ) );
- }
-
-  /**
-   * Lexes the given text and compares a value derived from each lexeme, in
-   * order, against the expected elements; then verifies that the number of
-   * lexemes equals the number of expected elements.
-   *
-   * @param text The text to lex.
-   * @param f Derives the value to compare from a lexeme and the text.
-   * @param elements The expected values, in lexing order.
-   */
-  private <A, E> void testType(
-    final String text,
-    final BiFunction<Lexeme, String, A> f,
-    final List<E> elements ) {
-    final var lexer = new Lexer( text );
-    var matched = 0;
-
-    for( Lexeme lexeme = lexer.next();
-         lexeme != EOT;
-         lexeme = lexer.next() ) {
-      assertEquals( elements.get( matched ), f.apply( lexeme, text ) );
-      matched++;
-    }
-
-    // Ensure all expected values are matched (verify end of text reached).
-    assertEquals( elements.size(), matched );
-  }
-}
lib/src/test/java/com/keenwrite/quotes/ParserTest.java
-/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
-package com.keenwrite.quotes;
-
-import org.junit.jupiter.api.Test;
-
-import java.util.HashMap;
-import java.util.Map;
-
-import static com.keenwrite.quotes.TokenType.QUOTE_APOSTROPHE;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-/**
- * Test that all unambiguous apostrophes are emitted once.
- */
-class ParserTest {
- private final static Map<String, Map<TokenType, Integer>> TEST_CASES =
- Map.of(
- "That's a 35'×10\" yacht!",
- Map.of( QUOTE_APOSTROPHE, 1 ),
- "It's the 80's...",
- Map.of( QUOTE_APOSTROPHE, 2 ),
- "Fish-'n'-chips!",
- Map.of( QUOTE_APOSTROPHE, 2 ),
- "She's a cat ya'll couldn't've known!",
- Map.of( QUOTE_APOSTROPHE, 4 ),
- "'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx.",
- Map.of( QUOTE_APOSTROPHE, 5 ),
- """
- But I must leave the proofs to those who 've seen 'em;
- But this I heard her say, and can't be wrong
- And all may think which way their judgments lean 'em,
- ''T is strange---the Hebrew noun which means "I am,"
- The English always use to govern d--n.'
- """,
- Map.of( QUOTE_APOSTROPHE, 5 )
- );
-
- @Test
- void test_Conversion_StraightQuotes_ExpectedConversionCount() {
- for( final var entry : TEST_CASES.entrySet() ) {
- parse( entry.getKey(), entry.getValue() );
- }
- }
-
-  /**
-   * Parses the text, counts the emitted tokens by type, and asserts that
-   * the count for each expected type matches the tally.
-   *
-   * @param text The prose to parse.
-   * @param tally The expected number of tokens for each listed type.
-   */
-  private void parse( final String text, final Map<TokenType, Integer> tally ) {
-    final var counts = new HashMap<TokenType, Integer>();
-
-    new Parser( text ).parse(
-      token -> counts.merge( token.getType(), 1, Integer::sum ),
-      lexeme -> {}
-    );
-
-    // The text doubles as the failure message to identify the test case.
-    tally.forEach(
-      ( type, expected ) -> assertEquals( expected, counts.get( type ), text )
-    );
-  }
-}
lib/src/test/resources/com/keenwrite/quotes/smartypants.txt
-# ########################################################################
-# Escaped quotes
-# ########################################################################
-She stood 5\'7\".
-She stood 5'7".
-
-\"What?\"
-"What?"
-
-# ########################################################################
-# Primes (single, double)
-# ########################################################################
-Alice's friend is 6'3" tall.
-Alice&apos;s friend is 6&prime;3&Prime; tall.
-
-Bob's table is 5'' × 4''.
-Bob&apos;s table is 5&Prime; × 4&Prime;.
-
-Bob's table is 5'×4'.
-Bob&apos;s table is 5&prime;×4&prime;.
-
-What's this '-5.5'',' '-10.2'' cm,' and another '-7.25''' thing?
-What&apos;s this &lsquo;-5.5&Prime;,&rsquo; &lsquo;-10.2&Prime; cm,&rsquo; and another &lsquo;-7.25&Prime;&rsquo; thing?
-
-'What's this -5.5'' thing?'
-&lsquo;What&apos;s this -5.5&Prime; thing?&rsquo;
-
-+7.9'' is weird.
-+7.9&Prime; is weird.
-
-12" record, 5'10" height
-12&Prime; record, 5&prime;10&Prime; height
-
-Use 11.5"x14.25" virtual paper!
-Use 11.5&Prime;x14.25&Prime; virtual paper!
-
-Angular measure 3° 5' 30" means 3 degs, 5 arcmins, & 30 arcsecs.
-Angular measure 3° 5&prime; 30&Prime; means 3 degs, 5 arcmins, & 30 arcsecs.
-
-# ########################################################################
-# Double quotes
-# ########################################################################
-"I am Sam"
-&ldquo;I am Sam&rdquo;
-
-"...even better!"
-&ldquo;...even better!&rdquo;
-
-"It was so," said he.
-&ldquo;It was so,&rdquo; said he.
-
-"What cries in the streets"?
-&ldquo;What cries in the streets&rdquo;?
-
-With "air quotes" in the middle.
-With &ldquo;air quotes&rdquo; in the middle.
-
-With--"air quotes"--and dashes.
-With--&ldquo;air quotes&rdquo;--and dashes.
-
-"Not all open quotes are closed...
-&ldquo;Not all open quotes are closed...
-
-"And this" and "and this" and "and this" and "and another."
-&ldquo;And this&rdquo; and &ldquo;and this&rdquo; and &ldquo;and this&rdquo; and &ldquo;and another.&rdquo;
-
-"Will'll invite?" Mrs. Thorne asked;\n"because he's coming--he's at the gate."
-&ldquo;Will&apos;ll invite?&rdquo; Mrs. Thorne asked;\n&ldquo;because he&apos;s coming--he&apos;s at the gate.&rdquo;
-
-# ########################################################################
-# Single quotes
-# ########################################################################
-'I am Sam'
-&lsquo;I am Sam&rsquo;
-
-'It was so,' said he.
-&lsquo;It was so,&rsquo; said he.
-
-'...even better!'
-&lsquo;...even better!&rsquo;
-
-With 'quotes' in the middle.
-With &lsquo;quotes&rsquo; in the middle.
-
-With--'imaginary'--dashes.
-With--&lsquo;imaginary&rsquo;--dashes.
-
-''Cause I don't like it, 's why,' said Pat.
-&lsquo;&apos;Cause I don&apos;t like it, &apos;s why,&rsquo; said Pat.
-
-'It's a beautiful day!'
-&lsquo;It&apos;s a beautiful day!&rsquo;
-
-Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season.
-Book &apos;em, Danno. Rock &apos;n&apos; roll. 'Cause &apos;twas the season.
-
-# ########################################################################
-# Possessives
-# ########################################################################
-Sam's Sams' and the Ross's roses' thorns were prickly.
-Sam&apos;s Sams&apos; and the Ross&apos;s roses&apos; thorns were prickly.
-
-# ########################################################################
-# Inside contractions (no leading/trailing apostrophes)
-# ########################################################################
-I don't like it: I love's it!
-I don&apos;t like it: I love&apos;s it!
-
-We'd've thought that pancakes'll be sweeter there.
-We&apos;d&apos;ve thought that pancakes&apos;ll be sweeter there.
-
-She'd be coming o'er when the horse'd gone to pasture...
-She&apos;d be coming o&apos;er when the horse&apos;d gone to pasture...
-
-# ########################################################################
-# Beginning contractions (leading apostrophes)
-# ########################################################################
-'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx.
-&apos;Twas and &apos;tis whate&apos;er lay &apos;twixt dawn and dusk &apos;n River Styx.
-
-# ########################################################################
-# Outside contractions (leading and trailing, no middle)
-# ########################################################################
-Salt 'n' vinegar, fish-'n'-chips, sugar 'n spice!
-Salt &apos;n&apos; vinegar, fish-&apos;n&apos;-chips, sugar &apos;n spice!
-
-# ########################################################################
-# Ending contractions (trailing apostrophes)
-# ########################################################################
-Didn' get th' message.
-Didn&apos; get th&apos; message.
-
-Namsayin', y'know what I'ma sayin'?
-Namsayin&apos;, y&apos;know what I&apos;ma sayin&apos;?
-
-# ########################################################################
-# Decades
-# ########################################################################
-The Roaring '20s had the best music, no?
-The Roaring &apos;20s had the best music, no?
-
-Took place in '04, yes'm!
-Took place in &apos;04, yes&apos;m!
-
-# ########################################################################
-# Consecutive quotes
-# ########################################################################
-"'I'm trouble.'"
-&ldquo;&lsquo;I&apos;m trouble.&rsquo;&rdquo;
-
-'"Trouble's my name."'
-&lsquo;&ldquo;Trouble&apos;s my name.&rdquo;&rsquo;
-
-# ########################################################################
-# Nested quotations
-# ########################################################################
-"'Here I am,' said Sam"
-&ldquo;&lsquo;Here I am,&rsquo; said Sam&rdquo;
-
-'"Here I am," said Sam'
-&lsquo;&ldquo;Here I am,&rdquo; said Sam&rsquo;
-
-'Hello, "Dr. Brown," what's your real name?'
-&lsquo;Hello, &ldquo;Dr. Brown,&rdquo; what&apos;s your real name?&rsquo;
-
-"'Twas, t'wasn't thy name, 'twas it?" said Jim "the Barber" Brown.
-&ldquo;&apos;Twas, t&apos;wasn&apos;t thy name, &apos;twas it?&rdquo; said Jim &ldquo;the Barber&rdquo; Brown.
-
-"'I'm in danger. Help? Take me to--' an address on Van Ness Avenue.
-&ldquo;&lsquo;I&apos;m in danger. Help? Take me to--&rsquo; an address on Van Ness Avenue.
-
-"Ah," she said, "you knew! He told you--and you said 'my dear'! How could you?!"
-&ldquo;Ah,&rdquo; she said, &ldquo;you knew! He told you--and you said &lsquo;my dear&rsquo;! How could you?!&rdquo;
-
-shouldn't drop letters, nor say "ain't" or "hain't."
-shouldn&apos;t drop letters, nor say &ldquo;ain&apos;t&rdquo; or &ldquo;hain&apos;t.&rdquo;
-
-"’Kearney lives on the banks of Killarney—’
-&ldquo;’Kearney lives on the banks of Killarney—’
-
-# ########################################################################
-# Mixed
-# ########################################################################
-"She said, 'That's Sam's'," said the Sams' cat.
-&ldquo;She said, &lsquo;That&apos;s Sam&apos;s&rsquo;,&rdquo; said the Sams&apos; cat.
-
-"'Jane said, ''E'll be spooky, Sam's son with the jack-o'-lantern!'" said the O'Mally twins'---y'know---ghosts in unison.
-&ldquo;&lsquo;Jane said, &lsquo;&apos;E&apos;ll be spooky, Sam&apos;s son with the jack-o&apos;-lantern!&rsquo;&rdquo; said the O&apos;Mally twins&apos;---y&apos;know---ghosts in unison.
-
-'He's at Sam's'
-&lsquo;He&apos;s at Sam&apos;s&rsquo;
-
-ma'am
-ma&apos;am
-
-'Twas midnight
-&apos;Twas midnight
-
-"Hello," said the spider. "'Shelob' is my name."
-&ldquo;Hello,&rdquo; said the spider. &ldquo;&lsquo;Shelob&rsquo; is my name.&rdquo;
-
-'He said, \"I want to go.\"' Were you alive in the 70's?
-&lsquo;He said, "I want to go."&rsquo; Were you alive in the 70&apos;s?
-
-"That's a 'magic' sock."
-&ldquo;That&apos;s a &lsquo;magic&rsquo; sock.&rdquo;
-
-'That's a "magic" sock.'
-&lsquo;That&apos;s a &ldquo;magic&rdquo; sock.&rsquo;
-
-'That's a "very 'magic'" sock.'
-&lsquo;That&apos;s a &ldquo;very &lsquo;magic&rsquo;&rdquo; sock.&rsquo;
-
-Website! Company Name, Inc. ("Company Name" or "Company") recommends reading carefully:
-Website! Company Name, Inc. (&ldquo;Company Name&rdquo; or &ldquo;Company&rdquo;) recommends reading carefully:
-
-Website! Company Name, Inc. ('Company Name' or 'Company') recommends reading carefully:
-Website! Company Name, Inc. (&lsquo;Company Name&rsquo; or &lsquo;Company&rsquo;) recommends reading carefully:
-
-Workin' hard
-Workin&apos; hard
-
-"She said, 'Llamas'll languish, they'll--
-&ldquo;She said, &lsquo;Llamas&apos;ll languish, they&apos;ll--
-
-'The 70s are my favorite numbers,' she said.
-&lsquo;The 70s are my favorite numbers,&rsquo; she said.
-
-'70s fashion was weird.
-&apos;70s fashion was weird.
-
-Model \"T2000\"
-Model "T2000"
-
-The 2's complement.\n\n'Yar, binary!'
-The 2&apos;s complement.\n\n&lsquo;Yar, binary!&rsquo;
-
-The Who's complement.\n\n\n"Paragraph boundary!"
-The Who&apos;s complement.\n\n\n&ldquo;Paragraph boundary!&rdquo;
-
-'Oak,' 'elm,' and 'beech' are names of trees. So is 'pine.'
-&lsquo;Oak,&rsquo; &lsquo;elm,&rsquo; and &lsquo;beech&rsquo; are names of trees. So is &lsquo;pine.&rsquo;
-
-'A', 'B', and 'C' are letters.
-&lsquo;A&rsquo;, &lsquo;B&rsquo;, and &lsquo;C&rsquo; are letters.
-
-'He said, "Thinkin'."'
-&lsquo;He said, &ldquo;Thinkin&apos;.&rdquo;&rsquo;
-
-''Sup, Doc?'
-&lsquo;&apos;Sup, Doc?&rsquo;
-
-# ########################################################################
-# Ambiguous (no solution)
-# ########################################################################
-She said, 'Cause of death?\n\n'Old age, there's no doubt.'
-She said, 'Cause of death?\n\n&lsquo;Old age, there&apos;s no doubt.&rsquo;
-
-She said, 'Cause I said so!\n\n'If we're done, I've a secret.'
-She said, 'Cause I said so!\n\n&lsquo;If we&apos;re done, I&apos;ve a secret.&rsquo;
-
-'Bout that time I says, 'Boys! I been thinkin' 'bout th' Universe.'
-'Bout that time I says, &lsquo;Boys! I been thinkin&apos; 'bout th&apos; Universe.&rsquo;
settings.gradle
-rootProject.name = 'quotes'
-include('lib')
-
src/main/java/com/whitemagicsoftware/keenquotes/CircularFifoQueue.java
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.whitemagicsoftware.keenquotes;
+
+import java.util.*;
+
+/**
+ * CircularFifoQueue is a first-in first-out queue with a fixed size that
+ * replaces its oldest element if full.
+ * <p>
+ * The removal order of a {@link CircularFifoQueue} is based on the
+ * insertion order; elements are removed in the same order in which they
+ * were added. The iteration order is the same as the removal order.
+ * </p>
+ * <p>
+ * The {@link #add(Object)}, {@link #remove()}, {@link #peek()},
+ * {@link #poll()},
+ * {@link #offer(Object)} operations all perform in constant time.
+ * All other operations perform in linear time or worse.
+ * </p>
+ * <p>
+ * This queue prevents null objects from being added.
+ * </p>
+ *
+ * @param <E> the type of elements in this collection
+ * @since 4.0
+ */
+public final class CircularFifoQueue<E> extends AbstractCollection<E>
+ implements Queue<E> {
+ /**
+ * Underlying storage array.
+ */
+ private final transient E[] elements;
+
+ /**
+ * Array index of first (oldest) queue element.
+ */
+ private transient int start;
+
+ /**
+ * Index mod maxElements of the array position following the last queue
+ * element. Queue elements start at elements[start] and "wrap around"
+ * elements[maxElements-1], ending at elements[decrement(end)].
+ * For example, elements = {c,a,b}, start=1, end=1 corresponds to
+ * the queue [a,b,c].
+ */
+ private transient int end;
+
+ /**
+ * Flag to indicate if the queue is currently full.
+ */
+ private transient boolean full;
+
+ /**
+ * Capacity of the queue.
+ */
+ private final int maxElements;
+
+  /**
+   * Creates an empty queue that holds at most {@code size} elements.
+   *
+   * @param size Immutable queue capacity, must be greater than zero.
+   * @throws IllegalArgumentException if {@code size} is zero or negative.
+   */
+  @SuppressWarnings( "unchecked" )
+  public CircularFifoQueue( final int size ) {
+    // Assertions are normally disabled at runtime. Without this check a
+    // zero-capacity queue is created and its first add throws
+    // NoSuchElementException, which hides the real mistake.
+    if( size <= 0 ) {
+      throw new IllegalArgumentException( "The size must be greater than 0" );
+    }
+
+    // Safe: the array is private and only ever stores instances of E.
+    elements = (E[]) new Object[ size ];
+    maxElements = elements.length;
+  }
+
+ /**
+ * Returns the number of elements stored in the queue.
+ *
+ * @return this queue's size
+ */
+ @Override
+ public int size() {
+ int size;
+
+ if( end < start ) {
+ size = maxElements - start + end;
+ }
+ else if( end == start ) {
+ size = full ? maxElements : 0;
+ }
+ else {
+ size = end - start;
+ }
+
+ return size;
+ }
+
+  /**
+   * Answers whether the queue holds no elements.
+   *
+   * @return true if this queue is empty
+   */
+  @Override
+  public boolean isEmpty() {
+    // The size is zero exactly when both indices coincide and the queue has
+    // not been marked as full.
+    return start == end && !full;
+  }
+
+  /**
+   * Answers whether the number of stored elements equals the capacity that
+   * was given at construction time.
+   *
+   * @return {@code true} if the capacity limit has been reached, {@code
+   * false} otherwise
+   * @since 4.1
+   */
+  public boolean isAtFullCapacity() {
+    final int count = size();
+    return maxElements == count;
+  }
+
+  /**
+   * Removes every element and resets the queue to its initial, empty state.
+   */
+  @Override
+  public void clear() {
+    // Null out the slots so that the queue no longer references the
+    // discarded elements.
+    Arrays.fill( elements, null );
+    start = 0;
+    end = 0;
+    full = false;
+  }
+
+  /**
+   * Appends the given element to the tail of this queue. When the queue is
+   * already at capacity, the oldest element is removed first to make room.
+   *
+   * @param element the element to add
+   * @return true, always
+   * @throws NullPointerException if the given element is null
+   */
+  @Override
+  public boolean add( final E element ) {
+    Objects.requireNonNull( element, "element" );
+
+    // Evict the oldest element when there is no free slot.
+    if( isAtFullCapacity() ) {
+      remove();
+    }
+
+    elements[ end ] = element;
+    end = increment( end );
+
+    // The tail caught up with the head: every slot is now occupied.
+    if( start == end ) {
+      full = true;
+    }
+
+    return true;
+  }
+
+  /**
+   * Looks up an element by its position relative to the oldest element,
+   * which is at position zero.
+   *
+   * @param index the position of the element in the queue
+   * @return the element at position {@code index}
+   * @throws NoSuchElementException if the requested position is outside the
+   *                                range [0, size)
+   */
+  public E get( final int index ) {
+    final int count = size();
+    final boolean valid = 0 <= index && index < count;
+
+    if( !valid ) {
+      throw new NoSuchElementException(
+        String.format(
+          "Index %1$d is outside the available range [0, %2$d)",
+          index, count ) );
+    }
+
+    // Translate the queue-relative position into an array offset.
+    return elements[ (start + index) % maxElements ];
+  }
+
+  /**
+   * Delegates to {@link #add(Object)}: the element is always accepted and,
+   * when the queue is full, the oldest element is discarded to make room.
+   *
+   * @param element the element to add
+   * @return true, always
+   * @throws NullPointerException if the given element is null
+   */
+  @Override
+  public boolean offer( final E element ) {
+    final boolean accepted = add( element );
+    return accepted;
+  }
+
+  /**
+   * Removes and returns the oldest element.
+   *
+   * @return the oldest element, or {@code null} if this queue is empty
+   */
+  @Override
+  public E poll() {
+    if( isEmpty() ) {
+      return null;
+    }
+
+    return remove();
+  }
+
+  /**
+   * Returns the oldest element without removing it.
+   *
+   * @return the oldest element
+   * @throws NoSuchElementException if this queue is empty
+   */
+  @Override
+  public E element() {
+    if( !isEmpty() ) {
+      return peek();
+    }
+
+    throw new NoSuchElementException( "empty queue" );
+  }
+
+  /**
+   * Returns the oldest element without removing it.
+   *
+   * @return the oldest element, or {@code null} if this queue is empty
+   */
+  @Override
+  public E peek() {
+    if( isEmpty() ) {
+      return null;
+    }
+
+    return elements[ start ];
+  }
+
+  /**
+   * Removes and returns the oldest element.
+   *
+   * @return the element that was at the head of this queue
+   * @throws NoSuchElementException if this queue is empty
+   */
+  @Override
+  public E remove() {
+    if( isEmpty() ) {
+      throw new NoSuchElementException( "empty queue" );
+    }
+
+    final E head = elements[ start ];
+
+    // An unoccupied head slot leaves the queue state untouched.
+    if( head == null ) {
+      return null;
+    }
+
+    elements[ start ] = null;
+    start = increment( start );
+    full = false;
+
+    return head;
+  }
+
+ /**
+ * Increments the internal index.
+ *
+ * @param index the index to increment
+ * @return the updated index
+ */
+ private int increment( int index ) {
+ if( ++index >= maxElements ) {
+ index = 0;
+ }
+ return index;
+ }
+
+ /**
+ * Decrements the internal index.
+ *
+ * @param index the index to decrement
+ * @return the updated index
+ */
+ private int decrement( int index ) {
+ if( --index < 0 ) {
+ index = maxElements - 1;
+ }
+ return index;
+ }
+
+ /**
+ * Returns an iterator over this queue's elements.
+ * <p>
+ * Elements are visited from the oldest to the most recently added. The
+ * iterator reads and writes the queue's own fields and performs no check
+ * for modifications made to the queue while iterating.
+ * </p>
+ *
+ * @return an iterator over this queue's elements
+ */
+ @Override
+ public Iterator<E> iterator() {
+ return new Iterator<>() {
+
+ private int index = start; // array offset of the next element to return
+ private int lastReturnedIndex = -1; // offset handed out by the last next(); -1 when there is nothing to remove
+ private boolean isFirst = full; // a full queue has start == end, so the first hasNext() cannot rely on index != end
+
+ @Override
+ public boolean hasNext() {
+ return isFirst || index != end;
+ }
+
+ @Override
+ public E next() {
+ if( !hasNext() ) {
+ throw new NoSuchElementException();
+ }
+
+ isFirst = false;
+ lastReturnedIndex = index;
+ index = increment( index );
+ return elements[ lastReturnedIndex ];
+ }
+
+ @Override
+ public void remove() {
+ if( lastReturnedIndex == -1 ) {
+ throw new IllegalStateException();
+ }
+
+ // First element can be removed quickly: the queue's own remove() advances start, nothing is shifted
+ if( lastReturnedIndex == start ) {
+ CircularFifoQueue.this.remove();
+ lastReturnedIndex = -1;
+ return;
+ }
+
+ int pos = lastReturnedIndex + 1;
+ if( start < lastReturnedIndex && pos < end ) {
+ // shift in one part: the elements after the removed one are contiguous in the array
+ System.arraycopy(
+ elements, pos, elements, lastReturnedIndex, end - pos );
+ }
+ else {
+ // Other elements require us to shift the subsequent elements one slot at a time, wrapping at the array end
+ while( pos != end ) {
+ if( pos >= maxElements ) {
+ elements[ pos - 1 ] = elements[ 0 ];
+ pos = 0;
+ }
+ else {
+ elements[ decrement( pos ) ] = elements[ pos ];
+ pos = increment( pos );
+ }
+ }
+ }
+
+ lastReturnedIndex = -1;
+ end = decrement( end );
+ elements[ end ] = null; // clear the slot vacated by the shift
+ full = false;
+ index = decrement( index ); // the remaining elements moved back one slot, so the cursor follows
+ }
+ };
+ }
+}
src/main/java/com/whitemagicsoftware/keenquotes/Contractions.java
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.whitemagicsoftware.keenquotes;
+
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
+
+import static java.util.Collections.emptySet;
+
+/**
+ * Placeholder for various types of contractions.
+ */
+public class Contractions {
+
+ private final Builder mBuilder;
+
+  /**
+   * Creates an instance backed by the word sets held by the given builder.
+   * The builder itself is retained, not copied.
+   *
+   * @param builder The source of contraction word sets, never {@code null}.
+   */
+  private Contractions( final Builder builder ) {
+    assert builder != null;
+    mBuilder = builder;
+  }
+
+ /**
+ * Allows constructing a list of custom contractions.
+ */
+ @SuppressWarnings( "unused" )
+ public static class Builder {
+ private final Set<String> mBeganUnambiguous = new HashSet<>();
+ private final Set<String> mEndedUnambiguous = new HashSet<>();
+ private final Set<String> mBeganAmbiguous = new HashSet<>();
+ private final Set<String> mEndedAmbiguous = new HashSet<>();
+
+ public void withBeganUnambiguous( final Set<String> words ) {
+ mBeganUnambiguous.addAll( words );
+ }
+
+ public void withEndedUnambiguous( final Set<String> words ) {
+ mEndedUnambiguous.addAll( words );
+ }
+
+ public void withBeganAmbiguous( final Set<String> words ) {
+ mBeganAmbiguous.addAll( words );
+ }
+
+ public void withEndedAmbiguous( final Set<String> words ) {
+ mEndedAmbiguous.addAll( words );
+ }
+
+ /**
+ * Constructs a new set of {@link Contractions} that can be configured
+ * using this {@link Builder} instance.
+ *
+ * @return {@link Contractions} suitable for use with parsing text.
+ */
+ public Contractions build() {
+ mBeganUnambiguous.addAll( from( mBeganUnambiguous, BEGAN_UNAMBIGUOUS ) );
+ mEndedUnambiguous.addAll( from( mEndedUnambiguous, ENDED_UNAMBIGUOUS ) );
+ mBeganAmbiguous.addAll( from( mBeganAmbiguous, BEGAN_AMBIGUOUS ) );
+ mEndedAmbiguous.addAll( from( mEndedAmbiguous, ENDED_AMBIGUOUS ) );
+
+ return new Contractions( this );
+ }
+
+ /**
+ * This returns the {@code fallback} {@link Set} if {@code src} is empty;
+ * otherwise, this returns the empty {@link Set}.
+ *
+ * @param src A set of contractions, possibly empty.
+ * @param fallback The default values to use if {@code src} is empty.
+ * @param <T> The type of data used by both {@link Set}s.
+ * @return An empty {@link Set} if the {@code src} contains at least one
+ * element; otherwise, this will return {@code fallback}.
+ */
+ private static <T> Set<T> from( final Set<T> src, final Set<T> fallback ) {
+ assert src != null;
+ assert fallback != null;
+ return src.isEmpty() ? fallback : emptySet();
+ }
+ }
+
+  /**
+   * Answers whether the given word is a contraction that always starts
+   * with an apostrophe. The comparison is case insensitive. This must
+   * only be called when a straight quote is followed by a word.
+   *
+   * @param word The word to compare against the list of known unambiguous
+   *             contractions.
+   * @return {@code true} when the given word is in the set of unambiguous
+   * contractions.
+   */
+  public boolean beganUnambiguously( final String word ) {
+    assert word != null;
+
+    // Locale-independent mapping: under some default locales (e.g., Turkish)
+    // an uppercase I would not lowercase to the ASCII i used by the lists.
+    return getBeganUnambiguous().contains( word.toLowerCase( Locale.ROOT ) );
+  }
+
+  /**
+   * Answers whether the given word could be a contraction but is also a
+   * valid word in non-contracted form. The comparison is case insensitive.
+   *
+   * @param word The word to compare against the list of known ambiguous
+   *             contractions.
+   * @return {@code true} when the given word is in the set of ambiguous
+   * contractions.
+   */
+  public boolean beganAmbiguously( final String word ) {
+    assert word != null;
+
+    // Locale-independent mapping keeps matching stable regardless of the
+    // default locale.
+    return getBeganAmbiguous().contains( word.toLowerCase( Locale.ROOT ) );
+  }
+
+  /**
+   * Answers whether the given word is in the set of contractions that end
+   * with an apostrophe and cannot be mistaken for a word followed by a
+   * closing single quote. The comparison is case insensitive.
+   *
+   * @param word The word to compare against the list of known unambiguous
+   *             contractions.
+   * @return {@code true} when the given word is in the set of unambiguous
+   * ending contractions.
+   */
+  public boolean endedUnambiguously( final String word ) {
+    assert word != null;
+
+    // Locale-independent mapping keeps matching stable regardless of the
+    // default locale.
+    return getEndedUnambiguous().contains( word.toLowerCase( Locale.ROOT ) );
+  }
+
+  /**
+   * Answers whether the given word, when followed by a straight single
+   * quote, may be either a contraction or a word beside a closing single
+   * quote. A word qualifies when it is in the set of ambiguous endings, or
+   * ends in s, z, or x, or is longer than one letter and ends in n. The
+   * comparison is case insensitive.
+   *
+   * @param word The word to compare against the list of known ambiguous
+   *             contractions.
+   * @return {@code true} when the given word has an ambiguous ending.
+   */
+  public boolean endedAmbiguously( final String word ) {
+    assert word != null;
+
+    // Locale-independent mapping keeps matching stable regardless of the
+    // default locale.
+    final var check = word.toLowerCase( Locale.ROOT );
+
+    // Ensure that 'n' isn't matched for ambiguity by enforcing length, yet
+    // allow o' to match because 'a sentence can end with the letter o'.
+    return getEndedAmbiguous().contains( check ) ||
+      check.endsWith( "s" ) || check.endsWith( "z" ) ||
+      check.endsWith( "x" ) || (check.length() > 1 && check.endsWith( "n" ));
+  }
+
+ private Set<String> getBeganUnambiguous() {
+ return mBuilder.mBeganUnambiguous;
+ }
+
+ private Set<String> getEndedUnambiguous() {
+ return mBuilder.mEndedUnambiguous;
+ }
+
+ private Set<String> getBeganAmbiguous() {
+ return mBuilder.mBeganAmbiguous;
+ }
+
+ private Set<String> getEndedAmbiguous() {
+ return mBuilder.mEndedAmbiguous;
+ }
+
+ /**
+ * Words having a straight apostrophe that cannot be mistaken for an
+ * opening single quote.
+ */
+ private static final Set<String> BEGAN_UNAMBIGUOUS = Set.of(
+ "aporth",
+ "boutcha",
+ "boutchu",
+ "cept",
+ "dillo",
+ "em",
+ "fraid",
+ "gainst",
+ "n",
+ "neath",
+ "nother",
+ "onna",
+ "onna'",
+ "pon",
+ "s",
+ "sblood",
+ "scuse",
+ "sfar",
+ "sfoot",
+ "t",
+ "taint",
+ "tain",
+ "til",
+ "tis",
+ "tisn",
+ "tshall",
+ "twas",
+ "twasn",
+ "tween",
+ "twere",
+ "tweren",
+ "twixt",
+ "twon",
+ "twou",
+ "twould",
+ "twouldn",
+ "ve"
+ );
+
+ /**
+ * Words having a straight apostrophe that may be either part of a
+ * contraction or a word that stands alone beside an opening single quote.
+ */
+ private static final Set<String> BEGAN_AMBIGUOUS = Set.of(
+ // about|boxing match
+ "bout",
+ // because|causal
+ "cause",
+ // what you|choo choo train
+ "choo",
+ // he|e pluribus unum
+ "e",
+ // here|earlier
+ "ere",
+ // afro|to and fro
+ "fro",
+ // whore|ho ho!
+ "ho",
+ // okay|letter K
+ "kay",
+ // lo|lo and behold
+ "lo",
+ // are|regarding
+ "re",
+ // what's up|to sup
+ "sup",
+ // it will|twill fabric
+ "twill",
+ // them|utterance
+ "um",
+ // is that|Iranian village
+ "zat"
+ );
+
+ private static final Set<String> ENDED_AMBIGUOUS = Set.of(
+ // give|martial arts garment
+ "gi",
+ // in|I
+ "i",
+ // of|letter o
+ "o"
+ );
+
+ private static final Set<String> ENDED_UNAMBIGUOUS = Set.of(
+ // and
+ "an",
+ // for/before
+ "fo",
+ // friend
+ "frien",
+ // just
+ "jus",
+ // lord
+ "lor",
+ // myself
+ "masel",
+ // old
+ "ol",
+ // San (Francisco)
+ "Sa",
+ // shift
+ "shif",
+ // the
+ "th",
+ // what
+ "wha",
+ // world
+ "worl",
+ // Top ~500 common -ing words as English contractions.
+ "acceptin",
+ "accompanyin",
+ "accordin",
+ "accountin",
+ "achievin",
+ "acquirin",
+ "actin",
+ "addin",
+ "addressin",
+ "adjoinin",
+ "adoptin",
+ "advancin",
+ "advertisin",
+ "affectin",
+ "agin",
+ "allowin",
+ "amazin",
+ "analyzin",
+ "answerin",
+ "anythin",
+ "appearin",
+ "applyin",
+ "approachin",
+ "arguin",
+ "arisin",
+ "arrivin",
+ "askin",
+ "assessin",
+ "assumin",
+ "attackin",
+ "attemptin",
+ "attendin",
+ "avoidin",
+ "bankin",
+ "bargainin",
+ "bearin",
+ "beatin",
+ "becomin",
+ "beginnin",
+ "bein",
+ "believin",
+ "belongin",
+ "bendin",
+ "bindin",
+ "bleedin",
+ "blessin",
+ "blowin",
+ "boilin",
+ "borrowin",
+ "breakin",
+ "breathin",
+ "breedin",
+ "bringin",
+ "broadcastin",
+ "buildin",
+ "burnin",
+ "buyin",
+ "calculatin",
+ "callin",
+ "carryin",
+ "castin",
+ "causin",
+ "ceilin",
+ "challengin",
+ "changin",
+ "checkin",
+ "choosin",
+ "claimin",
+ "cleanin",
+ "clearin",
+ "climbin",
+ "closin",
+ "clothin",
+ "collectin",
+ "combinin",
+ "comin",
+ "commandin",
+ "comparin",
+ "compellin",
+ "competin",
+ "computin",
+ "concernin",
+ "concludin",
+ "conditionin",
+ "conductin",
+ "conflictin",
+ "connectin",
+ "considerin",
+ "consistin",
+ "constructin",
+ "consultin",
+ "consumin",
+ "containin",
+ "continuin",
+ "contractin",
+ "contributin",
+ "controllin",
+ "convincin",
+ "cookin",
+ "coolin",
+ "copin",
+ "correspondin",
+ "counselin",
+ "countin",
+ "couplin",
+ "coverin",
+ "creatin",
+ "crossin",
+ "cryin",
+ "cuttin",
+ "dancin",
+ "darlin",
+ "datin",
+ "dealin",
+ "decidin",
+ "declarin",
+ "declinin",
+ "decreasin",
+ "definin",
+ "demandin",
+ "denyin",
+ "dependin",
+ "descendin",
+ "describin",
+ "designin",
+ "destroyin",
+ "determinin",
+ "developin",
+ "differin",
+ "dinin",
+ "directin",
+ "discussin",
+ "distinguishin",
+ "disturbin",
+ "dividin",
+ "doin",
+ "drawin",
+ "dressin",
+ "drinkin",
+ "drivin",
+ "droppin",
+ "dryin",
+ "durin",
+ "dwellin",
+ "dyin",
+ "eatin",
+ "editin",
+ "emergin",
+ "employin",
+ "enablin",
+ "encouragin",
+ "endin",
+ "engagin",
+ "engineerin",
+ "enjoyin",
+ "enterin",
+ "establishin",
+ "evaluatin",
+ "evenin",
+ "everythin",
+ "examinin",
+ "exceedin",
+ "excitin",
+ "excludin",
+ "existin",
+ "expandin",
+ "expectin",
+ "experiencin",
+ "explainin",
+ "explorin",
+ "expressin",
+ "extendin",
+ "facin",
+ "failin",
+ "fallin",
+ "farmin",
+ "fascinatin",
+ "feedin",
+ "feelin",
+ "fightin",
+ "filin",
+ "fillin",
+ "financin",
+ "findin",
+ "firin",
+ "fishin",
+ "fittin",
+ "fixin",
+ "floatin",
+ "flowin",
+ "flyin",
+ "focusin",
+ "followin",
+ "forcin",
+ "foregoin",
+ "formin",
+ "forthcomin",
+ "foundin",
+ "freezin",
+ "fuckin",
+ "functionin",
+ "fundin",
+ "gainin",
+ "gatherin",
+ "generatin",
+ "gettin",
+ "givin",
+ "goin",
+ "governin",
+ "grantin",
+ "growin",
+ "hackin",
+ "handlin",
+ "hangin",
+ "happenin",
+ "havin",
+ "headin",
+ "healin",
+ "hearin",
+ "heatin",
+ "helpin",
+ "hidin",
+ "holdin",
+ "hopin",
+ "housin",
+ "huntin",
+ "identifyin",
+ "imagin",
+ "implementin",
+ "imposin",
+ "improvin",
+ "includin",
+ "increasin",
+ "indicatin",
+ "interestin",
+ "interpretin",
+ "introducin",
+ "involvin",
+ "joinin",
+ "judgin",
+ "keepin",
+ "killin",
+ "knowin",
+ "lackin",
+ "landin",
+ "lastin",
+ "laughin",
+ "layin",
+ "leadin",
+ "leanin",
+ "learnin",
+ "leavin",
+ "lettin",
+ "liftin",
+ "lightin",
+ "lightnin",
+ "limitin",
+ "listenin",
+ "listin",
+ "livin",
+ "loadin",
+ "lookin",
+ "losin",
+ "lovin",
+ "lowerin",
+ "lyin",
+ "maintainin",
+ "makin",
+ "managin",
+ "manufacturin",
+ "mappin",
+ "marketin",
+ "markin",
+ "matchin",
+ "meanin",
+ "measurin",
+ "meetin",
+ "meltin",
+ "minin",
+ "misleadin",
+ "missin",
+ "mixin",
+ "modelin",
+ "monitorin",
+ "mornin",
+ "movin",
+ "neighborin",
+ "nothin",
+ "notin",
+ "notwithstandin",
+ "nursin",
+ "observin",
+ "obtainin",
+ "occurrin",
+ "offerin",
+ "offsprin",
+ "ongoin",
+ "openin",
+ "operatin",
+ "opposin",
+ "orderin",
+ "organizin",
+ "outstandin",
+ "overwhelmin",
+ "packin",
+ "paintin",
+ "parkin",
+ "participatin",
+ "passin",
+ "payin",
+ "pendin",
+ "performin",
+ "pickin",
+ "pissin",
+ "placin",
+ "plannin",
+ "plantin",
+ "playin",
+ "pleasin",
+ "pointin",
+ "possessin",
+ "preachin",
+ "precedin",
+ "preparin",
+ "presentin",
+ "preservin",
+ "pressin",
+ "prevailin",
+ "preventin",
+ "pricin",
+ "printin",
+ "proceedin",
+ "processin",
+ "producin",
+ "programmin",
+ "promisin",
+ "promotin",
+ "protectin",
+ "providin",
+ "provin",
+ "publishin",
+ "pullin",
+ "purchasin",
+ "pursuin",
+ "pushin",
+ "puttin",
+ "questionin",
+ "rangin",
+ "ratin",
+ "reachin",
+ "readin",
+ "reasonin",
+ "receivin",
+ "recognizin",
+ "recordin",
+ "reducin",
+ "referrin",
+ "reflectin",
+ "refusin",
+ "regardin",
+ "regulatin",
+ "relatin",
+ "remainin",
+ "rememberin",
+ "removin",
+ "renderin",
+ "repeatin",
+ "replacin",
+ "reportin",
+ "representin",
+ "requirin",
+ "respectin",
+ "respondin",
+ "restin",
+ "resultin",
+ "returnin",
+ "revealin",
+ "ridin",
+ "risin",
+ "rulin",
+ "runnin",
+ "sailin",
+ "samplin",
+ "satisfyin",
+ "savin",
+ "sayin",
+ "scatterin",
+ "schoolin",
+ "screenin",
+ "searchin",
+ "securin",
+ "seein",
+ "seekin",
+ "selectin",
+ "sellin",
+ "sendin",
+ "separatin",
+ "servin",
+ "settin",
+ "settlin",
+ "sewin",
+ "shakin",
+ "shapin",
+ "sharin",
+ "shiftin",
+ "shinin",
+ "shippin",
+ "shittin",
+ "shootin",
+ "shoppin",
+ "showin",
+ "singin",
+ "sinkin",
+ "sittin",
+ "sleepin",
+ "smilin",
+ "smokin",
+ "spankin",
+ "solvin",
+ "somethin",
+ "speakin",
+ "spellin",
+ "spendin",
+ "spinnin",
+ "spittin",
+ "spreadin",
+ "standin",
+ "starin",
+ "startin",
+ "statin",
+ "stayin",
+ "stealin",
+ "sterlin",
+ "stimulatin",
+ "stirrin",
+ "stoppin",
+ "strengthenin",
+ "stretchin",
+ "strikin",
+ "strugglin",
+ "studyin",
+ "succeedin",
+ "sufferin",
+ "suggestin",
+ "supplyin",
+ "supportin",
+ "surprisin",
+ "surroundin",
+ "survivin",
+ "sweepin",
+ "swellin",
+ "swimmin",
+ "switchin",
+ "takin",
+ "talkin",
+ "teachin",
+ "tellin",
+ "testin",
+ "thinkin",
+ "threatenin",
+ "throwin",
+ "timin",
+ "touchin",
+ "tradin",
+ "trainin",
+ "travelin",
+ "treatin",
+ "tremblin",
+ "tryin",
+ "turnin",
+ "underlyin",
+ "understandin",
+ "undertakin",
+ "unwillin",
+ "usin",
+ "varyin",
+ "viewin",
+ "visitin",
+ "votin",
+ "waitin",
+ "walkin",
+ "wanderin",
+ "wantin",
+ "warnin",
+ "washin",
+ "watchin",
+ "wearin",
+ "weddin",
+ "whackin",
+ "willin",
+ "windin",
+ "winnin",
+ "wishin",
+ "wonderin",
+ "workin",
+ "writin",
+ "yieldin"
+ );
+}
src/main/java/com/whitemagicsoftware/keenquotes/KeenQuotes.java
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.whitemagicsoftware.keenquotes;
+
+import java.util.ArrayList;
+import java.util.Map;
+import java.util.function.Consumer;
+
+import static com.whitemagicsoftware.keenquotes.TokenType.*;
+import static java.util.Collections.sort;
+
+/**
+ * Responsible for replacing {@link Token} instances with equivalent smart
+ * quotes (or straight quotes). This will inform the caller when ambiguous
+ * quotes cannot be reliably resolved.
+ */
+public final class KeenQuotes {
+ private static final Map<TokenType, String> REPLACEMENTS = Map.of(
+ QUOTE_OPENING_SINGLE, "&lsquo;",
+ QUOTE_CLOSING_SINGLE, "&rsquo;",
+ QUOTE_OPENING_DOUBLE, "&ldquo;",
+ QUOTE_CLOSING_DOUBLE, "&rdquo;",
+ QUOTE_STRAIGHT_SINGLE, "'",
+ QUOTE_STRAIGHT_DOUBLE, "\"",
+ QUOTE_APOSTROPHE, "&apos;",
+ QUOTE_PRIME_SINGLE, "&prime;",
+ QUOTE_PRIME_DOUBLE, "&Prime;"
+ );
+
+  /**
+   * Converts straight quotes to curly quotes and primes. Any quotation marks
+   * that cannot be converted are passed to the {@link Consumer}.
+   *
+   * @param text       The text to parse.
+   * @param unresolved Recipient for ambiguous {@link Lexeme}s.
+   * @return The given text string with as many straight quotes converted to
+   * curly quotes as is feasible.
+   */
+  public static String convert(
+    final String text, final Consumer<Lexeme> unresolved ) {
+    final var parser = new Parser( text );
+    final var tokens = new ArrayList<Token>();
+
+    // Parse the tokens and consume all unresolved lexemes.
+    parser.parse( tokens::add, unresolved );
+
+    // The parser may emit tokens in any order.
+    sort( tokens );
+
+    final var result = new StringBuilder( text.length() );
+    var position = 0;
+
+    for( final var token : tokens ) {
+      final var began = token.began();
+      final var ended = token.ended();
+
+      // Tokens that start inside text that was already consumed are skipped.
+      if( position <= began ) {
+        final var replacement = REPLACEMENTS.get( token.getType() );
+
+        result.append( text, position, began );
+
+        if( replacement == null ) {
+          // No mapping for this token type: keep the original characters
+          // rather than appending the literal "null".
+          result.append( text, began, ended );
+        }
+        else {
+          result.append( replacement );
+        }
+      }
+
+      // Never move backwards: a token nested within a span that was already
+      // consumed would otherwise rewind the position and duplicate text.
+      position = Math.max( position, ended );
+    }
+
+    return result.append( text, position, text.length() ).toString();
+  }
+}
src/main/java/com/whitemagicsoftware/keenquotes/Lexeme.java
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.whitemagicsoftware.keenquotes;
+
+import static com.whitemagicsoftware.keenquotes.LexemeType.FLAG;
+
+/**
+ * Responsible for tracking the beginning and ending offsets of a lexeme within
+ * a text string. Tracking the beginning and ending indices should use less
+ * memory than duplicating the entire text of Unicode characters (i.e., using
+ * a similar approach to run-length encoding).
+ */
+public final class Lexeme implements Comparable<Lexeme> {
+ /**
+ * Signifies an invalid index to help distinguish EOT/SOT.
+ */
+ private static final int E_INDEX = -2;
+
+ /**
+ * Denotes there are no more lexemes: the end of text (EOT) has been reached.
+ * The beginning index differentiates between EOT and SOT.
+ */
+ public static final Lexeme EOT = new Lexeme( FLAG, -1, E_INDEX );
+
+ /**
+ * Denotes parsing at the start of text (SOT). This is useful to avoid
+ * branching conditions while iterating. The beginning index differentiates
+ * between EOT and SOT.
+ */
+ public static final Lexeme SOT = new Lexeme( FLAG, 0, E_INDEX );
+
+ private final LexemeType mType;
+ private final int mBegan;
+ private final int mEnded;
+
+  /**
+   * Creates a lexeme that covers a section of the text. The ending offset
+   * is given inclusively and stored exclusively (one past the last
+   * character).
+   *
+   * @param type  Type of {@link Lexeme} to create.
+   * @param began Offset into the text where this instance starts (0-based).
+   * @param ended Offset into the text where this instance stops (0-based).
+   */
+  private Lexeme( final LexemeType type, final int began, final int ended ) {
+    assert type != null;
+    assert ended == E_INDEX || began >= 0;
+    assert ended == E_INDEX || ended >= began;
+
+    mType = type;
+    mBegan = began;
+
+    // Exclusive end, so the pair can be handed to substring unchanged.
+    mEnded = 1 + ended;
+  }
+
+  /**
+   * Returns the slice of the given text that lies between this lexeme's
+   * beginning and ending offsets.
+   *
+   * @param text The text that was parsed using this class.
+   * @return The character string captured by the lexeme.
+   */
+  public String toString( final String text ) {
+    assert text != null;
+
+    final int from = mBegan;
+    final int upTo = mEnded;
+
+    return text.substring( from, upTo );
+  }
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName() + '{' +
+ "mType=" + mType +
+ ", mBegan=" + mBegan +
+ ", mEnded=" + mEnded +
+ '}';
+ }
+
+ /**
+ * Answers whether the given {@link LexemeType} is the same as this
+ * instance's internal {@link LexemeType}.
+ *
+ * @param type The {@link LexemeType} to compare.
+ * @return {@code true} if the given {@link LexemeType} is equal to the
+ * internal {@link LexemeType}.
+ */
+ public boolean isType( final LexemeType type ) {
+ assert type != null;
+ return mType == type;
+ }
+
+ /**
+ * Answers whether any of the given {@link LexemeType} matches this
+ * instance's internal {@link LexemeType}.
+ *
+ * @param types The {@link LexemeType}s to compare.
+ * @return {@code true} if the internal {@link LexemeType} matches any one
+ * of the given {@link LexemeType}s.
+ */
+ public boolean anyType( final LexemeType... types ) {
+ assert types != null;
+
+ for( final var type : types ) {
+ if( mType == type ) {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ public int began() {
+ return mBegan;
+ }
+
+ public int ended() {
+ return mEnded;
+ }
+
+ boolean isSot() {
+ return mBegan == 0;
+ }
+
+ boolean isEot() {
+ return mBegan == -1;
+ }
+
+ LexemeType getType() {
+ return mType;
+ }
+
+ /**
+ * Compares the starting offset of the given {@link Lexeme} to the starting
+ * offset of this {@link Lexeme} instance. This allows a list {@link Lexeme}s
+ * to be sorted by order of appearance in the parsed text.
+ */
+ @Override
+ public int compareTo( final Lexeme that ) {
+ assert that != null;
+ return this.mBegan - that.mBegan;
+ }
+
+ static Lexeme createLexeme(
+ final LexemeType lexeme, final int began, final int ended ) {
+ assert lexeme != null;
+ return new Lexeme( lexeme, began, ended );
+ }
+}
src/main/java/com/whitemagicsoftware/keenquotes/LexemeType.java
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.whitemagicsoftware.keenquotes;
+
+/**
+ * Represents the type of a {@link Lexeme} parsed by the {@link Lexer}.
+ * The {@link Lexer} assigns the types as follows:
+ * <ul>
+ * <li>{@link #QUOTE_SINGLE}: a straight single quote.</li>
+ * <li>{@link #QUOTE_SINGLE_OPENING}: a curly opening single quote (‘).</li>
+ * <li>{@link #QUOTE_SINGLE_CLOSING}: a curly closing single quote (’).</li>
+ * <li>{@link #QUOTE_DOUBLE}: a straight double quote.</li>
+ * <li>{@link #ESC_SINGLE}: a backslash immediately followed by a straight
+ * single quote.</li>
+ * <li>{@link #ESC_DOUBLE}: a backslash immediately followed by a straight
+ * double quote.</li>
+ * <li>{@link #EOL}: a run of line break characters made of exactly one
+ * carriage return, exactly one line feed, or exactly one of each.</li>
+ * <li>{@link #EOP}: any other run of consecutive carriage returns and
+ * line feeds.</li>
+ * <li>{@link #SPACE}: a single space or other whitespace character.</li>
+ * <li>{@link #WORD}: a run of letters, where underscores and asterisks
+ * count as letters.</li>
+ * <li>{@link #NUMBER}: a run of digits that may include a period, comma,
+ * hyphen, plus sign, or caret wherever that symbol is immediately followed
+ * by a digit.</li>
+ * <li>{@link #PUNCT}: any character not covered by another type.</li>
+ * <li>{@link #OPENING_GROUP}: an opening parenthesis, brace, or
+ * bracket.</li>
+ * <li>{@link #CLOSING_GROUP}: a closing parenthesis, brace, or
+ * bracket.</li>
+ * <li>{@link #HYPHEN}: a single hyphen that starts neither a dash nor a
+ * number.</li>
+ * <li>{@link #DASH}: a run of hyphens and em-dashes that starts with two
+ * hyphens or with an em-dash.</li>
+ * <li>{@link #EQUALS}: an equals sign.</li>
+ * <li>{@link #PERIOD}: a single period.</li>
+ * <li>{@link #ELLIPSIS}: two or more consecutive periods.</li>
+ * <li>{@link #FLAG}: not produced from text; used by the {@link Lexeme}
+ * start-of-text and end-of-text sentinels.</li>
+ * </ul>
+ */
+public enum LexemeType {
+ QUOTE_SINGLE,
+ QUOTE_SINGLE_OPENING,
+ QUOTE_SINGLE_CLOSING,
+ QUOTE_DOUBLE,
+ ESC_SINGLE,
+ ESC_DOUBLE,
+ EOL,
+ EOP,
+ SPACE,
+ WORD,
+ NUMBER,
+ PUNCT,
+ OPENING_GROUP,
+ CLOSING_GROUP,
+ HYPHEN,
+ DASH,
+ EQUALS,
+ PERIOD,
+ ELLIPSIS,
+ FLAG
+}
src/main/java/com/whitemagicsoftware/keenquotes/Lexer.java
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.whitemagicsoftware.keenquotes;
+
+import java.text.CharacterIterator;
+import java.text.StringCharacterIterator;
+import java.util.function.BiFunction;
+
+import static com.whitemagicsoftware.keenquotes.Lexeme.createLexeme;
+import static com.whitemagicsoftware.keenquotes.LexemeType.*;
+import static java.lang.Character.isDigit;
+import static java.lang.Character.isWhitespace;
+import static java.text.CharacterIterator.DONE;
+
+/**
+ * Turns text into words, numbers, punctuation, spaces, and more. Each call
+ * to {@code next()} returns one {@link Lexeme} that records offsets into the
+ * text given at construction; {@link Lexeme#EOT} is returned once the text
+ * has been exhausted.
+ */
+public class Lexer {
+ /**
+ * Iterates over the entire string of text to help produce lexemes. Its
+ * position is the only state carried between calls.
+ */
+ private final CharacterIterator mIterator;
+
+ /**
+ * Creates a lexer that will split the given text into lexemes.
+ *
+ * @param text The text to tokenize.
+ */
+ public Lexer( final String text ) {
+ mIterator = new StringCharacterIterator( text );
+ }
+
+ public Lexeme next() {
+ return parse( mIterator );
+ }
+
+ /**
+ * Tokenizes a sequence of characters. The order of comparisons is optimized
+ * towards probability of the occurrence of a character in regular English
+ * prose: letters, space, quotation marks, numbers, periods, new lines,
+ * then end of text.
+ * <p>
+ * The helper {@code peek} looks one character ahead without consuming it,
+ * while {@code slurp} consumes a run of matching characters. A backslash
+ * that does not precede a straight quote produces no lexeme of its own:
+ * scanning continues and the starting offset still refers to the backslash.
+ * </p>
+ *
+ * @param i The sequence of characters to tokenize.
+ * @return The next lexeme in the sequence, or {@link Lexeme#EOT} when no
+ * characters remain.
+ */
+ private Lexeme parse( final CharacterIterator i ) {
+ int began = i.getIndex();
+ boolean isWord = false;
+ Lexeme lexeme = null;
+
+ do {
+ final var curr = i.current();
+
+ if( isLetter( curr ) ) {
+ isWord = true;
+
+ if( !isLetter( peek( i ) ) ) {
+ lexeme = createLexeme( WORD, began, i.getIndex() );
+ }
+ }
+ else if( curr == ' ' ) {
+ lexeme = createLexeme( SPACE, began, i.getIndex() );
+ }
+ else if( curr == '\'' ) {
+ lexeme = createLexeme( QUOTE_SINGLE, began, i.getIndex() );
+ }
+ else if( curr == '"' ) {
+ lexeme = createLexeme( QUOTE_DOUBLE, began, i.getIndex() );
+ }
+ else if( curr == '‘') {
+ lexeme = createLexeme( QUOTE_SINGLE_OPENING, began, i.getIndex() );
+ }
+ else if( curr == '’') {
+ lexeme = createLexeme( QUOTE_SINGLE_CLOSING, began, i.getIndex() );
+ }
+ else if( curr == '-' && peek( i ) == '-' || curr == '—' ) {
+ slurp( i, ( next, ci ) -> next == '-' || next == '—' );
+
+ lexeme = createLexeme( DASH, began, i.getIndex() );
+ }
+ else if( isDigit( curr ) || isNumeric( curr ) && isDigit( peek( i ) ) ) {
+ // Parse all consecutive number characters to prevent the main loop
+ // from switching back to word tokens.
+ slurp( i, ( next, ci ) ->
+ isDigit( next ) || isNumeric( next ) && isDigit( peek( ci ) )
+ );
+
+ lexeme = createLexeme( isWord ? WORD : NUMBER, began, i.getIndex() );
+ }
+ else if( curr == '-' ) {
+ lexeme = createLexeme( HYPHEN, began, i.getIndex() );
+ }
+ else if( curr == '.' ) {
+ // Parse all consecutive periods into an ellipsis lexeme. This will
+ // not capture space-separated ellipsis (such as ". . .").
+ lexeme = createLexeme(
+ slurp( i, ( next, ci ) -> next == '.' ) == 0 ? PERIOD : ELLIPSIS,
+ began, i.getIndex()
+ );
+ }
+ else if( curr == '\r' || curr == '\n' ) {
+ final var cr = new int[]{curr == '\r' ? 1 : 0};
+ final var lf = new int[]{curr == '\n' ? 1 : 0};
+
+ // Swallow all consecutive CR (Mac), CRLF (Windows), and/or LF (Unix), tallying each kind.
+ slurp(
+ i, ( next, ci ) -> {
+ cr[ 0 ] += next == '\r' ? 1 : 0;
+ lf[ 0 ] += next == '\n' ? 1 : 0;
+ return next == '\r' || next == '\n';
+ }
+ );
+
+ final var eol = cr[ 0 ] + lf[ 0 ] == 1 || cr[ 0 ] == 1 && lf[ 0 ] == 1;
+ lexeme = createLexeme( eol ? EOL : EOP, began, i.getIndex() );
+ }
+ else if( isWhitespace( curr ) ) {
+ lexeme = createLexeme( SPACE, began, i.getIndex() );
+ }
+ else if( curr == '\\' ) {
+ final var next = i.next();
+
+ if( next == '\'' ) {
+ lexeme = createLexeme( ESC_SINGLE, began, i.getIndex() );
+ }
+ else if( next == '\"' ) {
+ lexeme = createLexeme( ESC_DOUBLE, began, i.getIndex() );
+ }
+ else {
+ // Push back the escaped character, which wasn't a straight quote.
+ i.previous();
+ }
+ }
+ else if( curr == '(' || curr == '{' || curr == '[' ) {
+ lexeme = createLexeme( OPENING_GROUP, began, i.getIndex() );
+ }
+ else if( curr == ')' || curr == '}' || curr == ']' ) {
+ lexeme = createLexeme( CLOSING_GROUP, began, i.getIndex() );
+ }
+ else if( curr == '=' ) {
+ lexeme = createLexeme( EQUALS, began, i.getIndex() );
+ }
+ else if( curr != DONE ) {
+ lexeme = createLexeme( PUNCT, began, i.getIndex() );
+ }
+ else {
+ lexeme = Lexeme.EOT;
+ }
+
+ i.next();
+ }
+ while( lexeme == null );
+
+ return lexeme;
+ }
+
+ /**
+ * Answers whether the given character can be considered part of a word
+ * or not. This will include {@code _} and {@code *} because plain text
+ * formats often use those characters to emphasize a word.
+ *
+ * @param curr The character to check as being part of a word.
+ * @return {@code true} if the given character is a letter or a formatting
+ * indicator.
+ */
+ private static boolean isLetter( final char curr ) {
+ return Character.isLetter( curr ) || curr == '_' || curr == '*';
+ }
+
+ /**
+ * Answers whether the given character can be considered part of a number
+ * or not. This does not include digits, which are checked independently
+ * from this method.
+ *
+ * @param curr The character to check as being related to numbers.
+ * @return {@code true} if the given character can be considered part of
+ * a number (e.g., -2,000.2^2 is considered a single number).
+ */
+ private static boolean isNumeric( final char curr ) {
+ return
+ curr == '.' || curr == ',' || curr == '-' || curr == '+' || curr == '^';
+ }
+
+ private static char peek( final CharacterIterator ci ) {
+ final var ch = ci.next();
+ ci.previous();
+ return ch;
+ }
+
+ /**
+ * Parse all characters that match a given function. Starting with the
+ * character after the iterator's current position, characters are consumed
+ * for as long as the function answers {@code true}. On return, the iterator
+ * has been stepped back one position from the first character that failed
+ * to match.
+ *
+ * @param ci The iterator containing characters to parse.
+ * @param f The function that determines when slurping stops: it receives
+ * the next character and the iterator, and answers {@code true}
+ * to continue slurping.
+ * @return The number of characters that matched, which excludes the
+ * character that was current when this method was called.
+ */
+ private static int slurp(
+ final CharacterIterator ci,
+ final BiFunction<Character, CharacterIterator, Boolean> f ) {
+ char next;
+ int count = 0;
+
+ do {
+ next = ci.next();
+ count++;
+ }
+ while( f.apply( next, ci ) );
+
+ // The loop above will overshoot the tally by one character.
+ ci.previous();
+
+ return --count;
+ }
+}
src/main/java/com/whitemagicsoftware/keenquotes/Parser.java
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.whitemagicsoftware.keenquotes;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.Consumer;
+
+import static com.whitemagicsoftware.keenquotes.Lexeme.EOT;
+import static com.whitemagicsoftware.keenquotes.Lexeme.SOT;
+import static com.whitemagicsoftware.keenquotes.LexemeType.*;
+import static com.whitemagicsoftware.keenquotes.TokenType.*;
+
+/**
+ * Converts straight double/single quotes and apostrophes to curly equivalents.
+ * <p>
+ * Lexemes are examined three at a time. Quotes whose meaning is clear from
+ * that context are emitted as {@link Token}s immediately; the rest are kept
+ * in a list of unresolved lexemes that {@code resolve} revisits whenever an
+ * end-of-paragraph lexeme is encountered and once more after the end of the
+ * text. Any quotes that still cannot be resolved are reported to a separate
+ * consumer.
+ * </p>
+ */
+public final class Parser {
+ /**
+ * Single quotes preceded by these {@link LexemeType}s may be opening quotes.
+ */
+ private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE =
+ new LexemeType[]{SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP};
+
+ /**
+ * Single quotes succeeded by these {@link LexemeType}s may be opening quotes.
+ */
+ private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE =
+ new LexemeType[]{WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE};
+
+ /**
+ * Single quotes preceded by these {@link LexemeType}s may be closing quotes.
+ */
+ private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE =
+ new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE};
+
+ /**
+ * Single quotes succeeded by these {@link LexemeType}s may be closing quotes.
+ */
+ private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE =
+ new LexemeType[]{SPACE, HYPHEN, DASH,
+ QUOTE_DOUBLE, CLOSING_GROUP, EOL, EOP};
+
+ /**
+ * Double quotes preceded by these {@link LexemeType}s may be opening quotes.
+ */
+ private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE =
+ new LexemeType[]{SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL, EOP};
+
+ /**
+ * Double quotes succeeded by these {@link LexemeType}s may be opening quotes.
+ */
+ private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE =
+ new LexemeType[]{WORD, NUMBER, ELLIPSIS,
+ QUOTE_SINGLE, QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING, QUOTE_DOUBLE};
+
+ /**
+ * Double quotes preceded by these {@link LexemeType}s may be closing quotes.
+ */
+ private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE =
+ new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS,
+ QUOTE_SINGLE, QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING};
+
+ /**
+ * Double quotes succeeded by these {@link LexemeType}s may be closing quotes.
+ */
+ private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE =
+ new LexemeType[]{SPACE, PUNCT, PERIOD, EQUALS, HYPHEN, DASH,
+ QUOTE_SINGLE, CLOSING_GROUP, EOL, EOP};
+
+ /**
+ * The text to parse. A reference is required as a minor optimization in
+ * memory and speed: the lexer records integer offsets, rather than new
+ * {@link String} instances, to track parsed lexemes.
+ */
+ private final String mText;
+
+ /**
+ * Converts a string into an iterable list of {@link Lexeme} instances.
+ */
+ private final Lexer mLexer;
+
+ /**
+ * Sets of contractions that help disambiguate single quotes in the text.
+ * These are effectively immutable while parsing.
+ */
+ private final Contractions sContractions;
+
+ /**
+ * Incremented for each opening single quote emitted. Used to help resolve
+ * ambiguities when single quote marks are balanced. Reset to zero after
+ * each end-of-paragraph resolution.
+ */
+ private int mOpeningSingleQuote;
+
+ /**
+ * Incremented for each closing single quote emitted. Used to help resolve
+ * ambiguities when single quote marks are balanced. Reset to zero after
+ * each end-of-paragraph resolution.
+ */
+ private int mClosingSingleQuote;
+
+ /**
+ * Constructs a new {@link Parser} using the default contraction sets
+ * to help resolve some ambiguous scenarios.
+ *
+ * @param text The prose to parse, containing zero or more quotation
+ * characters.
+ */
+ public Parser( final String text ) {
+ this( text, new Contractions.Builder().build() );
+ }
+
+ /**
+ * Constructs a new {@link Parser} using the given contraction sets
+ * to help resolve some ambiguous scenarios.
+ *
+ * @param text The prose to parse, containing zero or more quotation
+ * characters.
+ * @param contractions Custom sets of contractions to help resolve
+ * ambiguities.
+ */
+ public Parser( final String text, final Contractions contractions ) {
+ mText = text;
+ mLexer = new Lexer( mText );
+ sContractions = contractions;
+ }
+
+ /**
+ * Iterates over the entire text provided at construction, emitting
+ * {@link Token}s that can be used to convert straight quotes to curly
+ * quotes.
+ *
+ * @param tokenConsumer Receives emitted {@link Token}s.
+ * @param lexemeConsumer Receives each quote {@link Lexeme} that remained
+ * unresolved after an attempt to resolve ambiguities.
+ */
+ public void parse(
+ final Consumer<Token> tokenConsumer,
+ final Consumer<Lexeme> lexemeConsumer ) {
+ final var lexemes = new CircularFifoQueue<Lexeme>( 3 );
+
+ // Allow consuming the very first token without checking the queue size.
+ flush( lexemes );
+
+ final var unresolved = new ArrayList<Lexeme[]>();
+ Lexeme lexeme;
+
+ // Create and convert a list of all unambiguous quote characters.
+ while( (lexeme = mLexer.next()) != EOT ) {
+ if( tokenize( lexeme, lexemes, tokenConsumer, unresolved ) ) {
+ // Attempt to resolve any remaining ambiguous quotes.
+ resolve( unresolved, tokenConsumer );
+
+ // Notify of any ambiguous quotes that could not be resolved.
+ unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) );
+ unresolved.clear();
+ mOpeningSingleQuote = 0;
+ mClosingSingleQuote = 0;
+ }
+ }
+
+ // By loop's end, the lexemes list contains tokens for all except the
+ // final two elements (from tokenizing in triplets). Tokenize the remaining
+ // unprocessed lexemes.
+ tokenize( EOT, lexemes, tokenConsumer, unresolved );
+ tokenize( EOT, lexemes, tokenConsumer, unresolved );
+
+ // Attempt to resolve any remaining ambiguous quotes.
+ resolve( unresolved, tokenConsumer );
+
+ // Notify of any ambiguous quotes that could not be resolved.
+ unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) );
+ }
+
+ /**
+ * Converts {@link Lexeme}s identified as straight quotes into {@link Token}s
+ * that represent the curly equivalent. The {@link Token}s are passed to
+ * the given {@link Consumer} for further processing (e.g., replaced in
+ * the original text being parsed).
+ *
+ * @param lexeme A part of the text being parsed.
+ * @param lexemes A 3-element queue of lexemes that provide sufficient
+ * context to identify curly quotes.
+ * @param consumer Recipient of equivalent quotes.
+ * @param unresolved Rolling list of potentially ambiguous {@link Lexeme}s
+ * that could not be tokenized, yet. Each entry holds the
+ * three lexemes of context with the quote in the middle.
+ * @return {@code true} if an end-of-paragraph is detected.
+ */
+ private boolean tokenize( final Lexeme lexeme,
+ final CircularFifoQueue<Lexeme> lexemes,
+ final Consumer<Token> consumer,
+ final List<Lexeme[]> unresolved ) {
+ // Add the next lexeme to tokenize into the queue for immediate processing.
+ lexemes.add( lexeme );
+
+ final var lex1 = lexemes.get( 0 );
+ final var lex2 = lexemes.get( 1 );
+ final var lex3 = lexemes.get( 2 );
+
+ if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( WORD ) &&
+ lex1.anyType( WORD, PERIOD, NUMBER ) ) {
+ // Examples: y'all, Ph.D.'ll, 20's, she's
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
+ }
+ else if( lex1.isType( QUOTE_SINGLE ) && lex3.isType( QUOTE_SINGLE ) &&
+ "n".equalsIgnoreCase( lex2.toString( mText ) ) ) {
+ // I.e., 'n'
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
+ flush( lexemes );
+ truncate( unresolved );
+ }
+ else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) {
+ if( lex3.isType( QUOTE_SINGLE ) ) {
+ // E.g., 2''
+ consumer.accept(
+ new Token( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ) );
+ flush( lexemes );
+ }
+ else {
+ // E.g., 2'
+ consumer.accept( new Token( QUOTE_PRIME_SINGLE, lex2 ) );
+ }
+ }
+ else if( lex2.isType( QUOTE_DOUBLE ) && lex1.isType( NUMBER ) ) {
+ // E.g., 2"
+ consumer.accept( new Token( QUOTE_PRIME_DOUBLE, lex2 ) );
+ }
+ else if( lex2.isType( WORD ) && lex3.isType( QUOTE_SINGLE ) &&
+ sContractions.endedUnambiguously( lex2.toString( mText ) ) ) {
+ // E.g., thinkin'
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
+ flush( lexemes );
+ }
+ else if( lex2.isType( NUMBER ) && lex1.isType( QUOTE_SINGLE ) ) {
+ if( lex3.anyType( SPACE, PUNCT ) || (lex3.isType( WORD ) &&
+ lex3.toString( mText ).equalsIgnoreCase( "s" )) ) {
+ // Sentences must be re-written to avoid starting with numerals.
+ // Examples: '20s, '02
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
+ }
+ else {
+ // E.g., '2''
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex1 ) );
+ mOpeningSingleQuote++;
+ }
+
+ truncate( unresolved );
+ }
+ else if( lex2.isType( QUOTE_SINGLE ) &&
+ lex1.anyType( PUNCT, PERIOD, ELLIPSIS, DASH ) &&
+ (lex3.anyType( EOL, EOP ) || lex3.isEot()) ) {
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
+ mClosingSingleQuote++;
+ }
+ else if( lex1.isType( ESC_SINGLE ) ) {
+ // E.g., \'
+ consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) );
+ }
+ else if( lex1.isType( ESC_DOUBLE ) ) {
+ // E.g., \"
+ consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) );
+
+ if( lex2.isType( QUOTE_SINGLE ) &&
+ (lex3.isEot() || lex3.anyType( SPACE, DASH, EOL, EOP )) ) {
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
+ mClosingSingleQuote++;
+ }
+ }
+ else if( lex2.isType( QUOTE_DOUBLE ) &&
+ (lex1.isSot() || lex1.anyType( LEADING_QUOTE_OPENING_DOUBLE )) &&
+ lex3.anyType( LAGGING_QUOTE_OPENING_DOUBLE ) ) {
+ // Examples: "", "..., "word, ---"word
+ consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) );
+ }
+ else if( lex2.isType( QUOTE_DOUBLE ) &&
+ lex1.anyType( LEADING_QUOTE_CLOSING_DOUBLE ) &&
+ (lex3.isEot() || lex3.anyType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) {
+ // Examples: ..."', word"', ?"', word"?
+ consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) );
+ }
+ else if( lex1.isType( QUOTE_SINGLE ) &&
+ lex2.anyType( PUNCT, PERIOD, DASH ) && lex3.isType( QUOTE_DOUBLE ) ) {
+ // E.g., '," (contraction ruled out from previous conditionals)
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex1 ) );
+ truncate( unresolved );
+ mClosingSingleQuote++;
+ }
+ else if( lex2.anyType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
+ // After tokenizing, the parser will attempt to resolve ambiguities.
+ unresolved.add( new Lexeme[]{lex1, lex2, lex3} );
+ }
+
+ // Suggest to the caller that resolution should be performed. This allows
+ // the algorithm to reset the opening/closing quote balance before the
+ // next paragraph is parsed.
+ return lex3.isType( EOP );
+ }
+
+ private void resolve(
+ final List<Lexeme[]> unresolved, final Consumer<Token> consumer ) {
+ // Some non-emitted tokenized lexemes may be ambiguous; only single quotes are examined here.
+ final var ambiguousLeadingQuotes = new ArrayList<Lexeme[]>( 16 );
+ final var ambiguousLaggingQuotes = new ArrayList<Lexeme[]>( 16 );
+ var resolvedLeadingQuotes = 0;
+ var resolvedLaggingQuotes = 0;
+
+ // Count the number of ambiguous and non-ambiguous open single quotes.
+ for( var i = unresolved.iterator(); i.hasNext(); ) {
+ final var quotes = i.next();
+ final var lex1 = quotes[ 0 ];
+ final var lex2 = quotes[ 1 ];
+ final var lex3 = quotes[ 2 ];
+
+ if( lex2.isType( QUOTE_SINGLE ) ) {
+ final var word1 = lex1 == SOT ? "" : lex1.toString( mText );
+ final var word3 = lex3 == EOT ? "" : lex3.toString( mText );
+
+ if( sContractions.beganAmbiguously( word3 ) ) {
+ // E.g., 'Cause
+ if( lex1.isType( QUOTE_SINGLE ) ) {
+ // E.g., ''Cause
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
+ i.remove();
+ }
+ else {
+ // The contraction is uncertain until a closing quote is found that
+ // may balance this single quote.
+ ambiguousLeadingQuotes.add( quotes );
+ }
+ }
+ else if( sContractions.beganUnambiguously( word3 ) ) {
+ // The quote mark forms a word that does not stand alone from its
+ // contraction. For example, twas is not a word: it's 'twas.
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
+ i.remove();
+ }
+ else if( sContractions.endedAmbiguously( word1 ) ) {
+ ambiguousLaggingQuotes.add( quotes );
+ }
+ else if( (lex1.isSot() || lex1.anyType( LEADING_QUOTE_OPENING_SINGLE ))
+ && lex3.anyType( LAGGING_QUOTE_OPENING_SINGLE ) ) {
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) );
+ resolvedLeadingQuotes++;
+ mOpeningSingleQuote++;
+ i.remove();
+ }
+ else if( lex1.anyType( LEADING_QUOTE_CLOSING_SINGLE ) &&
+ (lex3.isEot() || lex3.anyType( LAGGING_QUOTE_CLOSING_SINGLE )) ) {
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
+ resolvedLaggingQuotes++;
+ mClosingSingleQuote++;
+ i.remove();
+ }
+ else if( lex3.isType( NUMBER ) ) {
+ // E.g., '04
+ ambiguousLeadingQuotes.add( quotes );
+ }
+ }
+ }
+
+ final var ambiguousLeadingCount = ambiguousLeadingQuotes.size();
+ final var ambiguousLaggingCount = ambiguousLaggingQuotes.size();
+
+ if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) {
+ if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) {
+ final var balanced = mClosingSingleQuote - mOpeningSingleQuote == 0;
+ final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE;
+ final var lex = ambiguousLaggingQuotes.get( 0 );
+ consumer.accept( new Token( quote, lex[ 1 ] ) );
+ unresolved.remove( lex );
+ }
+ else if( ambiguousLeadingCount == 0 && unresolved.size() == 1 ) {
+ // Must be a closing quote.
+ final var closing = unresolved.get( 0 );
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
+ unresolved.remove( closing );
+ }
+ }
+ else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) {
+ // If there are no ambiguous leading quotes then all ambiguous lagging
+ // quotes must be contractions.
+ ambiguousLaggingQuotes.forEach(
+ lex -> {
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex[ 1 ] ) );
+ unresolved.remove( lex );
+ }
+ );
+ }
+ else if( ambiguousLeadingCount == 0 ) {
+ if( resolvedLaggingQuotes < resolvedLeadingQuotes ) {
+ for( final var i = unresolved.iterator(); i.hasNext(); ) {
+ final var closing = i.next()[ 1 ];
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing ) );
+ i.remove();
+ }
+ }
+ else if( mOpeningSingleQuote - mClosingSingleQuote == unresolved.size() ) {
+ for( final var i = unresolved.iterator(); i.hasNext(); ) {
+ final var closing = i.next();
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
+ i.remove();
+ }
+ }
+ else if( unresolved.size() == 2 ) {
+ final var closing = unresolved.get( 0 );
+ final var opening = unresolved.get( 1 );
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
+
+ // Doesn't affect the algorithm.
+ unresolved.clear();
+ }
+ }
+ else if( ambiguousLeadingCount == 1 && resolvedLaggingQuotes == 1 ) {
+ final var opening = ambiguousLeadingQuotes.get( 0 );
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
+ unresolved.remove( opening );
+ }
+ }
+
+ /**
+ * Removes the most recently added entry from the given list, if any.
+ *
+ * @param unresolved The list of {@link Lexeme}s to modify.
+ */
+ private void truncate( final List<Lexeme[]> unresolved ) {
+ if( !unresolved.isEmpty() ) {
+ unresolved.remove( unresolved.size() - 1 );
+ }
+ }
+
+ /**
+ * Overwrites the {@link CircularFifoQueue}'s contents with start-of-text
+ * indicators by adding three of them.
+ *
+ * @param lexemes The queue to overflow.
+ */
+ private void flush( final CircularFifoQueue<Lexeme> lexemes ) {
+ lexemes.add( SOT );
+ lexemes.add( SOT );
+ lexemes.add( SOT );
+ }
+}
src/main/java/com/whitemagicsoftware/keenquotes/Token.java
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.whitemagicsoftware.keenquotes;
+
+/**
+ * Represents a high-level token read from the text. A token pairs a
+ * {@link TokenType} with the offsets into the text that it covers.
+ */
+final class Token implements Comparable<Token> {
+  /** Kind of quotation mark represented by this token. */
+  private final TokenType mType;
+
+  /** Offset into the text where the token starts (0-based, inclusive). */
+  final int mBegan;
+
+  /** Offset into the text where the token stops (0-based, exclusive). */
+  final int mEnded;
+
+  /**
+   * Convenience constructor to create a token that uses the lexeme's
+   * beginning and ending offsets to represent a complete token.
+   *
+   * @param type   The type of {@link Token} to create.
+   * @param lexeme Container for beginning and ending text offsets.
+   */
+  Token( final TokenType type, final Lexeme lexeme ) {
+    this( type, lexeme.began(), lexeme.ended() );
+  }
+
+  /**
+   * This constructor can be used to create tokens that span more than a
+   * single character. Almost all tokens represent a single character, only
+   * the double-prime sequence ({@code ''}) is more than one character.
+   *
+   * @param type  The type of {@link Token} to create.
+   * @param began Beginning offset into text where token is found.
+   * @param ended Ending offset into text where token is found; must be
+   *              greater than the beginning offset.
+   */
+  Token( final TokenType type, final int began, final int ended ) {
+    assert type != null;
+    assert began >= 0;
+    assert ended > began;
+
+    mType = type;
+    mBegan = began;
+    mEnded = ended;
+  }
+
+  /**
+   * Returns the kind of quotation mark represented by this token.
+   *
+   * @return The type given at construction.
+   */
+  TokenType getType() {
+    return mType;
+  }
+
+  /**
+   * Returns the offset into the text where this token starts.
+   *
+   * @return The beginning offset given at construction.
+   */
+  int began() {
+    return mBegan;
+  }
+
+  /**
+   * Returns the offset into the text where this token stops.
+   *
+   * @return The ending offset given at construction.
+   */
+  int ended() {
+    return mEnded;
+  }
+
+  /**
+   * Orders tokens by their beginning offsets, which allows a list of tokens
+   * to be sorted by order of appearance in the text.
+   *
+   * @param that The token to compare against.
+   * @return A negative, zero, or positive value when this token starts
+   * before, at, or after the given token, respectively.
+   */
+  @Override
+  public int compareTo( final Token that ) {
+    assert that != null;
+
+    // Avoids the overflow risk inherent in subtracting two integers.
+    return Integer.compare( this.mBegan, that.mBegan );
+  }
+}
src/main/java/com/whitemagicsoftware/keenquotes/TokenType.java
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.whitemagicsoftware.keenquotes;
+
+/**
+ * The {@link Parser} emits these token types.
+ * <ul>
+ * <li>{@link #QUOTE_OPENING_SINGLE}, {@link #QUOTE_CLOSING_SINGLE}: a
+ * straight single quote identified as opening or closing a quotation.</li>
+ * <li>{@link #QUOTE_OPENING_DOUBLE}, {@link #QUOTE_CLOSING_DOUBLE}: a
+ * straight double quote identified as opening or closing a quotation.</li>
+ * <li>{@link #QUOTE_APOSTROPHE}: a straight single quote identified as part
+ * of a contraction or abbreviation (e.g., y'all, 'n', thinkin', '20s).</li>
+ * <li>{@link #QUOTE_STRAIGHT_SINGLE}, {@link #QUOTE_STRAIGHT_DOUBLE}: a
+ * backslash-escaped single or double quote.</li>
+ * <li>{@link #QUOTE_PRIME_SINGLE}: a single quote directly after a number
+ * (e.g., 2').</li>
+ * <li>{@link #QUOTE_PRIME_DOUBLE}: two single quotes or one double quote
+ * directly after a number (e.g., 2'' or 2").</li>
+ * </ul>
+ */
+enum TokenType {
+ QUOTE_OPENING_SINGLE,
+ QUOTE_OPENING_DOUBLE,
+ QUOTE_CLOSING_SINGLE,
+ QUOTE_CLOSING_DOUBLE,
+ QUOTE_APOSTROPHE,
+ QUOTE_STRAIGHT_SINGLE,
+ QUOTE_STRAIGHT_DOUBLE,
+ QUOTE_PRIME_SINGLE,
+ QUOTE_PRIME_DOUBLE,
+}
src/test/java/com/whitemagicsoftware/keenquotes/KeenQuotesTest.java
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.whitemagicsoftware.keenquotes;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.function.Function;
+
+import static java.lang.System.out;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+/**
+ * Test that English straight quotes are converted to curly quotes and
+ * apostrophes.
+ */
+public class KeenQuotesTest {
+  /**
+   * This is a single-use test that is useful for debugging. The converted
+   * text and any reported lexemes are printed, not asserted.
+   */
+  @Test
+  //@Disabled
+  public void test_parse_SingleLine_Parsed() {
+    out.println( KeenQuotes.convert(
+      "\"’Kearney lives on the banks of Killarney—’",
+      out::println
+    ) );
+  }
+
+  /**
+   * Tests that straight quotes are converted to curly quotes.
+   *
+   * @throws IOException Error opening file full of fun.
+   */
+  @Test
+  public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException {
+    testConverter( text -> KeenQuotes.convert( text, ( lexeme ) -> {} ) );
+  }
+
+  /**
+   * Converts an entire story and prints the result along with any lexemes
+   * reported during conversion. Nothing is asserted.
+   *
+   * @param filename Name of the resource to read, without its extension.
+   * @throws IOException Error reading the resource.
+   */
+  @ParameterizedTest
+  @ValueSource( strings = {"westrup"} )
+  void test_Parse_Story_Converted( final String filename ) throws IOException {
+    // Reserve 1 MiB up front; note that ^ is exclusive-or in Java, not
+    // exponentiation, so a shift is used to compute the power of two.
+    final var sb = new StringBuilder( 1 << 20 );
+
+    try( final var reader = open( filename + ".txt" ) ) {
+      String line;
+
+      while( (line = reader.readLine()) != null ) {
+        sb.append( line ).append( '\n' );
+      }
+    }
+
+    out.println( KeenQuotes.convert( sb.toString(), out::println ) );
+  }
+
+  /**
+   * Reads a file full of couplets. The first of the pair is the input,
+   * the second of the pair is the expected result. Couplets may include
+   * newline characters to indicate end of lines and end of paragraphs.
+   * Lines that start with {@code #} are ignored, as are blank lines.
+   *
+   * @param parser The text processor capable of straight quote conversion.
+   * @throws IOException Error opening file full of fun.
+   */
+  private void testConverter( final Function<String, String> parser )
+    throws IOException {
+    try( final var reader = open( "smartypants.txt" ) ) {
+      String line;
+      String testLine = "";
+      String expected = "";
+
+      while( ((line = reader.readLine()) != null) ) {
+        if( line.startsWith( "#" ) || line.isBlank() ) { continue; }
+
+        // Read the first line of the couplet.
+        if( testLine.isBlank() ) {
+          testLine = line;
+          continue;
+        }
+
+        // Read the second line of the couplet.
+        if( expected.isBlank() ) {
+          expected = line;
+        }
+
+        testLine = unescapeEol( testLine );
+        expected = unescapeEol( expected );
+
+        final var actual = parser.apply( testLine );
+        assertEquals( expected, actual );
+
+        // Start reading the next couplet.
+        testLine = "";
+        expected = "";
+      }
+    }
+  }
+
+  /**
+   * Replaces each two-character sequence of a backslash followed by
+   * {@code n} with a newline character.
+   *
+   * @param s The text containing escaped newlines.
+   * @return The text with escaped newlines converted to newline characters.
+   */
+  private static String unescapeEol( final String s ) {
+    return String.join( "\n", s.split( "\\\\n" ) );
+  }
+
+  /**
+   * Opens a text file for reading. Callers are responsible for closing.
+   * The file is decoded as UTF-8, rather than the platform's default
+   * character set, because the test data contains curly quotation marks.
+   *
+   * @param filename The file to open.
+   * @return An instance of {@link BufferedReader} that can be used to
+   * read all the lines in the file.
+   */
+  private BufferedReader open( final String filename ) {
+    final var is = getClass().getResourceAsStream( filename );
+    assertNotNull( is );
+
+    return new BufferedReader(
+      new InputStreamReader( is, java.nio.charset.StandardCharsets.UTF_8 )
+    );
+  }
+}
src/test/java/com/whitemagicsoftware/keenquotes/LexerTest.java
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.whitemagicsoftware.keenquotes;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.function.BiFunction;
+
+import static com.whitemagicsoftware.keenquotes.Lexeme.EOT;
+import static com.whitemagicsoftware.keenquotes.LexemeType.*;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+/**
+ * Tests lexing words, numbers, punctuation, spaces, newlines, etc.
+ *
+ * Every test feeds text to a {@link Lexer} and compares, in order, either
+ * the type or the text of each emitted lexeme against an expected sequence.
+ */
+class LexerTest {
+  @Test
+  void test_Lexing_Words_TokenValues() {
+    testText( "abc 123", "abc", " ", "123" );
+    testText( "-123 abc", "-123", " ", "abc" );
+  }
+
+  @Test
+  void test_Lexing_Numbers_EmitNumbers() {
+    testType( ".123", NUMBER );
+    testType( "-123.", NUMBER, PERIOD );
+    testType( " 123.123.123", SPACE, NUMBER );
+    testType( "123 123\"", NUMBER, SPACE, NUMBER, QUOTE_DOUBLE );
+    testType( "-123,123.123", NUMBER );
+    testType( "...1,023...", ELLIPSIS, NUMBER, ELLIPSIS );
+  }
+
+  @Test
+  void test_Lexing_Words_EmitWords() {
+    testType( "abc", WORD );
+    testType( "abc abc", WORD, SPACE, WORD );
+    testType( "abc...", WORD, ELLIPSIS );
+    testType( "abc123", WORD, NUMBER );
+    testType( "-123abc", NUMBER, WORD );
+    testType( "abc-o'-abc", WORD, HYPHEN, WORD, QUOTE_SINGLE, HYPHEN, WORD );
+  }
+
+  @Test
+  void test_Lexing_PunctuationMarks_EmitPunctuationMarks() {
+    testType( "!", PUNCT );
+    testType( ";", PUNCT );
+    testType( ".", PERIOD );
+    testType( "-", HYPHEN );
+    testType( "--", DASH );
+    testType( "---", DASH );
+    testType( "...", ELLIPSIS );
+  }
+
+  @Test
+  void test_Lexing_Quotes_EmitQuotes() {
+    testType( "'", QUOTE_SINGLE );
+    testType( "\"", QUOTE_DOUBLE );
+    testType( "3 o'clock", NUMBER, SPACE, WORD, QUOTE_SINGLE, WORD );
+  }
+
+  @Test
+  void test_Lexing_Escapes_EmitEscapedQuotes() {
+    testType( "123\\'456\\\"", NUMBER, ESC_SINGLE, NUMBER, ESC_DOUBLE );
+    testText( "123\\'456\\\"", "123", "\\'", "456", "\\\"" );
+  }
+
+  @Test
+  void test_Lexing_Newlines_EmitNewlines() {
+    testType( "\r", EOL );
+    testType( "\n", EOL );
+    testType( "\r\n", EOL );
+    testType( "\r\n\r\n", EOP );
+    testType( "\r\n\n\r", EOP );
+    testType( "abc \r\nabc\n", WORD, SPACE, EOL, WORD, EOL );
+  }
+
+  /**
+   * Asserts that lexing the given text emits lexemes having exactly the
+   * given types, in the given order.
+   *
+   * @param text     The text to lex.
+   * @param expected The lexeme types expected, in emission order.
+   */
+  private void testType( final String text, final LexemeType... expected ) {
+    final var list = Arrays.asList( expected );
+    testType( text, ( lexeme, ignored ) -> lexeme.getType(), list );
+  }
+
+  /**
+   * Asserts that lexing the given text emits lexemes whose string values,
+   * as computed by {@code Lexeme.toString( text )}, equal the given values
+   * in the given order.
+   *
+   * @param text     The text to lex.
+   * @param expected The lexeme strings expected, in emission order.
+   */
+  private void testText( final String text, final String... expected ) {
+    testType( text, Lexeme::toString, Arrays.asList( expected ) );
+  }
+
+  /**
+   * Lexes the given text until the lexer returns {@code EOT}, mapping each
+   * lexeme through the given function and comparing the result against the
+   * expected element at the same position. Fails when a value differs, when
+   * the lexer emits more lexemes than expected, or when it emits fewer.
+   *
+   * @param text     The text to lex.
+   * @param f        Extracts the value to compare from a lexeme; also
+   *                 receives the text that was lexed.
+   * @param elements The values expected, in emission order.
+   * @param <A>      The type of value extracted from each lexeme.
+   * @param <E>      The type of the expected values.
+   */
+  private <A, E> void testType(
+    final String text,
+    final BiFunction<Lexeme, String, A> f,
+    final List<E> elements ) {
+    final var lexer = new Lexer( text );
+    final var expectedCount = elements.size();
+    var counter = 0;
+
+    Lexeme lexeme;
+
+    while( (lexeme = lexer.next()) != EOT ) {
+      final var actual = f.apply( lexeme, text );
+
+      // Report surplus lexemes as a test failure, rather than letting the
+      // list lookup below throw an IndexOutOfBoundsException.
+      if( counter >= expectedCount ) {
+        throw new AssertionError(
+          "Expected " + expectedCount + " lexemes, but lexer also emitted <" +
+            actual + "> for: " + text
+        );
+      }
+
+      assertEquals(
+        elements.get( counter ), actual, "Lexeme " + counter + " of: " + text
+      );
+      counter++;
+    }
+
+    // Ensure all expected values are matched (verify end of text reached).
+    assertEquals( expectedCount, counter, "Lexeme count for: " + text );
+  }
+}
src/test/java/com/whitemagicsoftware/keenquotes/ParserTest.java
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.whitemagicsoftware.keenquotes;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import static com.whitemagicsoftware.keenquotes.TokenType.QUOTE_APOSTROPHE;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+/**
+ * Test that all unambiguous apostrophes are emitted once.
+ *
+ * Each test case pairs an input text with the number of tokens, per
+ * {@link TokenType}, that the {@link Parser} is expected to emit for it.
+ */
+class ParserTest {
+  /**
+   * Maps each input text to its expected tally of emitted tokens, keyed by
+   * token type.
+   */
+  private static final Map<String, Map<TokenType, Integer>> TEST_CASES =
+    Map.of(
+      "That's a 35'×10\" yacht!",
+      Map.of( QUOTE_APOSTROPHE, 1 ),
+      "It's the 80's...",
+      Map.of( QUOTE_APOSTROPHE, 2 ),
+      "Fish-'n'-chips!",
+      Map.of( QUOTE_APOSTROPHE, 2 ),
+      "She's a cat ya'll couldn't've known!",
+      Map.of( QUOTE_APOSTROPHE, 4 ),
+      "'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx.",
+      Map.of( QUOTE_APOSTROPHE, 5 ),
+      """
+        But I must leave the proofs to those who 've seen 'em;
+        But this I heard her say, and can't be wrong
+        And all may think which way their judgments lean 'em,
+        ''T is strange---the Hebrew noun which means "I am,"
+        The English always use to govern d--n.'
+        """,
+      Map.of( QUOTE_APOSTROPHE, 5 )
+    );
+
+  @Test
+  void test_Conversion_StraightQuotes_ExpectedConversionCount() {
+    for( final var entry : TEST_CASES.entrySet() ) {
+      parse( entry.getKey(), entry.getValue() );
+    }
+  }
+
+  /**
+   * Parses the given text, counts the emitted tokens by type, and asserts
+   * that the count for every type listed in the tally matches. Token types
+   * absent from the tally are not checked.
+   *
+   * @param text  The text to parse.
+   * @param tally The expected number of emitted tokens for each token type.
+   */
+  private void parse( final String text, final Map<TokenType, Integer> tally ) {
+    final var parser = new Parser( text );
+    final var actual = new HashMap<TokenType, Integer>();
+
+    // The first callback counts tokens by type; the second callback receives
+    // lexemes, which this test ignores.
+    parser.parse(
+      ( token ) -> actual.merge( token.getType(), 1, Integer::sum ),
+      ( lexeme ) -> {}
+    );
+
+    for( final var expectedEntry : tally.entrySet() ) {
+      final var expectedCount = expectedEntry.getValue();
+      // A token type that was never emitted has no map entry; count it as
+      // zero so that failures read "was: <0>" and a tally of 0 can pass.
+      final var actualCount = actual.getOrDefault( expectedEntry.getKey(), 0 );
+
+      assertEquals( expectedCount, actualCount, text );
+    }
+  }
+}
src/test/resources/com/whitemagicsoftware/keenquotes/smartypants.txt
+# ########################################################################
+# Escaped quotes
+# ########################################################################
+She stood 5\'7\".
+She stood 5'7".
+
+\"What?\"
+"What?"
+
+# ########################################################################
+# Primes (single, double)
+# ########################################################################
+Alice's friend is 6'3" tall.
+Alice&apos;s friend is 6&prime;3&Prime; tall.
+
+Bob's table is 5'' × 4''.
+Bob&apos;s table is 5&Prime; × 4&Prime;.
+
+Bob's table is 5'×4'.
+Bob&apos;s table is 5&prime;×4&prime;.
+
+What's this '-5.5'',' '-10.2'' cm,' and another '-7.25''' thing?
+What&apos;s this &lsquo;-5.5&Prime;,&rsquo; &lsquo;-10.2&Prime; cm,&rsquo; and another &lsquo;-7.25&Prime;&rsquo; thing?
+
+'What's this -5.5'' thing?'
+&lsquo;What&apos;s this -5.5&Prime; thing?&rsquo;
+
++7.9'' is weird.
++7.9&Prime; is weird.
+
+12" record, 5'10" height
+12&Prime; record, 5&prime;10&Prime; height
+
+Use 11.5"x14.25" virtual paper!
+Use 11.5&Prime;x14.25&Prime; virtual paper!
+
+Angular measure 3° 5' 30" means 3 degs, 5 arcmins, & 30 arcsecs.
+Angular measure 3° 5&prime; 30&Prime; means 3 degs, 5 arcmins, & 30 arcsecs.
+
+# ########################################################################
+# Double quotes
+# ########################################################################
+"I am Sam"
+&ldquo;I am Sam&rdquo;
+
+"...even better!"
+&ldquo;...even better!&rdquo;
+
+"It was so," said he.
+&ldquo;It was so,&rdquo; said he.
+
+"What cries in the streets"?
+&ldquo;What cries in the streets&rdquo;?
+
+With "air quotes" in the middle.
+With &ldquo;air quotes&rdquo; in the middle.
+
+With--"air quotes"--and dashes.
+With--&ldquo;air quotes&rdquo;--and dashes.
+
+"Not all open quotes are closed...
+&ldquo;Not all open quotes are closed...
+
+"And this" and "and this" and "and this" and "and another."
+&ldquo;And this&rdquo; and &ldquo;and this&rdquo; and &ldquo;and this&rdquo; and &ldquo;and another.&rdquo;
+
+"Will'll invite?" Mrs. Thorne asked;\n"because he's coming--he's at the gate."
+&ldquo;Will&apos;ll invite?&rdquo; Mrs. Thorne asked;\n&ldquo;because he&apos;s coming--he&apos;s at the gate.&rdquo;
+
+# ########################################################################
+# Single quotes
+# ########################################################################
+'I am Sam'
+&lsquo;I am Sam&rsquo;
+
+'It was so,' said he.
+&lsquo;It was so,&rsquo; said he.
+
+'...even better!'
+&lsquo;...even better!&rsquo;
+
+With 'quotes' in the middle.
+With &lsquo;quotes&rsquo; in the middle.
+
+With--'imaginary'--dashes.
+With--&lsquo;imaginary&rsquo;--dashes.
+
+''Cause I don't like it, 's why,' said Pat.
+&lsquo;&apos;Cause I don&apos;t like it, &apos;s why,&rsquo; said Pat.
+
+'It's a beautiful day!'
+&lsquo;It&apos;s a beautiful day!&rsquo;
+
+Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season.
+Book &apos;em, Danno. Rock &apos;n&apos; roll. 'Cause &apos;twas the season.
+
+# ########################################################################
+# Possessives
+# ########################################################################
+Sam's Sams' and the Ross's roses' thorns were prickly.
+Sam&apos;s Sams&apos; and the Ross&apos;s roses&apos; thorns were prickly.
+
+# ########################################################################
+# Inside contractions (no leading/trailing apostrophes)
+# ########################################################################
+I don't like it: I love's it!
+I don&apos;t like it: I love&apos;s it!
+
+We'd've thought that pancakes'll be sweeter there.
+We&apos;d&apos;ve thought that pancakes&apos;ll be sweeter there.
+
+She'd be coming o'er when the horse'd gone to pasture...
+She&apos;d be coming o&apos;er when the horse&apos;d gone to pasture...
+
+# ########################################################################
+# Beginning contractions (leading apostrophes)
+# ########################################################################
+'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx.
+&apos;Twas and &apos;tis whate&apos;er lay &apos;twixt dawn and dusk &apos;n River Styx.
+
+# ########################################################################
+# Outside contractions (leading and trailing, no middle)
+# ########################################################################
+Salt 'n' vinegar, fish-'n'-chips, sugar 'n spice!
+Salt &apos;n&apos; vinegar, fish-&apos;n&apos;-chips, sugar &apos;n spice!
+
+# ########################################################################
+# Ending contractions (trailing apostrophes)
+# ########################################################################
+Didn' get th' message.
+Didn&apos; get th&apos; message.
+
+Namsayin', y'know what I'ma sayin'?
+Namsayin&apos;, y&apos;know what I&apos;ma sayin&apos;?
+
+# ########################################################################
+# Decades
+# ########################################################################
+The Roaring '20s had the best music, no?
+The Roaring &apos;20s had the best music, no?
+
+Took place in '04, yes'm!
+Took place in &apos;04, yes&apos;m!
+
+# ########################################################################
+# Consecutive quotes
+# ########################################################################
+"'I'm trouble.'"
+&ldquo;&lsquo;I&apos;m trouble.&rsquo;&rdquo;
+
+'"Trouble's my name."'
+&lsquo;&ldquo;Trouble&apos;s my name.&rdquo;&rsquo;
+
+# ########################################################################
+# Nested quotations
+# ########################################################################
+"'Here I am,' said Sam"
+&ldquo;&lsquo;Here I am,&rsquo; said Sam&rdquo;
+
+'"Here I am," said Sam'
+&lsquo;&ldquo;Here I am,&rdquo; said Sam&rsquo;
+
+'Hello, "Dr. Brown," what's your real name?'
+&lsquo;Hello, &ldquo;Dr. Brown,&rdquo; what&apos;s your real name?&rsquo;
+
+"'Twas, t'wasn't thy name, 'twas it?" said Jim "the Barber" Brown.
+&ldquo;&apos;Twas, t&apos;wasn&apos;t thy name, &apos;twas it?&rdquo; said Jim &ldquo;the Barber&rdquo; Brown.
+
+"'I'm in danger. Help? Take me to--' an address on Van Ness Avenue.
+&ldquo;&lsquo;I&apos;m in danger. Help? Take me to--&rsquo; an address on Van Ness Avenue.
+
+"Ah," she said, "you knew! He told you--and you said 'my dear'! How could you?!"
+&ldquo;Ah,&rdquo; she said, &ldquo;you knew! He told you--and you said &lsquo;my dear&rsquo;! How could you?!&rdquo;
+
+shouldn't drop letters, nor say "ain't" or "hain't."
+shouldn&apos;t drop letters, nor say &ldquo;ain&apos;t&rdquo; or &ldquo;hain&apos;t.&rdquo;
+
+"’Kearney lives on the banks of Killarney—’
+&ldquo;’Kearney lives on the banks of Killarney—’
+
+# ########################################################################
+# Mixed
+# ########################################################################
+"She said, 'That's Sam's'," said the Sams' cat.
+&ldquo;She said, &lsquo;That&apos;s Sam&apos;s&rsquo;,&rdquo; said the Sams&apos; cat.
+
+"'Jane said, ''E'll be spooky, Sam's son with the jack-o'-lantern!'" said the O'Mally twins'---y'know---ghosts in unison.
+&ldquo;&lsquo;Jane said, &lsquo;&apos;E&apos;ll be spooky, Sam&apos;s son with the jack-o&apos;-lantern!&rsquo;&rdquo; said the O&apos;Mally twins&apos;---y&apos;know---ghosts in unison.
+
+'He's at Sam's'
+&lsquo;He&apos;s at Sam&apos;s&rsquo;
+
+ma'am
+ma&apos;am
+
+'Twas midnight
+&apos;Twas midnight
+
+"Hello," said the spider. "'Shelob' is my name."
+&ldquo;Hello,&rdquo; said the spider. &ldquo;&lsquo;Shelob&rsquo; is my name.&rdquo;
+
+'He said, \"I want to go.\"' Were you alive in the 70's?
+&lsquo;He said, "I want to go."&rsquo; Were you alive in the 70&apos;s?
+
+"That's a 'magic' sock."
+&ldquo;That&apos;s a &lsquo;magic&rsquo; sock.&rdquo;
+
+'That's a "magic" sock.'
+&lsquo;That&apos;s a &ldquo;magic&rdquo; sock.&rsquo;
+
+'That's a "very 'magic'" sock.'
+&lsquo;That&apos;s a &ldquo;very &lsquo;magic&rsquo;&rdquo; sock.&rsquo;
+
+Website! Company Name, Inc. ("Company Name" or "Company") recommends reading carefully:
+Website! Company Name, Inc. (&ldquo;Company Name&rdquo; or &ldquo;Company&rdquo;) recommends reading carefully:
+
+Website! Company Name, Inc. ('Company Name' or 'Company') recommends reading carefully:
+Website! Company Name, Inc. (&lsquo;Company Name&rsquo; or &lsquo;Company&rsquo;) recommends reading carefully:
+
+Workin' hard
+Workin&apos; hard
+
+"She said, 'Llamas'll languish, they'll--
+&ldquo;She said, &lsquo;Llamas&apos;ll languish, they&apos;ll--
+
+'The 70s are my favorite numbers,' she said.
+&lsquo;The 70s are my favorite numbers,&rsquo; she said.
+
+'70s fashion was weird.
+&apos;70s fashion was weird.
+
+Model \"T2000\"
+Model "T2000"
+
+The 2's complement.\n\n'Yar, binary!'
+The 2&apos;s complement.\n\n&lsquo;Yar, binary!&rsquo;
+
+The Who's complement.\n\n\n"Paragraph boundary!"
+The Who&apos;s complement.\n\n\n&ldquo;Paragraph boundary!&rdquo;
+
+'Oak,' 'elm,' and 'beech' are names of trees. So is 'pine.'
+&lsquo;Oak,&rsquo; &lsquo;elm,&rsquo; and &lsquo;beech&rsquo; are names of trees. So is &lsquo;pine.&rsquo;
+
+'A', 'B', and 'C' are letters.
+&lsquo;A&rsquo;, &lsquo;B&rsquo;, and &lsquo;C&rsquo; are letters.
+
+'He said, "Thinkin'."'
+&lsquo;He said, &ldquo;Thinkin&apos;.&rdquo;&rsquo;
+
+''Sup, Doc?'
+&lsquo;&apos;Sup, Doc?&rsquo;
+
+# ########################################################################
+# Ambiguous (no solution)
+# ########################################################################
+She said, 'Cause of death?\n\n'Old age, there's no doubt.'
+She said, 'Cause of death?\n\n&lsquo;Old age, there&apos;s no doubt.&rsquo;
+
+She said, 'Cause I said so!\n\n'If we're done, I've a secret.'
+She said, 'Cause I said so!\n\n&lsquo;If we&apos;re done, I&apos;ve a secret.&rsquo;
+
+'Bout that time I says, 'Boys! I been thinkin' 'bout th' Universe.'
+'Bout that time I says, &lsquo;Boys! I been thinkin&apos; 'bout th&apos; Universe.&rsquo;
Delta2659 lines added, 2621 lines removed, 38-line increase