| Author | Dave Jarvis <email> |
|---|---|
| Date | 2021-06-17 19:54:10 GMT-0700 |
| Commit | 4dda57c31a9bf096e6342b2b84ff8d036ea8eef0 |
| Parent | 6b057a0 |
| +// Build script for the keenquotes command-line application: declares | ||
| +// dependencies, compiler flags, the entry point, and a self-contained jar. | ||
| +plugins { | ||
| + id 'application' | ||
| +} | ||
| + | ||
| +group 'com.whitemagicsoftware' | ||
| +version '1.0' | ||
| + | ||
| +repositories { | ||
| + mavenCentral() | ||
| +} | ||
| + | ||
| +dependencies { | ||
| + // Command-line parsing | ||
| + implementation 'info.picocli:picocli:4.4.0' | ||
| + | ||
| + // Unit testing. The engine coordinate below carries no explicit version. | ||
| + // NOTE(review): presumably aligned with 5.7.1 through the JUnit BOM | ||
| + // published with the API artifact -- confirm with a dependency report. | ||
| + testImplementation 'org.junit.jupiter:junit-jupiter-api:5.7.1' | ||
| + testImplementation 'org.junit.jupiter:junit-jupiter-params:5.7.1' | ||
| + testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine' | ||
| +} | ||
| + | ||
| +// Java sources are read from src/main instead of the default source root. | ||
| +sourceSets { | ||
| + main { | ||
| + java { | ||
| + srcDirs = ["src/main"] | ||
| + } | ||
| + } | ||
| +} | ||
| + | ||
| +// Report unchecked operations and deprecated API usage while compiling. | ||
| +compileJava { | ||
| + options.compilerArgs << "-Xlint:unchecked" << "-Xlint:deprecation" | ||
| +} | ||
| + | ||
| +// The application name is reused for the main class package (here) and for | ||
| +// the jar file name (in the jar task). | ||
| +// NOTE(review): mainClassName is deprecated in newer Gradle releases in | ||
| +// favour of mainClass -- confirm against the Gradle version in use. | ||
| +application { | ||
| + applicationName = 'keenquotes' | ||
| + mainClassName = "com.whitemagicsoftware.${applicationName}.KeenQuotes" | ||
| +} | ||
| + | ||
| +// Packages the application classes together with the unpacked contents of | ||
| +// every runtime classpath entry into a single jar. | ||
| +jar { | ||
| + // Keep only the first copy when several inputs provide the same path. | ||
| + duplicatesStrategy = DuplicatesStrategy.EXCLUDE | ||
| + | ||
| + manifest { | ||
| + attributes 'Main-Class': mainClassName | ||
| + } | ||
| + | ||
| + // Skip .pom entries; copy directories as-is and unpack archives. | ||
| + from { | ||
| + (configurations.runtimeClasspath.findAll { !it.path.endsWith(".pom") }).collect { | ||
| + it.isDirectory() ? it : zipTree(it) | ||
| + } | ||
| + } | ||
| + | ||
| + archiveFileName = "${applicationName}.jar" | ||
| + | ||
| + // Drop signature files from META-INF. | ||
| + // NOTE(review): presumably because signatures copied from dependencies | ||
| + // would not match the merged archive -- confirm. | ||
| + exclude 'META-INF/*.RSA', 'META-INF/*.SF', 'META-INF/*.DSA' | ||
| +} | ||
| + | ||
| +// Run the tests on the JUnit Platform. | ||
| +tasks.named('test') { | ||
| + useJUnitPlatform() | ||
| +} | ||
| + | ||
| -// Minimal library build: applies the java-library plugin and configures | ||
| -// JUnit 5 for the test task. | ||
| -plugins { | ||
| - id 'java-library' | ||
| -} | ||
| - | ||
| -repositories { | ||
| - // Use Maven Central for resolving dependencies. | ||
| - mavenCentral() | ||
| -} | ||
| - | ||
| -dependencies { | ||
| - // Unit testing. The engine coordinate below carries no explicit version. | ||
| - testImplementation 'org.junit.jupiter:junit-jupiter-api:5.7.1' | ||
| - testImplementation 'org.junit.jupiter:junit-jupiter-params:5.7.1' | ||
| - testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine' | ||
| -} | ||
| - | ||
| -// Run the tests on the JUnit Platform. | ||
| -tasks.named('test') { | ||
| - useJUnitPlatform() | ||
| -} | ||
| -/* | ||
| - * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| - * contributor license agreements. See the NOTICE file distributed with | ||
| - * this work for additional information regarding copyright ownership. | ||
| - * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| - * (the "License"); you may not use this file except in compliance with | ||
| - * the License. You may obtain a copy of the License at | ||
| - * | ||
| - * http://www.apache.org/licenses/LICENSE-2.0 | ||
| - * | ||
| - * Unless required by applicable law or agreed to in writing, software | ||
| - * distributed under the License is distributed on an "AS IS" BASIS, | ||
| - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| - * See the License for the specific language governing permissions and | ||
| - * limitations under the License. | ||
| - */ | ||
| -package com.keenwrite.quotes; | ||
| - | ||
| -import java.util.*; | ||
| - | ||
| -/** | ||
| - * CircularFifoQueue is a first-in first-out queue with a fixed size that | ||
| - * replaces its oldest element if full. | ||
| - * <p> | ||
| - * The removal order of a {@link CircularFifoQueue} is based on the | ||
| - * insertion order; elements are removed in the same order in which they | ||
| - * were added. The iteration order is the same as the removal order. | ||
| - * </p> | ||
| - * <p> | ||
| - * The {@link #add(Object)}, {@link #remove()}, {@link #peek()}, | ||
| - * {@link #poll()}, | ||
| - * {@link #offer(Object)} operations all perform in constant time. | ||
| - * All other operations perform in linear time or worse. | ||
| - * </p> | ||
| - * <p> | ||
| - * This queue prevents null objects from being added. | ||
| - * </p> | ||
| - * | ||
| - * @param <E> the type of elements in this collection | ||
| - * @since 4.0 | ||
| - */ | ||
| -public final class CircularFifoQueue<E> extends AbstractCollection<E> | ||
| - implements Queue<E> { | ||
| - /** | ||
| - * Underlying storage array. | ||
| - */ | ||
| - private final transient E[] elements; | ||
| - | ||
| - /** | ||
| - * Array index of first (oldest) queue element. | ||
| - */ | ||
| - private transient int start; | ||
| - | ||
| - /** | ||
| - * Index mod maxElements of the array position following the last queue | ||
| - * element. Queue elements start at elements[start] and "wrap around" | ||
| - * elements[maxElements-1], ending at elements[decrement(end)]. | ||
| - * For example, elements = {c,a,b}, start=1, end=1 corresponds to | ||
| - * the queue [a,b,c]. | ||
| - */ | ||
| - private transient int end; | ||
| - | ||
| - /** | ||
| - * Flag to indicate if the queue is currently full. | ||
| - */ | ||
| - private transient boolean full; | ||
| - | ||
| - /** | ||
| - * Capacity of the queue. | ||
| - */ | ||
| - private final int maxElements; | ||
| - | ||
| - /** | ||
| - * Constructor that creates a queue with the specified size. | ||
| - * | ||
| - * @param size Immutable queue size, must be greater than zero. | ||
| - * @throws IllegalArgumentException if {@code size} is zero or negative | ||
| - */ | ||
| - @SuppressWarnings( "unchecked" ) | ||
| - public CircularFifoQueue( final int size ) { | ||
| - // Validate explicitly: assertions are disabled by default, and a zero | ||
| - // capacity would otherwise surface later as a failure inside add(). | ||
| - if( size <= 0 ) { | ||
| - throw new IllegalArgumentException( | ||
| - "The size must be greater than 0: " + size ); | ||
| - } | ||
| - | ||
| - elements = (E[]) new Object[ size ]; | ||
| - maxElements = elements.length; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Returns the number of elements stored in the queue. | ||
| - * | ||
| - * @return this queue's size | ||
| - */ | ||
| - @Override | ||
| - public int size() { | ||
| - int size; | ||
| - | ||
| - if( end < start ) { | ||
| - // The stored elements wrap around the end of the array. | ||
| - size = maxElements - start + end; | ||
| - } | ||
| - else if( end == start ) { | ||
| - // Equal indexes mean either completely full or completely empty. | ||
| - size = full ? maxElements : 0; | ||
| - } | ||
| - else { | ||
| - size = end - start; | ||
| - } | ||
| - | ||
| - return size; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Returns true if this queue is empty; false otherwise. | ||
| - * | ||
| - * @return true if this queue is empty | ||
| - */ | ||
| - @Override | ||
| - public boolean isEmpty() { | ||
| - return size() == 0; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Returns {@code true} if the capacity limit of this queue has been reached, | ||
| - * i.e. the number of elements stored in the queue equals its maximum size. | ||
| - * | ||
| - * @return {@code true} if the capacity limit has been reached, {@code | ||
| - * false} otherwise | ||
| - * @since 4.1 | ||
| - */ | ||
| - public boolean isAtFullCapacity() { | ||
| - return size() == maxElements; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Clears this queue. | ||
| - */ | ||
| - @Override | ||
| - public void clear() { | ||
| - full = false; | ||
| - start = 0; | ||
| - end = 0; | ||
| - Arrays.fill( elements, null ); | ||
| - } | ||
| - | ||
| - /** | ||
| - * Adds the given element to this queue. If the queue is full, the least | ||
| - * recently added | ||
| - * element is discarded so that a new element can be inserted. | ||
| - * | ||
| - * @param element the element to add | ||
| - * @return true, always | ||
| - * @throws NullPointerException if the given element is null | ||
| - */ | ||
| - @Override | ||
| - public boolean add( final E element ) { | ||
| - Objects.requireNonNull( element, "element" ); | ||
| - | ||
| - // Make room by discarding the oldest element. | ||
| - if( isAtFullCapacity() ) { | ||
| - remove(); | ||
| - } | ||
| - | ||
| - elements[ end++ ] = element; | ||
| - | ||
| - if( end >= maxElements ) { | ||
| - end = 0; | ||
| - } | ||
| - | ||
| - if( end == start ) { | ||
| - full = true; | ||
| - } | ||
| - | ||
| - return true; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Returns the element at the specified position in this queue. | ||
| - * | ||
| - * @param index the position of the element in the queue | ||
| - * @return the element at position {@code index} | ||
| - * @throws NoSuchElementException if the requested position is outside the | ||
| - * range [0, size) | ||
| - */ | ||
| - public E get( final int index ) { | ||
| - final int sz = size(); | ||
| - | ||
| - if( index < 0 || index >= sz ) { | ||
| - throw new NoSuchElementException( | ||
| - String.format( | ||
| - "Index %1$d is outside the available range [0, %2$d)", | ||
| - index, sz ) ); | ||
| - } | ||
| - | ||
| - final int idx = (start + index) % maxElements; | ||
| - return elements[ idx ]; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Adds the given element to this queue. If the queue is full, the least | ||
| - * recently added | ||
| - * element is discarded so that a new element can be inserted. | ||
| - * | ||
| - * @param element the element to add | ||
| - * @return true, always | ||
| - * @throws NullPointerException if the given element is null | ||
| - */ | ||
| - @Override | ||
| - public boolean offer( final E element ) { | ||
| - return add( element ); | ||
| - } | ||
| - | ||
| - /** | ||
| - * Removes and returns the oldest element of this queue. | ||
| - * | ||
| - * @return the oldest element, or {@code null} if this queue is empty | ||
| - */ | ||
| - @Override | ||
| - public E poll() { | ||
| - return isEmpty() ? null : remove(); | ||
| - } | ||
| - | ||
| - /** | ||
| - * Returns the oldest element of this queue without removing it. | ||
| - * | ||
| - * @return the oldest element | ||
| - * @throws NoSuchElementException if this queue is empty | ||
| - */ | ||
| - @Override | ||
| - public E element() { | ||
| - if( isEmpty() ) { | ||
| - throw new NoSuchElementException( "empty queue" ); | ||
| - } | ||
| - | ||
| - return peek(); | ||
| - } | ||
| - | ||
| - /** | ||
| - * Returns the oldest element of this queue without removing it. | ||
| - * | ||
| - * @return the oldest element, or {@code null} if this queue is empty | ||
| - */ | ||
| - @Override | ||
| - public E peek() { | ||
| - return isEmpty() ? null : elements[ start ]; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Removes and returns the oldest element of this queue. | ||
| - * | ||
| - * @return the oldest element | ||
| - * @throws NoSuchElementException if this queue is empty | ||
| - */ | ||
| - @Override | ||
| - public E remove() { | ||
| - if( isEmpty() ) { | ||
| - throw new NoSuchElementException( "empty queue" ); | ||
| - } | ||
| - | ||
| - final E element = elements[ start ]; | ||
| - if( null != element ) { | ||
| - // Release the slot so the removed element can be garbage collected. | ||
| - elements[ start++ ] = null; | ||
| - | ||
| - if( start >= maxElements ) { | ||
| - start = 0; | ||
| - } | ||
| - full = false; | ||
| - } | ||
| - return element; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Increments the internal index. | ||
| - * | ||
| - * @param index the index to increment | ||
| - * @return the updated index | ||
| - */ | ||
| - private int increment( int index ) { | ||
| - if( ++index >= maxElements ) { | ||
| - index = 0; | ||
| - } | ||
| - return index; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Decrements the internal index. | ||
| - * | ||
| - * @param index the index to decrement | ||
| - * @return the updated index | ||
| - */ | ||
| - private int decrement( int index ) { | ||
| - if( --index < 0 ) { | ||
| - index = maxElements - 1; | ||
| - } | ||
| - return index; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Returns an iterator over this queue's elements. | ||
| - * | ||
| - * @return an iterator over this queue's elements | ||
| - */ | ||
| - @Override | ||
| - public Iterator<E> iterator() { | ||
| - return new Iterator<>() { | ||
| - | ||
| - private int index = start; | ||
| - private int lastReturnedIndex = -1; | ||
| - // When full, start equals end, so the first step must be forced. | ||
| - private boolean isFirst = full; | ||
| - | ||
| - @Override | ||
| - public boolean hasNext() { | ||
| - return isFirst || index != end; | ||
| - } | ||
| - | ||
| - @Override | ||
| - public E next() { | ||
| - if( !hasNext() ) { | ||
| - throw new NoSuchElementException(); | ||
| - } | ||
| - | ||
| - isFirst = false; | ||
| - lastReturnedIndex = index; | ||
| - index = increment( index ); | ||
| - return elements[ lastReturnedIndex ]; | ||
| - } | ||
| - | ||
| - @Override | ||
| - public void remove() { | ||
| - if( lastReturnedIndex == -1 ) { | ||
| - throw new IllegalStateException(); | ||
| - } | ||
| - | ||
| - // First element can be removed quickly | ||
| - if( lastReturnedIndex == start ) { | ||
| - CircularFifoQueue.this.remove(); | ||
| - lastReturnedIndex = -1; | ||
| - return; | ||
| - } | ||
| - | ||
| - int pos = lastReturnedIndex + 1; | ||
| - if( start < lastReturnedIndex && pos < end ) { | ||
| - // shift in one part | ||
| - System.arraycopy( | ||
| - elements, pos, elements, lastReturnedIndex, end - pos ); | ||
| - } | ||
| - else { | ||
| - // Other elements require us to shift the subsequent elements | ||
| - while( pos != end ) { | ||
| - if( pos >= maxElements ) { | ||
| - elements[ pos - 1 ] = elements[ 0 ]; | ||
| - pos = 0; | ||
| - } | ||
| - else { | ||
| - elements[ decrement( pos ) ] = elements[ pos ]; | ||
| - pos = increment( pos ); | ||
| - } | ||
| - } | ||
| - } | ||
| - | ||
| - lastReturnedIndex = -1; | ||
| - end = decrement( end ); | ||
| - elements[ end ] = null; | ||
| - full = false; | ||
| - index = decrement( index ); | ||
| - } | ||
| - }; | ||
| - } | ||
| -} | ||
| -/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| -package com.keenwrite.quotes; | ||
| - | ||
| -import java.util.HashSet; | ||
| -import java.util.Set; | ||
| - | ||
| -import static java.util.Collections.emptySet; | ||
| - | ||
| -/** | ||
| - * Placeholder for various types of contractions. | ||
| - */ | ||
| -public class Contractions { | ||
| - | ||
| - /** | ||
| - * Holds the word sets consulted by every lookup in this class. | ||
| - */ | ||
| - private final Builder mBuilder; | ||
| - | ||
| - private Contractions( final Builder builder ) { | ||
| - assert builder != null; | ||
| - mBuilder = builder; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Allows constructing a list of custom contractions. Any category left | ||
| - * empty is filled with the built-in defaults when {@link #build()} is | ||
| - * called. Lookups lowercase the word before comparing, so custom words | ||
| - * must be supplied in lowercase to be matched. | ||
| - */ | ||
| - @SuppressWarnings( "unused" ) | ||
| - public static class Builder { | ||
| - private final Set<String> mBeganUnambiguous = new HashSet<>(); | ||
| - private final Set<String> mEndedUnambiguous = new HashSet<>(); | ||
| - private final Set<String> mBeganAmbiguous = new HashSet<>(); | ||
| - private final Set<String> mEndedAmbiguous = new HashSet<>(); | ||
| - | ||
| - /** | ||
| - * Adds words that always begin with an apostrophe. | ||
| - * | ||
| - * @param words The words to add, without the apostrophe. | ||
| - */ | ||
| - public void withBeganUnambiguous( final Set<String> words ) { | ||
| - mBeganUnambiguous.addAll( words ); | ||
| - } | ||
| - | ||
| - /** | ||
| - * Adds words that always end with an apostrophe. | ||
| - * | ||
| - * @param words The words to add, without the apostrophe. | ||
| - */ | ||
| - public void withEndedUnambiguous( final Set<String> words ) { | ||
| - mEndedUnambiguous.addAll( words ); | ||
| - } | ||
| - | ||
| - /** | ||
| - * Adds words that may begin with an apostrophe but are also valid | ||
| - * without one. | ||
| - * | ||
| - * @param words The words to add, without the apostrophe. | ||
| - */ | ||
| - public void withBeganAmbiguous( final Set<String> words ) { | ||
| - mBeganAmbiguous.addAll( words ); | ||
| - } | ||
| - | ||
| - /** | ||
| - * Adds words that may end with an apostrophe but are also valid | ||
| - * without one. | ||
| - * | ||
| - * @param words The words to add, without the apostrophe. | ||
| - */ | ||
| - public void withEndedAmbiguous( final Set<String> words ) { | ||
| - mEndedAmbiguous.addAll( words ); | ||
| - } | ||
| - | ||
| - /** | ||
| - * Constructs a new set of {@link Contractions} that can be configured | ||
| - * using this {@link Builder} instance. | ||
| - * | ||
| - * @return {@link Contractions} suitable for use with parsing text. | ||
| - */ | ||
| - public Contractions build() { | ||
| - // Each empty category receives its defaults; populated ones are kept. | ||
| - mBeganUnambiguous.addAll( from( mBeganUnambiguous, BEGAN_UNAMBIGUOUS ) ); | ||
| - mEndedUnambiguous.addAll( from( mEndedUnambiguous, ENDED_UNAMBIGUOUS ) ); | ||
| - mBeganAmbiguous.addAll( from( mBeganAmbiguous, BEGAN_AMBIGUOUS ) ); | ||
| - mEndedAmbiguous.addAll( from( mEndedAmbiguous, ENDED_AMBIGUOUS ) ); | ||
| - | ||
| - return new Contractions( this ); | ||
| - } | ||
| - | ||
| - /** | ||
| - * This returns the {@code fallback} {@link Set} if {@code src} is empty; | ||
| - * otherwise, this returns the empty {@link Set}. | ||
| - * | ||
| - * @param src A set of contractions, possibly empty. | ||
| - * @param fallback The default values to use if {@code src} is empty. | ||
| - * @param <T> The type of data used by both {@link Set}s. | ||
| - * @return An empty {@link Set} if the {@code src} contains at least one | ||
| - * element; otherwise, this will return {@code fallback}. | ||
| - */ | ||
| - private static <T> Set<T> from( final Set<T> src, final Set<T> fallback ) { | ||
| - assert src != null; | ||
| - assert fallback != null; | ||
| - return src.isEmpty() ? fallback : emptySet(); | ||
| - } | ||
| - } | ||
| - | ||
| - /** | ||
| - * Answers whether the given word is a contraction that always starts | ||
| - * with an apostrophe. The comparison is case insensitive. This must | ||
| - * only be called when a straight quote is followed by a word. | ||
| - * | ||
| - * @param word The word to compare against the list of known unambiguous | ||
| - * contractions. | ||
| - * @return {@code true} when the given word is in the set of unambiguous | ||
| - * contractions. | ||
| - */ | ||
| - public boolean beganUnambiguously( final String word ) { | ||
| - assert word != null; | ||
| - return getBeganUnambiguous().contains( normalize( word ) ); | ||
| - } | ||
| - | ||
| - /** | ||
| - * Answers whether the given word could be a contraction but is also a | ||
| - * valid word in non-contracted form. | ||
| - * | ||
| - * @param word The word to compare against the list of known ambiguous | ||
| - * contractions. | ||
| - * @return {@code true} when the given word is in the set of ambiguous | ||
| - * contractions. | ||
| - */ | ||
| - public boolean beganAmbiguously( final String word ) { | ||
| - assert word != null; | ||
| - return getBeganAmbiguous().contains( normalize( word ) ); | ||
| - } | ||
| - | ||
| - /** | ||
| - * Answers whether the given word is a contraction that always ends with | ||
| - * an apostrophe. The comparison is case insensitive. | ||
| - * | ||
| - * @param word The word to compare against the list of known unambiguous | ||
| - * contractions. | ||
| - * @return {@code true} when the given word is in the set of unambiguous | ||
| - * contractions. | ||
| - */ | ||
| - public boolean endedUnambiguously( final String word ) { | ||
| - assert word != null; | ||
| - return getEndedUnambiguous().contains( normalize( word ) ); | ||
| - } | ||
| - | ||
| - /** | ||
| - * Answers whether the given word could end with an apostrophe but is also | ||
| - * valid without one. Besides the known words, any word ending in s, z, | ||
| - * or x matches, as does any word longer than one letter that ends in n. | ||
| - * The comparison is case insensitive. | ||
| - * | ||
| - * @param word The word to compare against the list of known ambiguous | ||
| - * contractions. | ||
| - * @return {@code true} when the given word may be an ambiguous | ||
| - * contraction. | ||
| - */ | ||
| - public boolean endedAmbiguously( final String word ) { | ||
| - assert word != null; | ||
| - final var check = normalize( word ); | ||
| - | ||
| - // Ensure that 'n' isn't matched for ambiguity by enforcing length, yet | ||
| - // allow o' to match because 'a sentence can end with the letter o'. | ||
| - return getEndedAmbiguous().contains( check ) || | ||
| - check.endsWith( "s" ) || check.endsWith( "z" ) || | ||
| - check.endsWith( "x" ) || (check.length() > 1 && check.endsWith( "n" )); | ||
| - } | ||
| - | ||
| - private Set<String> getBeganUnambiguous() { | ||
| - return mBuilder.mBeganUnambiguous; | ||
| - } | ||
| - | ||
| - private Set<String> getEndedUnambiguous() { | ||
| - return mBuilder.mEndedUnambiguous; | ||
| - } | ||
| - | ||
| - private Set<String> getBeganAmbiguous() { | ||
| - return mBuilder.mBeganAmbiguous; | ||
| - } | ||
| - | ||
| - private Set<String> getEndedAmbiguous() { | ||
| - return mBuilder.mEndedAmbiguous; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Lowercases a word for comparison against the word sets. The root locale | ||
| - * keeps the result independent of the platform's default locale (e.g., | ||
| - * a Turkish locale would otherwise map "I" to a dotless "i"). | ||
| - * | ||
| - * @param word The word to convert, never {@code null}. | ||
| - * @return The lowercase form of the given word. | ||
| - */ | ||
| - private static String normalize( final String word ) { | ||
| - return word.toLowerCase( java.util.Locale.ROOT ); | ||
| - } | ||
| - | ||
| - /** | ||
| - * Words having a straight apostrophe that cannot be mistaken for an | ||
| - * opening single quote. | ||
| - */ | ||
| - private static final Set<String> BEGAN_UNAMBIGUOUS = Set.of( | ||
| - "aporth", | ||
| - "boutcha", | ||
| - "boutchu", | ||
| - "cept", | ||
| - "dillo", | ||
| - "em", | ||
| - "fraid", | ||
| - "gainst", | ||
| - "n", | ||
| - "neath", | ||
| - "nother", | ||
| - "onna", | ||
| - "onna'", | ||
| - "pon", | ||
| - "s", | ||
| - "sblood", | ||
| - "scuse", | ||
| - "sfar", | ||
| - "sfoot", | ||
| - "t", | ||
| - "taint", | ||
| - "tain", | ||
| - "til", | ||
| - "tis", | ||
| - "tisn", | ||
| - "tshall", | ||
| - "twas", | ||
| - "twasn", | ||
| - "tween", | ||
| - "twere", | ||
| - "tweren", | ||
| - "twixt", | ||
| - "twon", | ||
| - "twou", | ||
| - "twould", | ||
| - "twouldn", | ||
| - "ve" | ||
| - ); | ||
| - | ||
| - /** | ||
| - * Words having a straight apostrophe that may be either part of a | ||
| - * contraction or a word that stands alone beside an opening single quote. | ||
| - */ | ||
| - private static final Set<String> BEGAN_AMBIGUOUS = Set.of( | ||
| - // about|boxing match | ||
| - "bout", | ||
| - // because|causal | ||
| - "cause", | ||
| - // what you|choo choo train | ||
| - "choo", | ||
| - // he|e pluribus unum | ||
| - "e", | ||
| - // here|earlier | ||
| - "ere", | ||
| - // afro|to and fro | ||
| - "fro", | ||
| - // whore|ho ho! | ||
| - "ho", | ||
| - // okay|letter K | ||
| - "kay", | ||
| - // lo|lo and behold | ||
| - "lo", | ||
| - // are|regarding | ||
| - "re", | ||
| - // what's up|to sup | ||
| - "sup", | ||
| - // it will|twill fabric | ||
| - "twill", | ||
| - // them|utterance | ||
| - "um", | ||
| - // is that|Iranian village | ||
| - "zat" | ||
| - ); | ||
| - | ||
| - /** | ||
| - * Words that may end with a straight apostrophe as a contraction, yet are | ||
| - * also valid without one. | ||
| - */ | ||
| - private static final Set<String> ENDED_AMBIGUOUS = Set.of( | ||
| - // give|martial arts garment | ||
| - "gi", | ||
| - // in|I | ||
| - "i", | ||
| - // of|letter o | ||
| - "o" | ||
| - ); | ||
| - | ||
| - /** | ||
| - * Words that end with a straight apostrophe as a contraction. Entries | ||
| - * must be lowercase because lookups lowercase the word being checked. | ||
| - */ | ||
| - private static final Set<String> ENDED_UNAMBIGUOUS = Set.of( | ||
| - // and | ||
| - "an", | ||
| - // for/before | ||
| - "fo", | ||
| - // friend | ||
| - "frien", | ||
| - // just | ||
| - "jus", | ||
| - // lord | ||
| - "lor", | ||
| - // myself | ||
| - "masel", | ||
| - // old | ||
| - "ol", | ||
| - // San (Francisco) | ||
| - "sa", | ||
| - // shift | ||
| - "shif", | ||
| - // the | ||
| - "th", | ||
| - // what | ||
| - "wha", | ||
| - // world | ||
| - "worl", | ||
| - // Top ~500 common -ing words as English contractions. | ||
| - "acceptin", | ||
| - "accompanyin", | ||
| - "accordin", | ||
| - "accountin", | ||
| - "achievin", | ||
| - "acquirin", | ||
| - "actin", | ||
| - "addin", | ||
| - "addressin", | ||
| - "adjoinin", | ||
| - "adoptin", | ||
| - "advancin", | ||
| - "advertisin", | ||
| - "affectin", | ||
| - "agin", | ||
| - "allowin", | ||
| - "amazin", | ||
| - "analyzin", | ||
| - "answerin", | ||
| - "anythin", | ||
| - "appearin", | ||
| - "applyin", | ||
| - "approachin", | ||
| - "arguin", | ||
| - "arisin", | ||
| - "arrivin", | ||
| - "askin", | ||
| - "assessin", | ||
| - "assumin", | ||
| - "attackin", | ||
| - "attemptin", | ||
| - "attendin", | ||
| - "avoidin", | ||
| - "bankin", | ||
| - "bargainin", | ||
| - "bearin", | ||
| - "beatin", | ||
| - "becomin", | ||
| - "beginnin", | ||
| - "bein", | ||
| - "believin", | ||
| - "belongin", | ||
| - "bendin", | ||
| - "bindin", | ||
| - "bleedin", | ||
| - "blessin", | ||
| - "blowin", | ||
| - "boilin", | ||
| - "borrowin", | ||
| - "breakin", | ||
| - "breathin", | ||
| - "breedin", | ||
| - "bringin", | ||
| - "broadcastin", | ||
| - "buildin", | ||
| - "burnin", | ||
| - "buyin", | ||
| - "calculatin", | ||
| - "callin", | ||
| - "carryin", | ||
| - "castin", | ||
| - "causin", | ||
| - "ceilin", | ||
| - "challengin", | ||
| - "changin", | ||
| - "checkin", | ||
| - "choosin", | ||
| - "claimin", | ||
| - "cleanin", | ||
| - "clearin", | ||
| - "climbin", | ||
| - "closin", | ||
| - "clothin", | ||
| - "collectin", | ||
| - "combinin", | ||
| - "comin", | ||
| - "commandin", | ||
| - "comparin", | ||
| - "compellin", | ||
| - "competin", | ||
| - "computin", | ||
| - "concernin", | ||
| - "concludin", | ||
| - "conditionin", | ||
| - "conductin", | ||
| - "conflictin", | ||
| - "connectin", | ||
| - "considerin", | ||
| - "consistin", | ||
| - "constructin", | ||
| - "consultin", | ||
| - "consumin", | ||
| - "containin", | ||
| - "continuin", | ||
| - "contractin", | ||
| - "contributin", | ||
| - "controllin", | ||
| - "convincin", | ||
| - "cookin", | ||
| - "coolin", | ||
| - "copin", | ||
| - "correspondin", | ||
| - "counselin", | ||
| - "countin", | ||
| - "couplin", | ||
| - "coverin", | ||
| - "creatin", | ||
| - "crossin", | ||
| - "cryin", | ||
| - "cuttin", | ||
| - "dancin", | ||
| - "darlin", | ||
| - "datin", | ||
| - "dealin", | ||
| - "decidin", | ||
| - "declarin", | ||
| - "declinin", | ||
| - "decreasin", | ||
| - "definin", | ||
| - "demandin", | ||
| - "denyin", | ||
| - "dependin", | ||
| - "descendin", | ||
| - "describin", | ||
| - "designin", | ||
| - "destroyin", | ||
| - "determinin", | ||
| - "developin", | ||
| - "differin", | ||
| - "dinin", | ||
| - "directin", | ||
| - "discussin", | ||
| - "distinguishin", | ||
| - "disturbin", | ||
| - "dividin", | ||
| - "doin", | ||
| - "drawin", | ||
| - "dressin", | ||
| - "drinkin", | ||
| - "drivin", | ||
| - "droppin", | ||
| - "dryin", | ||
| - "durin", | ||
| - "dwellin", | ||
| - "dyin", | ||
| - "eatin", | ||
| - "editin", | ||
| - "emergin", | ||
| - "employin", | ||
| - "enablin", | ||
| - "encouragin", | ||
| - "endin", | ||
| - "engagin", | ||
| - "engineerin", | ||
| - "enjoyin", | ||
| - "enterin", | ||
| - "establishin", | ||
| - "evaluatin", | ||
| - "evenin", | ||
| - "everythin", | ||
| - "examinin", | ||
| - "exceedin", | ||
| - "excitin", | ||
| - "excludin", | ||
| - "existin", | ||
| - "expandin", | ||
| - "expectin", | ||
| - "experiencin", | ||
| - "explainin", | ||
| - "explorin", | ||
| - "expressin", | ||
| - "extendin", | ||
| - "facin", | ||
| - "failin", | ||
| - "fallin", | ||
| - "farmin", | ||
| - "fascinatin", | ||
| - "feedin", | ||
| - "feelin", | ||
| - "fightin", | ||
| - "filin", | ||
| - "fillin", | ||
| - "financin", | ||
| - "findin", | ||
| - "firin", | ||
| - "fishin", | ||
| - "fittin", | ||
| - "fixin", | ||
| - "floatin", | ||
| - "flowin", | ||
| - "flyin", | ||
| - "focusin", | ||
| - "followin", | ||
| - "forcin", | ||
| - "foregoin", | ||
| - "formin", | ||
| - "forthcomin", | ||
| - "foundin", | ||
| - "freezin", | ||
| - "fuckin", | ||
| - "functionin", | ||
| - "fundin", | ||
| - "gainin", | ||
| - "gatherin", | ||
| - "generatin", | ||
| - "gettin", | ||
| - "givin", | ||
| - "goin", | ||
| - "governin", | ||
| - "grantin", | ||
| - "growin", | ||
| - "hackin", | ||
| - "handlin", | ||
| - "hangin", | ||
| - "happenin", | ||
| - "havin", | ||
| - "headin", | ||
| - "healin", | ||
| - "hearin", | ||
| - "heatin", | ||
| - "helpin", | ||
| - "hidin", | ||
| - "holdin", | ||
| - "hopin", | ||
| - "housin", | ||
| - "huntin", | ||
| - "identifyin", | ||
| - "imagin", | ||
| - "implementin", | ||
| - "imposin", | ||
| - "improvin", | ||
| - "includin", | ||
| - "increasin", | ||
| - "indicatin", | ||
| - "interestin", | ||
| - "interpretin", | ||
| - "introducin", | ||
| - "involvin", | ||
| - "joinin", | ||
| - "judgin", | ||
| - "keepin", | ||
| - "killin", | ||
| - "knowin", | ||
| - "lackin", | ||
| - "landin", | ||
| - "lastin", | ||
| - "laughin", | ||
| - "layin", | ||
| - "leadin", | ||
| - "leanin", | ||
| - "learnin", | ||
| - "leavin", | ||
| - "lettin", | ||
| - "liftin", | ||
| - "lightin", | ||
| - "lightnin", | ||
| - "limitin", | ||
| - "listenin", | ||
| - "listin", | ||
| - "livin", | ||
| - "loadin", | ||
| - "lookin", | ||
| - "losin", | ||
| - "lovin", | ||
| - "lowerin", | ||
| - "lyin", | ||
| - "maintainin", | ||
| - "makin", | ||
| - "managin", | ||
| - "manufacturin", | ||
| - "mappin", | ||
| - "marketin", | ||
| - "markin", | ||
| - "matchin", | ||
| - "meanin", | ||
| - "measurin", | ||
| - "meetin", | ||
| - "meltin", | ||
| - "minin", | ||
| - "misleadin", | ||
| - "missin", | ||
| - "mixin", | ||
| - "modelin", | ||
| - "monitorin", | ||
| - "mornin", | ||
| - "movin", | ||
| - "neighborin", | ||
| - "nothin", | ||
| - "notin", | ||
| - "notwithstandin", | ||
| - "nursin", | ||
| - "observin", | ||
| - "obtainin", | ||
| - "occurrin", | ||
| - "offerin", | ||
| - "offsprin", | ||
| - "ongoin", | ||
| - "openin", | ||
| - "operatin", | ||
| - "opposin", | ||
| - "orderin", | ||
| - "organizin", | ||
| - "outstandin", | ||
| - "overwhelmin", | ||
| - "packin", | ||
| - "paintin", | ||
| - "parkin", | ||
| - "participatin", | ||
| - "passin", | ||
| - "payin", | ||
| - "pendin", | ||
| - "performin", | ||
| - "pickin", | ||
| - "pissin", | ||
| - "placin", | ||
| - "plannin", | ||
| - "plantin", | ||
| - "playin", | ||
| - "pleasin", | ||
| - "pointin", | ||
| - "possessin", | ||
| - "preachin", | ||
| - "precedin", | ||
| - "preparin", | ||
| - "presentin", | ||
| - "preservin", | ||
| - "pressin", | ||
| - "prevailin", | ||
| - "preventin", | ||
| - "pricin", | ||
| - "printin", | ||
| - "proceedin", | ||
| - "processin", | ||
| - "producin", | ||
| - "programmin", | ||
| - "promisin", | ||
| - "promotin", | ||
| - "protectin", | ||
| - "providin", | ||
| - "provin", | ||
| - "publishin", | ||
| - "pullin", | ||
| - "purchasin", | ||
| - "pursuin", | ||
| - "pushin", | ||
| - "puttin", | ||
| - "questionin", | ||
| - "rangin", | ||
| - "ratin", | ||
| - "reachin", | ||
| - "readin", | ||
| - "reasonin", | ||
| - "receivin", | ||
| - "recognizin", | ||
| - "recordin", | ||
| - "reducin", | ||
| - "referrin", | ||
| - "reflectin", | ||
| - "refusin", | ||
| - "regardin", | ||
| - "regulatin", | ||
| - "relatin", | ||
| - "remainin", | ||
| - "rememberin", | ||
| - "removin", | ||
| - "renderin", | ||
| - "repeatin", | ||
| - "replacin", | ||
| - "reportin", | ||
| - "representin", | ||
| - "requirin", | ||
| - "respectin", | ||
| - "respondin", | ||
| - "restin", | ||
| - "resultin", | ||
| - "returnin", | ||
| - "revealin", | ||
| - "ridin", | ||
| - "risin", | ||
| - "rulin", | ||
| - "runnin", | ||
| - "sailin", | ||
| - "samplin", | ||
| - "satisfyin", | ||
| - "savin", | ||
| - "sayin", | ||
| - "scatterin", | ||
| - "schoolin", | ||
| - "screenin", | ||
| - "searchin", | ||
| - "securin", | ||
| - "seein", | ||
| - "seekin", | ||
| - "selectin", | ||
| - "sellin", | ||
| - "sendin", | ||
| - "separatin", | ||
| - "servin", | ||
| - "settin", | ||
| - "settlin", | ||
| - "sewin", | ||
| - "shakin", | ||
| - "shapin", | ||
| - "sharin", | ||
| - "shiftin", | ||
| - "shinin", | ||
| - "shippin", | ||
| - "shittin", | ||
| - "shootin", | ||
| - "shoppin", | ||
| - "showin", | ||
| - "singin", | ||
| - "sinkin", | ||
| - "sittin", | ||
| - "sleepin", | ||
| - "smilin", | ||
| - "smokin", | ||
| - "spankin", | ||
| - "solvin", | ||
| - "somethin", | ||
| - "speakin", | ||
| - "spellin", | ||
| - "spendin", | ||
| - "spinnin", | ||
| - "spittin", | ||
| - "spreadin", | ||
| - "standin", | ||
| - "starin", | ||
| - "startin", | ||
| - "statin", | ||
| - "stayin", | ||
| - "stealin", | ||
| - "sterlin", | ||
| - "stimulatin", | ||
| - "stirrin", | ||
| - "stoppin", | ||
| - "strengthenin", | ||
| - "stretchin", | ||
| - "strikin", | ||
| - "strugglin", | ||
| - "studyin", | ||
| - "succeedin", | ||
| - "sufferin", | ||
| - "suggestin", | ||
| - "supplyin", | ||
| - "supportin", | ||
| - "surprisin", | ||
| - "surroundin", | ||
| - "survivin", | ||
| - "sweepin", | ||
| - "swellin", | ||
| - "swimmin", | ||
| - "switchin", | ||
| - "takin", | ||
| - "talkin", | ||
| - "teachin", | ||
| - "tellin", | ||
| - "testin", | ||
| - "thinkin", | ||
| - "threatenin", | ||
| - "throwin", | ||
| - "timin", | ||
| - "touchin", | ||
| - "tradin", | ||
| - "trainin", | ||
| - "travelin", | ||
| - "treatin", | ||
| - "tremblin", | ||
| - "tryin", | ||
| - "turnin", | ||
| - "underlyin", | ||
| - "understandin", | ||
| - "undertakin", | ||
| - "unwillin", | ||
| - "usin", | ||
| - "varyin", | ||
| - "viewin", | ||
| - "visitin", | ||
| - "votin", | ||
| - "waitin", | ||
| - "walkin", | ||
| - "wanderin", | ||
| - "wantin", | ||
| - "warnin", | ||
| - "washin", | ||
| - "watchin", | ||
| - "wearin", | ||
| - "weddin", | ||
| - "whackin", | ||
| - "willin", | ||
| - "windin", | ||
| - "winnin", | ||
| - "wishin", | ||
| - "wonderin", | ||
| - "workin", | ||
| - "writin", | ||
| - "yieldin" | ||
| - ); | ||
| -} | ||
| -/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| -package com.keenwrite.quotes; | ||
| - | ||
| -import java.util.ArrayList; | ||
| -import java.util.Map; | ||
| -import java.util.function.Consumer; | ||
| - | ||
| -import static com.keenwrite.quotes.TokenType.*; | ||
| -import static java.util.Collections.sort; | ||
| - | ||
| -/** | ||
| - * Responsible for replacing {@link Token} instances with equivalent smart | ||
| - * quotes (or straight quotes). This will inform the caller when ambiguous | ||
| - * quotes cannot be reliably resolved. | ||
| - */ | ||
| -public final class KeenQuotes { | ||
| - private static final Map<TokenType, String> REPLACEMENTS = Map.of( | ||
| - QUOTE_OPENING_SINGLE, "‘", | ||
| - QUOTE_CLOSING_SINGLE, "’", | ||
| - QUOTE_OPENING_DOUBLE, "“", | ||
| - QUOTE_CLOSING_DOUBLE, "”", | ||
| - QUOTE_STRAIGHT_SINGLE, "'", | ||
| - QUOTE_STRAIGHT_DOUBLE, "\"", | ||
| - QUOTE_APOSTROPHE, "'", | ||
| - QUOTE_PRIME_SINGLE, "′", | ||
| - QUOTE_PRIME_DOUBLE, "″" | ||
| - ); | ||
| - | ||
| - /** | ||
| - * Converts straight quotes to curly quotes and primes. Any quotation marks | ||
| - * that cannot be converted are passed to the {@link Consumer}. | ||
| - * | ||
| - * @param text The text to parse. | ||
| - * @param unresolved Recipient for ambiguous {@link Lexeme}s. | ||
| - * @return The given text string with as many straight quotes converted to | ||
| - * curly quotes as is feasible. | ||
| - */ | ||
| - public static String convert( | ||
| - final String text, final Consumer<Lexeme> unresolved ) { | ||
| - final var parser = new Parser( text ); | ||
| - final var tokens = new ArrayList<Token>(); | ||
| - | ||
| - // Parse the tokens and consume all unresolved lexemes. | ||
| - parser.parse( tokens::add, unresolved ); | ||
| - | ||
| - // The parser may emit tokens in any order. | ||
| - sort( tokens ); | ||
| - | ||
| - final var result = new StringBuilder( text.length() ); | ||
| - var position = 0; | ||
| - | ||
| - for( final var token : tokens ) { | ||
| - if( position <= token.began() ) { | ||
| - result.append( text, position, token.began() ); | ||
| - result.append( REPLACEMENTS.get( token.getType() ) ); | ||
| - } | ||
| - | ||
| - position = token.ended(); | ||
| - } | ||
| - | ||
| - return result.append( text.substring( position ) ).toString(); | ||
| - } | ||
| -} | ||
| -/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| -package com.keenwrite.quotes; | ||
| - | ||
| -import static com.keenwrite.quotes.LexemeType.FLAG; | ||
| - | ||
| -/** | ||
| - * Responsible for tracking the beginning and ending offsets of a lexeme within | ||
| - * a text string. Tracking the beginning and ending indices should use less | ||
| - * memory than duplicating the entire text of Unicode characters (i.e., using | ||
| - * a similar approach to run-length encoding). | ||
| - */ | ||
| -public final class Lexeme implements Comparable<Lexeme> { | ||
| - /** | ||
| - * Signifies an invalid index to help distinguish EOT/SOT. | ||
| - */ | ||
| - private static final int E_INDEX = -2; | ||
| - | ||
| - /** | ||
| - * Denotes there are no more lexemes: the end of text (EOT) has been reached. | ||
| - * The beginning index differentiates between EOT and SOT. | ||
| - */ | ||
| - public static final Lexeme EOT = new Lexeme( FLAG, -1, E_INDEX ); | ||
| - | ||
| - /** | ||
| - * Denotes parsing at the start of text (SOT). This is useful to avoid | ||
| - * branching conditions while iterating. The beginning index differentiates | ||
| - * between EOT and SOT. | ||
| - */ | ||
| - public static final Lexeme SOT = new Lexeme( FLAG, 0, E_INDEX ); | ||
| - | ||
| - private final LexemeType mType; | ||
| - private final int mBegan; | ||
| - private final int mEnded; | ||
| - | ||
| - /** | ||
| - * Create a lexeme that represents a section of the text. | ||
| - * | ||
| - * @param type Type of {@link Lexeme} to create. | ||
| - * @param began Offset into the text where this instance starts (0-based). | ||
| - * @param ended Offset into the text where this instance stops (0-based). | ||
| - */ | ||
| - private Lexeme( final LexemeType type, final int began, final int ended ) { | ||
| - assert type != null; | ||
| - assert began >= 0 || ended == E_INDEX; | ||
| - assert ended >= began || ended == E_INDEX; | ||
| - | ||
| - mType = type; | ||
| - mBegan = began; | ||
| - mEnded = ended + 1; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Extracts a sequence of characters from the given text at the offsets | ||
| - * captured by this lexeme. | ||
| - * | ||
| - * @param text The text that was parsed using this class. | ||
| - * @return The character string captured by the lexeme. | ||
| - */ | ||
| - public String toString( final String text ) { | ||
| - assert text != null; | ||
| - return text.substring( mBegan, mEnded ); | ||
| - } | ||
| - | ||
| - @Override | ||
| - public String toString() { | ||
| - return getClass().getSimpleName() + '{' + | ||
| - "mType=" + mType + | ||
| - ", mBegan=" + mBegan + | ||
| - ", mEnded=" + mEnded + | ||
| - '}'; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Answers whether the given {@link LexemeType} is the same as this | ||
| - * instance's internal {@link LexemeType}. | ||
| - * | ||
| - * @param type The {@link LexemeType} to compare. | ||
| - * @return {@code true} if the given {@link LexemeType} is equal to the | ||
| - * internal {@link LexemeType}. | ||
| - */ | ||
| - public boolean isType( final LexemeType type ) { | ||
| - assert type != null; | ||
| - return mType == type; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Answers whether any of the given {@link LexemeType} matches this | ||
| - * instance's internal {@link LexemeType}. | ||
| - * | ||
| - * @param types The {@link LexemeType}s to compare. | ||
| - * @return {@code true} if the internal {@link LexemeType} matches any one | ||
| - * of the given {@link LexemeType}s. | ||
| - */ | ||
| - public boolean anyType( final LexemeType... types ) { | ||
| - assert types != null; | ||
| - | ||
| - for( final var type : types ) { | ||
| - if( mType == type ) { | ||
| - return true; | ||
| - } | ||
| - } | ||
| - | ||
| - return false; | ||
| - } | ||
| - | ||
| - public int began() { | ||
| - return mBegan; | ||
| - } | ||
| - | ||
| - public int ended() { | ||
| - return mEnded; | ||
| - } | ||
| - | ||
| - boolean isSot() { | ||
| - return mBegan == 0; | ||
| - } | ||
| - | ||
| - boolean isEot() { | ||
| - return mBegan == -1; | ||
| - } | ||
| - | ||
| - LexemeType getType() { | ||
| - return mType; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Compares the starting offset of the given {@link Lexeme} to the starting | ||
| - * offset of this {@link Lexeme} instance. This allows a list {@link Lexeme}s | ||
| - * to be sorted by order of appearance in the parsed text. | ||
| - */ | ||
| - @Override | ||
| - public int compareTo( final Lexeme that ) { | ||
| - assert that != null; | ||
| - return this.mBegan - that.mBegan; | ||
| - } | ||
| - | ||
| - static Lexeme createLexeme( | ||
| - final LexemeType lexeme, final int began, final int ended ) { | ||
| - assert lexeme != null; | ||
| - return new Lexeme( lexeme, began, ended ); | ||
| - } | ||
| -} | ||
| -/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| -package com.keenwrite.quotes; | ||
| - | ||
/**
 * Classifies each {@link Lexeme} that the {@link Lexer} produces.
 */
public enum LexemeType {
  /** Straight single quote ({@code '}). */
  QUOTE_SINGLE,

  /** Curly opening single quote already present in the text. */
  QUOTE_SINGLE_OPENING,

  /** Curly closing single quote already present in the text. */
  QUOTE_SINGLE_CLOSING,

  /** Straight double quote ({@code "}). */
  QUOTE_DOUBLE,

  /** Backslash followed by a straight single quote. */
  ESC_SINGLE,

  /** Backslash followed by a straight double quote. */
  ESC_DOUBLE,

  /** A single line break: one CR, one LF, or one of each. */
  EOL,

  /** Any longer run of CR and/or LF characters (end of paragraph). */
  EOP,

  /** A space or other whitespace character. */
  SPACE,

  /** A run of letters, including {@code _} and {@code *}. */
  WORD,

  /** A run of digits and numeric punctuation. */
  NUMBER,

  /** Any character not covered by another type. */
  PUNCT,

  /** One of {@code (}, <code>{</code>, or {@code [}. */
  OPENING_GROUP,

  /** One of {@code )}, <code>}</code>, or {@code ]}. */
  CLOSING_GROUP,

  /** A lone {@code -} character. */
  HYPHEN,

  /** Two or more consecutive hyphens, or an em-dash. */
  DASH,

  /** The {@code =} character. */
  EQUALS,

  /** A lone {@code .} character. */
  PERIOD,

  /** Two or more consecutive periods. */
  ELLIPSIS,

  /** Marker type used by the start-of-text and end-of-text lexemes. */
  FLAG
}
| -/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| -package com.keenwrite.quotes; | ||
| - | ||
| -import java.text.CharacterIterator; | ||
| -import java.text.StringCharacterIterator; | ||
| -import java.util.function.BiFunction; | ||
| - | ||
| -import static com.keenwrite.quotes.Lexeme.createLexeme; | ||
| -import static com.keenwrite.quotes.LexemeType.*; | ||
| -import static java.lang.Character.isDigit; | ||
| -import static java.lang.Character.isWhitespace; | ||
| -import static java.text.CharacterIterator.DONE; | ||
| - | ||
| -/** | ||
| - * Turns text into words, numbers, punctuation, spaces, and more. | ||
| - */ | ||
| -public class Lexer { | ||
| - /** | ||
| - * Iterates over the entire string of text to help produce lexemes. | ||
| - */ | ||
| - private final CharacterIterator mIterator; | ||
| - | ||
| - /** | ||
| - * Default constructor, no state. | ||
| - */ | ||
| - public Lexer( final String text ) { | ||
| - mIterator = new StringCharacterIterator( text ); | ||
| - } | ||
| - | ||
| - public Lexeme next() { | ||
| - return parse( mIterator ); | ||
| - } | ||
| - | ||
| - /** | ||
| - * Tokenizes a sequence of characters. The order of comparisons is optimized | ||
| - * towards probability of the occurrence of a character in regular English | ||
| - * prose: letters, space, quotation marks, numbers, periods, new lines, | ||
| - * then end of text. | ||
| - * | ||
| - * @param i The sequence of characters to tokenize. | ||
| - * @return The next token in the sequence. | ||
| - */ | ||
| - private Lexeme parse( final CharacterIterator i ) { | ||
| - int began = i.getIndex(); | ||
| - boolean isWord = false; | ||
| - Lexeme lexeme = null; | ||
| - | ||
| - do { | ||
| - final var curr = i.current(); | ||
| - | ||
| - if( isLetter( curr ) ) { | ||
| - isWord = true; | ||
| - | ||
| - if( !isLetter( peek( i ) ) ) { | ||
| - lexeme = createLexeme( WORD, began, i.getIndex() ); | ||
| - } | ||
| - } | ||
| - else if( curr == ' ' ) { | ||
| - lexeme = createLexeme( SPACE, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr == '\'' ) { | ||
| - lexeme = createLexeme( QUOTE_SINGLE, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr == '"' ) { | ||
| - lexeme = createLexeme( QUOTE_DOUBLE, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr == '‘') { | ||
| - lexeme = createLexeme( QUOTE_SINGLE_OPENING, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr == '’') { | ||
| - lexeme = createLexeme( QUOTE_SINGLE_CLOSING, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr == '-' && peek( i ) == '-' || curr == '—' ) { | ||
| - slurp( i, ( next, ci ) -> next == '-' || next == '—' ); | ||
| - | ||
| - lexeme = createLexeme( DASH, began, i.getIndex() ); | ||
| - } | ||
| - else if( isDigit( curr ) || isNumeric( curr ) && isDigit( peek( i ) ) ) { | ||
| - // Parse all consecutive number characters to prevent the main loop | ||
| - // from switching back to word tokens. | ||
| - slurp( i, ( next, ci ) -> | ||
| - isDigit( next ) || isNumeric( next ) && isDigit( peek( ci ) ) | ||
| - ); | ||
| - | ||
| - lexeme = createLexeme( isWord ? WORD : NUMBER, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr == '-' ) { | ||
| - lexeme = createLexeme( HYPHEN, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr == '.' ) { | ||
| - // Parse all consecutive periods into an ellipsis lexeme. This will | ||
| - // not capture space-separated ellipsis (such as ". . ."). | ||
| - lexeme = createLexeme( | ||
| - slurp( i, ( next, ci ) -> next == '.' ) == 0 ? PERIOD : ELLIPSIS, | ||
| - began, i.getIndex() | ||
| - ); | ||
| - } | ||
| - else if( curr == '\r' || curr == '\n' ) { | ||
| - final var cr = new int[]{curr == '\r' ? 1 : 0}; | ||
| - final var lf = new int[]{curr == '\n' ? 1 : 0}; | ||
| - | ||
| - // Swallow all consecutive CR (Mac), CRLF (Windows), and/or LF (Unix). | ||
| - slurp( | ||
| - i, ( next, ci ) -> { | ||
| - cr[ 0 ] += next == '\r' ? 1 : 0; | ||
| - lf[ 0 ] += next == '\n' ? 1 : 0; | ||
| - return next == '\r' || next == '\n'; | ||
| - } | ||
| - ); | ||
| - | ||
| - final var eol = cr[ 0 ] + lf[ 0 ] == 1 || cr[ 0 ] == 1 && lf[ 0 ] == 1; | ||
| - lexeme = createLexeme( eol ? EOL : EOP, began, i.getIndex() ); | ||
| - } | ||
| - else if( isWhitespace( curr ) ) { | ||
| - lexeme = createLexeme( SPACE, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr == '\\' ) { | ||
| - final var next = i.next(); | ||
| - | ||
| - if( next == '\'' ) { | ||
| - lexeme = createLexeme( ESC_SINGLE, began, i.getIndex() ); | ||
| - } | ||
| - else if( next == '\"' ) { | ||
| - lexeme = createLexeme( ESC_DOUBLE, began, i.getIndex() ); | ||
| - } | ||
| - else { | ||
| - // Push back the escaped character, which wasn't a straight quote. | ||
| - i.previous(); | ||
| - } | ||
| - } | ||
| - else if( curr == '(' || curr == '{' || curr == '[' ) { | ||
| - lexeme = createLexeme( OPENING_GROUP, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr == ')' || curr == '}' || curr == ']' ) { | ||
| - lexeme = createLexeme( CLOSING_GROUP, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr == '=' ) { | ||
| - lexeme = createLexeme( EQUALS, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr != DONE ) { | ||
| - lexeme = createLexeme( PUNCT, began, i.getIndex() ); | ||
| - } | ||
| - else { | ||
| - lexeme = Lexeme.EOT; | ||
| - } | ||
| - | ||
| - i.next(); | ||
| - } | ||
| - while( lexeme == null ); | ||
| - | ||
| - return lexeme; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Answers whether the given character can be considered part of a word | ||
| - * or not. This will include {@code _} and {@code *} because plain text | ||
| - * formats often use those characters to emphasize a word. | ||
| - * | ||
| - * @param curr The character to check as being part of a word. | ||
| - * @return {@code true} if the given character is a letter or a formatting | ||
| - * indicator. | ||
| - */ | ||
| - private static boolean isLetter( final char curr ) { | ||
| - return Character.isLetter( curr ) || curr == '_' || curr == '*'; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Answers whether the given character can be considered part of a number | ||
| - * or not. This does not include digits, which are checked independently | ||
| - * from this method. | ||
| - * | ||
| - * @param curr The character to check as being related to numbers. | ||
| - * @return {@code true} if the given character can be considered part of | ||
| - * a number (e.g., -2,000.2^2 is considered a single number). | ||
| - */ | ||
| - private static boolean isNumeric( final char curr ) { | ||
| - return | ||
| - curr == '.' || curr == ',' || curr == '-' || curr == '+' || curr == '^'; | ||
| - } | ||
| - | ||
| - private static char peek( final CharacterIterator ci ) { | ||
| - final var ch = ci.next(); | ||
| - ci.previous(); | ||
| - return ch; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Parse all characters that match a given function. | ||
| - * | ||
| - * @param ci The iterator containing characters to parse. | ||
| - * @param f The function that determines when slurping stops. | ||
| - * @return The number of characters parsed. | ||
| - */ | ||
| - private static int slurp( | ||
| - final CharacterIterator ci, | ||
| - final BiFunction<Character, CharacterIterator, Boolean> f ) { | ||
| - char next; | ||
| - int count = 0; | ||
| - | ||
| - do { | ||
| - next = ci.next(); | ||
| - count++; | ||
| - } | ||
| - while( f.apply( next, ci ) ); | ||
| - | ||
| - // The loop above will overshoot the tally by one character. | ||
| - ci.previous(); | ||
| - | ||
| - return --count; | ||
| - } | ||
| -} | ||
| -/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| -package com.keenwrite.quotes; | ||
| - | ||
| -import java.util.ArrayList; | ||
| -import java.util.List; | ||
| -import java.util.function.Consumer; | ||
| - | ||
| -import static com.keenwrite.quotes.Lexeme.EOT; | ||
| -import static com.keenwrite.quotes.Lexeme.SOT; | ||
| -import static com.keenwrite.quotes.LexemeType.*; | ||
| -import static com.keenwrite.quotes.TokenType.*; | ||
| - | ||
| -/** | ||
| - * Converts straight double/single quotes and apostrophes to curly equivalents. | ||
| - */ | ||
| -public final class Parser { | ||
| - /** | ||
| - * Single quotes preceded by these {@link LexemeType}s may be opening quotes. | ||
| - */ | ||
| - private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE = | ||
| - new LexemeType[]{SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP}; | ||
| - | ||
| - /** | ||
| - * Single quotes succeeded by these {@link LexemeType}s may be opening quotes. | ||
| - */ | ||
| - private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE = | ||
| - new LexemeType[]{WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE}; | ||
| - | ||
| - /** | ||
| - * Single quotes preceded by these {@link LexemeType}s may be closing quotes. | ||
| - */ | ||
| - private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE = | ||
| - new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE}; | ||
| - | ||
| - /** | ||
| - * Single quotes succeeded by these {@link LexemeType}s may be closing quotes. | ||
| - */ | ||
| - private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE = | ||
| - new LexemeType[]{SPACE, HYPHEN, DASH, | ||
| - QUOTE_DOUBLE, CLOSING_GROUP, EOL, EOP}; | ||
| - | ||
| - /** | ||
| - * Double quotes preceded by these {@link LexemeType}s may be opening quotes. | ||
| - */ | ||
| - private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE = | ||
| - new LexemeType[]{SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL, EOP}; | ||
| - | ||
| - /** | ||
| - * Double quotes succeeded by these {@link LexemeType}s may be opening quotes. | ||
| - */ | ||
| - private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE = | ||
| - new LexemeType[]{WORD, NUMBER, ELLIPSIS, | ||
| - QUOTE_SINGLE, QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING, QUOTE_DOUBLE}; | ||
| - | ||
| - /** | ||
| - * Double quotes preceded by these {@link LexemeType}s may be closing quotes. | ||
| - */ | ||
| - private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE = | ||
| - new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS, | ||
| - QUOTE_SINGLE, QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING}; | ||
| - | ||
| - /** | ||
| - * Double quotes succeeded by these {@link LexemeType}s may be closing quotes. | ||
| - */ | ||
| - private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE = | ||
| - new LexemeType[]{SPACE, PUNCT, PERIOD, EQUALS, HYPHEN, DASH, | ||
| - QUOTE_SINGLE, CLOSING_GROUP, EOL, EOP}; | ||
| - | ||
| - /** | ||
| - * The text to parse. A reference is required as a minor optimization in | ||
| - * memory and speed: the lexer records integer offsets, rather than new | ||
| - * {@link String} instances, to track parsed lexemes. | ||
| - */ | ||
| - private final String mText; | ||
| - | ||
| - /** | ||
| - * Converts a string into an iterable list of {@link Lexeme} instances. | ||
| - */ | ||
| - private final Lexer mLexer; | ||
| - | ||
| - /** | ||
| - * Sets of contractions that help disambiguate single quotes in the text. | ||
| - * These are effectively immutable while parsing. | ||
| - */ | ||
| - private final Contractions sContractions; | ||
| - | ||
| - /** | ||
| - * Incremented for each opening single quote emitted. Used to help resolve | ||
| - * ambiguities when single quote marks are balanced. | ||
| - */ | ||
| - private int mOpeningSingleQuote; | ||
| - | ||
| - /** | ||
| - * Incremented for each closing single quote emitted. Used to help resolve | ||
| - * ambiguities when single quote marks are balanced. | ||
| - */ | ||
| - private int mClosingSingleQuote; | ||
| - | ||
| - /** | ||
| - * Constructs a new {@link Parser} using the default contraction sets | ||
| - * to help resolve some ambiguous scenarios. | ||
| - * | ||
| - * @param text The prose to parse, containing zero or more quotation | ||
| - * characters. | ||
| - */ | ||
| - public Parser( final String text ) { | ||
| - this( text, new Contractions.Builder().build() ); | ||
| - } | ||
| - | ||
| - /** | ||
| - * Constructs a new {@link Parser} using the default contraction sets | ||
| - * to help resolve some ambiguous scenarios. | ||
| - * | ||
| - * @param text The prose to parse, containing zero or more quotation | ||
| - * characters. | ||
| - * @param contractions Custom sets of contractions to help resolve | ||
| - * ambiguities. | ||
| - */ | ||
| - public Parser( final String text, final Contractions contractions ) { | ||
| - mText = text; | ||
| - mLexer = new Lexer( mText ); | ||
| - sContractions = contractions; | ||
| - } | ||
| - | ||
| - /** | ||
| - * Iterates over the entire text provided at construction, emitting | ||
| - * {@link Token}s that can be used to convert straight quotes to curly | ||
| - * quotes. | ||
| - * | ||
| - * @param tokenConsumer Receives emitted {@link Token}s. | ||
| - */ | ||
| - public void parse( | ||
| - final Consumer<Token> tokenConsumer, | ||
| - final Consumer<Lexeme> lexemeConsumer ) { | ||
| - final var lexemes = new CircularFifoQueue<Lexeme>( 3 ); | ||
| - | ||
| - // Allow consuming the very first token without checking the queue size. | ||
| - flush( lexemes ); | ||
| - | ||
| - final var unresolved = new ArrayList<Lexeme[]>(); | ||
| - Lexeme lexeme; | ||
| - | ||
| - // Create and convert a list of all unambiguous quote characters. | ||
| - while( (lexeme = mLexer.next()) != EOT ) { | ||
| - if( tokenize( lexeme, lexemes, tokenConsumer, unresolved ) ) { | ||
| - // Attempt to resolve any remaining unambiguous quotes. | ||
| - resolve( unresolved, tokenConsumer ); | ||
| - | ||
| - // Notify of any unambiguous quotes that could not be resolved. | ||
| - unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) ); | ||
| - unresolved.clear(); | ||
| - mOpeningSingleQuote = 0; | ||
| - mClosingSingleQuote = 0; | ||
| - } | ||
| - } | ||
| - | ||
| - // By loop's end, the lexemes list contains tokens for all except the | ||
| - // final two elements (from tokenizing in triplets). Tokenize the remaining | ||
| - // unprocessed lexemes. | ||
| - tokenize( EOT, lexemes, tokenConsumer, unresolved ); | ||
| - tokenize( EOT, lexemes, tokenConsumer, unresolved ); | ||
| - | ||
| - // Attempt to resolve any remaining unambiguous quotes. | ||
| - resolve( unresolved, tokenConsumer ); | ||
| - | ||
| - // Notify of any unambiguous quotes that could not be resolved. | ||
| - unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) ); | ||
| - } | ||
| - | ||
| - /** | ||
| - * Converts {@link Lexeme}s identified as straight quotes into {@link Token}s | ||
| - * that represent the curly equivalent. The {@link Token}s are passed to | ||
| - * the given {@link Consumer} for further processing (e.g., replaced in | ||
| - * the original text being parsed). | ||
| - * | ||
| - * @param lexeme A part of the text being parsed. | ||
| - * @param lexemes A 3-element queue of lexemes that provide sufficient | ||
| - * context to identify curly quotes. | ||
| - * @param consumer Recipient of equivalent quotes. | ||
| - * @param unresolved Rolling list of potentially ambiguous {@link Lexeme}s | ||
| - * that could not be tokenized, yet. | ||
| - * @return {@code true} if an end-of-paragraph is detected. | ||
| - */ | ||
| - private boolean tokenize( final Lexeme lexeme, | ||
| - final CircularFifoQueue<Lexeme> lexemes, | ||
| - final Consumer<Token> consumer, | ||
| - final List<Lexeme[]> unresolved ) { | ||
| - // Add the next lexeme to tokenize into the queue for immediate processing. | ||
| - lexemes.add( lexeme ); | ||
| - | ||
| - final var lex1 = lexemes.get( 0 ); | ||
| - final var lex2 = lexemes.get( 1 ); | ||
| - final var lex3 = lexemes.get( 2 ); | ||
| - | ||
| - if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( WORD ) && | ||
| - lex1.anyType( WORD, PERIOD, NUMBER ) ) { | ||
| - // Examples: y'all, Ph.D.'ll, 20's, she's | ||
| - consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); | ||
| - } | ||
| - else if( lex1.isType( QUOTE_SINGLE ) && lex3.isType( QUOTE_SINGLE ) && | ||
| - "n".equalsIgnoreCase( lex2.toString( mText ) ) ) { | ||
| - // I.e., 'n' | ||
| - consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) ); | ||
| - consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) ); | ||
| - flush( lexemes ); | ||
| - truncate( unresolved ); | ||
| - } | ||
| - else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) { | ||
| - if( lex3.isType( QUOTE_SINGLE ) ) { | ||
| - // E.g., 2'' | ||
| - consumer.accept( | ||
| - new Token( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ) ); | ||
| - flush( lexemes ); | ||
| - } | ||
| - else { | ||
| - // E.g., 2' | ||
| - consumer.accept( new Token( QUOTE_PRIME_SINGLE, lex2 ) ); | ||
| - } | ||
| - } | ||
| - else if( lex2.isType( QUOTE_DOUBLE ) && lex1.isType( NUMBER ) ) { | ||
| - // E.g., 2" | ||
| - consumer.accept( new Token( QUOTE_PRIME_DOUBLE, lex2 ) ); | ||
| - } | ||
| - else if( lex2.isType( WORD ) && lex3.isType( QUOTE_SINGLE ) && | ||
| - sContractions.endedUnambiguously( lex2.toString( mText ) ) ) { | ||
| - // E.g., thinkin' | ||
| - consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) ); | ||
| - flush( lexemes ); | ||
| - } | ||
| - else if( lex2.isType( NUMBER ) && lex1.isType( QUOTE_SINGLE ) ) { | ||
| - if( lex3.anyType( SPACE, PUNCT ) || (lex3.isType( WORD ) && | ||
| - lex3.toString( mText ).equalsIgnoreCase( "s" )) ) { | ||
| - // Sentences must re-written to avoid starting with numerals. | ||
| - // Examples: '20s, '02 | ||
| - consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) ); | ||
| - } | ||
| - else { | ||
| - // E.g., '2'' | ||
| - consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex1 ) ); | ||
| - mOpeningSingleQuote++; | ||
| - } | ||
| - | ||
| - truncate( unresolved ); | ||
| - } | ||
| - else if( lex2.isType( QUOTE_SINGLE ) && | ||
| - lex1.anyType( PUNCT, PERIOD, ELLIPSIS, DASH ) && | ||
| - (lex3.anyType( EOL, EOP ) || lex3.isEot()) ) { | ||
| - consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); | ||
| - mClosingSingleQuote++; | ||
| - } | ||
| - else if( lex1.isType( ESC_SINGLE ) ) { | ||
| - // E.g., \' | ||
| - consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) ); | ||
| - } | ||
| - else if( lex1.isType( ESC_DOUBLE ) ) { | ||
| - // E.g., \" | ||
| - consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) ); | ||
| - | ||
| - if( lex2.isType( QUOTE_SINGLE ) && | ||
| - (lex3.isEot() || lex3.anyType( SPACE, DASH, EOL, EOP )) ) { | ||
| - consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); | ||
| - mClosingSingleQuote++; | ||
| - } | ||
| - } | ||
| - else if( lex2.isType( QUOTE_DOUBLE ) && | ||
| - (lex1.isSot() || lex1.anyType( LEADING_QUOTE_OPENING_DOUBLE )) && | ||
| - lex3.anyType( LAGGING_QUOTE_OPENING_DOUBLE ) ) { | ||
| - // Examples: "", "..., "word, ---"word | ||
| - consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) ); | ||
| - } | ||
| - else if( lex2.isType( QUOTE_DOUBLE ) && | ||
| - lex1.anyType( LEADING_QUOTE_CLOSING_DOUBLE ) && | ||
| - (lex3.isEot() || lex3.anyType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) { | ||
| - // Examples: ..."', word"', ?"', word"? | ||
| - consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) ); | ||
| - } | ||
| - else if( lex1.isType( QUOTE_SINGLE ) && | ||
| - lex2.anyType( PUNCT, PERIOD, DASH ) && lex3.isType( QUOTE_DOUBLE ) ) { | ||
| - // E.g., '," (contraction ruled out from previous conditionals) | ||
| - consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex1 ) ); | ||
| - truncate( unresolved ); | ||
| - mClosingSingleQuote++; | ||
| - } | ||
| - else if( lex2.anyType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) { | ||
| - // After tokenizing, the parser will attempt to resolve ambiguities. | ||
| - unresolved.add( new Lexeme[]{lex1, lex2, lex3} ); | ||
| - } | ||
| - | ||
| - // Suggest to the caller that resolution should be performed. This allows | ||
| - // the algorithm to reset the opening/closing quote balance before the | ||
| - // next paragraph is parsed. | ||
| - return lex3.isType( EOP ); | ||
| - } | ||
| - | ||
| - private void resolve( | ||
| - final List<Lexeme[]> unresolved, final Consumer<Token> consumer ) { | ||
| - // Some non-emitted tokenized lexemes may be ambiguous. | ||
| - final var ambiguousLeadingQuotes = new ArrayList<Lexeme[]>( 16 ); | ||
| - final var ambiguousLaggingQuotes = new ArrayList<Lexeme[]>( 16 ); | ||
| - var resolvedLeadingQuotes = 0; | ||
| - var resolvedLaggingQuotes = 0; | ||
| - | ||
| - // Count the number of ambiguous and non-ambiguous open single quotes. | ||
| - for( var i = unresolved.iterator(); i.hasNext(); ) { | ||
| - final var quotes = i.next(); | ||
| - final var lex1 = quotes[ 0 ]; | ||
| - final var lex2 = quotes[ 1 ]; | ||
| - final var lex3 = quotes[ 2 ]; | ||
| - | ||
| - if( lex2.isType( QUOTE_SINGLE ) ) { | ||
| - final var word1 = lex1 == SOT ? "" : lex1.toString( mText ); | ||
| - final var word3 = lex3 == EOT ? "" : lex3.toString( mText ); | ||
| - | ||
| - if( sContractions.beganAmbiguously( word3 ) ) { | ||
| - // E.g., 'Cause | ||
| - if( lex1.isType( QUOTE_SINGLE ) ) { | ||
| - // E.g., ''Cause | ||
| - consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); | ||
| - i.remove(); | ||
| - } | ||
| - else { | ||
| - // The contraction is uncertain until a closing quote is found that | ||
| - // may balance this single quote. | ||
| - ambiguousLeadingQuotes.add( quotes ); | ||
| - } | ||
| - } | ||
| - else if( sContractions.beganUnambiguously( word3 ) ) { | ||
| - // The quote mark forms a word that does not stand alone from its | ||
| - // contraction. For example, twas is not a word: it's 'twas. | ||
| - consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); | ||
| - i.remove(); | ||
| - } | ||
| - else if( sContractions.endedAmbiguously( word1 ) ) { | ||
| - ambiguousLaggingQuotes.add( quotes ); | ||
| - } | ||
| - else if( (lex1.isSot() || lex1.anyType( LEADING_QUOTE_OPENING_SINGLE )) | ||
| - && lex3.anyType( LAGGING_QUOTE_OPENING_SINGLE ) ) { | ||
| - consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) ); | ||
| - resolvedLeadingQuotes++; | ||
| - mOpeningSingleQuote++; | ||
| - i.remove(); | ||
| - } | ||
| - else if( lex1.anyType( LEADING_QUOTE_CLOSING_SINGLE ) && | ||
| - (lex3.isEot() || lex3.anyType( LAGGING_QUOTE_CLOSING_SINGLE )) ) { | ||
| - consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); | ||
| - resolvedLaggingQuotes++; | ||
| - mClosingSingleQuote++; | ||
| - i.remove(); | ||
| - } | ||
| - else if( lex3.isType( NUMBER ) ) { | ||
| - // E.g., '04 | ||
| - ambiguousLeadingQuotes.add( quotes ); | ||
| - } | ||
| - } | ||
| - } | ||
| - | ||
| - final var ambiguousLeadingCount = ambiguousLeadingQuotes.size(); | ||
| - final var ambiguousLaggingCount = ambiguousLaggingQuotes.size(); | ||
| - | ||
| - if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) { | ||
| - if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) { | ||
| - final var balanced = mClosingSingleQuote - mOpeningSingleQuote == 0; | ||
| - final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE; | ||
| - final var lex = ambiguousLaggingQuotes.get( 0 ); | ||
| - consumer.accept( new Token( quote, lex[ 1 ] ) ); | ||
| - unresolved.remove( lex ); | ||
| - } | ||
| - else if( ambiguousLeadingCount == 0 && unresolved.size() == 1 ) { | ||
| - // Must be a closing quote. | ||
| - final var closing = unresolved.get( 0 ); | ||
| - consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); | ||
| - unresolved.remove( closing ); | ||
| - } | ||
| - } | ||
| - else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) { | ||
| - // If there are no ambiguous leading quotes then all ambiguous lagging | ||
| - // quotes must be contractions. | ||
| - ambiguousLaggingQuotes.forEach( | ||
| - lex -> { | ||
| - consumer.accept( new Token( QUOTE_APOSTROPHE, lex[ 1 ] ) ); | ||
| - unresolved.remove( lex ); | ||
| - } | ||
| - ); | ||
| - } | ||
| - else if( ambiguousLeadingCount == 0 ) { | ||
| - if( resolvedLaggingQuotes < resolvedLeadingQuotes ) { | ||
| - for( final var i = unresolved.iterator(); i.hasNext(); ) { | ||
| - final var closing = i.next()[ 1 ]; | ||
| - consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing ) ); | ||
| - i.remove(); | ||
| - } | ||
| - } | ||
| - else if( mOpeningSingleQuote - mClosingSingleQuote == unresolved.size() ) { | ||
| - for( final var i = unresolved.iterator(); i.hasNext(); ) { | ||
| - final var closing = i.next(); | ||
| - consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); | ||
| - i.remove(); | ||
| - } | ||
| - } | ||
| - else if( unresolved.size() == 2 ) { | ||
| - final var closing = unresolved.get( 0 ); | ||
| - final var opening = unresolved.get( 1 ); | ||
| - consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); | ||
| - consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) ); | ||
| - | ||
| - // Doesn't affect the algorithm. | ||
| - unresolved.clear(); | ||
| - } | ||
| - } | ||
| - else if( ambiguousLeadingCount == 1 && resolvedLaggingQuotes == 1 ) { | ||
| - final var opening = ambiguousLeadingQuotes.get( 0 ); | ||
| - consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) ); | ||
| - unresolved.remove( opening ); | ||
| - } | ||
| - } | ||
| - | ||
| - /** | ||
| - * Remove the last {@link Lexeme}s from the given list. | ||
| - * | ||
| - * @param unresolved The list of {@link Lexeme}s to modify. | ||
| - */ | ||
| - private void truncate( final List<Lexeme[]> unresolved ) { | ||
| - if( !unresolved.isEmpty() ) { | ||
| - unresolved.remove( unresolved.size() - 1 ); | ||
| - } | ||
| - } | ||
| - | ||
| - /** | ||
| - * Overwrites the {@link CircularFifoQueue}'s contents with start-of-text | ||
| - * indicators. | ||
| - * | ||
| - * @param lexemes The queue to overflow. | ||
| - */ | ||
| - private void flush( final CircularFifoQueue<Lexeme> lexemes ) { | ||
| - lexemes.add( SOT ); | ||
| - lexemes.add( SOT ); | ||
| - lexemes.add( SOT ); | ||
| - } | ||
| -} | ||
| -/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| -package com.keenwrite.quotes; | ||
| - | ||
| -/** | ||
| - * Represents a high-level token read from the text. | ||
| - */ | ||
| -final class Token implements Comparable<Token> { | ||
| - private final TokenType mType; | ||
| - final int mBegan; | ||
| - final int mEnded; | ||
| - | ||
| - /** | ||
| - * Convenience constructor to create a token that uses the lexeme's | ||
| - * beginning and ending offsets to represent a complete token. | ||
| - * | ||
| - * @param type The type of {@link Token} to create. | ||
| - * @param lexeme Container for beginning and ending text offsets. | ||
| - */ | ||
| - Token( final TokenType type, final Lexeme lexeme ) { | ||
| - this( type, lexeme.began(), lexeme.ended() ); | ||
| - } | ||
| - | ||
| - /** | ||
| - * This constructor can be used to create tokens that span more than a | ||
| - * single character. Almost all tokens represent a single character, only | ||
| - * the double-prime sequence ({@code ''}) is more than one character. | ||
| - * | ||
| - * @param type The type of {@link Token} to create. | ||
| - * @param began Beginning offset into text where token is found. | ||
| - * @param ended Ending offset into text where token is found. | ||
| - */ | ||
| - Token( final TokenType type, final int began, final int ended ) { | ||
| - assert type != null; | ||
| - assert began >= 0; | ||
| - assert ended > began; | ||
| - | ||
| - mType = type; | ||
| - mBegan = began; | ||
| - mEnded = ended; | ||
| - } | ||
| - | ||
| - TokenType getType() { | ||
| - return mType; | ||
| - } | ||
| - | ||
| - int began() { | ||
| - return mBegan; | ||
| - } | ||
| - | ||
| - int ended() { | ||
| - return mEnded; | ||
| - } | ||
| - | ||
| - @Override | ||
| - public int compareTo( final Token that ) { | ||
| - return this.mBegan - that.mBegan; | ||
| - } | ||
| -} | ||
| -/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| -package com.keenwrite.quotes; | ||
| - | ||
/**
 * Enumerates the kinds of {@link Token} that the {@link Parser} emits. The
 * declaration order is kept stable.
 */
enum TokenType {
  /** Single quotation mark that opens a quotation. */
  QUOTE_OPENING_SINGLE,

  /** Double quotation mark that opens a quotation. */
  QUOTE_OPENING_DOUBLE,

  /** Single quotation mark that closes a quotation. */
  QUOTE_CLOSING_SINGLE,

  /** Double quotation mark that closes a quotation. */
  QUOTE_CLOSING_DOUBLE,

  /** Apostrophe, such as the mark inside a contraction. */
  QUOTE_APOSTROPHE,

  /** Single quotation mark classified as straight. */
  QUOTE_STRAIGHT_SINGLE,

  /** Double quotation mark classified as straight. */
  QUOTE_STRAIGHT_DOUBLE,

  /** Single prime mark. */
  QUOTE_PRIME_SINGLE,

  /** Double prime mark. */
  QUOTE_PRIME_DOUBLE
}
| -/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| -package com.keenwrite.quotes; | ||
| - | ||
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.function.Function;

import static java.lang.System.out;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
| - | ||
| -/** | ||
| - * Test that English straight quotes are converted to curly quotes and | ||
| - * apostrophes. | ||
| - */ | ||
| -public class KeenQuotesTest { | ||
| - /** | ||
| - * This is a single-use test that is useful for debugging. | ||
| - */ | ||
| - @Test | ||
| - //@Disabled | ||
| - public void test_parse_SingleLine_Parsed() { | ||
| - out.println( KeenQuotes.convert( | ||
| - "\"’Kearney lives on the banks of Killarney—’", | ||
| - out::println | ||
| - ) ); | ||
| - } | ||
| - | ||
| - /** | ||
| - * Tests that straight quotes are converted to curly quotes. | ||
| - * | ||
| - * @throws IOException Error opening file full of fun. | ||
| - */ | ||
| - @Test | ||
| - public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException { | ||
| - testConverter( text -> KeenQuotes.convert( text, ( lexeme ) -> {} ) ); | ||
| - } | ||
| - | ||
| - @ParameterizedTest | ||
| - @ValueSource( strings = {"westrup"} ) | ||
| - void test_Parse_Story_Converted( final String filename ) throws IOException { | ||
| - final var sb = new StringBuilder( 2 ^ 20 ); | ||
| - | ||
| - try( final var reader = open( filename + ".txt" ) ) { | ||
| - String line; | ||
| - | ||
| - while( (line = reader.readLine()) != null ) { | ||
| - sb.append( line ).append( '\n' ); | ||
| - } | ||
| - } | ||
| - | ||
| - System.out.println( KeenQuotes.convert( sb.toString(), out::println ) ); | ||
| - } | ||
| - | ||
| - /** | ||
| - * Reads a file full of couplets. The first of the pair is the input, | ||
| - * the second of the pair is the expected result. Couplets may include | ||
| - * newline characters to indicate end of lines and end of paragraphs. | ||
| - * Lines that start with {@code #} are ignored. | ||
| - * | ||
| - * @param parser The text processor capable of straight quote conversion. | ||
| - * @throws IOException Error opening file full of fun. | ||
| - */ | ||
| - private void testConverter( final Function<String, String> parser ) | ||
| - throws IOException { | ||
| - try( final var reader = open( "smartypants.txt" ) ) { | ||
| - String line; | ||
| - String testLine = ""; | ||
| - String expected = ""; | ||
| - | ||
| - while( ((line = reader.readLine()) != null) ) { | ||
| - if( line.startsWith( "#" ) || line.isBlank() ) { continue; } | ||
| - | ||
| - // Read the first line of the couplet. | ||
| - if( testLine.isBlank() ) { | ||
| - testLine = line; | ||
| - continue; | ||
| - } | ||
| - | ||
| - // Read the second line of the couplet. | ||
| - if( expected.isBlank() ) { | ||
| - expected = line; | ||
| - } | ||
| - | ||
| - testLine = unescapeEol( testLine ); | ||
| - expected = unescapeEol( expected ); | ||
| - | ||
| - final var actual = parser.apply( testLine ); | ||
| - assertEquals( expected, actual ); | ||
| - | ||
| - testLine = ""; | ||
| - expected = ""; | ||
| - } | ||
| - } | ||
| - } | ||
| - | ||
| - private static String unescapeEol( final String s ) { | ||
| - return String.join( "\n", s.split( "\\\\n" ) ); | ||
| - } | ||
| - | ||
| - /** | ||
| - * Opens a text file for reading. Callers are responsible for closing. | ||
| - * | ||
| - * @param filename The file to open. | ||
| - * @return An instance of {@link BufferedReader} that can be used to | ||
| - * read all the lines in the file. | ||
| - */ | ||
| - private BufferedReader open( final String filename ) { | ||
| - final var is = getClass().getResourceAsStream( filename ); | ||
| - assertNotNull( is ); | ||
| - | ||
| - return new BufferedReader( new InputStreamReader( is ) ); | ||
| - } | ||
| -} | ||
| -/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| -package com.keenwrite.quotes; | ||
| - | ||
| -import org.junit.jupiter.api.Test; | ||
| - | ||
| -import java.util.Arrays; | ||
| -import java.util.List; | ||
| -import java.util.function.BiFunction; | ||
| - | ||
| -import static com.keenwrite.quotes.Lexeme.EOT; | ||
| -import static com.keenwrite.quotes.LexemeType.*; | ||
| -import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| - | ||
| -/** | ||
| - * Tests lexing words, numbers, punctuation, spaces, newlines, etc. | ||
| - */ | ||
| -class LexerTest { | ||
| - @Test | ||
| - void test_Lexing_Words_TokenValues() { | ||
| - testText( "abc 123", "abc", " ", "123" ); | ||
| - testText( "-123 abc", "-123", " ", "abc" ); | ||
| - } | ||
| - | ||
| - @Test | ||
| - void test_Lexing_Numbers_EmitNumbers() { | ||
| - testType( ".123", NUMBER ); | ||
| - testType( "-123.", NUMBER, PERIOD ); | ||
| - testType( " 123.123.123", SPACE, NUMBER ); | ||
| - testType( "123 123\"", NUMBER, SPACE, NUMBER, QUOTE_DOUBLE ); | ||
| - testType( "-123,123.123", NUMBER ); | ||
| - testType( "...1,023...", ELLIPSIS, NUMBER, ELLIPSIS ); | ||
| - } | ||
| - | ||
| - @Test | ||
| - void test_Lexing_Words_EmitWords() { | ||
| - testType( "abc", WORD ); | ||
| - testType( "abc abc", WORD, SPACE, WORD ); | ||
| - testType( "abc...", WORD, ELLIPSIS ); | ||
| - testType( "abc123", WORD, NUMBER ); | ||
| - testType( "-123abc", NUMBER, WORD ); | ||
| - testType( "abc-o'-abc", WORD, HYPHEN, WORD, QUOTE_SINGLE, HYPHEN, WORD ); | ||
| - } | ||
| - | ||
| - @Test | ||
| - void test_Lexing_PunctuationMarks_EmitPunctuationMarks() { | ||
| - testType( "!", PUNCT ); | ||
| - testType( ";", PUNCT ); | ||
| - testType( ".", PERIOD ); | ||
| - testType( "-", HYPHEN ); | ||
| - testType( "--", DASH ); | ||
| - testType( "---", DASH ); | ||
| - testType( "...", ELLIPSIS ); | ||
| - } | ||
| - | ||
| - @Test | ||
| - void test_Lexing_Quotes_EmitQuotes() { | ||
| - testType( "'", QUOTE_SINGLE ); | ||
| - testType( "\"", QUOTE_DOUBLE ); | ||
| - testType( "3 o'clock", NUMBER, SPACE, WORD, QUOTE_SINGLE, WORD ); | ||
| - } | ||
| - | ||
| - @Test | ||
| - void test_Lexing_Escapes_EmitEscapedQuotes() { | ||
| - testType( "123\\'456\\\"", NUMBER, ESC_SINGLE, NUMBER, ESC_DOUBLE ); | ||
| - testText( "123\\'456\\\"", "123", "\\'", "456", "\\\"" ); | ||
| - } | ||
| - | ||
| - @Test | ||
| - void test_Lexing_Newlines_EmitNewlines() { | ||
| - testType( "\r", EOL ); | ||
| - testType( "\n", EOL ); | ||
| - testType( "\r\n", EOL ); | ||
| - testType( "\r\n\r\n", EOP ); | ||
| - testType( "\r\n\n\r", EOP ); | ||
| - testType( "abc \r\nabc\n", WORD, SPACE, EOL, WORD, EOL ); | ||
| - } | ||
| - | ||
| - private void testType( final String actual, final LexemeType... expected ) { | ||
| - final var list = Arrays.asList( expected ); | ||
| - testType( actual, ( lexeme, text ) -> lexeme.getType(), list ); | ||
| - } | ||
| - | ||
| - private void testText( final String actual, final String... expected ) { | ||
| - testType( actual, Lexeme::toString, Arrays.asList( expected ) ); | ||
| - } | ||
| - | ||
| - private <A, E> void testType( | ||
| - final String text, | ||
| - final BiFunction<Lexeme, String, A> f, | ||
| - final List<E> elements ) { | ||
| - final var lexer = new Lexer( text ); | ||
| - var counter = 0; | ||
| - | ||
| - Lexeme lexeme; | ||
| - | ||
| - while( (lexeme = lexer.next()) != EOT ) { | ||
| - final var expected = elements.get( counter++ ); | ||
| - final var actual = f.apply( lexeme, text ); | ||
| - | ||
| - assertEquals( expected, actual ); | ||
| - } | ||
| - | ||
| - // Ensure all expected values are matched (verify end of text reached). | ||
| - assertEquals( elements.size(), counter ); | ||
| - } | ||
| -} | ||
| -/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| -package com.keenwrite.quotes; | ||
| - | ||
| -import org.junit.jupiter.api.Test; | ||
| - | ||
| -import java.util.HashMap; | ||
| -import java.util.Map; | ||
| - | ||
| -import static com.keenwrite.quotes.TokenType.QUOTE_APOSTROPHE; | ||
| -import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| - | ||
| -/** | ||
| - * Test that all unambiguous apostrophes are emitted once. | ||
| - */ | ||
| -class ParserTest { | ||
| - private final static Map<String, Map<TokenType, Integer>> TEST_CASES = | ||
| - Map.of( | ||
| - "That's a 35'×10\" yacht!", | ||
| - Map.of( QUOTE_APOSTROPHE, 1 ), | ||
| - "It's the 80's...", | ||
| - Map.of( QUOTE_APOSTROPHE, 2 ), | ||
| - "Fish-'n'-chips!", | ||
| - Map.of( QUOTE_APOSTROPHE, 2 ), | ||
| - "She's a cat ya'll couldn't've known!", | ||
| - Map.of( QUOTE_APOSTROPHE, 4 ), | ||
| - "'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx.", | ||
| - Map.of( QUOTE_APOSTROPHE, 5 ), | ||
| - """ | ||
| - But I must leave the proofs to those who 've seen 'em; | ||
| - But this I heard her say, and can't be wrong | ||
| - And all may think which way their judgments lean 'em, | ||
| - ''T is strange---the Hebrew noun which means "I am," | ||
| - The English always use to govern d--n.' | ||
| - """, | ||
| - Map.of( QUOTE_APOSTROPHE, 5 ) | ||
| - ); | ||
| - | ||
| - @Test | ||
| - void test_Conversion_StraightQuotes_ExpectedConversionCount() { | ||
| - for( final var entry : TEST_CASES.entrySet() ) { | ||
| - parse( entry.getKey(), entry.getValue() ); | ||
| - } | ||
| - } | ||
| - | ||
| - private void parse( final String text, final Map<TokenType, Integer> tally ) { | ||
| - final var parser = new Parser( text ); | ||
| - final var actual = new HashMap<TokenType, Integer>(); | ||
| - | ||
| - parser.parse( | ||
| - ( token ) -> actual.merge( token.getType(), 1, Integer::sum ), | ||
| - ( lexeme ) -> {} | ||
| - ); | ||
| - | ||
| - for( final var expectedEntry : tally.entrySet() ) { | ||
| - final var expectedCount = expectedEntry.getValue(); | ||
| - final var actualCount = actual.get( expectedEntry.getKey() ); | ||
| - | ||
| - assertEquals( expectedCount, actualCount, text ); | ||
| - } | ||
| - } | ||
| -} | ||
| -# ######################################################################## | ||
| -# Escaped quotes | ||
| -# ######################################################################## | ||
| -She stood 5\'7\". | ||
| -She stood 5'7". | ||
| - | ||
| -\"What?\" | ||
| -"What?" | ||
| - | ||
| -# ######################################################################## | ||
| -# Primes (single, double) | ||
| -# ######################################################################## | ||
| -Alice's friend is 6'3" tall. | ||
| -Alice's friend is 6′3″ tall. | ||
| - | ||
| -Bob's table is 5'' × 4''. | ||
| -Bob's table is 5″ × 4″. | ||
| - | ||
| -Bob's table is 5'×4'. | ||
| -Bob's table is 5′×4′. | ||
| - | ||
| -What's this '-5.5'',' '-10.2'' cm,' and another '-7.25''' thing? | ||
| -What's this ‘-5.5″,’ ‘-10.2″ cm,’ and another ‘-7.25″’ thing? | ||
| - | ||
| -'What's this -5.5'' thing?' | ||
| -‘What's this -5.5″ thing?’ | ||
| - | ||
| -+7.9'' is weird. | ||
| -+7.9″ is weird. | ||
| - | ||
| -12" record, 5'10" height | ||
| -12″ record, 5′10″ height | ||
| - | ||
| -Use 11.5"x14.25" virtual paper! | ||
| -Use 11.5″x14.25″ virtual paper! | ||
| - | ||
| -Angular measure 3° 5' 30" means 3 degs, 5 arcmins, & 30 arcsecs. | ||
| -Angular measure 3° 5′ 30″ means 3 degs, 5 arcmins, & 30 arcsecs. | ||
| - | ||
| -# ######################################################################## | ||
| -# Double quotes | ||
| -# ######################################################################## | ||
| -"I am Sam" | ||
| -“I am Sam” | ||
| - | ||
| -"...even better!" | ||
| -“...even better!” | ||
| - | ||
| -"It was so," said he. | ||
| -“It was so,” said he. | ||
| - | ||
| -"What cries in the streets"? | ||
| -“What cries in the streets”? | ||
| - | ||
| -With "air quotes" in the middle. | ||
| -With “air quotes” in the middle. | ||
| - | ||
| -With--"air quotes"--and dashes. | ||
| -With--“air quotes”--and dashes. | ||
| - | ||
| -"Not all open quotes are closed... | ||
| -“Not all open quotes are closed... | ||
| - | ||
| -"And this" and "and this" and "and this" and "and another." | ||
| -“And this” and “and this” and “and this” and “and another.” | ||
| - | ||
| -"Will'll invite?" Mrs. Thorne asked;\n"because he's coming--he's at the gate." | ||
| -“Will'll invite?” Mrs. Thorne asked;\n“because he's coming--he's at the gate.” | ||
| - | ||
| -# ######################################################################## | ||
| -# Single quotes | ||
| -# ######################################################################## | ||
| -'I am Sam' | ||
| -‘I am Sam’ | ||
| - | ||
| -'It was so,' said he. | ||
| -‘It was so,’ said he. | ||
| - | ||
| -'...even better!' | ||
| -‘...even better!’ | ||
| - | ||
| -With 'quotes' in the middle. | ||
| -With ‘quotes’ in the middle. | ||
| - | ||
| -With--'imaginary'--dashes. | ||
| -With--‘imaginary’--dashes. | ||
| - | ||
| -''Cause I don't like it, 's why,' said Pat. | ||
| -‘'Cause I don't like it, 's why,’ said Pat. | ||
| - | ||
| -'It's a beautiful day!' | ||
| -‘It's a beautiful day!’ | ||
| - | ||
| -Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season. | ||
| -Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season. | ||
| - | ||
| -# ######################################################################## | ||
| -# Possessives | ||
| -# ######################################################################## | ||
| -Sam's Sams' and the Ross's roses' thorns were prickly. | ||
| -Sam's Sams' and the Ross's roses' thorns were prickly. | ||
| - | ||
| -# ######################################################################## | ||
| -# Inside contractions (no leading/trailing apostrophes) | ||
| -# ######################################################################## | ||
| -I don't like it: I love's it! | ||
| -I don't like it: I love's it! | ||
| - | ||
| -We'd've thought that pancakes'll be sweeter there. | ||
| -We'd've thought that pancakes'll be sweeter there. | ||
| - | ||
| -She'd be coming o'er when the horse'd gone to pasture... | ||
| -She'd be coming o'er when the horse'd gone to pasture... | ||
| - | ||
| -# ######################################################################## | ||
| -# Beginning contractions (leading apostrophes) | ||
| -# ######################################################################## | ||
| -'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx. | ||
| -'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx. | ||
| - | ||
| -# ######################################################################## | ||
| -# Outside contractions (leading and trailing, no middle) | ||
| -# ######################################################################## | ||
| -Salt 'n' vinegar, fish-'n'-chips, sugar 'n spice! | ||
| -Salt 'n' vinegar, fish-'n'-chips, sugar 'n spice! | ||
| - | ||
| -# ######################################################################## | ||
| -# Ending contractions (trailing apostrophes) | ||
| -# ######################################################################## | ||
| -Didn' get th' message. | ||
| -Didn' get th' message. | ||
| - | ||
| -Namsayin', y'know what I'ma sayin'? | ||
| -Namsayin', y'know what I'ma sayin'? | ||
| - | ||
| -# ######################################################################## | ||
| -# Decades | ||
| -# ######################################################################## | ||
| -The Roaring '20s had the best music, no? | ||
| -The Roaring '20s had the best music, no? | ||
| - | ||
| -Took place in '04, yes'm! | ||
| -Took place in '04, yes'm! | ||
| - | ||
| -# ######################################################################## | ||
| -# Consecutive quotes | ||
| -# ######################################################################## | ||
| -"'I'm trouble.'" | ||
| -“‘I'm trouble.’” | ||
| - | ||
| -'"Trouble's my name."' | ||
| -‘“Trouble's my name.”’ | ||
| - | ||
| -# ######################################################################## | ||
| -# Nested quotations | ||
| -# ######################################################################## | ||
| -"'Here I am,' said Sam" | ||
| -“‘Here I am,’ said Sam” | ||
| - | ||
| -'"Here I am," said Sam' | ||
| -‘“Here I am,” said Sam’ | ||
| - | ||
| -'Hello, "Dr. Brown," what's your real name?' | ||
| -‘Hello, “Dr. Brown,” what's your real name?’ | ||
| - | ||
| -"'Twas, t'wasn't thy name, 'twas it?" said Jim "the Barber" Brown. | ||
| -“'Twas, t'wasn't thy name, 'twas it?” said Jim “the Barber” Brown. | ||
| - | ||
| -"'I'm in danger. Help? Take me to--' an address on Van Ness Avenue. | ||
| -“‘I'm in danger. Help? Take me to--’ an address on Van Ness Avenue. | ||
| - | ||
| -"Ah," she said, "you knew! He told you--and you said 'my dear'! How could you?!" | ||
| -“Ah,” she said, “you knew! He told you--and you said ‘my dear’! How could you?!” | ||
| - | ||
| -shouldn't drop letters, nor say "ain't" or "hain't." | ||
| -shouldn't drop letters, nor say “ain't” or “hain't.” | ||
| - | ||
| -"’Kearney lives on the banks of Killarney—’ | ||
| -“’Kearney lives on the banks of Killarney—’ | ||
| - | ||
| -# ######################################################################## | ||
| -# Mixed | ||
| -# ######################################################################## | ||
| -"She said, 'That's Sam's'," said the Sams' cat. | ||
| -“She said, ‘That's Sam's’,” said the Sams' cat. | ||
| - | ||
| -"'Jane said, ''E'll be spooky, Sam's son with the jack-o'-lantern!'" said the O'Mally twins'---y'know---ghosts in unison. | ||
| -“‘Jane said, ‘'E'll be spooky, Sam's son with the jack-o'-lantern!’” said the O'Mally twins'---y'know---ghosts in unison. | ||
| - | ||
| -'He's at Sam's' | ||
| -‘He's at Sam's’ | ||
| - | ||
| -ma'am | ||
| -ma'am | ||
| - | ||
| -'Twas midnight | ||
| -'Twas midnight | ||
| - | ||
| -"Hello," said the spider. "'Shelob' is my name." | ||
| -“Hello,” said the spider. “‘Shelob’ is my name.” | ||
| - | ||
| -'He said, \"I want to go.\"' Were you alive in the 70's? | ||
| -‘He said, "I want to go."’ Were you alive in the 70's? | ||
| - | ||
| -"That's a 'magic' sock." | ||
| -“That's a ‘magic’ sock.” | ||
| - | ||
| -'That's a "magic" sock.' | ||
| -‘That's a “magic” sock.’ | ||
| - | ||
| -'That's a "very 'magic'" sock.' | ||
| -‘That's a “very ‘magic’” sock.’ | ||
| - | ||
| -Website! Company Name, Inc. ("Company Name" or "Company") recommends reading carefully: | ||
| -Website! Company Name, Inc. (“Company Name” or “Company”) recommends reading carefully: | ||
| - | ||
| -Website! Company Name, Inc. ('Company Name' or 'Company') recommends reading carefully: | ||
| -Website! Company Name, Inc. (‘Company Name’ or ‘Company’) recommends reading carefully: | ||
| - | ||
| -Workin' hard | ||
| -Workin' hard | ||
| - | ||
| -"She said, 'Llamas'll languish, they'll-- | ||
| -“She said, ‘Llamas'll languish, they'll-- | ||
| - | ||
| -'The 70s are my favorite numbers,' she said. | ||
| -‘The 70s are my favorite numbers,’ she said. | ||
| - | ||
| -'70s fashion was weird. | ||
| -'70s fashion was weird. | ||
| - | ||
| -Model \"T2000\" | ||
| -Model "T2000" | ||
| - | ||
| -The 2's complement.\n\n'Yar, binary!' | ||
| -The 2's complement.\n\n‘Yar, binary!’ | ||
| - | ||
| -The Who's complement.\n\n\n"Paragraph boundary!" | ||
| -The Who's complement.\n\n\n“Paragraph boundary!” | ||
| - | ||
| -'Oak,' 'elm,' and 'beech' are names of trees. So is 'pine.' | ||
| -‘Oak,’ ‘elm,’ and ‘beech’ are names of trees. So is ‘pine.’ | ||
| - | ||
| -'A', 'B', and 'C' are letters. | ||
| -‘A’, ‘B’, and ‘C’ are letters. | ||
| - | ||
| -'He said, "Thinkin'."' | ||
| -‘He said, “Thinkin'.”’ | ||
| - | ||
| -''Sup, Doc?' | ||
| -‘'Sup, Doc?’ | ||
| - | ||
| -# ######################################################################## | ||
| -# Ambiguous (no solution) | ||
| -# ######################################################################## | ||
| -She said, 'Cause of death?\n\n'Old age, there's no doubt.' | ||
| -She said, 'Cause of death?\n\n‘Old age, there's no doubt.’ | ||
| - | ||
| -She said, 'Cause I said so!\n\n'If we're done, I've a secret.' | ||
| -She said, 'Cause I said so!\n\n‘If we're done, I've a secret.’ | ||
| - | ||
| -'Bout that time I says, 'Boys! I been thinkin' 'bout th' Universe.' | ||
| -'Bout that time I says, ‘Boys! I been thinkin' 'bout th' Universe.’ | ||
// Name of the root project.
rootProject.name = 'quotes'

// Adds the 'lib' project to the build.
include 'lib'
| - | ||
| +/* | ||
| + * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| + * contributor license agreements. See the NOTICE file distributed with | ||
| + * this work for additional information regarding copyright ownership. | ||
| + * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| + * (the "License"); you may not use this file except in compliance with | ||
| + * the License. You may obtain a copy of the License at | ||
| + * | ||
| + * http://www.apache.org/licenses/LICENSE-2.0 | ||
| + * | ||
| + * Unless required by applicable law or agreed to in writing, software | ||
| + * distributed under the License is distributed on an "AS IS" BASIS, | ||
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| + * See the License for the specific language governing permissions and | ||
| + * limitations under the License. | ||
| + */ | ||
| +package com.whitemagicsoftware.keenquotes; | ||
| + | ||
| +import java.util.*; | ||
| + | ||
/**
 * CircularFifoQueue is a first-in first-out queue with a fixed size that
 * replaces its oldest element if full.
 * <p>
 * The removal order of a {@link CircularFifoQueue} is based on the
 * insertion order; elements are removed in the same order in which they
 * were added. The iteration order is the same as the removal order.
 * </p>
 * <p>
 * The {@link #add(Object)}, {@link #remove()}, {@link #peek()},
 * {@link #poll()}, {@link #offer(Object)}, {@link #get(int)}, and
 * {@link #size()} operations all perform in constant time. Removing an
 * element through the iterator may shift the elements that follow it.
 * </p>
 * <p>
 * This queue prevents null objects from being added. The class performs no
 * synchronization of its own.
 * </p>
 *
 * @param <E> the type of elements in this collection
 * @since 4.0
 */
public final class CircularFifoQueue<E> extends AbstractCollection<E>
  implements Queue<E> {
  /**
   * Underlying storage array.
   */
  private final E[] elements;

  /**
   * Array index of first (oldest) queue element.
   */
  private int start;

  /**
   * Index mod maxElements of the array position following the last queue
   * element. Queue elements start at elements[start] and "wrap around"
   * elements[maxElements-1], ending at elements[decrement(end)].
   * For example, elements = {c,a,b}, start=1, end=1 corresponds to
   * the queue [a,b,c].
   */
  private int end;

  /**
   * Flag to indicate if the queue is currently full. Needed because
   * {@code start == end} holds for both an empty and a full queue.
   */
  private boolean full;

  /**
   * Capacity of the queue.
   */
  private final int maxElements;

  /**
   * Constructor that creates a queue with the specified size.
   *
   * @param size Immutable queue size, must be greater than zero.
   * @throws IllegalArgumentException if {@code size} is zero or negative
   */
  @SuppressWarnings( "unchecked" )
  public CircularFifoQueue( final int size ) {
    // Validate unconditionally: an assertion would be skipped without -ea.
    if( size <= 0 ) {
      throw new IllegalArgumentException(
        "The size must be greater than 0, but was " + size );
    }

    // The cast is safe because the array never leaves this class and only
    // receives values of type E through add().
    elements = (E[]) new Object[ size ];
    maxElements = elements.length;
  }

  /**
   * Returns the number of elements stored in the queue.
   *
   * @return this queue's size
   */
  @Override
  public int size() {
    final int size;

    if( end < start ) {
      // The live region wraps around the end of the array.
      size = maxElements - start + end;
    }
    else if( end == start ) {
      size = full ? maxElements : 0;
    }
    else {
      size = end - start;
    }

    return size;
  }

  /**
   * Returns true if this queue is empty; false otherwise.
   *
   * @return true if this queue is empty
   */
  @Override
  public boolean isEmpty() {
    return size() == 0;
  }

  /**
   * Returns {@code true} if the capacity limit of this queue has been reached,
   * i.e. the number of elements stored in the queue equals its maximum size.
   *
   * @return {@code true} if the capacity limit has been reached, {@code
   * false} otherwise
   * @since 4.1
   */
  public boolean isAtFullCapacity() {
    return size() == maxElements;
  }

  /**
   * Clears this queue.
   */
  @Override
  public void clear() {
    full = false;
    start = 0;
    end = 0;
    Arrays.fill( elements, null );
  }

  /**
   * Adds the given element to this queue. If the queue is full, the least
   * recently added element is discarded so that a new element can be
   * inserted.
   *
   * @param element the element to add
   * @return true, always
   * @throws NullPointerException if the given element is null
   */
  @Override
  public boolean add( final E element ) {
    Objects.requireNonNull( element, "element" );

    if( isAtFullCapacity() ) {
      remove();
    }

    elements[ end ] = element;
    end = increment( end );

    // The write index catching up with the read index means no room is left.
    if( end == start ) {
      full = true;
    }

    return true;
  }

  /**
   * Returns the element at the specified position in this queue.
   *
   * @param index the position of the element in the queue
   * @return the element at position {@code index}
   * @throws NoSuchElementException if the requested position is outside the
   *                                range [0, size)
   */
  public E get( final int index ) {
    final int sz = size();

    if( index < 0 || index >= sz ) {
      throw new NoSuchElementException(
        String.format(
          "Index %1$d is outside the available range [0, %2$d)",
          index, sz ) );
    }

    final int idx = (start + index) % maxElements;
    return elements[ idx ];
  }

  /**
   * Adds the given element to this queue. If the queue is full, the least
   * recently added element is discarded so that a new element can be
   * inserted.
   *
   * @param element the element to add
   * @return true, always
   * @throws NullPointerException if the given element is null
   */
  @Override
  public boolean offer( final E element ) {
    return add( element );
  }

  /**
   * Removes and returns the oldest element.
   *
   * @return the oldest element, or {@code null} if this queue is empty
   */
  @Override
  public E poll() {
    return isEmpty() ? null : remove();
  }

  /**
   * Returns the oldest element without removing it.
   *
   * @return the oldest element
   * @throws NoSuchElementException if this queue is empty
   */
  @Override
  public E element() {
    if( isEmpty() ) {
      throw new NoSuchElementException( "empty queue" );
    }

    return peek();
  }

  /**
   * Returns the oldest element without removing it.
   *
   * @return the oldest element, or {@code null} if this queue is empty
   */
  @Override
  public E peek() {
    return isEmpty() ? null : elements[ start ];
  }

  /**
   * Removes and returns the oldest element.
   *
   * @return the oldest element
   * @throws NoSuchElementException if this queue is empty
   */
  @Override
  public E remove() {
    if( isEmpty() ) {
      throw new NoSuchElementException( "empty queue" );
    }

    final E element = elements[ start ];

    if( null != element ) {
      // Release the reference so the removed element can be collected.
      elements[ start ] = null;
      start = increment( start );
      full = false;
    }

    return element;
  }

  /**
   * Increments the internal index, wrapping to zero past the last slot.
   *
   * @param index the index to increment
   * @return the updated index
   */
  private int increment( int index ) {
    if( ++index >= maxElements ) {
      index = 0;
    }
    return index;
  }

  /**
   * Decrements the internal index, wrapping to the last slot below zero.
   *
   * @param index the index to decrement
   * @return the updated index
   */
  private int decrement( int index ) {
    if( --index < 0 ) {
      index = maxElements - 1;
    }
    return index;
  }

  /**
   * Returns an iterator over this queue's elements, oldest first. The
   * iterator supports {@link Iterator#remove()}.
   *
   * @return an iterator over this queue's elements
   */
  @Override
  public Iterator<E> iterator() {
    return new Iterator<>() {

      private int index = start;
      private int lastReturnedIndex = -1;

      // When the queue is full, index == end before anything was visited;
      // this flag lets the first hasNext() succeed in that situation.
      private boolean isFirst = full;

      @Override
      public boolean hasNext() {
        return isFirst || index != end;
      }

      @Override
      public E next() {
        if( !hasNext() ) {
          throw new NoSuchElementException();
        }

        isFirst = false;
        lastReturnedIndex = index;
        index = increment( index );
        return elements[ lastReturnedIndex ];
      }

      @Override
      public void remove() {
        if( lastReturnedIndex == -1 ) {
          throw new IllegalStateException();
        }

        // First element can be removed quickly
        if( lastReturnedIndex == start ) {
          CircularFifoQueue.this.remove();
          lastReturnedIndex = -1;
          return;
        }

        int pos = lastReturnedIndex + 1;
        if( start < lastReturnedIndex && pos < end ) {
          // shift in one part
          System.arraycopy(
            elements, pos, elements, lastReturnedIndex, end - pos );
        }
        else {
          // Other elements require us to shift the subsequent elements
          while( pos != end ) {
            if( pos >= maxElements ) {
              // Carry the element in slot zero back into the last slot.
              elements[ pos - 1 ] = elements[ 0 ];
              pos = 0;
            }
            else {
              elements[ decrement( pos ) ] = elements[ pos ];
              pos = increment( pos );
            }
          }
        }

        // One slot was vacated at the tail; step the cursor back so the
        // element that moved into the removed slot is not skipped.
        lastReturnedIndex = -1;
        end = decrement( end );
        elements[ end ] = null;
        full = false;
        index = decrement( index );
      }
    };
  }
}
| +/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.whitemagicsoftware.keenquotes; | ||
| + | ||
| +import java.util.HashSet; | ||
| +import java.util.Locale; | ||
| +import java.util.Set; | ||
| + | ||
| +import static java.util.Collections.emptySet; | ||
| + | ||
| +/** | ||
| + * Placeholder for various types of contractions. All words are stored in | ||
| + * lowercase and all lookups are lowercased using {@link Locale#ROOT}, which | ||
| + * makes comparisons case insensitive regardless of the default locale. | ||
| + */ | ||
| +public class Contractions { | ||
| + | ||
| + private final Builder mBuilder; | ||
| + | ||
| + private Contractions( final Builder builder ) { | ||
| + assert builder != null; | ||
| + mBuilder = builder; | ||
| + } | ||
| + | ||
| + /** | ||
| + * Allows constructing a list of custom contractions. Any category that | ||
| + * receives no custom words falls back to its built-in defaults when | ||
| + * {@link #build()} is called. | ||
| + */ | ||
| + @SuppressWarnings( "unused" ) | ||
| + public static class Builder { | ||
| + private final Set<String> mBeganUnambiguous = new HashSet<>(); | ||
| + private final Set<String> mEndedUnambiguous = new HashSet<>(); | ||
| + private final Set<String> mBeganAmbiguous = new HashSet<>(); | ||
| + private final Set<String> mEndedAmbiguous = new HashSet<>(); | ||
| + | ||
| + /** | ||
| + * Adds words that always begin with an apostrophe. | ||
| + * | ||
| + * @param words The words to add, without the apostrophe; any letter case. | ||
| + */ | ||
| + public void withBeganUnambiguous( final Set<String> words ) { | ||
| + addAll( mBeganUnambiguous, words ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Adds words that always end with an apostrophe. | ||
| + * | ||
| + * @param words The words to add, without the apostrophe; any letter case. | ||
| + */ | ||
| + public void withEndedUnambiguous( final Set<String> words ) { | ||
| + addAll( mEndedUnambiguous, words ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Adds words that may begin with an apostrophe or stand alone. | ||
| + * | ||
| + * @param words The words to add, without the apostrophe; any letter case. | ||
| + */ | ||
| + public void withBeganAmbiguous( final Set<String> words ) { | ||
| + addAll( mBeganAmbiguous, words ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Adds words that may end with an apostrophe or stand alone. | ||
| + * | ||
| + * @param words The words to add, without the apostrophe; any letter case. | ||
| + */ | ||
| + public void withEndedAmbiguous( final Set<String> words ) { | ||
| + addAll( mEndedAmbiguous, words ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Constructs a new set of {@link Contractions} that can be configured | ||
| + * using this {@link Builder} instance. | ||
| + * | ||
| + * @return {@link Contractions} suitable for use with parsing text. | ||
| + */ | ||
| + public Contractions build() { | ||
| + mBeganUnambiguous.addAll( from( mBeganUnambiguous, BEGAN_UNAMBIGUOUS ) ); | ||
| + mEndedUnambiguous.addAll( from( mEndedUnambiguous, ENDED_UNAMBIGUOUS ) ); | ||
| + mBeganAmbiguous.addAll( from( mBeganAmbiguous, BEGAN_AMBIGUOUS ) ); | ||
| + mEndedAmbiguous.addAll( from( mEndedAmbiguous, ENDED_AMBIGUOUS ) ); | ||
| + | ||
| + return new Contractions( this ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Copies the given words into the target in lowercase. The lookups | ||
| + * lowercase the word being tested, so a word stored with any uppercase | ||
| + * letters could never be matched. | ||
| + * | ||
| + * @param target The set that receives the lowercase words. | ||
| + * @param words The custom words to add; must not contain {@code null}. | ||
| + */ | ||
| + private static void addAll( | ||
| + final Set<String> target, final Set<String> words ) { | ||
| + assert target != null; | ||
| + assert words != null; | ||
| + | ||
| + for( final var word : words ) { | ||
| + target.add( normalize( word ) ); | ||
| + } | ||
| + } | ||
| + | ||
| + /** | ||
| + * This returns the {@code fallback} {@link Set} if {@code src} is empty; | ||
| + * otherwise, this returns the empty {@link Set}. | ||
| + * | ||
| + * @param src A set of contractions, possibly empty. | ||
| + * @param fallback The default values to use if {@code src} is empty. | ||
| + * @param <T> The type of data used by both {@link Set}s. | ||
| + * @return An empty {@link Set} if the {@code src} contains at least one | ||
| + * element; otherwise, this will return {@code fallback}. | ||
| + */ | ||
| + private static <T> Set<T> from( final Set<T> src, final Set<T> fallback ) { | ||
| + assert src != null; | ||
| + assert fallback != null; | ||
| + return src.isEmpty() ? fallback : emptySet(); | ||
| + } | ||
| + } | ||
| + | ||
| + /** | ||
| + * Answers whether the given word is a contraction that always starts | ||
| + * with an apostrophe. The comparison is case insensitive. This must | ||
| + * only be called when a straight quote is followed by a word. | ||
| + * | ||
| + * @param word The word to compare against the list of known unambiguous | ||
| + * contractions. | ||
| + * @return {@code true} when the given word is in the set of unambiguous | ||
| + * contractions. | ||
| + */ | ||
| + public boolean beganUnambiguously( final String word ) { | ||
| + assert word != null; | ||
| + return getBeganUnambiguous().contains( normalize( word ) ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Answers whether the given word could be a contraction but is also a | ||
| + * valid word in non-contracted form. The comparison is case insensitive. | ||
| + * | ||
| + * @param word The word to compare against the list of known ambiguous | ||
| + * contractions. | ||
| + * @return {@code true} when the given word is in the set of ambiguous | ||
| + * contractions. | ||
| + */ | ||
| + public boolean beganAmbiguously( final String word ) { | ||
| + assert word != null; | ||
| + return getBeganAmbiguous().contains( normalize( word ) ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Answers whether the given word is a contraction that always ends with | ||
| + * an apostrophe. The comparison is case insensitive. | ||
| + * | ||
| + * @param word The word to compare against the list of known unambiguous | ||
| + * contractions. | ||
| + * @return {@code true} when the given word is in the set of unambiguous | ||
| + * contractions. | ||
| + */ | ||
| + public boolean endedUnambiguously( final String word ) { | ||
| + assert word != null; | ||
| + return getEndedUnambiguous().contains( normalize( word ) ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Answers whether the given word could end with an apostrophe but could | ||
| + * also be a valid word without one. Besides the known words, this matches | ||
| + * any word ending in s, z, or x, and any word longer than one letter that | ||
| + * ends in n. The comparison is case insensitive. | ||
| + * | ||
| + * @param word The word to compare against the list of known ambiguous | ||
| + * contractions and ambiguous endings. | ||
| + * @return {@code true} when the given word may or may not be a contraction. | ||
| + */ | ||
| + public boolean endedAmbiguously( final String word ) { | ||
| + assert word != null; | ||
| + final var check = normalize( word ); | ||
| + | ||
| + // Ensure that 'n' isn't matched for ambiguity by enforcing length, yet | ||
| + // allow o' to match because 'a sentence can end with the letter o'. | ||
| + return getEndedAmbiguous().contains( check ) || | ||
| + check.endsWith( "s" ) || check.endsWith( "z" ) || | ||
| + check.endsWith( "x" ) || (check.length() > 1 && check.endsWith( "n" )); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Lowercases a word independently of the default locale so that, for | ||
| + * example, an uppercase I maps to a dotted i even under a Turkish locale. | ||
| + * | ||
| + * @param word The word to convert. | ||
| + * @return The word in lowercase. | ||
| + */ | ||
| + private static String normalize( final String word ) { | ||
| + return word.toLowerCase( Locale.ROOT ); | ||
| + } | ||
| + | ||
| + private Set<String> getBeganUnambiguous() { | ||
| + return mBuilder.mBeganUnambiguous; | ||
| + } | ||
| + | ||
| + private Set<String> getEndedUnambiguous() { | ||
| + return mBuilder.mEndedUnambiguous; | ||
| + } | ||
| + | ||
| + private Set<String> getBeganAmbiguous() { | ||
| + return mBuilder.mBeganAmbiguous; | ||
| + } | ||
| + | ||
| + private Set<String> getEndedAmbiguous() { | ||
| + return mBuilder.mEndedAmbiguous; | ||
| + } | ||
| + | ||
| + /** | ||
| + * Words having a straight apostrophe that cannot be mistaken for an | ||
| + * opening single quote. Entries must be lowercase. | ||
| + */ | ||
| + private static final Set<String> BEGAN_UNAMBIGUOUS = Set.of( | ||
| + "aporth", | ||
| + "boutcha", | ||
| + "boutchu", | ||
| + "cept", | ||
| + "dillo", | ||
| + "em", | ||
| + "fraid", | ||
| + "gainst", | ||
| + "n", | ||
| + "neath", | ||
| + "nother", | ||
| + "onna", | ||
| + "onna'", | ||
| + "pon", | ||
| + "s", | ||
| + "sblood", | ||
| + "scuse", | ||
| + "sfar", | ||
| + "sfoot", | ||
| + "t", | ||
| + "taint", | ||
| + "tain", | ||
| + "til", | ||
| + "tis", | ||
| + "tisn", | ||
| + "tshall", | ||
| + "twas", | ||
| + "twasn", | ||
| + "tween", | ||
| + "twere", | ||
| + "tweren", | ||
| + "twixt", | ||
| + "twon", | ||
| + "twou", | ||
| + "twould", | ||
| + "twouldn", | ||
| + "ve" | ||
| + ); | ||
| + | ||
| + /** | ||
| + * Words having a straight apostrophe that may be either part of a | ||
| + * contraction or a word that stands alone beside an opening single quote. | ||
| + * Entries must be lowercase. | ||
| + */ | ||
| + private static final Set<String> BEGAN_AMBIGUOUS = Set.of( | ||
| + // about|boxing match | ||
| + "bout", | ||
| + // because|causal | ||
| + "cause", | ||
| + // what you|choo choo train | ||
| + "choo", | ||
| + // he|e pluribus unum | ||
| + "e", | ||
| + // here|earlier | ||
| + "ere", | ||
| + // afro|to and fro | ||
| + "fro", | ||
| + // whore|ho ho! | ||
| + "ho", | ||
| + // okay|letter K | ||
| + "kay", | ||
| + // lo|lo and behold | ||
| + "lo", | ||
| + // are|regarding | ||
| + "re", | ||
| + // what's up|to sup | ||
| + "sup", | ||
| + // it will|twill fabric | ||
| + "twill", | ||
| + // them|utterance | ||
| + "um", | ||
| + // is that|Iranian village | ||
| + "zat" | ||
| + ); | ||
| + | ||
| + /** | ||
| + * Words that may end with a straight apostrophe as a contraction or may | ||
| + * stand alone. Entries must be lowercase. | ||
| + */ | ||
| + private static final Set<String> ENDED_AMBIGUOUS = Set.of( | ||
| + // give|martial arts garment | ||
| + "gi", | ||
| + // in|I | ||
| + "i", | ||
| + // of|letter o | ||
| + "o" | ||
| + ); | ||
| + | ||
| + /** | ||
| + * Words that end with a straight apostrophe and cannot be mistaken for | ||
| + * the end of a quotation. Entries must be lowercase because lookups | ||
| + * lowercase the word being tested. | ||
| + */ | ||
| + private static final Set<String> ENDED_UNAMBIGUOUS = Set.of( | ||
| + // and | ||
| + "an", | ||
| + // for/before | ||
| + "fo", | ||
| + // friend | ||
| + "frien", | ||
| + // just | ||
| + "jus", | ||
| + // lord | ||
| + "lor", | ||
| + // myself | ||
| + "masel", | ||
| + // old | ||
| + "ol", | ||
| + // San (Francisco) | ||
| + "sa", | ||
| + // shift | ||
| + "shif", | ||
| + // the | ||
| + "th", | ||
| + // what | ||
| + "wha", | ||
| + // world | ||
| + "worl", | ||
| + // Top ~500 common -ing words as English contractions. | ||
| + "acceptin", | ||
| + "accompanyin", | ||
| + "accordin", | ||
| + "accountin", | ||
| + "achievin", | ||
| + "acquirin", | ||
| + "actin", | ||
| + "addin", | ||
| + "addressin", | ||
| + "adjoinin", | ||
| + "adoptin", | ||
| + "advancin", | ||
| + "advertisin", | ||
| + "affectin", | ||
| + "agin", | ||
| + "allowin", | ||
| + "amazin", | ||
| + "analyzin", | ||
| + "answerin", | ||
| + "anythin", | ||
| + "appearin", | ||
| + "applyin", | ||
| + "approachin", | ||
| + "arguin", | ||
| + "arisin", | ||
| + "arrivin", | ||
| + "askin", | ||
| + "assessin", | ||
| + "assumin", | ||
| + "attackin", | ||
| + "attemptin", | ||
| + "attendin", | ||
| + "avoidin", | ||
| + "bankin", | ||
| + "bargainin", | ||
| + "bearin", | ||
| + "beatin", | ||
| + "becomin", | ||
| + "beginnin", | ||
| + "bein", | ||
| + "believin", | ||
| + "belongin", | ||
| + "bendin", | ||
| + "bindin", | ||
| + "bleedin", | ||
| + "blessin", | ||
| + "blowin", | ||
| + "boilin", | ||
| + "borrowin", | ||
| + "breakin", | ||
| + "breathin", | ||
| + "breedin", | ||
| + "bringin", | ||
| + "broadcastin", | ||
| + "buildin", | ||
| + "burnin", | ||
| + "buyin", | ||
| + "calculatin", | ||
| + "callin", | ||
| + "carryin", | ||
| + "castin", | ||
| + "causin", | ||
| + "ceilin", | ||
| + "challengin", | ||
| + "changin", | ||
| + "checkin", | ||
| + "choosin", | ||
| + "claimin", | ||
| + "cleanin", | ||
| + "clearin", | ||
| + "climbin", | ||
| + "closin", | ||
| + "clothin", | ||
| + "collectin", | ||
| + "combinin", | ||
| + "comin", | ||
| + "commandin", | ||
| + "comparin", | ||
| + "compellin", | ||
| + "competin", | ||
| + "computin", | ||
| + "concernin", | ||
| + "concludin", | ||
| + "conditionin", | ||
| + "conductin", | ||
| + "conflictin", | ||
| + "connectin", | ||
| + "considerin", | ||
| + "consistin", | ||
| + "constructin", | ||
| + "consultin", | ||
| + "consumin", | ||
| + "containin", | ||
| + "continuin", | ||
| + "contractin", | ||
| + "contributin", | ||
| + "controllin", | ||
| + "convincin", | ||
| + "cookin", | ||
| + "coolin", | ||
| + "copin", | ||
| + "correspondin", | ||
| + "counselin", | ||
| + "countin", | ||
| + "couplin", | ||
| + "coverin", | ||
| + "creatin", | ||
| + "crossin", | ||
| + "cryin", | ||
| + "cuttin", | ||
| + "dancin", | ||
| + "darlin", | ||
| + "datin", | ||
| + "dealin", | ||
| + "decidin", | ||
| + "declarin", | ||
| + "declinin", | ||
| + "decreasin", | ||
| + "definin", | ||
| + "demandin", | ||
| + "denyin", | ||
| + "dependin", | ||
| + "descendin", | ||
| + "describin", | ||
| + "designin", | ||
| + "destroyin", | ||
| + "determinin", | ||
| + "developin", | ||
| + "differin", | ||
| + "dinin", | ||
| + "directin", | ||
| + "discussin", | ||
| + "distinguishin", | ||
| + "disturbin", | ||
| + "dividin", | ||
| + "doin", | ||
| + "drawin", | ||
| + "dressin", | ||
| + "drinkin", | ||
| + "drivin", | ||
| + "droppin", | ||
| + "dryin", | ||
| + "durin", | ||
| + "dwellin", | ||
| + "dyin", | ||
| + "eatin", | ||
| + "editin", | ||
| + "emergin", | ||
| + "employin", | ||
| + "enablin", | ||
| + "encouragin", | ||
| + "endin", | ||
| + "engagin", | ||
| + "engineerin", | ||
| + "enjoyin", | ||
| + "enterin", | ||
| + "establishin", | ||
| + "evaluatin", | ||
| + "evenin", | ||
| + "everythin", | ||
| + "examinin", | ||
| + "exceedin", | ||
| + "excitin", | ||
| + "excludin", | ||
| + "existin", | ||
| + "expandin", | ||
| + "expectin", | ||
| + "experiencin", | ||
| + "explainin", | ||
| + "explorin", | ||
| + "expressin", | ||
| + "extendin", | ||
| + "facin", | ||
| + "failin", | ||
| + "fallin", | ||
| + "farmin", | ||
| + "fascinatin", | ||
| + "feedin", | ||
| + "feelin", | ||
| + "fightin", | ||
| + "filin", | ||
| + "fillin", | ||
| + "financin", | ||
| + "findin", | ||
| + "firin", | ||
| + "fishin", | ||
| + "fittin", | ||
| + "fixin", | ||
| + "floatin", | ||
| + "flowin", | ||
| + "flyin", | ||
| + "focusin", | ||
| + "followin", | ||
| + "forcin", | ||
| + "foregoin", | 
| + "formin", | ||
| + "forthcomin", | ||
| + "foundin", | ||
| + "freezin", | ||
| + "fuckin", | ||
| + "functionin", | ||
| + "fundin", | ||
| + "gainin", | ||
| + "gatherin", | ||
| + "generatin", | ||
| + "gettin", | ||
| + "givin", | ||
| + "goin", | ||
| + "governin", | ||
| + "grantin", | ||
| + "growin", | ||
| + "hackin", | ||
| + "handlin", | ||
| + "hangin", | ||
| + "happenin", | ||
| + "havin", | ||
| + "headin", | ||
| + "healin", | ||
| + "hearin", | ||
| + "heatin", | ||
| + "helpin", | ||
| + "hidin", | ||
| + "holdin", | ||
| + "hopin", | ||
| + "housin", | ||
| + "huntin", | ||
| + "identifyin", | ||
| + "imagin", | ||
| + "implementin", | ||
| + "imposin", | ||
| + "improvin", | ||
| + "includin", | ||
| + "increasin", | ||
| + "indicatin", | ||
| + "interestin", | ||
| + "interpretin", | ||
| + "introducin", | ||
| + "involvin", | ||
| + "joinin", | ||
| + "judgin", | ||
| + "keepin", | ||
| + "killin", | ||
| + "knowin", | ||
| + "lackin", | ||
| + "landin", | ||
| + "lastin", | ||
| + "laughin", | ||
| + "layin", | ||
| + "leadin", | ||
| + "leanin", | ||
| + "learnin", | ||
| + "leavin", | ||
| + "lettin", | ||
| + "liftin", | ||
| + "lightin", | ||
| + "lightnin", | ||
| + "limitin", | ||
| + "listenin", | ||
| + "listin", | ||
| + "livin", | ||
| + "loadin", | ||
| + "lookin", | ||
| + "losin", | ||
| + "lovin", | ||
| + "lowerin", | ||
| + "lyin", | ||
| + "maintainin", | ||
| + "makin", | ||
| + "managin", | ||
| + "manufacturin", | ||
| + "mappin", | ||
| + "marketin", | ||
| + "markin", | ||
| + "matchin", | ||
| + "meanin", | ||
| + "measurin", | ||
| + "meetin", | ||
| + "meltin", | ||
| + "minin", | ||
| + "misleadin", | ||
| + "missin", | ||
| + "mixin", | ||
| + "modelin", | ||
| + "monitorin", | ||
| + "mornin", | ||
| + "movin", | ||
| + "neighborin", | ||
| + "nothin", | ||
| + "notin", | ||
| + "notwithstandin", | ||
| + "nursin", | ||
| + "observin", | ||
| + "obtainin", | ||
| + "occurrin", | ||
| + "offerin", | ||
| + "offsprin", | ||
| + "ongoin", | ||
| + "openin", | ||
| + "operatin", | ||
| + "opposin", | ||
| + "orderin", | ||
| + "organizin", | ||
| + "outstandin", | ||
| + "overwhelmin", | ||
| + "packin", | ||
| + "paintin", | ||
| + "parkin", | ||
| + "participatin", | ||
| + "passin", | ||
| + "payin", | ||
| + "pendin", | ||
| + "performin", | ||
| + "pickin", | ||
| + "pissin", | ||
| + "placin", | ||
| + "plannin", | ||
| + "plantin", | ||
| + "playin", | ||
| + "pleasin", | ||
| + "pointin", | ||
| + "possessin", | ||
| + "preachin", | ||
| + "precedin", | ||
| + "preparin", | ||
| + "presentin", | ||
| + "preservin", | ||
| + "pressin", | ||
| + "prevailin", | ||
| + "preventin", | ||
| + "pricin", | ||
| + "printin", | ||
| + "proceedin", | ||
| + "processin", | ||
| + "producin", | ||
| + "programmin", | ||
| + "promisin", | ||
| + "promotin", | ||
| + "protectin", | ||
| + "providin", | ||
| + "provin", | ||
| + "publishin", | ||
| + "pullin", | ||
| + "purchasin", | ||
| + "pursuin", | ||
| + "pushin", | ||
| + "puttin", | ||
| + "questionin", | ||
| + "rangin", | ||
| + "ratin", | ||
| + "reachin", | ||
| + "readin", | ||
| + "reasonin", | ||
| + "receivin", | ||
| + "recognizin", | ||
| + "recordin", | ||
| + "reducin", | ||
| + "referrin", | ||
| + "reflectin", | ||
| + "refusin", | ||
| + "regardin", | ||
| + "regulatin", | ||
| + "relatin", | ||
| + "remainin", | ||
| + "rememberin", | ||
| + "removin", | ||
| + "renderin", | ||
| + "repeatin", | ||
| + "replacin", | ||
| + "reportin", | ||
| + "representin", | ||
| + "requirin", | ||
| + "respectin", | ||
| + "respondin", | ||
| + "restin", | ||
| + "resultin", | ||
| + "returnin", | ||
| + "revealin", | ||
| + "ridin", | ||
| + "risin", | ||
| + "rulin", | ||
| + "runnin", | ||
| + "sailin", | ||
| + "samplin", | ||
| + "satisfyin", | ||
| + "savin", | ||
| + "sayin", | ||
| + "scatterin", | ||
| + "schoolin", | ||
| + "screenin", | ||
| + "searchin", | ||
| + "securin", | ||
| + "seein", | ||
| + "seekin", | ||
| + "selectin", | ||
| + "sellin", | ||
| + "sendin", | ||
| + "separatin", | ||
| + "servin", | ||
| + "settin", | ||
| + "settlin", | ||
| + "sewin", | ||
| + "shakin", | ||
| + "shapin", | ||
| + "sharin", | ||
| + "shiftin", | ||
| + "shinin", | ||
| + "shippin", | ||
| + "shittin", | ||
| + "shootin", | ||
| + "shoppin", | ||
| + "showin", | ||
| + "singin", | ||
| + "sinkin", | ||
| + "sittin", | ||
| + "sleepin", | ||
| + "smilin", | ||
| + "smokin", | ||
| + "spankin", | ||
| + "solvin", | ||
| + "somethin", | ||
| + "speakin", | ||
| + "spellin", | ||
| + "spendin", | ||
| + "spinnin", | ||
| + "spittin", | ||
| + "spreadin", | ||
| + "standin", | ||
| + "starin", | ||
| + "startin", | ||
| + "statin", | ||
| + "stayin", | ||
| + "stealin", | ||
| + "sterlin", | ||
| + "stimulatin", | ||
| + "stirrin", | ||
| + "stoppin", | ||
| + "strengthenin", | ||
| + "stretchin", | ||
| + "strikin", | ||
| + "strugglin", | ||
| + "studyin", | ||
| + "succeedin", | ||
| + "sufferin", | ||
| + "suggestin", | ||
| + "supplyin", | ||
| + "supportin", | ||
| + "surprisin", | ||
| + "surroundin", | ||
| + "survivin", | ||
| + "sweepin", | ||
| + "swellin", | ||
| + "swimmin", | ||
| + "switchin", | ||
| + "takin", | ||
| + "talkin", | ||
| + "teachin", | ||
| + "tellin", | ||
| + "testin", | ||
| + "thinkin", | ||
| + "threatenin", | ||
| + "throwin", | ||
| + "timin", | ||
| + "touchin", | ||
| + "tradin", | ||
| + "trainin", | ||
| + "travelin", | ||
| + "treatin", | ||
| + "tremblin", | ||
| + "tryin", | ||
| + "turnin", | ||
| + "underlyin", | ||
| + "understandin", | ||
| + "undertakin", | ||
| + "unwillin", | ||
| + "usin", | ||
| + "varyin", | ||
| + "viewin", | ||
| + "visitin", | ||
| + "votin", | ||
| + "waitin", | ||
| + "walkin", | ||
| + "wanderin", | ||
| + "wantin", | ||
| + "warnin", | ||
| + "washin", | ||
| + "watchin", | ||
| + "wearin", | ||
| + "weddin", | ||
| + "whackin", | ||
| + "willin", | ||
| + "windin", | ||
| + "winnin", | ||
| + "wishin", | ||
| + "wonderin", | ||
| + "workin", | ||
| + "writin", | ||
| + "yieldin" | ||
| + ); | ||
| +} | ||
| +/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.whitemagicsoftware.keenquotes; | ||
| + | ||
| +import java.util.ArrayList; | ||
| +import java.util.Map; | ||
| +import java.util.function.Consumer; | ||
| + | ||
| +import static com.whitemagicsoftware.keenquotes.TokenType.*; | ||
| +import static java.util.Collections.sort; | ||
| + | ||
| +/** | ||
| + * Responsible for replacing {@link Token} instances with equivalent smart | ||
| + * quotes (or straight quotes). This will inform the caller when ambiguous | ||
| + * quotes cannot be reliably resolved. | ||
| + */ | ||
| +public final class KeenQuotes { | ||
| + private static final Map<TokenType, String> REPLACEMENTS = Map.of( | ||
| + QUOTE_OPENING_SINGLE, "‘", | ||
| + QUOTE_CLOSING_SINGLE, "’", | ||
| + QUOTE_OPENING_DOUBLE, "“", | ||
| + QUOTE_CLOSING_DOUBLE, "”", | ||
| + QUOTE_STRAIGHT_SINGLE, "'", | ||
| + QUOTE_STRAIGHT_DOUBLE, "\"", | ||
| + QUOTE_APOSTROPHE, "'", | ||
| + QUOTE_PRIME_SINGLE, "′", | ||
| + QUOTE_PRIME_DOUBLE, "″" | ||
| + ); | ||
| + | ||
| + /** | ||
| + * Converts straight quotes to curly quotes and primes. Any quotation marks | ||
| + * that cannot be converted are passed to the {@link Consumer}. | ||
| + * | ||
| + * @param text The text to parse. | ||
| + * @param unresolved Recipient for ambiguous {@link Lexeme}s. | ||
| + * @return The given text string with as many straight quotes converted to | ||
| + * curly quotes as is feasible. | ||
| + */ | ||
| + public static String convert( | ||
| + final String text, final Consumer<Lexeme> unresolved ) { | ||
| + final var parser = new Parser( text ); | ||
| + final var tokens = new ArrayList<Token>(); | ||
| + | ||
| + // Parse the tokens and consume all unresolved lexemes. | ||
| + parser.parse( tokens::add, unresolved ); | ||
| + | ||
| + // The parser may emit tokens in any order. | ||
| + sort( tokens ); | ||
| + | ||
| + final var result = new StringBuilder( text.length() ); | ||
| + var position = 0; | ||
| + | ||
| + for( final var token : tokens ) { | ||
| + if( position <= token.began() ) { | ||
| + result.append( text, position, token.began() ); | ||
| + result.append( REPLACEMENTS.get( token.getType() ) ); | ||
| + } | ||
| + | ||
| + position = token.ended(); | ||
| + } | ||
| + | ||
| + return result.append( text.substring( position ) ).toString(); | ||
| + } | ||
| +} | ||
| +/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.whitemagicsoftware.keenquotes; | ||
| + | ||
| +import static com.whitemagicsoftware.keenquotes.LexemeType.FLAG; | ||
| + | ||
| +/** | ||
| + * Responsible for tracking the beginning and ending offsets of a lexeme within | ||
| + * a text string. Tracking the beginning and ending indices should use less | ||
| + * memory than duplicating the entire text of Unicode characters (i.e., using | ||
| + * a similar approach to run-length encoding). | ||
| + */ | ||
| +public final class Lexeme implements Comparable<Lexeme> { | ||
| + /** | ||
| + * Signifies an invalid index to help distinguish EOT/SOT. | ||
| + */ | ||
| + private static final int E_INDEX = -2; | ||
| + | ||
| + /** | ||
| + * Denotes there are no more lexemes: the end of text (EOT) has been reached. | ||
| + * The beginning index differentiates between EOT and SOT. | ||
| + */ | ||
| + public static final Lexeme EOT = new Lexeme( FLAG, -1, E_INDEX ); | ||
| + | ||
| + /** | ||
| + * Denotes parsing at the start of text (SOT). This is useful to avoid | ||
| + * branching conditions while iterating. The beginning index differentiates | ||
| + * between EOT and SOT. | ||
| + */ | ||
| + public static final Lexeme SOT = new Lexeme( FLAG, 0, E_INDEX ); | ||
| + | ||
| + private final LexemeType mType; | ||
| + private final int mBegan; | ||
| + private final int mEnded; | ||
| + | ||
| + /** | ||
| + * Create a lexeme that represents a section of the text. | ||
| + * | ||
| + * @param type Type of {@link Lexeme} to create. | ||
| + * @param began Offset into the text where this instance starts (0-based). | ||
| + * @param ended Offset into the text where this instance stops (0-based). | ||
| + */ | ||
| + private Lexeme( final LexemeType type, final int began, final int ended ) { | ||
| + assert type != null; | ||
| + assert began >= 0 || ended == E_INDEX; | ||
| + assert ended >= began || ended == E_INDEX; | ||
| + | ||
| + mType = type; | ||
| + mBegan = began; | ||
| + mEnded = ended + 1; | ||
| + } | ||
| + | ||
| + /** | ||
| + * Extracts a sequence of characters from the given text at the offsets | ||
| + * captured by this lexeme. | ||
| + * | ||
| + * @param text The text that was parsed using this class. | ||
| + * @return The character string captured by the lexeme. | ||
| + */ | ||
| + public String toString( final String text ) { | ||
| + assert text != null; | ||
| + return text.substring( mBegan, mEnded ); | ||
| + } | ||
| + | ||
| + @Override | ||
| + public String toString() { | ||
| + return getClass().getSimpleName() + '{' + | ||
| + "mType=" + mType + | ||
| + ", mBegan=" + mBegan + | ||
| + ", mEnded=" + mEnded + | ||
| + '}'; | ||
| + } | ||
| + | ||
| + /** | ||
| + * Answers whether the given {@link LexemeType} is the same as this | ||
| + * instance's internal {@link LexemeType}. | ||
| + * | ||
| + * @param type The {@link LexemeType} to compare. | ||
| + * @return {@code true} if the given {@link LexemeType} is equal to the | ||
| + * internal {@link LexemeType}. | ||
| + */ | ||
| + public boolean isType( final LexemeType type ) { | ||
| + assert type != null; | ||
| + return mType == type; | ||
| + } | ||
| + | ||
| + /** | ||
| + * Answers whether any of the given {@link LexemeType} matches this | ||
| + * instance's internal {@link LexemeType}. | ||
| + * | ||
| + * @param types The {@link LexemeType}s to compare. | ||
| + * @return {@code true} if the internal {@link LexemeType} matches any one | ||
| + * of the given {@link LexemeType}s. | ||
| + */ | ||
| + public boolean anyType( final LexemeType... types ) { | ||
| + assert types != null; | ||
| + | ||
| + for( final var type : types ) { | ||
| + if( mType == type ) { | ||
| + return true; | ||
| + } | ||
| + } | ||
| + | ||
| + return false; | ||
| + } | ||
| + | ||
| + public int began() { | ||
| + return mBegan; | ||
| + } | ||
| + | ||
| + public int ended() { | ||
| + return mEnded; | ||
| + } | ||
| + | ||
| + boolean isSot() { | ||
| + return mBegan == 0; | ||
| + } | ||
| + | ||
| + boolean isEot() { | ||
| + return mBegan == -1; | ||
| + } | ||
| + | ||
| + LexemeType getType() { | ||
| + return mType; | ||
| + } | ||
| + | ||
| + /** | ||
| + * Compares the starting offset of the given {@link Lexeme} to the starting | ||
| + * offset of this {@link Lexeme} instance. This allows a list {@link Lexeme}s | ||
| + * to be sorted by order of appearance in the parsed text. | ||
| + */ | ||
| + @Override | ||
| + public int compareTo( final Lexeme that ) { | ||
| + assert that != null; | ||
| + return this.mBegan - that.mBegan; | ||
| + } | ||
| + | ||
| + static Lexeme createLexeme( | ||
| + final LexemeType lexeme, final int began, final int ended ) { | ||
| + assert lexeme != null; | ||
| + return new Lexeme( lexeme, began, ended ); | ||
| + } | ||
| +} | ||
| +/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.whitemagicsoftware.keenquotes; | ||
| + | ||
| +/** | ||
| + * Represents the type of a {@link Lexeme} parsed by the {@link Lexer}. | ||
| + */ | ||
| +public enum LexemeType { | ||
| + // Straight single quote ('). | ||
| + QUOTE_SINGLE, | ||
| + // Curly opening single quote (‘). | ||
| + QUOTE_SINGLE_OPENING, | ||
| + // Curly closing single quote (’). | ||
| + QUOTE_SINGLE_CLOSING, | ||
| + // Straight double quote ("). | ||
| + QUOTE_DOUBLE, | ||
| + // Backslash followed by a straight single quote. | ||
| + ESC_SINGLE, | ||
| + // Backslash followed by a straight double quote. | ||
| + ESC_DOUBLE, | ||
| + // End of line: a run of line terminators having a lone CR, a lone LF, | ||
| + // or one of each. | ||
| + EOL, | ||
| + // End of paragraph: any other run of CR and/or LF characters. | ||
| + EOP, | ||
| + // A space or other whitespace character that is not a CR or LF. | ||
| + SPACE, | ||
| + // A run of letters; the lexer also treats _ and * as letters. | ||
| + WORD, | ||
| + // A run of characters that begins as a number. | ||
| + NUMBER, | ||
| + // Any character that the lexer does not classify otherwise. | ||
| + PUNCT, | ||
| + // One of: ( { [ | ||
| + OPENING_GROUP, | ||
| + // One of: ) } ] | ||
| + CLOSING_GROUP, | ||
| + // A hyphen-minus that is neither part of a dash nor a number. | ||
| + HYPHEN, | ||
| + // Two or more hyphen-minus characters or an em-dash (—). | ||
| + DASH, | ||
| + // Equals sign (=). | ||
| + EQUALS, | ||
| + // A period that is not followed by another period. | ||
| + PERIOD, | ||
| + // Consecutive periods. | ||
| + ELLIPSIS, | ||
| + // Marker for the start of text and end of text lexemes. | ||
| + FLAG | ||
| +} | ||
| +/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.whitemagicsoftware.keenquotes; | ||
| + | ||
| +import java.text.CharacterIterator; | ||
| +import java.text.StringCharacterIterator; | ||
| +import java.util.function.BiFunction; | ||
| + | ||
| +import static com.whitemagicsoftware.keenquotes.Lexeme.createLexeme; | ||
| +import static com.whitemagicsoftware.keenquotes.LexemeType.*; | ||
| +import static java.lang.Character.isDigit; | ||
| +import static java.lang.Character.isWhitespace; | ||
| +import static java.text.CharacterIterator.DONE; | ||
| + | ||
| +/** | ||
| + * Turns text into words, numbers, punctuation, spaces, and more. | ||
| + */ | ||
| +public class Lexer { | ||
| + /** | ||
| + * Iterates over the entire string of text to help produce lexemes. | ||
| + */ | ||
| + private final CharacterIterator mIterator; | ||
| + | ||
| + /** | ||
| + * Creates a lexer that iterates over the characters of the given text, | ||
| + * starting from the first character. | ||
| + * | ||
| + * @param text The text to split into lexemes. | ||
| + */ | ||
| + public Lexer( final String text ) { | ||
| + mIterator = new StringCharacterIterator( text ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Returns the next lexeme in the text, advancing the internal iterator | ||
| + * beyond it. | ||
| + * | ||
| + * @return The next {@link Lexeme}, or {@link Lexeme#EOT} once the iterator | ||
| + * has run out of characters. | ||
| + */ | ||
| + public Lexeme next() { | ||
| + return parse( mIterator ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Tokenizes a sequence of characters. The order of comparisons is optimized | ||
| + * towards probability of the occurrence of a character in regular English | ||
| + * prose: letters, space, quotation marks, numbers, periods, new lines, | ||
| + * then end of text. | ||
| + * <p> | ||
| + * Characters are consumed until a complete lexeme is recognized. The | ||
| + * iterator is left on the character that follows the returned lexeme. | ||
| + * </p> | ||
| + * | ||
| + * @param i The sequence of characters to tokenize. | ||
| + * @return The next token in the sequence, or {@link Lexeme#EOT} when the | ||
| + * iterator is exhausted. | ||
| + */ | ||
| + private Lexeme parse( final CharacterIterator i ) { | ||
| + // Every lexeme created below starts at the iterator's index on entry. | ||
| + int began = i.getIndex(); | ||
| + boolean isWord = false; | ||
| + Lexeme lexeme = null; | ||
| + | ||
| + do { | ||
| + final var curr = i.current(); | ||
| + | ||
| + if( isLetter( curr ) ) { | ||
| + isWord = true; | ||
| + | ||
| + // Keep looping until the last letter of the word is reached. | ||
| + if( !isLetter( peek( i ) ) ) { | ||
| + lexeme = createLexeme( WORD, began, i.getIndex() ); | ||
| + } | ||
| + } | ||
| + else if( curr == ' ' ) { | ||
| + lexeme = createLexeme( SPACE, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '\'' ) { | ||
| + lexeme = createLexeme( QUOTE_SINGLE, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '"' ) { | ||
| + lexeme = createLexeme( QUOTE_DOUBLE, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '‘') { | ||
| + lexeme = createLexeme( QUOTE_SINGLE_OPENING, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '’') { | ||
| + lexeme = createLexeme( QUOTE_SINGLE_CLOSING, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '-' && peek( i ) == '-' || curr == '—' ) { | ||
| + // Two hyphens or an em-dash begin a dash; absorb the rest of the run. | ||
| + slurp( i, ( next, ci ) -> next == '-' || next == '—' ); | ||
| + | ||
| + lexeme = createLexeme( DASH, began, i.getIndex() ); | ||
| + } | ||
| + else if( isDigit( curr ) || isNumeric( curr ) && isDigit( peek( i ) ) ) { | ||
| + // Parse all consecutive number characters to prevent the main loop | ||
| + // from switching back to word tokens. | ||
| + slurp( i, ( next, ci ) -> | ||
| + isDigit( next ) || isNumeric( next ) && isDigit( peek( ci ) ) | ||
| + ); | ||
| + | ||
| + // Digits that continue a lexeme begun with letters extend the word. | ||
| + lexeme = createLexeme( isWord ? WORD : NUMBER, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '-' ) { | ||
| + lexeme = createLexeme( HYPHEN, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '.' ) { | ||
| + // Parse all consecutive periods into an ellipsis lexeme. This will | ||
| + // not capture space-separated ellipsis (such as ". . ."). | ||
| + lexeme = createLexeme( | ||
| + slurp( i, ( next, ci ) -> next == '.' ) == 0 ? PERIOD : ELLIPSIS, | ||
| + began, i.getIndex() | ||
| + ); | ||
| + } | ||
| + else if( curr == '\r' || curr == '\n' ) { | ||
| + // Single-element arrays allow the lambda below to update the tallies. | ||
| + final var cr = new int[]{curr == '\r' ? 1 : 0}; | ||
| + final var lf = new int[]{curr == '\n' ? 1 : 0}; | ||
| + | ||
| + // Swallow all consecutive CR (Mac), CRLF (Windows), and/or LF (Unix). | ||
| + slurp( | ||
| + i, ( next, ci ) -> { | ||
| + cr[ 0 ] += next == '\r' ? 1 : 0; | ||
| + lf[ 0 ] += next == '\n' ? 1 : 0; | ||
| + return next == '\r' || next == '\n'; | ||
| + } | ||
| + ); | ||
| + | ||
| + // A lone CR, a lone LF, or one of each ends a line; any additional | ||
| + // terminators end a paragraph. | ||
| + final var eol = cr[ 0 ] + lf[ 0 ] == 1 || cr[ 0 ] == 1 && lf[ 0 ] == 1; | ||
| + lexeme = createLexeme( eol ? EOL : EOP, began, i.getIndex() ); | ||
| + } | ||
| + else if( isWhitespace( curr ) ) { | ||
| + lexeme = createLexeme( SPACE, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '\\' ) { | ||
| + // Consume the character that follows the backslash. | ||
| + final var next = i.next(); | ||
| + | ||
| + if( next == '\'' ) { | ||
| + lexeme = createLexeme( ESC_SINGLE, began, i.getIndex() ); | ||
| + } | ||
| + else if( next == '\"' ) { | ||
| + lexeme = createLexeme( ESC_DOUBLE, began, i.getIndex() ); | ||
| + } | ||
| + else { | ||
| + // Push back the escaped character, which wasn't a straight quote. | ||
| + // No lexeme is created, so the loop continues and the backslash | ||
| + // becomes the start of whichever lexeme is recognized next. | ||
| + i.previous(); | ||
| + } | ||
| + } | ||
| + else if( curr == '(' || curr == '{' || curr == '[' ) { | ||
| + lexeme = createLexeme( OPENING_GROUP, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == ')' || curr == '}' || curr == ']' ) { | ||
| + lexeme = createLexeme( CLOSING_GROUP, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '=' ) { | ||
| + lexeme = createLexeme( EQUALS, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr != DONE ) { | ||
| + lexeme = createLexeme( PUNCT, began, i.getIndex() ); | ||
| + } | ||
| + else { | ||
| + // The iterator has no more characters. | ||
| + lexeme = Lexeme.EOT; | ||
| + } | ||
| + | ||
| + // Step beyond the character that was examined last. | ||
| + i.next(); | ||
| + } | ||
| + while( lexeme == null ); | ||
| + | ||
| + return lexeme; | ||
| + } | ||
| + | ||
| + /** | ||
| +  * Determines whether a character may appear inside a word. Besides | ||
| +  * letters, {@code _} and {@code *} qualify because plain text formats | ||
| +  * commonly wrap words in them for emphasis. | ||
| +  * | ||
| +  * @param curr The character to classify. | ||
| +  * @return {@code true} when the character is a letter, an underscore, | ||
| +  * or an asterisk. | ||
| +  */ | ||
| + private static boolean isLetter( final char curr ) { | ||
| +   if( curr == '_' || curr == '*' ) { | ||
| +     return true; | ||
| +   } | ||
| + | ||
| +   return Character.isLetter( curr ); | ||
| + } | ||
| + | ||
| + /** | ||
| +  * Determines whether a character is one of the symbols that may appear | ||
| +  * within a number. Digits are deliberately excluded; callers test for | ||
| +  * them separately. | ||
| +  * | ||
| +  * @param curr The character to classify. | ||
| +  * @return {@code true} for a period, comma, minus, plus, or caret, so | ||
| +  * that text such as -2,000.2^2 can be treated as a single number. | ||
| +  */ | ||
| + private static boolean isNumeric( final char curr ) { | ||
| +   return switch( curr ) { | ||
| +     case '.', ',', '-', '+', '^' -> true; | ||
| +     default -> false; | ||
| +   }; | ||
| + } | ||
| + | ||
| + /** | ||
| +  * Returns the character that follows the iterator's current position | ||
| +  * without consuming it. The iterator's index is the same after the call | ||
| +  * as it was before the call. | ||
| +  * | ||
| +  * @param ci The iterator to look ahead in. | ||
| +  * @return The next character, or {@link CharacterIterator#DONE} when no | ||
| +  * characters follow the current position. | ||
| +  */ | ||
| + private static char peek( final CharacterIterator ci ) { | ||
| +   final var index = ci.getIndex(); | ||
| +   final var ch = ci.next(); | ||
| + | ||
| +   // Restore the saved position rather than calling previous(), which | ||
| +   // would rewind one character too far when the iterator was already at | ||
| +   // the end of the text. | ||
| +   ci.setIndex( index ); | ||
| + | ||
| +   return ch; | ||
| + } | ||
| + | ||
| + /** | ||
| +  * Advances the iterator for as long as the characters that follow the | ||
| +  * current position satisfy the given function. On return the iterator | ||
| +  * has been stepped back once from the first character that failed. | ||
| +  * | ||
| +  * @param ci The iterator containing characters to parse. | ||
| +  * @param f  Receives each subsequent character (and the iterator); | ||
| +  *           slurping stops the first time it answers {@code false}. | ||
| +  * @return The number of characters that satisfied the function. | ||
| +  */ | ||
| + private static int slurp( | ||
| +   final CharacterIterator ci, | ||
| +   final BiFunction<Character, CharacterIterator, Boolean> f ) { | ||
| +   int matched = 0; | ||
| + | ||
| +   // Each accepted character is tallied; the rejected one is not. | ||
| +   while( f.apply( ci.next(), ci ) ) { | ||
| +     matched++; | ||
| +   } | ||
| + | ||
| +   // Step back from the character that ended the run. | ||
| +   ci.previous(); | ||
| + | ||
| +   return matched; | ||
| + } | ||
| +} | ||
| +/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.whitemagicsoftware.keenquotes; | ||
| + | ||
| +import java.util.ArrayList; | ||
| +import java.util.List; | ||
| +import java.util.function.Consumer; | ||
| + | ||
| +import static com.whitemagicsoftware.keenquotes.Lexeme.EOT; | ||
| +import static com.whitemagicsoftware.keenquotes.Lexeme.SOT; | ||
| +import static com.whitemagicsoftware.keenquotes.LexemeType.*; | ||
| +import static com.whitemagicsoftware.keenquotes.TokenType.*; | ||
| + | ||
| +/** | ||
| + * Converts straight double/single quotes and apostrophes to curly equivalents. | ||
| + * <p> | ||
| + * The text is lexed into {@link Lexeme}s that are examined three at a time. | ||
| + * Quotes that can be classified from that context are emitted immediately | ||
| + * as {@link Token}s; the rest are deferred and re-examined at each | ||
| + * end-of-paragraph and at the end of the text. | ||
| + * </p> | ||
| + */ | ||
| +public final class Parser { | ||
| + /** | ||
| + * Single quotes preceded by these {@link LexemeType}s may be opening quotes. | ||
| + */ | ||
| + private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE = | ||
| + new LexemeType[]{SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP}; | ||
| + | ||
| + /** | ||
| + * Single quotes succeeded by these {@link LexemeType}s may be opening quotes. | ||
| + */ | ||
| + private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE = | ||
| + new LexemeType[]{WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE}; | ||
| + | ||
| + /** | ||
| + * Single quotes preceded by these {@link LexemeType}s may be closing quotes. | ||
| + */ | ||
| + private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE = | ||
| + new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE}; | ||
| + | ||
| + /** | ||
| + * Single quotes succeeded by these {@link LexemeType}s may be closing quotes. | ||
| + */ | ||
| + private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE = | ||
| + new LexemeType[]{SPACE, HYPHEN, DASH, | ||
| + QUOTE_DOUBLE, CLOSING_GROUP, EOL, EOP}; | ||
| + | ||
| + /** | ||
| + * Double quotes preceded by these {@link LexemeType}s may be opening quotes. | ||
| + */ | ||
| + private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE = | ||
| + new LexemeType[]{SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL, EOP}; | ||
| + | ||
| + /** | ||
| + * Double quotes succeeded by these {@link LexemeType}s may be opening quotes. | ||
| + */ | ||
| + private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE = | ||
| + new LexemeType[]{WORD, NUMBER, ELLIPSIS, | ||
| + QUOTE_SINGLE, QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING, QUOTE_DOUBLE}; | ||
| + | ||
| + /** | ||
| + * Double quotes preceded by these {@link LexemeType}s may be closing quotes. | ||
| + */ | ||
| + private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE = | ||
| + new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS, | ||
| + QUOTE_SINGLE, QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING}; | ||
| + | ||
| + /** | ||
| + * Double quotes succeeded by these {@link LexemeType}s may be closing quotes. | ||
| + */ | ||
| + private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE = | ||
| + new LexemeType[]{SPACE, PUNCT, PERIOD, EQUALS, HYPHEN, DASH, | ||
| + QUOTE_SINGLE, CLOSING_GROUP, EOL, EOP}; | ||
| + | ||
| + /** | ||
| + * The text to parse. A reference is required as a minor optimization in | ||
| + * memory and speed: the lexer records integer offsets, rather than new | ||
| + * {@link String} instances, to track parsed lexemes. | ||
| + */ | ||
| + private final String mText; | ||
| + | ||
| + /** | ||
| + * Converts a string into an iterable list of {@link Lexeme} instances. | ||
| + */ | ||
| + private final Lexer mLexer; | ||
| + | ||
| + /** | ||
| + * Sets of contractions that help disambiguate single quotes in the text. | ||
| + * These are effectively immutable while parsing. | ||
| + * <p> | ||
| + * NOTE(review): this is an instance field assigned by the constructor, | ||
| + * although the other instance fields use an {@code m} prefix. | ||
| + * </p> | ||
| + */ | ||
| + private final Contractions sContractions; | ||
| + | ||
| + /** | ||
| + * Incremented for each opening single quote emitted. Used to help resolve | ||
| + * ambiguities when single quote marks are balanced. Reset to zero by | ||
| + * {@link #parse(Consumer, Consumer)} after each end-of-paragraph. | ||
| + */ | ||
| + private int mOpeningSingleQuote; | ||
| + | ||
| + /** | ||
| + * Incremented for each closing single quote emitted. Used to help resolve | ||
| + * ambiguities when single quote marks are balanced. Reset to zero by | ||
| + * {@link #parse(Consumer, Consumer)} after each end-of-paragraph. | ||
| + */ | ||
| + private int mClosingSingleQuote; | ||
| + | ||
| + /** | ||
| + * Constructs a new {@link Parser} using the default contraction sets | ||
| + * to help resolve some ambiguous scenarios. | ||
| + * | ||
| + * @param text The prose to parse, containing zero or more quotation | ||
| + * characters. | ||
| + */ | ||
| + public Parser( final String text ) { | ||
| + this( text, new Contractions.Builder().build() ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Constructs a new {@link Parser} using the given contraction sets | ||
| + * to help resolve some ambiguous scenarios. | ||
| + * | ||
| + * @param text The prose to parse, containing zero or more quotation | ||
| + * characters. | ||
| + * @param contractions Custom sets of contractions to help resolve | ||
| + * ambiguities. | ||
| + */ | ||
| + public Parser( final String text, final Contractions contractions ) { | ||
| + mText = text; | ||
| + mLexer = new Lexer( mText ); | ||
| + sContractions = contractions; | ||
| + } | ||
| + | ||
| + /** | ||
| + * Iterates over the entire text provided at construction, emitting | ||
| + * {@link Token}s that can be used to convert straight quotes to curly | ||
| + * quotes. | ||
| + * | ||
| + * @param tokenConsumer Receives emitted {@link Token}s. | ||
| + * @param lexemeConsumer Receives each quote {@link Lexeme} that was | ||
| + * deferred as ambiguous and could not be resolved. | ||
| + */ | ||
| + public void parse( | ||
| + final Consumer<Token> tokenConsumer, | ||
| + final Consumer<Lexeme> lexemeConsumer ) { | ||
| + final var lexemes = new CircularFifoQueue<Lexeme>( 3 ); | ||
| + | ||
| + // Allow consuming the very first token without checking the queue size. | ||
| + flush( lexemes ); | ||
| + | ||
| + final var unresolved = new ArrayList<Lexeme[]>(); | ||
| + Lexeme lexeme; | ||
| + | ||
| + // Create and convert a list of all unambiguous quote characters. | ||
| + while( (lexeme = mLexer.next()) != EOT ) { | ||
| + if( tokenize( lexeme, lexemes, tokenConsumer, unresolved ) ) { | ||
| + // End of paragraph: attempt to resolve any remaining ambiguous quotes. | ||
| + resolve( unresolved, tokenConsumer ); | ||
| + | ||
| + // Notify of any ambiguous quotes that could not be resolved. | ||
| + unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) ); | ||
| + unresolved.clear(); | ||
| + | ||
| + // Start the next paragraph with a fresh opening/closing balance. | ||
| + mOpeningSingleQuote = 0; | ||
| + mClosingSingleQuote = 0; | ||
| + } | ||
| + } | ||
| + | ||
| + // By loop's end, the lexemes list contains tokens for all except the | ||
| + // final two elements (from tokenizing in triplets). Tokenize the remaining | ||
| + // unprocessed lexemes. | ||
| + tokenize( EOT, lexemes, tokenConsumer, unresolved ); | ||
| + tokenize( EOT, lexemes, tokenConsumer, unresolved ); | ||
| + | ||
| + // Attempt to resolve any remaining ambiguous quotes. | ||
| + resolve( unresolved, tokenConsumer ); | ||
| + | ||
| + // Notify of any ambiguous quotes that could not be resolved. | ||
| + unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Converts {@link Lexeme}s identified as straight quotes into {@link Token}s | ||
| + * that represent the curly equivalent. The {@link Token}s are passed to | ||
| + * the given {@link Consumer} for further processing (e.g., replaced in | ||
| + * the original text being parsed). | ||
| + * <p> | ||
| + * The conditions are evaluated in order and only the first match applies, | ||
| + * so later branches rely on earlier ones having ruled out their cases. | ||
| + * </p> | ||
| + * | ||
| + * @param lexeme A part of the text being parsed. | ||
| + * @param lexemes A 3-element queue of lexemes that provide sufficient | ||
| + * context to identify curly quotes. | ||
| + * @param consumer Recipient of equivalent quotes. | ||
| + * @param unresolved Rolling list of potentially ambiguous {@link Lexeme}s | ||
| + * that could not be tokenized, yet. | ||
| + * @return {@code true} if an end-of-paragraph is detected. | ||
| + */ | ||
| + private boolean tokenize( final Lexeme lexeme, | ||
| + final CircularFifoQueue<Lexeme> lexemes, | ||
| + final Consumer<Token> consumer, | ||
| + final List<Lexeme[]> unresolved ) { | ||
| + // Add the next lexeme to tokenize into the queue for immediate processing. | ||
| + lexemes.add( lexeme ); | ||
| + | ||
| + // lex1 is the oldest of the three lexemes; lex3 is the one just added. | ||
| + final var lex1 = lexemes.get( 0 ); | ||
| + final var lex2 = lexemes.get( 1 ); | ||
| + final var lex3 = lexemes.get( 2 ); | ||
| + | ||
| + if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( WORD ) && | ||
| + lex1.anyType( WORD, PERIOD, NUMBER ) ) { | ||
| + // Examples: y'all, Ph.D.'ll, 20's, she's | ||
| + consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); | ||
| + } | ||
| + else if( lex1.isType( QUOTE_SINGLE ) && lex3.isType( QUOTE_SINGLE ) && | ||
| + "n".equalsIgnoreCase( lex2.toString( mText ) ) ) { | ||
| + // I.e., 'n' | ||
| + consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) ); | ||
| + consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) ); | ||
| + flush( lexemes ); | ||
| + truncate( unresolved ); | ||
| + } | ||
| + else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) { | ||
| + if( lex3.isType( QUOTE_SINGLE ) ) { | ||
| + // E.g., 2'' | ||
| + consumer.accept( | ||
| + new Token( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ) ); | ||
| + flush( lexemes ); | ||
| + } | ||
| + else { | ||
| + // E.g., 2' | ||
| + consumer.accept( new Token( QUOTE_PRIME_SINGLE, lex2 ) ); | ||
| + } | ||
| + } | ||
| + else if( lex2.isType( QUOTE_DOUBLE ) && lex1.isType( NUMBER ) ) { | ||
| + // E.g., 2" | ||
| + consumer.accept( new Token( QUOTE_PRIME_DOUBLE, lex2 ) ); | ||
| + } | ||
| + else if( lex2.isType( WORD ) && lex3.isType( QUOTE_SINGLE ) && | ||
| + sContractions.endedUnambiguously( lex2.toString( mText ) ) ) { | ||
| + // E.g., thinkin' | ||
| + consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) ); | ||
| + flush( lexemes ); | ||
| + } | ||
| + else if( lex2.isType( NUMBER ) && lex1.isType( QUOTE_SINGLE ) ) { | ||
| + if( lex3.anyType( SPACE, PUNCT ) || (lex3.isType( WORD ) && | ||
| + lex3.toString( mText ).equalsIgnoreCase( "s" )) ) { | ||
| + // Sentences must be rewritten to avoid starting with numerals. | ||
| + // Examples: '20s, '02 | ||
| + consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) ); | ||
| + } | ||
| + else { | ||
| + // E.g., '2'' | ||
| + consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex1 ) ); | ||
| + mOpeningSingleQuote++; | ||
| + } | ||
| + | ||
| + // Drop the most recently deferred entry now that lex1 has been emitted. | ||
| + truncate( unresolved ); | ||
| + } | ||
| + else if( lex2.isType( QUOTE_SINGLE ) && | ||
| + lex1.anyType( PUNCT, PERIOD, ELLIPSIS, DASH ) && | ||
| + (lex3.anyType( EOL, EOP ) || lex3.isEot()) ) { | ||
| + // A single quote after punctuation at the end of a line or the text. | ||
| + consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); | ||
| + mClosingSingleQuote++; | ||
| + } | ||
| + else if( lex1.isType( ESC_SINGLE ) ) { | ||
| + // E.g., \' | ||
| + consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) ); | ||
| + } | ||
| + else if( lex1.isType( ESC_DOUBLE ) ) { | ||
| + // E.g., \" | ||
| + consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) ); | ||
| + | ||
| + if( lex2.isType( QUOTE_SINGLE ) && | ||
| + (lex3.isEot() || lex3.anyType( SPACE, DASH, EOL, EOP )) ) { | ||
| + consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); | ||
| + mClosingSingleQuote++; | ||
| + } | ||
| + } | ||
| + else if( lex2.isType( QUOTE_DOUBLE ) && | ||
| + (lex1.isSot() || lex1.anyType( LEADING_QUOTE_OPENING_DOUBLE )) && | ||
| + lex3.anyType( LAGGING_QUOTE_OPENING_DOUBLE ) ) { | ||
| + // Examples: "", "..., "word, ---"word | ||
| + consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) ); | ||
| + } | ||
| + else if( lex2.isType( QUOTE_DOUBLE ) && | ||
| + lex1.anyType( LEADING_QUOTE_CLOSING_DOUBLE ) && | ||
| + (lex3.isEot() || lex3.anyType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) { | ||
| + // Examples: ..."', word"', ?"', word"? | ||
| + consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) ); | ||
| + } | ||
| + else if( lex1.isType( QUOTE_SINGLE ) && | ||
| + lex2.anyType( PUNCT, PERIOD, DASH ) && lex3.isType( QUOTE_DOUBLE ) ) { | ||
| + // E.g., '," (contraction ruled out from previous conditionals) | ||
| + consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex1 ) ); | ||
| + truncate( unresolved ); | ||
| + mClosingSingleQuote++; | ||
| + } | ||
| + else if( lex2.anyType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) { | ||
| + // After tokenizing, the parser will attempt to resolve ambiguities. | ||
| + unresolved.add( new Lexeme[]{lex1, lex2, lex3} ); | ||
| + } | ||
| + | ||
| + // Suggest to the caller that resolution should be performed. This allows | ||
| + // the algorithm to reset the opening/closing quote balance before the | ||
| + // next paragraph is parsed. | ||
| + return lex3.isType( EOP ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Attempts to classify the deferred single quotes in the given list. A | ||
| + * {@link Token} is emitted for each quote that gets resolved and its entry | ||
| + * is removed from the list; entries that remain are still unresolved. | ||
| + * Only entries whose middle lexeme is a {@code QUOTE_SINGLE} are examined. | ||
| + * | ||
| + * @param unresolved Triplets of (preceding, quote, succeeding) lexemes | ||
| + * that could not be tokenized; modified in place. | ||
| + * @param consumer Recipient of the resolved quotes. | ||
| + */ | ||
| + private void resolve( | ||
| + final List<Lexeme[]> unresolved, final Consumer<Token> consumer ) { | ||
| + // Some non-emitted tokenized lexemes may be ambiguous. | ||
| + final var ambiguousLeadingQuotes = new ArrayList<Lexeme[]>( 16 ); | ||
| + final var ambiguousLaggingQuotes = new ArrayList<Lexeme[]>( 16 ); | ||
| + var resolvedLeadingQuotes = 0; | ||
| + var resolvedLaggingQuotes = 0; | ||
| + | ||
| + // Count the number of ambiguous and non-ambiguous open single quotes. | ||
| + for( var i = unresolved.iterator(); i.hasNext(); ) { | ||
| + final var quotes = i.next(); | ||
| + final var lex1 = quotes[ 0 ]; | ||
| + final var lex2 = quotes[ 1 ]; | ||
| + final var lex3 = quotes[ 2 ]; | ||
| + | ||
| + if( lex2.isType( QUOTE_SINGLE ) ) { | ||
| + // The words on either side of the quote; empty at the text boundaries. | ||
| + final var word1 = lex1 == SOT ? "" : lex1.toString( mText ); | ||
| + final var word3 = lex3 == EOT ? "" : lex3.toString( mText ); | ||
| + | ||
| + if( sContractions.beganAmbiguously( word3 ) ) { | ||
| + // E.g., 'Cause | ||
| + if( lex1.isType( QUOTE_SINGLE ) ) { | ||
| + // E.g., ''Cause | ||
| + consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); | ||
| + i.remove(); | ||
| + } | ||
| + else { | ||
| + // The contraction is uncertain until a closing quote is found that | ||
| + // may balance this single quote. | ||
| + ambiguousLeadingQuotes.add( quotes ); | ||
| + } | ||
| + } | ||
| + else if( sContractions.beganUnambiguously( word3 ) ) { | ||
| + // The quote mark forms a word that does not stand alone from its | ||
| + // contraction. For example, twas is not a word: it's 'twas. | ||
| + consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); | ||
| + i.remove(); | ||
| + } | ||
| + else if( sContractions.endedAmbiguously( word1 ) ) { | ||
| + ambiguousLaggingQuotes.add( quotes ); | ||
| + } | ||
| + else if( (lex1.isSot() || lex1.anyType( LEADING_QUOTE_OPENING_SINGLE )) | ||
| + && lex3.anyType( LAGGING_QUOTE_OPENING_SINGLE ) ) { | ||
| + consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) ); | ||
| + resolvedLeadingQuotes++; | ||
| + mOpeningSingleQuote++; | ||
| + i.remove(); | ||
| + } | ||
| + else if( lex1.anyType( LEADING_QUOTE_CLOSING_SINGLE ) && | ||
| + (lex3.isEot() || lex3.anyType( LAGGING_QUOTE_CLOSING_SINGLE )) ) { | ||
| + consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); | ||
| + resolvedLaggingQuotes++; | ||
| + mClosingSingleQuote++; | ||
| + i.remove(); | ||
| + } | ||
| + else if( lex3.isType( NUMBER ) ) { | ||
| + // E.g., '04 | ||
| + ambiguousLeadingQuotes.add( quotes ); | ||
| + } | ||
| + } | ||
| + } | ||
| + | ||
| + final var ambiguousLeadingCount = ambiguousLeadingQuotes.size(); | ||
| + final var ambiguousLaggingCount = ambiguousLaggingQuotes.size(); | ||
| + | ||
| + // Use the tallies gathered above to settle the quotes that remain. | ||
| + if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) { | ||
| + if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) { | ||
| + final var balanced = mClosingSingleQuote - mOpeningSingleQuote == 0; | ||
| + final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE; | ||
| + final var lex = ambiguousLaggingQuotes.get( 0 ); | ||
| + consumer.accept( new Token( quote, lex[ 1 ] ) ); | ||
| + unresolved.remove( lex ); | ||
| + } | ||
| + else if( ambiguousLeadingCount == 0 && unresolved.size() == 1 ) { | ||
| + // Must be a closing quote. | ||
| + final var closing = unresolved.get( 0 ); | ||
| + consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); | ||
| + unresolved.remove( closing ); | ||
| + } | ||
| + } | ||
| + else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) { | ||
| + // If there are no ambiguous leading quotes then all ambiguous lagging | ||
| + // quotes must be contractions. | ||
| + ambiguousLaggingQuotes.forEach( | ||
| + lex -> { | ||
| + consumer.accept( new Token( QUOTE_APOSTROPHE, lex[ 1 ] ) ); | ||
| + unresolved.remove( lex ); | ||
| + } | ||
| + ); | ||
| + } | ||
| + else if( ambiguousLeadingCount == 0 ) { | ||
| + if( resolvedLaggingQuotes < resolvedLeadingQuotes ) { | ||
| + // More quotes were opened than closed: close with what remains. | ||
| + for( final var i = unresolved.iterator(); i.hasNext(); ) { | ||
| + final var closing = i.next()[ 1 ]; | ||
| + consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing ) ); | ||
| + i.remove(); | ||
| + } | ||
| + } | ||
| + else if( mOpeningSingleQuote - mClosingSingleQuote == unresolved.size() ) { | ||
| + // The remaining quotes exactly balance the paragraph's open quotes. | ||
| + for( final var i = unresolved.iterator(); i.hasNext(); ) { | ||
| + final var closing = i.next(); | ||
| + consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); | ||
| + i.remove(); | ||
| + } | ||
| + } | ||
| + else if( unresolved.size() == 2 ) { | ||
| + // The first remaining quote is emitted as closing, the second as opening. | ||
| + final var closing = unresolved.get( 0 ); | ||
| + final var opening = unresolved.get( 1 ); | ||
| + consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); | ||
| + consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) ); | ||
| + | ||
| + // Doesn't affect the algorithm. | ||
| + unresolved.clear(); | ||
| + } | ||
| + } | ||
| + else if( ambiguousLeadingCount == 1 && resolvedLaggingQuotes == 1 ) { | ||
| + // A lone ambiguous leading quote pairs with the one resolved closing quote. | ||
| + final var opening = ambiguousLeadingQuotes.get( 0 ); | ||
| + consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) ); | ||
| + unresolved.remove( opening ); | ||
| + } | ||
| + } | ||
| + | ||
| + /** | ||
| + * Removes the last entry from the given list, if the list has any entries. | ||
| + * | ||
| + * @param unresolved The list of {@link Lexeme} triplets to modify. | ||
| + */ | ||
| + private void truncate( final List<Lexeme[]> unresolved ) { | ||
| + if( !unresolved.isEmpty() ) { | ||
| + unresolved.remove( unresolved.size() - 1 ); | ||
| + } | ||
| + } | ||
| + | ||
| + /** | ||
| + * Overwrites the {@link CircularFifoQueue}'s contents with start-of-text | ||
| + * indicators. | ||
| + * | ||
| + * @param lexemes The queue to overflow. | ||
| + */ | ||
| + private void flush( final CircularFifoQueue<Lexeme> lexemes ) { | ||
| + lexemes.add( SOT ); | ||
| + lexemes.add( SOT ); | ||
| + lexemes.add( SOT ); | ||
| + } | ||
| +} | ||
| +/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.whitemagicsoftware.keenquotes; | ||
| + | ||
| +/** | ||
| + * Represents a high-level token read from the text: a {@link TokenType} | ||
| + * paired with the offsets of the text that it covers. Tokens are ordered | ||
| + * by their beginning offset. | ||
| + */ | ||
| +final class Token implements Comparable<Token> { | ||
| +  private final TokenType mType; | ||
| +  final int mBegan; | ||
| +  final int mEnded; | ||
| + | ||
| +  /** | ||
| +   * Convenience constructor to create a token that uses the lexeme's | ||
| +   * beginning and ending offsets to represent a complete token. | ||
| +   * | ||
| +   * @param type   The type of {@link Token} to create. | ||
| +   * @param lexeme Container for beginning and ending text offsets. | ||
| +   */ | ||
| +  Token( final TokenType type, final Lexeme lexeme ) { | ||
| +    this( type, lexeme.began(), lexeme.ended() ); | ||
| +  } | ||
| + | ||
| +  /** | ||
| +   * This constructor can be used to create tokens that span more than a | ||
| +   * single character. Almost all tokens represent a single character, only | ||
| +   * the double-prime sequence ({@code ''}) is more than one character. | ||
| +   * | ||
| +   * @param type  The type of {@link Token} to create, must not be | ||
| +   *              {@code null}. | ||
| +   * @param began Beginning offset into text where token is found, must | ||
| +   *              not be negative. | ||
| +   * @param ended Ending offset into text where token is found, must be | ||
| +   *              greater than {@code began}. | ||
| +   */ | ||
| +  Token( final TokenType type, final int began, final int ended ) { | ||
| +    assert type != null; | ||
| +    assert began >= 0; | ||
| +    assert ended > began; | ||
| + | ||
| +    mType = type; | ||
| +    mBegan = began; | ||
| +    mEnded = ended; | ||
| +  } | ||
| + | ||
| +  /** | ||
| +   * @return The kind of quotation mark this token represents. | ||
| +   */ | ||
| +  TokenType getType() { | ||
| +    return mType; | ||
| +  } | ||
| + | ||
| +  /** | ||
| +   * @return The beginning offset given at construction. | ||
| +   */ | ||
| +  int began() { | ||
| +    return mBegan; | ||
| +  } | ||
| + | ||
| +  /** | ||
| +   * @return The ending offset given at construction. | ||
| +   */ | ||
| +  int ended() { | ||
| +    return mEnded; | ||
| +  } | ||
| + | ||
| +  /** | ||
| +   * Orders tokens by their beginning offset only; the type and ending | ||
| +   * offset are not considered. Since {@code equals} is not overridden, | ||
| +   * this ordering is not consistent with equals. | ||
| +   * | ||
| +   * @param that The token to compare against. | ||
| +   * @return A negative, zero, or positive value when this token begins | ||
| +   * before, at, or after the given token, respectively. | ||
| +   */ | ||
| +  @Override | ||
| +  public int compareTo( final Token that ) { | ||
| +    // Integer.compare cannot overflow, unlike subtracting the offsets. | ||
| +    return Integer.compare( this.mBegan, that.mBegan ); | ||
| +  } | ||
| +} | ||
| +/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.whitemagicsoftware.keenquotes; | ||
| + | ||
| +/** | ||
| + * The {@link Parser} emits these token types. Each one names the kind of | ||
| + * mark that a straight quote in the text was identified as. | ||
| + */ | ||
| +enum TokenType { | ||
| + /** A single quote that begins a quotation. */ | ||
| + QUOTE_OPENING_SINGLE, | ||
| + /** A double quote that begins a quotation. */ | ||
| + QUOTE_OPENING_DOUBLE, | ||
| + /** A single quote that ends a quotation. */ | ||
| + QUOTE_CLOSING_SINGLE, | ||
| + /** A double quote that ends a quotation. */ | ||
| + QUOTE_CLOSING_DOUBLE, | ||
| + /** A single quote that is part of a contraction (y'all, '20s, 'twas). */ | ||
| + QUOTE_APOSTROPHE, | ||
| + /** A backslash-escaped single quote. */ | ||
| + QUOTE_STRAIGHT_SINGLE, | ||
| + /** A backslash-escaped double quote. */ | ||
| + QUOTE_STRAIGHT_DOUBLE, | ||
| + /** A single quote that directly follows a number, as in 2'. */ | ||
| + QUOTE_PRIME_SINGLE, | ||
| + /** Two single quotes, or a double quote, directly after a number. */ | ||
| + QUOTE_PRIME_DOUBLE, | ||
| +} | ||
| +/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.whitemagicsoftware.keenquotes; | ||
| + | ||
| +import org.junit.jupiter.api.Test; | ||
| +import org.junit.jupiter.params.ParameterizedTest; | ||
| +import org.junit.jupiter.params.provider.ValueSource; | ||
| + | ||
| +import java.io.BufferedReader; | ||
| +import java.io.IOException; | ||
| +import java.io.InputStreamReader; | ||
| +import java.nio.charset.StandardCharsets; | ||
| +import java.util.function.Function; | ||
| + | ||
| +import static java.lang.System.out; | ||
| +import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| +import static org.junit.jupiter.api.Assertions.assertNotNull; | ||
| + | ||
| +/** | ||
| + * Test that English straight quotes are converted to curly quotes and | ||
| + * apostrophes. | ||
| + */ | ||
| +public class KeenQuotesTest { | ||
| +  /** | ||
| +   * This is a single-use test that is useful for debugging. It prints the | ||
| +   * conversion and any unresolved lexemes, but asserts nothing. | ||
| +   */ | ||
| +  @Test | ||
| +  //@Disabled | ||
| +  public void test_parse_SingleLine_Parsed() { | ||
| +    out.println( KeenQuotes.convert( | ||
| +      "\"’Kearney lives on the banks of Killarney—’", | ||
| +      out::println | ||
| +    ) ); | ||
| +  } | ||
| + | ||
| +  /** | ||
| +   * Tests that straight quotes are converted to curly quotes. | ||
| +   * | ||
| +   * @throws IOException Error opening file full of fun. | ||
| +   */ | ||
| +  @Test | ||
| +  public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException { | ||
| +    testConverter( text -> KeenQuotes.convert( text, ( lexeme ) -> {} ) ); | ||
| +  } | ||
| + | ||
| +  /** | ||
| +   * Converts an entire story and prints the result along with any | ||
| +   * unresolved lexemes. Useful for eyeballing output; asserts nothing | ||
| +   * beyond the resource being present. | ||
| +   * | ||
| +   * @param filename Resource name, without the {@code .txt} extension. | ||
| +   * @throws IOException Error reading the story. | ||
| +   */ | ||
| +  @ParameterizedTest | ||
| +  @ValueSource( strings = {"westrup"} ) | ||
| +  void test_Parse_Story_Converted( final String filename ) throws IOException { | ||
| +    // Reserve 1 MiB up front. Note that 2 ^ 20 is a bitwise XOR (22) in | ||
| +    // Java, not exponentiation, so a shift is used instead. | ||
| +    final var sb = new StringBuilder( 1 << 20 ); | ||
| + | ||
| +    try( final var reader = open( filename + ".txt" ) ) { | ||
| +      String line; | ||
| + | ||
| +      while( (line = reader.readLine()) != null ) { | ||
| +        sb.append( line ).append( '\n' ); | ||
| +      } | ||
| +    } | ||
| + | ||
| +    out.println( KeenQuotes.convert( sb.toString(), out::println ) ); | ||
| +  } | ||
| + | ||
| +  /** | ||
| +   * Reads a file full of couplets. The first of the pair is the input, | ||
| +   * the second of the pair is the expected result. Couplets may include | ||
| +   * escaped newline characters to indicate end of lines and end of | ||
| +   * paragraphs. Lines that are blank or start with {@code #} are ignored. | ||
| +   * | ||
| +   * @param parser The text processor capable of straight quote conversion. | ||
| +   * @throws IOException Error opening file full of fun. | ||
| +   */ | ||
| +  private void testConverter( final Function<String, String> parser ) | ||
| +    throws IOException { | ||
| +    try( final var reader = open( "smartypants.txt" ) ) { | ||
| +      String line; | ||
| +      String testLine = ""; | ||
| +      String expected = ""; | ||
| + | ||
| +      while( ((line = reader.readLine()) != null) ) { | ||
| +        if( line.startsWith( "#" ) || line.isBlank() ) { continue; } | ||
| + | ||
| +        // Read the first line of the couplet. | ||
| +        if( testLine.isBlank() ) { | ||
| +          testLine = line; | ||
| +          continue; | ||
| +        } | ||
| + | ||
| +        // Read the second line of the couplet. | ||
| +        if( expected.isBlank() ) { | ||
| +          expected = line; | ||
| +        } | ||
| + | ||
| +        testLine = unescapeEol( testLine ); | ||
| +        expected = unescapeEol( expected ); | ||
| + | ||
| +        final var actual = parser.apply( testLine ); | ||
| +        assertEquals( expected, actual ); | ||
| + | ||
| +        // Reset for the next couplet. | ||
| +        testLine = ""; | ||
| +        expected = ""; | ||
| +      } | ||
| +    } | ||
| +  } | ||
| + | ||
| +  /** | ||
| +   * Replaces every escaped newline (a backslash followed by {@code n}) | ||
| +   * with a real newline character, including any at the end of the string. | ||
| +   * | ||
| +   * @param s The text containing zero or more escaped newlines. | ||
| +   * @return The text with escaped newlines replaced. | ||
| +   */ | ||
| +  private static String unescapeEol( final String s ) { | ||
| +    // A literal replacement keeps trailing newlines, which split() drops. | ||
| +    return s.replace( "\\n", "\n" ); | ||
| +  } | ||
| + | ||
| +  /** | ||
| +   * Opens a text file for reading as UTF-8. Callers are responsible for | ||
| +   * closing. | ||
| +   * | ||
| +   * @param filename The resource to open, relative to this class. | ||
| +   * @return An instance of {@link BufferedReader} that can be used to | ||
| +   * read all the lines in the file. | ||
| +   */ | ||
| +  private BufferedReader open( final String filename ) { | ||
| +    final var is = getClass().getResourceAsStream( filename ); | ||
| +    assertNotNull( is, "Missing test resource: " + filename ); | ||
| + | ||
| +    // Decode explicitly as UTF-8 so the curly quotes in the fixtures are | ||
| +    // read the same way regardless of the platform's default charset. | ||
| +    return new BufferedReader( | ||
| +      new InputStreamReader( is, StandardCharsets.UTF_8 ) ); | ||
| +  } | ||
| +} | ||
| +/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.whitemagicsoftware.keenquotes; | ||
| + | ||
| +import org.junit.jupiter.api.Test; | ||
| + | ||
| +import java.util.Arrays; | ||
| +import java.util.List; | ||
| +import java.util.function.BiFunction; | ||
| + | ||
| +import static com.whitemagicsoftware.keenquotes.Lexeme.EOT; | ||
| +import static com.whitemagicsoftware.keenquotes.LexemeType.*; | ||
| +import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| + | ||
| +/** | ||
| + * Verifies that the {@link Lexer} splits text into the expected sequence | ||
| + * of lexemes: words, numbers, punctuation, spaces, newlines, and so on. | ||
| + */ | ||
| +class LexerTest { | ||
| +  @Test | ||
| +  void test_Lexing_Words_TokenValues() { | ||
| +    testText( "abc 123", "abc", " ", "123" ); | ||
| +    testText( "-123 abc", "-123", " ", "abc" ); | ||
| +  } | ||
| + | ||
| +  @Test | ||
| +  void test_Lexing_Numbers_EmitNumbers() { | ||
| +    testType( ".123", NUMBER ); | ||
| +    testType( "-123.", NUMBER, PERIOD ); | ||
| +    testType( " 123.123.123", SPACE, NUMBER ); | ||
| +    testType( "123 123\"", NUMBER, SPACE, NUMBER, QUOTE_DOUBLE ); | ||
| +    testType( "-123,123.123", NUMBER ); | ||
| +    testType( "...1,023...", ELLIPSIS, NUMBER, ELLIPSIS ); | ||
| +  } | ||
| + | ||
| +  @Test | ||
| +  void test_Lexing_Words_EmitWords() { | ||
| +    testType( "abc", WORD ); | ||
| +    testType( "abc abc", WORD, SPACE, WORD ); | ||
| +    testType( "abc...", WORD, ELLIPSIS ); | ||
| +    testType( "abc123", WORD, NUMBER ); | ||
| +    testType( "-123abc", NUMBER, WORD ); | ||
| +    testType( "abc-o'-abc", WORD, HYPHEN, WORD, QUOTE_SINGLE, HYPHEN, WORD ); | ||
| +  } | ||
| + | ||
| +  @Test | ||
| +  void test_Lexing_PunctuationMarks_EmitPunctuationMarks() { | ||
| +    testType( "!", PUNCT ); | ||
| +    testType( ";", PUNCT ); | ||
| +    testType( ".", PERIOD ); | ||
| +    testType( "-", HYPHEN ); | ||
| +    testType( "--", DASH ); | ||
| +    testType( "---", DASH ); | ||
| +    testType( "...", ELLIPSIS ); | ||
| +  } | ||
| + | ||
| +  @Test | ||
| +  void test_Lexing_Quotes_EmitQuotes() { | ||
| +    testType( "'", QUOTE_SINGLE ); | ||
| +    testType( "\"", QUOTE_DOUBLE ); | ||
| +    testType( "3 o'clock", NUMBER, SPACE, WORD, QUOTE_SINGLE, WORD ); | ||
| +  } | ||
| + | ||
| +  @Test | ||
| +  void test_Lexing_Escapes_EmitEscapedQuotes() { | ||
| +    testType( "123\\'456\\\"", NUMBER, ESC_SINGLE, NUMBER, ESC_DOUBLE ); | ||
| +    testText( "123\\'456\\\"", "123", "\\'", "456", "\\\"" ); | ||
| +  } | ||
| + | ||
| +  @Test | ||
| +  void test_Lexing_Newlines_EmitNewlines() { | ||
| +    testType( "\r", EOL ); | ||
| +    testType( "\n", EOL ); | ||
| +    testType( "\r\n", EOL ); | ||
| +    testType( "\r\n\r\n", EOP ); | ||
| +    testType( "\r\n\n\r", EOP ); | ||
| +    testType( "abc \r\nabc\n", WORD, SPACE, EOL, WORD, EOL ); | ||
| +  } | ||
| + | ||
| +  /** | ||
| +   * Asserts that lexing the text yields lexemes of exactly the given types, | ||
| +   * in order. | ||
| +   * | ||
| +   * @param actual   The text to lex. | ||
| +   * @param expected The lexeme types that the text must produce. | ||
| +   */ | ||
| +  private void testType( final String actual, final LexemeType... expected ) { | ||
| +    assertLexemes( | ||
| +      actual, | ||
| +      ( lexeme, source ) -> lexeme.getType(), | ||
| +      Arrays.asList( expected ) | ||
| +    ); | ||
| +  } | ||
| + | ||
| +  /** | ||
| +   * Asserts that lexing the text yields lexemes covering exactly the given | ||
| +   * substrings, in order. | ||
| +   * | ||
| +   * @param actual   The text to lex. | ||
| +   * @param expected The substrings that the lexemes must cover. | ||
| +   */ | ||
| +  private void testText( final String actual, final String... expected ) { | ||
| +    final var substrings = Arrays.asList( expected ); | ||
| +    assertLexemes( | ||
| +      actual, ( lexeme, source ) -> lexeme.toString( source ), substrings ); | ||
| +  } | ||
| + | ||
| +  /** | ||
| +   * Lexes the text to its end, comparing a value derived from each lexeme | ||
| +   * against the expectation at the same position. | ||
| +   * | ||
| +   * @param text         The text to lex. | ||
| +   * @param extractor    Derives the value to compare from a lexeme and the | ||
| +   *                     text it came from. | ||
| +   * @param expectations The values the lexemes must produce, in order. | ||
| +   */ | ||
| +  private <A, E> void assertLexemes( | ||
| +    final String text, | ||
| +    final BiFunction<Lexeme, String, A> extractor, | ||
| +    final List<E> expectations ) { | ||
| +    final var lexer = new Lexer( text ); | ||
| +    var index = 0; | ||
| + | ||
| +    for( var lexeme = lexer.next(); lexeme != EOT; lexeme = lexer.next() ) { | ||
| +      final var expected = expectations.get( index ); | ||
| +      final var actual = extractor.apply( lexeme, text ); | ||
| + | ||
| +      assertEquals( expected, actual ); | ||
| +      index++; | ||
| +    } | ||
| + | ||
| +    // Every expectation must have been consumed by the end of the text. | ||
| +    assertEquals( expectations.size(), index ); | ||
| +  } | ||
| +} | ||
| +/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.whitemagicsoftware.keenquotes; | ||
| + | ||
| +import org.junit.jupiter.api.Test; | ||
| + | ||
| +import java.util.HashMap; | ||
| +import java.util.Map; | ||
| + | ||
| +import static com.whitemagicsoftware.keenquotes.TokenType.QUOTE_APOSTROPHE; | ||
| +import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| + | ||
| +/** | ||
| + * Tests that the parser emits the expected number of unambiguous | ||
| + * apostrophes for each sample text. | ||
| + */ | ||
| +class ParserTest { | ||
| +  /** | ||
| +   * Maps each sample text to the number of tokens, keyed by token type, | ||
| +   * that parsing the text is expected to emit. | ||
| +   */ | ||
| +  private static final Map<String, Map<TokenType, Integer>> TEST_CASES = | ||
| +    Map.of( | ||
| +      "That's a 35'×10\" yacht!", | ||
| +      Map.of( QUOTE_APOSTROPHE, 1 ), | ||
| +      "It's the 80's...", | ||
| +      Map.of( QUOTE_APOSTROPHE, 2 ), | ||
| +      "Fish-'n'-chips!", | ||
| +      Map.of( QUOTE_APOSTROPHE, 2 ), | ||
| +      "She's a cat ya'll couldn't've known!", | ||
| +      Map.of( QUOTE_APOSTROPHE, 4 ), | ||
| +      "'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx.", | ||
| +      Map.of( QUOTE_APOSTROPHE, 5 ), | ||
| +      """ | ||
| +      But I must leave the proofs to those who 've seen 'em; | ||
| +      But this I heard her say, and can't be wrong | ||
| +      And all may think which way their judgments lean 'em, | ||
| +      ''T is strange---the Hebrew noun which means "I am," | ||
| +      The English always use to govern d--n.' | ||
| +      """, | ||
| +      Map.of( QUOTE_APOSTROPHE, 5 ) | ||
| +    ); | ||
| + | ||
| +  /** | ||
| +   * Parses every sample text and verifies its expected token tallies. | ||
| +   */ | ||
| +  @Test | ||
| +  void test_Conversion_StraightQuotes_ExpectedConversionCount() { | ||
| +    for( final var entry : TEST_CASES.entrySet() ) { | ||
| +      parse( entry.getKey(), entry.getValue() ); | ||
| +    } | ||
| +  } | ||
| + | ||
| +  /** | ||
| +   * Parses the given text, counts the emitted tokens by type, and asserts | ||
| +   * that each expected tally matches. Token types absent from the expected | ||
| +   * tally are not checked. | ||
| +   * | ||
| +   * @param text  The text to parse; also used as the failure message. | ||
| +   * @param tally The number of tokens expected for each listed type. | ||
| +   */ | ||
| +  private void parse( final String text, final Map<TokenType, Integer> tally ) { | ||
| +    final var parser = new Parser( text ); | ||
| +    final var actual = new HashMap<TokenType, Integer>(); | ||
| + | ||
| +    // Count each token by type; the second callback's argument is ignored. | ||
| +    parser.parse( | ||
| +      ( token ) -> actual.merge( token.getType(), 1, Integer::sum ), | ||
| +      ( lexeme ) -> {} | ||
| +    ); | ||
| + | ||
| +    for( final var expectedEntry : tally.entrySet() ) { | ||
| +      final var expectedCount = expectedEntry.getValue(); | ||
| +      // A type that was never emitted has no entry; count it as zero. | ||
| +      final var actualCount = actual.getOrDefault( expectedEntry.getKey(), 0 ); | ||
| + | ||
| +      assertEquals( expectedCount, actualCount, text ); | ||
| +    } | ||
| +  } | ||
| +} | ||
| +# ######################################################################## | ||
| +# Escaped quotes | ||
| +# ######################################################################## | ||
| +She stood 5\'7\". | ||
| +She stood 5'7". | ||
| + | ||
| +\"What?\" | ||
| +"What?" | ||
| + | ||
| +# ######################################################################## | ||
| +# Primes (single, double) | ||
| +# ######################################################################## | ||
| +Alice's friend is 6'3" tall. | ||
| +Alice's friend is 6′3″ tall. | ||
| + | ||
| +Bob's table is 5'' × 4''. | ||
| +Bob's table is 5″ × 4″. | ||
| + | ||
| +Bob's table is 5'×4'. | ||
| +Bob's table is 5′×4′. | ||
| + | ||
| +What's this '-5.5'',' '-10.2'' cm,' and another '-7.25''' thing? | ||
| +What's this ‘-5.5″,’ ‘-10.2″ cm,’ and another ‘-7.25″’ thing? | ||
| + | ||
| +'What's this -5.5'' thing?' | ||
| +‘What's this -5.5″ thing?’ | ||
| + | ||
| ++7.9'' is weird. | ||
| ++7.9″ is weird. | ||
| + | ||
| +12" record, 5'10" height | ||
| +12″ record, 5′10″ height | ||
| + | ||
| +Use 11.5"x14.25" virtual paper! | ||
| +Use 11.5″x14.25″ virtual paper! | ||
| + | ||
| +Angular measure 3° 5' 30" means 3 degs, 5 arcmins, & 30 arcsecs. | ||
| +Angular measure 3° 5′ 30″ means 3 degs, 5 arcmins, & 30 arcsecs. | ||
| + | ||
| +# ######################################################################## | ||
| +# Double quotes | ||
| +# ######################################################################## | ||
| +"I am Sam" | ||
| +“I am Sam” | ||
| + | ||
| +"...even better!" | ||
| +“...even better!” | ||
| + | ||
| +"It was so," said he. | ||
| +“It was so,” said he. | ||
| + | ||
| +"What cries in the streets"? | ||
| +“What cries in the streets”? | ||
| + | ||
| +With "air quotes" in the middle. | ||
| +With “air quotes” in the middle. | ||
| + | ||
| +With--"air quotes"--and dashes. | ||
| +With--“air quotes”--and dashes. | ||
| + | ||
| +"Not all open quotes are closed... | ||
| +“Not all open quotes are closed... | ||
| + | ||
| +"And this" and "and this" and "and this" and "and another." | ||
| +“And this” and “and this” and “and this” and “and another.” | ||
| + | ||
| +"Will'll invite?" Mrs. Thorne asked;\n"because he's coming--he's at the gate." | ||
| +“Will'll invite?” Mrs. Thorne asked;\n“because he's coming--he's at the gate.” | ||
| + | ||
| +# ######################################################################## | ||
| +# Single quotes | ||
| +# ######################################################################## | ||
| +'I am Sam' | ||
| +‘I am Sam’ | ||
| + | ||
| +'It was so,' said he. | ||
| +‘It was so,’ said he. | ||
| + | ||
| +'...even better!' | ||
| +‘...even better!’ | ||
| + | ||
| +With 'quotes' in the middle. | ||
| +With ‘quotes’ in the middle. | ||
| + | ||
| +With--'imaginary'--dashes. | ||
| +With--‘imaginary’--dashes. | ||
| + | ||
| +''Cause I don't like it, 's why,' said Pat. | ||
| +‘'Cause I don't like it, 's why,’ said Pat. | ||
| + | ||
| +'It's a beautiful day!' | ||
| +‘It's a beautiful day!’ | ||
| + | ||
| +Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season. | ||
| +Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season. | ||
| + | ||
| +# ######################################################################## | ||
| +# Possessives | ||
| +# ######################################################################## | ||
| +Sam's Sams' and the Ross's roses' thorns were prickly. | ||
| +Sam's Sams' and the Ross's roses' thorns were prickly. | ||
| + | ||
| +# ######################################################################## | ||
| +# Inside contractions (no leading/trailing apostrophes) | ||
| +# ######################################################################## | ||
| +I don't like it: I love's it! | ||
| +I don't like it: I love's it! | ||
| + | ||
| +We'd've thought that pancakes'll be sweeter there. | ||
| +We'd've thought that pancakes'll be sweeter there. | ||
| + | ||
| +She'd be coming o'er when the horse'd gone to pasture... | ||
| +She'd be coming o'er when the horse'd gone to pasture... | ||
| + | ||
| +# ######################################################################## | ||
| +# Beginning contractions (leading apostrophes) | ||
| +# ######################################################################## | ||
| +'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx. | ||
| +'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx. | ||
| + | ||
| +# ######################################################################## | ||
| +# Outside contractions (leading and trailing, no middle) | ||
| +# ######################################################################## | ||
| +Salt 'n' vinegar, fish-'n'-chips, sugar 'n spice! | ||
| +Salt 'n' vinegar, fish-'n'-chips, sugar 'n spice! | ||
| + | ||
| +# ######################################################################## | ||
| +# Ending contractions (trailing apostrophes) | ||
| +# ######################################################################## | ||
| +Didn' get th' message. | ||
| +Didn' get th' message. | ||
| + | ||
| +Namsayin', y'know what I'ma sayin'? | ||
| +Namsayin', y'know what I'ma sayin'? | ||
| + | ||
| +# ######################################################################## | ||
| +# Decades | ||
| +# ######################################################################## | ||
| +The Roaring '20s had the best music, no? | ||
| +The Roaring '20s had the best music, no? | ||
| + | ||
| +Took place in '04, yes'm! | ||
| +Took place in '04, yes'm! | ||
| + | ||
| +# ######################################################################## | ||
| +# Consecutive quotes | ||
| +# ######################################################################## | ||
| +"'I'm trouble.'" | ||
| +“‘I'm trouble.’” | ||
| + | ||
| +'"Trouble's my name."' | ||
| +‘“Trouble's my name.”’ | ||
| + | ||
| +# ######################################################################## | ||
| +# Nested quotations | ||
| +# ######################################################################## | ||
| +"'Here I am,' said Sam" | ||
| +“‘Here I am,’ said Sam” | ||
| + | ||
| +'"Here I am," said Sam' | ||
| +‘“Here I am,” said Sam’ | ||
| + | ||
| +'Hello, "Dr. Brown," what's your real name?' | ||
| +‘Hello, “Dr. Brown,” what's your real name?’ | ||
| + | ||
| +"'Twas, t'wasn't thy name, 'twas it?" said Jim "the Barber" Brown. | ||
| +“'Twas, t'wasn't thy name, 'twas it?” said Jim “the Barber” Brown. | ||
| + | ||
| +"'I'm in danger. Help? Take me to--' an address on Van Ness Avenue. | ||
| +“‘I'm in danger. Help? Take me to--’ an address on Van Ness Avenue. | ||
| + | ||
| +"Ah," she said, "you knew! He told you--and you said 'my dear'! How could you?!" | ||
| +“Ah,” she said, “you knew! He told you--and you said ‘my dear’! How could you?!” | ||
| + | ||
| +shouldn't drop letters, nor say "ain't" or "hain't." | ||
| +shouldn't drop letters, nor say “ain't” or “hain't.” | ||
| + | ||
| +"’Kearney lives on the banks of Killarney—’ | ||
| +“’Kearney lives on the banks of Killarney—’ | ||
| + | ||
| +# ######################################################################## | ||
| +# Mixed | ||
| +# ######################################################################## | ||
| +"She said, 'That's Sam's'," said the Sams' cat. | ||
| +“She said, ‘That's Sam's’,” said the Sams' cat. | ||
| + | ||
| +"'Jane said, ''E'll be spooky, Sam's son with the jack-o'-lantern!'" said the O'Mally twins'---y'know---ghosts in unison. | ||
| +“‘Jane said, ‘'E'll be spooky, Sam's son with the jack-o'-lantern!’” said the O'Mally twins'---y'know---ghosts in unison. | ||
| + | ||
| +'He's at Sam's' | ||
| +‘He's at Sam's’ | ||
| + | ||
| +ma'am | ||
| +ma'am | ||
| + | ||
| +'Twas midnight | ||
| +'Twas midnight | ||
| + | ||
| +"Hello," said the spider. "'Shelob' is my name." | ||
| +“Hello,” said the spider. “‘Shelob’ is my name.” | ||
| + | ||
| +'He said, \"I want to go.\"' Were you alive in the 70's? | ||
| +‘He said, "I want to go."’ Were you alive in the 70's? | ||
| + | ||
| +"That's a 'magic' sock." | ||
| +“That's a ‘magic’ sock.” | ||
| + | ||
| +'That's a "magic" sock.' | ||
| +‘That's a “magic” sock.’ | ||
| + | ||
| +'That's a "very 'magic'" sock.' | ||
| +‘That's a “very ‘magic’” sock.’ | ||
| + | ||
| +Website! Company Name, Inc. ("Company Name" or "Company") recommends reading carefully: | ||
| +Website! Company Name, Inc. (“Company Name” or “Company”) recommends reading carefully: | ||
| + | ||
| +Website! Company Name, Inc. ('Company Name' or 'Company') recommends reading carefully: | ||
| +Website! Company Name, Inc. (‘Company Name’ or ‘Company’) recommends reading carefully: | ||
| + | ||
| +Workin' hard | ||
| +Workin' hard | ||
| + | ||
| +"She said, 'Llamas'll languish, they'll-- | ||
| +“She said, ‘Llamas'll languish, they'll-- | ||
| + | ||
| +'The 70s are my favorite numbers,' she said. | ||
| +‘The 70s are my favorite numbers,’ she said. | ||
| + | ||
| +'70s fashion was weird. | ||
| +'70s fashion was weird. | ||
| + | ||
| +Model \"T2000\" | ||
| +Model "T2000" | ||
| + | ||
| +The 2's complement.\n\n'Yar, binary!' | ||
| +The 2's complement.\n\n‘Yar, binary!’ | ||
| + | ||
| +The Who's complement.\n\n\n"Paragraph boundary!" | ||
| +The Who's complement.\n\n\n“Paragraph boundary!” | ||
| + | ||
| +'Oak,' 'elm,' and 'beech' are names of trees. So is 'pine.' | ||
| +‘Oak,’ ‘elm,’ and ‘beech’ are names of trees. So is ‘pine.’ | ||
| + | ||
| +'A', 'B', and 'C' are letters. | ||
| +‘A’, ‘B’, and ‘C’ are letters. | ||
| + | ||
| +'He said, "Thinkin'."' | ||
| +‘He said, “Thinkin'.”’ | ||
| + | ||
| +''Sup, Doc?' | ||
| +‘'Sup, Doc?’ | ||
| + | ||
| +# ######################################################################## | ||
| +# Ambiguous (no solution) | ||
| +# ######################################################################## | ||
| +She said, 'Cause of death?\n\n'Old age, there's no doubt.' | ||
| +She said, 'Cause of death?\n\n‘Old age, there's no doubt.’ | ||
| + | ||
| +She said, 'Cause I said so!\n\n'If we're done, I've a secret.' | ||
| +She said, 'Cause I said so!\n\n‘If we're done, I've a secret.’ | ||
| + | ||
| +'Bout that time I says, 'Boys! I been thinkin' 'bout th' Universe.' | ||
| +'Bout that time I says, ‘Boys! I been thinkin' 'bout th' Universe.’ | ||
| Delta | 2659 lines added, 2621 lines removed, 38-line increase |
|---|