Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Add options for runtime ambiguities, fix dashes, robust ellipsis

Author: Dave Jarvis <email>
Date: 2021-06-20 14:20:57 GMT-0700
Commit: 8a075f90d7f3c01de3a5fe309308bfe00a4b2d2f
Parent: f523bc7
src/main/java/com/whitemagicsoftware/keenquotes/Contractions.java
import java.util.ArrayList;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
private final Set<String> mEndedAmbiguous = new HashSet<>();
- public void withBeganUnambiguous( final Set<String> words ) {
+ public void withBeganUnambiguous( final List<String> words ) {
mBeganUnambiguous.addAll( words );
}
- public void withEndedUnambiguous( final Set<String> words ) {
+ public void withEndedUnambiguous( final List<String> words ) {
mEndedUnambiguous.addAll( words );
}
- public void withBeganAmbiguous( final Set<String> words ) {
+ public void withBeganAmbiguous( final List<String> words ) {
mBeganAmbiguous.addAll( words );
}
- public void withEndedAmbiguous( final Set<String> words ) {
+ public void withEndedAmbiguous( final List<String> words ) {
mEndedAmbiguous.addAll( words );
}
*/
public Contractions build() {
- mBeganUnambiguous.addAll( from( mBeganUnambiguous, BEGAN_UNAMBIGUOUS ) );
- mEndedUnambiguous.addAll( from( mEndedUnambiguous, ENDED_UNAMBIGUOUS ) );
- mBeganAmbiguous.addAll( from( mBeganAmbiguous, BEGAN_AMBIGUOUS ) );
- mEndedAmbiguous.addAll( from( mEndedAmbiguous, ENDED_AMBIGUOUS ) );
+ mBeganUnambiguous.addAll( BEGAN_UNAMBIGUOUS );
+ mEndedUnambiguous.addAll( ENDED_UNAMBIGUOUS );
+ mBeganAmbiguous.addAll( BEGAN_AMBIGUOUS );
+ mEndedAmbiguous.addAll( ENDED_AMBIGUOUS );
return new Contractions( this );
"neath",
"nother",
+ "nuff",
"onna",
"onna'",
src/main/java/com/whitemagicsoftware/keenquotes/Converter.java
import java.util.Map;
import java.util.function.Consumer;
+import java.util.function.Function;
import static com.whitemagicsoftware.keenquotes.TokenType.*;
import static java.util.Collections.sort;
/**
* Responsible for converting curly quotes to HTML entities throughout a
* text string.
*/
-public class Converter {
+public class Converter implements Function<String, String> {
private static final Map<TokenType, String> REPLACEMENTS = Map.of(
QUOTE_OPENING_SINGLE, "&lsquo;",
QUOTE_PRIME_DOUBLE, "&Prime;"
);
+
+ private final Consumer<Lexeme> mUnresolved;
+ private final Contractions mContractions;
+
+ public Converter( final Consumer<Lexeme> unresolved ) {
+ this( unresolved, new Contractions.Builder().build() );
+ }
+
+ public Converter( final Consumer<Lexeme> unresolved, final Contractions c ) {
+ mUnresolved = unresolved;
+ mContractions = c;
+ }
/**
* Converts straight quotes to curly quotes and primes. Any quotation marks
- * that cannot be converted are passed to the {@link Consumer}.
+ * that cannot be converted are passed to the {@link Consumer}. This method
+ * is re-entrant, but not tested to be thread-safe.
*
- * @param text The text to parse.
- * @param unresolved Recipient for ambiguous {@link Lexeme}s.
+ * @param text The text to parse.
* @return The given text string with as many straight quotes converted to
* curly quotes as is feasible.
*/
- public static String convert(
- final String text, final Consumer<Lexeme> unresolved ) {
- final var parser = new Parser( text );
+ @Override
+ public String apply( final String text ) {
+ final var parser = new Parser( text, mContractions );
final var tokens = new ArrayList<Token>();
// Parse the tokens and consume all unresolved lexemes.
- parser.parse( tokens::add, unresolved );
+ parser.parse( tokens::add, mUnresolved );
// The parser may emit tokens in any order.
src/main/java/com/whitemagicsoftware/keenquotes/KeenQuotes.java
public void run() {
- if( getSettings().displayList() ) {
- displayList();
+ final var settings = getSettings();
+ final var contractions = createContractions( settings );
+
+ if( settings.displayList() ) {
+ System.out.println( contractions.toString() );
}
else {
- convert();
+ convert( new Converter( System.err::println, contractions ) );
}
}
- private void displayList() {
- System.out.println( new Contractions.Builder().build().toString() );
+ private Contractions createContractions( final Settings settings ) {
+ final var builder = new Contractions.Builder();
+
+ builder.withBeganUnambiguous( settings.getBeganUnambiguous() );
+ builder.withEndedUnambiguous( settings.getEndedUnambiguous() );
+ builder.withBeganAmbiguous( settings.getBeganAmbiguous() );
+ builder.withEndedAmbiguous( settings.getEndedAmbiguous() );
+
+ return builder.build();
}
- private void convert() {
- final StringBuilder sb = new StringBuilder();
+ private void convert( final Converter converter ) {
+ final var sb = new StringBuilder();
- try( final BufferedReader reader = open( System.in ) ) {
+ try( final var reader = open( System.in ) ) {
String line;
final var sep = System.lineSeparator();
while( (line = reader.readLine()) != null ) {
sb.append( line );
sb.append( sep );
}
- System.out.println(
- Converter.convert( sb.toString(), System.err::println )
- );
+ System.out.println( converter.apply( sb.toString() ) );
} catch( final Exception ex ) {
ex.printStackTrace( System.err );
}
+ /**
+ * Main application entry point.
+ *
+ * @param args Command-line arguments.
+ */
public static void main( final String[] args ) {
final var app = new KeenQuotes();
src/main/java/com/whitemagicsoftware/keenquotes/Lexer.java
lexeme = createLexeme( QUOTE_DOUBLE, began, i.getIndex() );
}
- else if( curr == '‘') {
+ else if( curr == '‘' ) {
lexeme = createLexeme( QUOTE_SINGLE_OPENING, began, i.getIndex() );
}
- else if( curr == '’') {
+ else if( curr == '’' ) {
lexeme = createLexeme( QUOTE_SINGLE_CLOSING, began, i.getIndex() );
}
- else if( curr == '-' && peek( i ) == '-' || curr == '—' ) {
- slurp( i, ( next, ci ) -> next == '-' || next == '—' );
+ else if( curr == '-' && peek( i ) != '-' ) {
+ lexeme = createLexeme( HYPHEN, began, i.getIndex() );
+ }
+ else if( isDash( curr ) ) {
+ slurp( i, ( next, ci ) -> isDash( next ) );
lexeme = createLexeme( DASH, began, i.getIndex() );
lexeme = createLexeme( isWord ? WORD : NUMBER, began, i.getIndex() );
- }
- else if( curr == '-' ) {
- lexeme = createLexeme( HYPHEN, began, i.getIndex() );
}
else if( curr == '.' ) {
- // Parse all consecutive periods into an ellipsis lexeme. This will
- // not capture space-separated ellipsis (such as ". . .").
lexeme = createLexeme(
- slurp( i, ( next, ci ) -> next == '.' ) == 0 ? PERIOD : ELLIPSIS,
+ slurp( i, ( next, ci ) ->
+ next == '.' || (next == ' ' && peek( ci ) == '.') ) == 0
+ ? PERIOD
+ : ELLIPSIS,
began, i.getIndex()
);
return
curr == '.' || curr == ',' || curr == '-' || curr == '+' || curr == '^';
+ }
+
+ /**
+ * Answers whether the given character may be part of an en- or em-dash.
+ * This must be called after it is known that the character isn't a lone
+ * hyphen.
+ *
+ * @param curr The character to check as being a dash.
+ * @return {@code true} if the given character is part of a dash.
+ */
+ private boolean isDash( final char curr ) {
+ return curr == '-' || curr == '–' || curr == '—';
}
src/main/java/com/whitemagicsoftware/keenquotes/Parser.java
*/
private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE =
- new LexemeType[]{SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL, EOP};
+ new LexemeType[]{SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL,
+ EOP};
/**
*/
private int mClosingSingleQuote;
-
- /**
- * Constructs a new {@link Parser} using the default contraction sets
- * to help resolve some ambiguous scenarios.
- *
- * @param text The prose to parse, containing zero or more quotation
- * characters.
- */
- public Parser( final String text ) {
- this( text, new Contractions.Builder().build() );
- }
/**
src/main/java/com/whitemagicsoftware/keenquotes/Settings.java
import picocli.CommandLine;
+import java.util.List;
import java.util.concurrent.Callable;
+
+import static java.util.Arrays.asList;
+import static java.util.Collections.emptyList;
@CommandLine.Command(
private final KeenQuotes mMain;
-// /**
-// * List of unambiguous contractions having leading apostrophes.
-// */
-// @CommandLine.Option(
-// names = {"-ub", "--unamb-began"},
-// description =
-// "Contractions to treat as unambiguous (e.g., cause,bout)",
-// paramLabel = "words"
-// )
-// private String[] mUnambiguousBegan;
-//
-// /**
-// * List of unambiguous contractions having lagging apostrophes.
-// */
-// @CommandLine.Option(
-// names = {"-ue", "--unamb-ended"},
-// description =
-// "Contractions to treat as unambiguous (e.g., frien,thinkin)",
-// paramLabel = "words"
-// )
-// private String[] mUnambiguousEnded;
-//
-// /**
-// * List of ambiguous contractions having leading apostrophes.
-// */
-// @CommandLine.Option(
-// names = {"-ab", "--amb-began"},
-// description =
-// "Contractions to treat as ambiguous (e.g., sup,kay)",
-// paramLabel = "words"
-// )
-// private String[] mAmbiguousBegan;
-//
-// /**
-// * List of ambiguous contractions having lagging apostrophes.
-// */
-// @CommandLine.Option(
-// names = {"-ae", "--amb-ended"},
-// description =
-// "Contractions to treat as ambiguous (e.g., gi,o)",
-// paramLabel = "words"
-// )
-// private String[] mAmbiguousEnded;
-//
+ /**
+ * List of unambiguous contractions having leading apostrophes.
+ */
+ @CommandLine.Option(
+ names = {"-ub", "--unamb-began"},
+ description =
+ "Contractions to treat as unambiguous (e.g., cause,bout)",
+ paramLabel = "words"
+ )
+ private String[] mBeganUnambiguous;
+
+ /**
+ * List of unambiguous contractions having lagging apostrophes.
+ */
+ @CommandLine.Option(
+ names = {"-ue", "--unamb-ended"},
+ description =
+ "Contractions to treat as unambiguous (e.g., frien,thinkin)",
+ paramLabel = "words"
+ )
+ private String[] mEndedUnambiguous;
+
+ /**
+ * List of ambiguous contractions having leading apostrophes.
+ */
+ @CommandLine.Option(
+ names = {"-ab", "--amb-began"},
+ description =
+ "Contractions to treat as ambiguous (e.g., sup,kay)",
+ paramLabel = "words"
+ )
+ private String[] mBeganAmbiguous;
+
+ /**
+ * List of ambiguous contractions having lagging apostrophes.
+ */
+ @CommandLine.Option(
+ names = {"-ae", "--amb-ended"},
+ description =
+ "Contractions to treat as ambiguous (e.g., gi,o)",
+ paramLabel = "words"
+ )
+ private String[] mEndedAmbiguous;
+
/**
* Display default values.
* @return {@code true} to list the contractions.
*/
- public boolean displayList() {
+ boolean displayList() {
return mDisplayList;
+ }
+
+ List<String> getBeganUnambiguous() {
+ return nullSafe( mBeganUnambiguous );
+ }
+
+ List<String> getEndedUnambiguous() {
+ return nullSafe( mEndedUnambiguous );
+ }
+
+ List<String> getBeganAmbiguous() {
+ return nullSafe( mBeganAmbiguous );
+ }
+
+ List<String> getEndedAmbiguous() {
+ return nullSafe( mEndedAmbiguous );
+ }
+
+ private List<String> nullSafe( final String[] words ) {
+ return words == null ? emptyList() : asList( words );
}
src/test/java/com/whitemagicsoftware/keenquotes/KeenQuotesTest.java
import java.util.function.Function;
-import static com.whitemagicsoftware.keenquotes.Converter.convert;
import static java.lang.System.out;
import static org.junit.jupiter.api.Assertions.assertEquals;
@Disabled
public void test_parse_SingleLine_Parsed() {
- out.println( convert(
- "\"’Kearney lives on the banks of Killarney—’",
- out::println
+ final var converter = new Converter( out::println );
+ out.println( converter.apply(
+ "\"’Kearney lives on the banks of Killarney—’"
) );
}
@Test
public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException {
- testConverter( text -> convert( text, ( lexeme ) -> {} ) );
+ testConverter( new Converter( ( lex ) -> {} ) );
}
}
- System.out.println( convert( sb.toString(), out::println ) );
+ final var converter = new Converter( out::println );
+ System.out.println( converter.apply( sb.toString() ) );
}
src/test/java/com/whitemagicsoftware/keenquotes/LexerTest.java
testType( "--", DASH );
testType( "---", DASH );
+ testType( "–", DASH );
+ testType( "—", DASH );
+ testType( "—-—", DASH );
testType( "...", ELLIPSIS );
+ testType( ". .", ELLIPSIS );
+ testType( ". . .", ELLIPSIS );
+ testType( ".. ... ....", ELLIPSIS );
}
src/test/java/com/whitemagicsoftware/keenquotes/ParserTest.java
*/
class ParserTest {
+ @SuppressWarnings( "TextBlockMigration" )
private final static Map<String, Map<TokenType, Integer>> TEST_CASES =
Map.of(
"'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx.",
Map.of( QUOTE_APOSTROPHE, 5 ),
- """
- But I must leave the proofs to those who 've seen 'em;
- But this I heard her say, and can't be wrong
- And all may think which way their judgments lean 'em,
- ''T is strange---the Hebrew noun which means "I am,"
- The English always use to govern d--n.'
- """,
+ "But I must leave the proofs to those who 've seen 'em;\n" +
+ "But this I heard her say, and can't be wrong\n" +
+ "And all may think which way their judgments lean 'em,\n" +
+ "''T is strange---the Hebrew noun which means \"I am,\"\n" +
+ "The English always use to govern d--n.'",
Map.of( QUOTE_APOSTROPHE, 5 )
);
private void parse( final String text, final Map<TokenType, Integer> tally ) {
- final var parser = new Parser( text );
+ final var contractions = new Contractions.Builder().build();
+ final var parser = new Parser( text, contractions );
final var actual = new HashMap<TokenType, Integer>();
Delta: 169 lines added, 108 lines removed, 61-line increase