Add options for runtime ambiguities, fix dashes, robust ellipsis

Author	Dave Jarvis <email>
Date	2021-06-20 14:20:57 GMT-0700
Commit	8a075f90d7f3c01de3a5fe309308bfe00a4b2d2f
Parent	f523bc7

src/main/java/com/whitemagicsoftware/keenquotes/Contractions.java

		import java.util.ArrayList;
		import java.util.HashSet;
		+import java.util.List;
		import java.util.Set;


		private final Set<String> mEndedAmbiguous = new HashSet<>();

		- public void withBeganUnambiguous( final Set<String> words ) {
		+ public void withBeganUnambiguous( final List<String> words ) {
		mBeganUnambiguous.addAll( words );
		}

		- public void withEndedUnambiguous( final Set<String> words ) {
		+ public void withEndedUnambiguous( final List<String> words ) {
		mEndedUnambiguous.addAll( words );
		}

		- public void withBeganAmbiguous( final Set<String> words ) {
		+ public void withBeganAmbiguous( final List<String> words ) {
		mBeganAmbiguous.addAll( words );
		}

		- public void withEndedAmbiguous( final Set<String> words ) {
		+ public void withEndedAmbiguous( final List<String> words ) {
		mEndedAmbiguous.addAll( words );
		}

		*/
		public Contractions build() {
		- mBeganUnambiguous.addAll( from( mBeganUnambiguous, BEGAN_UNAMBIGUOUS ) );
		- mEndedUnambiguous.addAll( from( mEndedUnambiguous, ENDED_UNAMBIGUOUS ) );
		- mBeganAmbiguous.addAll( from( mBeganAmbiguous, BEGAN_AMBIGUOUS ) );
		- mEndedAmbiguous.addAll( from( mEndedAmbiguous, ENDED_AMBIGUOUS ) );
		+ mBeganUnambiguous.addAll( BEGAN_UNAMBIGUOUS );
		+ mEndedUnambiguous.addAll( ENDED_UNAMBIGUOUS );
		+ mBeganAmbiguous.addAll( BEGAN_AMBIGUOUS );
		+ mEndedAmbiguous.addAll( ENDED_AMBIGUOUS );

		return new Contractions( this );

		"neath",
		"nother",
		+ "nuff",
		"onna",
		"onna'",

src/main/java/com/whitemagicsoftware/keenquotes/Converter.java

		import java.util.Map;
		import java.util.function.Consumer;
		+import java.util.function.Function;

		import static com.whitemagicsoftware.keenquotes.TokenType.*;
		import static java.util.Collections.sort;

		/**
		* Responsible for converting curly quotes to HTML entities throughout a
		* text string.
		*/
		-public class Converter {
		+public class Converter implements Function<String, String> {
		private static final Map<TokenType, String> REPLACEMENTS = Map.of(
		QUOTE_OPENING_SINGLE, "‘",

		QUOTE_PRIME_DOUBLE, "″"
		);
		+
		+ private final Consumer<Lexeme> mUnresolved;
		+ private final Contractions mContractions;
		+
		+ public Converter( final Consumer<Lexeme> unresolved ) {
		+ this( unresolved, new Contractions.Builder().build() );
		+ }
		+
		+ public Converter( final Consumer<Lexeme> unresolved, final Contractions c ) {
		+ mUnresolved = unresolved;
		+ mContractions = c;
		+ }

		/**
		* Converts straight quotes to curly quotes and primes. Any quotation marks
		- * that cannot be converted are passed to the {@link Consumer}.
		+ * that cannot be converted are passed to the {@link Consumer}. This method
		+ * is re-entrant, but not tested to be thread-safe.
		*
		- * @param text The text to parse.
		- * @param unresolved Recipient for ambiguous {@link Lexeme}s.
		+ * @param text The text to parse.
		* @return The given text string with as many straight quotes converted to
		* curly quotes as is feasible.
		*/
		- public static String convert(
		- final String text, final Consumer<Lexeme> unresolved ) {
		- final var parser = new Parser( text );
		+ @Override
		+ public String apply( final String text ) {
		+ final var parser = new Parser( text, mContractions );
		final var tokens = new ArrayList<Token>();

		// Parse the tokens and consume all unresolved lexemes.
		- parser.parse( tokens::add, unresolved );
		+ parser.parse( tokens::add, mUnresolved );

		// The parser may emit tokens in any order.

src/main/java/com/whitemagicsoftware/keenquotes/KeenQuotes.java


		public void run() {
		- if( getSettings().displayList() ) {
		- displayList();
		+ final var settings = getSettings();
		+ final var contractions = createContractions( settings );
		+
		+ if( settings.displayList() ) {
		+ System.out.println( contractions.toString() );
		}
		else {
		- convert();
		+ convert( new Converter( System.err::println, contractions ) );
		}
		}

		- private void displayList() {
		- System.out.println( new Contractions.Builder().build().toString() );
		+ private Contractions createContractions( final Settings settings ) {
		+ final var builder = new Contractions.Builder();
		+
		+ builder.withBeganUnambiguous( settings.getBeganUnambiguous() );
		+ builder.withEndedUnambiguous( settings.getEndedUnambiguous() );
		+ builder.withBeganAmbiguous( settings.getBeganAmbiguous() );
		+ builder.withEndedAmbiguous( settings.getEndedAmbiguous() );
		+
		+ return builder.build();
		}

		- private void convert() {
		- final StringBuilder sb = new StringBuilder();
		+ private void convert( final Converter converter ) {
		+ final var sb = new StringBuilder();

		- try( final BufferedReader reader = open( System.in ) ) {
		+ try( final var reader = open( System.in ) ) {
		String line;
		final var sep = System.lineSeparator();

		while( (line = reader.readLine()) != null ) {
		sb.append( line );
		sb.append( sep );
		}

		- System.out.println(
		- Converter.convert( sb.toString(), System.err::println )
		- );
		+ System.out.println( converter.apply( sb.toString() ) );
		} catch( final Exception ex ) {
		ex.printStackTrace( System.err );

		}

		+ /**
		+ * Main application entry point.
		+ *
		+ * @param args Command-line arguments.
		+ */
		public static void main( final String[] args ) {
		final var app = new KeenQuotes();

src/main/java/com/whitemagicsoftware/keenquotes/Lexer.java

		lexeme = createLexeme( QUOTE_DOUBLE, began, i.getIndex() );
		}
		- else if( curr == '‘') {
		+ else if( curr == '‘' ) {
		lexeme = createLexeme( QUOTE_SINGLE_OPENING, began, i.getIndex() );
		}
		- else if( curr == '’') {
		+ else if( curr == '’' ) {
		lexeme = createLexeme( QUOTE_SINGLE_CLOSING, began, i.getIndex() );
		}
		- else if( curr == '-' && peek( i ) == '-' || curr == '—' ) {
		- slurp( i, ( next, ci ) -> next == '-' || next == '—' );
		+ else if( curr == '-' && peek( i ) != '-' ) {
		+ lexeme = createLexeme( HYPHEN, began, i.getIndex() );
		+ }
		+ else if( isDash( curr ) ) {
		+ slurp( i, ( next, ci ) -> isDash( next ) );

		lexeme = createLexeme( DASH, began, i.getIndex() );


		lexeme = createLexeme( isWord ? WORD : NUMBER, began, i.getIndex() );
		- }
		- else if( curr == '-' ) {
		- lexeme = createLexeme( HYPHEN, began, i.getIndex() );
		}
		else if( curr == '.' ) {
		- // Parse all consecutive periods into an ellipsis lexeme. This will
		- // not capture space-separated ellipsis (such as ". . .").
		lexeme = createLexeme(
		- slurp( i, ( next, ci ) -> next == '.' ) == 0 ? PERIOD : ELLIPSIS,
		+ slurp( i, ( next, ci ) ->
		+ next == '.' || (next == ' ' && peek( ci ) == '.') ) == 0
		+ ? PERIOD
		+ : ELLIPSIS,
		began, i.getIndex()
		);

		return
		curr == '.' || curr == ',' || curr == '-' || curr == '+' || curr == '^';
		+ }
		+
		+ /**
		+ * Answers whether the given character may be part of an en- or em-dash.
		+ * This must be called after it is known that the character isn't a lone
		+ * hyphen.
		+ *
		+ * @param curr The character to check as being a dash.
		+ * @return {@code true} if the given character is part of a dash.
		+ */
		+ private boolean isDash( final char curr ) {
		+ return curr == '-' || curr == '–' || curr == '—';
		}

src/main/java/com/whitemagicsoftware/keenquotes/Parser.java

		*/
		private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE =
		- new LexemeType[]{SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL, EOP};
		+ new LexemeType[]{SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL,
		+ EOP};

		/**

		*/
		private int mClosingSingleQuote;
		-
		- /**
		- * Constructs a new {@link Parser} using the default contraction sets
		- * to help resolve some ambiguous scenarios.
		- *
		- * @param text The prose to parse, containing zero or more quotation
		- * characters.
		- */
		- public Parser( final String text ) {
		- this( text, new Contractions.Builder().build() );
		- }

		/**

src/main/java/com/whitemagicsoftware/keenquotes/Settings.java

		import picocli.CommandLine;

		+import java.util.List;
		import java.util.concurrent.Callable;
		+
		+import static java.util.Arrays.asList;
		+import static java.util.Collections.emptyList;

		@CommandLine.Command(

		private final KeenQuotes mMain;

		-// /**
		-// * List of unambiguous contractions having leading apostrophes.
		-// */
		-// @CommandLine.Option(
		-// names = {"-ub", "--unamb-began"},
		-// description =
		-// "Contractions to treat as unambiguous (e.g., cause,bout)",
		-// paramLabel = "words"
		-// )
		-// private String[] mUnambiguousBegan;
		-//
		-// /**
		-// * List of unambiguous contractions having lagging apostrophes.
		-// */
		-// @CommandLine.Option(
		-// names = {"-ue", "--unamb-ended"},
		-// description =
		-// "Contractions to treat as unambiguous (e.g., frien,thinkin)",
		-// paramLabel = "words"
		-// )
		-// private String[] mUnambiguousEnded;
		-//
		-// /**
		-// * List of ambiguous contractions having leading apostrophes.
		-// */
		-// @CommandLine.Option(
		-// names = {"-ab", "--amb-began"},
		-// description =
		-// "Contractions to treat as ambiguous (e.g., sup,kay)",
		-// paramLabel = "words"
		-// )
		-// private String[] mAmbiguousBegan;
		-//
		-// /**
		-// * List of ambiguous contractions having lagging apostrophes.
		-// */
		-// @CommandLine.Option(
		-// names = {"-ae", "--amb-ended"},
		-// description =
		-// "Contractions to treat as ambiguous (e.g., gi,o)",
		-// paramLabel = "words"
		-// )
		-// private String[] mAmbiguousEnded;
		-//
		+ /**
		+ * List of unambiguous contractions having leading apostrophes.
		+ */
		+ @CommandLine.Option(
		+ names = {"-ub", "--unamb-began"},
		+ description =
		+ "Contractions to treat as unambiguous (e.g., cause,bout)",
		+ paramLabel = "words"
		+ )
		+ private String[] mBeganUnambiguous;
		+
		+ /**
		+ * List of unambiguous contractions having lagging apostrophes.
		+ */
		+ @CommandLine.Option(
		+ names = {"-ue", "--unamb-ended"},
		+ description =
		+ "Contractions to treat as unambiguous (e.g., frien,thinkin)",
		+ paramLabel = "words"
		+ )
		+ private String[] mEndedUnambiguous;
		+
		+ /**
		+ * List of ambiguous contractions having leading apostrophes.
		+ */
		+ @CommandLine.Option(
		+ names = {"-ab", "--amb-began"},
		+ description =
		+ "Contractions to treat as ambiguous (e.g., sup,kay)",
		+ paramLabel = "words"
		+ )
		+ private String[] mBeganAmbiguous;
		+
		+ /**
		+ * List of ambiguous contractions having lagging apostrophes.
		+ */
		+ @CommandLine.Option(
		+ names = {"-ae", "--amb-ended"},
		+ description =
		+ "Contractions to treat as ambiguous (e.g., gi,o)",
		+ paramLabel = "words"
		+ )
		+ private String[] mEndedAmbiguous;
		+
		/**
		* Display default values.

		* @return {@code true} to list the contractions.
		*/
		- public boolean displayList() {
		+ boolean displayList() {
		return mDisplayList;
		+ }
		+
		+ List<String> getBeganUnambiguous() {
		+ return nullSafe( mBeganUnambiguous );
		+ }
		+
		+ List<String> getEndedUnambiguous() {
		+ return nullSafe( mEndedUnambiguous );
		+ }
		+
		+ List<String> getBeganAmbiguous() {
		+ return nullSafe( mBeganAmbiguous );
		+ }
		+
		+ List<String> getEndedAmbiguous() {
		+ return nullSafe( mEndedAmbiguous );
		+ }
		+
		+ private List<String> nullSafe( final String[] words ) {
		+ return words == null ? emptyList() : asList( words );
		}

src/test/java/com/whitemagicsoftware/keenquotes/KeenQuotesTest.java

		import java.util.function.Function;

		-import static com.whitemagicsoftware.keenquotes.Converter.convert;
		import static java.lang.System.out;
		import static org.junit.jupiter.api.Assertions.assertEquals;

		@Disabled
		public void test_parse_SingleLine_Parsed() {
		- out.println( convert(
		- "\"’Kearney lives on the banks of Killarney—’",
		- out::println
		+ final var converter = new Converter( out::println );
		+ out.println( converter.apply(
		+ "\"’Kearney lives on the banks of Killarney—’"
		) );
		}

		@Test
		public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException {
		- testConverter( text -> convert( text, ( lexeme ) -> {} ) );
		+ testConverter( new Converter( ( lex ) -> {} ) );
		}


		}

		- System.out.println( convert( sb.toString(), out::println ) );
		+ final var converter = new Converter( out::println );
		+ System.out.println( converter.apply( sb.toString() ) );
		}

src/test/java/com/whitemagicsoftware/keenquotes/LexerTest.java

		testType( "--", DASH );
		testType( "---", DASH );
		+ testType( "–", DASH );
		+ testType( "—", DASH );
		+ testType( "—-—", DASH );
		testType( "...", ELLIPSIS );
		+ testType( ". .", ELLIPSIS );
		+ testType( ". . .", ELLIPSIS );
		+ testType( ".. ... ....", ELLIPSIS );
		}

src/test/java/com/whitemagicsoftware/keenquotes/ParserTest.java

		*/
		class ParserTest {
		+ @SuppressWarnings( "TextBlockMigration" )
		private final static Map<String, Map<TokenType, Integer>> TEST_CASES =
		Map.of(

		"'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx.",
		Map.of( QUOTE_APOSTROPHE, 5 ),
		- """
		- But I must leave the proofs to those who 've seen 'em;
		- But this I heard her say, and can't be wrong
		- And all may think which way their judgments lean 'em,
		- ''T is strange---the Hebrew noun which means "I am,"
		- The English always use to govern d--n.'
		- """,
		+ "But I must leave the proofs to those who 've seen 'em;\n" +
		+ "But this I heard her say, and can't be wrong\n" +
		+ "And all may think which way their judgments lean 'em,\n" +
		+ "''T is strange---the Hebrew noun which means \"I am,\"\n" +
		+ "The English always use to govern d--n.'",
		Map.of( QUOTE_APOSTROPHE, 5 )
		);


		private void parse( final String text, final Map<TokenType, Integer> tally ) {
		- final var parser = new Parser( text );
		+ final var contractions = new Contractions.Builder().build();
		+ final var parser = new Parser( text, contractions );
		final var actual = new HashMap<TokenType, Integer>();

Delta	169 lines added, 108 lines removed, 61-line increase