| Author | Dave Jarvis <email> |
|---|---|
| Date | 2021-06-20 14:20:57 GMT-0700 |
| Commit | 8a075f90d7f3c01de3a5fe309308bfe00a4b2d2f |
| Parent | f523bc7 |
| import java.util.ArrayList; | ||
| import java.util.HashSet; | ||
| +import java.util.List; | ||
| import java.util.Set; | ||
| private final Set<String> mEndedAmbiguous = new HashSet<>(); | ||
| - public void withBeganUnambiguous( final Set<String> words ) { | ||
| + public void withBeganUnambiguous( final List<String> words ) { | ||
| mBeganUnambiguous.addAll( words ); | ||
| } | ||
| - public void withEndedUnambiguous( final Set<String> words ) { | ||
| + public void withEndedUnambiguous( final List<String> words ) { | ||
| mEndedUnambiguous.addAll( words ); | ||
| } | ||
| - public void withBeganAmbiguous( final Set<String> words ) { | ||
| + public void withBeganAmbiguous( final List<String> words ) { | ||
| mBeganAmbiguous.addAll( words ); | ||
| } | ||
| - public void withEndedAmbiguous( final Set<String> words ) { | ||
| + public void withEndedAmbiguous( final List<String> words ) { | ||
| mEndedAmbiguous.addAll( words ); | ||
| } | ||
| */ | ||
| public Contractions build() { | ||
| - mBeganUnambiguous.addAll( from( mBeganUnambiguous, BEGAN_UNAMBIGUOUS ) ); | ||
| - mEndedUnambiguous.addAll( from( mEndedUnambiguous, ENDED_UNAMBIGUOUS ) ); | ||
| - mBeganAmbiguous.addAll( from( mBeganAmbiguous, BEGAN_AMBIGUOUS ) ); | ||
| - mEndedAmbiguous.addAll( from( mEndedAmbiguous, ENDED_AMBIGUOUS ) ); | ||
| + mBeganUnambiguous.addAll( BEGAN_UNAMBIGUOUS ); | ||
| + mEndedUnambiguous.addAll( ENDED_UNAMBIGUOUS ); | ||
| + mBeganAmbiguous.addAll( BEGAN_AMBIGUOUS ); | ||
| + mEndedAmbiguous.addAll( ENDED_AMBIGUOUS ); | ||
| return new Contractions( this ); | ||
| "neath", | ||
| "nother", | ||
| + "nuff", | ||
| "onna", | ||
| "onna'", | ||
| import java.util.Map; | ||
| import java.util.function.Consumer; | ||
| +import java.util.function.Function; | ||
| import static com.whitemagicsoftware.keenquotes.TokenType.*; | ||
| import static java.util.Collections.sort; | ||
| /** | ||
| * Responsible for converting curly quotes to HTML entities throughout a | ||
| * text string. | ||
| */ | ||
| -public class Converter { | ||
| +public class Converter implements Function<String, String> { | ||
| private static final Map<TokenType, String> REPLACEMENTS = Map.of( | ||
| QUOTE_OPENING_SINGLE, "‘", | ||
| QUOTE_PRIME_DOUBLE, "″" | ||
| ); | ||
| + | ||
| + private final Consumer<Lexeme> mUnresolved; | ||
| + private final Contractions mContractions; | ||
| + | ||
| + public Converter( final Consumer<Lexeme> unresolved ) { | ||
| + this( unresolved, new Contractions.Builder().build() ); | ||
| + } | ||
| + | ||
| + public Converter( final Consumer<Lexeme> unresolved, final Contractions c ) { | ||
| + mUnresolved = unresolved; | ||
| + mContractions = c; | ||
| + } | ||
| /** | ||
| * Converts straight quotes to curly quotes and primes. Any quotation marks | ||
| - * that cannot be converted are passed to the {@link Consumer}. | ||
| + * that cannot be converted are passed to the {@link Consumer}. This method | ||
| + * is re-entrant, but not tested to be thread-safe. | ||
| * | ||
| - * @param text The text to parse. | ||
| - * @param unresolved Recipient for ambiguous {@link Lexeme}s. | ||
| + * @param text The text to parse. | ||
| * @return The given text string with as many straight quotes converted to | ||
| * curly quotes as is feasible. | ||
| */ | ||
| - public static String convert( | ||
| - final String text, final Consumer<Lexeme> unresolved ) { | ||
| - final var parser = new Parser( text ); | ||
| + @Override | ||
| + public String apply( final String text ) { | ||
| + final var parser = new Parser( text, mContractions ); | ||
| final var tokens = new ArrayList<Token>(); | ||
| // Parse the tokens and consume all unresolved lexemes. | ||
| - parser.parse( tokens::add, unresolved ); | ||
| + parser.parse( tokens::add, mUnresolved ); | ||
| // The parser may emit tokens in any order. | ||
| public void run() { | ||
| - if( getSettings().displayList() ) { | ||
| - displayList(); | ||
| + final var settings = getSettings(); | ||
| + final var contractions = createContractions( settings ); | ||
| + | ||
| + if( settings.displayList() ) { | ||
| + System.out.println( contractions.toString() ); | ||
| } | ||
| else { | ||
| - convert(); | ||
| + convert( new Converter( System.err::println, contractions ) ); | ||
| } | ||
| } | ||
| - private void displayList() { | ||
| - System.out.println( new Contractions.Builder().build().toString() ); | ||
| + private Contractions createContractions( final Settings settings ) { | ||
| + final var builder = new Contractions.Builder(); | ||
| + | ||
| + builder.withBeganUnambiguous( settings.getBeganUnambiguous() ); | ||
| + builder.withEndedUnambiguous( settings.getEndedUnambiguous() ); | ||
| + builder.withBeganAmbiguous( settings.getBeganAmbiguous() ); | ||
| + builder.withEndedAmbiguous( settings.getEndedAmbiguous() ); | ||
| + | ||
| + return builder.build(); | ||
| } | ||
| - private void convert() { | ||
| - final StringBuilder sb = new StringBuilder(); | ||
| + private void convert( final Converter converter ) { | ||
| + final var sb = new StringBuilder(); | ||
| - try( final BufferedReader reader = open( System.in ) ) { | ||
| + try( final var reader = open( System.in ) ) { | ||
| String line; | ||
| final var sep = System.lineSeparator(); | ||
| while( (line = reader.readLine()) != null ) { | ||
| sb.append( line ); | ||
| sb.append( sep ); | ||
| } | ||
| - System.out.println( | ||
| - Converter.convert( sb.toString(), System.err::println ) | ||
| - ); | ||
| + System.out.println( converter.apply( sb.toString() ) ); | ||
| } catch( final Exception ex ) { | ||
| ex.printStackTrace( System.err ); | ||
| } | ||
| + /** | ||
| + * Main application entry point. | ||
| + * | ||
| + * @param args Command-line arguments. | ||
| + */ | ||
| public static void main( final String[] args ) { | ||
| final var app = new KeenQuotes(); | ||
| lexeme = createLexeme( QUOTE_DOUBLE, began, i.getIndex() ); | ||
| } | ||
| - else if( curr == '‘') { | ||
| + else if( curr == '‘' ) { | ||
| lexeme = createLexeme( QUOTE_SINGLE_OPENING, began, i.getIndex() ); | ||
| } | ||
| - else if( curr == '’') { | ||
| + else if( curr == '’' ) { | ||
| lexeme = createLexeme( QUOTE_SINGLE_CLOSING, began, i.getIndex() ); | ||
| } | ||
| - else if( curr == '-' && peek( i ) == '-' || curr == '—' ) { | ||
| - slurp( i, ( next, ci ) -> next == '-' || next == '—' ); | ||
| + else if( curr == '-' && peek( i ) != '-' ) { | ||
| + lexeme = createLexeme( HYPHEN, began, i.getIndex() ); | ||
| + } | ||
| + else if( isDash( curr ) ) { | ||
| + slurp( i, ( next, ci ) -> isDash( next ) ); | ||
| lexeme = createLexeme( DASH, began, i.getIndex() ); | ||
| lexeme = createLexeme( isWord ? WORD : NUMBER, began, i.getIndex() ); | ||
| - } | ||
| - else if( curr == '-' ) { | ||
| - lexeme = createLexeme( HYPHEN, began, i.getIndex() ); | ||
| } | ||
| else if( curr == '.' ) { | ||
| - // Parse all consecutive periods into an ellipsis lexeme. This will | ||
| - // not capture space-separated ellipsis (such as ". . ."). | ||
| lexeme = createLexeme( | ||
| - slurp( i, ( next, ci ) -> next == '.' ) == 0 ? PERIOD : ELLIPSIS, | ||
| + slurp( i, ( next, ci ) -> | ||
| + next == '.' || (next == ' ' && peek( ci ) == '.') ) == 0 | ||
| + ? PERIOD | ||
| + : ELLIPSIS, | ||
| began, i.getIndex() | ||
| ); | ||
| return | ||
| curr == '.' || curr == ',' || curr == '-' || curr == '+' || curr == '^'; | ||
| + } | ||
| + | ||
| + /** | ||
| + * Answers whether the given character may be part of an en- or em-dash. | ||
| + * This must be called after it is known that the character isn't a lone | ||
| + * hyphen. | ||
| + * | ||
| + * @param curr The character to check as being a dash. | ||
| + * @return {@code true} if the given character is part of a dash. | ||
| + */ | ||
| + private boolean isDash( final char curr ) { | ||
| + return curr == '-' || curr == '–' || curr == '—'; | ||
| } | ||
| */ | ||
| private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE = | ||
| - new LexemeType[]{SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL, EOP}; | ||
| + new LexemeType[]{SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL, | ||
| + EOP}; | ||
| /** | ||
| */ | ||
| private int mClosingSingleQuote; | ||
| - | ||
| - /** | ||
| - * Constructs a new {@link Parser} using the default contraction sets | ||
| - * to help resolve some ambiguous scenarios. | ||
| - * | ||
| - * @param text The prose to parse, containing zero or more quotation | ||
| - * characters. | ||
| - */ | ||
| - public Parser( final String text ) { | ||
| - this( text, new Contractions.Builder().build() ); | ||
| - } | ||
| /** | ||
| import picocli.CommandLine; | ||
| +import java.util.List; | ||
| import java.util.concurrent.Callable; | ||
| + | ||
| +import static java.util.Arrays.asList; | ||
| +import static java.util.Collections.emptyList; | ||
| @CommandLine.Command( | ||
| private final KeenQuotes mMain; | ||
| -// /** | ||
| -// * List of unambiguous contractions having leading apostrophes. | ||
| -// */ | ||
| -// @CommandLine.Option( | ||
| -// names = {"-ub", "--unamb-began"}, | ||
| -// description = | ||
| -// "Contractions to treat as unambiguous (e.g., cause,bout)", | ||
| -// paramLabel = "words" | ||
| -// ) | ||
| -// private String[] mUnambiguousBegan; | ||
| -// | ||
| -// /** | ||
| -// * List of unambiguous contractions having lagging apostrophes. | ||
| -// */ | ||
| -// @CommandLine.Option( | ||
| -// names = {"-ue", "--unamb-ended"}, | ||
| -// description = | ||
| -// "Contractions to treat as unambiguous (e.g., frien,thinkin)", | ||
| -// paramLabel = "words" | ||
| -// ) | ||
| -// private String[] mUnambiguousEnded; | ||
| -// | ||
| -// /** | ||
| -// * List of ambiguous contractions having leading apostrophes. | ||
| -// */ | ||
| -// @CommandLine.Option( | ||
| -// names = {"-ab", "--amb-began"}, | ||
| -// description = | ||
| -// "Contractions to treat as ambiguous (e.g., sup,kay)", | ||
| -// paramLabel = "words" | ||
| -// ) | ||
| -// private String[] mAmbiguousBegan; | ||
| -// | ||
| -// /** | ||
| -// * List of ambiguous contractions having lagging apostrophes. | ||
| -// */ | ||
| -// @CommandLine.Option( | ||
| -// names = {"-ae", "--amb-ended"}, | ||
| -// description = | ||
| -// "Contractions to treat as ambiguous (e.g., gi,o)", | ||
| -// paramLabel = "words" | ||
| -// ) | ||
| -// private String[] mAmbiguousEnded; | ||
| -// | ||
| + /** | ||
| + * List of unambiguous contractions having leading apostrophes. | ||
| + */ | ||
| + @CommandLine.Option( | ||
| + names = {"-ub", "--unamb-began"}, | ||
| + description = | ||
| + "Contractions to treat as unambiguous (e.g., cause,bout)", | ||
| + paramLabel = "words" | ||
| + ) | ||
| + private String[] mBeganUnambiguous; | ||
| + | ||
| + /** | ||
| + * List of unambiguous contractions having lagging apostrophes. | ||
| + */ | ||
| + @CommandLine.Option( | ||
| + names = {"-ue", "--unamb-ended"}, | ||
| + description = | ||
| + "Contractions to treat as unambiguous (e.g., frien,thinkin)", | ||
| + paramLabel = "words" | ||
| + ) | ||
| + private String[] mEndedUnambiguous; | ||
| + | ||
| + /** | ||
| + * List of ambiguous contractions having leading apostrophes. | ||
| + */ | ||
| + @CommandLine.Option( | ||
| + names = {"-ab", "--amb-began"}, | ||
| + description = | ||
| + "Contractions to treat as ambiguous (e.g., sup,kay)", | ||
| + paramLabel = "words" | ||
| + ) | ||
| + private String[] mBeganAmbiguous; | ||
| + | ||
| + /** | ||
| + * List of ambiguous contractions having lagging apostrophes. | ||
| + */ | ||
| + @CommandLine.Option( | ||
| + names = {"-ae", "--amb-ended"}, | ||
| + description = | ||
| + "Contractions to treat as ambiguous (e.g., gi,o)", | ||
| + paramLabel = "words" | ||
| + ) | ||
| + private String[] mEndedAmbiguous; | ||
| + | ||
| /** | ||
| * Display default values. | ||
| * @return {@code true} to list the contractions. | ||
| */ | ||
| - public boolean displayList() { | ||
| + boolean displayList() { | ||
| return mDisplayList; | ||
| + } | ||
| + | ||
| + List<String> getBeganUnambiguous() { | ||
| + return nullSafe( mBeganUnambiguous ); | ||
| + } | ||
| + | ||
| + List<String> getEndedUnambiguous() { | ||
| + return nullSafe( mEndedUnambiguous ); | ||
| + } | ||
| + | ||
| + List<String> getBeganAmbiguous() { | ||
| + return nullSafe( mBeganAmbiguous ); | ||
| + } | ||
| + | ||
| + List<String> getEndedAmbiguous() { | ||
| + return nullSafe( mEndedAmbiguous ); | ||
| + } | ||
| + | ||
| + private List<String> nullSafe( final String[] words ) { | ||
| + return words == null ? emptyList() : asList( words ); | ||
| } | ||
| import java.util.function.Function; | ||
| -import static com.whitemagicsoftware.keenquotes.Converter.convert; | ||
| import static java.lang.System.out; | ||
| import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| @Disabled | ||
| public void test_parse_SingleLine_Parsed() { | ||
| - out.println( convert( | ||
| - "\"’Kearney lives on the banks of Killarney—’", | ||
| - out::println | ||
| + final var converter = new Converter( out::println ); | ||
| + out.println( converter.apply( | ||
| + "\"’Kearney lives on the banks of Killarney—’" | ||
| ) ); | ||
| } | ||
| @Test | ||
| public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException { | ||
| - testConverter( text -> convert( text, ( lexeme ) -> {} ) ); | ||
| + testConverter( new Converter( ( lex ) -> {} ) ); | ||
| } | ||
| } | ||
| - System.out.println( convert( sb.toString(), out::println ) ); | ||
| + final var converter = new Converter( out::println ); | ||
| + System.out.println( converter.apply( sb.toString() ) ); | ||
| } | ||
| testType( "--", DASH ); | ||
| testType( "---", DASH ); | ||
| + testType( "–", DASH ); | ||
| + testType( "—", DASH ); | ||
| + testType( "—-—", DASH ); | ||
| testType( "...", ELLIPSIS ); | ||
| + testType( ". .", ELLIPSIS ); | ||
| + testType( ". . .", ELLIPSIS ); | ||
| + testType( ".. ... ....", ELLIPSIS ); | ||
| } | ||
| */ | ||
| class ParserTest { | ||
| + @SuppressWarnings( "TextBlockMigration" ) | ||
| private final static Map<String, Map<TokenType, Integer>> TEST_CASES = | ||
| Map.of( | ||
| "'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx.", | ||
| Map.of( QUOTE_APOSTROPHE, 5 ), | ||
| - """ | ||
| - But I must leave the proofs to those who 've seen 'em; | ||
| - But this I heard her say, and can't be wrong | ||
| - And all may think which way their judgments lean 'em, | ||
| - ''T is strange---the Hebrew noun which means "I am," | ||
| - The English always use to govern d--n.' | ||
| - """, | ||
| + "But I must leave the proofs to those who 've seen 'em;\n" + | ||
| + "But this I heard her say, and can't be wrong\n" + | ||
| + "And all may think which way their judgments lean 'em,\n" + | ||
| + "''T is strange---the Hebrew noun which means \"I am,\"\n" + | ||
| + "The English always use to govern d--n.'", | ||
| Map.of( QUOTE_APOSTROPHE, 5 ) | ||
| ); | ||
| private void parse( final String text, final Map<TokenType, Integer> tally ) { | ||
| - final var parser = new Parser( text ); | ||
| + final var contractions = new Contractions.Builder().build(); | ||
| + final var parser = new Parser( text, contractions ); | ||
| final var actual = new HashMap<TokenType, Integer>(); | ||
| Delta | 169 lines added, 108 lines removed, 61-line increase |
|---|