| 4 | 4 | import java.util.ArrayList; |
| 5 | 5 | import java.util.HashSet; |
| 6 | import java.util.List; |
|
| 6 | 7 | import java.util.Set; |
| 7 | 8 | |
| ... | ||
| 32 | 33 | private final Set<String> mEndedAmbiguous = new HashSet<>(); |
| 33 | 34 | |
| 34 | public void withBeganUnambiguous( final Set<String> words ) { |
|
| 35 | public void withBeganUnambiguous( final List<String> words ) { |
|
| 35 | 36 | mBeganUnambiguous.addAll( words ); |
| 36 | 37 | } |
| 37 | 38 | |
| 38 | public void withEndedUnambiguous( final Set<String> words ) { |
|
| 39 | public void withEndedUnambiguous( final List<String> words ) { |
|
| 39 | 40 | mEndedUnambiguous.addAll( words ); |
| 40 | 41 | } |
| 41 | 42 | |
| 42 | public void withBeganAmbiguous( final Set<String> words ) { |
|
| 43 | public void withBeganAmbiguous( final List<String> words ) { |
|
| 43 | 44 | mBeganAmbiguous.addAll( words ); |
| 44 | 45 | } |
| 45 | 46 | |
| 46 | public void withEndedAmbiguous( final Set<String> words ) { |
|
| 47 | public void withEndedAmbiguous( final List<String> words ) { |
|
| 47 | 48 | mEndedAmbiguous.addAll( words ); |
| 48 | 49 | } |
| ... | ||
| 55 | 56 | */ |
| 56 | 57 | public Contractions build() { |
| 57 | mBeganUnambiguous.addAll( from( mBeganUnambiguous, BEGAN_UNAMBIGUOUS ) ); |
|
| 58 | mEndedUnambiguous.addAll( from( mEndedUnambiguous, ENDED_UNAMBIGUOUS ) ); |
|
| 59 | mBeganAmbiguous.addAll( from( mBeganAmbiguous, BEGAN_AMBIGUOUS ) ); |
|
| 60 | mEndedAmbiguous.addAll( from( mEndedAmbiguous, ENDED_AMBIGUOUS ) ); |
|
| 58 | mBeganUnambiguous.addAll( BEGAN_UNAMBIGUOUS ); |
|
| 59 | mEndedUnambiguous.addAll( ENDED_UNAMBIGUOUS ); |
|
| 60 | mBeganAmbiguous.addAll( BEGAN_AMBIGUOUS ); |
|
| 61 | mEndedAmbiguous.addAll( ENDED_AMBIGUOUS ); |
|
| 61 | 62 | |
| 62 | 63 | return new Contractions( this ); |
| ... | ||
| 180 | 181 | "neath", |
| 181 | 182 | "nother", |
| 183 | "nuff", |
|
| 182 | 184 | "onna", |
| 183 | 185 | "onna'", |
| 4 | 4 | import java.util.Map; |
| 5 | 5 | import java.util.function.Consumer; |
| 6 | import java.util.function.Function; |
|
| 6 | 7 | |
| 7 | 8 | import static com.whitemagicsoftware.keenquotes.TokenType.*; |
| 8 | 9 | import static java.util.Collections.sort; |
| 9 | 10 | |
| 10 | 11 | /** |
| 11 | 12 | * Responsible for converting curly quotes to HTML entities throughout a |
| 12 | 13 | * text string. |
| 13 | 14 | */ |
| 14 | public class Converter { |
|
| 15 | public class Converter implements Function<String, String> { |
|
| 15 | 16 | private static final Map<TokenType, String> REPLACEMENTS = Map.of( |
| 16 | 17 | QUOTE_OPENING_SINGLE, "‘", |
| ... | ||
| 24 | 25 | QUOTE_PRIME_DOUBLE, "″" |
| 25 | 26 | ); |
| 27 | ||
| 28 | private final Consumer<Lexeme> mUnresolved; |
|
| 29 | private final Contractions mContractions; |
|
| 30 | ||
| 31 | public Converter( final Consumer<Lexeme> unresolved ) { |
|
| 32 | this( unresolved, new Contractions.Builder().build() ); |
|
| 33 | } |
|
| 34 | ||
| 35 | public Converter( final Consumer<Lexeme> unresolved, final Contractions c ) { |
|
| 36 | mUnresolved = unresolved; |
|
| 37 | mContractions = c; |
|
| 38 | } |
|
| 26 | 39 | |
| 27 | 40 | /** |
| 28 | 41 | * Converts straight quotes to curly quotes and primes. Any quotation marks |
| 29 | * that cannot be converted are passed to the {@link Consumer}. |
|
| 42 | * that cannot be converted are passed to the {@link Consumer}. This method |
|
| 43 | * is re-entrant, but not tested to be thread-safe. |
|
| 30 | 44 | * |
| 31 | * @param text The text to parse. |
|
| 32 | * @param unresolved Recipient for ambiguous {@link Lexeme}s. |
|
| 45 | * @param text The text to parse. |
|
| 33 | 46 | * @return The given text string with as many straight quotes converted to |
| 34 | 47 | * curly quotes as is feasible. |
| 35 | 48 | */ |
| 36 | public static String convert( |
|
| 37 | final String text, final Consumer<Lexeme> unresolved ) { |
|
| 38 | final var parser = new Parser( text ); |
|
| 49 | @Override |
|
| 50 | public String apply( final String text ) { |
|
| 51 | final var parser = new Parser( text, mContractions ); |
|
| 39 | 52 | final var tokens = new ArrayList<Token>(); |
| 40 | 53 | |
| 41 | 54 | // Parse the tokens and consume all unresolved lexemes. |
| 42 | parser.parse( tokens::add, unresolved ); |
|
| 55 | parser.parse( tokens::add, mUnresolved ); |
|
| 43 | 56 | |
| 44 | 57 | // The parser may emit tokens in any order. |
| 34 | 34 | |
| 35 | 35 | public void run() { |
| 36 | if( getSettings().displayList() ) { |
|
| 37 | displayList(); |
|
| 36 | final var settings = getSettings(); |
|
| 37 | final var contractions = createContractions( settings ); |
|
| 38 | ||
| 39 | if( settings.displayList() ) { |
|
| 40 | System.out.println( contractions.toString() ); |
|
| 38 | 41 | } |
| 39 | 42 | else { |
| 40 | convert(); |
|
| 43 | convert( new Converter( System.err::println, contractions ) ); |
|
| 41 | 44 | } |
| 42 | 45 | } |
| 43 | 46 | |
| 44 | private void displayList() { |
|
| 45 | System.out.println( new Contractions.Builder().build().toString() ); |
|
| 47 | private Contractions createContractions( final Settings settings ) { |
|
| 48 | final var builder = new Contractions.Builder(); |
|
| 49 | ||
| 50 | builder.withBeganUnambiguous( settings.getBeganUnambiguous() ); |
|
| 51 | builder.withEndedUnambiguous( settings.getEndedUnambiguous() ); |
|
| 52 | builder.withBeganAmbiguous( settings.getBeganAmbiguous() ); |
|
| 53 | builder.withEndedAmbiguous( settings.getEndedAmbiguous() ); |
|
| 54 | ||
| 55 | return builder.build(); |
|
| 46 | 56 | } |
| 47 | 57 | |
| 48 | private void convert() { |
|
| 49 | final StringBuilder sb = new StringBuilder(); |
|
| 58 | private void convert( final Converter converter ) { |
|
| 59 | final var sb = new StringBuilder(); |
|
| 50 | 60 | |
| 51 | try( final BufferedReader reader = open( System.in ) ) { |
|
| 61 | try( final var reader = open( System.in ) ) { |
|
| 52 | 62 | String line; |
| 53 | 63 | final var sep = System.lineSeparator(); |
| 54 | 64 | |
| 55 | 65 | while( (line = reader.readLine()) != null ) { |
| 56 | 66 | sb.append( line ); |
| 57 | 67 | sb.append( sep ); |
| 58 | 68 | } |
| 59 | 69 | |
| 60 | System.out.println( |
|
| 61 | Converter.convert( sb.toString(), System.err::println ) |
|
| 62 | ); |
|
| 70 | System.out.println( converter.apply( sb.toString() ) ); |
|
| 63 | 71 | } catch( final Exception ex ) { |
| 64 | 72 | ex.printStackTrace( System.err ); |
| ... | ||
| 112 | 120 | } |
| 113 | 121 | |
| 122 | /** |
|
| 123 | * Main application entry point. |
|
| 124 | * |
|
| 125 | * @param args Command-line arguments. |
|
| 126 | */ |
|
| 114 | 127 | public static void main( final String[] args ) { |
| 115 | 128 | final var app = new KeenQuotes(); |
| 65 | 65 | lexeme = createLexeme( QUOTE_DOUBLE, began, i.getIndex() ); |
| 66 | 66 | } |
| 67 | else if( curr == '‘') { |
|
| 67 | else if( curr == '‘' ) { |
|
| 68 | 68 | lexeme = createLexeme( QUOTE_SINGLE_OPENING, began, i.getIndex() ); |
| 69 | 69 | } |
| 70 | else if( curr == '’') { |
|
| 70 | else if( curr == '’' ) { |
|
| 71 | 71 | lexeme = createLexeme( QUOTE_SINGLE_CLOSING, began, i.getIndex() ); |
| 72 | } |
|
| 73 | else if( curr == '-' && peek( i ) == '-' || curr == '—' ) { |
|
| 74 | slurp( i, ( next, ci ) -> next == '-' || next == '—' ); |
|
| 75 | ||
| 76 | lexeme = createLexeme( DASH, began, i.getIndex() ); |
|
| 77 | 72 | } |
| 78 | 73 | else if( isDigit( curr ) || isNumeric( curr ) && isDigit( peek( i ) ) ) { |
| ... | ||
| 85 | 80 | lexeme = createLexeme( isWord ? WORD : NUMBER, began, i.getIndex() ); |
| 86 | 81 | } |
| 87 | else if( curr == '-' ) { |
|
| 82 | else if( curr == '-' && peek( i ) != '-' ) { |
|
| 88 | 83 | lexeme = createLexeme( HYPHEN, began, i.getIndex() ); |
| 84 | } |
|
| 85 | else if( isDash( curr ) ) { |
|
| 86 | slurp( i, ( next, ci ) -> isDash( next ) ); |
|
| 87 | ||
| 88 | lexeme = createLexeme( DASH, began, i.getIndex() ); |
|
| 89 | 89 | } |
| 90 | 90 | else if( curr == '.' ) { |
| 91 | // Parse all consecutive periods into an ellipsis lexeme. This will |
|
| 92 | // not capture space-separated ellipsis (such as ". . ."). |
|
| 93 | 91 | lexeme = createLexeme( |
| 94 | slurp( i, ( next, ci ) -> next == '.' ) == 0 ? PERIOD : ELLIPSIS, |
|
| 92 | slurp( i, ( next, ci ) -> |
|
| 93 | next == '.' || (next == ' ' && peek( ci ) == '.') ) == 0 |
|
| 94 | ? PERIOD |
|
| 95 | : ELLIPSIS, |
|
| 95 | 96 | began, i.getIndex() |
| 96 | 97 | ); |
| ... | ||
| 177 | 178 | return |
| 178 | 179 | curr == '.' || curr == ',' || curr == '-' || curr == '+' || curr == '^'; |
| 180 | } |
|
| 181 | ||
| 182 | /** |
|
| 183 | * Answers whether the given character may be part of an en- or em-dash. |
|
| 184 | * This must be called after it is known that the character isn't a lone |
|
| 185 | * hyphen. |
|
| 186 | * |
|
| 187 | * @param curr The character to check as being a dash. |
|
| 188 | * @return {@code true} if the given character is part of a dash. |
|
| 189 | */ |
|
| 190 | private boolean isDash( final char curr ) { |
|
| 191 | return curr == '-' || curr == '–' || curr == '—'; |
|
| 179 | 192 | } |
| 180 | 193 | |
| 44 | 44 | */ |
| 45 | 45 | private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE = |
| 46 | new LexemeType[]{SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL, EOP}; |
|
| 46 | new LexemeType[]{SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL, |
|
| 47 | EOP}; |
|
| 47 | 48 | |
| 48 | 49 | /** |
| ... | ||
| 96 | 97 | */ |
| 97 | 98 | private int mClosingSingleQuote; |
| 98 | ||
| 99 | /** |
|
| 100 | * Constructs a new {@link Parser} using the default contraction sets |
|
| 101 | * to help resolve some ambiguous scenarios. |
|
| 102 | * |
|
| 103 | * @param text The prose to parse, containing zero or more quotation |
|
| 104 | * characters. |
|
| 105 | */ |
|
| 106 | public Parser( final String text ) { |
|
| 107 | this( text, new Contractions.Builder().build() ); |
|
| 108 | } |
|
| 109 | 99 | |
| 110 | 100 | /** |
| 4 | 4 | import picocli.CommandLine; |
| 5 | 5 | |
| 6 | import java.util.List; |
|
| 6 | 7 | import java.util.concurrent.Callable; |
| 8 | ||
| 9 | import static java.util.Arrays.asList; |
|
| 10 | import static java.util.Collections.emptyList; |
|
| 7 | 11 | |
| 8 | 12 | @CommandLine.Command( |
| ... | ||
| 18 | 22 | private final KeenQuotes mMain; |
| 19 | 23 | |
| 20 | // /** |
|
| 21 | // * List of unambiguous contractions having leading apostrophes. |
|
| 22 | // */ |
|
| 23 | // @CommandLine.Option( |
|
| 24 | // names = {"-ub", "--unamb-began"}, |
|
| 25 | // description = |
|
| 26 | // "Contractions to treat as unambiguous (e.g., cause,bout)", |
|
| 27 | // paramLabel = "words" |
|
| 28 | // ) |
|
| 29 | // private String[] mUnambiguousBegan; |
|
| 30 | // |
|
| 31 | // /** |
|
| 32 | // * List of unambiguous contractions having lagging apostrophes. |
|
| 33 | // */ |
|
| 34 | // @CommandLine.Option( |
|
| 35 | // names = {"-ue", "--unamb-ended"}, |
|
| 36 | // description = |
|
| 37 | // "Contractions to treat as unambiguous (e.g., frien,thinkin)", |
|
| 38 | // paramLabel = "words" |
|
| 39 | // ) |
|
| 40 | // private String[] mUnambiguousEnded; |
|
| 41 | // |
|
| 42 | // /** |
|
| 43 | // * List of ambiguous contractions having leading apostrophes. |
|
| 44 | // */ |
|
| 45 | // @CommandLine.Option( |
|
| 46 | // names = {"-ab", "--amb-began"}, |
|
| 47 | // description = |
|
| 48 | // "Contractions to treat as ambiguous (e.g., sup,kay)", |
|
| 49 | // paramLabel = "words" |
|
| 50 | // ) |
|
| 51 | // private String[] mAmbiguousBegan; |
|
| 52 | // |
|
| 53 | // /** |
|
| 54 | // * List of ambiguous contractions having lagging apostrophes. |
|
| 55 | // */ |
|
| 56 | // @CommandLine.Option( |
|
| 57 | // names = {"-ae", "--amb-ended"}, |
|
| 58 | // description = |
|
| 59 | // "Contractions to treat as ambiguous (e.g., gi,o)", |
|
| 60 | // paramLabel = "words" |
|
| 61 | // ) |
|
| 62 | // private String[] mAmbiguousEnded; |
|
| 63 | // |
|
| 24 | /** |
|
| 25 | * List of unambiguous contractions having leading apostrophes. |
|
| 26 | */ |
|
| 27 | @CommandLine.Option( |
|
| 28 | names = {"-ub", "--unamb-began"}, |
|
| 29 | description = |
|
| 30 | "Contractions to treat as unambiguous (e.g., cause,bout)", |
|
| 31 | paramLabel = "words" |
|
| 32 | ) |
|
| 33 | private String[] mBeganUnambiguous; |
|
| 34 | ||
| 35 | /** |
|
| 36 | * List of unambiguous contractions having lagging apostrophes. |
|
| 37 | */ |
|
| 38 | @CommandLine.Option( |
|
| 39 | names = {"-ue", "--unamb-ended"}, |
|
| 40 | description = |
|
| 41 | "Contractions to treat as unambiguous (e.g., frien,thinkin)", |
|
| 42 | paramLabel = "words" |
|
| 43 | ) |
|
| 44 | private String[] mEndedUnambiguous; |
|
| 45 | ||
| 46 | /** |
|
| 47 | * List of ambiguous contractions having leading apostrophes. |
|
| 48 | */ |
|
| 49 | @CommandLine.Option( |
|
| 50 | names = {"-ab", "--amb-began"}, |
|
| 51 | description = |
|
| 52 | "Contractions to treat as ambiguous (e.g., sup,kay)", |
|
| 53 | paramLabel = "words" |
|
| 54 | ) |
|
| 55 | private String[] mBeganAmbiguous; |
|
| 56 | ||
| 57 | /** |
|
| 58 | * List of ambiguous contractions having lagging apostrophes. |
|
| 59 | */ |
|
| 60 | @CommandLine.Option( |
|
| 61 | names = {"-ae", "--amb-ended"}, |
|
| 62 | description = |
|
| 63 | "Contractions to treat as ambiguous (e.g., gi,o)", |
|
| 64 | paramLabel = "words" |
|
| 65 | ) |
|
| 66 | private String[] mEndedAmbiguous; |
|
| 67 | ||
| 64 | 68 | /** |
| 65 | 69 | * Display default values. |
| ... | ||
| 81 | 85 | * @return {@code true} to list the contractions. |
| 82 | 86 | */ |
| 83 | public boolean displayList() { |
|
| 87 | boolean displayList() { |
|
| 84 | 88 | return mDisplayList; |
| 89 | } |
|
| 90 | ||
| 91 | List<String> getBeganUnambiguous() { |
|
| 92 | return nullSafe( mBeganUnambiguous ); |
|
| 93 | } |
|
| 94 | ||
| 95 | List<String> getEndedUnambiguous() { |
|
| 96 | return nullSafe( mEndedUnambiguous ); |
|
| 97 | } |
|
| 98 | ||
| 99 | List<String> getBeganAmbiguous() { |
|
| 100 | return nullSafe( mBeganAmbiguous ); |
|
| 101 | } |
|
| 102 | ||
| 103 | List<String> getEndedAmbiguous() { |
|
| 104 | return nullSafe( mEndedAmbiguous ); |
|
| 105 | } |
|
| 106 | ||
| 107 | private List<String> nullSafe( final String[] words ) { |
|
| 108 | return words == null ? emptyList() : asList( words ); |
|
| 85 | 109 | } |
| 86 | 110 | |
| 12 | 12 | import java.util.function.Function; |
| 13 | 13 | |
| 14 | import static com.whitemagicsoftware.keenquotes.Converter.convert; |
|
| 15 | 14 | import static java.lang.System.out; |
| 16 | 15 | import static org.junit.jupiter.api.Assertions.assertEquals; |
| ... | ||
| 28 | 27 | @Disabled |
| 29 | 28 | public void test_parse_SingleLine_Parsed() { |
| 30 | out.println( convert( |
|
| 31 | "\"’Kearney lives on the banks of Killarney—’", |
|
| 32 | out::println |
|
| 29 | final var converter = new Converter( out::println ); |
|
| 30 | out.println( converter.apply( |
|
| 31 | "\"’Kearney lives on the banks of Killarney—’" |
|
| 33 | 32 | ) ); |
| 34 | 33 | } |
| ... | ||
| 41 | 40 | @Test |
| 42 | 41 | public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException { |
| 43 | testConverter( text -> convert( text, ( lexeme ) -> {} ) ); |
|
| 42 | testConverter( new Converter( ( lex ) -> {} ) ); |
|
| 44 | 43 | } |
| 45 | 44 | |
| ... | ||
| 65 | 64 | } |
| 66 | 65 | |
| 67 | System.out.println( convert( sb.toString(), out::println ) ); |
|
| 66 | final var converter = new Converter( out::println ); |
|
| 67 | System.out.println( converter.apply( sb.toString() ) ); |
|
| 68 | 68 | } |
| 69 | 69 | |
| 50 | 50 | testType( "--", DASH ); |
| 51 | 51 | testType( "---", DASH ); |
| 52 | testType( "–", DASH ); |
|
| 53 | testType( "—", DASH ); |
|
| 54 | testType( "—-—", DASH ); |
|
| 52 | 55 | testType( "...", ELLIPSIS ); |
| 56 | testType( ". .", ELLIPSIS ); |
|
| 57 | testType( ". . .", ELLIPSIS ); |
|
| 58 | testType( ".. ... ....", ELLIPSIS ); |
|
| 53 | 59 | } |
| 54 | 60 |
| 14 | 14 | */ |
| 15 | 15 | class ParserTest { |
| 16 | @SuppressWarnings( "TextBlockMigration" ) |
|
| 16 | 17 | private final static Map<String, Map<TokenType, Integer>> TEST_CASES = |
| 17 | 18 | Map.of( |
| ... | ||
| 26 | 27 | "'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx.", |
| 27 | 28 | Map.of( QUOTE_APOSTROPHE, 5 ), |
| 28 | """ |
|
| 29 | But I must leave the proofs to those who 've seen 'em; |
|
| 30 | But this I heard her say, and can't be wrong |
|
| 31 | And all may think which way their judgments lean 'em, |
|
| 32 | ''T is strange---the Hebrew noun which means "I am," |
|
| 33 | The English always use to govern d--n.' |
|
| 34 | """, |
|
| 29 | "But I must leave the proofs to those who 've seen 'em;\n" + |
|
| 30 | "But this I heard her say, and can't be wrong\n" + |
|
| 31 | "And all may think which way their judgments lean 'em,\n" + |
|
| 32 | "''T is strange---the Hebrew noun which means \"I am,\"\n" + |
|
| 33 | "The English always use to govern d--n.'", |
|
| 35 | 34 | Map.of( QUOTE_APOSTROPHE, 5 ) |
| 36 | 35 | ); |
| ... | ||
| 44 | 43 | |
| 45 | 44 | private void parse( final String text, final Map<TokenType, Integer> tally ) { |
| 46 | final var parser = new Parser( text ); |
|
| 45 | final var contractions = new Contractions.Builder().build(); |
|
| 46 | final var parser = new Parser( text, contractions ); |
|
| 47 | 47 | final var actual = new HashMap<TokenType, Integer>(); |
| 48 | 48 | |