| Author | Dave Jarvis <email> |
|---|---|
| Date | 2022-10-07 23:23:44 GMT-0700 |
| Commit | 944cce6277a98ac6a533f4f958a1a818dd6d4bbd |
| Parent | 8509d5e |
| import java.util.Properties; | ||
| -import static com.whitemagicsoftware.keenquotes.lex.FilterType.FILTER_PLAIN; | ||
| -import static com.whitemagicsoftware.keenquotes.lex.FilterType.FILTER_XML; | ||
| import static java.lang.String.format; | ||
| import static java.lang.System.*; | ||
| else { | ||
| try { | ||
| - final var filter = settings.filterXml() ? FILTER_XML : FILTER_PLAIN; | ||
| + final var filter = settings.filterXml(); | ||
| final var c = new Curler( contractions, filter ); | ||
| return mGlyph == glyph; | ||
| } | ||
| + | ||
| + public String text() { | ||
| + return Character.toString( mGlyph ); | ||
| + } | ||
| } | ||
| token = CLOSING_GROUP; | ||
| } | ||
| - else if( curr == '“' ) { | ||
| - token = QUOTE_DOUBLE_OPENING; | ||
| + else if( LEX_DOUBLE_QUOTE_OPENING.equals( curr ) ) { | ||
| + token = QUOTE_DOUBLE_OPENING.with( LEX_DOUBLE_QUOTE_OPENING ); | ||
| } | ||
| - else if( curr == '”' ) { | ||
| - token = QUOTE_DOUBLE_CLOSING; | ||
| + else if( LEX_DOUBLE_QUOTE_CLOSING.equals( curr ) ) { | ||
| + token = QUOTE_DOUBLE_CLOSING.with( LEX_DOUBLE_QUOTE_CLOSING ); | ||
| } | ||
| - else if( curr == '‘' ) { | ||
| - token = QUOTE_SINGLE_OPENING; | ||
| + else if( LEX_SINGLE_QUOTE_OPENING.equals( curr ) ) { | ||
| + token = QUOTE_SINGLE_OPENING.with( LEX_SINGLE_QUOTE_OPENING ); | ||
| } | ||
| - else if( curr == '’' ) { | ||
| - token = QUOTE_SINGLE_CLOSING; | ||
| + else if( LEX_SINGLE_QUOTE_CLOSING.equals( curr ) ) { | ||
| + token = QUOTE_SINGLE_CLOSING.with( LEX_SINGLE_QUOTE_CLOSING ); | ||
| } | ||
| else if( curr == '\\' ) { |
| package com.whitemagicsoftware.keenquotes.parser; | ||
| -import com.whitemagicsoftware.keenquotes.lex.FilterType; | ||
| -import com.whitemagicsoftware.keenquotes.lex.LexemeGlyph; | ||
| - | ||
| -import java.util.Map; | ||
| import java.util.concurrent.atomic.AtomicInteger; | ||
| import java.util.function.Consumer; | ||
| import java.util.function.Function; | ||
| -import static com.whitemagicsoftware.keenquotes.lex.LexemeGlyph.LEX_DOUBLE_QUOTE_OPENING_LOW; | ||
| -import static com.whitemagicsoftware.keenquotes.parser.TokenType.*; | ||
| -import static java.util.Map.entry; | ||
| -import static java.util.Map.ofEntries; | ||
| +import static com.whitemagicsoftware.keenquotes.lex.FilterType.FILTER_PLAIN; | ||
| +import static com.whitemagicsoftware.keenquotes.lex.FilterType.FILTER_XML; | ||
| /** | ||
| * Resolves straight quotes into curly quotes throughout a document. | ||
| */ | ||
| @SuppressWarnings( "unused" ) | ||
| public class Curler implements Function<String, String> { | ||
| - /** | ||
| - * Provides an entity-based set of {@link Token} replacements. | ||
| - */ | ||
| - public static final Map<TokenType, String> ENTITIES = ofEntries( | ||
| - entry( QUOTE_OPENING_SINGLE, "&lsquo;" ), | ||
| - entry( QUOTE_CLOSING_SINGLE, "&rsquo;" ), | ||
| - entry( QUOTE_OPENING_DOUBLE, "&ldquo;" ), | ||
| - entry( QUOTE_CLOSING_DOUBLE, "&rdquo;" ), | ||
| - entry( QUOTE_STRAIGHT_SINGLE, "'" ), | ||
| - entry( QUOTE_STRAIGHT_DOUBLE, "\"" ), | ||
| - entry( QUOTE_APOSTROPHE, "&apos;" ), | ||
| - entry( QUOTE_PRIME_SINGLE, "&prime;" ), | ||
| - entry( QUOTE_PRIME_DOUBLE, "&Prime;" ), | ||
| - entry( QUOTE_PRIME_TRIPLE, "&tprime;" ), | ||
| - entry( QUOTE_PRIME_QUADRUPLE, "&qprime;" ) | ||
| - ); | ||
| - | ||
| - /** | ||
| - * Provides a character-based set of {@link Token} replacements. | ||
| - */ | ||
| - public static final Map<TokenType, String> CHARS = ofEntries( | ||
| - entry( QUOTE_OPENING_SINGLE, "‘" ), | ||
| - entry( QUOTE_CLOSING_SINGLE, "’" ), | ||
| - entry( QUOTE_OPENING_DOUBLE, "“" ), | ||
| - entry( QUOTE_CLOSING_DOUBLE, "”" ), | ||
| - entry( QUOTE_STRAIGHT_SINGLE, "'" ), | ||
| - entry( QUOTE_STRAIGHT_DOUBLE, "\"" ), | ||
| - entry( QUOTE_APOSTROPHE, "’" ), | ||
| - entry( QUOTE_PRIME_SINGLE, "′" ), | ||
| - entry( QUOTE_PRIME_DOUBLE, "″" ), | ||
| - entry( QUOTE_PRIME_TRIPLE, "‴" ), | ||
| - entry( QUOTE_PRIME_QUADRUPLE, "⁗" ) | ||
| - ); | ||
| - | ||
| - /** | ||
| - * Glyphs not found in the table will use the document's glyph. | ||
| - */ | ||
| - public static final Map<LexemeGlyph, String> I18N_ENTITIES = ofEntries( | ||
| - entry( LEX_DOUBLE_QUOTE_OPENING_LOW, "&bdquo;" ) | ||
| - ); | ||
| - | ||
| - /** | ||
| - * Glyphs not found in the table will use the value from the document. | ||
| - */ | ||
| - public static final Map<LexemeGlyph, String> I18N_CHARS = ofEntries(); | ||
| private final Contractions mContractions; | ||
| - private final Map<TokenType, String> mReplacements; | ||
| - private final Map<LexemeGlyph, String> mI18n; | ||
| - private final FilterType mFilterType; | ||
| - | ||
| - /** | ||
| - * Maps quotes to HTML entities. | ||
| - * | ||
| - * @param c Contractions listings. | ||
| - * @param parserType Creates a parser based on document content structure. | ||
| - */ | ||
| - public Curler( final Contractions c, final FilterType parserType ) { | ||
| - this( c, ENTITIES, I18N_ENTITIES, parserType ); | ||
| - } | ||
| + private final boolean mEntities; | ||
| /** | ||
| * Maps quotes to curled character equivalents. | ||
| * | ||
| - * @param c Contractions listings. | ||
| - * @param replacements Map of recognized quotes to output types (entity or | ||
| - * Unicode character). | ||
| + * @param c Contractions listings. | ||
| + * @param entities {@code true} to convert quotation marks to HTML entities. | ||
| */ | ||
| public Curler( | ||
| final Contractions c, | ||
| - final Map<TokenType, String> replacements, | ||
| - final Map<LexemeGlyph, String> i18n, | ||
| - final FilterType parserType | ||
| + final boolean entities | ||
| ) { | ||
| assert c != null; | ||
| - assert replacements != null; | ||
| - assert !replacements.isEmpty(); | ||
| - assert i18n != null; | ||
| - assert parserType != null; | ||
| mContractions = c; | ||
| - mReplacements = replacements; | ||
| - mI18n = i18n; | ||
| - mFilterType = parserType; | ||
| + mEntities = entities; | ||
| } | ||
| text, | ||
| mContractions, | ||
| - replace( output, offset, mReplacements, mI18n ), | ||
| - mFilterType.filter() | ||
| + replace( output, offset, mEntities ), | ||
| + (mEntities ? FILTER_XML : FILTER_PLAIN).filter() | ||
| ); | ||
| return output.toString(); | ||
| } | ||
| /** | ||
| * Replaces non-ambiguous tokens with their equivalent string representation. | ||
| * | ||
| - * @param output Continuously updated result document. | ||
| - * @param offset Accumulating index where {@link Token} is replaced. | ||
| - * @param replacements Maps {@link TokenType}s to replacement strings. | ||
| - * @param i18n Maps {@link LexemeGlyph}s to internationalized strings. | ||
| + * @param output Continuously updated result document. | ||
| + * @param offset Accumulating index where {@link Token} is replaced. | ||
| + * @param entities {@code true} to convert quotation marks to HTML entities. | ||
| * @return Instructions to replace a {@link Token} in the result document. | ||
| */ | ||
| public static Consumer<Token> replace( | ||
| final StringBuilder output, | ||
| final AtomicInteger offset, | ||
| - final Map<TokenType, String> replacements, | ||
| - final Map<LexemeGlyph, String> i18n | ||
| + final boolean entities | ||
| ) { | ||
| return token -> { | ||
| if( !token.isAmbiguous() ) { | ||
| - final var text = token.toString( replacements, i18n ); | ||
| + final var text = token.toString( entities ); | ||
| output.replace( | ||
| emit( QUOTE_AMBIGUOUS_DOUBLE, lex2 ); | ||
| } | ||
| - // International opening quotation mark. | ||
| + // International opening double quotation mark. | ||
| else if( match( ANY, QUOTE_DOUBLE_OPENING, ANY, ANY ) ) { | ||
| emit( QUOTE_OPENING_DOUBLE, lex2 ); | ||
| + } | ||
| + // International opening single quotation mark. | ||
| + else if( match( ANY, QUOTE_SINGLE_OPENING, ANY, ANY ) ) { | ||
| + emit( QUOTE_OPENING_SINGLE, lex2 ); | ||
| + } | ||
| + // International closing double quotation mark. | ||
| + else if( match( ANY, ANY, ANY, QUOTE_DOUBLE_CLOSING ) ) { | ||
| + emit( QUOTE_CLOSING_DOUBLE, lex4 ); | ||
| + } | ||
| + // International closing single quotation mark. | ||
| + else if( match( ANY, ANY, ANY, QUOTE_SINGLE_CLOSING ) ) { | ||
| + emit( QUOTE_CLOSING_SINGLE, lex4 ); | ||
| } | ||
| // Ambiguous (no match) |
| import java.util.Map; | ||
| +import static com.whitemagicsoftware.keenquotes.lex.LexemeGlyph.*; | ||
| import static com.whitemagicsoftware.keenquotes.parser.TokenType.*; | ||
| +import static java.util.Map.entry; | ||
| +import static java.util.Map.ofEntries; | ||
| /** | ||
| * Represents a high-level token read from a text document. | ||
| */ | ||
| final class Token implements Comparable<Token>, Stem { | ||
| + /** | ||
| + * Provides an entity-based set of {@link Token} replacements. | ||
| + */ | ||
| + private static final Map<TokenType, String> ENTITIES = ofEntries( | ||
| + entry( QUOTE_OPENING_SINGLE, "&lsquo;" ), | ||
| + entry( QUOTE_CLOSING_SINGLE, "&rsquo;" ), | ||
| + entry( QUOTE_OPENING_DOUBLE, "&ldquo;" ), | ||
| + entry( QUOTE_CLOSING_DOUBLE, "&rdquo;" ), | ||
| + entry( QUOTE_STRAIGHT_SINGLE, "'" ), | ||
| + entry( QUOTE_STRAIGHT_DOUBLE, "\"" ), | ||
| + entry( QUOTE_APOSTROPHE, "&apos;" ), | ||
| + entry( QUOTE_PRIME_SINGLE, "&prime;" ), | ||
| + entry( QUOTE_PRIME_DOUBLE, "&Prime;" ), | ||
| + entry( QUOTE_PRIME_TRIPLE, "&tprime;" ), | ||
| + entry( QUOTE_PRIME_QUADRUPLE, "&qprime;" ) | ||
| + ); | ||
| + | ||
| + /** | ||
| + * Provides a character-based set of {@link Token} replacements. | ||
| + */ | ||
| + private static final Map<TokenType, String> CHARS = ofEntries( | ||
| + entry( QUOTE_OPENING_SINGLE, "‘" ), | ||
| + entry( QUOTE_CLOSING_SINGLE, "’" ), | ||
| + entry( QUOTE_OPENING_DOUBLE, "“" ), | ||
| + entry( QUOTE_CLOSING_DOUBLE, "”" ), | ||
| + entry( QUOTE_STRAIGHT_SINGLE, "'" ), | ||
| + entry( QUOTE_STRAIGHT_DOUBLE, "\"" ), | ||
| + entry( QUOTE_APOSTROPHE, "’" ), | ||
| + entry( QUOTE_PRIME_SINGLE, "′" ), | ||
| + entry( QUOTE_PRIME_DOUBLE, "″" ), | ||
| + entry( QUOTE_PRIME_TRIPLE, "‴" ), | ||
| + entry( QUOTE_PRIME_QUADRUPLE, "⁗" ) | ||
| + ); | ||
| + | ||
| + /** | ||
| + * Glyphs not found in the table will use the document's glyph. | ||
| + */ | ||
| + private static final Map<LexemeGlyph, String> I18N_ENTITIES = ofEntries( | ||
| + entry( LEX_DOUBLE_QUOTE_OPENING_LOW, "&bdquo;" ), | ||
| + entry( LEX_DOUBLE_CHEVRON_LEFT, "&laquo;" ), | ||
| + entry( LEX_DOUBLE_CHEVRON_RIGHT, "&raquo;" ), | ||
| + entry( LEX_SINGLE_CHEVRON_LEFT, "&lsaquo;" ), | ||
| + entry( LEX_SINGLE_CHEVRON_RIGHT, "&rsaquo;" ) | ||
| + ); | ||
| + | ||
| /** | ||
| * Denotes that the token does not represent a value in the parsed document. | ||
| } | ||
| - public String toString( | ||
| - final Map<TokenType, String> entities, | ||
| - final Map<LexemeGlyph, String> i18n ) { | ||
| - return i18n.getOrDefault( | ||
| - mLexeme.getType().glyph(), | ||
| - entities.get( getType() ) | ||
| - ); | ||
| + /** | ||
| + * Converts this token to its string representation, which will either be | ||
| + * an HTML entity or a character. | ||
| + * | ||
| + * @param entities {@code true} to convert quotation marks to HTML entities. | ||
| + * @return A plain quotation mark character or an HTML entity. | ||
| + */ | ||
| + public String toString( final boolean entities ) { | ||
| + final var glyph = mLexeme.getType().glyph(); | ||
| + | ||
| + return entities | ||
| + ? I18N_ENTITIES.getOrDefault( glyph, ENTITIES.get( getType() ) ) | ||
| + : CHARS.getOrDefault( getType(), glyph.text() ); | ||
| } | ||
| @Disabled | ||
| + @SuppressWarnings( "unused" ) | ||
| void test_Resolve_InvalidGrammar_AmbiguousRemain() throws IOException { | ||
| test( "invalid-grammar.txt" ); | ||
| input, | ||
| CONTRACTIONS, | ||
| - replace( output, offset, ENTITIES, I18N_ENTITIES ), | ||
| + replace( output, offset, true ), | ||
| filter -> false | ||
| ); | ||
| package com.whitemagicsoftware.keenquotes.parser; | ||
| -import com.whitemagicsoftware.keenquotes.lex.FilterType; | ||
| import org.junit.jupiter.api.Disabled; | ||
| import org.junit.jupiter.api.Test; | ||
| import org.junit.jupiter.params.ParameterizedTest; | ||
| import org.junit.jupiter.params.provider.ValueSource; | ||
| import java.io.IOException; | ||
| import java.util.function.Function; | ||
| -import static com.whitemagicsoftware.keenquotes.lex.FilterType.FILTER_PLAIN; | ||
| -import static com.whitemagicsoftware.keenquotes.lex.FilterType.FILTER_XML; | ||
| import static com.whitemagicsoftware.keenquotes.texts.TestResource.open; | ||
| import static com.whitemagicsoftware.keenquotes.texts.TestResource.readPairs; | ||
| @Test | ||
| public void test_Parse_UncurledQuotes1_CurlyQuotes() throws IOException { | ||
| - testCurler( createCurler( FILTER_PLAIN ), "unambiguous-1-pass.txt" ); | ||
| + testCurler( createCurler( true ), "unambiguous-1-pass.txt" ); | ||
| } | ||
| @Test | ||
| public void test_Parse_UncurledQuotes2_CurlyQuotes() throws IOException { | ||
| - testCurler( createCurler( FILTER_PLAIN ), "unambiguous-2-pass.txt" ); | ||
| + testCurler( createCurler( true ), "unambiguous-2-pass.txt" ); | ||
| } | ||
| @Disabled | ||
| + @SuppressWarnings( {"unused", "JUnit3StyleTestMethodInJUnit4Class"} ) | ||
| public void test_Parse_AmbiguousQuotes_PartiallyCurled() throws IOException { | ||
| - testCurler( createCurler( FILTER_PLAIN ), "ambiguous-n-pass.txt" ); | ||
| + testCurler( createCurler( false ), "ambiguous-n-pass.txt" ); | ||
| } | ||
| @Test | ||
| public void test_Parse_UncurledQuotesXml_CurlyQuotes() throws IOException { | ||
| - testCurler( createCurler( FILTER_XML ), "xml.txt" ); | ||
| + testCurler( createCurler( true ), "xml.txt" ); | ||
| } | ||
| @Test | ||
| public void test_Parse_UncurledQuotesI11l_CurlyQuotes() throws IOException { | ||
| - testCurler( createCurler( FILTER_PLAIN ), "i18n.txt" ); | ||
| + testCurler( createCurler( true ), "i18n.txt" ); | ||
| } | ||
| } | ||
| - final var curler = createCurler( FILTER_XML ); | ||
| + final var curler = createCurler( true ); | ||
| System.out.println( curler.apply( sb.toString() ) ); | ||
| } | ||
| } | ||
| - private Function<String, String> createCurler( final FilterType parserType ) { | ||
| - return new Curler( new Contractions.Builder().build(), parserType ); | ||
| + private Function<String, String> createCurler( final boolean entities ) { | ||
| + return new Curler( createContractions(), entities ); | ||
| + } | ||
| + | ||
| + private Contractions createContractions() { | ||
| + return new Contractions.Builder().build(); | ||
| } | ||
| } | ||
| import java.util.concurrent.atomic.AtomicInteger; | ||
| -import static com.whitemagicsoftware.keenquotes.parser.Curler.*; | ||
| +import static com.whitemagicsoftware.keenquotes.parser.Curler.replace; | ||
| import static com.whitemagicsoftware.keenquotes.texts.TestResource.readPairs; | ||
| import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| input, | ||
| CONTRACTIONS, | ||
| - replace( output, offset, ENTITIES, I18N_ENTITIES ), | ||
| + replace( output, offset, true ), | ||
| filter -> false | ||
| ); | ||
| # ######################################################################## | ||
| +# French | ||
| +# ######################################################################## | ||
| +«Ce n'est pas la mer à boire,» il me dit. | ||
| +&laquo;Ce n&apos;est pas la mer à boire,&raquo; il me dit. | ||
| + | ||
| +Dit il, ‹Ce n'est pas la mer à boire?› | ||
| +Dit il, &lsaquo;Ce n&apos;est pas la mer à boire?&rsaquo; | ||
| + | ||
| +# ######################################################################## | ||
| # Dutch | ||
| # ######################################################################## |
| "’Kearney lives on the banks of Killarney—’ | ||
| -“’Kearney lives on the banks of Killarney—’ | ||
| +“’Kearney lives on the banks of Killarney—’ | ||
| # ######################################################################## |
| Delta | 128 lines added, 122 lines removed, 6-line increase |
|---|