| 46 | 46 | else { |
| 47 | 47 | try { |
| 48 | final var filter = settings.filterXml() ? FILTER_XML : FILTER_PLAIN; |
|
| 49 | final var c = new Curler( contractions, filter ); |
|
| 48 | final var c = new Curler( |
|
| 49 | contractions, |
|
| 50 | settings.filterXml() ? FILTER_XML : FILTER_PLAIN, |
|
| 51 | settings.entities() |
|
| 52 | ); |
|
| 50 | 53 | |
| 51 | 54 | out.print( convert( c ) ); |
| 77 | 77 | |
| 78 | 78 | /** |
| 79 | * Encode quotation marks using HTML entities. |
|
| 80 | */ |
|
| 81 | @CommandLine.Option( |
|
| 82 | names = {"-e", "--entities"}, |
|
| 83 | description = "Encode quotation marks using HTML entities" |
|
| 84 | ) |
|
| 85 | private boolean mEntities; |
|
| 86 | ||
| 87 | /** |
|
| 79 | 88 | * Enable the {@link XmlFilter}. |
| 80 | 89 | */ |
| ... | ||
| 95 | 104 | * @return {@code true} to list the contractions. |
| 96 | 105 | */ |
| 97 | boolean displayList() { |
|
| 98 | return mDisplayList; |
|
| 99 | } |
|
| 106 | boolean displayList() { return mDisplayList; } |
|
| 100 | 107 | |
| 108 | /** |
|
| 109 | * Answers whether quotation marks within XML elements are ignored. |
|
| 110 | * |
|
| 111 | * @return {@code true} to ignore quotation marks inside XML elements. |
|
| 112 | */ |
|
| 101 | 113 | boolean filterXml() { return mFilterXml; } |
| 114 | ||
| 115 | /** |
|
| 116 | * Answers whether quotation marks must be encoded using HTML entities. |
|
| 117 | * |
|
| 118 | * @return {@code true} to encode quotation marks using HTML entities. |
|
| 119 | */ |
|
| 120 | boolean entities() { return mEntities; } |
|
| 102 | 121 | |
| 103 | 122 | List<String> getBeganUnambiguous() { |
| 1 | /* Copyright 2022 White Magic Software, Ltd. -- All rights reserved. */ |
|
| 2 | package com.whitemagicsoftware.keenquotes.lex; |
|
| 3 | ||
| 4 | /** |
|
| 5 | * Common international quotation mark symbols to allow for re-encoding when |
|
| 6 | * exporting back to text. |
|
| 7 | */ |
|
| 8 | public enum LexemeGlyph { |
|
| 9 | /** |
|
| 10 | * Any other character that's not a quotation mark. |
|
| 11 | */ |
|
| 12 | LEX_OTHER( (char) 0 ), |
|
| 13 | ||
| 14 | LEX_SINGLE_QUOTE( '\'' ), |
|
| 15 | LEX_SINGLE_QUOTE_OPENING( '‘' ), |
|
| 16 | LEX_SINGLE_QUOTE_CLOSING( '’' ), |
|
| 17 | ||
| 18 | LEX_DOUBLE_QUOTE( '"' ), |
|
| 19 | LEX_DOUBLE_QUOTE_OPENING( '“' ), |
|
| 20 | LEX_DOUBLE_QUOTE_CLOSING( '”' ), |
|
| 21 | LEX_DOUBLE_QUOTE_OPENING_LOW( '„' ), |
|
| 22 | ||
| 23 | LEX_DOUBLE_CHEVRON_LEFT( '«' ), |
|
| 24 | LEX_DOUBLE_CHEVRON_RIGHT( '»' ), |
|
| 25 | LEX_SINGLE_CHEVRON_LEFT( '‹' ), |
|
| 26 | LEX_SINGLE_CHEVRON_RIGHT( '›' ); |
|
| 27 | ||
| 28 | private final char mGlyph; |
|
| 29 | ||
| 30 | LexemeGlyph( final char glyph ) { |
|
| 31 | mGlyph = glyph; |
|
| 32 | } |
|
| 33 | ||
| 34 | /** |
|
| 35 | * Answers whether the given character matches the internal glyph. |
|
| 36 | * |
|
| 37 | * @param glyph The character to compare against the internal character. |
|
| 38 | * @return {@code true} iff the characters are equal. |
|
| 39 | */ |
|
| 40 | public boolean equals( final char glyph ) { |
|
| 41 | return mGlyph == glyph; |
|
| 42 | } |
|
| 43 | ||
| 44 | public String text() { |
|
| 45 | return Character.toString( mGlyph ); |
|
| 46 | } |
|
| 47 | } |
|
| 1 | 48 |
| 2 | 2 | package com.whitemagicsoftware.keenquotes.lex; |
| 3 | 3 | |
| 4 | import static com.whitemagicsoftware.keenquotes.lex.LexemeGlyph.*; |
|
| 5 | ||
| 4 | 6 | /** |
| 5 | 7 | * Represents the type of {@link Lexeme} parsed by the {@link Lexer}. |
| 6 | 8 | */ |
| 7 | 9 | @SuppressWarnings( "SpellCheckingInspection" ) |
| 8 | public enum LexemeType { |
|
| 9 | QUOTE_SINGLE, |
|
| 10 | QUOTE_SINGLE_OPENING, |
|
| 11 | QUOTE_SINGLE_CLOSING, |
|
| 12 | QUOTE_DOUBLE, |
|
| 13 | QUOTE_DOUBLE_OPENING, |
|
| 14 | QUOTE_DOUBLE_CLOSING, |
|
| 15 | ESC_SINGLE, |
|
| 16 | ESC_DOUBLE, |
|
| 17 | SOT, |
|
| 18 | EOL, |
|
| 19 | EOP, |
|
| 20 | EOT, |
|
| 21 | SPACE, |
|
| 22 | WORD, |
|
| 23 | NUMBER, |
|
| 24 | PUNCT, |
|
| 25 | OPENING_GROUP, |
|
| 26 | CLOSING_GROUP, |
|
| 27 | HYPHEN, |
|
| 28 | DASH, |
|
| 29 | EQUALS, |
|
| 30 | PERIOD, |
|
| 31 | ELLIPSIS, |
|
| 32 | ENDING, |
|
| 33 | ANY, |
|
| 34 | NONE |
|
| 10 | public final class LexemeType { |
|
| 11 | // @formatter:off |
|
| 12 | public static final LexemeType QUOTE_SINGLE = new LexemeType( LEX_SINGLE_QUOTE ); |
|
| 13 | public static final LexemeType QUOTE_SINGLE_OPENING = new LexemeType( LEX_SINGLE_QUOTE_OPENING ); |
|
| 14 | public static final LexemeType QUOTE_SINGLE_CLOSING = new LexemeType( LEX_SINGLE_QUOTE_CLOSING ); |
|
| 15 | public static final LexemeType QUOTE_DOUBLE = new LexemeType( LEX_DOUBLE_QUOTE ); |
|
| 16 | public static final LexemeType QUOTE_DOUBLE_OPENING = new LexemeType( LEX_DOUBLE_QUOTE_OPENING ); |
|
| 17 | public static final LexemeType QUOTE_DOUBLE_CLOSING = new LexemeType( LEX_DOUBLE_QUOTE_CLOSING ); |
|
| 18 | public static final LexemeType ESC_SINGLE = new LexemeType(); |
|
| 19 | public static final LexemeType ESC_DOUBLE = new LexemeType(); |
|
| 20 | public static final LexemeType PRIME_DOUBLE = new LexemeType(); |
|
| 21 | public static final LexemeType SOT = new LexemeType(); |
|
| 22 | public static final LexemeType EOL = new LexemeType(); |
|
| 23 | public static final LexemeType EOP = new LexemeType(); |
|
| 24 | public static final LexemeType EOT = new LexemeType(); |
|
| 25 | public static final LexemeType SPACE = new LexemeType(); |
|
| 26 | public static final LexemeType WORD = new LexemeType(); |
|
| 27 | public static final LexemeType NUMBER = new LexemeType(); |
|
| 28 | public static final LexemeType PUNCT = new LexemeType(); |
|
| 29 | public static final LexemeType OPENING_GROUP = new LexemeType(); |
|
| 30 | public static final LexemeType CLOSING_GROUP = new LexemeType(); |
|
| 31 | public static final LexemeType HYPHEN = new LexemeType(); |
|
| 32 | public static final LexemeType DASH = new LexemeType(); |
|
| 33 | public static final LexemeType EQUALS = new LexemeType(); |
|
| 34 | public static final LexemeType PERIOD = new LexemeType(); |
|
| 35 | public static final LexemeType ELLIPSIS = new LexemeType(); |
|
| 36 | public static final LexemeType ENDING = new LexemeType(); |
|
| 37 | public static final LexemeType ANY = new LexemeType(); |
|
| 38 | public static final LexemeType NONE = new LexemeType(); |
|
| 39 | // @formatter:on |
|
| 40 | ||
| 41 | private LexemeGlyph mGlyph; |
|
| 42 | ||
| 43 | /** |
|
| 44 | * Constructs an instance of {@link LexemeType} using |
|
| 45 | * {@link LexemeGlyph#LEX_OTHER} to indicate that this type of lexeme isn't |
|
| 46 | * a quotation mark glyph. |
|
| 47 | */ |
|
| 48 | public LexemeType() { |
|
| 49 | this( LEX_OTHER ); |
|
| 50 | } |
|
| 51 | ||
| 52 | /** |
|
| 53 | * Constructs an instance of {@link LexemeType} using a particular glyph. |
|
| 54 | * |
|
| 55 | * @param glyph Typically represents an internationalized quotation mark |
|
| 56 | * character. |
|
| 57 | */ |
|
| 58 | public LexemeType( final LexemeGlyph glyph ) { |
|
| 59 | setGlyph( glyph ); |
|
| 60 | } |
|
| 61 | ||
| 62 | /** |
|
| 63 | * Changes the type of glyph associated with this type of lexeme. This |
|
| 64 | * is useful for passing along different glyphs represented by the same |
|
| 65 | * lexeme (such as different opening quotation marks). |
|
| 66 | * |
|
| 67 | * @param glyph The new {@link LexemeGlyph} to associate, often an |
|
| 68 | * internationalized quotation mark. |
|
| 69 | * @return {@code this} to allow chaining. |
|
| 70 | */ |
|
| 71 | public LexemeType with( final LexemeGlyph glyph ) { |
|
| 72 | setGlyph( glyph ); |
|
| 73 | return this; |
|
| 74 | } |
|
| 75 | ||
| 76 | /** |
|
| 77 | * Provides the glyph used to identify international quotation marks. |
|
| 78 | * |
|
| 79 | * @return The glyph set either at construction time or after calling |
|
| 80 | * {@link #with(LexemeGlyph)}. |
|
| 81 | */ |
|
| 82 | public LexemeGlyph glyph() { |
|
| 83 | return mGlyph; |
|
| 84 | } |
|
| 85 | ||
| 86 | private void setGlyph( final LexemeGlyph glyph ) { |
|
| 87 | mGlyph = glyph; |
|
| 88 | } |
|
| 89 | ||
| 90 | /** |
|
| 91 | * Provides useful debugging information. |
|
| 92 | * |
|
| 93 | * @return The class name and encodable glyph. |
|
| 94 | */ |
|
| 95 | @Override |
|
| 96 | public String toString() { |
|
| 97 | return getClass().getSimpleName() + |
|
| 98 | '[' + glyph() + ']'; |
|
| 99 | } |
|
| 35 | 100 | } |
| 36 | 101 |
| 6 | 6 | import java.util.function.Consumer; |
| 7 | 7 | |
| 8 | import static com.whitemagicsoftware.keenquotes.lex.LexemeGlyph.*; |
|
| 8 | 9 | import static com.whitemagicsoftware.keenquotes.lex.LexemeType.*; |
| 9 | 10 | import static java.lang.Character.isWhitespace; |
| ... | ||
| 47 | 48 | // Allow filters to skip character sequences (such as XML tags). This |
| 48 | 49 | // must allow back-to-back filtering, hence the loop. |
| 49 | while( filter.test( i ) ); |
|
| 50 | while( filter.test( i ) ) ; |
|
| 50 | 51 | |
| 51 | 52 | final var index = i.index(); |
| ... | ||
| 114 | 115 | token = CLOSING_GROUP; |
| 115 | 116 | } |
| 116 | else if( curr == '“' ) { |
|
| 117 | token = QUOTE_DOUBLE_OPENING; |
|
| 117 | else if( LEX_DOUBLE_QUOTE_OPENING.equals( curr ) ) { |
|
| 118 | token = QUOTE_DOUBLE_OPENING.with( LEX_DOUBLE_QUOTE_OPENING ); |
|
| 118 | 119 | } |
| 119 | else if( curr == '”' ) { |
|
| 120 | token = QUOTE_DOUBLE_CLOSING; |
|
| 120 | else if( LEX_DOUBLE_QUOTE_CLOSING.equals( curr ) ) { |
|
| 121 | token = QUOTE_DOUBLE_CLOSING.with( LEX_DOUBLE_QUOTE_CLOSING ); |
|
| 121 | 122 | } |
| 122 | else if( curr == '‘' ) { |
|
| 123 | token = QUOTE_SINGLE_OPENING; |
|
| 123 | else if( LEX_SINGLE_QUOTE_OPENING.equals( curr ) ) { |
|
| 124 | token = QUOTE_SINGLE_OPENING.with( LEX_SINGLE_QUOTE_OPENING ); |
|
| 124 | 125 | } |
| 125 | else if( curr == '’' ) { |
|
| 126 | token = QUOTE_SINGLE_CLOSING; |
|
| 126 | else if( LEX_SINGLE_QUOTE_CLOSING.equals( curr ) ) { |
|
| 127 | token = QUOTE_SINGLE_CLOSING.with( LEX_SINGLE_QUOTE_CLOSING ); |
|
| 127 | 128 | } |
| 128 | 129 | else if( curr == '\\' ) { |
| ... | ||
| 143 | 144 | else if( curr == '=' ) { |
| 144 | 145 | token = EQUALS; |
| 146 | } |
|
| 147 | else if( curr == ',' && i.peek() == ',' ) { |
|
| 148 | i.skip( next -> next == ',' ); |
|
| 149 | token = QUOTE_DOUBLE_OPENING.with( LEX_DOUBLE_QUOTE_OPENING_LOW ); |
|
| 150 | } |
|
| 151 | else if( LEX_DOUBLE_QUOTE_OPENING_LOW.equals( curr ) ) { |
|
| 152 | token = QUOTE_DOUBLE_OPENING.with( LEX_DOUBLE_QUOTE_OPENING_LOW ); |
|
| 153 | } |
|
| 154 | else if( LEX_SINGLE_CHEVRON_LEFT.equals( curr ) ) { |
|
| 155 | token = QUOTE_SINGLE_OPENING.with( LEX_SINGLE_CHEVRON_LEFT ); |
|
| 156 | } |
|
| 157 | else if( LEX_DOUBLE_CHEVRON_LEFT.equals( curr ) ) { |
|
| 158 | token = QUOTE_DOUBLE_OPENING.with( LEX_DOUBLE_CHEVRON_LEFT ); |
|
| 159 | } |
|
| 160 | else if( LEX_SINGLE_CHEVRON_RIGHT.equals( curr ) ) { |
|
| 161 | token = QUOTE_SINGLE_CLOSING.with( LEX_SINGLE_CHEVRON_RIGHT ); |
|
| 162 | } |
|
| 163 | else if( LEX_DOUBLE_CHEVRON_RIGHT.equals( curr ) ) { |
|
| 164 | token = QUOTE_DOUBLE_CLOSING.with( LEX_DOUBLE_CHEVRON_RIGHT ); |
|
| 145 | 165 | } |
| 146 | 166 | else if( curr == DONE ) { |
| 3 | 3 | |
| 4 | 4 | import com.whitemagicsoftware.keenquotes.lex.FilterType; |
| 5 | import com.whitemagicsoftware.keenquotes.lex.LexerFilter; |
|
| 5 | 6 | |
| 6 | import java.util.Map; |
|
| 7 | 7 | import java.util.concurrent.atomic.AtomicInteger; |
| 8 | 8 | import java.util.function.Consumer; |
| 9 | 9 | import java.util.function.Function; |
| 10 | ||
| 11 | import static com.whitemagicsoftware.keenquotes.parser.TokenType.*; |
|
| 12 | import static java.util.Map.entry; |
|
| 13 | import static java.util.Map.ofEntries; |
|
| 14 | 10 | |
| 15 | 11 | /** |
| 16 | 12 | * Resolves straight quotes into curly quotes throughout a document. |
| 17 | 13 | */ |
| 18 | 14 | @SuppressWarnings( "unused" ) |
| 19 | 15 | public class Curler implements Function<String, String> { |
| 20 | /** |
|
| 21 | * Provides an entity-based set of {@link Token} replacements. |
|
| 22 | */ |
|
| 23 | public static final Map<TokenType, String> ENTITIES = ofEntries( |
|
| 24 | entry( QUOTE_OPENING_SINGLE, "‘" ), |
|
| 25 | entry( QUOTE_CLOSING_SINGLE, "’" ), |
|
| 26 | entry( QUOTE_OPENING_DOUBLE, "“" ), |
|
| 27 | entry( QUOTE_CLOSING_DOUBLE, "”" ), |
|
| 28 | entry( QUOTE_STRAIGHT_SINGLE, "'" ), |
|
| 29 | entry( QUOTE_STRAIGHT_DOUBLE, "\"" ), |
|
| 30 | entry( QUOTE_APOSTROPHE, "'" ), |
|
| 31 | entry( QUOTE_PRIME_SINGLE, "′" ), |
|
| 32 | entry( QUOTE_PRIME_DOUBLE, "″" ), |
|
| 33 | entry( QUOTE_PRIME_TRIPLE, "‴" ), |
|
| 34 | entry( QUOTE_PRIME_QUADRUPLE, "⁗" ) |
|
| 35 | ); |
|
| 36 | ||
| 37 | /** |
|
| 38 | * Provides a character-based set of {@link Token} replacements. |
|
| 39 | */ |
|
| 40 | public static final Map<TokenType, String> CHARS = ofEntries( |
|
| 41 | entry( QUOTE_OPENING_SINGLE, "‘" ), |
|
| 42 | entry( QUOTE_CLOSING_SINGLE, "’" ), |
|
| 43 | entry( QUOTE_OPENING_DOUBLE, "“" ), |
|
| 44 | entry( QUOTE_CLOSING_DOUBLE, "”" ), |
|
| 45 | entry( QUOTE_STRAIGHT_SINGLE, "'" ), |
|
| 46 | entry( QUOTE_STRAIGHT_DOUBLE, "\"" ), |
|
| 47 | entry( QUOTE_APOSTROPHE, "’" ), |
|
| 48 | entry( QUOTE_PRIME_SINGLE, "′" ), |
|
| 49 | entry( QUOTE_PRIME_DOUBLE, "″" ), |
|
| 50 | entry( QUOTE_PRIME_TRIPLE, "‴" ), |
|
| 51 | entry( QUOTE_PRIME_QUADRUPLE, "⁗" ) |
|
| 52 | ); |
|
| 53 | 16 | |
| 54 | 17 | private final Contractions mContractions; |
| 55 | private final Map<TokenType, String> mReplacements; |
|
| 56 | private final FilterType mFilterType; |
|
| 57 | ||
| 58 | /** |
|
| 59 | * Maps quotes to HTML entities. |
|
| 60 | * |
|
| 61 | * @param c Contractions listings. |
|
| 62 | * @param parserType Creates a parser based on document content structure. |
|
| 63 | */ |
|
| 64 | public Curler( final Contractions c, final FilterType parserType ) { |
|
| 65 | this( c, ENTITIES, parserType ); |
|
| 66 | } |
|
| 18 | private final LexerFilter mFilter; |
|
| 19 | private final boolean mEntities; |
|
| 67 | 20 | |
| 68 | 21 | /** |
| 69 | 22 | * Maps quotes to curled character or HTML entity equivalents. |
| 70 | 23 | * |
| 71 | * @param c Contractions listings. |
|
| 72 | * @param replacements Map of recognized quotes to output types (entity or |
|
| 73 | * Unicode character). |
|
| 24 | * @param c Contractions listings. |
|
| 25 | * @param entities {@code true} to convert quotation marks to HTML entities. |
|
| 74 | 26 | */ |
| 75 | 27 | public Curler( |
| 76 | 28 | final Contractions c, |
| 77 | final Map<TokenType, String> replacements, |
|
| 78 | final FilterType parserType |
|
| 29 | final FilterType filterType, |
|
| 30 | final boolean entities |
|
| 79 | 31 | ) { |
| 32 | assert c != null; |
|
| 33 | ||
| 80 | 34 | mContractions = c; |
| 81 | mReplacements = replacements; |
|
| 82 | mFilterType = parserType; |
|
| 35 | mEntities = entities; |
|
| 36 | mFilter = filterType.filter(); |
|
| 83 | 37 | } |
| 84 | 38 | |
| ... | ||
| 100 | 54 | text, |
| 101 | 55 | mContractions, |
| 102 | replace( output, offset, mReplacements ), |
|
| 103 | mFilterType.filter() |
|
| 56 | replace( output, offset, mEntities ), |
|
| 57 | mFilter |
|
| 104 | 58 | ); |
| 105 | 59 | |
| 106 | 60 | return output.toString(); |
| 107 | 61 | } |
| 108 | 62 | |
| 109 | 63 | /** |
| 110 | 64 | * Replaces non-ambiguous tokens with their equivalent string representation. |
| 111 | 65 | * |
| 112 | * @param output Continuously updated result document. |
|
| 113 | * @param offset Accumulating index where {@link Token} is replaced. |
|
| 114 | * @param replacements Map of {@link TokenType}s to replacement strings. |
|
| 66 | * @param output Continuously updated result document. |
|
| 67 | * @param offset Accumulating index where {@link Token} is replaced. |
|
| 68 | * @param entities {@code true} to convert quotation marks to HTML entities. |
|
| 115 | 69 | * @return Instructions to replace a {@link Token} in the result document. |
| 116 | 70 | */ |
| 117 | 71 | public static Consumer<Token> replace( |
| 118 | 72 | final StringBuilder output, |
| 119 | 73 | final AtomicInteger offset, |
| 120 | final Map<TokenType, String> replacements |
|
| 74 | final boolean entities |
|
| 121 | 75 | ) { |
| 122 | 76 | return token -> { |
| 123 | 77 | if( !token.isAmbiguous() ) { |
| 124 | final var entity = token.toString( replacements ); |
|
| 78 | final var text = token.toString( entities ); |
|
| 125 | 79 | |
| 126 | 80 | output.replace( |
| 127 | 81 | token.began() + offset.get(), |
| 128 | 82 | token.ended() + offset.get(), |
| 129 | entity |
|
| 83 | text |
|
| 130 | 84 | ); |
| 131 | 85 | |
| 132 | offset.addAndGet( entity.length() - (token.ended() - token.began()) ); |
|
| 86 | offset.addAndGet( text.length() - (token.ended() - token.began()) ); |
|
| 133 | 87 | } |
| 134 | 88 | }; |
| 190 | 190 | // <2''> |
| 191 | 191 | else if( match( NUMBER, QUOTE_SINGLE, QUOTE_SINGLE, ANY ) ) { |
| 192 | emit( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ); |
|
| 192 | // Force double primes to conform to the same constructor usage. This |
|
| 193 | // simplifies the tokens and reduces some memory usage. |
|
| 194 | final var lex = new Lexeme( PRIME_DOUBLE, lex2.began(), lex3.ended() ); |
|
| 195 | ||
| 196 | emit( QUOTE_PRIME_DOUBLE, lex ); |
|
| 193 | 197 | mQ.set( Lexeme.NONE, 2 ); |
| 194 | 198 | } |
| ... | ||
| 349 | 353 | emit( QUOTE_AMBIGUOUS_SINGLE, lex2 ); |
| 350 | 354 | } |
| 355 | } |
|
| 356 | // <'"Trouble> |
|
| 357 | else if( match( QUOTE_SINGLE, QUOTE_DOUBLE, WORD, ANY ) ) { |
|
| 358 | emit( QUOTE_OPENING_DOUBLE, lex2 ); |
|
| 351 | 359 | } |
| 352 | 360 | else if( match( ANY, QUOTE_DOUBLE, ANY, ANY ) ) { |
| 353 | 361 | emit( QUOTE_AMBIGUOUS_DOUBLE, lex2 ); |
| 362 | } |
|
| 363 | // International opening double quotation mark. |
|
| 364 | else if( match( ANY, QUOTE_DOUBLE_OPENING, ANY, ANY ) ) { |
|
| 365 | emit( QUOTE_OPENING_DOUBLE, lex2 ); |
|
| 366 | } |
|
| 367 | // International opening single quotation mark. |
|
| 368 | else if( match( ANY, QUOTE_SINGLE_OPENING, ANY, ANY ) ) { |
|
| 369 | emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
| 370 | } |
|
| 371 | // International double closing quotation mark. |
|
| 372 | else if( match( ANY, ANY, ANY, QUOTE_DOUBLE_CLOSING ) ) { |
|
| 373 | emit( QUOTE_CLOSING_DOUBLE, lex4 ); |
|
| 374 | } |
|
| 375 | // International single closing quotation mark. |
|
| 376 | else if( match( ANY, ANY, ANY, QUOTE_SINGLE_CLOSING ) ) { |
|
| 377 | emit( QUOTE_CLOSING_SINGLE, lex4 ); |
|
| 354 | 378 | } |
| 355 | 379 | // Ambiguous (no match) |
| 356 | 380 | else if( match( ANY, QUOTE_SINGLE, ANY, ANY ) ) { |
| 357 | 381 | emit( QUOTE_AMBIGUOUS_SINGLE, lex2 ); |
| 358 | 382 | } |
| 359 | 383 | } |
| 360 | 384 | |
| 361 | 385 | private void emit( final TokenType tokenType, final Lexeme lexeme ) { |
| 362 | 386 | mConsumer.accept( new Token( tokenType, lexeme ) ); |
| 363 | } |
|
| 364 | ||
| 365 | private void emit( |
|
| 366 | final TokenType tokenType, |
|
| 367 | final int began, |
|
| 368 | final int ended ) { |
|
| 369 | mConsumer.accept( new Token( tokenType, began, ended ) ); |
|
| 370 | 387 | } |
| 371 | 388 | |
| 3 | 3 | |
| 4 | 4 | import com.whitemagicsoftware.keenquotes.lex.Lexeme; |
| 5 | import com.whitemagicsoftware.keenquotes.lex.LexemeGlyph; |
|
| 5 | 6 | |
| 6 | 7 | import java.util.Map; |
| 7 | 8 | |
| 9 | import static com.whitemagicsoftware.keenquotes.lex.LexemeGlyph.*; |
|
| 8 | 10 | import static com.whitemagicsoftware.keenquotes.parser.TokenType.*; |
| 11 | import static java.util.Map.entry; |
|
| 12 | import static java.util.Map.ofEntries; |
|
| 9 | 13 | |
| 10 | 14 | /** |
| 11 | 15 | * Represents a high-level token read from a text document. |
| 12 | 16 | */ |
| 13 | 17 | final class Token implements Comparable<Token>, Stem { |
| 18 | /** |
|
| 19 | * Provides an entity-based set of {@link Token} replacements. |
|
| 20 | */ |
|
| 21 | private static final Map<TokenType, String> ENTITIES = ofEntries( |
|
| 22 | entry( QUOTE_OPENING_SINGLE, "‘" ), |
|
| 23 | entry( QUOTE_CLOSING_SINGLE, "’" ), |
|
| 24 | entry( QUOTE_OPENING_DOUBLE, "“" ), |
|
| 25 | entry( QUOTE_CLOSING_DOUBLE, "”" ), |
|
| 26 | entry( QUOTE_STRAIGHT_SINGLE, "'" ), |
|
| 27 | entry( QUOTE_STRAIGHT_DOUBLE, "\"" ), |
|
| 28 | entry( QUOTE_APOSTROPHE, "'" ), |
|
| 29 | entry( QUOTE_PRIME_SINGLE, "′" ), |
|
| 30 | entry( QUOTE_PRIME_DOUBLE, "″" ), |
|
| 31 | entry( QUOTE_PRIME_TRIPLE, "‴" ), |
|
| 32 | entry( QUOTE_PRIME_QUADRUPLE, "⁗" ) |
|
| 33 | ); |
|
| 34 | ||
| 35 | /** |
|
| 36 | * Provides a character-based set of {@link Token} replacements. |
|
| 37 | */ |
|
| 38 | private static final Map<TokenType, String> CHARS = ofEntries( |
|
| 39 | entry( QUOTE_OPENING_SINGLE, "‘" ), |
|
| 40 | entry( QUOTE_CLOSING_SINGLE, "’" ), |
|
| 41 | entry( QUOTE_OPENING_DOUBLE, "“" ), |
|
| 42 | entry( QUOTE_CLOSING_DOUBLE, "”" ), |
|
| 43 | entry( QUOTE_STRAIGHT_SINGLE, "'" ), |
|
| 44 | entry( QUOTE_STRAIGHT_DOUBLE, "\"" ), |
|
| 45 | entry( QUOTE_APOSTROPHE, "’" ), |
|
| 46 | entry( QUOTE_PRIME_SINGLE, "′" ), |
|
| 47 | entry( QUOTE_PRIME_DOUBLE, "″" ), |
|
| 48 | entry( QUOTE_PRIME_TRIPLE, "‴" ), |
|
| 49 | entry( QUOTE_PRIME_QUADRUPLE, "⁗" ) |
|
| 50 | ); |
|
| 51 | ||
| 52 | /** |
|
| 53 | * Glyphs not found in the table will use the document's glyph. |
|
| 54 | */ |
|
| 55 | private static final Map<LexemeGlyph, String> I18N_ENTITIES = ofEntries( |
|
| 56 | entry( LEX_DOUBLE_QUOTE_OPENING_LOW, "„" ), |
|
| 57 | entry( LEX_DOUBLE_CHEVRON_LEFT, "«" ), |
|
| 58 | entry( LEX_DOUBLE_CHEVRON_RIGHT, "»" ), |
|
| 59 | entry( LEX_SINGLE_CHEVRON_LEFT, "‹" ), |
|
| 60 | entry( LEX_SINGLE_CHEVRON_RIGHT, "›" ) |
|
| 61 | ); |
|
| 62 | ||
| 14 | 63 | /** |
| 15 | 64 | * Denotes that the token does not represent a value in the parsed document. |
| 16 | 65 | */ |
| 17 | 66 | public static final Token NONE = new Token( TokenType.NONE, Lexeme.NONE ); |
| 18 | 67 | |
| 19 | 68 | private TokenType mTokenType; |
| 20 | private final int mBegan; |
|
| 21 | private final int mEnded; |
|
| 69 | private final Lexeme mLexeme; |
|
| 22 | 70 | |
| 23 | 71 | /** |
| 24 | 72 | * Convenience constructor to create a token that uses the lexeme's |
| 25 | 73 | * beginning and ending offsets to represent a complete token. |
| 26 | * |
|
| 27 | * @param type The type of {@link Token} to create. |
|
| 28 | * @param lexeme Container for beginning and ending text offsets. |
|
| 29 | */ |
|
| 30 | Token( final TokenType type, final Lexeme lexeme ) { |
|
| 31 | this( type, lexeme.began(), lexeme.ended() ); |
|
| 32 | } |
|
| 33 | ||
| 34 | /** |
|
| 35 | * This constructor can be used to create tokens that span more than a |
|
| 36 | * single character. Almost all tokens represent a single character, only |
|
| 37 | * the double-prime sequence ({@code ''}) is more than one character. |
|
| 38 | 74 | * |
| 39 | 75 | * @param tokenType The type of {@link Token} to create. |
| 40 | * @param began Beginning offset into text where token is found. |
|
| 41 | * @param ended Ending offset into text where token is found. |
|
| 76 | * @param lexeme Container for text offsets and i18n glyphs. |
|
| 42 | 77 | */ |
| 43 | Token( final TokenType tokenType, final int began, final int ended ) { |
|
| 78 | Token( final TokenType tokenType, final Lexeme lexeme ) { |
|
| 44 | 79 | assert tokenType != null; |
| 45 | assert began >= 0; |
|
| 46 | assert ended >= began; |
|
| 80 | assert lexeme.began() >= 0; |
|
| 81 | assert lexeme.ended() >= lexeme.began(); |
|
| 47 | 82 | |
| 48 | 83 | mTokenType = tokenType; |
| 49 | mBegan = began; |
|
| 50 | mEnded = ended; |
|
| 84 | mLexeme = lexeme; |
|
| 51 | 85 | } |
| 52 | 86 | |
| ... | ||
| 63 | 97 | assert token != NONE; |
| 64 | 98 | |
| 65 | return mEnded <= token.mBegan; |
|
| 99 | return mLexeme.ended() <= token.began(); |
|
| 66 | 100 | } |
| 67 | 101 | |
| ... | ||
| 79 | 113 | assert token != NONE; |
| 80 | 114 | |
| 81 | return mBegan > token.mEnded; |
|
| 115 | return mLexeme.began() > token.ended(); |
|
| 82 | 116 | } |
| 83 | 117 | |
| ... | ||
| 93 | 127 | |
| 94 | 128 | int began() { |
| 95 | return mBegan; |
|
| 129 | return mLexeme.began(); |
|
| 96 | 130 | } |
| 97 | 131 | |
| 98 | 132 | int ended() { |
| 99 | return mEnded; |
|
| 133 | return mLexeme.ended(); |
|
| 100 | 134 | } |
| 101 | 135 | |
| ... | ||
| 125 | 159 | @Override |
| 126 | 160 | public int compareTo( final Token that ) { |
| 127 | return this.mBegan - that.mBegan; |
|
| 161 | return this.began() - that.began(); |
|
| 128 | 162 | } |
| 129 | 163 | |
| ... | ||
| 137 | 171 | } |
| 138 | 172 | |
| 139 | public String toString( final Map<TokenType, String> entities ) { |
|
| 140 | return entities.get( getType() ); |
|
| 173 | /** |
|
| 174 | * Converts this token to its string representation, which will either be |
|
| 175 | * an HTML entity or a character. |
|
| 176 | * |
|
| 177 | * @param entities {@code true} to convert quotation marks to HTML entities. |
|
| 178 | * @return A plain quotation mark character or an HTML entity. |
|
| 179 | */ |
|
| 180 | public String toString( final boolean entities ) { |
|
| 181 | final var glyph = mLexeme.getType().glyph(); |
|
| 182 | ||
| 183 | return entities |
|
| 184 | ? I18N_ENTITIES.getOrDefault( glyph, ENTITIES.get( getType() ) ) |
|
| 185 | : CHARS.getOrDefault( getType(), glyph.text() ); |
|
| 141 | 186 | } |
| 142 | 187 | |
| 143 | 188 | @Override |
| 144 | 189 | public String toString() { |
| 145 | 190 | return getClass().getSimpleName() + '[' + |
| 146 | "mType=" + mTokenType + |
|
| 147 | ", mBegan=" + mBegan + |
|
| 148 | ", mEnded=" + mEnded + |
|
| 191 | "mType=" + getType() + |
|
| 192 | ", mBegan=" + began() + |
|
| 193 | ", mEnded=" + ended() + |
|
| 149 | 194 | ']'; |
| 150 | 195 | } |
| 68 | 68 | @Test |
| 69 | 69 | void test_Lexing_Quotes_EmitQuotes() { |
| 70 | testType( "'", QUOTE_SINGLE ); |
|
| 71 | testType( "\"", QUOTE_DOUBLE ); |
|
| 72 | 70 | testType( "‘", QUOTE_SINGLE_OPENING ); |
| 71 | testType( "‹", QUOTE_SINGLE_OPENING ); |
|
| 73 | 72 | testType( "’", QUOTE_SINGLE_CLOSING ); |
| 73 | testType( "›", QUOTE_SINGLE_CLOSING ); |
|
| 74 | testType( "'", QUOTE_SINGLE ); |
|
| 75 | ||
| 74 | 76 | testType( "“", QUOTE_DOUBLE_OPENING ); |
| 77 | testType( "„", QUOTE_DOUBLE_OPENING ); |
|
| 78 | testType( "«", QUOTE_DOUBLE_OPENING ); |
|
| 79 | testType( ",,", QUOTE_DOUBLE_OPENING ); |
|
| 80 | testType( "\"", QUOTE_DOUBLE ); |
|
| 81 | ||
| 75 | 82 | testType( "”", QUOTE_DOUBLE_CLOSING ); |
| 83 | testType( "»", QUOTE_DOUBLE_CLOSING ); |
|
| 84 | ||
| 76 | 85 | testType( "3 o'clock", NUMBER, SPACE, WORD, QUOTE_SINGLE, WORD ); |
| 77 | 86 | } |
| 8 | 8 | import java.util.concurrent.atomic.AtomicInteger; |
| 9 | 9 | |
| 10 | import static com.whitemagicsoftware.keenquotes.parser.Curler.ENTITIES; |
|
| 11 | import static com.whitemagicsoftware.keenquotes.parser.Curler.replace; |
|
| 10 | import static com.whitemagicsoftware.keenquotes.parser.Curler.*; |
|
| 12 | 11 | import static com.whitemagicsoftware.keenquotes.texts.TestResource.readPairs; |
| 13 | 12 | import static org.junit.jupiter.api.Assertions.assertEquals; |
| ... | ||
| 26 | 25 | } |
| 27 | 26 | |
| 28 | @Test |
|
| 29 | 27 | @Disabled |
| 28 | @SuppressWarnings( "unused" ) |
|
| 30 | 29 | void test_Resolve_InvalidGrammar_AmbiguousRemain() throws IOException { |
| 31 | 30 | test( "invalid-grammar.txt" ); |
| ... | ||
| 44 | 43 | input, |
| 45 | 44 | CONTRACTIONS, |
| 46 | replace( output, offset, ENTITIES ), |
|
| 45 | replace( output, offset, true ), |
|
| 47 | 46 | filter -> false |
| 48 | 47 | ); |
| 29 | 29 | @Test |
| 30 | 30 | public void test_Parse_UncurledQuotes1_CurlyQuotes() throws IOException { |
| 31 | testCurler( createCurler( FILTER_PLAIN ), "unambiguous-1-pass.txt" ); |
|
| 31 | testCurler( createCurler( FILTER_PLAIN, true ), "unambiguous-1-pass.txt" ); |
|
| 32 | 32 | } |
| 33 | 33 | |
| 34 | 34 | @Test |
| 35 | 35 | public void test_Parse_UncurledQuotes2_CurlyQuotes() throws IOException { |
| 36 | testCurler( createCurler( FILTER_PLAIN ), "unambiguous-2-pass.txt" ); |
|
| 36 | testCurler( createCurler( FILTER_PLAIN, true ), "unambiguous-2-pass.txt" ); |
|
| 37 | 37 | } |
| 38 | 38 | |
| 39 | @Test |
|
| 39 | @Disabled |
|
| 40 | @SuppressWarnings( {"unused", "JUnit3StyleTestMethodInJUnit4Class"} ) |
|
| 40 | 41 | public void test_Parse_AmbiguousQuotes_PartiallyCurled() throws IOException { |
| 41 | testCurler( createCurler( FILTER_PLAIN ), "ambiguous-n-pass.txt" ); |
|
| 42 | testCurler( createCurler( FILTER_PLAIN, false ), "ambiguous-n-pass.txt" ); |
|
| 42 | 43 | } |
| 43 | 44 | |
| 44 | 45 | @Test |
| 45 | 46 | public void test_Parse_UncurledQuotesXml_CurlyQuotes() throws IOException { |
| 46 | testCurler( createCurler( FILTER_XML ), "xml.txt" ); |
|
| 47 | testCurler( createCurler( FILTER_XML, true ), "xml.txt" ); |
|
| 48 | } |
|
| 49 | ||
| 50 | @Test |
|
| 51 | public void test_Parse_UncurledQuotesI11l_CurlyQuotes() throws IOException { |
|
| 52 | testCurler( createCurler( FILTER_PLAIN, true ), "i18n.txt" ); |
|
| 47 | 53 | } |
| 48 | 54 | |
| ... | ||
| 68 | 74 | } |
| 69 | 75 | |
| 70 | final var curler = createCurler( FILTER_XML ); |
|
| 76 | final var curler = createCurler( FILTER_XML, true ); |
|
| 71 | 77 | System.out.println( curler.apply( sb.toString() ) ); |
| 72 | 78 | } |
| ... | ||
| 95 | 101 | } |
| 96 | 102 | |
| 97 | private Function<String, String> createCurler( final FilterType parserType ) { |
|
| 98 | return new Curler( new Contractions.Builder().build(), parserType ); |
|
| 103 | private Function<String, String> createCurler( |
|
| 104 | final FilterType filterType, |
|
| 105 | final boolean entities ) { |
|
| 106 | return new Curler( createContractions(), filterType, entities ); |
|
| 107 | } |
|
| 108 | ||
| 109 | private Contractions createContractions() { |
|
| 110 | return new Contractions.Builder().build(); |
|
| 99 | 111 | } |
| 100 | 112 | } |
| 7 | 7 | import java.util.concurrent.atomic.AtomicInteger; |
| 8 | 8 | |
| 9 | import static com.whitemagicsoftware.keenquotes.parser.Curler.ENTITIES; |
|
| 10 | 9 | import static com.whitemagicsoftware.keenquotes.parser.Curler.replace; |
| 11 | 10 | import static com.whitemagicsoftware.keenquotes.texts.TestResource.readPairs; |
| 12 | 11 | import static org.junit.jupiter.api.Assertions.assertEquals; |
| 13 | 12 | |
| 14 | 13 | class QuoteEmitterTest { |
| 15 | 14 | private final Contractions CONTRACTIONS = new Contractions.Builder().build(); |
| 16 | 15 | |
| 17 | 16 | @Test |
| 18 | 17 | void test_Emit_MultipleInputs_QuotesEmitted() throws IOException { |
| 19 | final var couplets = readPairs( |
|
| 20 | "unambiguous-1-pass.txt" ); |
|
| 18 | final var couplets = readPairs( "unambiguous-1-pass.txt" ); |
|
| 21 | 19 | |
| 22 | 20 | couplets.forEach( couplet -> { |
| ... | ||
| 29 | 27 | input, |
| 30 | 28 | CONTRACTIONS, |
| 31 | replace( output, offset, ENTITIES ), |
|
| 29 | replace( output, offset, true ), |
|
| 32 | 30 | filter -> false |
| 33 | 31 | ); |
| 1 | # ######################################################################## |
|
| 2 | # French |
|
| 3 | # ######################################################################## |
|
| 4 | «Ce n'est pas la mer à boire,» il me dit. |
|
| 5 | «Ce n'est pas la mer à boire,» il me dit. |
|
| 6 | ||
| 7 | Dit il, ‹Ce n'est pas la mer à boire?› |
|
| 8 | Dit il, ‹Ce n'est pas la mer à boire?› |
|
| 9 | ||
| 10 | # ######################################################################## |
|
| 11 | # Dutch |
|
| 12 | # ######################################################################## |
|
| 13 | ,,Dit is een citaat," zei hij. |
|
| 14 | „Dit is een citaat,” zei hij. |
|
| 15 | ||
| 16 | ,,Ik heb twee opa's en twee oma's," zei het meisje,„hoeveel heb jij er?" |
|
| 17 | „Ik heb twee opa's en twee oma's,” zei het meisje,„hoeveel heb jij er?” |
|
| 18 | ||
| 19 | "Zeg, wat betekent 'quod non' eigenlijk?", vroeg Nynke. |
|
| 20 | “Zeg, wat betekent ‘quod non’ eigenlijk?”, vroeg Nynke. |
|
| 21 | ||
| 22 | "Wat zijn 'zebra's'?", vroeg de jongen. |
|
| 23 | “Wat zijn ‘zebra's’?”, vroeg de jongen. |
|
| 24 | ||
| 25 | "'s Morgens gaat het regenen in 's-Gravenhage," zei de weervrouw. |
|
| 26 | “'s Morgens gaat het regenen in 's-Gravenhage,” zei de weervrouw. |
|
| 27 | ||
| 28 | De voorzitter zei: 'Gelukkig nieuwjaar!' |
|
| 29 | De voorzitter zei: ‘Gelukkig nieuwjaar!’ |
|
| 30 | ||
| 31 | 'Wat een gedoe, al die baby'tjes hun lolly'tjes geven,' zuchtte de oppas, 'en het is nog ongezond ook!' |
|
| 32 | ‘Wat een gedoe, al die baby'tjes hun lolly'tjes geven,’ zuchtte de oppas, ‘en het is nog ongezond ook!’ |
|
| 33 | ||
| 34 | "In '84," zei hij terwijl hij een hand door z'n haar haalde, "heb ik Alice's zus een kus gegeven op d'r wangen." |
|
| 35 | “In '84,” zei hij terwijl hij een hand door z'n haar haalde, “heb ik Alice's zus een kus gegeven op d'r wangen.” |
|
| 1 | 36 |
| 1 | # """wtf""" |
|
| 2 | # “““wtf””” |
|
| 1 | """insane""" |
|
| 2 | “”“insane”“” |
|
| 3 | 3 | |
| 4 | # '''wtf''' |
|
| 5 | # ‘‘‘wtf’’’ |
|
| 4 | '''wtf''' |
|
| 5 | ‘‘‘wtf’’’ |
|
| 6 | 6 |
| 180 | 180 | |
| 181 | 181 | "’Kearney lives on the banks of Killarney—’ |
| 182 | “’Kearney lives on the banks of Killarney—’ |
|
| 182 | “’Kearney lives on the banks of Killarney—’ |
|
| 183 | 183 | |
| 184 | 184 | # ######################################################################## |