| 48 | 48 | gradle clean build |
| 49 | 49 | |
| 50 | ## Library |
|
| 51 | ||
| 52 | To build a library for use with other software applications, run: |
|
| 53 | ||
| 54 | gradle clean lib |
|
| 55 | ||
| 56 | Find the library at: |
|
| 57 | ||
| 58 | build/lib/keenquotes.jar |
|
| 59 | ||
| 50 | 60 |
| 1 | /* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ |
|
| 1 | 2 | package com.whitemagicsoftware.keenquotes; |
| 3 | ||
| 4 | import com.whitemagicsoftware.keenquotes.ParserFactory.ParserType; |
|
| 2 | 5 | |
| 3 | 6 | import java.util.ArrayList; |
| ... | ||
| 45 | 48 | private final Contractions mContractions; |
| 46 | 49 | private final Map<TokenType, String> mReplacements; |
| 50 | private final ParserFactory mFactory; |
|
| 47 | 51 | |
| 48 | 52 | /** |
| 49 | 53 | * Maps quotes to HTML entities. |
| 50 | 54 | * |
| 51 | 55 | * @param unresolved Consumes {@link Lexeme}s that could not be converted |
| 52 | 56 | * into HTML entities. |
| 57 | * @param parserType Creates a parser based on document content structure. |
|
| 53 | 58 | */ |
| 54 | public Converter( final Consumer<Lexeme> unresolved ) { |
|
| 55 | this( unresolved, new Contractions.Builder().build() ); |
|
| 59 | public Converter( |
|
| 60 | final Consumer<Lexeme> unresolved, final ParserType parserType ) { |
|
| 61 | this( unresolved, new Contractions.Builder().build(), parserType ); |
|
| 56 | 62 | } |
| 57 | 63 | |
| 58 | 64 | /** |
| 59 | 65 | * Maps quotes to HTML entities. |
| 60 | 66 | * |
| 61 | 67 | * @param unresolved Consumes {@link Lexeme}s that could not be converted |
| 62 | 68 | * into HTML entities. |
| 69 | * @param parserType Creates a parser based on document content structure. |
|
| 63 | 70 | */ |
| 64 | 71 | public Converter( |
| 65 | 72 | final Consumer<Lexeme> unresolved, |
| 66 | final Map<TokenType, String> replacements ) { |
|
| 67 | this( unresolved, new Contractions.Builder().build(), replacements ); |
|
| 73 | final Map<TokenType, String> replacements, |
|
| 74 | final ParserType parserType ) { |
|
| 75 | this( |
|
| 76 | unresolved, new Contractions.Builder().build(), replacements, parserType |
|
| 77 | ); |
|
| 68 | 78 | } |
| 69 | 79 | |
| 70 | 80 | /** |
| 71 | 81 | * Maps quotes to HTML entities. |
| 72 | 82 | * |
| 73 | 83 | * @param unresolved Consumes {@link Lexeme}s that could not be converted |
| 74 | 84 | * into HTML entities. |
| 75 | 85 | * @param c Contractions listings. |
| 86 | * @param parserType Creates a parser based on document content structure. |
|
| 76 | 87 | */ |
| 77 | public Converter( final Consumer<Lexeme> unresolved, final Contractions c ) { |
|
| 78 | this( unresolved, c, ENTITIES ); |
|
| 88 | public Converter( |
|
| 89 | final Consumer<Lexeme> unresolved, |
|
| 90 | final Contractions c, |
|
| 91 | final ParserType parserType ) { |
|
| 92 | this( unresolved, c, ENTITIES, parserType ); |
|
| 79 | 93 | } |
| 80 | 94 | |
| ... | ||
| 91 | 105 | final Consumer<Lexeme> unresolved, |
| 92 | 106 | final Contractions c, |
| 93 | final Map<TokenType, String> replacements ) { |
|
| 107 | final Map<TokenType, String> replacements, |
|
| 108 | final ParserType parserType ) { |
|
| 94 | 109 | mUnresolved = unresolved; |
| 95 | 110 | mContractions = c; |
| 96 | 111 | mReplacements = replacements; |
| 112 | mFactory = new ParserFactory( parserType ); |
|
| 97 | 113 | } |
| 98 | 114 | |
| ... | ||
| 108 | 124 | @Override |
| 109 | 125 | public String apply( final String text ) { |
| 110 | final var parser = new Parser( text, mContractions ); |
|
| 126 | final var parser = mFactory.createParser( text, mContractions ); |
|
| 111 | 127 | final var tokens = new ArrayList<Token>(); |
| 112 | 128 | |
| 8 | 8 | import java.util.Properties; |
| 9 | 9 | |
| 10 | import static com.whitemagicsoftware.keenquotes.ParserFactory.ParserType.PARSER_PLAIN; |
|
| 10 | 11 | import static java.lang.String.format; |
| 11 | 12 | import static java.lang.System.*; |
| ... | ||
| 41 | 42 | else { |
| 42 | 43 | try { |
| 43 | out.print( convert( new Converter( err::println, contractions ) ) ); |
|
| 44 | final var c = new Converter( err::println, contractions, PARSER_PLAIN ); |
|
| 45 | out.print( convert( c ) ); |
|
| 44 | 46 | } catch( final Exception ex ) { |
| 45 | 47 | ex.printStackTrace( err ); |
| 25 | 25 | PERIOD, |
| 26 | 26 | ELLIPSIS, |
| 27 | FLAG |
|
| 27 | FLAG, |
|
| 28 | TAG |
|
| 28 | 29 | } |
| 29 | 30 |
| 22 | 22 | |
| 23 | 23 | /** |
| 24 | * Default constructor, no state. |
|
| 24 | * Constructs a {@link Lexer} capable of turning text into {@link Lexeme}s. |
|
| 25 | * |
|
| 26 | * @param text The text to lex. |
|
| 25 | 27 | */ |
| 26 | public Lexer( final String text ) { |
|
| 28 | Lexer( final String text ) { |
|
| 27 | 29 | mIterator = new StringCharacterIterator( text ); |
| 28 | 30 | } |
| 29 | 31 | |
| 30 | public Lexeme next() { |
|
| 32 | Lexeme next() { |
|
| 31 | 33 | return parse( mIterator ); |
| 32 | 34 | } |
| ... | ||
| 41 | 43 | * @return The next token in the sequence. |
| 42 | 44 | */ |
| 43 | private Lexeme parse( final CharacterIterator i ) { |
|
| 45 | Lexeme parse( final CharacterIterator i ) { |
|
| 44 | 46 | int began = i.getIndex(); |
| 45 | 47 | boolean isWord = false; |
| 46 | 48 | Lexeme lexeme = null; |
| 47 | 49 | |
| 48 | 50 | do { |
| 51 | // Allow subclasses to skip character sequences. This allows XML tags |
|
| 52 | // to be skipped. |
|
| 53 | if( skip( i ) ) { |
|
| 54 | began = i.getIndex(); |
|
| 55 | } |
|
| 56 | ||
| 49 | 57 | final var curr = i.current(); |
| 50 | 58 | |
| ... | ||
| 152 | 160 | |
| 153 | 161 | return lexeme; |
| 162 | } |
|
| 163 | ||
| 164 | boolean skip( final CharacterIterator i ) { |
|
| 165 | return false; |
|
| 154 | 166 | } |
| 155 | 167 | |
| ... | ||
| 193 | 205 | } |
| 194 | 206 | |
| 195 | private static char peek( final CharacterIterator ci ) { |
|
| 207 | static char peek( final CharacterIterator ci ) { |
|
| 196 | 208 | final var ch = ci.next(); |
| 197 | 209 | ci.previous(); |
| ... | ||
| 206 | 218 | * @return The number of characters parsed. |
| 207 | 219 | */ |
| 208 | private static int slurp( |
|
| 220 | static int slurp( |
|
| 209 | 221 | final CharacterIterator ci, |
| 210 | 222 | final BiFunction<Character, CharacterIterator, Boolean> f ) { |
| 16 | 16 | * Converts straight double/single quotes and apostrophes to curly equivalents. |
| 17 | 17 | */ |
| 18 | public final class Parser { |
|
| 19 | /** |
|
| 20 | * Single quotes preceded by these {@link LexemeType}s may be opening quotes. |
|
| 21 | */ |
|
| 22 | private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE = |
|
| 23 | new LexemeType[]{SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP}; |
|
| 24 | ||
| 25 | /** |
|
| 26 | * Single quotes succeeded by these {@link LexemeType}s may be opening quotes. |
|
| 27 | */ |
|
| 28 | private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE = |
|
| 29 | new LexemeType[]{WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE}; |
|
| 30 | ||
| 31 | /** |
|
| 32 | * Single quotes preceded by these {@link LexemeType}s may be closing quotes. |
|
| 33 | */ |
|
| 34 | private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE = |
|
| 35 | new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE}; |
|
| 36 | ||
| 37 | /** |
|
| 38 | * Single quotes succeeded by these {@link LexemeType}s may be closing quotes. |
|
| 39 | */ |
|
| 40 | private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE = |
|
| 41 | new LexemeType[]{SPACE, HYPHEN, DASH, |
|
| 42 | QUOTE_DOUBLE, CLOSING_GROUP, EOL, EOP}; |
|
| 43 | ||
| 44 | /** |
|
| 45 | * Double quotes preceded by these {@link LexemeType}s may be opening quotes. |
|
| 46 | */ |
|
| 47 | private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE = |
|
| 48 | new LexemeType[]{SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL, |
|
| 49 | EOP}; |
|
| 50 | ||
| 51 | /** |
|
| 52 | * Double quotes succeeded by these {@link LexemeType}s may be opening quotes. |
|
| 53 | */ |
|
| 54 | private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE = |
|
| 55 | new LexemeType[]{WORD, NUMBER, ELLIPSIS, OPENING_GROUP, |
|
| 56 | QUOTE_SINGLE, QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING, QUOTE_DOUBLE}; |
|
| 57 | ||
| 58 | /** |
|
| 59 | * Double quotes preceded by these {@link LexemeType}s may be closing quotes. |
|
| 60 | */ |
|
| 61 | private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE = |
|
| 62 | new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS, CLOSING_GROUP, |
|
| 63 | QUOTE_SINGLE, QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING}; |
|
| 64 | ||
| 65 | /** |
|
| 66 | * Double quotes succeeded by these {@link LexemeType}s may be closing quotes. |
|
| 67 | */ |
|
| 68 | private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE = |
|
| 69 | new LexemeType[]{SPACE, PUNCT, PERIOD, EQUALS, HYPHEN, DASH, |
|
| 70 | QUOTE_SINGLE, CLOSING_GROUP, EOL, EOP}; |
|
| 71 | ||
| 72 | /** |
|
| 73 | * The text to parse. A reference is required as a minor optimization in |
|
| 74 | * memory and speed: the lexer records integer offsets, rather than new |
|
| 75 | * {@link String} instances, to track parsed lexemes. |
|
| 76 | */ |
|
| 77 | private final String mText; |
|
| 78 | ||
| 79 | /** |
|
| 80 | * Converts a string into an iterable list of {@link Lexeme} instances. |
|
| 81 | */ |
|
| 82 | private final Lexer mLexer; |
|
| 83 | ||
| 84 | /** |
|
| 85 | * Sets of contractions that help disambiguate single quotes in the text. |
|
| 86 | * These are effectively immutable while parsing. |
|
| 87 | */ |
|
| 88 | private final Contractions sContractions; |
|
| 89 | ||
| 90 | /** |
|
| 91 | * Contains each emitted opening single quote per paragraph. |
|
| 92 | */ |
|
| 93 | private final List<Lexeme> mOpeningSingleQuotes = new ArrayList<>(); |
|
| 94 | ||
| 95 | /** |
|
| 96 | * Contains each emitted closing single quote per paragraph. |
|
| 97 | */ |
|
| 98 | private final List<Lexeme> mClosingSingleQuotes = new ArrayList<>(); |
|
| 99 | ||
| 100 | /** |
|
| 101 | * Contains each emitted opening double quote per paragraph. |
|
| 102 | */ |
|
| 103 | private final List<Lexeme> mOpeningDoubleQuotes = new ArrayList<>(); |
|
| 104 | ||
| 105 | /** |
|
| 106 | * Contains each emitted closing double quote per paragraph. |
|
| 107 | */ |
|
| 108 | private final List<Lexeme> mClosingDoubleQuotes = new ArrayList<>(); |
|
| 109 | ||
| 110 | /** |
|
| 111 | * Constructs a new {@link Parser} using the given contraction sets |
|
| 112 | * to help resolve some ambiguous scenarios. |
|
| 113 | * |
|
| 114 | * @param text The prose to parse, containing zero or more quotation |
|
| 115 | * characters. |
|
| 116 | * @param contractions Custom sets of contractions to help resolve |
|
| 117 | * ambiguities. |
|
| 118 | */ |
|
| 119 | public Parser( final String text, final Contractions contractions ) { |
|
| 120 | mText = text; |
|
| 121 | mLexer = new Lexer( mText ); |
|
| 122 | sContractions = contractions; |
|
| 123 | } |
|
| 124 | ||
| 125 | /** |
|
| 126 | * Iterates over the entire text provided at construction, emitting |
|
| 127 | * {@link Token}s that can be used to convert straight quotes to curly |
|
| 128 | * quotes. |
|
| 129 | * |
|
| 130 | * @param tokenConsumer Receives emitted {@link Token}s. |
|
| 131 | */ |
|
| 132 | public void parse( |
|
| 133 | final Consumer<Token> tokenConsumer, |
|
| 134 | final Consumer<Lexeme> lexemeConsumer ) { |
|
| 135 | final var lexemes = new CircularFifoQueue<Lexeme>( 3 ); |
|
| 136 | ||
| 137 | // Allow consuming the very first token without needing a queue size check. |
|
| 138 | flush( lexemes ); |
|
| 139 | ||
| 140 | final var unresolved = new ArrayList<Lexeme[]>(); |
|
| 141 | Lexeme lexeme; |
|
| 142 | ||
| 143 | // Create and convert a list of all unambiguous quote characters. |
|
| 144 | while( (lexeme = mLexer.next()) != EOT ) { |
|
| 145 | // Reset after tokenizing a paragraph. |
|
| 146 | if( tokenize( lexeme, lexemes, tokenConsumer, unresolved ) ) { |
|
| 147 | // Attempt to resolve any remaining unambiguous quotes. |
|
| 148 | resolve( unresolved, tokenConsumer ); |
|
| 149 | ||
| 150 | // Notify of any unambiguous quotes that could not be resolved. |
|
| 151 | unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) ); |
|
| 152 | unresolved.clear(); |
|
| 153 | mOpeningSingleQuotes.clear(); |
|
| 154 | mClosingSingleQuotes.clear(); |
|
| 155 | mOpeningDoubleQuotes.clear(); |
|
| 156 | mClosingDoubleQuotes.clear(); |
|
| 157 | } |
|
| 158 | } |
|
| 159 | ||
| 160 | // By loop's end, the lexemes list contains tokens for all except the |
|
| 161 | // final two elements (from tokenizing in triplets). Tokenize the remaining |
|
| 162 | // unprocessed lexemes. |
|
| 163 | tokenize( EOT, lexemes, tokenConsumer, unresolved ); |
|
| 164 | tokenize( EOT, lexemes, tokenConsumer, unresolved ); |
|
| 165 | ||
| 166 | // Attempt to resolve any remaining unambiguous quotes. |
|
| 167 | resolve( unresolved, tokenConsumer ); |
|
| 168 | ||
| 169 | // Notify of any unambiguous quotes that could not be resolved. |
|
| 170 | unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) ); |
|
| 171 | } |
|
| 172 | ||
| 173 | /** |
|
| 174 | * Converts {@link Lexeme}s identified as straight quotes into {@link Token}s |
|
| 175 | * that represent the curly equivalent. The {@link Token}s are passed to |
|
| 176 | * the given {@link Consumer} for further processing (e.g., replaced in |
|
| 177 | * the original text being parsed). |
|
| 178 | * |
|
| 179 | * @param lexeme A part of the text being parsed. |
|
| 180 | * @param lexemes A 3-element queue of lexemes that provide sufficient |
|
| 181 | * context to identify curly quotes. |
|
| 182 | * @param consumer Recipient of equivalent quotes. |
|
| 183 | * @param unresolved Rolling list of potentially ambiguous {@link Lexeme}s |
|
| 184 | * that could not be tokenized, yet. |
|
| 185 | * @return {@code true} if an end-of-paragraph is detected. |
|
| 186 | */ |
|
| 187 | private boolean tokenize( final Lexeme lexeme, |
|
| 188 | final CircularFifoQueue<Lexeme> lexemes, |
|
| 189 | final Consumer<Token> consumer, |
|
| 190 | final List<Lexeme[]> unresolved ) { |
|
| 191 | // Add the next lexeme to tokenize into the queue for immediate processing. |
|
| 192 | lexemes.add( lexeme ); |
|
| 193 | ||
| 194 | final var lex1 = lexemes.get( 0 ); |
|
| 195 | final var lex2 = lexemes.get( 1 ); |
|
| 196 | final var lex3 = lexemes.get( 2 ); |
|
| 197 | ||
| 198 | if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( WORD ) && |
|
| 199 | lex1.isType( WORD, PERIOD, NUMBER ) ) { |
|
| 200 | // Examples: y'all, Ph.D.'ll, 20's, she's |
|
| 201 | consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); |
|
| 202 | } |
|
| 203 | else if( lex1.isType( QUOTE_SINGLE ) && lex3.isType( QUOTE_SINGLE ) && |
|
| 204 | "n".equalsIgnoreCase( lex2.toString( mText ) ) ) { |
|
| 205 | // I.e., 'n' |
|
| 206 | consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) ); |
|
| 207 | consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) ); |
|
| 208 | flush( lexemes ); |
|
| 209 | truncate( unresolved ); |
|
| 210 | } |
|
| 211 | else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) { |
|
| 212 | if( lex3.isType( QUOTE_SINGLE ) ) { |
|
| 213 | // E.g., 2'' |
|
| 214 | consumer.accept( |
|
| 215 | new Token( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ) ); |
|
| 216 | flush( lexemes ); |
|
| 217 | } |
|
| 218 | else { |
|
| 219 | // E.g., 2' |
|
| 220 | consumer.accept( new Token( QUOTE_PRIME_SINGLE, lex2 ) ); |
|
| 221 | } |
|
| 222 | } |
|
| 223 | else if( lex2.isType( QUOTE_DOUBLE ) && lex1.isType( NUMBER ) ) { |
|
| 224 | // E.g., 2" |
|
| 225 | consumer.accept( new Token( QUOTE_PRIME_DOUBLE, lex2 ) ); |
|
| 226 | } |
|
| 227 | else if( lex2.isType( WORD ) && lex3.isType( QUOTE_SINGLE ) && |
|
| 228 | sContractions.endedUnambiguously( lex2.toString( mText ) ) ) { |
|
| 229 | // E.g., thinkin' |
|
| 230 | consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) ); |
|
| 231 | flush( lexemes ); |
|
| 232 | } |
|
| 233 | else if( lex2.isType( NUMBER ) && lex1.isType( QUOTE_SINGLE ) ) { |
|
| 234 | // Sentences must be re-written to avoid starting with numerals. |
|
| 235 | if( lex3.isType( SPACE, PUNCT ) || (lex3.isType( WORD ) && |
|
| 236 | lex3.toString( mText ).equalsIgnoreCase( "s" )) ) { |
|
| 237 | // Examples: '20s, '02 |
|
| 238 | consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) ); |
|
| 239 | } |
|
| 240 | else { |
|
| 241 | // E.g., '2'' |
|
| 242 | consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex1 ) ); |
|
| 243 | mOpeningSingleQuotes.add( lex1 ); |
|
| 244 | } |
|
| 245 | ||
| 246 | truncate( unresolved ); |
|
| 247 | } |
|
| 248 | else if( lex2.isType( QUOTE_SINGLE ) && |
|
| 249 | lex1.isType( PUNCT, PERIOD, ELLIPSIS, DASH ) && |
|
| 250 | (lex3.isType( EOL, EOP ) || lex3.isEot()) ) { |
|
| 251 | consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); |
|
| 252 | mClosingSingleQuotes.add( lex2 ); |
|
| 253 | } |
|
| 254 | else if( lex1.isType( ESC_SINGLE ) ) { |
|
| 255 | // E.g., \' |
|
| 256 | consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) ); |
|
| 257 | } |
|
| 258 | else if( lex1.isType( ESC_DOUBLE ) ) { |
|
| 259 | // E.g., \" |
|
| 260 | consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) ); |
|
| 261 | ||
| 262 | if( lex2.isType( QUOTE_SINGLE ) && |
|
| 263 | (lex3.isEot() || lex3.isType( SPACE, DASH, EOL, EOP )) ) { |
|
| 264 | consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); |
|
| 265 | mClosingSingleQuotes.add( lex2 ); |
|
| 266 | } |
|
| 267 | } |
|
| 268 | else if( lex2.isType( QUOTE_DOUBLE ) && |
|
| 269 | (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_DOUBLE )) && |
|
| 270 | lex3.isType( LAGGING_QUOTE_OPENING_DOUBLE ) ) { |
|
| 271 | // Examples: "", "..., "word, ---"word |
|
| 272 | consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) ); |
|
| 273 | mOpeningDoubleQuotes.add( lex2 ); |
|
| 274 | } |
|
| 275 | else if( lex2.isType( QUOTE_DOUBLE ) && |
|
| 276 | lex1.isType( LEADING_QUOTE_CLOSING_DOUBLE ) && |
|
| 277 | (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) { |
|
| 278 | // Examples: ..."', word"', ?"', word"? |
|
| 279 | consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) ); |
|
| 280 | mClosingDoubleQuotes.add( lex2 ); |
|
| 281 | } |
|
| 282 | else if( lex1.isType( WORD ) && lex2.isType( QUOTE_SINGLE ) && |
|
| 283 | lex3.isType( PUNCT, PERIOD ) ) { |
|
| 284 | // E.g., word', (contraction ruled out by previous conditions) |
|
| 285 | consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); |
|
| 286 | mClosingSingleQuotes.add( lex2 ); |
|
| 287 | } |
|
| 288 | else if( lex2.isType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) { |
|
| 289 | // After tokenizing, the parser will attempt to resolve ambiguities. |
|
| 290 | unresolved.add( new Lexeme[]{lex1, lex2, lex3} ); |
|
| 291 | } |
|
| 292 | ||
| 293 | // Suggest to the caller that resolution should be performed. This allows |
|
| 294 | // the algorithm to reset the opening/closing quote balance before the |
|
| 295 | // next paragraph is parsed. |
|
| 296 | return lex3.isType( EOP ); |
|
| 297 | } |
|
| 298 | ||
| 299 | private void resolve( |
|
| 300 | final List<Lexeme[]> unresolved, final Consumer<Token> consumer ) { |
|
| 301 | // Some non-emitted tokenized lexemes may be ambiguous. |
|
| 302 | final var ambiguousLeadingQuotes = new ArrayList<Lexeme[]>( 16 ); |
|
| 303 | final var ambiguousLaggingQuotes = new ArrayList<Lexeme[]>( 16 ); |
|
| 304 | var resolvedLeadingQuotes = 0; |
|
| 305 | var resolvedLaggingQuotes = 0; |
|
| 306 | ||
| 307 | // Count the number of ambiguous and non-ambiguous open single quotes. |
|
| 308 | for( var i = unresolved.iterator(); i.hasNext(); ) { |
|
| 309 | final var quotes = i.next(); |
|
| 310 | final var lex1 = quotes[ 0 ]; |
|
| 311 | final var lex2 = quotes[ 1 ]; |
|
| 312 | final var lex3 = quotes[ 2 ]; |
|
| 313 | ||
| 314 | if( lex2.isType( QUOTE_SINGLE ) ) { |
|
| 315 | final var word1 = lex1 == SOT ? "" : lex1.toString( mText ); |
|
| 316 | final var word3 = lex3 == EOT ? "" : lex3.toString( mText ); |
|
| 317 | ||
| 318 | if( sContractions.beganAmbiguously( word3 ) ) { |
|
| 319 | // E.g., 'Cause |
|
| 320 | if( lex1.isType( QUOTE_SINGLE ) ) { |
|
| 321 | // E.g., ''Cause |
|
| 322 | consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); |
|
| 323 | i.remove(); |
|
| 324 | } |
|
| 325 | else { |
|
| 326 | // The contraction is uncertain until a closing quote is found that |
|
| 327 | // may balance this single quote. |
|
| 328 | ambiguousLeadingQuotes.add( quotes ); |
|
| 329 | } |
|
| 330 | } |
|
| 331 | else if( sContractions.beganUnambiguously( word3 ) ) { |
|
| 332 | // The quote mark forms a word that does not stand alone from its |
|
| 333 | // contraction. For example, twas is not a word: it's 'twas. |
|
| 334 | consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); |
|
| 335 | i.remove(); |
|
| 336 | } |
|
| 337 | else if( sContractions.endedAmbiguously( word1 ) ) { |
|
| 338 | ambiguousLaggingQuotes.add( quotes ); |
|
| 339 | } |
|
| 340 | else if( (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_SINGLE )) && |
|
| 341 | lex3.isType( LAGGING_QUOTE_OPENING_SINGLE ) ) { |
|
| 342 | consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) ); |
|
| 343 | resolvedLeadingQuotes++; |
|
| 344 | mOpeningSingleQuotes.add( lex2 ); |
|
| 345 | i.remove(); |
|
| 346 | } |
|
| 347 | else if( lex1.isType( LEADING_QUOTE_CLOSING_SINGLE ) && |
|
| 348 | (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_SINGLE )) ) { |
|
| 349 | consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); |
|
| 350 | resolvedLaggingQuotes++; |
|
| 351 | mClosingSingleQuotes.add( lex2 ); |
|
| 352 | i.remove(); |
|
| 353 | } |
|
| 354 | else if( lex3.isType( NUMBER ) ) { |
|
| 355 | // E.g., '04 |
|
| 356 | ambiguousLeadingQuotes.add( quotes ); |
|
| 357 | } |
|
| 358 | } |
|
| 359 | } |
|
| 360 | ||
| 361 | sort( mOpeningSingleQuotes ); |
|
| 362 | sort( mClosingSingleQuotes ); |
|
| 363 | sort( mOpeningDoubleQuotes ); |
|
| 364 | sort( mClosingDoubleQuotes ); |
|
| 365 | ||
| 366 | final var singleQuoteEmpty = |
|
| 367 | mOpeningSingleQuotes.isEmpty() || mClosingSingleQuotes.isEmpty(); |
|
| 368 | final var doubleQuoteEmpty = |
|
| 369 | mOpeningDoubleQuotes.isEmpty() || mClosingDoubleQuotes.isEmpty(); |
|
| 370 | ||
| 371 | final var singleQuoteDelta = abs( |
|
| 372 | mClosingSingleQuotes.size() - mOpeningSingleQuotes.size() |
|
| 373 | ); |
|
| 374 | ||
| 375 | final var doubleQuoteDelta = abs( |
|
| 376 | mClosingDoubleQuotes.size() - mOpeningDoubleQuotes.size() |
|
| 377 | ); |
|
| 378 | ||
| 379 | final var ambiguousLeadingCount = ambiguousLeadingQuotes.size(); |
|
| 380 | final var ambiguousLaggingCount = ambiguousLaggingQuotes.size(); |
|
| 381 | ||
| 382 | if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) { |
|
| 383 | if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) { |
|
| 384 | final var balanced = singleQuoteDelta == 0; |
|
| 385 | final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE; |
|
| 386 | final var lex = ambiguousLaggingQuotes.get( 0 ); |
|
| 387 | consumer.accept( new Token( quote, lex[ 1 ] ) ); |
|
| 388 | unresolved.remove( lex ); |
|
| 389 | } |
|
| 390 | else if( ambiguousLeadingCount == 0 && unresolved.size() == 1 ) { |
|
| 391 | // Must be a closing quote. |
|
| 392 | final var closing = unresolved.get( 0 ); |
|
| 393 | consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); |
|
| 394 | unresolved.remove( closing ); |
|
| 395 | } |
|
| 396 | } |
|
| 397 | else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) { |
|
| 398 | // If there are no ambiguous leading quotes then all ambiguous lagging |
|
| 399 | // quotes must be contractions. |
|
| 400 | ambiguousLaggingQuotes.forEach( |
|
| 401 | lex -> { |
|
| 402 | consumer.accept( new Token( QUOTE_APOSTROPHE, lex[ 1 ] ) ); |
|
| 403 | unresolved.remove( lex ); |
|
| 404 | } |
|
| 405 | ); |
|
| 406 | } |
|
| 407 | else if( mOpeningSingleQuotes.size() == 0 && |
|
| 408 | mClosingSingleQuotes.size() == 1 && !unresolved.isEmpty() ) { |
|
| 409 | final var opening = unresolved.get( 0 ); |
|
| 410 | consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) ); |
|
| 411 | unresolved.remove( opening ); |
|
| 412 | } |
|
| 413 | else if( ambiguousLeadingCount == 0 ) { |
|
| 414 | if( resolvedLaggingQuotes < resolvedLeadingQuotes ) { |
|
| 415 | for( final var i = unresolved.iterator(); i.hasNext(); ) { |
|
| 416 | final var closing = i.next()[ 1 ]; |
|
| 417 | consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing ) ); |
|
| 418 | i.remove(); |
|
| 419 | } |
|
| 420 | } |
|
| 421 | else if( singleQuoteDelta == unresolved.size() ) { |
|
| 422 | for( final var i = unresolved.iterator(); i.hasNext(); ) { |
|
| 423 | final var closing = i.next(); |
|
| 424 | consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); |
|
| 425 | i.remove(); |
|
| 426 | } |
|
| 427 | } |
|
| 428 | else if( unresolved.size() == 2 ) { |
|
| 429 | final var closing = unresolved.get( 0 ); |
|
| 430 | final var opening = unresolved.get( 1 ); |
|
| 431 | consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); |
|
| 432 | consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) ); |
|
| 433 | ||
| 434 | // Doesn't affect the algorithm. |
|
| 435 | unresolved.clear(); |
|
| 436 | } |
|
| 437 | } |
|
| 438 | else if( (singleQuoteDelta == 0 && !singleQuoteEmpty) || |
|
| 439 | (doubleQuoteDelta == 0 && !doubleQuoteEmpty) ) { |
|
| 440 | // An apostrophe stands betwixt opening/closing single quotes. |
|
| 441 | for( final var lexemes = unresolved.iterator(); lexemes.hasNext(); ) { |
|
| 442 | final var quote = lexemes.next()[ 1 ]; |
|
| 443 | ||
| 444 | for( int i = 0; i < mOpeningSingleQuotes.size(); i++ ) { |
|
| 445 | // An apostrophe must fall between an open/close pair. |
|
| 446 | final var openingQuote = mOpeningSingleQuotes.get( i ); |
|
| 447 | final var closingQuote = mClosingSingleQuotes.get( i ); |
|
| 448 | ||
| 449 | if( openingQuote.before( quote ) && closingQuote.after( quote ) ) { |
|
| 450 | consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) ); |
|
| 451 | lexemes.remove(); |
|
| 452 | } |
|
| 453 | } |
|
| 454 | } |
|
| 455 | ||
| 456 | // An apostrophe stands betwixt opening/closing double quotes. |
|
| 457 | for( final var lexemes = unresolved.iterator(); lexemes.hasNext(); ) { |
|
| 458 | final var quote = lexemes.next()[ 1 ]; |
|
| 459 | ||
| 460 | for( int i = 0; i < mOpeningDoubleQuotes.size(); i++ ) { |
|
| 461 | // An apostrophe must fall between an open/close pair. |
|
| 462 | final var openingQuote = mOpeningDoubleQuotes.get( i ); |
|
| 463 | final var closingQuote = mClosingDoubleQuotes.get( i ); |
|
| 464 | ||
| 465 | if( openingQuote.before( quote ) && closingQuote.after( quote ) ) { |
|
| 466 | consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) ); |
|
| 467 | lexemes.remove(); |
|
| 468 | } |
|
| 469 | } |
|
| 470 | } |
|
| 471 | } |
|
| 472 | else if( ambiguousLeadingCount == 1 && resolvedLaggingQuotes == 1 ) { |
|
| 473 | final var opening = ambiguousLeadingQuotes.get( 0 ); |
|
| 474 | consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) ); |
|
| 475 | unresolved.remove( opening ); |
|
| 476 | } |
|
| 18 | public class Parser { |
|
| 19 | /** |
|
| 20 | * Single quotes preceded by these {@link LexemeType}s may be opening quotes. |
|
| 21 | */ |
|
| 22 | private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE = |
|
| 23 | new LexemeType[]{SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP}; |
|
| 24 | ||
| 25 | /** |
|
| 26 | * Single quotes succeeded by these {@link LexemeType}s may be opening quotes. |
|
| 27 | */ |
|
| 28 | private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE = |
|
| 29 | new LexemeType[]{WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE}; |
|
| 30 | ||
| 31 | /** |
|
| 32 | * Single quotes preceded by these {@link LexemeType}s may be closing quotes. |
|
| 33 | */ |
|
| 34 | private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE = |
|
| 35 | new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE}; |
|
| 36 | ||
| 37 | /** |
|
| 38 | * Single quotes succeeded by these {@link LexemeType}s may be closing quotes. |
|
| 39 | */ |
|
| 40 | private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE = |
|
| 41 | new LexemeType[]{SPACE, HYPHEN, DASH, |
|
| 42 | QUOTE_DOUBLE, CLOSING_GROUP, EOL, EOP}; |
|
| 43 | ||
| 44 | /** |
|
| 45 | * Double quotes preceded by these {@link LexemeType}s may be opening quotes. |
|
| 46 | */ |
|
| 47 | private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE = |
|
| 48 | new LexemeType[]{SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL, |
|
| 49 | EOP}; |
|
| 50 | ||
| 51 | /** |
|
| 52 | * Double quotes succeeded by these {@link LexemeType}s may be opening quotes. |
|
| 53 | */ |
|
| 54 | private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE = |
|
| 55 | new LexemeType[]{WORD, NUMBER, ELLIPSIS, OPENING_GROUP, |
|
| 56 | QUOTE_SINGLE, QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING, QUOTE_DOUBLE}; |
|
| 57 | ||
| 58 | /** |
|
| 59 | * Double quotes preceded by these {@link LexemeType}s may be closing quotes. |
|
| 60 | */ |
|
| 61 | private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE = |
|
| 62 | new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS, CLOSING_GROUP, |
|
| 63 | QUOTE_SINGLE, QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING}; |
|
| 64 | ||
| 65 | /** |
|
| 66 | * Double quotes succeeded by these {@link LexemeType}s may be closing quotes. |
|
| 67 | */ |
|
| 68 | private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE = |
|
| 69 | new LexemeType[]{SPACE, PUNCT, PERIOD, EQUALS, HYPHEN, DASH, |
|
| 70 | QUOTE_SINGLE, CLOSING_GROUP, EOL, EOP}; |
|
| 71 | ||
| 72 | /** |
|
| 73 | * The text to parse. A reference is required as a minor optimization in |
|
| 74 | * memory and speed: the lexer records integer offsets, rather than new |
|
| 75 | * {@link String} instances, to track parsed lexemes. |
|
| 76 | */ |
|
| 77 | private final String mText; |
|
| 78 | ||
| 79 | /** |
|
| 80 | * Converts a string into an iterable list of {@link Lexeme} instances. |
|
| 81 | */ |
|
| 82 | private final Lexer mLexer; |
|
| 83 | ||
| 84 | /** |
|
| 85 | * Sets of contractions that help disambiguate single quotes in the text. |
|
| 86 | * These are effectively immutable while parsing. |
|
| 87 | */ |
|
| 88 | private final Contractions sContractions; |
|
| 89 | ||
| 90 | /** |
|
| 91 | * Contains each emitted opening single quote per paragraph. |
|
| 92 | */ |
|
| 93 | private final List<Lexeme> mOpeningSingleQuotes = new ArrayList<>(); |
|
| 94 | ||
| 95 | /** |
|
| 96 | * Contains each emitted closing single quote per paragraph. |
|
| 97 | */ |
|
| 98 | private final List<Lexeme> mClosingSingleQuotes = new ArrayList<>(); |
|
| 99 | ||
| 100 | /** |
|
| 101 | * Contains each emitted opening double quote per paragraph. |
|
| 102 | */ |
|
| 103 | private final List<Lexeme> mOpeningDoubleQuotes = new ArrayList<>(); |
|
| 104 | ||
| 105 | /** |
|
| 106 | * Contains each emitted closing double quote per paragraph. |
|
| 107 | */ |
|
| 108 | private final List<Lexeme> mClosingDoubleQuotes = new ArrayList<>(); |
|
| 109 | ||
| 110 | /** |
|
| 111 | * Constructs a new {@link Parser} using the given contraction sets |
|
| 112 | * to help resolve some ambiguous scenarios. |
|
| 113 | * |
|
| 114 | * @param text The prose to parse, containing zero or more quotation |
|
| 115 | * characters. |
|
| 116 | * @param contractions Custom sets of contractions to help resolve |
|
| 117 | * ambiguities. |
|
| 118 | */ |
|
| 119 | public Parser( final String text, final Contractions contractions ) { |
|
| 120 | mText = text; |
|
| 121 | mLexer = createLexer( mText ); |
|
| 122 | sContractions = contractions; |
|
| 123 | } |
|
| 124 | ||
| 125 | /** |
|
| 126 | * Iterates over the entire text provided at construction, emitting |
|
| 127 | * {@link Token}s that can be used to convert straight quotes to curly |
|
| 128 | * quotes. |
|
| 129 | * |
|
| 130 | * @param tokenConsumer Receives emitted {@link Token}s. |
|
| 131 | */ |
|
| 132 | public void parse( |
|
| 133 | final Consumer<Token> tokenConsumer, |
|
| 134 | final Consumer<Lexeme> lexemeConsumer ) { |
|
| 135 | final var lexemes = new CircularFifoQueue<Lexeme>( 3 ); |
|
| 136 | ||
| 137 | // Allow consuming the very first token without needing a queue size check. |
|
| 138 | flush( lexemes ); |
|
| 139 | ||
| 140 | final var unresolved = new ArrayList<Lexeme[]>(); |
|
| 141 | Lexeme lexeme; |
|
| 142 | ||
| 143 | // Create and convert a list of all unambiguous quote characters. |
|
| 144 | while( (lexeme = mLexer.next()) != EOT ) { |
|
| 145 | // Reset after tokenizing a paragraph. |
|
| 146 | if( tokenize( lexeme, lexemes, tokenConsumer, unresolved ) ) { |
|
| 147 | // Attempt to resolve any remaining ambiguous quotes. |
|
| 148 | resolve( unresolved, tokenConsumer ); |
|
| 149 | ||
| 150 | // Notify of any ambiguous quotes that could not be resolved. |
|
| 151 | unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) ); |
|
| 152 | unresolved.clear(); |
|
| 153 | mOpeningSingleQuotes.clear(); |
|
| 154 | mClosingSingleQuotes.clear(); |
|
| 155 | mOpeningDoubleQuotes.clear(); |
|
| 156 | mClosingDoubleQuotes.clear(); |
|
| 157 | } |
|
| 158 | } |
|
| 159 | ||
| 160 | // By loop's end, the lexemes list contains tokens for all except the |
|
| 161 | // final two elements (from tokenizing in triplets). Tokenize the remaining |
|
| 162 | // unprocessed lexemes. |
|
| 163 | tokenize( EOT, lexemes, tokenConsumer, unresolved ); |
|
| 164 | tokenize( EOT, lexemes, tokenConsumer, unresolved ); |
|
| 165 | ||
| 166 | // Attempt to resolve any remaining ambiguous quotes. |
|
| 167 | resolve( unresolved, tokenConsumer ); |
|
| 168 | ||
| 169 | // Notify of any ambiguous quotes that could not be resolved. |
|
| 170 | unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) ); |
|
| 171 | } |
|
| 172 | ||
| 173 | /** |
|
| 174 | * Converts {@link Lexeme}s identified as straight quotes into {@link Token}s |
|
| 175 | * that represent the curly equivalent. The {@link Token}s are passed to |
|
| 176 | * the given {@link Consumer} for further processing (e.g., replaced in |
|
| 177 | * the original text being parsed). |
|
| 178 | * |
|
| 179 | * @param lexeme A part of the text being parsed. |
|
| 180 | * @param lexemes A 3-element queue of lexemes that provide sufficient |
|
| 181 | * context to identify curly quotes. |
|
| 182 | * @param consumer Recipient of equivalent quotes. |
|
| 183 | * @param unresolved Rolling list of potentially ambiguous {@link Lexeme}s |
|
| 184 | * that could not be tokenized, yet. |
|
| 185 | * @return {@code true} if an end-of-paragraph is detected. |
|
| 186 | */ |
|
| 187 | private boolean tokenize( final Lexeme lexeme, |
|
| 188 | final CircularFifoQueue<Lexeme> lexemes, |
|
| 189 | final Consumer<Token> consumer, |
|
| 190 | final List<Lexeme[]> unresolved ) { |
|
| 191 | // Add the next lexeme to tokenize into the queue for immediate processing. |
|
| 192 | lexemes.add( lexeme ); |
|
| 193 | ||
| 194 | final var lex1 = lexemes.get( 0 ); |
|
| 195 | final var lex2 = lexemes.get( 1 ); |
|
| 196 | final var lex3 = lexemes.get( 2 ); |
|
| 197 | ||
| 198 | if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( WORD ) && |
|
| 199 | lex1.isType( WORD, PERIOD, NUMBER ) ) { |
|
| 200 | // Examples: y'all, Ph.D.'ll, 20's, she's |
|
| 201 | consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); |
|
| 202 | } |
|
| 203 | else if( lex1.isType( QUOTE_SINGLE ) && lex3.isType( QUOTE_SINGLE ) && |
|
| 204 | "n".equalsIgnoreCase( lex2.toString( mText ) ) ) { |
|
| 205 | // I.e., 'n' |
|
| 206 | consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) ); |
|
| 207 | consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) ); |
|
| 208 | flush( lexemes ); |
|
| 209 | truncate( unresolved ); |
|
| 210 | } |
|
| 211 | else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) { |
|
| 212 | if( lex3.isType( QUOTE_SINGLE ) ) { |
|
| 213 | // E.g., 2'' |
|
| 214 | consumer.accept( |
|
| 215 | new Token( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ) ); |
|
| 216 | flush( lexemes ); |
|
| 217 | } |
|
| 218 | else { |
|
| 219 | // E.g., 2' |
|
| 220 | consumer.accept( new Token( QUOTE_PRIME_SINGLE, lex2 ) ); |
|
| 221 | } |
|
| 222 | } |
|
| 223 | else if( lex2.isType( QUOTE_DOUBLE ) && lex1.isType( NUMBER ) ) { |
|
| 224 | // E.g., 2" |
|
| 225 | consumer.accept( new Token( QUOTE_PRIME_DOUBLE, lex2 ) ); |
|
| 226 | } |
|
| 227 | else if( lex2.isType( WORD ) && lex3.isType( QUOTE_SINGLE ) && |
|
| 228 | sContractions.endedUnambiguously( lex2.toString( mText ) ) ) { |
|
| 229 | // E.g., thinkin' |
|
| 230 | consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) ); |
|
| 231 | flush( lexemes ); |
|
| 232 | } |
|
| 233 | else if( lex2.isType( NUMBER ) && lex1.isType( QUOTE_SINGLE ) ) { |
|
| 234 | // Sentences must be re-written to avoid starting with numerals. |
|
| 235 | if( lex3.isType( SPACE, PUNCT ) || (lex3.isType( WORD ) && |
|
| 236 | lex3.toString( mText ).equalsIgnoreCase( "s" )) ) { |
|
| 237 | // Examples: '20s, '02 |
|
| 238 | consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) ); |
|
| 239 | } |
|
| 240 | else { |
|
| 241 | // E.g., '2'' |
|
| 242 | consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex1 ) ); |
|
| 243 | mOpeningSingleQuotes.add( lex1 ); |
|
| 244 | } |
|
| 245 | ||
| 246 | truncate( unresolved ); |
|
| 247 | } |
|
| 248 | else if( lex2.isType( QUOTE_SINGLE ) && |
|
| 249 | lex1.isType( PUNCT, PERIOD, ELLIPSIS, DASH ) && |
|
| 250 | (lex3.isType( EOL, EOP ) || lex3.isEot()) ) { |
|
| 251 | consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); |
|
| 252 | mClosingSingleQuotes.add( lex2 ); |
|
| 253 | } |
|
| 254 | else if( lex1.isType( ESC_SINGLE ) ) { |
|
| 255 | // E.g., \' |
|
| 256 | consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) ); |
|
| 257 | } |
|
| 258 | else if( lex1.isType( ESC_DOUBLE ) ) { |
|
| 259 | // E.g., \" |
|
| 260 | consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) ); |
|
| 261 | ||
| 262 | if( lex2.isType( QUOTE_SINGLE ) && |
|
| 263 | (lex3.isEot() || lex3.isType( SPACE, DASH, EOL, EOP )) ) { |
|
| 264 | consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); |
|
| 265 | mClosingSingleQuotes.add( lex2 ); |
|
| 266 | } |
|
| 267 | } |
|
| 268 | else if( lex2.isType( QUOTE_DOUBLE ) && |
|
| 269 | (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_DOUBLE )) && |
|
| 270 | lex3.isType( LAGGING_QUOTE_OPENING_DOUBLE ) ) { |
|
| 271 | // Examples: "", "..., "word, ---"word |
|
| 272 | consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) ); |
|
| 273 | mOpeningDoubleQuotes.add( lex2 ); |
|
| 274 | } |
|
| 275 | else if( lex2.isType( QUOTE_DOUBLE ) && |
|
| 276 | lex1.isType( LEADING_QUOTE_CLOSING_DOUBLE ) && |
|
| 277 | (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) { |
|
| 278 | // Examples: ..."', word"', ?"', word"? |
|
| 279 | consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) ); |
|
| 280 | mClosingDoubleQuotes.add( lex2 ); |
|
| 281 | } |
|
| 282 | else if( lex1.isType( WORD ) && lex2.isType( QUOTE_SINGLE ) && |
|
| 283 | lex3.isType( PUNCT, PERIOD ) ) { |
|
| 284 | // E.g., word', (contraction ruled out by previous conditions) |
|
| 285 | consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); |
|
| 286 | mClosingSingleQuotes.add( lex2 ); |
|
| 287 | } |
|
| 288 | else if( lex2.isType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) { |
|
| 289 | // After tokenizing, the parser will attempt to resolve ambiguities. |
|
| 290 | unresolved.add( new Lexeme[]{lex1, lex2, lex3} ); |
|
| 291 | } |
|
| 292 | ||
| 293 | // Suggest to the caller that resolution should be performed. This allows |
|
| 294 | // the algorithm to reset the opening/closing quote balance before the |
|
| 295 | // next paragraph is parsed. |
|
| 296 | return lex3.isType( EOP ); |
|
| 297 | } |
|
| 298 | ||
| 299 | private void resolve( |
|
| 300 | final List<Lexeme[]> unresolved, final Consumer<Token> consumer ) { |
|
| 301 | // Some non-emitted tokenized lexemes may be ambiguous. |
|
| 302 | final var ambiguousLeadingQuotes = new ArrayList<Lexeme[]>( 16 ); |
|
| 303 | final var ambiguousLaggingQuotes = new ArrayList<Lexeme[]>( 16 ); |
|
| 304 | var resolvedLeadingQuotes = 0; |
|
| 305 | var resolvedLaggingQuotes = 0; |
|
| 306 | ||
| 307 | // Count the number of ambiguous and non-ambiguous open single quotes. |
|
| 308 | for( var i = unresolved.iterator(); i.hasNext(); ) { |
|
| 309 | final var quotes = i.next(); |
|
| 310 | final var lex1 = quotes[ 0 ]; |
|
| 311 | final var lex2 = quotes[ 1 ]; |
|
| 312 | final var lex3 = quotes[ 2 ]; |
|
| 313 | ||
| 314 | if( lex2.isType( QUOTE_SINGLE ) ) { |
|
| 315 | final var word1 = lex1 == SOT ? "" : lex1.toString( mText ); |
|
| 316 | final var word3 = lex3 == EOT ? "" : lex3.toString( mText ); |
|
| 317 | ||
| 318 | if( sContractions.beganAmbiguously( word3 ) ) { |
|
| 319 | // E.g., 'Cause |
|
| 320 | if( lex1.isType( QUOTE_SINGLE ) ) { |
|
| 321 | // E.g., ''Cause |
|
| 322 | consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); |
|
| 323 | i.remove(); |
|
| 324 | } |
|
| 325 | else { |
|
| 326 | // The contraction is uncertain until a closing quote is found that |
|
| 327 | // may balance this single quote. |
|
| 328 | ambiguousLeadingQuotes.add( quotes ); |
|
| 329 | } |
|
| 330 | } |
|
| 331 | else if( sContractions.beganUnambiguously( word3 ) ) { |
|
| 332 | // The quote mark forms a word that does not stand alone from its |
|
| 333 | // contraction. For example, twas is not a word: it's 'twas. |
|
| 334 | consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); |
|
| 335 | i.remove(); |
|
| 336 | } |
|
| 337 | else if( sContractions.endedAmbiguously( word1 ) ) { |
|
| 338 | ambiguousLaggingQuotes.add( quotes ); |
|
| 339 | } |
|
| 340 | else if( (lex1.isSot() || lex1.isType( LEADING_QUOTE_OPENING_SINGLE )) && |
|
| 341 | lex3.isType( LAGGING_QUOTE_OPENING_SINGLE ) ) { |
|
| 342 | consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) ); |
|
| 343 | resolvedLeadingQuotes++; |
|
| 344 | mOpeningSingleQuotes.add( lex2 ); |
|
| 345 | i.remove(); |
|
| 346 | } |
|
| 347 | else if( lex1.isType( LEADING_QUOTE_CLOSING_SINGLE ) && |
|
| 348 | (lex3.isEot() || lex3.isType( LAGGING_QUOTE_CLOSING_SINGLE )) ) { |
|
| 349 | consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); |
|
| 350 | resolvedLaggingQuotes++; |
|
| 351 | mClosingSingleQuotes.add( lex2 ); |
|
| 352 | i.remove(); |
|
| 353 | } |
|
| 354 | else if( lex3.isType( NUMBER ) ) { |
|
| 355 | // E.g., '04 |
|
| 356 | ambiguousLeadingQuotes.add( quotes ); |
|
| 357 | } |
|
| 358 | } |
|
| 359 | } |
|
| 360 | ||
| 361 | sort( mOpeningSingleQuotes ); |
|
| 362 | sort( mClosingSingleQuotes ); |
|
| 363 | sort( mOpeningDoubleQuotes ); |
|
| 364 | sort( mClosingDoubleQuotes ); |
|
| 365 | ||
| 366 | final var singleQuoteEmpty = |
|
| 367 | mOpeningSingleQuotes.isEmpty() || mClosingSingleQuotes.isEmpty(); |
|
| 368 | final var doubleQuoteEmpty = |
|
| 369 | mOpeningDoubleQuotes.isEmpty() || mClosingDoubleQuotes.isEmpty(); |
|
| 370 | ||
| 371 | final var singleQuoteDelta = abs( |
|
| 372 | mClosingSingleQuotes.size() - mOpeningSingleQuotes.size() |
|
| 373 | ); |
|
| 374 | ||
| 375 | final var doubleQuoteDelta = abs( |
|
| 376 | mClosingDoubleQuotes.size() - mOpeningDoubleQuotes.size() |
|
| 377 | ); |
|
| 378 | ||
| 379 | final var ambiguousLeadingCount = ambiguousLeadingQuotes.size(); |
|
| 380 | final var ambiguousLaggingCount = ambiguousLaggingQuotes.size(); |
|
| 381 | ||
| 382 | if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) { |
|
| 383 | if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) { |
|
| 384 | final var balanced = singleQuoteDelta == 0; |
|
| 385 | final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE; |
|
| 386 | final var lex = ambiguousLaggingQuotes.get( 0 ); |
|
| 387 | consumer.accept( new Token( quote, lex[ 1 ] ) ); |
|
| 388 | unresolved.remove( lex ); |
|
| 389 | } |
|
| 390 | else if( ambiguousLeadingCount == 0 && unresolved.size() == 1 ) { |
|
| 391 | // Must be a closing quote. |
|
| 392 | final var closing = unresolved.get( 0 ); |
|
| 393 | consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); |
|
| 394 | unresolved.remove( closing ); |
|
| 395 | } |
|
| 396 | } |
|
| 397 | else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) { |
|
| 398 | // If there are no ambiguous leading quotes then all ambiguous lagging |
|
| 399 | // quotes must be contractions. |
|
| 400 | ambiguousLaggingQuotes.forEach( |
|
| 401 | lex -> { |
|
| 402 | consumer.accept( new Token( QUOTE_APOSTROPHE, lex[ 1 ] ) ); |
|
| 403 | unresolved.remove( lex ); |
|
| 404 | } |
|
| 405 | ); |
|
| 406 | } |
|
| 407 | else if( mOpeningSingleQuotes.size() == 0 && |
|
| 408 | mClosingSingleQuotes.size() == 1 && !unresolved.isEmpty() ) { |
|
| 409 | final var opening = unresolved.get( 0 ); |
|
| 410 | consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) ); |
|
| 411 | unresolved.remove( opening ); |
|
| 412 | } |
|
| 413 | else if( ambiguousLeadingCount == 0 ) { |
|
| 414 | if( resolvedLaggingQuotes < resolvedLeadingQuotes ) { |
|
| 415 | for( final var i = unresolved.iterator(); i.hasNext(); ) { |
|
| 416 | final var closing = i.next()[ 1 ]; |
|
| 417 | consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing ) ); |
|
| 418 | i.remove(); |
|
| 419 | } |
|
| 420 | } |
|
| 421 | else if( singleQuoteDelta == unresolved.size() ) { |
|
| 422 | for( final var i = unresolved.iterator(); i.hasNext(); ) { |
|
| 423 | final var closing = i.next(); |
|
| 424 | consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); |
|
| 425 | i.remove(); |
|
| 426 | } |
|
| 427 | } |
|
| 428 | else if( unresolved.size() == 2 ) { |
|
| 429 | final var closing = unresolved.get( 0 ); |
|
| 430 | final var opening = unresolved.get( 1 ); |
|
| 431 | consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); |
|
| 432 | consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) ); |
|
| 433 | ||
| 434 | // Doesn't affect the algorithm. |
|
| 435 | unresolved.clear(); |
|
| 436 | } |
|
| 437 | } |
|
| 438 | else if( (singleQuoteDelta == 0 && !singleQuoteEmpty) || |
|
| 439 | (doubleQuoteDelta == 0 && !doubleQuoteEmpty) ) { |
|
| 440 | // An apostrophe stands betwixt opening/closing single quotes. |
|
| 441 | for( final var lexemes = unresolved.iterator(); lexemes.hasNext(); ) { |
|
| 442 | final var quote = lexemes.next()[ 1 ]; |
|
| 443 | ||
| 444 | for( int i = 0; i < mOpeningSingleQuotes.size(); i++ ) { |
|
| 445 | // An apostrophe must fall between an open/close pair. |
|
| 446 | final var openingQuote = mOpeningSingleQuotes.get( i ); |
|
| 447 | final var closingQuote = mClosingSingleQuotes.get( i ); |
|
| 448 | ||
| 449 | if( openingQuote.before( quote ) && closingQuote.after( quote ) ) { |
|
| 450 | consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) ); |
|
| 451 | lexemes.remove(); |
|
| 452 | } |
|
| 453 | } |
|
| 454 | } |
|
| 455 | ||
| 456 | // An apostrophe stands betwixt opening/closing double quotes. |
|
| 457 | for( final var lexemes = unresolved.iterator(); lexemes.hasNext(); ) { |
|
| 458 | final var quote = lexemes.next()[ 1 ]; |
|
| 459 | ||
| 460 | for( int i = 0; i < mOpeningDoubleQuotes.size(); i++ ) { |
|
| 461 | // An apostrophe must fall between an open/close pair. |
|
| 462 | final var openingQuote = mOpeningDoubleQuotes.get( i ); |
|
| 463 | final var closingQuote = mClosingDoubleQuotes.get( i ); |
|
| 464 | ||
| 465 | if( openingQuote.before( quote ) && closingQuote.after( quote ) ) { |
|
| 466 | consumer.accept( new Token( QUOTE_APOSTROPHE, quote ) ); |
|
| 467 | lexemes.remove(); |
|
| 468 | } |
|
| 469 | } |
|
| 470 | } |
|
| 471 | } |
|
| 472 | else if( ambiguousLeadingCount == 1 && resolvedLaggingQuotes == 1 ) { |
|
| 473 | final var opening = ambiguousLeadingQuotes.get( 0 ); |
|
| 474 | consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) ); |
|
| 475 | unresolved.remove( opening ); |
|
| 476 | } |
|
| 477 | } |
|
| 478 | ||
| 479 | /** |
|
| 480 | * Allow subclasses to change the type of {@link Lexer}. |
|
| 481 | * |
|
| 482 | * @param text The text to lex. |
|
| 483 | * @return A {@link Lexer} that can split the text into {@link Lexeme}s. |
|
| 484 | */ |
|
| 485 | Lexer createLexer( final String text ) { |
|
| 486 | return new Lexer( text ); |
|
| 477 | 487 | } |
| 478 | 488 |
| 1 | /* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ |
|
| 2 | package com.whitemagicsoftware.keenquotes; |
|
| 3 | ||
| 4 | /** |
|
| 5 | * Responsible for creating new {@link Parser} instances based on the |
|
| 6 | * {@link ParserType}. The document content format must be known in advance. |
|
| 7 | */ |
|
| 8 | public class ParserFactory { |
|
| 9 | public enum ParserType { |
|
| 10 | PARSER_PLAIN, |
|
| 11 | PARSER_XML |
|
| 12 | } |
|
| 13 | ||
| 14 | private final ParserType mParserType; |
|
| 15 | ||
| 16 | public ParserFactory( final ParserType parserType ) { |
|
| 17 | mParserType = parserType; |
|
| 18 | } |
|
| 19 | ||
| 20 | public Parser createParser( |
|
| 21 | final String text, final Contractions contractions ) { |
|
| 22 | ||
| 23 | return mParserType == ParserType.PARSER_PLAIN |
|
| 24 | ? new Parser( text, contractions ) |
|
| 25 | : new XmlParser( text, contractions ); |
|
| 26 | } |
|
| 27 | } |
|
| 1 | 28 |
| 1 | /* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ |
|
| 2 | package com.whitemagicsoftware.keenquotes; |
|
| 3 | ||
| 4 | import java.text.CharacterIterator; |
|
| 5 | ||
| 6 | /** |
|
| 7 | * Responsible for lexing text while ignoring XML elements. The document must |
|
| 8 | * be both sane and well-formed. This is not intended to lex documents in the |
|
| 9 | * wild where the user has injected custom HTML. The lexer will fail if the |
|
| 10 | * angle brackets are not balanced. Additionally, any less than or greater than |
|
| 11 | symbols must be encoded as {@code &lt;} or {@code &gt;}, respectively. |
|
| 12 | */ |
|
| 13 | final class XmlLexer extends Lexer { |
|
| 14 | /** |
|
| 15 | * Constructs a {@link Lexer} capable of turning text into {@link Lexeme}s. |
|
| 16 | * |
|
| 17 | * @param text The text to lex. |
|
| 18 | */ |
|
| 19 | XmlLexer( final String text ) { |
|
| 20 | super( text ); |
|
| 21 | } |
|
| 22 | ||
| 23 | /** |
|
| 24 | * Skip (do not emit) XML tags found within the prose. This effectively hides |
|
| 25 | * the element. |
|
| 26 | * |
|
| 27 | * @param i The {@link CharacterIterator} used to scan through the text, one |
|
| 28 | * character at a time. |
|
| 29 | */ |
|
| 30 | @Override |
|
| 31 | boolean skip( final CharacterIterator i ) { |
|
| 32 | final boolean match = i.current() == '<'; |
|
| 33 | ||
| 34 | if( match ) { |
|
| 35 | slurp( i, ( next, ci ) -> next != '>' ); |
|
| 36 | ||
| 37 | // Swallow the trailing greater than symbol. |
|
| 38 | i.next(); |
|
| 39 | ||
| 40 | // Skip to the character following the greater than symbol. |
|
| 41 | i.next(); |
|
| 42 | } |
|
| 43 | ||
| 44 | return match; |
|
| 45 | } |
|
| 46 | } |
|
| 1 | 47 |
| 1 | /* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ |
|
| 2 | package com.whitemagicsoftware.keenquotes; |
|
| 3 | ||
| 4 | /** |
|
| 5 | * Turns text into {@link Lexeme}s, allowing XML elements to be ignored. |
|
| 6 | */ |
|
| 7 | public final class XmlParser extends Parser { |
|
| 8 | /** |
|
| 9 | * Constructs a new {@link Parser} using the default contraction sets |
|
| 10 | * to help resolve some ambiguous scenarios. |
|
| 11 | * |
|
| 12 | * @param text The prose to parse, containing zero or more quotation |
|
| 13 | * characters. |
|
| 14 | * @param contractions Custom sets of contractions to help resolve ambiguities. |
|
| 15 | */ |
|
| 16 | public XmlParser( |
|
| 17 | final String text, final Contractions contractions ) { |
|
| 18 | super( text, contractions ); |
|
| 19 | } |
|
| 20 | ||
| 21 | @Override |
|
| 22 | public Lexer createLexer( final String text ) { |
|
| 23 | return new XmlLexer( text ); |
|
| 24 | } |
|
| 25 | } |
|
| 1 | 26 |
| 2 | 2 | package com.whitemagicsoftware.keenquotes; |
| 3 | 3 | |
| 4 | import com.whitemagicsoftware.keenquotes.ParserFactory.ParserType; |
|
| 4 | 5 | import org.junit.jupiter.api.Disabled; |
| 5 | 6 | import org.junit.jupiter.api.Test; |
| 6 | 7 | import org.junit.jupiter.params.ParameterizedTest; |
| 8 | import org.junit.jupiter.params.provider.Arguments; |
|
| 9 | import org.junit.jupiter.params.provider.MethodSource; |
|
| 7 | 10 | import org.junit.jupiter.params.provider.ValueSource; |
| 8 | 11 | |
| 9 | 12 | import java.io.BufferedReader; |
| 10 | 13 | import java.io.IOException; |
| 11 | 14 | import java.io.InputStreamReader; |
| 15 | import java.util.function.Consumer; |
|
| 12 | 16 | import java.util.function.Function; |
| 17 | import java.util.stream.Stream; |
|
| 13 | 18 | |
| 19 | import static com.whitemagicsoftware.keenquotes.ParserFactory.ParserType.PARSER_PLAIN; |
|
| 20 | import static com.whitemagicsoftware.keenquotes.ParserFactory.ParserType.PARSER_XML; |
|
| 14 | 21 | import static java.lang.System.out; |
| 15 | 22 | import static org.junit.jupiter.api.Assertions.assertEquals; |
| 16 | 23 | import static org.junit.jupiter.api.Assertions.assertNotNull; |
| 24 | import static org.junit.jupiter.params.provider.Arguments.arguments; |
|
| 17 | 25 | |
| 18 | 26 | /** |
| ... | ||
| 27 | 35 | @Disabled |
| 28 | 36 | public void test_parse_SingleLine_Parsed() { |
| 29 | final var converter = new Converter( out::println ); |
|
| 37 | final var converter = createConverter( out::println ); |
|
| 30 | 38 | out.println( converter.apply( |
| 31 | 39 | "'A', 'B', and 'C' are letters." |
| ... | ||
| 40 | 48 | @Test |
| 41 | 49 | public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException { |
| 42 | testConverter( new Converter( ( lex ) -> {} ) ); |
|
| 50 | testConverter( createConverter( ( lex ) -> {} ) ); |
|
| 51 | } |
|
| 52 | ||
| 53 | @ParameterizedTest |
|
| 54 | @MethodSource( "param_XmlParse_StraightQuotes_CurlyQuotes" ) |
|
| 55 | public void test_XmlParse_StraightQuotes_CurlyQuotes( |
|
| 56 | final String input, final String expected ) { |
|
| 57 | final var converter = createConverter( out::println, PARSER_XML ); |
|
| 58 | final var actual = converter.apply( input ); |
|
| 59 | assertEquals( expected, actual ); |
|
| 43 | 60 | } |
| 44 | 61 | |
| ... | ||
| 65 | 82 | } |
| 66 | 83 | |
| 67 | final var converter = new Converter( out::println ); |
|
| 84 | final var converter = createConverter( out::println ); |
|
| 68 | 85 | System.out.println( converter.apply( sb.toString() ) ); |
| 86 | } |
|
| 87 | ||
| 88 | @SuppressWarnings( "unused" ) |
|
| 89 | static Stream<Arguments> param_XmlParse_StraightQuotes_CurlyQuotes() { |
|
| 90 | return Stream.of( |
|
| 91 | arguments( |
|
| 92 | "<em>'twas</em>", |
|
| 93 | "<em>'twas</em>" |
|
| 94 | ), |
|
| 95 | arguments( |
|
| 96 | "<bold>'twas</bold> redeemed for the <em>cat</em>'s eye", |
|
| 97 | "<bold>'twas</bold> redeemed for the <em>cat</em>'s eye" |
|
| 98 | ), |
|
| 99 | arguments( |
|
| 100 | "<a href=\"https://x.org\" title=\"X's Homepage\">X11's bomb</a>", |
|
| 101 | "<a href=\"https://x.org\" title=\"X's Homepage\">X11's bomb</a>" |
|
| 102 | ), |
|
| 103 | arguments( |
|
| 104 | "''<em>Twas</em> happening!'", |
|
| 105 | "‘'<em>Twas</em> happening!’" |
|
| 106 | ) |
|
| 107 | ); |
|
| 69 | 108 | } |
| 70 | 109 | |
| ... | ||
| 127 | 166 | |
| 128 | 167 | return new BufferedReader( new InputStreamReader( is ) ); |
| 168 | } |
|
| 169 | ||
| 170 | private Function<String, String> createConverter( |
|
| 171 | final Consumer<Lexeme> unresolved ) { |
|
| 172 | return createConverter( unresolved, PARSER_PLAIN ); |
|
| 173 | } |
|
| 174 | ||
| 175 | private Function<String, String> createConverter( |
|
| 176 | final Consumer<Lexeme> unresolved, final ParserType parserType ) { |
|
| 177 | return new Converter( unresolved, parserType ); |
|
| 129 | 178 | } |
| 130 | 179 | } |
| 4 | 4 | import org.junit.jupiter.api.Test; |
| 5 | 5 | |
| 6 | import java.util.Arrays; |
|
| 7 | 6 | import java.util.List; |
| 8 | 7 | import java.util.function.BiFunction; |
| 9 | 8 | |
| 10 | 9 | import static com.whitemagicsoftware.keenquotes.Lexeme.EOT; |
| 11 | 10 | import static com.whitemagicsoftware.keenquotes.LexemeType.*; |
| 11 | import static java.util.Arrays.asList; |
|
| 12 | 12 | import static org.junit.jupiter.api.Assertions.assertEquals; |
| 13 | 13 | |
| 14 | 14 | /** |
| 15 | 15 | * Tests lexing words, numbers, punctuation, spaces, newlines, etc. |
| 16 | 16 | */ |
| 17 | class LexerTest { |
|
| 17 | final class LexerTest { |
|
| 18 | ||
| 18 | 19 | @Test |
| 19 | void test_Lexing_Words_TokenValues() { |
|
| 20 | void test_Lexing_Words_LexemeValues() { |
|
| 20 | 21 | testText( "abc 123", "abc", " ", "123" ); |
| 21 | 22 | testText( "-123 abc", "-123", " ", "abc" ); |
| ... | ||
| 89 | 90 | } |
| 90 | 91 | |
| 91 | private void testType( final String actual, final LexemeType... expected ) { |
|
| 92 | final var list = Arrays.asList( expected ); |
|
| 93 | testType( actual, ( lexeme, text ) -> lexeme.getType(), list ); |
|
| 92 | static void testType( |
|
| 93 | final String actual, final LexemeType... expected ) { |
|
| 94 | final var list = asList( expected ); |
|
| 95 | final var lexer = createLexer( actual ); |
|
| 96 | testType( lexer, actual, ( lexeme, text ) -> lexeme.getType(), list ); |
|
| 94 | 97 | } |
| 95 | 98 | |
| 96 | private void testText( final String actual, final String... expected ) { |
|
| 97 | testType( actual, Lexeme::toString, Arrays.asList( expected ) ); |
|
| 99 | static void testText( |
|
| 100 | final String actual, final String... expected ) { |
|
| 101 | final var lexer = createLexer( actual ); |
|
| 102 | testType( lexer, actual, Lexeme::toString, asList( expected ) ); |
|
| 98 | 103 | } |
| 99 | 104 | |
| 100 | private <A, E> void testType( |
|
| 105 | static void testType( |
|
| 106 | final Lexer lexer, final String actual, final LexemeType... expected ) { |
|
| 107 | final var list = asList( expected ); |
|
| 108 | testType( lexer, actual, ( lexeme, text ) -> lexeme.getType(), list ); |
|
| 109 | } |
|
| 110 | ||
| 111 | private static <A, E> void testType( |
|
| 112 | final Lexer lexer, |
|
| 101 | 113 | final String text, |
| 102 | 114 | final BiFunction<Lexeme, String, A> f, |
| 103 | 115 | final List<E> elements ) { |
| 104 | final var lexer = new Lexer( text ); |
|
| 105 | 116 | var counter = 0; |
| 106 | 117 | |
| ... | ||
| 116 | 127 | // Ensure all expected values are matched (verify end of text reached). |
| 117 | 128 | assertEquals( elements.size(), counter ); |
| 129 | } |
|
| 130 | ||
| 131 | static Lexer createLexer( final String text ) { |
|
| 132 | return new Lexer( text ); |
|
| 118 | 133 | } |
| 119 | 134 | } |
| 13 | 13 | * Test that all unambiguous apostrophes are emitted once. |
| 14 | 14 | */ |
| 15 | class ParserTest { |
|
| 15 | final class ParserTest { |
|
| 16 | ||
| 16 | 17 | @SuppressWarnings( "TextBlockMigration" ) |
| 17 | 18 | private final static Map<String, Map<TokenType, Integer>> TEST_CASES = |
| 1 | /* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ |
|
| 2 | package com.whitemagicsoftware.keenquotes; |
|
| 3 | ||
| 4 | import org.junit.jupiter.api.Test; |
|
| 5 | ||
| 6 | import static com.whitemagicsoftware.keenquotes.LexemeType.*; |
|
| 7 | import static com.whitemagicsoftware.keenquotes.LexerTest.testType; |
|
| 8 | ||
| 9 | /** |
|
| 10 | * Test that lexing XML documents ignores elements. |
|
| 11 | */ |
|
| 12 | final class XmlLexerTest { |
|
| 13 | ||
| 14 | @Test |
|
| 15 | void test_Lexing_Xml_EmitTags() { |
|
| 16 | final var actual = "The <em>world's</em> aflame."; |
|
| 17 | testType( |
|
| 18 | createXmlLexer( actual ), actual, |
|
| 19 | WORD, SPACE, WORD, QUOTE_SINGLE, WORD, SPACE, WORD, PERIOD |
|
| 20 | ); |
|
| 21 | } |
|
| 22 | ||
| 23 | @Test |
|
| 24 | void test_Lexing_XmlAttribute_EmitTags() { |
|
| 25 | final var actual = "<a href=\"http://x.org\">X11</a>"; |
|
| 26 | testType( createXmlLexer( actual ), actual, WORD, NUMBER ); |
|
| 27 | } |
|
| 28 | ||
| 29 | static Lexer createXmlLexer( final String text ) { |
|
| 30 | return new XmlLexer( text ); |
|
| 31 | } |
|
| 32 | } |
|
| 1 | 33 |