| 13 | 13 | import static java.lang.String.format; |
| 14 | 14 | import static java.lang.System.*; |
| 15 | import static java.nio.charset.StandardCharsets.UTF_8; |
|
| 15 | 16 | import static picocli.CommandLine.Help.Ansi.Style.*; |
| 16 | 17 | import static picocli.CommandLine.Help.ColorScheme; |
| ... | ||
| 62 | 63 | |
| 63 | 64 | private String convert( final Curler curler ) throws IOException { |
| 64 | return curler.apply( new String( in.readAllBytes() ) ); |
|
| 65 | return curler.apply( new String( in.readAllBytes(), UTF_8 ) ); |
|
| 65 | 66 | } |
| 66 | 67 | |
| 64 | 64 | mTree = mTree.closing( token ); |
| 65 | 65 | } |
| 66 | else if( token.isType( QUOTE_AMBIGUOUS_DOUBLE ) ) { |
|
| 67 | // Create subtrees for: <" ... ">, <" ">, <"">, etc. |
|
| 68 | if( mTree.hasOpeningDoubleQuote() ) { |
|
| 69 | token.setTokenType( QUOTE_CLOSING_DOUBLE ); |
|
| 70 | mTree = mTree.closing( token ); |
|
| 71 | } |
|
| 72 | else { |
|
| 73 | token.setTokenType( QUOTE_OPENING_DOUBLE ); |
|
| 74 | mTree = mTree.opening( token ); |
|
| 75 | } |
|
| 76 | } |
|
| 66 | 77 | // Add ambiguous tokens to be resolved; add apostrophes for later emitting. |
| 67 | 78 | else { |
| ... | ||
| 107 | 118 | private void resolve( final List<Token> tokens ) { |
| 108 | 119 | assert tokens != null; |
| 120 | ||
| 121 | boolean leader = false; |
|
| 109 | 122 | |
| 110 | 123 | for( final var token : tokens ) { |
| 111 | 124 | if( token.isType( QUOTE_AMBIGUOUS_LEADING ) ) { |
| 112 | 125 | // Once a leader quote is found, any laggard could be a closing quote. |
| 113 | break; |
|
| 126 | leader = true; |
|
| 114 | 127 | } |
| 115 | else if( token.isType( QUOTE_AMBIGUOUS_LAGGING ) ) { |
|
| 128 | else if( !leader && token.isType( QUOTE_AMBIGUOUS_LAGGING ) ) { |
|
| 116 | 129 | token.setTokenType( QUOTE_APOSTROPHE ); |
| 130 | } |
|
| 131 | else if( token.isType( QUOTE_AMBIGUOUS_SINGLE ) ) { |
|
| 132 | // Create subtrees for: <' ... '>, <' '>, <''>, etc. |
|
| 133 | if( mTree.hasOpeningSingleQuote() ) { |
|
| 134 | token.setTokenType( QUOTE_CLOSING_SINGLE ); |
|
| 135 | mTree = mTree.closing( token ); |
|
| 136 | } |
|
| 137 | else { |
|
| 138 | token.setTokenType( QUOTE_OPENING_SINGLE ); |
|
| 139 | mTree = mTree.opening( token ); |
|
| 140 | } |
|
| 117 | 141 | } |
| 118 | 142 | } |
| ... | ||
| 129 | 153 | final var countLeading = tree.count( QUOTE_AMBIGUOUS_LEADING ); |
| 130 | 154 | final var countLagging = tree.count( QUOTE_AMBIGUOUS_LAGGING ); |
| 131 | final var countUnknown = tree.count( AMBIGUOUS ); |
|
| 155 | final var countSingles = tree.count( QUOTE_AMBIGUOUS_SINGLE ); |
|
| 132 | 156 | |
| 133 | 157 | if( tree.hasOpeningSingleQuote() && !tree.hasClosingSingleQuote() ) { |
| 134 | if( countUnknown == 0 && countLeading == 0 && countLagging == 1 ) { |
|
| 158 | if( countSingles == 0 && countLeading == 0 && countLagging == 1 ) { |
|
| 135 | 159 | tree.replaceAll( QUOTE_AMBIGUOUS_LAGGING, QUOTE_CLOSING_SINGLE ); |
| 136 | 160 | } |
| 137 | else if( countUnknown == 1 && countLagging == 0 ) { |
|
| 138 | tree.replaceAll( AMBIGUOUS, QUOTE_CLOSING_SINGLE ); |
|
| 161 | else if( countSingles == 1 && countLagging == 0 ) { |
|
| 162 | tree.replaceAll( QUOTE_AMBIGUOUS_SINGLE, QUOTE_CLOSING_SINGLE ); |
|
| 139 | 163 | } |
| 140 | 164 | } |
| 141 | 165 | |
| 142 | if( countUnknown == 0 && countLeading == 1 && countLagging == 0 && |
|
| 166 | if( countSingles == 0 && countLeading == 1 && countLagging == 0 && |
|
| 143 | 167 | !tree.hasOpeningSingleQuote() && tree.hasClosingSingleQuote() ) { |
| 144 | 168 | tree.replaceAll( QUOTE_AMBIGUOUS_LEADING, QUOTE_OPENING_SINGLE ); |
| 145 | 169 | } |
| 146 | 170 | |
| 147 | 171 | if( !tree.hasOpeningSingleQuote() && !tree.hasClosingSingleQuote() || |
| 148 | 172 | tree.isBalanced() ) { |
| 149 | if( countUnknown == 0 && countLeading > 0 && countLagging == 0 ) { |
|
| 173 | if( countSingles == 0 && countLeading > 0 && countLagging == 0 ) { |
|
| 150 | 174 | tree.replaceAll( QUOTE_AMBIGUOUS_LEADING, QUOTE_APOSTROPHE ); |
| 151 | 175 | } |
| 152 | 176 | |
| 153 | if( countUnknown == 0 && countLeading == 0 && countLagging > 0 ) { |
|
| 177 | if( countSingles == 0 && countLeading == 0 && countLagging > 0 ) { |
|
| 154 | 178 | tree.replaceAll( QUOTE_AMBIGUOUS_LAGGING, QUOTE_APOSTROPHE ); |
| 155 | 179 | } |
| 97 | 97 | } |
| 98 | 98 | |
| 99 | public boolean beganEndedUmambiguously( final String word ) { |
|
| 99 | public boolean beganEndedUnambiguously( final String word ) { |
|
| 100 | 100 | assert word != null; |
| 101 | 101 | return getBeganEndedUnambiguous().contains( word ); |
| ... | ||
| 170 | 170 | public String toString() { |
| 171 | 171 | return |
| 172 | toString( getBeganAmbiguous(), "Ambiguous Began", "'%s" ) + |
|
| 173 | toString( getEndedAmbiguous(), "Ambiguous Ended", "%s'" ) + |
|
| 172 | toString( getBeganEndedUnambiguous(), "Unambiguous Began/Ended", "'%s" ) + |
|
| 174 | 173 | toString( getBeganUnambiguous(), "Unambiguous Began", "'%s" ) + |
| 175 | toString( getEndedUnambiguous(), "Unambiguous Ended", "%s'" ); |
|
| 174 | toString( getEndedUnambiguous(), "Unambiguous Ended", "%s'" ) + |
|
| 175 | toString( getBeganAmbiguous(), "Ambiguous Began", "'%s" ) + |
|
| 176 | toString( getEndedAmbiguous(), "Ambiguous Ended", "%s'" ); |
|
| 176 | 177 | } |
| 177 | 178 | |
| 178 | 179 | private String toString( |
| 179 | final Set<String> words, final String category, final String fmt ) { |
|
| 180 | final Set<String> words, |
|
| 181 | final String category, |
|
| 182 | final String fmt |
|
| 183 | ) { |
|
| 180 | 184 | final var sb = new StringBuilder( 16384 ); |
| 181 | 185 | final var newline = System.lineSeparator(); |
| ... | ||
| 193 | 197 | */ |
| 194 | 198 | private static final Set<String> BEGAN_ENDED_UNAMBIGUOUS = Set.of( |
| 199 | // hacking |
|
| 195 | 200 | "ackin", |
| 201 | // hammering |
|
| 196 | 202 | "ammerin", |
| 197 | 203 | // hankering |
| 198 | 204 | "ankerin", |
| 199 | 205 | // having |
| 200 | 206 | "avin", |
| 207 | // hawking |
|
| 201 | 208 | "awkin", |
| 202 | 209 | "cepin", |
| 203 | 210 | "ceppin", |
| 204 | 211 | "ceptin", |
| 205 | 212 | "cordin", |
| 206 | 213 | "eadin", |
| 207 | 214 | "eavin", |
| 208 | 215 | "elpin", |
| 216 | // hindering |
|
| 209 | 217 | "inderin", |
| 210 | 218 | "lectioneerin", |
| 219 | // amazing |
|
| 211 | 220 | "mazin", |
| 221 | // remembering |
|
| 212 | 222 | "memberin", |
| 213 | 223 | // fish 'n' chips |
| 214 | 224 | "n", |
| 225 | // hobbling |
|
| 215 | 226 | "obblin", |
| 227 | // holding |
|
| 216 | 228 | "oldin", |
| 229 | // hollering |
|
| 217 | 230 | "ollerin", |
| 231 | // hopping |
|
| 218 | 232 | "oppin", |
| 219 | 233 | "ousekeepin", |
| 234 | // howling |
|
| 220 | 235 | "owlin", |
| 221 | 236 | "sceptin", |
| 222 | 237 | "spectin", |
| 223 | 238 | "splainin", |
| 239 | // supposing |
|
| 224 | 240 | "sposin", |
| 225 | 241 | "sputin", |
| 226 | 242 | "stonishin", |
| 243 | // destroying |
|
| 227 | 244 | "stroyin", |
| 245 | // persuading |
|
| 228 | 246 | "suadin", |
| 229 | 247 | "titivatin", |
| 248 | // introducing |
|
| 230 | 249 | "troducin", |
| 250 | // hugging |
|
| 231 | 251 | "uggin", |
| 252 | // hulking |
|
| 232 | 253 | "ulkin", |
| 254 | // humbugging |
|
| 233 | 255 | "umbuggin", |
| 256 | // humiliating |
|
| 234 | 257 | "umiliatin", |
| 258 | // humming |
|
| 235 | 259 | "ummin", |
| 260 | // humping |
|
| 236 | 261 | "umpin", |
| 262 | // hurrying |
|
| 237 | 263 | "urryin", |
| 264 | // hurting |
|
| 238 | 265 | "urtin", |
| 266 | // hustling |
|
| 239 | 267 | "ustlin", |
| 268 | // investigating |
|
| 240 | 269 | "vestigatin", |
| 270 | // inviting |
|
| 241 | 271 | "vitin", |
| 242 | 272 | "xceptin", |
| ... | ||
| 345 | 375 | "twould", |
| 346 | 376 | "twouldn", |
| 377 | // one |
|
| 378 | "un", |
|
| 347 | 379 | // have |
| 348 | 380 | "ve", |
| 53 | 53 | }; |
| 54 | 54 | |
| 55 | private static final LexemeType[] QUOTE_SINGLE_QUOTE_DOUBLE = { |
|
| 56 | QUOTE_SINGLE, QUOTE_DOUBLE |
|
| 57 | }; |
|
| 58 | ||
| 59 | /** |
|
| 60 | * Single quotes preceded by these {@link LexemeType}s may be opening quotes. |
|
| 61 | */ |
|
| 62 | private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE = |
|
| 63 | new LexemeType[]{ |
|
| 64 | LexemeType.SOT, SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP |
|
| 65 | }; |
|
| 66 | ||
| 67 | /** |
|
| 68 | * Single quotes succeeded by these {@link LexemeType}s may be opening quotes. |
|
| 69 | */ |
|
| 70 | private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE = |
|
| 71 | new LexemeType[]{ |
|
| 72 | WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE |
|
| 73 | }; |
|
| 74 | ||
| 75 | /** |
|
| 76 | * Single quotes preceded by these {@link LexemeType}s may be closing quotes. |
|
| 77 | */ |
|
| 78 | private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE = |
|
| 79 | new LexemeType[]{ |
|
| 80 | WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE |
|
| 81 | }; |
|
| 82 | ||
| 83 | /** |
|
| 84 | * Single quotes succeeded by these {@link LexemeType}s may be closing quotes. |
|
| 85 | */ |
|
| 86 | private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE = |
|
| 87 | new LexemeType[]{ |
|
| 88 | SPACE, HYPHEN, DASH, PUNCT, PERIOD, ELLIPSIS, QUOTE_DOUBLE, CLOSING_GROUP, |
|
| 89 | ENDING |
|
| 90 | }; |
|
| 91 | ||
| 92 | /** |
|
| 93 | * Double quotes preceded by these {@link LexemeType}s may be opening quotes. |
|
| 94 | */ |
|
| 95 | private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE = |
|
| 96 | new LexemeType[]{ |
|
| 97 | LexemeType.SOT, SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL, EOP |
|
| 98 | }; |
|
| 99 | ||
| 100 | /** |
|
| 101 | * Double quotes succeeded by these {@link LexemeType}s may be opening quotes. |
|
| 102 | */ |
|
| 103 | private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE = |
|
| 104 | new LexemeType[]{ |
|
| 105 | WORD, NUMBER, DASH, ELLIPSIS, OPENING_GROUP, QUOTE_SINGLE, |
|
| 106 | QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING, QUOTE_DOUBLE |
|
| 107 | }; |
|
| 108 | ||
| 109 | /** |
|
| 110 | * Double quotes preceded by these {@link LexemeType}s may be closing quotes. |
|
| 111 | */ |
|
| 112 | private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE = |
|
| 113 | new LexemeType[]{ |
|
| 114 | WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS, CLOSING_GROUP, QUOTE_SINGLE, |
|
| 115 | QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING |
|
| 116 | }; |
|
| 117 | ||
| 118 | /** |
|
| 119 | * Double quotes succeeded by these {@link LexemeType}s may be closing quotes. |
|
| 120 | */ |
|
| 121 | private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE = |
|
| 122 | new LexemeType[]{ |
|
| 123 | SPACE, PUNCT, PERIOD, EQUALS, HYPHEN, DASH, QUOTE_SINGLE, CLOSING_GROUP, |
|
| 124 | ENDING |
|
| 125 | }; |
|
| 126 | ||
| 127 | private final CircularFifoQueue<Lexeme> mQ = new CircularFifoQueue<>( 4 ); |
|
| 128 | private final String mText; |
|
| 129 | private final Contractions mContractions; |
|
| 130 | private final Consumer<Token> mConsumer; |
|
| 131 | ||
| 132 | public QuoteEmitter( |
|
| 133 | final String text, |
|
| 134 | final Contractions contractions, |
|
| 135 | final Consumer<Token> consumer |
|
| 136 | ) { |
|
| 137 | assert text != null; |
|
| 138 | assert contractions != null; |
|
| 139 | ||
| 140 | mText = text; |
|
| 141 | mContractions = contractions; |
|
| 142 | mConsumer = consumer; |
|
| 143 | } |
|
| 144 | ||
| 145 | /** |
|
| 146 | * Scans the given text document for quotation marks and passes them to the |
|
| 147 | * given {@link Token} {@link Consumer}. |
|
| 148 | * |
|
| 149 | * @param text The prose to lex. |
|
| 150 | * @param contractions List of ambiguous and unambiguous contractions. |
|
| 151 | * @param consumer Receives each {@link Token} emitted for a quotation mark. |
|
| 152 | */ |
|
| 153 | public static void analyze( |
|
| 154 | final String text, |
|
| 155 | final Contractions contractions, |
|
| 156 | final Consumer<Token> consumer, |
|
| 157 | final LexerFilter filter |
|
| 158 | ) { |
|
| 159 | final var emitter = new QuoteEmitter( text, contractions, consumer ); |
|
| 160 | Lexer.lex( text, emitter, filter ); |
|
| 161 | } |
|
| 162 | ||
| 163 | /** |
|
| 164 | * @param lexeme The lexeme to enqueue; parsing starts once the queue holds four lexemes. |
|
| 165 | */ |
|
| 166 | @Override |
|
| 167 | public void accept( final Lexeme lexeme ) { |
|
| 168 | mQ.add( lexeme ); |
|
| 169 | ||
| 170 | if( mQ.size() == 4 ) { |
|
| 171 | parse(); |
|
| 172 | } |
|
| 173 | } |
|
| 174 | ||
| 175 | private void parse() { |
|
| 176 | final var lex1 = mQ.get( 0 ); |
|
| 177 | final var lex2 = mQ.get( 1 ); |
|
| 178 | final var lex3 = mQ.get( 2 ); |
|
| 179 | final var lex4 = mQ.get( 3 ); |
|
| 180 | ||
| 181 | // <y'all>, <Ph.D.'ll>, <20's>, <she's> |
|
| 182 | if( match( WORD_PERIOD_NUMBER, QUOTE_SINGLE, WORD, ANY ) ) { |
|
| 183 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 184 | } |
|
| 185 | // <'n'>, <'N'>, <'owlin'> |
|
| 186 | else if( |
|
| 187 | match( ANY, QUOTE_SINGLE, WORD, QUOTE_SINGLE ) && |
|
| 188 | mContractions.beganEndedUmambiguously(lex3.toString(mText)) |
|
| 189 | ) { |
|
| 190 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 191 | emit( QUOTE_APOSTROPHE, lex4 ); |
|
| 192 | mQ.set( Lexeme.NONE, 3 ); |
|
| 193 | } |
|
| 194 | // <2''> |
|
| 195 | else if( match( NUMBER, QUOTE_SINGLE, QUOTE_SINGLE, ANY ) ) { |
|
| 196 | emit( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ); |
|
| 197 | mQ.set( Lexeme.NONE, 2 ); |
|
| 198 | } |
|
| 199 | // <2'> |
|
| 200 | else if( match( NUMBER, QUOTE_SINGLE, ANY, ANY ) ) { |
|
| 201 | emit( QUOTE_PRIME_SINGLE, lex2 ); |
|
| 202 | } |
|
| 203 | // <2"> |
|
| 204 | else if( match( NUMBER, QUOTE_DOUBLE, ANY, ANY ) ) { |
|
| 205 | emit( QUOTE_PRIME_DOUBLE, lex2 ); |
|
| 206 | } |
|
| 207 | // <thinkin'> |
|
| 208 | else if( |
|
| 209 | match( WORD, QUOTE_SINGLE, ANY, ANY ) && |
|
| 210 | mContractions.endedUnambiguously( lex1.toString( mText ) ) |
|
| 211 | ) { |
|
| 212 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 213 | } |
|
| 214 | // <'02> |
|
| 215 | else if( match( ANY, QUOTE_SINGLE, NUMBER, SPACE_PUNCT ) ) { |
|
| 216 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 217 | } |
|
| 218 | // <'20s> |
|
| 219 | else if( |
|
| 220 | match( ANY, QUOTE_SINGLE, NUMBER, WORD ) && |
|
| 221 | "s".equalsIgnoreCase( lex4.toString( mText ) ) |
|
| 222 | ) { |
|
| 223 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 224 | } |
|
| 225 | // <.'\n> |
|
| 226 | else if( match( PUNCT_PERIOD_ELLIPSIS_DASH, QUOTE_SINGLE, ENDING, ANY ) ) { |
|
| 227 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 228 | } |
|
| 229 | // <\'> |
|
| 230 | else if( match( ESC_SINGLE, ANY, ANY, ANY ) ) { |
|
| 231 | emit( QUOTE_STRAIGHT_SINGLE, lex1 ); |
|
| 232 | } |
|
| 233 | // <\"> |
|
| 234 | else if( match( ESC_DOUBLE, ANY, ANY, ANY ) ) { |
|
| 235 | emit( QUOTE_STRAIGHT_DOUBLE, lex1 ); |
|
| 236 | ||
| 237 | // <\"'---> |
|
| 238 | if( match( ESC_DOUBLE, QUOTE_SINGLE, SPACE_DASH_ENDING, ANY ) ) { |
|
| 239 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 240 | } |
|
| 241 | } |
|
| 242 | // <---'" > |
|
| 243 | else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, SPACE_ENDING ) ) { |
|
| 244 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 245 | } |
|
| 246 | // <o'-lantern>, <o' fellow>, <O'-the> |
|
| 247 | else if( |
|
| 248 | match( WORD, QUOTE_SINGLE, SPACE_HYPHEN, WORD ) && |
|
| 249 | "o".equalsIgnoreCase( lex1.toString( mText ) ) |
|
| 250 | ) { |
|
| 251 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 252 | } |
|
| 253 | // <"">, <"...>, <"word>, <---"word> |
|
| 254 | else if( |
|
| 255 | match( |
|
| 256 | LEADING_QUOTE_OPENING_DOUBLE, QUOTE_DOUBLE, |
|
| 257 | LAGGING_QUOTE_OPENING_DOUBLE, ANY |
|
| 258 | ) |
|
| 259 | ) { |
|
| 260 | emit( QUOTE_OPENING_DOUBLE, lex2 ); |
|
| 261 | } |
|
| 262 | // <..."'>, <word"'>, <?"'>, <word"?> |
|
| 263 | else if( |
|
| 264 | match( |
|
| 265 | LEADING_QUOTE_CLOSING_DOUBLE, QUOTE_DOUBLE, |
|
| 266 | LAGGING_QUOTE_CLOSING_DOUBLE, ANY |
|
| 267 | ) |
|
| 268 | ) { |
|
| 269 | emit( QUOTE_CLOSING_DOUBLE, lex2 ); |
|
| 270 | } |
|
| 271 | // < ''E> |
|
| 272 | else if( match( SPACE_SOT, QUOTE_SINGLE, QUOTE_SINGLE, WORD ) ) { |
|
| 273 | // Consume both immediately to avoid the false ambiguity <'e>. |
|
| 274 | emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
| 275 | emit( QUOTE_APOSTROPHE, lex3 ); |
|
| 276 | mQ.set( Lexeme.NONE, 1 ); |
|
| 277 | mQ.set( Lexeme.NONE, 2 ); |
|
| 278 | } |
|
| 279 | // <'...>, <'word>, <---'word>, < 'nation> |
|
| 280 | else if( |
|
| 281 | match( |
|
| 282 | LEADING_QUOTE_OPENING_SINGLE, QUOTE_SINGLE, |
|
| 283 | LAGGING_QUOTE_OPENING_SINGLE, ANY ) |
|
| 284 | ) { |
|
| 285 | final var word = lex3.toString( mText ); |
|
| 286 | ||
| 287 | if( mContractions.beganAmbiguously( word ) ) { |
|
| 288 | emit( QUOTE_AMBIGUOUS_LEADING, lex2 ); |
|
| 289 | } |
|
| 290 | else if( mContractions.beganUnambiguously( word ) ) { |
|
| 291 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 292 | } |
|
| 293 | // <"'"nested> |
|
| 294 | else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, WORD ) ) { |
|
| 295 | emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
| 296 | } |
|
| 297 | // <"'" > |
|
| 298 | else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) { |
|
| 299 | emit( lex2 ); |
|
| 300 | } |
|
| 301 | // < '" > |
|
| 302 | else if( match( ANY, QUOTE_SINGLE, LAGGING_QUOTE_OPENING_SINGLE, ANY ) ) { |
|
| 303 | emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
| 304 | } |
|
| 305 | // Ambiguous |
|
| 306 | else { |
|
| 307 | emit( QUOTE_AMBIGUOUS_LEADING, lex2 ); |
|
| 308 | } |
|
| 309 | } |
|
| 310 | // <word'">, <...'--->, <"' > |
|
| 311 | else if( |
|
| 312 | match( |
|
| 313 | LEADING_QUOTE_CLOSING_SINGLE, QUOTE_SINGLE, |
|
| 314 | LAGGING_QUOTE_CLOSING_SINGLE, ANY |
|
| 315 | ) |
|
| 316 | ) { |
|
| 317 | final var word = lex1.toString( mText ); |
|
| 318 | ||
| 319 | if( mContractions.endedAmbiguously( word ) ) { |
|
| 320 | emit( QUOTE_AMBIGUOUS_LAGGING, lex2 ); |
|
| 321 | } |
|
| 322 | else { |
|
| 323 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 324 | } |
|
| 325 | } |
|
| 326 | // <word';> (contraction inferred by previous matches) |
|
| 327 | else if( match( WORD, QUOTE_SINGLE, PUNCT_PERIOD, ANY ) ) { |
|
| 328 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 329 | } |
|
| 330 | // <---'"> |
|
| 331 | else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) { |
|
| 332 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 333 | } |
|
| 334 | // <'42>, <'-3.14> |
|
| 335 | else if( match( ANY, QUOTE_SINGLE, NUMBER, ANY ) ) { |
|
| 336 | emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
| 337 | } |
|
| 338 | // <PRE-PARSED><'---.> |
|
| 339 | else if( match( LexemeType.NONE, QUOTE_SINGLE, ANY, ANY ) ) { |
|
| 340 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 341 | } |
|
| 342 | // <''Cause > |
|
| 343 | else if( match( QUOTE_SINGLE, QUOTE_SINGLE, WORD, ANY ) ) { |
|
| 344 | final var word = lex3.toString( mText ); |
|
| 345 | ||
| 346 | if( mContractions.beganAmbiguously( word ) ) { |
|
| 347 | emit( QUOTE_AMBIGUOUS_LEADING, lex2 ); |
|
| 348 | } |
|
| 349 | else if( mContractions.beganUnambiguously( word ) ) { |
|
| 350 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 351 | } |
|
| 352 | else { |
|
| 353 | emit( lex2 ); |
|
| 354 | } |
|
| 355 | } |
|
| 356 | // Ambiguous (no match) |
|
| 357 | else if( match( ANY, QUOTE_SINGLE_QUOTE_DOUBLE, ANY, ANY ) ) { |
|
| 358 | emit( lex2 ); |
|
| 359 | } |
|
| 360 | } |
|
| 361 | ||
| 362 | private void emit( final TokenType tokenType, final Lexeme lexeme ) { |
|
| 363 | mConsumer.accept( new Token( tokenType, lexeme ) ); |
|
| 364 | } |
|
| 365 | ||
| 366 | private void emit( |
|
| 367 | final TokenType tokenType, |
|
| 368 | final int began, |
|
| 369 | final int ended ) { |
|
| 370 | mConsumer.accept( new Token( tokenType, began, ended ) ); |
|
| 371 | } |
|
| 372 | ||
| 373 | /** |
|
| 374 | * Emits a token that represents an ambiguous quotation mark. |
|
| 375 | * |
|
| 376 | * @param lexeme A quotation mark that could not be curled. |
|
| 377 | */ |
|
| 378 | private void emit( final Lexeme lexeme ) { |
|
| 379 | mConsumer.accept( new Token( AMBIGUOUS, lexeme ) ); |
|
| 380 | } |
|
| 381 | ||
| 382 | private boolean match( |
|
| 383 | final LexemeType l1, |
|
| 384 | final LexemeType l2, |
|
| 385 | final LexemeType l3, |
|
| 386 | final LexemeType l4 ) { |
|
| 387 | return mQ.get( 0 ).isType( l1 ) && |
|
| 388 | mQ.get( 1 ).isType( l2 ) && |
|
| 389 | mQ.get( 2 ).isType( l3 ) && |
|
| 390 | mQ.get( 3 ).isType( l4 ); |
|
| 391 | } |
|
| 392 | ||
| 393 | private boolean match( |
|
| 394 | final LexemeType[] l1, |
|
| 395 | final LexemeType l2, |
|
| 396 | final LexemeType l3, |
|
| 397 | final LexemeType l4 ) { |
|
| 398 | return mQ.get( 0 ).isType( l1 ) && |
|
| 399 | mQ.get( 1 ).isType( l2 ) && |
|
| 400 | mQ.get( 2 ).isType( l3 ) && |
|
| 401 | mQ.get( 3 ).isType( l4 ); |
|
| 402 | } |
|
| 403 | ||
| 404 | private boolean match( |
|
| 405 | final LexemeType l1, |
|
| 406 | final LexemeType[] l2, |
|
| 55 | /** |
|
| 56 | * Single quotes preceded by these {@link LexemeType}s may be opening quotes. |
|
| 57 | */ |
|
| 58 | private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE = |
|
| 59 | new LexemeType[]{ |
|
| 60 | LexemeType.SOT, SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP |
|
| 61 | }; |
|
| 62 | ||
| 63 | /** |
|
| 64 | * Single quotes succeeded by these {@link LexemeType}s may be opening quotes. |
|
| 65 | */ |
|
| 66 | private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE = |
|
| 67 | new LexemeType[]{ |
|
| 68 | WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE |
|
| 69 | }; |
|
| 70 | ||
| 71 | /** |
|
| 72 | * Single quotes preceded by these {@link LexemeType}s may be closing quotes. |
|
| 73 | */ |
|
| 74 | private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE = |
|
| 75 | new LexemeType[]{ |
|
| 76 | WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE |
|
| 77 | }; |
|
| 78 | ||
| 79 | /** |
|
| 80 | * Single quotes succeeded by these {@link LexemeType}s may be closing quotes. |
|
| 81 | */ |
|
| 82 | private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE = |
|
| 83 | new LexemeType[]{ |
|
| 84 | SPACE, HYPHEN, DASH, PUNCT, PERIOD, ELLIPSIS, QUOTE_DOUBLE, CLOSING_GROUP, |
|
| 85 | ENDING |
|
| 86 | }; |
|
| 87 | ||
| 88 | /** |
|
| 89 | * Double quotes preceded by these {@link LexemeType}s may be opening quotes. |
|
| 90 | */ |
|
| 91 | private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE = |
|
| 92 | new LexemeType[]{ |
|
| 93 | LexemeType.SOT, SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL, EOP |
|
| 94 | }; |
|
| 95 | ||
| 96 | /** |
|
| 97 | * Double quotes succeeded by these {@link LexemeType}s may be opening quotes. |
|
| 98 | */ |
|
| 99 | private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE = |
|
| 100 | new LexemeType[]{ |
|
| 101 | WORD, PUNCT, NUMBER, DASH, ELLIPSIS, OPENING_GROUP, QUOTE_SINGLE, |
|
| 102 | QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING, QUOTE_DOUBLE |
|
| 103 | }; |
|
| 104 | ||
| 105 | /** |
|
| 106 | * Double quotes preceded by these {@link LexemeType}s may be closing quotes. |
|
| 107 | */ |
|
| 108 | private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE = |
|
| 109 | new LexemeType[]{ |
|
| 110 | WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS, CLOSING_GROUP, QUOTE_SINGLE, |
|
| 111 | QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING |
|
| 112 | }; |
|
| 113 | ||
| 114 | /** |
|
| 115 | * Double quotes succeeded by these {@link LexemeType}s may be closing quotes. |
|
| 116 | */ |
|
| 117 | private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE = |
|
| 118 | new LexemeType[]{ |
|
| 119 | SPACE, PUNCT, PERIOD, EQUALS, HYPHEN, DASH, QUOTE_SINGLE, CLOSING_GROUP, |
|
| 120 | ENDING |
|
| 121 | }; |
|
| 122 | ||
| 123 | private final CircularFifoQueue<Lexeme> mQ = new CircularFifoQueue<>( 4 ); |
|
| 124 | private final String mText; |
|
| 125 | private final Contractions mContractions; |
|
| 126 | private final Consumer<Token> mConsumer; |
|
| 127 | ||
| 128 | public QuoteEmitter( |
|
| 129 | final String text, |
|
| 130 | final Contractions contractions, |
|
| 131 | final Consumer<Token> consumer |
|
| 132 | ) { |
|
| 133 | assert text != null; |
|
| 134 | assert contractions != null; |
|
| 135 | ||
| 136 | mText = text; |
|
| 137 | mContractions = contractions; |
|
| 138 | mConsumer = consumer; |
|
| 139 | } |
|
| 140 | ||
| 141 | /** |
|
| 142 | * Scans the given text document for quotation marks and passes them to the |
|
| 143 | * given {@link Token} {@link Consumer}. |
|
| 144 | * |
|
| 145 | * @param text The prose to lex. |
|
| 146 | * @param contractions List of ambiguous and unambiguous contractions. |
|
| 147 | * @param consumer Receives each {@link Token} emitted for a quotation mark. |
|
| 148 | */ |
|
| 149 | public static void analyze( |
|
| 150 | final String text, |
|
| 151 | final Contractions contractions, |
|
| 152 | final Consumer<Token> consumer, |
|
| 153 | final LexerFilter filter |
|
| 154 | ) { |
|
| 155 | final var emitter = new QuoteEmitter( text, contractions, consumer ); |
|
| 156 | Lexer.lex( text, emitter, filter ); |
|
| 157 | } |
|
| 158 | ||
| 159 | /** |
|
| 160 | * @param lexeme The lexeme to enqueue; parsing starts once the queue holds four lexemes. |
|
| 161 | */ |
|
| 162 | @Override |
|
| 163 | public void accept( final Lexeme lexeme ) { |
|
| 164 | mQ.add( lexeme ); |
|
| 165 | ||
| 166 | if( mQ.size() == 4 ) { |
|
| 167 | parse(); |
|
| 168 | } |
|
| 169 | } |
|
| 170 | ||
| 171 | private void parse() { |
|
| 172 | final var lex1 = mQ.get( 0 ); |
|
| 173 | final var lex2 = mQ.get( 1 ); |
|
| 174 | final var lex3 = mQ.get( 2 ); |
|
| 175 | final var lex4 = mQ.get( 3 ); |
|
| 176 | ||
| 177 | // <y'all>, <Ph.D.'ll>, <20's>, <she's> |
|
| 178 | if( match( WORD_PERIOD_NUMBER, QUOTE_SINGLE, WORD, ANY ) ) { |
|
| 179 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 180 | } |
|
| 181 | // <'n'>, <'N'>, <'owlin'> |
|
| 182 | else if( |
|
| 183 | match( ANY, QUOTE_SINGLE, WORD, QUOTE_SINGLE ) && |
|
| 184 | mContractions.beganEndedUnambiguously( lex3.toString( mText ) ) |
|
| 185 | ) { |
|
| 186 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 187 | emit( QUOTE_APOSTROPHE, lex4 ); |
|
| 188 | mQ.set( Lexeme.NONE, 3 ); |
|
| 189 | } |
|
| 190 | // <2''> |
|
| 191 | else if( match( NUMBER, QUOTE_SINGLE, QUOTE_SINGLE, ANY ) ) { |
|
| 192 | emit( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ); |
|
| 193 | mQ.set( Lexeme.NONE, 2 ); |
|
| 194 | } |
|
| 195 | // <2'> |
|
| 196 | else if( match( NUMBER, QUOTE_SINGLE, ANY, ANY ) ) { |
|
| 197 | emit( QUOTE_PRIME_SINGLE, lex2 ); |
|
| 198 | } |
|
| 199 | // <2"> |
|
| 200 | else if( match( NUMBER, QUOTE_DOUBLE, ANY, ANY ) ) { |
|
| 201 | emit( QUOTE_PRIME_DOUBLE, lex2 ); |
|
| 202 | } |
|
| 203 | // <thinkin'> |
|
| 204 | else if( |
|
| 205 | match( WORD, QUOTE_SINGLE, ANY, ANY ) && |
|
| 206 | mContractions.endedUnambiguously( lex1.toString( mText ) ) |
|
| 207 | ) { |
|
| 208 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 209 | } |
|
| 210 | // <'02> |
|
| 211 | else if( match( ANY, QUOTE_SINGLE, NUMBER, SPACE_PUNCT ) ) { |
|
| 212 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 213 | } |
|
| 214 | // <'20s> |
|
| 215 | else if( |
|
| 216 | match( ANY, QUOTE_SINGLE, NUMBER, WORD ) && |
|
| 217 | "s".equalsIgnoreCase( lex4.toString( mText ) ) |
|
| 218 | ) { |
|
| 219 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 220 | } |
|
| 221 | // <.'\n> |
|
| 222 | else if( match( PUNCT_PERIOD_ELLIPSIS_DASH, QUOTE_SINGLE, ENDING, ANY ) ) { |
|
| 223 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 224 | } |
|
| 225 | // <\'> |
|
| 226 | else if( match( ESC_SINGLE, ANY, ANY, ANY ) ) { |
|
| 227 | emit( QUOTE_STRAIGHT_SINGLE, lex1 ); |
|
| 228 | } |
|
| 229 | // <\"> |
|
| 230 | else if( match( ESC_DOUBLE, ANY, ANY, ANY ) ) { |
|
| 231 | emit( QUOTE_STRAIGHT_DOUBLE, lex1 ); |
|
| 232 | ||
| 233 | // <\"'---> |
|
| 234 | if( match( ESC_DOUBLE, QUOTE_SINGLE, SPACE_DASH_ENDING, ANY ) ) { |
|
| 235 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 236 | } |
|
| 237 | } |
|
| 238 | // <---'" > |
|
| 239 | else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, SPACE_ENDING ) ) { |
|
| 240 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 241 | } |
|
| 242 | // <o'-lantern>, <o' fellow>, <O'-the> |
|
| 243 | else if( |
|
| 244 | match( WORD, QUOTE_SINGLE, SPACE_HYPHEN, WORD ) && |
|
| 245 | "o".equalsIgnoreCase( lex1.toString( mText ) ) |
|
| 246 | ) { |
|
| 247 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 248 | } |
|
| 249 | // <"">, <"...>, <"word>, <---"word> |
|
| 250 | else if( |
|
| 251 | match( |
|
| 252 | LEADING_QUOTE_OPENING_DOUBLE, QUOTE_DOUBLE, |
|
| 253 | LAGGING_QUOTE_OPENING_DOUBLE, ANY |
|
| 254 | ) |
|
| 255 | ) { |
|
| 256 | emit( QUOTE_OPENING_DOUBLE, lex2 ); |
|
| 257 | } |
|
| 258 | // <..."'>, <word"'>, <?"'>, <word"?> |
|
| 259 | else if( |
|
| 260 | match( |
|
| 261 | LEADING_QUOTE_CLOSING_DOUBLE, QUOTE_DOUBLE, |
|
| 262 | LAGGING_QUOTE_CLOSING_DOUBLE, ANY |
|
| 263 | ) |
|
| 264 | ) { |
|
| 265 | emit( QUOTE_CLOSING_DOUBLE, lex2 ); |
|
| 266 | } |
|
| 267 | // < ''E> |
|
| 268 | else if( match( SPACE_SOT, QUOTE_SINGLE, QUOTE_SINGLE, WORD ) ) { |
|
| 269 | // Consume both immediately to avoid the false ambiguity <'e>. |
|
| 270 | emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
| 271 | emit( QUOTE_APOSTROPHE, lex3 ); |
|
| 272 | mQ.set( Lexeme.NONE, 1 ); |
|
| 273 | mQ.set( Lexeme.NONE, 2 ); |
|
| 274 | } |
|
| 275 | // <'...>, <'word>, <---'word>, < 'nation> |
|
| 276 | else if( |
|
| 277 | match( |
|
| 278 | LEADING_QUOTE_OPENING_SINGLE, QUOTE_SINGLE, |
|
| 279 | LAGGING_QUOTE_OPENING_SINGLE, ANY ) |
|
| 280 | ) { |
|
| 281 | final var word = lex3.toString( mText ); |
|
| 282 | ||
| 283 | if( mContractions.beganAmbiguously( word ) ) { |
|
| 284 | emit( QUOTE_AMBIGUOUS_LEADING, lex2 ); |
|
| 285 | } |
|
| 286 | else if( mContractions.beganUnambiguously( word ) ) { |
|
| 287 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 288 | } |
|
| 289 | // <"'"nested> |
|
| 290 | else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, WORD ) ) { |
|
| 291 | emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
| 292 | } |
|
| 293 | // <"'" > |
|
| 294 | else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) { |
|
| 295 | emit( QUOTE_AMBIGUOUS_SINGLE, lex2 ); |
|
| 296 | } |
|
| 297 | // < '" > |
|
| 298 | else if( match( ANY, QUOTE_SINGLE, LAGGING_QUOTE_OPENING_SINGLE, ANY ) ) { |
|
| 299 | emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
| 300 | } |
|
| 301 | // Ambiguous |
|
| 302 | else { |
|
| 303 | emit( QUOTE_AMBIGUOUS_LEADING, lex2 ); |
|
| 304 | } |
|
| 305 | } |
|
| 306 | // <word'">, <...'--->, <"' > |
|
| 307 | else if( |
|
| 308 | match( |
|
| 309 | LEADING_QUOTE_CLOSING_SINGLE, QUOTE_SINGLE, |
|
| 310 | LAGGING_QUOTE_CLOSING_SINGLE, ANY |
|
| 311 | ) |
|
| 312 | ) { |
|
| 313 | final var word = lex1.toString( mText ); |
|
| 314 | ||
| 315 | if( mContractions.endedAmbiguously( word ) ) { |
|
| 316 | emit( QUOTE_AMBIGUOUS_LAGGING, lex2 ); |
|
| 317 | } |
|
| 318 | else { |
|
| 319 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 320 | } |
|
| 321 | } |
|
| 322 | // <word';> (contraction inferred by previous matches) |
|
| 323 | else if( match( WORD, QUOTE_SINGLE, PUNCT_PERIOD, ANY ) ) { |
|
| 324 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 325 | } |
|
| 326 | // <---'"> |
|
| 327 | else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) { |
|
| 328 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 329 | } |
|
| 330 | // <'42>, <'-3.14> |
|
| 331 | else if( match( ANY, QUOTE_SINGLE, NUMBER, ANY ) ) { |
|
| 332 | emit( QUOTE_OPENING_SINGLE, lex2 ); |
|
| 333 | } |
|
| 334 | // <PRE-PARSED><'---.> |
|
| 335 | else if( match( LexemeType.NONE, QUOTE_SINGLE, ANY, ANY ) ) { |
|
| 336 | emit( QUOTE_CLOSING_SINGLE, lex2 ); |
|
| 337 | } |
|
| 338 | // <''Cause > |
|
| 339 | else if( match( QUOTE_SINGLE, QUOTE_SINGLE, WORD, ANY ) ) { |
|
| 340 | final var word = lex3.toString( mText ); |
|
| 341 | ||
| 342 | if( mContractions.beganAmbiguously( word ) ) { |
|
| 343 | emit( QUOTE_AMBIGUOUS_LEADING, lex2 ); |
|
| 344 | } |
|
| 345 | else if( mContractions.beganUnambiguously( word ) ) { |
|
| 346 | emit( QUOTE_APOSTROPHE, lex2 ); |
|
| 347 | } |
|
| 348 | else { |
|
| 349 | emit( QUOTE_AMBIGUOUS_SINGLE, lex2 ); |
|
| 350 | } |
|
| 351 | } |
|
| 352 | else if( match( ANY, QUOTE_DOUBLE, ANY, ANY ) ) { |
|
| 353 | emit( QUOTE_AMBIGUOUS_DOUBLE, lex2 ); |
|
| 354 | } |
|
| 355 | // Ambiguous (no match) |
|
| 356 | else if( match( ANY, QUOTE_SINGLE, ANY, ANY ) ) { |
|
| 357 | emit( QUOTE_AMBIGUOUS_SINGLE, lex2 ); |
|
| 358 | } |
|
| 359 | } |
|
| 360 | ||
| 361 | private void emit( final TokenType tokenType, final Lexeme lexeme ) { |
|
| 362 | mConsumer.accept( new Token( tokenType, lexeme ) ); |
|
| 363 | } |
|
| 364 | ||
| 365 | private void emit( |
|
| 366 | final TokenType tokenType, |
|
| 367 | final int began, |
|
| 368 | final int ended ) { |
|
| 369 | mConsumer.accept( new Token( tokenType, began, ended ) ); |
|
| 370 | } |
|
| 371 | ||
| 372 | private boolean match( |
|
| 373 | final LexemeType l1, |
|
| 374 | final LexemeType l2, |
|
| 375 | final LexemeType l3, |
|
| 376 | final LexemeType l4 ) { |
|
| 377 | return mQ.get( 0 ).isType( l1 ) && |
|
| 378 | mQ.get( 1 ).isType( l2 ) && |
|
| 379 | mQ.get( 2 ).isType( l3 ) && |
|
| 380 | mQ.get( 3 ).isType( l4 ); |
|
| 381 | } |
|
| 382 | ||
| 383 | private boolean match( |
|
| 384 | final LexemeType[] l1, |
|
| 385 | final LexemeType l2, |
|
| 407 | 386 | final LexemeType l3, |
| 408 | 387 | final LexemeType l4 ) { |
| 101 | 101 | |
| 102 | 102 | boolean isAmbiguous() { |
| 103 | return mTokenType == AMBIGUOUS || |
|
| 103 | return mTokenType == QUOTE_AMBIGUOUS_SINGLE || |
|
| 104 | mTokenType == QUOTE_AMBIGUOUS_DOUBLE || |
|
| 104 | 105 | mTokenType == QUOTE_AMBIGUOUS_LEADING || |
| 105 | 106 | mTokenType == QUOTE_AMBIGUOUS_LAGGING; |
| 19 | 19 | QUOTE_AMBIGUOUS_LEADING( "leading-ambiguous" ), |
| 20 | 20 | QUOTE_AMBIGUOUS_LAGGING( "lagging-ambiguous" ), |
| 21 | AMBIGUOUS( "ambiguous" ), |
|
| 21 | QUOTE_AMBIGUOUS_DOUBLE( "double-ambiguous" ), |
|
| 22 | QUOTE_AMBIGUOUS_SINGLE( "single-ambiguous" ), |
|
| 22 | 23 | NONE; |
| 23 | 24 |
| 72 | 72 | */ |
| 73 | 73 | public boolean hasOpeningSingleQuote() { |
| 74 | return mOpening.isType( QUOTE_OPENING_SINGLE ); |
|
| 74 | return isOpening( QUOTE_OPENING_SINGLE ); |
|
| 75 | } |
|
| 76 | ||
| 77 | public boolean hasOpeningDoubleQuote() { |
|
| 78 | return isOpening( QUOTE_OPENING_DOUBLE ); |
|
| 79 | } |
|
| 80 | ||
| 81 | private boolean isOpening( final TokenType tokenType ) { |
|
| 82 | return mOpening.isType( tokenType ); |
|
| 75 | 83 | } |
| 76 | 84 |
| 26 | 26 | /** |
| 27 | 27 | * Tests that straight quotes are converted to curly quotes. |
| 28 | * |
|
| 29 | * @throws IOException Error opening file full of fun. |
|
| 30 | 28 | */ |
| 31 | 29 | @Test |
| 32 | public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException { |
|
| 30 | public void test_Parse_UncurledQuotes1_CurlyQuotes() throws IOException { |
|
| 33 | 31 | testCurler( createCurler( FILTER_PLAIN ), "unambiguous-1-pass.txt" ); |
| 34 | 32 | } |
| 35 | 33 | |
| 36 | 34 | @Test |
| 37 | public void test_XmlParse_StraightQuotes_CurlyQuotes() throws IOException { |
|
| 35 | public void test_Parse_UncurledQuotes2_CurlyQuotes() throws IOException { |
|
| 36 | testCurler( createCurler( FILTER_PLAIN ), "unambiguous-2-pass.txt" ); |
|
| 37 | } |
|
| 38 | ||
| 39 | @Test |
|
| 40 | public void test_Parse_UncurledQuotesXml_CurlyQuotes() throws IOException { |
|
| 38 | 41 | testCurler( createCurler( FILTER_XML ), "xml.txt" ); |
| 39 | 42 | } |
| 1 | 1 | # ######################################################################## |
| 2 | # Double quotes |
|
| 3 | # ######################################################################## |
|
| 4 | ||
| 5 | "" |
|
| 6 | “” |
|
| 7 | ||
| 8 | "" "" |
|
| 9 | “” “” |
|
| 10 | ||
| 11 | """" |
|
| 12 | “”“” |
|
| 13 | ||
| 14 | " " |
|
| 15 | “ ” |
|
| 16 | ||
| 17 | " " " " |
|
| 18 | “ ” “ ” |
|
| 19 | ||
| 20 | "?" |
|
| 21 | “?” |
|
| 22 | ||
| 23 | "!" |
|
| 24 | “!” |
|
| 25 | ||
| 26 | "‽" |
|
| 27 | “‽” |
|
| 28 | ||
| 29 | "...?!" |
|
| 30 | “...?!” |
|
| 31 | ||
| 32 | " ... " |
|
| 33 | “ ... ” |
|
| 34 | ||
| 35 | # ######################################################################## |
|
| 2 | 36 | # Single quotes |
| 3 | 37 | # ######################################################################## |
| 38 | ||
| 39 | ' ' |
|
| 40 | ‘ ’ |
|
| 41 | ||
| 42 | ' ' ' ' |
|
| 43 | ‘ ’ ‘ ’ |
|
| 44 | ||
| 45 | ' ... ' |
|
| 46 | ‘ ... ’ |
|
| 47 | ||
| 48 | ' ... ' ' ... ' |
|
| 49 | ‘ ... ’ ‘ ... ’ |
|
| 50 | ||
| 51 | '?' |
|
| 52 | ‘?’ |
|
| 53 | ||
| 54 | '!' |
|
| 55 | ‘!’ |
|
| 56 | ||
| 57 | '?!' |
|
| 58 | ‘?!’ |
|
| 59 | ||
| 60 | '‽' |
|
| 61 | ‘‽’ |
|
| 62 | ||
| 4 | 63 | With 'quotes' in the middle. |
| 5 | 64 | With ‘quotes’ in the middle. |