Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Fix empty quotation mark edge cases

Author: Dave Jarvis <email>
Date: 2022-08-22 19:15:39 GMT-0700
Commit: bc2b1af27be6304b46b74991fb0d93bf32a552ea
Parent: 0e878a3
src/main/java/com/whitemagicsoftware/keenquotes/parser/AmbiguityResolver.java
mTree = mTree.closing( token );
}
+ else if( token.isType( QUOTE_AMBIGUOUS_DOUBLE ) ) {
+ // Create subtrees for: <" ... ">, <" ">, <"">, etc.
+ if( mTree.hasOpeningDoubleQuote() ) {
+ token.setTokenType( QUOTE_CLOSING_DOUBLE );
+ mTree = mTree.closing( token );
+ }
+ else {
+ token.setTokenType( QUOTE_OPENING_DOUBLE );
+ mTree = mTree.opening( token );
+ }
+ }
// Add ambiguous tokens to be resolved; add apostrophes for later emitting.
else {
private void resolve( final List<Token> tokens ) {
assert tokens != null;
+
+ boolean leader = false;
for( final var token : tokens ) {
if( token.isType( QUOTE_AMBIGUOUS_LEADING ) ) {
// Once a leader quote is found, any laggard could be a closing quote.
- break;
+ leader = true;
}
- else if( token.isType( QUOTE_AMBIGUOUS_LAGGING ) ) {
+ else if( !leader && token.isType( QUOTE_AMBIGUOUS_LAGGING ) ) {
token.setTokenType( QUOTE_APOSTROPHE );
+ }
+ else if( token.isType( QUOTE_AMBIGUOUS_SINGLE ) ) {
+ // Create subtrees for: <' ... '>, <' '>, <''>, etc.
+ if( mTree.hasOpeningSingleQuote() ) {
+ token.setTokenType( QUOTE_CLOSING_SINGLE );
+ mTree = mTree.closing( token );
+ }
+ else {
+ token.setTokenType( QUOTE_OPENING_SINGLE );
+ mTree = mTree.opening( token );
+ }
}
}
final var countLeading = tree.count( QUOTE_AMBIGUOUS_LEADING );
final var countLagging = tree.count( QUOTE_AMBIGUOUS_LAGGING );
- final var countUnknown = tree.count( AMBIGUOUS );
+ final var countSingles = tree.count( QUOTE_AMBIGUOUS_SINGLE );
if( tree.hasOpeningSingleQuote() && !tree.hasClosingSingleQuote() ) {
- if( countUnknown == 0 && countLeading == 0 && countLagging == 1 ) {
+ if( countSingles == 0 && countLeading == 0 && countLagging == 1 ) {
tree.replaceAll( QUOTE_AMBIGUOUS_LAGGING, QUOTE_CLOSING_SINGLE );
}
- else if( countUnknown == 1 && countLagging == 0 ) {
- tree.replaceAll( AMBIGUOUS, QUOTE_CLOSING_SINGLE );
+ else if( countSingles == 1 && countLagging == 0 ) {
+ tree.replaceAll( QUOTE_AMBIGUOUS_SINGLE, QUOTE_CLOSING_SINGLE );
}
}
- if( countUnknown == 0 && countLeading == 1 && countLagging == 0 &&
+ if( countSingles == 0 && countLeading == 1 && countLagging == 0 &&
!tree.hasOpeningSingleQuote() && tree.hasClosingSingleQuote() ) {
tree.replaceAll( QUOTE_AMBIGUOUS_LEADING, QUOTE_OPENING_SINGLE );
}
if( !tree.hasOpeningSingleQuote() && !tree.hasClosingSingleQuote() ||
tree.isBalanced() ) {
- if( countUnknown == 0 && countLeading > 0 && countLagging == 0 ) {
+ if( countSingles == 0 && countLeading > 0 && countLagging == 0 ) {
tree.replaceAll( QUOTE_AMBIGUOUS_LEADING, QUOTE_APOSTROPHE );
}
- if( countUnknown == 0 && countLeading == 0 && countLagging > 0 ) {
+ if( countSingles == 0 && countLeading == 0 && countLagging > 0 ) {
tree.replaceAll( QUOTE_AMBIGUOUS_LAGGING, QUOTE_APOSTROPHE );
}
src/main/java/com/whitemagicsoftware/keenquotes/parser/Contractions.java
}
- public boolean beganEndedUmambiguously( final String word ) {
+ public boolean beganEndedUnambiguously( final String word ) {
assert word != null;
return getBeganEndedUnambiguous().contains( word );
src/main/java/com/whitemagicsoftware/keenquotes/parser/QuoteEmitter.java
};
- private static final LexemeType[] QUOTE_SINGLE_QUOTE_DOUBLE = {
- QUOTE_SINGLE, QUOTE_DOUBLE
- };
-
- /**
- * Single quotes preceded by these {@link LexemeType}s may be opening quotes.
- */
- private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE =
- new LexemeType[]{
- LexemeType.SOT, SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP
- };
-
- /**
- * Single quotes succeeded by these {@link LexemeType}s may be opening quotes.
- */
- private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE =
- new LexemeType[]{
- WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE
- };
-
- /**
- * Single quotes preceded by these {@link LexemeType}s may be closing quotes.
- */
- private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE =
- new LexemeType[]{
- WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE
- };
-
- /**
- * Single quotes succeeded by these {@link LexemeType}s may be closing quotes.
- */
- private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE =
- new LexemeType[]{
- SPACE, HYPHEN, DASH, PUNCT, PERIOD, ELLIPSIS, QUOTE_DOUBLE, CLOSING_GROUP,
- ENDING
- };
-
- /**
- * Double quotes preceded by these {@link LexemeType}s may be opening quotes.
- */
- private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE =
- new LexemeType[]{
- LexemeType.SOT, SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL, EOP
- };
-
- /**
- * Double quotes succeeded by these {@link LexemeType}s may be opening quotes.
- */
- private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE =
- new LexemeType[]{
- WORD, NUMBER, DASH, ELLIPSIS, OPENING_GROUP, QUOTE_SINGLE,
- QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING, QUOTE_DOUBLE
- };
-
- /**
- * Double quotes preceded by these {@link LexemeType}s may be closing quotes.
- */
- private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE =
- new LexemeType[]{
- WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS, CLOSING_GROUP, QUOTE_SINGLE,
- QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING
- };
-
- /**
- * Double quotes succeeded by these {@link LexemeType}s may be closing quotes.
- */
- private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE =
- new LexemeType[]{
- SPACE, PUNCT, PERIOD, EQUALS, HYPHEN, DASH, QUOTE_SINGLE, CLOSING_GROUP,
- ENDING
- };
-
- private final CircularFifoQueue<Lexeme> mQ = new CircularFifoQueue<>( 4 );
- private final String mText;
- private final Contractions mContractions;
- private final Consumer<Token> mConsumer;
-
- public QuoteEmitter(
- final String text,
- final Contractions contractions,
- final Consumer<Token> consumer
- ) {
- assert text != null;
- assert contractions != null;
-
- mText = text;
- mContractions = contractions;
- mConsumer = consumer;
- }
-
- /**
- * Scans the given text document for quotation marks and passes them to the
- * given {@link Token} {@link Consumer}.
- *
- * @param text The prose to lex.
- * @param contractions List of ambiguous and unambiguous contractions.
- * @param consumer Receives
- */
- public static void analyze(
- final String text,
- final Contractions contractions,
- final Consumer<Token> consumer,
- final LexerFilter filter
- ) {
- final var emitter = new QuoteEmitter( text, contractions, consumer );
- Lexer.lex( text, emitter, filter );
- }
-
- /**
- * @param lexeme the input argument
- */
- @Override
- public void accept( final Lexeme lexeme ) {
- mQ.add( lexeme );
-
- if( mQ.size() == 4 ) {
- parse();
- }
- }
-
- private void parse() {
- final var lex1 = mQ.get( 0 );
- final var lex2 = mQ.get( 1 );
- final var lex3 = mQ.get( 2 );
- final var lex4 = mQ.get( 3 );
-
- // <y'all>, <Ph.D.'ll>, <20's>, <she's>
- if( match( WORD_PERIOD_NUMBER, QUOTE_SINGLE, WORD, ANY ) ) {
- emit( QUOTE_APOSTROPHE, lex2 );
- }
- // <'n'>, <'N'>, <'owlin'>
- else if(
- match( ANY, QUOTE_SINGLE, WORD, QUOTE_SINGLE ) &&
- mContractions.beganEndedUmambiguously(lex3.toString(mText))
- ) {
- emit( QUOTE_APOSTROPHE, lex2 );
- emit( QUOTE_APOSTROPHE, lex4 );
- mQ.set( Lexeme.NONE, 3 );
- }
- // <2''>
- else if( match( NUMBER, QUOTE_SINGLE, QUOTE_SINGLE, ANY ) ) {
- emit( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() );
- mQ.set( Lexeme.NONE, 2 );
- }
- // <2'>
- else if( match( NUMBER, QUOTE_SINGLE, ANY, ANY ) ) {
- emit( QUOTE_PRIME_SINGLE, lex2 );
- }
- // <2">
- else if( match( NUMBER, QUOTE_DOUBLE, ANY, ANY ) ) {
- emit( QUOTE_PRIME_DOUBLE, lex2 );
- }
- // <thinkin'>
- else if(
- match( WORD, QUOTE_SINGLE, ANY, ANY ) &&
- mContractions.endedUnambiguously( lex1.toString( mText ) )
- ) {
- emit( QUOTE_APOSTROPHE, lex2 );
- }
- // <'02>
- else if( match( ANY, QUOTE_SINGLE, NUMBER, SPACE_PUNCT ) ) {
- emit( QUOTE_APOSTROPHE, lex2 );
- }
- // <'20s>
- else if(
- match( ANY, QUOTE_SINGLE, NUMBER, WORD ) &&
- "s".equalsIgnoreCase( lex4.toString( mText ) )
- ) {
- emit( QUOTE_APOSTROPHE, lex2 );
- }
- // <.'\n>
- else if( match( PUNCT_PERIOD_ELLIPSIS_DASH, QUOTE_SINGLE, ENDING, ANY ) ) {
- emit( QUOTE_CLOSING_SINGLE, lex2 );
- }
- // <\'>
- else if( match( ESC_SINGLE, ANY, ANY, ANY ) ) {
- emit( QUOTE_STRAIGHT_SINGLE, lex1 );
- }
- // <\">
- else if( match( ESC_DOUBLE, ANY, ANY, ANY ) ) {
- emit( QUOTE_STRAIGHT_DOUBLE, lex1 );
-
- // <\"'--->
- if( match( ESC_DOUBLE, QUOTE_SINGLE, SPACE_DASH_ENDING, ANY ) ) {
- emit( QUOTE_CLOSING_SINGLE, lex2 );
- }
- }
- // <---'" >
- else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, SPACE_ENDING ) ) {
- emit( QUOTE_CLOSING_SINGLE, lex2 );
- }
- // <o’-lantern>, <o' fellow>, <O'-the>
- else if(
- match( WORD, QUOTE_SINGLE, SPACE_HYPHEN, WORD ) &&
- "o".equalsIgnoreCase( lex1.toString( mText ) )
- ) {
- emit( QUOTE_APOSTROPHE, lex2 );
- }
- // <"">, <"...>, <"word>, <---"word>
- else if(
- match(
- LEADING_QUOTE_OPENING_DOUBLE, QUOTE_DOUBLE,
- LAGGING_QUOTE_OPENING_DOUBLE, ANY
- )
- ) {
- emit( QUOTE_OPENING_DOUBLE, lex2 );
- }
- // <..."'>, <word"'>, <?"'>, <word"?>
- else if(
- match(
- LEADING_QUOTE_CLOSING_DOUBLE, QUOTE_DOUBLE,
- LAGGING_QUOTE_CLOSING_DOUBLE, ANY
- )
- ) {
- emit( QUOTE_CLOSING_DOUBLE, lex2 );
- }
- // < ''E>
- else if( match( SPACE_SOT, QUOTE_SINGLE, QUOTE_SINGLE, WORD ) ) {
- // Consume both immediately to avoid the false ambiguity <'e>.
- emit( QUOTE_OPENING_SINGLE, lex2 );
- emit( QUOTE_APOSTROPHE, lex3 );
- mQ.set( Lexeme.NONE, 1 );
- mQ.set( Lexeme.NONE, 2 );
- }
- // <'...>, <'word>, <---'word>, < 'nation>
- else if(
- match(
- LEADING_QUOTE_OPENING_SINGLE, QUOTE_SINGLE,
- LAGGING_QUOTE_OPENING_SINGLE, ANY )
- ) {
- final var word = lex3.toString( mText );
-
- if( mContractions.beganAmbiguously( word ) ) {
- emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
- }
- else if( mContractions.beganUnambiguously( word ) ) {
- emit( QUOTE_APOSTROPHE, lex2 );
- }
- // <"'"nested>
- else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, WORD ) ) {
- emit( QUOTE_OPENING_SINGLE, lex2 );
- }
- // <"'" >
- else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) {
- emit( lex2 );
- }
- // < '" >
- else if( match( ANY, QUOTE_SINGLE, LAGGING_QUOTE_OPENING_SINGLE, ANY ) ) {
- emit( QUOTE_OPENING_SINGLE, lex2 );
- }
- // Ambiguous
- else {
- emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
- }
- }
- // <word'">, <...'--->, <"' >
- else if(
- match(
- LEADING_QUOTE_CLOSING_SINGLE, QUOTE_SINGLE,
- LAGGING_QUOTE_CLOSING_SINGLE, ANY
- )
- ) {
- final var word = lex1.toString( mText );
-
- if( mContractions.endedAmbiguously( word ) ) {
- emit( QUOTE_AMBIGUOUS_LAGGING, lex2 );
- }
- else {
- emit( QUOTE_CLOSING_SINGLE, lex2 );
- }
- }
- // <word';> (contraction inferred by previous matches)
- else if( match( WORD, QUOTE_SINGLE, PUNCT_PERIOD, ANY ) ) {
- emit( QUOTE_APOSTROPHE, lex2 );
- }
- // <---'">
- else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) {
- emit( QUOTE_CLOSING_SINGLE, lex2 );
- }
- // <'42>, <'-3.14>
- else if( match( ANY, QUOTE_SINGLE, NUMBER, ANY ) ) {
- emit( QUOTE_OPENING_SINGLE, lex2 );
- }
- // <PRE-PARSED><'---.>
- else if( match( LexemeType.NONE, QUOTE_SINGLE, ANY, ANY ) ) {
- emit( QUOTE_CLOSING_SINGLE, lex2 );
- }
- // <''Cause >
- else if( match( QUOTE_SINGLE, QUOTE_SINGLE, WORD, ANY ) ) {
- final var word = lex3.toString( mText );
-
- if( mContractions.beganAmbiguously( word ) ) {
- emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
- }
- else if( mContractions.beganUnambiguously( word ) ) {
- emit( QUOTE_APOSTROPHE, lex2 );
- }
- else {
- emit( lex2 );
- }
- }
- // Ambiguous (no match)
- else if( match( ANY, QUOTE_SINGLE_QUOTE_DOUBLE, ANY, ANY ) ) {
- emit( lex2 );
- }
- }
-
- private void emit( final TokenType tokenType, final Lexeme lexeme ) {
- mConsumer.accept( new Token( tokenType, lexeme ) );
- }
-
- private void emit(
- final TokenType tokenType,
- final int began,
- final int ended ) {
- mConsumer.accept( new Token( tokenType, began, ended ) );
- }
-
- /**
- * Emits a token that represents an ambiguous quotation mark.
- *
- * @param lexeme A quotation mark that could not be curled.
- */
- private void emit( final Lexeme lexeme ) {
- mConsumer.accept( new Token( AMBIGUOUS, lexeme ) );
- }
-
- private boolean match(
- final LexemeType l1,
- final LexemeType l2,
- final LexemeType l3,
- final LexemeType l4 ) {
- return mQ.get( 0 ).isType( l1 ) &&
- mQ.get( 1 ).isType( l2 ) &&
- mQ.get( 2 ).isType( l3 ) &&
- mQ.get( 3 ).isType( l4 );
- }
-
- private boolean match(
- final LexemeType[] l1,
- final LexemeType l2,
- final LexemeType l3,
- final LexemeType l4 ) {
- return mQ.get( 0 ).isType( l1 ) &&
- mQ.get( 1 ).isType( l2 ) &&
- mQ.get( 2 ).isType( l3 ) &&
- mQ.get( 3 ).isType( l4 );
- }
-
- private boolean match(
- final LexemeType l1,
- final LexemeType[] l2,
+ /**
+ * Single quotes preceded by these {@link LexemeType}s may be opening quotes.
+ */
+ private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE =
+ new LexemeType[]{
+ LexemeType.SOT, SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP
+ };
+
+ /**
+ * Single quotes succeeded by these {@link LexemeType}s may be opening quotes.
+ */
+ private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE =
+ new LexemeType[]{
+ WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE
+ };
+
+ /**
+ * Single quotes preceded by these {@link LexemeType}s may be closing quotes.
+ */
+ private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE =
+ new LexemeType[]{
+ WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE
+ };
+
+ /**
+ * Single quotes succeeded by these {@link LexemeType}s may be closing quotes.
+ */
+ private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE =
+ new LexemeType[]{
+ SPACE, HYPHEN, DASH, PUNCT, PERIOD, ELLIPSIS, QUOTE_DOUBLE, CLOSING_GROUP,
+ ENDING
+ };
+
+ /**
+ * Double quotes preceded by these {@link LexemeType}s may be opening quotes.
+ */
+ private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE =
+ new LexemeType[]{
+ LexemeType.SOT, SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL, EOP
+ };
+
+ /**
+ * Double quotes succeeded by these {@link LexemeType}s may be opening quotes.
+ */
+ private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE =
+ new LexemeType[]{
+ WORD, PUNCT, NUMBER, DASH, ELLIPSIS, OPENING_GROUP, QUOTE_SINGLE,
+ QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING, QUOTE_DOUBLE
+ };
+
+ /**
+ * Double quotes preceded by these {@link LexemeType}s may be closing quotes.
+ */
+ private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE =
+ new LexemeType[]{
+ WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS, CLOSING_GROUP, QUOTE_SINGLE,
+ QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING
+ };
+
+ /**
+ * Double quotes succeeded by these {@link LexemeType}s may be closing quotes.
+ */
+ private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE =
+ new LexemeType[]{
+ SPACE, PUNCT, PERIOD, EQUALS, HYPHEN, DASH, QUOTE_SINGLE, CLOSING_GROUP,
+ ENDING
+ };
+
+ private final CircularFifoQueue<Lexeme> mQ = new CircularFifoQueue<>( 4 );
+ private final String mText;
+ private final Contractions mContractions;
+ private final Consumer<Token> mConsumer;
+
+ public QuoteEmitter(
+ final String text,
+ final Contractions contractions,
+ final Consumer<Token> consumer
+ ) {
+ assert text != null;
+ assert contractions != null;
+
+ mText = text;
+ mContractions = contractions;
+ mConsumer = consumer;
+ }
+
+ /**
+ * Scans the given text document for quotation marks and passes them to the
+ * given {@link Token} {@link Consumer}. A new {@link QuoteEmitter} is created
+ * for the text and handed to {@code Lexer.lex} as the receiver of lexemes.
+ *
+ * @param text The prose to lex.
+ * @param contractions List of ambiguous and unambiguous contractions.
+ * @param consumer Receives each {@link Token} the emitter produces for a
+ * quotation mark found in the text.
+ * @param filter Passed unchanged to {@code Lexer.lex}; presumably controls
+ * which regions of the text are lexed (e.g., plain vs. XML)
+ * -- TODO confirm against {@code LexerFilter}.
+ */
+ public static void analyze(
+ final String text,
+ final Contractions contractions,
+ final Consumer<Token> consumer,
+ final LexerFilter filter
+ ) {
+ final var emitter = new QuoteEmitter( text, contractions, consumer );
+ Lexer.lex( text, emitter, filter );
+ }
+
+ /**
+ * Adds the given lexeme to the lookahead queue (created with a capacity of
+ * four) and, whenever the queue holds four lexemes, calls {@code parse()}
+ * to match the queued lexemes against the known quotation mark patterns.
+ * No parsing happens until four lexemes have been queued.
+ *
+ * @param lexeme The next lexeme to append to the lookahead queue.
+ */
+ @Override
+ public void accept( final Lexeme lexeme ) {
+ mQ.add( lexeme );
+
+ if( mQ.size() == 4 ) {
+ parse();
+ }
+ }
+
+ private void parse() {
+ final var lex1 = mQ.get( 0 );
+ final var lex2 = mQ.get( 1 );
+ final var lex3 = mQ.get( 2 );
+ final var lex4 = mQ.get( 3 );
+
+ // <y'all>, <Ph.D.'ll>, <20's>, <she's>
+ if( match( WORD_PERIOD_NUMBER, QUOTE_SINGLE, WORD, ANY ) ) {
+ emit( QUOTE_APOSTROPHE, lex2 );
+ }
+ // <'n'>, <'N'>, <'owlin'>
+ else if(
+ match( ANY, QUOTE_SINGLE, WORD, QUOTE_SINGLE ) &&
+ mContractions.beganEndedUnambiguously( lex3.toString( mText ) )
+ ) {
+ emit( QUOTE_APOSTROPHE, lex2 );
+ emit( QUOTE_APOSTROPHE, lex4 );
+ mQ.set( Lexeme.NONE, 3 );
+ }
+ // <2''>
+ else if( match( NUMBER, QUOTE_SINGLE, QUOTE_SINGLE, ANY ) ) {
+ emit( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() );
+ mQ.set( Lexeme.NONE, 2 );
+ }
+ // <2'>
+ else if( match( NUMBER, QUOTE_SINGLE, ANY, ANY ) ) {
+ emit( QUOTE_PRIME_SINGLE, lex2 );
+ }
+ // <2">
+ else if( match( NUMBER, QUOTE_DOUBLE, ANY, ANY ) ) {
+ emit( QUOTE_PRIME_DOUBLE, lex2 );
+ }
+ // <thinkin'>
+ else if(
+ match( WORD, QUOTE_SINGLE, ANY, ANY ) &&
+ mContractions.endedUnambiguously( lex1.toString( mText ) )
+ ) {
+ emit( QUOTE_APOSTROPHE, lex2 );
+ }
+ // <'02>
+ else if( match( ANY, QUOTE_SINGLE, NUMBER, SPACE_PUNCT ) ) {
+ emit( QUOTE_APOSTROPHE, lex2 );
+ }
+ // <'20s>
+ else if(
+ match( ANY, QUOTE_SINGLE, NUMBER, WORD ) &&
+ "s".equalsIgnoreCase( lex4.toString( mText ) )
+ ) {
+ emit( QUOTE_APOSTROPHE, lex2 );
+ }
+ // <.'\n>
+ else if( match( PUNCT_PERIOD_ELLIPSIS_DASH, QUOTE_SINGLE, ENDING, ANY ) ) {
+ emit( QUOTE_CLOSING_SINGLE, lex2 );
+ }
+ // <\'>
+ else if( match( ESC_SINGLE, ANY, ANY, ANY ) ) {
+ emit( QUOTE_STRAIGHT_SINGLE, lex1 );
+ }
+ // <\">
+ else if( match( ESC_DOUBLE, ANY, ANY, ANY ) ) {
+ emit( QUOTE_STRAIGHT_DOUBLE, lex1 );
+
+ // <\"'--->
+ if( match( ESC_DOUBLE, QUOTE_SINGLE, SPACE_DASH_ENDING, ANY ) ) {
+ emit( QUOTE_CLOSING_SINGLE, lex2 );
+ }
+ }
+ // <---'" >
+ else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, SPACE_ENDING ) ) {
+ emit( QUOTE_CLOSING_SINGLE, lex2 );
+ }
+ // <o’-lantern>, <o' fellow>, <O'-the>
+ else if(
+ match( WORD, QUOTE_SINGLE, SPACE_HYPHEN, WORD ) &&
+ "o".equalsIgnoreCase( lex1.toString( mText ) )
+ ) {
+ emit( QUOTE_APOSTROPHE, lex2 );
+ }
+ // <"">, <"...>, <"word>, <---"word>
+ else if(
+ match(
+ LEADING_QUOTE_OPENING_DOUBLE, QUOTE_DOUBLE,
+ LAGGING_QUOTE_OPENING_DOUBLE, ANY
+ )
+ ) {
+ emit( QUOTE_OPENING_DOUBLE, lex2 );
+ }
+ // <..."'>, <word"'>, <?"'>, <word"?>
+ else if(
+ match(
+ LEADING_QUOTE_CLOSING_DOUBLE, QUOTE_DOUBLE,
+ LAGGING_QUOTE_CLOSING_DOUBLE, ANY
+ )
+ ) {
+ emit( QUOTE_CLOSING_DOUBLE, lex2 );
+ }
+ // < ''E>
+ else if( match( SPACE_SOT, QUOTE_SINGLE, QUOTE_SINGLE, WORD ) ) {
+ // Consume both immediately to avoid the false ambiguity <'e>.
+ emit( QUOTE_OPENING_SINGLE, lex2 );
+ emit( QUOTE_APOSTROPHE, lex3 );
+ mQ.set( Lexeme.NONE, 1 );
+ mQ.set( Lexeme.NONE, 2 );
+ }
+ // <'...>, <'word>, <---'word>, < 'nation>
+ else if(
+ match(
+ LEADING_QUOTE_OPENING_SINGLE, QUOTE_SINGLE,
+ LAGGING_QUOTE_OPENING_SINGLE, ANY )
+ ) {
+ final var word = lex3.toString( mText );
+
+ if( mContractions.beganAmbiguously( word ) ) {
+ emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
+ }
+ else if( mContractions.beganUnambiguously( word ) ) {
+ emit( QUOTE_APOSTROPHE, lex2 );
+ }
+ // <"'"nested>
+ else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, WORD ) ) {
+ emit( QUOTE_OPENING_SINGLE, lex2 );
+ }
+ // <"'" >
+ else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) {
+ emit( QUOTE_AMBIGUOUS_SINGLE, lex2 );
+ }
+ // < '" >
+ else if( match( ANY, QUOTE_SINGLE, LAGGING_QUOTE_OPENING_SINGLE, ANY ) ) {
+ emit( QUOTE_OPENING_SINGLE, lex2 );
+ }
+ // Ambiguous
+ else {
+ emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
+ }
+ }
+ // <word'">, <...'--->, <"' >
+ else if(
+ match(
+ LEADING_QUOTE_CLOSING_SINGLE, QUOTE_SINGLE,
+ LAGGING_QUOTE_CLOSING_SINGLE, ANY
+ )
+ ) {
+ final var word = lex1.toString( mText );
+
+ if( mContractions.endedAmbiguously( word ) ) {
+ emit( QUOTE_AMBIGUOUS_LAGGING, lex2 );
+ }
+ else {
+ emit( QUOTE_CLOSING_SINGLE, lex2 );
+ }
+ }
+ // <word';> (contraction inferred by previous matches)
+ else if( match( WORD, QUOTE_SINGLE, PUNCT_PERIOD, ANY ) ) {
+ emit( QUOTE_APOSTROPHE, lex2 );
+ }
+ // <---'">
+ else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) {
+ emit( QUOTE_CLOSING_SINGLE, lex2 );
+ }
+ // <'42>, <'-3.14>
+ else if( match( ANY, QUOTE_SINGLE, NUMBER, ANY ) ) {
+ emit( QUOTE_OPENING_SINGLE, lex2 );
+ }
+ // <PRE-PARSED><'---.>
+ else if( match( LexemeType.NONE, QUOTE_SINGLE, ANY, ANY ) ) {
+ emit( QUOTE_CLOSING_SINGLE, lex2 );
+ }
+ // <''Cause >
+ else if( match( QUOTE_SINGLE, QUOTE_SINGLE, WORD, ANY ) ) {
+ final var word = lex3.toString( mText );
+
+ if( mContractions.beganAmbiguously( word ) ) {
+ emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
+ }
+ else if( mContractions.beganUnambiguously( word ) ) {
+ emit( QUOTE_APOSTROPHE, lex2 );
+ }
+ else {
+ emit( QUOTE_AMBIGUOUS_SINGLE, lex2 );
+ }
+ }
+ else if( match( ANY, QUOTE_DOUBLE, ANY, ANY ) ) {
+ emit( QUOTE_AMBIGUOUS_DOUBLE, lex2 );
+ }
+ // Ambiguous (no match)
+ else if( match( ANY, QUOTE_SINGLE, ANY, ANY ) ) {
+ emit( QUOTE_AMBIGUOUS_SINGLE, lex2 );
+ }
+ }
+
+ private void emit( final TokenType tokenType, final Lexeme lexeme ) {
+ mConsumer.accept( new Token( tokenType, lexeme ) );
+ }
+
+ private void emit(
+ final TokenType tokenType,
+ final int began,
+ final int ended ) {
+ mConsumer.accept( new Token( tokenType, began, ended ) );
+ }
+
+ private boolean match(
+ final LexemeType l1,
+ final LexemeType l2,
+ final LexemeType l3,
+ final LexemeType l4 ) {
+ return mQ.get( 0 ).isType( l1 ) &&
+ mQ.get( 1 ).isType( l2 ) &&
+ mQ.get( 2 ).isType( l3 ) &&
+ mQ.get( 3 ).isType( l4 );
+ }
+
+ private boolean match(
+ final LexemeType[] l1,
+ final LexemeType l2,
final LexemeType l3,
final LexemeType l4 ) {
src/main/java/com/whitemagicsoftware/keenquotes/parser/Token.java
boolean isAmbiguous() {
- return mTokenType == AMBIGUOUS ||
+ return mTokenType == QUOTE_AMBIGUOUS_SINGLE ||
+ mTokenType == QUOTE_AMBIGUOUS_DOUBLE ||
mTokenType == QUOTE_AMBIGUOUS_LEADING ||
mTokenType == QUOTE_AMBIGUOUS_LAGGING;
src/main/java/com/whitemagicsoftware/keenquotes/parser/TokenType.java
QUOTE_AMBIGUOUS_LEADING( "leading-ambiguous" ),
QUOTE_AMBIGUOUS_LAGGING( "lagging-ambiguous" ),
- AMBIGUOUS( "ambiguous" ),
+ QUOTE_AMBIGUOUS_DOUBLE( "double-ambiguous" ),
+ QUOTE_AMBIGUOUS_SINGLE( "single-ambiguous" ),
NONE;
src/main/java/com/whitemagicsoftware/keenquotes/parser/Tree.java
*/
public boolean hasOpeningSingleQuote() {
- return mOpening.isType( QUOTE_OPENING_SINGLE );
+ return isOpening( QUOTE_OPENING_SINGLE );
+ }
+
+ public boolean hasOpeningDoubleQuote() {
+ return isOpening( QUOTE_OPENING_DOUBLE );
+ }
+
+ private boolean isOpening( final TokenType tokenType ) {
+ return mOpening.isType( tokenType );
}
src/test/java/com/whitemagicsoftware/keenquotes/parser/CurlerTest.java
/**
* Tests that straight quotes are converted to curly quotes.
- *
- * @throws IOException Error opening file full of fun.
*/
@Test
- public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException {
+ public void test_Parse_UncurledQuotes1_CurlyQuotes() throws IOException {
testCurler( createCurler( FILTER_PLAIN ), "unambiguous-1-pass.txt" );
}
@Test
- public void test_XmlParse_StraightQuotes_CurlyQuotes() throws IOException {
+ public void test_Parse_UncurledQuotes2_CurlyQuotes() throws IOException {
+ testCurler( createCurler( FILTER_PLAIN ), "unambiguous-2-pass.txt" );
+ }
+
+ @Test
+ public void test_Parse_UncurledQuotesXml_CurlyQuotes() throws IOException {
testCurler( createCurler( FILTER_XML ), "xml.txt" );
}
src/test/resources/com/whitemagicsoftware/keenquotes/texts/unambiguous-2-pass.txt
# ########################################################################
+# Double quotes
+# ########################################################################
+
+""
+&ldquo;&rdquo;
+
+"" ""
+&ldquo;&rdquo; &ldquo;&rdquo;
+
+""""
+&ldquo;&rdquo;&ldquo;&rdquo;
+
+" "
+&ldquo; &rdquo;
+
+" " " "
+&ldquo; &rdquo; &ldquo; &rdquo;
+
+"?"
+&ldquo;?&rdquo;
+
+"!"
+&ldquo;!&rdquo;
+
+"‽"
+&ldquo;‽&rdquo;
+
+"...?!"
+&ldquo;...?!&rdquo;
+
+" ... "
+&ldquo; ... &rdquo;
+
+# ########################################################################
# Single quotes
# ########################################################################
+
+' '
+&lsquo; &rsquo;
+
+' ' ' '
+&lsquo; &rsquo; &lsquo; &rsquo;
+
+' ... '
+&lsquo; ... &rsquo;
+
+' ... ' ' ... '
+&lsquo; ... &rsquo; &lsquo; ... &rsquo;
+
+'?'
+&lsquo;?&rsquo;
+
+'!'
+&lsquo;!&rsquo;
+
+'?!'
+&lsquo;?!&rsquo;
+
+'‽'
+&lsquo;‽&rsquo;
+
With 'quotes' in the middle.
With &lsquo;quotes&rsquo; in the middle.
Delta: 444 lines added, 369 lines removed, 75-line increase