Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Emits em- and en-dashes

Author: Dave Jarvis <email>
Date: 2025-08-30 11:16:16 GMT-0700
Commit: 51c9559ce6a62a51631ee589e6e72af20ac0c47d
Parent: d4a8ea6
src/main/java/com/whitemagicsoftware/keenquotes/lex/LexemeGlyph.java
LEX_DOUBLE_CHEVRON_RIGHT( '»' ),
LEX_SINGLE_CHEVRON_LEFT( '‹' ),
- LEX_SINGLE_CHEVRON_RIGHT( '›' );
+ LEX_SINGLE_CHEVRON_RIGHT( '›' ),
+
+ LEX_SPACE( ' ' ),
+
+ LEX_ELLIPSES( '…' ),
+ LEX_DASH_EN( '–' ),
+ LEX_DASH_EM( '—' ),
+ LEX_HYPHEN( '-' );
private final char mGlyph;
src/main/java/com/whitemagicsoftware/keenquotes/lex/LexemeType.java
public static final LexemeType EOP = new LexemeType();
public static final LexemeType EOT = new LexemeType();
- public static final LexemeType SPACE = new LexemeType();
+ public static final LexemeType SPACE = new LexemeType( LEX_SPACE );
public static final LexemeType WORD = new LexemeType();
public static final LexemeType NUMBER = new LexemeType();
public static final LexemeType PUNCT = new LexemeType();
public static final LexemeType OPENING_GROUP = new LexemeType();
public static final LexemeType CLOSING_GROUP = new LexemeType();
- public static final LexemeType HYPHEN = new LexemeType();
- public static final LexemeType DASH = new LexemeType();
+ public static final LexemeType DASH_EN = new LexemeType( LEX_DASH_EN );
+ public static final LexemeType DASH_EM = new LexemeType( LEX_DASH_EM );
+ public static final LexemeType DASH_LINE = new LexemeType();
+ public static final LexemeType HYPHEN = new LexemeType( LEX_HYPHEN );
public static final LexemeType EQUALS = new LexemeType();
public static final LexemeType PERIOD = new LexemeType();
- public static final LexemeType ELLIPSIS = new LexemeType();
+ public static final LexemeType ELLIPSIS = new LexemeType( LEX_ELLIPSES );
public static final LexemeType ENDING = new LexemeType();
public static final LexemeType ANY = new LexemeType();
src/main/java/com/whitemagicsoftware/keenquotes/lex/Lexer.java
);
- private static final Set<Character> DASH_CHARS = new HashSet<>(
- Arrays.asList( '-', '–', '—', '―' )
- );
-
/**
* Consider {@code _} and {@code *} as letters (in a word) because plain
}
else if( curr == '\r' || curr == '\n' ) {
- final var cr = new int[]{curr == '\r' ? 1 : 0};
- final var lf = new int[]{curr == '\n' ? 1 : 0};
+ final var cr = new int[]{ curr == '\r' ? 1 : 0 };
+ final var lf = new int[]{ curr == '\n' ? 1 : 0 };
// Swallow all consecutive CR (Mac), CRLF (Windows), and/or LF (Unix).
token = QUOTE_SINGLE;
}
- else if( curr == '-' && i.peek() != '-' ) {
- token = HYPHEN;
+ else if( isHyphen( curr ) ) {
+ final var began = i.index();
+ i.skip( Lexer::isHyphen );
+ final var count = i.index() - began;
+
+ token = switch( count ) {
+ case 0 -> HYPHEN;
+ case 1 -> DASH_EN;
+ case 2 -> DASH_EM;
+ default -> DASH_LINE;
+ };
}
- else if( isDash( curr ) ) {
- i.skip( Lexer::isDash );
- token = DASH;
+ else if( curr == '–' ) {
+ token = DASH_EN;
+ }
+ else if( curr == '—' ) {
+ token = DASH_EM;
}
else if( curr == '(' || curr == '{' || curr == '[' ) {
/**
- * Answers whether the given character may be part of an en- or em-dash.
- * This must be called after it is known that the character isn't a lone
- * hyphen.
+ * Answers whether the given character is a hyphen.
*
- * @param curr The character to check as being a dash.
- * @return {@code true} if the given character is part of a dash.
+ * @param curr The character to check as being a hyphen.
+ * @return {@code true} if the given character is a hyphen.
*/
- private static boolean isDash( final char curr ) {
- return DASH_CHARS.contains( curr );
+ private static boolean isHyphen( final char curr ) {
+ return curr == '-';
}
src/main/java/com/whitemagicsoftware/keenquotes/parser/QuoteEmitter.java
};
- private static final LexemeType[] PUNCT_PERIOD_ELLIPSIS_DASH = {
- PUNCT, PERIOD, ELLIPSIS, DASH
- };
-
- private static final LexemeType[] PUNCT_PERIOD = {
- PUNCT, PERIOD
- };
-
- private static final LexemeType[] SPACE_DASH_ENDING = {
- SPACE, DASH, ENDING
- };
-
- private static final LexemeType[] SPACE_ENDING = {
- SPACE, ENDING
- };
-
- private static final LexemeType[] SPACE_HYPHEN = {
- SPACE, HYPHEN
- };
-
- private static final LexemeType[] SPACE_PUNCT = {
- SPACE, PUNCT
- };
-
- private static final LexemeType[] SPACE_SOT = {
- SPACE, SOT
- };
-
- /**
- * Single quotes preceded by these {@link LexemeType}s may be opening quotes.
- */
- private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE =
- new LexemeType[]{
- LexemeType.SOT, SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP
- };
-
- /**
- * Single quotes succeeded by these {@link LexemeType}s may be opening quotes.
- */
- private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE =
- new LexemeType[]{
- WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE
- };
-
- /**
- * Single quotes preceded by these {@link LexemeType}s may be closing quotes.
- */
- private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE =
- new LexemeType[]{
- WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE
- };
-
- /**
- * Single quotes succeeded by these {@link LexemeType}s may be closing quotes.
- */
- private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE =
- new LexemeType[]{
- SPACE, HYPHEN, DASH, PUNCT, PERIOD, ELLIPSIS, QUOTE_DOUBLE, CLOSING_GROUP,
- ENDING
- };
-
- /**
- * Double quotes preceded by these {@link LexemeType}s may be opening quotes.
- */
- private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE =
- new LexemeType[]{
- LexemeType.SOT, SPACE, DASH, EQUALS, OPENING_GROUP, EOL, EOP
- };
-
- /**
- * Double quotes succeeded by these {@link LexemeType}s may be opening quotes.
- */
- private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE =
- new LexemeType[]{
- WORD, PUNCT, NUMBER, DASH, ELLIPSIS, OPENING_GROUP, QUOTE_SINGLE,
- QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING, QUOTE_DOUBLE
- };
-
- /**
- * Double quotes preceded by these {@link LexemeType}s may be closing quotes.
- */
- private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE =
- new LexemeType[]{
- WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS, CLOSING_GROUP, QUOTE_SINGLE,
- QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING
- };
-
- /**
- * Double quotes succeeded by these {@link LexemeType}s may be closing quotes.
- */
- private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE =
- new LexemeType[]{
- SPACE, PUNCT, PERIOD, EQUALS, HYPHEN, DASH, QUOTE_SINGLE, CLOSING_GROUP,
- ENDING
- };
-
- private final CircularFifoQueue<Lexeme> mQ = new CircularFifoQueue<>( 4 );
- private final String mText;
- private final Contractions mContractions;
- private final Consumer<Token> mConsumer;
-
- public QuoteEmitter(
- final String text,
- final Contractions contractions,
- final Consumer<Token> consumer
- ) {
- assert text != null;
- assert contractions != null;
-
- mText = text;
- mContractions = contractions;
- mConsumer = consumer;
- }
-
- /**
- * Scans the given text document for quotation marks and passes them to the
- * given {@link Token} {@link Consumer}.
- *
- * @param text The prose to lex.
- * @param contractions List of ambiguous and unambiguous contractions.
- * @param consumer Receives
- */
- public static void analyze(
- final String text,
- final Contractions contractions,
- final Consumer<Token> consumer,
- final LexerFilter filter
- ) {
- final var emitter = new QuoteEmitter( text, contractions, consumer );
- Lexer.lex( text, emitter, filter );
- }
-
- /**
- * @param lexeme the input argument
- */
- @Override
- public void accept( final Lexeme lexeme ) {
- mQ.add( lexeme );
-
- if( mQ.size() == 4 ) {
- parse();
- }
- }
-
- private void parse() {
- final var lex1 = mQ.get( 0 );
- final var lex2 = mQ.get( 1 );
- final var lex3 = mQ.get( 2 );
- final var lex4 = mQ.get( 3 );
-
- // <y'all>, <Ph.D.'ll>, <20's>, <she's>
- if( match( WORD_PERIOD_NUMBER, QUOTE_SINGLE, WORD, ANY ) ) {
- emit( QUOTE_APOSTROPHE, lex2 );
- }
- // <'n'>, <'N'>, <'owlin'>
- else if(
- match( ANY, QUOTE_SINGLE, WORD, QUOTE_SINGLE ) &&
- mContractions.beganEndedUnambiguously( lex3.toString( mText ) )
- ) {
- emit( QUOTE_APOSTROPHE, lex2 );
- emit( QUOTE_APOSTROPHE, lex4 );
- mQ.set( Lexeme.NONE, 3 );
- }
- // <2''>
- else if( match( NUMBER, QUOTE_SINGLE, QUOTE_SINGLE, ANY ) ) {
- // Force double primes to conform to the same constructor usage. This
- // simplifies the tokens, reduces some memory usage,
- final var lex = new Lexeme( PRIME_DOUBLE, lex2.began(), lex3.ended() );
-
- emit( QUOTE_PRIME_DOUBLE, lex );
- mQ.set( Lexeme.NONE, 2 );
- }
- // <2'>
- else if( match( NUMBER, QUOTE_SINGLE, ANY, ANY ) ) {
- emit( QUOTE_PRIME_SINGLE, lex2 );
- }
- // <2">
- else if( match( NUMBER, QUOTE_DOUBLE, ANY, ANY ) ) {
- emit( QUOTE_PRIME_DOUBLE, lex2 );
- }
- // <thinkin'>
- else if(
- match( WORD, QUOTE_SINGLE, ANY, ANY ) &&
- mContractions.endedUnambiguously( lex1.toString( mText ) )
- ) {
- emit( QUOTE_APOSTROPHE, lex2 );
- }
- // <'02>
- else if( match( ANY, QUOTE_SINGLE, NUMBER, SPACE_PUNCT ) ) {
- emit( QUOTE_APOSTROPHE, lex2 );
- }
- // <'20s>
- else if(
- match( ANY, QUOTE_SINGLE, NUMBER, WORD ) &&
- "s".equalsIgnoreCase( lex4.toString( mText ) )
- ) {
- emit( QUOTE_APOSTROPHE, lex2 );
- }
- // <.'\n>
- else if( match( PUNCT_PERIOD_ELLIPSIS_DASH, QUOTE_SINGLE, ENDING, ANY ) ) {
- emit( QUOTE_CLOSING_SINGLE, lex2 );
- }
- // <\'>
- else if( match( ESC_SINGLE, ANY, ANY, ANY ) ) {
- emit( QUOTE_STRAIGHT_SINGLE, lex1 );
- }
- // <\">
- else if( match( ESC_DOUBLE, ANY, ANY, ANY ) ) {
- emit( QUOTE_STRAIGHT_DOUBLE, lex1 );
-
- // <\"'--->
- if( match( ESC_DOUBLE, QUOTE_SINGLE, SPACE_DASH_ENDING, ANY ) ) {
- emit( QUOTE_CLOSING_SINGLE, lex2 );
- }
- }
- // <---'" >
- else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, SPACE_ENDING ) ) {
- emit( QUOTE_CLOSING_SINGLE, lex2 );
- }
- // <o’-lantern>, <o' fellow>, <O'-the>
- else if(
- match( WORD, QUOTE_SINGLE, SPACE_HYPHEN, WORD ) &&
- "o".equalsIgnoreCase( lex1.toString( mText ) )
- ) {
- emit( QUOTE_APOSTROPHE, lex2 );
- }
- // <"">, <"...>, <"word>, <---"word>
- else if(
- match(
- LEADING_QUOTE_OPENING_DOUBLE, QUOTE_DOUBLE,
- LAGGING_QUOTE_OPENING_DOUBLE, ANY
- )
- ) {
- emit( QUOTE_OPENING_DOUBLE, lex2 );
- }
- // <..."'>, <word"'>, <?"'>, <word"?>
- else if(
- match(
- LEADING_QUOTE_CLOSING_DOUBLE, QUOTE_DOUBLE,
- LAGGING_QUOTE_CLOSING_DOUBLE, ANY
- )
- ) {
- emit( QUOTE_CLOSING_DOUBLE, lex2 );
- }
- // < ''E>
- else if( match( SPACE_SOT, QUOTE_SINGLE, QUOTE_SINGLE, WORD ) ) {
- // Consume both immediately to avoid the false ambiguity <'e>.
- emit( QUOTE_OPENING_SINGLE, lex2 );
- emit( QUOTE_APOSTROPHE, lex3 );
- mQ.set( Lexeme.NONE, 1 );
- mQ.set( Lexeme.NONE, 2 );
- }
- // <'...>, <'word>, <---'word>, < 'nation>
- else if(
- match(
- LEADING_QUOTE_OPENING_SINGLE, QUOTE_SINGLE,
- LAGGING_QUOTE_OPENING_SINGLE, ANY )
- ) {
- final var word = lex3.toString( mText );
-
- if( mContractions.beganAmbiguously( word ) ) {
- emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
- }
- else if( mContractions.beganUnambiguously( word ) ) {
- emit( QUOTE_APOSTROPHE, lex2 );
- }
- // <"'"nested>
- else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, WORD ) ) {
- emit( QUOTE_OPENING_SINGLE, lex2 );
- }
- // <"'" >
- else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) {
- emit( QUOTE_AMBIGUOUS_SINGLE, lex2 );
- }
- // < '" >
- else if( match( ANY, QUOTE_SINGLE, LAGGING_QUOTE_OPENING_SINGLE, ANY ) ) {
- emit( QUOTE_OPENING_SINGLE, lex2 );
- }
- // Ambiguous
- else {
- emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
- }
- }
- // <"'--- >
- else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, DASH, ANY ) ) {
- emit( QUOTE_OPENING_SINGLE, lex2 );
- }
- // <word'">, <...'--->, <"' >
- else if(
- match(
- LEADING_QUOTE_CLOSING_SINGLE, QUOTE_SINGLE,
- LAGGING_QUOTE_CLOSING_SINGLE, ANY
- )
- ) {
- final var word = lex1.toString( mText );
-
- if( mContractions.endedAmbiguously( word ) ) {
- emit( QUOTE_AMBIGUOUS_LAGGING, lex2 );
- }
- else {
- emit( QUOTE_CLOSING_SINGLE, lex2 );
- }
- }
- // <word';> (contraction inferred by previous matches)
- else if( match( WORD, QUOTE_SINGLE, PUNCT_PERIOD, ANY ) ) {
- emit( QUOTE_APOSTROPHE, lex2 );
- }
- // <---'">
- else if( match( DASH, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) {
- emit( QUOTE_CLOSING_SINGLE, lex2 );
- }
- // <'42>, <'-3.14>
- else if( match( ANY, QUOTE_SINGLE, NUMBER, ANY ) ) {
- emit( QUOTE_OPENING_SINGLE, lex2 );
- }
- // <PRE-PARSED><'---.>
- else if( match( LexemeType.NONE, QUOTE_SINGLE, ANY, ANY ) ) {
- emit( QUOTE_CLOSING_SINGLE, lex2 );
- }
- // <''Cause >
- else if( match( QUOTE_SINGLE, QUOTE_SINGLE, WORD, ANY ) ) {
- final var word = lex3.toString( mText );
-
- if( mContractions.beganAmbiguously( word ) ) {
- emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
- }
- else if( mContractions.beganUnambiguously( word ) ) {
- emit( QUOTE_APOSTROPHE, lex2 );
- }
- else {
- emit( QUOTE_AMBIGUOUS_SINGLE, lex2 );
- }
- }
- // <'"Trouble>
- else if( match( QUOTE_SINGLE, QUOTE_DOUBLE, WORD, ANY ) ) {
- emit( QUOTE_OPENING_DOUBLE, lex2 );
- }
- // International quotation marks.
- else if( match( ANY, QUOTE_DOUBLE_OPENING, ANY, ANY ) ) {
- emit( QUOTE_OPENING_DOUBLE, lex2 );
- }
- else if( match( ANY, QUOTE_SINGLE_OPENING, ANY, ANY ) ) {
- emit( QUOTE_OPENING_SINGLE, lex2 );
- }
- else if( match( ANY, QUOTE_DOUBLE_CLOSING, ANY, ANY ) ) {
- emit( QUOTE_CLOSING_DOUBLE, lex2 );
- }
- else if( match( ANY, QUOTE_SINGLE_CLOSING, ANY, ANY ) ) {
- emit( QUOTE_CLOSING_SINGLE, lex2 );
- }
- // Ambiguous (no match)
- else if( match( ANY, QUOTE_SINGLE, ANY, ANY ) ) {
- emit( QUOTE_AMBIGUOUS_SINGLE, lex2 );
- }
- else if( match( ANY, QUOTE_DOUBLE, ANY, ANY ) ) {
- emit( QUOTE_AMBIGUOUS_DOUBLE, lex2 );
- }
- else if( match( ANY, ELLIPSIS, ANY, ANY ) ) {
- emit( QUOTE_ELLIPSIS, lex2 );
- }
- }
-
- private void emit( final TokenType tokenType, final Lexeme lexeme ) {
- mConsumer.accept( new Token( tokenType, lexeme ) );
- }
-
- private boolean match(
- final LexemeType l1,
- final LexemeType l2,
- final LexemeType l3,
- final LexemeType l4 ) {
- return mQ.get( 0 ).isType( l1 ) &&
- mQ.get( 1 ).isType( l2 ) &&
- mQ.get( 2 ).isType( l3 ) &&
- mQ.get( 3 ).isType( l4 );
- }
-
- private boolean match(
- final LexemeType[] l1,
- final LexemeType l2,
- final LexemeType l3,
- final LexemeType l4 ) {
- return mQ.get( 0 ).isType( l1 ) &&
- mQ.get( 1 ).isType( l2 ) &&
- mQ.get( 2 ).isType( l3 ) &&
- mQ.get( 3 ).isType( l4 );
- }
-
- private boolean match(
- final LexemeType l1,
- final LexemeType l2,
- final LexemeType[] l3,
- final LexemeType l4 ) {
- return mQ.get( 0 ).isType( l1 ) &&
- mQ.get( 1 ).isType( l2 ) &&
- mQ.get( 2 ).isType( l3 ) &&
- mQ.get( 3 ).isType( l4 );
- }
-
- private boolean match(
- final LexemeType l1,
- final LexemeType l2,
- final LexemeType l3,
- final LexemeType[] l4 ) {
- return mQ.get( 0 ).isType( l1 ) &&
- mQ.get( 1 ).isType( l2 ) &&
- mQ.get( 2 ).isType( l3 ) &&
- mQ.get( 3 ).isType( l4 );
- }
-
- private boolean match(
- final LexemeType[] l1,
- final LexemeType l2,
- final LexemeType[] l3,
- final LexemeType l4 ) {
+ private static final LexemeType[] DASHES = {
+ DASH_EN, DASH_EM, DASH_LINE, HYPHEN
+ };
+
+ private static final LexemeType[] PUNCT_PERIOD_ELLIPSIS_DASH = {
+ PUNCT, PERIOD, ELLIPSIS, DASH_EN, DASH_EM, DASH_LINE, HYPHEN
+ };
+
+ private static final LexemeType[] PUNCT_PERIOD = {
+ PUNCT, PERIOD
+ };
+
+ private static final LexemeType[] SPACE_DASH_ENDING = {
+ SPACE, DASH_EN, DASH_EM, DASH_LINE, HYPHEN, ENDING
+ };
+
+ private static final LexemeType[] SPACE_ENDING = {
+ SPACE, ENDING
+ };
+
+ private static final LexemeType[] SPACE_HYPHEN = {
+ SPACE, HYPHEN
+ };
+
+ private static final LexemeType[] SPACE_PUNCT = {
+ SPACE, PUNCT
+ };
+
+ private static final LexemeType[] SPACE_SOT = {
+ SPACE, SOT
+ };
+
+ /**
+ * Single quotes preceded by these {@link LexemeType}s may be opening quotes.
+ */
+ private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE =
+ new LexemeType[]{
+ LexemeType.SOT, SPACE,
+ DASH_EN, DASH_EM, DASH_LINE, HYPHEN,
+ QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP
+ };
+
+ /**
+ * Single quotes succeeded by these {@link LexemeType}s may be opening quotes.
+ */
+ private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE =
+ new LexemeType[]{
+ WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE
+ };
+
+ /**
+ * Single quotes preceded by these {@link LexemeType}s may be closing quotes.
+ */
+ private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE =
+ new LexemeType[]{
+ WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE
+ };
+
+ /**
+ * Single quotes succeeded by these {@link LexemeType}s may be closing quotes.
+ */
+ private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE =
+ new LexemeType[]{
+ SPACE,
+ DASH_EN, DASH_EM, DASH_LINE, HYPHEN,
+ PUNCT, PERIOD, ELLIPSIS, QUOTE_DOUBLE, CLOSING_GROUP, ENDING
+ };
+
+ /**
+ * Double quotes preceded by these {@link LexemeType}s may be opening quotes.
+ */
+ private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE =
+ new LexemeType[]{
+ LexemeType.SOT, SPACE,
+ DASH_EN, DASH_EM, DASH_LINE, HYPHEN,
+ EQUALS, OPENING_GROUP, EOL, EOP
+ };
+
+ /**
+ * Double quotes succeeded by these {@link LexemeType}s may be opening quotes.
+ */
+ private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE =
+ new LexemeType[]{
+ WORD, PUNCT, NUMBER,
+ DASH_EN, DASH_EM, DASH_LINE, HYPHEN,
+ ELLIPSIS, OPENING_GROUP,
+ QUOTE_SINGLE, QUOTE_SINGLE_OPENING, QUOTE_SINGLE_CLOSING,
+ QUOTE_DOUBLE
+ };
+
+ /**
+ * Double quotes preceded by these {@link LexemeType}s may be closing quotes.
+ */
+ private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE =
+ new LexemeType[]{
+ WORD, NUMBER, PERIOD, PUNCT,
+ DASH_EN, DASH_EM, DASH_LINE, HYPHEN,
+ ELLIPSIS, CLOSING_GROUP,
+ QUOTE_SINGLE, QUOTE_SINGLE_CLOSING, QUOTE_SINGLE_OPENING
+ };
+
+ /**
+ * Double quotes succeeded by these {@link LexemeType}s may be closing quotes.
+ */
+ private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE =
+ new LexemeType[]{
+ SPACE, PUNCT, PERIOD, EQUALS,
+ DASH_EN, DASH_EM, DASH_LINE, HYPHEN,
+ QUOTE_SINGLE, CLOSING_GROUP, ENDING
+ };
+
+ private final CircularFifoQueue<Lexeme> mQ = new CircularFifoQueue<>( 4 );
+ private final String mText;
+ private final Contractions mContractions;
+ private final Consumer<Token> mConsumer;
+
+ public QuoteEmitter(
+ final String text,
+ final Contractions contractions,
+ final Consumer<Token> consumer
+ ) {
+ assert text != null;
+ assert contractions != null;
+
+ mText = text;
+ mContractions = contractions;
+ mConsumer = consumer;
+ }
+
+ /**
+ * Scans the given text document for quotation marks and passes them to the
+ * given {@link Token} {@link Consumer}.
+ *
+ * @param text The prose to lex.
+ * @param contractions List of ambiguous and unambiguous contractions.
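+ * @param consumer Receives each {@link Token} emitted while scanning.
+ * @param filter Passed through to {@code Lexer.lex}; presumably controls
+ * which lexemes are filtered out (confirm against the lexer).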
+ */
+ public static void analyze(
+ final String text,
+ final Contractions contractions,
+ final Consumer<Token> consumer,
+ final LexerFilter filter
+ ) {
+ final var emitter = new QuoteEmitter( text, contractions, consumer );
+ Lexer.lex( text, emitter, filter );
+ }
+
+ /**
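+ * Queues the given lexeme and, once four lexemes are queued, parses them
+ * for quotation marks.
+ *
+ * @param lexeme The next lexeme received from the lexer.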
+ */
+ @Override
+ public void accept( final Lexeme lexeme ) {
+ mQ.add( lexeme );
+
+ if( mQ.size() == 4 ) {
+ parse();
+ }
+ }
+
+ private void parse() {
+ final var lex1 = mQ.get( 0 );
+ final var lex2 = mQ.get( 1 );
+ final var lex3 = mQ.get( 2 );
+ final var lex4 = mQ.get( 3 );
+
+ // <y'all>, <Ph.D.'ll>, <20's>, <she's>
+ if( match( WORD_PERIOD_NUMBER, QUOTE_SINGLE, WORD, ANY ) ) {
+ emit( QUOTE_APOSTROPHE, lex2 );
+ }
+ // <'n'>, <'N'>, <'owlin'>
+ else if(
+ match( ANY, QUOTE_SINGLE, WORD, QUOTE_SINGLE ) &&
+ mContractions.beganEndedUnambiguously( lex3.toString( mText ) )
+ ) {
+ emit( QUOTE_APOSTROPHE, lex2 );
+ emit( QUOTE_APOSTROPHE, lex4 );
+ mQ.set( Lexeme.NONE, 3 );
+ }
+ // <2''>
+ else if( match( NUMBER, QUOTE_SINGLE, QUOTE_SINGLE, ANY ) ) {
+ // Force double primes to conform to the same constructor usage. This
+ // simplifies the tokens and reduces some memory usage.
+ final var lex = new Lexeme( PRIME_DOUBLE, lex2.began(), lex3.ended() );
+
+ emit( QUOTE_PRIME_DOUBLE, lex );
+ mQ.set( Lexeme.NONE, 2 );
+ }
+ // <2'>
+ else if( match( NUMBER, QUOTE_SINGLE, ANY, ANY ) ) {
+ emit( QUOTE_PRIME_SINGLE, lex2 );
+ }
+ // <2">
+ else if( match( NUMBER, QUOTE_DOUBLE, ANY, ANY ) ) {
+ emit( QUOTE_PRIME_DOUBLE, lex2 );
+ }
+ // <thinkin'>
+ else if(
+ match( WORD, QUOTE_SINGLE, ANY, ANY ) &&
+ mContractions.endedUnambiguously( lex1.toString( mText ) )
+ ) {
+ emit( QUOTE_APOSTROPHE, lex2 );
+ }
+ // <'02>
+ else if( match( ANY, QUOTE_SINGLE, NUMBER, SPACE_PUNCT ) ) {
+ emit( QUOTE_APOSTROPHE, lex2 );
+ }
+ // <'20s>
+ else if(
+ match( ANY, QUOTE_SINGLE, NUMBER, WORD ) &&
+ "s".equalsIgnoreCase( lex4.toString( mText ) )
+ ) {
+ emit( QUOTE_APOSTROPHE, lex2 );
+ }
+ // <.'\n>
+ else if( match( PUNCT_PERIOD_ELLIPSIS_DASH, QUOTE_SINGLE, ENDING, ANY ) ) {
+ emit( QUOTE_CLOSING_SINGLE, lex2 );
+ }
+ // <\'>
+ else if( match( ESC_SINGLE, ANY, ANY, ANY ) ) {
+ emit( QUOTE_STRAIGHT_SINGLE, lex1 );
+ }
+ // <\">
+ else if( match( ESC_DOUBLE, ANY, ANY, ANY ) ) {
+ emit( QUOTE_STRAIGHT_DOUBLE, lex1 );
+
+ // <\"'--->
+ if( match( ESC_DOUBLE, QUOTE_SINGLE, SPACE_DASH_ENDING, ANY ) ) {
+ emit( QUOTE_CLOSING_SINGLE, lex2 );
+ }
+ }
+ // <---'" >
+ else if( match( DASHES, QUOTE_SINGLE, QUOTE_DOUBLE, SPACE_ENDING ) ) {
+ emit( QUOTE_CLOSING_SINGLE, lex2 );
+ }
+ // <o’-lantern>, <o' fellow>, <O'-the>
+ else if(
+ match( WORD, QUOTE_SINGLE, SPACE_HYPHEN, WORD ) &&
+ "o".equalsIgnoreCase( lex1.toString( mText ) )
+ ) {
+ emit( QUOTE_APOSTROPHE, lex2 );
+ }
+ // <"">, <"...>, <"word>, <---"word>
+ else if(
+ match(
+ LEADING_QUOTE_OPENING_DOUBLE, QUOTE_DOUBLE,
+ LAGGING_QUOTE_OPENING_DOUBLE, ANY
+ )
+ ) {
+ emit( QUOTE_OPENING_DOUBLE, lex2 );
+ }
+ // <..."'>, <word"'>, <?"'>, <word"?>
+ else if(
+ match(
+ LEADING_QUOTE_CLOSING_DOUBLE, QUOTE_DOUBLE,
+ LAGGING_QUOTE_CLOSING_DOUBLE, ANY
+ )
+ ) {
+ emit( QUOTE_CLOSING_DOUBLE, lex2 );
+ }
+ // < ''E>
+ else if( match( SPACE_SOT, QUOTE_SINGLE, QUOTE_SINGLE, WORD ) ) {
+ // Consume both immediately to avoid the false ambiguity <'e>.
+ emit( QUOTE_OPENING_SINGLE, lex2 );
+ emit( QUOTE_APOSTROPHE, lex3 );
+ mQ.set( Lexeme.NONE, 1 );
+ mQ.set( Lexeme.NONE, 2 );
+ }
+ // <'...>, <'word>, <---'word>, < 'nation>
+ else if(
+ match(
+ LEADING_QUOTE_OPENING_SINGLE, QUOTE_SINGLE,
+ LAGGING_QUOTE_OPENING_SINGLE, ANY )
+ ) {
+ final var word = lex3.toString( mText );
+
+ if( mContractions.beganAmbiguously( word ) ) {
+ emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
+ }
+ else if( mContractions.beganUnambiguously( word ) ) {
+ emit( QUOTE_APOSTROPHE, lex2 );
+ }
+ // <"'"nested>
+ else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, WORD ) ) {
+ emit( QUOTE_OPENING_SINGLE, lex2 );
+ }
+ // <"'" >
+ else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) {
+ emit( QUOTE_AMBIGUOUS_SINGLE, lex2 );
+ }
+ // < '" >
+ else if( match( ANY, QUOTE_SINGLE, LAGGING_QUOTE_OPENING_SINGLE, ANY ) ) {
+ emit( QUOTE_OPENING_SINGLE, lex2 );
+ }
+ // Ambiguous
+ else {
+ emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
+ }
+ }
+ // <"'--- >
+ else if( match( QUOTE_DOUBLE, QUOTE_SINGLE, DASHES, ANY ) ) {
+ emit( QUOTE_OPENING_SINGLE, lex2 );
+ }
+ // <word'">, <...'--->, <"' >
+ else if(
+ match(
+ LEADING_QUOTE_CLOSING_SINGLE, QUOTE_SINGLE,
+ LAGGING_QUOTE_CLOSING_SINGLE, ANY
+ )
+ ) {
+ final var word = lex1.toString( mText );
+
+ if( mContractions.endedAmbiguously( word ) ) {
+ emit( QUOTE_AMBIGUOUS_LAGGING, lex2 );
+ }
+ else {
+ emit( QUOTE_CLOSING_SINGLE, lex2 );
+ }
+ }
+ // <word';> (contraction inferred by previous matches)
+ else if( match( WORD, QUOTE_SINGLE, PUNCT_PERIOD, ANY ) ) {
+ emit( QUOTE_APOSTROPHE, lex2 );
+ }
+ // <---'">
+ else if( match( DASHES, QUOTE_SINGLE, QUOTE_DOUBLE, ANY ) ) {
+ emit( QUOTE_CLOSING_SINGLE, lex2 );
+ }
+ // <'42>, <'-3.14>
+ else if( match( ANY, QUOTE_SINGLE, NUMBER, ANY ) ) {
+ emit( QUOTE_OPENING_SINGLE, lex2 );
+ }
+ // <PRE-PARSED><'---.>
+ else if( match( LexemeType.NONE, QUOTE_SINGLE, ANY, ANY ) ) {
+ emit( QUOTE_CLOSING_SINGLE, lex2 );
+ }
+ // <''Cause >
+ else if( match( QUOTE_SINGLE, QUOTE_SINGLE, WORD, ANY ) ) {
+ final var word = lex3.toString( mText );
+
+ if( mContractions.beganAmbiguously( word ) ) {
+ emit( QUOTE_AMBIGUOUS_LEADING, lex2 );
+ }
+ else if( mContractions.beganUnambiguously( word ) ) {
+ emit( QUOTE_APOSTROPHE, lex2 );
+ }
+ else {
+ emit( QUOTE_AMBIGUOUS_SINGLE, lex2 );
+ }
+ }
+ // <'"Trouble>
+ else if( match( QUOTE_SINGLE, QUOTE_DOUBLE, WORD, ANY ) ) {
+ emit( QUOTE_OPENING_DOUBLE, lex2 );
+ }
+ // International quotation marks.
+ else if( match( ANY, QUOTE_DOUBLE_OPENING, ANY, ANY ) ) {
+ emit( QUOTE_OPENING_DOUBLE, lex2 );
+ }
+ else if( match( ANY, QUOTE_SINGLE_OPENING, ANY, ANY ) ) {
+ emit( QUOTE_OPENING_SINGLE, lex2 );
+ }
+ else if( match( ANY, QUOTE_DOUBLE_CLOSING, ANY, ANY ) ) {
+ emit( QUOTE_CLOSING_DOUBLE, lex2 );
+ }
+ else if( match( ANY, QUOTE_SINGLE_CLOSING, ANY, ANY ) ) {
+ emit( QUOTE_CLOSING_SINGLE, lex2 );
+ }
+ // Ambiguous (no match)
+ else if( match( ANY, QUOTE_SINGLE, ANY, ANY ) ) {
+ emit( QUOTE_AMBIGUOUS_SINGLE, lex2 );
+ }
+ else if( match( ANY, QUOTE_DOUBLE, ANY, ANY ) ) {
+ emit( QUOTE_AMBIGUOUS_DOUBLE, lex2 );
+ }
+ else if( match( ANY, ELLIPSIS, ANY, ANY ) ) {
+ emit( QUOTE_ELLIPSIS, lex2 );
+ }
+ else if( match( ANY, DASH_EN, ANY, ANY ) ) {
+ emit( QUOTE_DASH_EN, lex2 );
+ }
+ else if( match( ANY, DASH_EM, ANY, ANY ) ) {
+ emit( QUOTE_DASH_EM, lex2 );
+ }
+ }
+
+ private void emit( final TokenType tokenType, final Lexeme lexeme ) {
+ mConsumer.accept( new Token( tokenType, lexeme ) );
+ }
+
+ private boolean match(
+ final LexemeType l1,
+ final LexemeType l2,
+ final LexemeType l3,
+ final LexemeType l4 ) {
+ return mQ.get( 0 ).isType( l1 ) &&
+ mQ.get( 1 ).isType( l2 ) &&
+ mQ.get( 2 ).isType( l3 ) &&
+ mQ.get( 3 ).isType( l4 );
+ }
+
+ private boolean match(
+ final LexemeType[] l1,
+ final LexemeType l2,
+ final LexemeType l3,
+ final LexemeType l4 ) {
+ return mQ.get( 0 ).isType( l1 ) &&
+ mQ.get( 1 ).isType( l2 ) &&
+ mQ.get( 2 ).isType( l3 ) &&
+ mQ.get( 3 ).isType( l4 );
+ }
+
+ private boolean match(
+ final LexemeType l1,
+ final LexemeType l2,
+ final LexemeType[] l3,
+ final LexemeType l4 ) {
+ return mQ.get( 0 ).isType( l1 ) &&
+ mQ.get( 1 ).isType( l2 ) &&
+ mQ.get( 2 ).isType( l3 ) &&
+ mQ.get( 3 ).isType( l4 );
+ }
+
+ private boolean match(
+ final LexemeType l1,
+ final LexemeType l2,
+ final LexemeType l3,
+ final LexemeType[] l4 ) {
+ return mQ.get( 0 ).isType( l1 ) &&
+ mQ.get( 1 ).isType( l2 ) &&
+ mQ.get( 2 ).isType( l3 ) &&
+ mQ.get( 3 ).isType( l4 );
+ }
+
+ private boolean match(
+ final LexemeType[] l1,
+ final LexemeType l2,
+ final LexemeType[] l3,
+ final LexemeType l4 ) {
+ return mQ.get( 0 ).isType( l1 ) &&
+ mQ.get( 1 ).isType( l2 ) &&
+ mQ.get( 2 ).isType( l3 ) &&
+ mQ.get( 3 ).isType( l4 );
+ }
+
+ private boolean match(
+ final LexemeType[] l1,
+ final LexemeType l2,
+ final LexemeType l3,
+ final LexemeType[] l4 ) {
return mQ.get( 0 ).isType( l1 ) &&
mQ.get( 1 ).isType( l2 ) &&
src/main/java/com/whitemagicsoftware/keenquotes/parser/Token.java
ENTITIES.put( QUOTE_PRIME_QUADRUPLE, "&qprime;" );
ENTITIES.put( QUOTE_ELLIPSIS, "&hellip;" );
+ ENTITIES.put( QUOTE_DASH_EN, "&ndash;" );
+ ENTITIES.put( QUOTE_DASH_EM, "&mdash;" );
+ ENTITIES.put( QUOTE_DASH_MINUS, "&minus;" );
}
CHARS.put( QUOTE_PRIME_QUADRUPLE, "⁗" );
CHARS.put( QUOTE_ELLIPSIS, "…" );
+ CHARS.put( QUOTE_DASH_MINUS, "−" );
+ CHARS.put( QUOTE_DASH_EN, "–" );
+ CHARS.put( QUOTE_DASH_EM, "—" );
}
src/main/java/com/whitemagicsoftware/keenquotes/parser/TokenType.java
QUOTE_AMBIGUOUS_SINGLE( "single-ambiguous" ),
QUOTE_ELLIPSIS,
+ QUOTE_DASH_MINUS,
+ QUOTE_DASH_EN,
+ QUOTE_DASH_EM,
NONE;
src/main/java/com/whitemagicsoftware/keenquotes/util/FastCharacterIterator.java
*/
public final class FastCharacterIterator {
-
private final String mS;
private final int mLen;
src/test/java/com/whitemagicsoftware/keenquotes/lex/LexerTest.java
@Test
void test_Lexing_PunctuationMarks_EmitPunctuationMarks() {
+ testType( "-", HYPHEN );
+ testType( "--", DASH_EN );
+ testType( "–", DASH_EN );
+ testType( "---", DASH_EM );
+ testType( "--- -- -", DASH_EM, SPACE, DASH_EN, SPACE, HYPHEN );
+ testType( "—", DASH_EM );
+ testType( "----", DASH_LINE );
+ testType( "-----", DASH_LINE );
+ testType( "------", DASH_LINE );
+ testType( "—-—", DASH_EM, HYPHEN, DASH_EM );
testType( "!", PUNCT );
testType( ";", PUNCT );
testType( "\\", PUNCT );
testType( ".", PERIOD );
- testType( "-", HYPHEN );
- testType( "--", DASH );
- testType( "---", DASH );
- testType( "–", DASH );
- testType( "―", DASH );
- testType( "—", DASH );
- testType( "—-—", DASH );
testType( "...", ELLIPSIS );
testType( ". .", ELLIPSIS );
var counter = new AtomicInteger();
- Lexer.lex( text, lexeme -> {
- // Ignore the SOT and EOT lexemes (avoids duplication). Each test will
- // include EOL, EOP, and EOT tokens due to the lexer's algorithm.
- if( !lexeme.isType( SOT, EOT ) && counter.get() < elements.size() ) {
- final var expected = elements.get( counter.getAndIncrement() );
- final var actual = f.apply( lexeme, text );
+ Lexer.lex(
+ text, lexeme -> {
+ // Ignore the SOT and EOT lexemes (avoids duplication). Each test will
+ // include EOL, EOP, and EOT tokens due to the lexer's algorithm.
+ if( !lexeme.isType( SOT, EOT ) && counter.get() < elements.size() ) {
+ final var expected = elements.get( counter.getAndIncrement() );
+ final var actual = f.apply( lexeme, text );
- assertEquals( expected, actual );
- }
- }, filter );
+ assertEquals( expected, actual );
+ }
+ }, filter
+ );
// Ensure all expected values are matched (verify end of text reached).
src/test/resources/com/whitemagicsoftware/keenquotes/texts/unambiguous-1-pass.txt
With--"air quotes"--and dashes.
-With--&ldquo;air quotes&rdquo;--and dashes.
+With&ndash;&ldquo;air quotes&rdquo;&ndash;and dashes.
"Not all open quotes are closed...
&ldquo;Not all open quotes are closed&hellip;
"---retroactively!"
-&ldquo;---retroactively!&rdquo;
+&ldquo;&mdash;retroactively!&rdquo;
"And this" and "and this" and "and this" and "and another."
&ldquo;And this&rdquo; and &ldquo;and this&rdquo; and &ldquo;and this&rdquo; and &ldquo;and another.&rdquo;
"Will'll invite?" Mrs. Thorne asked;\n"because he's coming--he's at the gate."
-&ldquo;Will&apos;ll invite?&rdquo; Mrs. Thorne asked;\n&ldquo;because he&apos;s coming--he&apos;s at the gate.&rdquo;
+&ldquo;Will&apos;ll invite?&rdquo; Mrs. Thorne asked;\n&ldquo;because he&apos;s coming&ndash;he&apos;s at the gate.&rdquo;
Model "T2000"
'...even better!'
&lsquo;&hellip;even better!&rsquo;
+
+With-'imaginary'-dashes.
+With-&lsquo;imaginary&rsquo;-dashes.
With--'imaginary'--dashes.
-With--&lsquo;imaginary&rsquo;--dashes.
+With&ndash;&lsquo;imaginary&rsquo;&ndash;dashes.
+
+With---'imaginary'---dashes.
+With&mdash;&lsquo;imaginary&rsquo;&mdash;dashes.
'It's a beautiful day!'
"'I'm in danger. Help? Take me to'--an address on Van Ness Avenue.
-&ldquo;&lsquo;I&apos;m in danger. Help? Take me to&rsquo;--an address on Van Ness Avenue.
+&ldquo;&lsquo;I&apos;m in danger. Help? Take me to&rsquo;&ndash;an address on Van Ness Avenue.
"Ah," she said, "you knew! He told you--and you said 'my dear'! How could you?!"
-&ldquo;Ah,&rdquo; she said, &ldquo;you knew! He told you--and you said &lsquo;my dear&rsquo;! How could you?!&rdquo;
+&ldquo;Ah,&rdquo; she said, &ldquo;you knew! He told you&ndash;and you said &lsquo;my dear&rsquo;! How could you?!&rdquo;
shouldn't drop letters, nor say "ain't" or "hain't."
shouldn&apos;t drop letters, nor say &ldquo;ain&apos;t&rdquo; or &ldquo;hain&apos;t.&rdquo;
"’Kearney lives on the banks of Killarney—’
-&ldquo;&rsquo;Kearney lives on the banks of Killarney—&rsquo;
+&ldquo;&rsquo;Kearney lives on the banks of Killarney&mdash;&rsquo;
# ########################################################################
"She said, 'Llamas'll languish, they'll--
-&ldquo;She said, &lsquo;Llamas&apos;ll languish, they&apos;ll--
+&ldquo;She said, &lsquo;Llamas&apos;ll languish, they&apos;ll&ndash;
'The 70s are my favorite numbers,' she said.
Computer says, "'It is mysteries---'"
-Computer says, &ldquo;&lsquo;It is mysteries---&rsquo;&rdquo;
+Computer says, &ldquo;&lsquo;It is mysteries&mdash;&rsquo;&rdquo;
src/test/resources/com/whitemagicsoftware/keenquotes/texts/unambiguous-2-pass.txt
"Jane said, ''E'll be spooky, Sam's son with the jack-o'-lantern!'" said the O'Mally twins'---y'know---ghosts in unison.
-&ldquo;Jane said, &lsquo;&apos;E&apos;ll be spooky, Sam&apos;s son with the jack-o&apos;-lantern!&rsquo;&rdquo; said the O&apos;Mally twins&apos;---y&apos;know---ghosts in unison.
+&ldquo;Jane said, &lsquo;&apos;E&apos;ll be spooky, Sam&apos;s son with the jack-o&apos;-lantern!&rsquo;&rdquo; said the O&apos;Mally twins&apos;&mdash;y&apos;know&mdash;ghosts in unison.
''Sup, Doc?'
"Not much o' fellow-creaturs, I think, Miss; all I know—my old master, as war a knowin' man, used to say, says he, 'If e'er I sow my wheat wi'out brinin', I'm a Dutchman,' says he; an' that war as much as to say as a Dutchman war a fool, or next door. Nay, nay, I aren't goin' to bother mysen about Dutchmen. There's fools enoo, an' rogues enoo, wi'out lookin' i' books for 'em."
-&ldquo;Not much o&apos; fellow-creaturs, I think, Miss; all I know—my old master, as war a knowin&apos; man, used to say, says he, &lsquo;If e&apos;er I sow my wheat wi&apos;out brinin&apos;, I&apos;m a Dutchman,&rsquo; says he; an&apos; that war as much as to say as a Dutchman war a fool, or next door. Nay, nay, I aren&apos;t goin&apos; to bother mysen about Dutchmen. There&apos;s fools enoo, an&apos; rogues enoo, wi&apos;out lookin&apos; i&apos; books for &apos;em.&rdquo;
+&ldquo;Not much o&apos; fellow-creaturs, I think, Miss; all I know&mdash;my old master, as war a knowin&apos; man, used to say, says he, &lsquo;If e&apos;er I sow my wheat wi&apos;out brinin&apos;, I&apos;m a Dutchman,&rsquo; says he; an&apos; that war as much as to say as a Dutchman war a fool, or next door. Nay, nay, I aren&apos;t goin&apos; to bother mysen about Dutchmen. There&apos;s fools enoo, an&apos; rogues enoo, wi&apos;out lookin&apos; i&apos; books for &apos;em.&rdquo;
I leave proofs to those who 've seen 'em;\n\nBut this I heard her say, and can't be wrong\n\nAnd all may think which way their judgments lean 'em,\n\n''T is strange---the Hebrew noun which means "I am,"\n\nThe English always use to govern d--n.'
-I leave proofs to those who &apos;ve seen &apos;em;\n\nBut this I heard her say, and can&apos;t be wrong\n\nAnd all may think which way their judgments lean &apos;em,\n\n&lsquo;&apos;T is strange---the Hebrew noun which means &ldquo;I am,&rdquo;\n\nThe English always use to govern d--n.&rsquo;
+I leave proofs to those who &apos;ve seen &apos;em;\n\nBut this I heard her say, and can&apos;t be wrong\n\nAnd all may think which way their judgments lean &apos;em,\n\n&lsquo;&apos;T is strange&mdash;the Hebrew noun which means &ldquo;I am,&rdquo;\n\nThe English always use to govern d&ndash;n.&rsquo;
''E's got a 'ittle box 'n a big 'un,' she said, 'wit' th' 'ittle 'un 'bout 2'×6". An' no, y'ain't cryin' on th' "soap box" to me no mo, y'hear. 'Cause it 'tweren't ever a spec o' fun!' I says to my frien'.
"You 'cause---" all fifteen years' worth.
-&ldquo;You &apos;cause---&rdquo; all fifteen years&apos; worth.
+&ldquo;You &apos;cause&mdash;&rdquo; all fifteen years&apos; worth.
"'---has a prison.'"
-&ldquo;&lsquo;---has a prison.&rsquo;&rdquo;
+&ldquo;&lsquo;&mdash;has a prison.&rsquo;&rdquo;
"Teach 'em t'be more 'an we ever been!"
src/test/resources/com/whitemagicsoftware/keenquotes/texts/xml.txt
<bold>&apos;twas</bold> redeemed for the <em>cat</em>&apos;s eye
-<a href="https://x.org" title="X's Homepage">X11's bomb</a>
-<a href="https://x.org" title="X's Homepage">X11&apos;s bomb</a>
+<a href="https://dj.ca" title="X's Homepage">X11's bomb</a>
+<a href="https://dj.ca" title="X's Homepage">X11&apos;s bomb</a>
+
+<a href="https://d--j.ca">D--J</a>
+<a href="https://d--j.ca">D&ndash;J</a>
<body><p>Test lungs' capacity; shovel's handle hit his head's electronics.</p><p>Hoped to bring him 'round.</p></body>
Delta: 539 lines added, 471 lines removed, 68-line increase