Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Add comments, rename methods, emit ambiguities

Author: Dave Jarvis <email>
Date: 2021-06-15 18:55:05 GMT-0700
Commit: 3d0f24912f298335bd55f42ca7302f2078034299
Parent: 9d7091e
lib/src/main/java/com/keenwrite/quotes/KeenQuotes.java
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.keenwrite.quotes;
+
+import java.util.ArrayList;
+import java.util.Map;
+import java.util.function.Consumer;
+
+import static com.keenwrite.quotes.TokenType.*;
+import static java.util.Collections.sort;
+
+/**
+ * Responsible for replacing {@link Token} instances with equivalent smart
+ * quotes (or straight quotes). This will inform the caller when ambiguous
+ * quotes cannot be reliably resolved.
+ */
+public final class KeenQuotes {
+ private static final Map<TokenType, String> REPLACEMENTS = Map.of(
+ QUOTE_OPENING_SINGLE, "&lsquo;",
+ QUOTE_CLOSING_SINGLE, "&rsquo;",
+ QUOTE_OPENING_DOUBLE, "&ldquo;",
+ QUOTE_CLOSING_DOUBLE, "&rdquo;",
+ QUOTE_STRAIGHT_SINGLE, "'",
+ QUOTE_STRAIGHT_DOUBLE, "\"",
+ QUOTE_APOSTROPHE, "&apos;",
+ QUOTE_PRIME_SINGLE, "&prime;",
+ QUOTE_PRIME_DOUBLE, "&Prime;"
+ );
+
+ /**
+ * Converts straight quotes to curly quotes and primes. Any quotation marks
+ * that cannot be converted are passed to the {@link Consumer}.
+ *
+ * @param text The text to parse.
+ * @param unresolved Recipient for ambiguous {@link Lexeme}s.
+ * @return The given text string with as many straight quotes converted to
+ * curly quotes as is feasible.
+ */
+ public static String convert(
+ final String text, final Consumer<Lexeme> unresolved ) {
+ final var parser = new Parser( text );
+ final var tokens = new ArrayList<Token>();
+
+ // Parse the tokens and consume all unresolved lexemes.
+ parser.parse( tokens::add, unresolved );
+
+ // The parser may emit tokens in any order.
+ sort( tokens );
+
+ final var result = new StringBuilder( text.length() );
+ var position = 0;
+
+ for( final var token : tokens ) {
+ if( position <= token.began() ) {
+ result.append( text, position, token.began() );
+ result.append( REPLACEMENTS.get( token.getType() ) );
+ }
+
+ position = token.ended();
+ }
+
+ return result.append( text.substring( position ) ).toString();
+ }
+}
lib/src/main/java/com/keenwrite/quotes/Lexeme.java
* a similar approach to run-length encoding).
*/
-public class Lexeme implements Comparable<Lexeme> {
+public final class Lexeme implements Comparable<Lexeme> {
+ /**
+ * Signifies an invalid index to help distinguish EOT/SOT.
+ */
+ private static final int E_INDEX = -2;
+
/**
* Denotes there are no more lexemes: the end of text (EOT) has been reached.
* The beginning index differentiates between EOT and SOT.
*/
- public static final Lexeme EOT = new Lexeme( FLAG, -1, -2 );
+ public static final Lexeme EOT = new Lexeme( FLAG, -1, E_INDEX );
/**
* Denotes parsing at the start of text (SOT). This is useful to avoid
* branching conditions while iterating. The beginning index differentiates
* between EOT and SOT.
*/
- public static final Lexeme SOT = new Lexeme( FLAG, 0, -2 );
+ public static final Lexeme SOT = new Lexeme( FLAG, 0, E_INDEX );
private final LexemeType mType;
private final int mBegan;
private final int mEnded;
/**
* Create a lexeme that represents a section of the text.
+ *
+ * @param type Type of {@link Lexeme} to create.
+ * @param began Offset into the text where this instance starts (0-based).
+ * @param ended Offset into the text where this instance stops (0-based).
*/
private Lexeme( final LexemeType type, final int began, final int ended ) {
assert type != null;
- assert began >= 0 || ended == -2;
- assert ended >= began || ended == -2;
+ assert began >= 0 || ended == E_INDEX;
+ assert ended >= began || ended == E_INDEX;
mType = type;
*/
public String toString( final String text ) {
+ assert text != null;
return text.substring( mBegan, mEnded );
}
+ /**
+ * Answers whether the given {@link LexemeType} is the same as this
+ * instance's internal {@link LexemeType}.
+ *
+ * @param type The {@link LexemeType} to compare.
+ * @return {@code true} if the given {@link LexemeType} is equal to the
+ * internal {@link LexemeType}.
+ */
public boolean isType( final LexemeType type ) {
+ assert type != null;
return mType == type;
}
+ /**
+ * Answers whether any of the given {@link LexemeType} matches this
+ * instance's internal {@link LexemeType}.
+ *
+ * @param types The {@link LexemeType}s to compare.
+ * @return {@code true} if the internal {@link LexemeType} matches any one
+ * of the given {@link LexemeType}s.
+ */
public boolean anyType( final LexemeType... types ) {
+ assert types != null;
+
for( final var type : types ) {
if( mType == type ) {
return true;
}
}
return false;
- }
-
- LexemeType getType() {
- return mType;
}
- int began() {
+ public int began() {
return mBegan;
}
- int ended() {
+ public int ended() {
return mEnded;
+ }
+
+ boolean isSot() {
+ return mBegan == 0;
}
boolean isEot() {
return mBegan == -1;
}
- boolean isSot() {
- return mBegan == 0;
+ LexemeType getType() {
+ return mType;
}
+ /**
+ * Compares the starting offset of the given {@link Lexeme} to the starting
+ * offset of this {@link Lexeme} instance. This allows a list of
+ * {@link Lexeme}s to be sorted by order of appearance in the parsed text.
+ */
@Override
public int compareTo( final Lexeme that ) {
+ assert that != null;
return this.mBegan - that.mBegan;
- }
-
- @Override
- public String toString() {
- return getClass().getSimpleName() + "{" +
- "mType=" + mType +
- ", mBegan=" + mBegan +
- ", mEnded=" + mEnded +
- '}';
}
static Lexeme createLexeme(
final LexemeType lexeme, final int began, final int ended ) {
+ assert lexeme != null;
return new Lexeme( lexeme, began, ended );
}
lib/src/main/java/com/keenwrite/quotes/LexemeType.java
package com.keenwrite.quotes;
+/**
+ * Represents the type of a {@link Lexeme} parsed by the {@link Lexer}.
+ */
public enum LexemeType {
QUOTE_SINGLE,
lib/src/main/java/com/keenwrite/quotes/Parser.java
* </ol>
*/
-public class Parser {
- /**
- * Single quotes preceded by these {@link LexemeType}s may be opening quotes.
- */
- private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE =
- new LexemeType[]{SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOP};
-
- /**
- * Single quotes succeeded by these {@link LexemeType}s may be opening quotes.
- */
- private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE =
- new LexemeType[]{WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE};
-
- /**
- * Single quotes preceded by these {@link LexemeType}s may be closing quotes.
- */
- private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE =
- new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE};
-
- /**
- * Single quotes succeeded by these {@link LexemeType}s may be closing quotes.
- */
- private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE =
- new LexemeType[]{SPACE, DASH, QUOTE_DOUBLE, CLOSING_GROUP, EOL};
-
- /**
- * Double quotes preceded by these {@link LexemeType}s may be opening quotes.
- */
- private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE =
- new LexemeType[]{SPACE, DASH, QUOTE_SINGLE, OPENING_GROUP, EOP};
-
- /**
- * Double quotes succeeded by these {@link LexemeType}s may be opening quotes.
- */
- private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE =
- new LexemeType[]{WORD, NUMBER, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE};
-
- /**
- * Double quotes preceded by these {@link LexemeType}s may be closing quotes.
- */
- private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE =
- new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_SINGLE};
-
- /**
- * Double quotes succeeded by these {@link LexemeType}s may be closing quotes.
- */
- private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE =
- new LexemeType[]{SPACE, DASH, QUOTE_SINGLE, CLOSING_GROUP, EOL};
-
- /**
- * The text to parse. A reference is required as a minor optimization in
- * memory and speed: the lexer records integer offsets, rather than new
- * {@link String} instances, to track parsed lexemes.
- */
- private final String mText;
-
- /**
- * Converts a string into an iterable list of {@link Lexeme} instances.
- */
- private final Lexer mLexer;
-
- /**
- * Sets of contractions that help disambiguate single quotes in the text.
- * These are effectively immutable while parsing.
- */
- private final Contractions sContractions;
-
- /**
- * Incremented for each opening single quote emitted. Used to help resolve
- * ambiguities when single quote marks are balanced.
- */
- private int mOpeningSingleQuote;
-
- /**
- * Incremented for each closing single quote emitted. Used to help resolve
- * ambiguities when single quote marks are balanced.
- */
- private int mClosingSingleQuote;
-
- /**
- * Constructs a new {@link Parser} using the default contraction sets
- * to help resolve some ambiguous scenarios.
- *
- * @param text The prose to parse, containing zero or more quotation
- * characters.
- */
- public Parser( final String text ) {
- this( text, new Contractions.Builder().build() );
- }
-
- /**
- * Constructs a new {@link Parser} using the default contraction sets
- * to help resolve some ambiguous scenarios.
- *
- * @param text The prose to parse, containing zero or more quotation
- * characters.
- * @param contractions Custom sets of contractions to help resolve
- * ambiguities.
- */
- public Parser( final String text, final Contractions contractions ) {
- mText = text;
- mLexer = new Lexer( mText );
- sContractions = contractions;
- }
-
- /**
- * Iterates over the entire text provided at construction, emitting
- * {@link Token}s that can be used to convert straight quotes to curly
- * quotes.
- *
- * @param consumer Receives emitted {@link Token}s.
- * @return The list of lexemes that could not be resolved.
- */
- public List<Lexeme> parse( final Consumer<Token> consumer ) {
- final var lexemes = new CircularFifoQueue<Lexeme>( 3 );
-
- // Allow consuming the very first token without checking the queue size.
- flush( lexemes );
-
- final var unresolved = new ArrayList<Lexeme[]>();
- Lexeme lexeme;
-
- // Create and convert a list of all unambiguous quote characters.
- while( (lexeme = mLexer.next()) != EOT ) {
- tokenize( lexeme, lexemes, consumer, unresolved );
- }
-
- // By loop's end, the lexemes list contains tokens for all except the
- // final two elements (from tokenizing in triplets). Tokenize the remaining
- // unprocessed lexemes.
- tokenize( EOT, lexemes, consumer, unresolved );
- tokenize( EOT, lexemes, consumer, unresolved );
-
- // Attempt to resolve any remaining unambiguous quotes.
- resolve( unresolved, consumer );
-
- final var result = new ArrayList<Lexeme>( unresolved.size() );
-
- unresolved.forEach( ( lex ) -> result.add( lex[ 1 ] ) );
-
- return result;
- }
-
- private void tokenize( final Lexeme lexeme,
- final CircularFifoQueue<Lexeme> lexemes,
- final Consumer<Token> consumer,
- final List<Lexeme[]> unresolved ) {
- // Add the next lexeme to tokenize into the queue for immediate processing.
- lexemes.add( lexeme );
-
- final var lex1 = lexemes.get( 0 );
- final var lex2 = lexemes.get( 1 );
- final var lex3 = lexemes.get( 2 );
-
- if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( WORD ) &&
- lex1.anyType( WORD, PERIOD, NUMBER ) ) {
- // Examples: y'all, Ph.D.'ll, 20's, she's
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
- flush( lexemes );
- }
- else if( lex1.isType( QUOTE_SINGLE ) && lex3.isType( QUOTE_SINGLE ) &&
- "n".equalsIgnoreCase( lex2.toString( mText ) ) ) {
- // I.e., 'n'
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
- flush( lexemes );
- resolved( unresolved );
- }
- else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) {
- if( lex3.isType( QUOTE_SINGLE ) ) {
- // E.g., 2''
- consumer.accept(
- new Token( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ) );
- flush( lexemes );
- }
- else {
- // E.g., 2'
- consumer.accept( new Token( QUOTE_PRIME_SINGLE, lex2 ) );
- }
- }
- else if( lex2.isType( QUOTE_DOUBLE ) && lex1.isType( NUMBER ) ) {
- // E.g., 2"
- consumer.accept( new Token( QUOTE_PRIME_DOUBLE, lex2 ) );
- }
- else if( lex2.isType( WORD ) && lex3.isType( QUOTE_SINGLE ) &&
- sContractions.endedUnambiguously( lex2.toString( mText ) ) ) {
- // E.g., thinkin'
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
- flush( lexemes );
- }
- else if( lex2.isType( NUMBER ) && lex1.isType( QUOTE_SINGLE ) ) {
- if( lex3.anyType( SPACE, PUNCT ) || (lex3.isType( WORD ) &&
- lex3.toString( mText ).equalsIgnoreCase( "s" )) ) {
- // Sentences must re-written to avoid starting with numerals.
- // Examples: '20s, '02
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
- }
- else {
- // E.g., '2''
- consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex1 ) );
- mOpeningSingleQuote++;
- }
-
- resolved( unresolved );
- }
- else if( lex2.isType( QUOTE_SINGLE ) &&
- lex1.anyType( PUNCT, PERIOD, ELLIPSIS, DASH ) &&
- (lex3.anyType( EOL, EOP ) || lex3.isEot()) ) {
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
- mClosingSingleQuote++;
- }
- else if( lex1.isType( ESC_SINGLE ) ) {
- // E.g., \'
- consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) );
- }
- else if( lex1.isType( ESC_DOUBLE ) ) {
- // E.g., \"
- consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) );
-
- if( lex2.isType( QUOTE_SINGLE ) &&
- (lex3.isEot() || lex3.anyType( SPACE, DASH, EOL, EOP )) ) {
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
- mClosingSingleQuote++;
- }
- }
- else if( lex2.isType( QUOTE_DOUBLE ) &&
- (lex1.isSot() || lex1.anyType( LEADING_QUOTE_OPENING_DOUBLE )) &&
- lex3.anyType( LAGGING_QUOTE_OPENING_DOUBLE ) ) {
- // Examples: "", "..., "word, ---"word
- consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) );
- }
- else if( lex2.isType( QUOTE_DOUBLE ) &&
- lex1.anyType( LEADING_QUOTE_CLOSING_DOUBLE ) &&
- (lex3.isEot() || lex3.anyType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) {
- // E.g., ..."', word"', ?"'
- consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) );
- }
- else if( lex1.isType( QUOTE_SINGLE ) &&
- lex2.anyType( PUNCT, PERIOD, DASH ) && lex3.isType( QUOTE_DOUBLE ) ) {
- // E.g., '," (contraction ruled out from previous conditionals)
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex1 ) );
- resolved( unresolved );
- mClosingSingleQuote++;
- }
- else if( lex2.anyType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
- // After tokenizing, the parser will attempt to resolve ambiguities.
- unresolved.add( new Lexeme[]{lex1, lex2, lex3} );
- }
- }
-
- private void resolve(
- final List<Lexeme[]> unresolved, final Consumer<Token> consumer ) {
- // Some non-emitted tokenized lexemes may be ambiguous.
- final var ambiguousLeadingQuotes = new ArrayList<Lexeme[]>( 16 );
- final var ambiguousLaggingQuotes = new ArrayList<Lexeme[]>( 16 );
- var resolvedLeadingQuotes = 0;
- var resolvedLaggingQuotes = 0;
-
- // Count the number of ambiguous and non-ambiguous open single quotes.
- for( var i = unresolved.iterator(); i.hasNext(); ) {
- final var quotes = i.next();
- final var lex1 = quotes[ 0 ];
- final var lex2 = quotes[ 1 ];
- final var lex3 = quotes[ 2 ];
-
- if( lex2.isType( QUOTE_SINGLE ) ) {
- final var word1 = lex1 == SOT ? "" : lex1.toString( mText );
- final var word3 = lex3 == EOT ? "" : lex3.toString( mText );
-
- if( sContractions.beganAmbiguously( word3 ) ) {
- // E.g., 'Cause
- if( lex1.isType( QUOTE_SINGLE ) ) {
- // E.g., ''Cause
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
- i.remove();
- }
- else {
- // The contraction is uncertain until a closing quote is found that
- // may balance this single quote.
- ambiguousLeadingQuotes.add( quotes );
- }
- }
- else if( sContractions.beganUnambiguously( word3 ) ) {
- // The quote mark forms a word that does not stand alone from its
- // contraction. For example, twas is not a word: it's 'twas.
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
- i.remove();
- }
- else if( sContractions.endedAmbiguously( word1 ) ) {
- ambiguousLaggingQuotes.add( quotes );
- }
- else if( (lex1.isSot() || lex1.anyType( LEADING_QUOTE_OPENING_SINGLE ))
- && lex3.anyType( LAGGING_QUOTE_OPENING_SINGLE ) ) {
- consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) );
- resolvedLeadingQuotes++;
- mOpeningSingleQuote++;
- i.remove();
- }
- else if( lex1.anyType( LEADING_QUOTE_CLOSING_SINGLE ) &&
- (lex3.isEot() || lex3.anyType( LAGGING_QUOTE_CLOSING_SINGLE )) ) {
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
- resolvedLaggingQuotes++;
- mClosingSingleQuote++;
- i.remove();
- }
- else if( lex3.isType( NUMBER ) ) {
- // E.g., '04
- ambiguousLeadingQuotes.add( quotes );
- }
- }
- }
-
- final var ambiguousLeadingCount = ambiguousLeadingQuotes.size();
- final var ambiguousLaggingCount = ambiguousLaggingQuotes.size();
-
- if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) {
- if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) {
- final var balanced = mClosingSingleQuote - mOpeningSingleQuote == 0;
- final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE;
- final var lex = ambiguousLaggingQuotes.get( 0 );
- consumer.accept( new Token( quote, lex[ 1 ] ) );
- unresolved.remove( lex );
- }
- else if( ambiguousLeadingCount == 0 && unresolved.size() == 1 ) {
- // Must be a closing quote.
- final var closing = unresolved.get( 0 );
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
- unresolved.remove( closing );
- }
- }
- else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) {
- // If there are no ambiguous leading quotes then all ambiguous lagging
- // quotes must be contractions.
- ambiguousLaggingQuotes.forEach(
- lex -> {
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex[ 1 ] ) );
- unresolved.remove( lex );
- }
- );
- }
- else if( ambiguousLeadingCount == 0 ) {
- if( resolvedLaggingQuotes < resolvedLeadingQuotes ) {
- for( final var i = unresolved.iterator(); i.hasNext(); ) {
- final var closing = i.next()[ 1 ];
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing ) );
- i.remove();
- }
- }
- else if( mOpeningSingleQuote - mClosingSingleQuote == unresolved.size() ) {
- for( final var i = unresolved.iterator(); i.hasNext(); ) {
- final var closing = i.next();
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
- i.remove();
- }
- }
- else if( unresolved.size() == 2 ) {
- final var closing = unresolved.get( 0 );
- final var opening = unresolved.get( 1 );
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
- consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
-
- unresolved.clear();
- }
- }
- else if( ambiguousLeadingCount == 1 && resolvedLaggingQuotes == 1 ) {
- final var opening = ambiguousLeadingQuotes.get( 0 );
- consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
- unresolved.remove( opening );
- }
- }
-
- /**
- * Remove the last {@link Lexeme}s from the given list.
- *
- * @param unresolved The list of {@link Lexeme}s to modify.
- */
- private void resolved( final List<Lexeme[]> unresolved ) {
+public final class Parser {
+ /**
+ * Single quotes preceded by these {@link LexemeType}s may be opening quotes.
+ */
+ private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE =
+ new LexemeType[]{SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOP};
+
+ /**
+ * Single quotes succeeded by these {@link LexemeType}s may be opening quotes.
+ */
+ private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE =
+ new LexemeType[]{WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE};
+
+ /**
+ * Single quotes preceded by these {@link LexemeType}s may be closing quotes.
+ */
+ private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE =
+ new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE};
+
+ /**
+ * Single quotes succeeded by these {@link LexemeType}s may be closing quotes.
+ */
+ private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE =
+ new LexemeType[]{SPACE, DASH, QUOTE_DOUBLE, CLOSING_GROUP, EOL};
+
+ /**
+ * Double quotes preceded by these {@link LexemeType}s may be opening quotes.
+ */
+ private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE =
+ new LexemeType[]{SPACE, DASH, QUOTE_SINGLE, OPENING_GROUP, EOP};
+
+ /**
+ * Double quotes succeeded by these {@link LexemeType}s may be opening quotes.
+ */
+ private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE =
+ new LexemeType[]{WORD, NUMBER, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE};
+
+ /**
+ * Double quotes preceded by these {@link LexemeType}s may be closing quotes.
+ */
+ private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE =
+ new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_SINGLE};
+
+ /**
+ * Double quotes succeeded by these {@link LexemeType}s may be closing quotes.
+ */
+ private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE =
+ new LexemeType[]{SPACE, DASH, QUOTE_SINGLE, CLOSING_GROUP, EOL};
+
+ /**
+ * The text to parse. A reference is required as a minor optimization in
+ * memory and speed: the lexer records integer offsets, rather than new
+ * {@link String} instances, to track parsed lexemes.
+ */
+ private final String mText;
+
+ /**
+ * Converts a string into an iterable list of {@link Lexeme} instances.
+ */
+ private final Lexer mLexer;
+
+ /**
+ * Sets of contractions that help disambiguate single quotes in the text.
+ * These are effectively immutable while parsing.
+ */
+ private final Contractions sContractions;
+
+ /**
+ * Incremented for each opening single quote emitted. Used to help resolve
+ * ambiguities when single quote marks are balanced.
+ */
+ private int mOpeningSingleQuote;
+
+ /**
+ * Incremented for each closing single quote emitted. Used to help resolve
+ * ambiguities when single quote marks are balanced.
+ */
+ private int mClosingSingleQuote;
+
+ /**
+ * Constructs a new {@link Parser} using the default contraction sets
+ * to help resolve some ambiguous scenarios.
+ *
+ * @param text The prose to parse, containing zero or more quotation
+ * characters.
+ */
+ public Parser( final String text ) {
+ this( text, new Contractions.Builder().build() );
+ }
+
+ /**
+ * Constructs a new {@link Parser} using the given contraction sets
+ * to help resolve some ambiguous scenarios.
+ *
+ * @param text The prose to parse, containing zero or more quotation
+ * characters.
+ * @param contractions Custom sets of contractions to help resolve
+ * ambiguities.
+ */
+ public Parser( final String text, final Contractions contractions ) {
+ mText = text;
+ mLexer = new Lexer( mText );
+ sContractions = contractions;
+ }
+
+ /**
+ * Iterates over the entire text provided at construction, emitting
+ * {@link Token}s that can be used to convert straight quotes to curly
+ * quotes.
+ *
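+ * @param tokenConsumer Receives emitted {@link Token}s.
+ * @param lexemeConsumer Receives each quotation mark {@link Lexeme} that
+ * could not be resolved.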
+ */
+ public void parse(
+ final Consumer<Token> tokenConsumer,
+ final Consumer<Lexeme> lexemeConsumer ) {
+ final var lexemes = new CircularFifoQueue<Lexeme>( 3 );
+
+ // Allow consuming the very first token without checking the queue size.
+ flush( lexemes );
+
+ final var unresolved = new ArrayList<Lexeme[]>();
+ Lexeme lexeme;
+
+ // Create and convert a list of all unambiguous quote characters.
+ while( (lexeme = mLexer.next()) != EOT ) {
+ tokenize( lexeme, lexemes, tokenConsumer, unresolved );
+ }
+
+ // By loop's end, the lexemes list contains tokens for all except the
+ // final two elements (from tokenizing in triplets). Tokenize the remaining
+ // unprocessed lexemes.
+ tokenize( EOT, lexemes, tokenConsumer, unresolved );
+ tokenize( EOT, lexemes, tokenConsumer, unresolved );
+
+ // Attempt to resolve any remaining ambiguous quotes.
+ resolve( unresolved, tokenConsumer );
+
+ // Notify of any ambiguous quotes that could not be resolved.
+ unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) );
+ }
+
+ private void tokenize( final Lexeme lexeme,
+ final CircularFifoQueue<Lexeme> lexemes,
+ final Consumer<Token> consumer,
+ final List<Lexeme[]> unresolved ) {
+ // Add the next lexeme to tokenize into the queue for immediate processing.
+ lexemes.add( lexeme );
+
+ final var lex1 = lexemes.get( 0 );
+ final var lex2 = lexemes.get( 1 );
+ final var lex3 = lexemes.get( 2 );
+
+ if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( WORD ) &&
+ lex1.anyType( WORD, PERIOD, NUMBER ) ) {
+ // Examples: y'all, Ph.D.'ll, 20's, she's
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
+ flush( lexemes );
+ }
+ else if( lex1.isType( QUOTE_SINGLE ) && lex3.isType( QUOTE_SINGLE ) &&
+ "n".equalsIgnoreCase( lex2.toString( mText ) ) ) {
+ // I.e., 'n'
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
+ flush( lexemes );
+ truncate( unresolved );
+ }
+ else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) {
+ if( lex3.isType( QUOTE_SINGLE ) ) {
+ // E.g., 2''
+ consumer.accept(
+ new Token( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ) );
+ flush( lexemes );
+ }
+ else {
+ // E.g., 2'
+ consumer.accept( new Token( QUOTE_PRIME_SINGLE, lex2 ) );
+ }
+ }
+ else if( lex2.isType( QUOTE_DOUBLE ) && lex1.isType( NUMBER ) ) {
+ // E.g., 2"
+ consumer.accept( new Token( QUOTE_PRIME_DOUBLE, lex2 ) );
+ }
+ else if( lex2.isType( WORD ) && lex3.isType( QUOTE_SINGLE ) &&
+ sContractions.endedUnambiguously( lex2.toString( mText ) ) ) {
+ // E.g., thinkin'
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) );
+ flush( lexemes );
+ }
+ else if( lex2.isType( NUMBER ) && lex1.isType( QUOTE_SINGLE ) ) {
+ if( lex3.anyType( SPACE, PUNCT ) || (lex3.isType( WORD ) &&
+ lex3.toString( mText ).equalsIgnoreCase( "s" )) ) {
+ // Sentences must be rewritten to avoid starting with numerals.
+ // Examples: '20s, '02
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
+ }
+ else {
+ // E.g., '2''
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex1 ) );
+ mOpeningSingleQuote++;
+ }
+
+ truncate( unresolved );
+ }
+ else if( lex2.isType( QUOTE_SINGLE ) &&
+ lex1.anyType( PUNCT, PERIOD, ELLIPSIS, DASH ) &&
+ (lex3.anyType( EOL, EOP ) || lex3.isEot()) ) {
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
+ mClosingSingleQuote++;
+ }
+ else if( lex1.isType( ESC_SINGLE ) ) {
+ // E.g., \'
+ consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) );
+ }
+ else if( lex1.isType( ESC_DOUBLE ) ) {
+ // E.g., \"
+ consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) );
+
+ if( lex2.isType( QUOTE_SINGLE ) &&
+ (lex3.isEot() || lex3.anyType( SPACE, DASH, EOL, EOP )) ) {
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
+ mClosingSingleQuote++;
+ }
+ }
+ else if( lex2.isType( QUOTE_DOUBLE ) &&
+ (lex1.isSot() || lex1.anyType( LEADING_QUOTE_OPENING_DOUBLE )) &&
+ lex3.anyType( LAGGING_QUOTE_OPENING_DOUBLE ) ) {
+ // Examples: "", "..., "word, ---"word
+ consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) );
+ }
+ else if( lex2.isType( QUOTE_DOUBLE ) &&
+ lex1.anyType( LEADING_QUOTE_CLOSING_DOUBLE ) &&
+ (lex3.isEot() || lex3.anyType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) {
+ // Examples: ..."', word"', ?"'
+ consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) );
+ }
+ else if( lex1.isType( QUOTE_SINGLE ) &&
+ lex2.anyType( PUNCT, PERIOD, DASH ) && lex3.isType( QUOTE_DOUBLE ) ) {
+ // E.g., '," (contraction ruled out from previous conditionals)
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex1 ) );
+ truncate( unresolved );
+ mClosingSingleQuote++;
+ }
+ else if( lex2.anyType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
+ // After tokenizing, the parser will attempt to resolve ambiguities.
+ unresolved.add( new Lexeme[]{lex1, lex2, lex3} );
+ }
+ }
+
+ private void resolve(
+ final List<Lexeme[]> unresolved, final Consumer<Token> consumer ) {
+ // Some non-emitted tokenized lexemes may be ambiguous.
+ final var ambiguousLeadingQuotes = new ArrayList<Lexeme[]>( 16 );
+ final var ambiguousLaggingQuotes = new ArrayList<Lexeme[]>( 16 );
+ var resolvedLeadingQuotes = 0;
+ var resolvedLaggingQuotes = 0;
+
+ // Count the number of ambiguous and non-ambiguous open single quotes.
+ for( var i = unresolved.iterator(); i.hasNext(); ) {
+ final var quotes = i.next();
+ final var lex1 = quotes[ 0 ];
+ final var lex2 = quotes[ 1 ];
+ final var lex3 = quotes[ 2 ];
+
+ if( lex2.isType( QUOTE_SINGLE ) ) {
+ final var word1 = lex1 == SOT ? "" : lex1.toString( mText );
+ final var word3 = lex3 == EOT ? "" : lex3.toString( mText );
+
+ if( sContractions.beganAmbiguously( word3 ) ) {
+ // E.g., 'Cause
+ if( lex1.isType( QUOTE_SINGLE ) ) {
+ // E.g., ''Cause
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
+ i.remove();
+ }
+ else {
+ // The contraction is uncertain until a closing quote is found that
+ // may balance this single quote.
+ ambiguousLeadingQuotes.add( quotes );
+ }
+ }
+ else if( sContractions.beganUnambiguously( word3 ) ) {
+ // The quote mark forms a word that does not stand alone from its
+ // contraction. For example, twas is not a word: it's 'twas.
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
+ i.remove();
+ }
+ else if( sContractions.endedAmbiguously( word1 ) ) {
+ ambiguousLaggingQuotes.add( quotes );
+ }
+ else if( (lex1.isSot() || lex1.anyType( LEADING_QUOTE_OPENING_SINGLE ))
+ && lex3.anyType( LAGGING_QUOTE_OPENING_SINGLE ) ) {
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) );
+ resolvedLeadingQuotes++;
+ mOpeningSingleQuote++;
+ i.remove();
+ }
+ else if( lex1.anyType( LEADING_QUOTE_CLOSING_SINGLE ) &&
+ (lex3.isEot() || lex3.anyType( LAGGING_QUOTE_CLOSING_SINGLE )) ) {
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
+ resolvedLaggingQuotes++;
+ mClosingSingleQuote++;
+ i.remove();
+ }
+ else if( lex3.isType( NUMBER ) ) {
+ // E.g., '04
+ ambiguousLeadingQuotes.add( quotes );
+ }
+ }
+ }
+
+ final var ambiguousLeadingCount = ambiguousLeadingQuotes.size();
+ final var ambiguousLaggingCount = ambiguousLaggingQuotes.size();
+
+ if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) {
+ if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) {
+ final var balanced = mClosingSingleQuote - mOpeningSingleQuote == 0;
+ final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE;
+ final var lex = ambiguousLaggingQuotes.get( 0 );
+ consumer.accept( new Token( quote, lex[ 1 ] ) );
+ unresolved.remove( lex );
+ }
+ else if( ambiguousLeadingCount == 0 && unresolved.size() == 1 ) {
+ // Must be a closing quote.
+ final var closing = unresolved.get( 0 );
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
+ unresolved.remove( closing );
+ }
+ }
+ else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) {
+ // If there are no ambiguous leading quotes then all ambiguous lagging
+ // quotes must be contractions.
+ ambiguousLaggingQuotes.forEach(
+ lex -> {
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex[ 1 ] ) );
+ unresolved.remove( lex );
+ }
+ );
+ }
+ else if( ambiguousLeadingCount == 0 ) {
+ if( resolvedLaggingQuotes < resolvedLeadingQuotes ) {
+ for( final var i = unresolved.iterator(); i.hasNext(); ) {
+ final var closing = i.next()[ 1 ];
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing ) );
+ i.remove();
+ }
+ }
+ else if( mOpeningSingleQuote - mClosingSingleQuote == unresolved.size() ) {
+ for( final var i = unresolved.iterator(); i.hasNext(); ) {
+ final var closing = i.next();
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
+ i.remove();
+ }
+ }
+ else if( unresolved.size() == 2 ) {
+ final var closing = unresolved.get( 0 );
+ final var opening = unresolved.get( 1 );
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) );
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
+
+ unresolved.clear();
+ }
+ }
+ else if( ambiguousLeadingCount == 1 && resolvedLaggingQuotes == 1 ) {
+ final var opening = ambiguousLeadingQuotes.get( 0 );
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) );
+ unresolved.remove( opening );
+ }
+ }
+
+ /**
+ * Removes the last entry (an array of {@link Lexeme}s) from the given
+ * list, if the list is not empty.
+ *
+ * @param unresolved The list of {@link Lexeme}s to modify.
+ */
+ private void truncate( final List<Lexeme[]> unresolved ) {
if( !unresolved.isEmpty() ) {
unresolved.remove( unresolved.size() - 1 );
lib/src/main/java/com/keenwrite/quotes/SmartQuotes.java
-/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
-package com.keenwrite.quotes;
-
-import java.util.ArrayList;
-import java.util.Map;
-import java.util.function.Consumer;
-
-import static com.keenwrite.quotes.TokenType.*;
-import static java.util.Collections.sort;
-
-/**
- * Responsible for replacing {@link Token} instances with equivalent smart
- * quotes (or straight quotes). This will inform the caller when ambiguous
- * quotes cannot be reliably resolved.
- */
-public class SmartQuotes {
- private static final Map<TokenType, String> REPLACEMENTS = Map.of(
- QUOTE_OPENING_SINGLE, "&lsquo;",
- QUOTE_CLOSING_SINGLE, "&rsquo;",
- QUOTE_OPENING_DOUBLE, "&ldquo;",
- QUOTE_CLOSING_DOUBLE, "&rdquo;",
- QUOTE_STRAIGHT_SINGLE, "'",
- QUOTE_STRAIGHT_DOUBLE, "\"",
- QUOTE_APOSTROPHE, "&apos;",
- QUOTE_PRIME_SINGLE, "&prime;",
- QUOTE_PRIME_DOUBLE, "&Prime;"
- );
-
- /**
- * Converts
- * @param text
- * @return
- */
- public static String convert( final String text ) {
- return convert( text, ( lexeme ) -> {} );
- }
-
- public static String convert(
- final String text, final Consumer<Lexeme> consumer ) {
- final var parser = new Parser( text );
- final var tokens = new ArrayList<Token>();
-
- // Store all parsed quotation marks.
- parser.parse( tokens::add );
-
- // The parser may emit tokens in any order.
- sort( tokens );
-
- final var result = new StringBuilder( text.length() );
- var position = 0;
-
- for( final var token : tokens ) {
- if( position <= token.began() ) {
- result.append( text, position, token.began() );
- result.append( REPLACEMENTS.get( token.getType() ) );
- }
-
- position = token.ended();
- }
-
- return result.append( text.substring( position ) ).toString();
- }
-}
lib/src/main/java/com/keenwrite/quotes/Token.java
* Represents a high-level token read from the text.
*/
-public class Token implements Comparable<Token> {
+final class Token implements Comparable<Token> {
private final TokenType mType;
final int mBegan;
final int mEnded;
/**
- * Convenience constructor to create a token that uses the lexeme's offsets.
+ * Convenience constructor to create a token that uses the lexeme's
+ * beginning and ending offsets to represent a complete token.
*
* @param type The type of {@link Token} to create.
* @param lexeme Container for beginning and ending text offsets.
*/
Token( final TokenType type, final Lexeme lexeme ) {
this( type, lexeme.began(), lexeme.ended() );
}
+ /**
+ * This constructor can be used to create tokens that span more than a
+ * single character. Almost all tokens represent a single character, only
+ * the double-prime sequence ({@code ''}) is more than one character.
+ *
+ * @param type The type of {@link Token} to create.
+ * @param began Beginning offset into text where token is found.
+ * @param ended Ending offset into text where token is found.
+ */
Token( final TokenType type, final int began, final int ended ) {
assert type != null;
public int compareTo( final Token that ) {
return this.mBegan - that.mBegan;
- }
-
- @Override
- public String toString() {
- return getClass().getSimpleName() + "{" +
- "mType=" + mType +
- ", mBegan=" + mBegan +
- ", mEnded=" + mEnded +
- '}';
}
}
lib/src/main/java/com/keenwrite/quotes/TokenType.java
QUOTE_PRIME_SINGLE,
QUOTE_PRIME_DOUBLE,
- TEXT,
}
lib/src/test/java/com/keenwrite/quotes/KeenQuotesTest.java
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.keenwrite.quotes;
+
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.function.Function;
+
+import static java.lang.System.out;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+/**
+ * Test that English straight quotes are converted to curly quotes and
+ * apostrophes.
+ */
+public class KeenQuotesTest {
+  /**
+   * This is a single-use test that is useful for debugging. The converted
+   * text and any unresolved lexemes are written to standard output.
+   */
+  @Test
+  @Disabled
+  public void test_parse_SingleLine_Parsed() {
+    out.println( KeenQuotes.convert(
+      "What's this '-5.5'',' '-10.2'' cm,' and another '-7.25''' thing?",
+      out::println
+    ) );
+  }
+
+  /**
+   * Tests that straight quotes are converted to curly quotes.
+   *
+   * @throws IOException Error opening file full of fun.
+   */
+  @Test
+  public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException {
+    testConverter( text -> KeenQuotes.convert( text, ( lexeme ) -> {} ) );
+  }
+
+  /**
+   * Reads a file full of couplets. The first of the pair is the input,
+   * the second of the pair is the expected result. Couplets may include
+   * escaped newlines ({@code \n}) to indicate end of lines and end of
+   * paragraphs. Blank lines and lines that start with {@code #} are
+   * ignored. A final input line without a matching expected line is
+   * never checked.
+   *
+   * @param parser The text processor capable of straight quote conversion.
+   * @throws IOException Error opening file full of fun.
+   */
+  private void testConverter( final Function<String, String> parser )
+    throws IOException {
+    try( final var reader = openResource( "smartypants.txt" ) ) {
+      String line;
+      String testLine = "";
+
+      while( (line = reader.readLine()) != null ) {
+        if( line.startsWith( "#" ) || line.isBlank() ) { continue; }
+
+        // Read the first line of the couplet.
+        if( testLine.isBlank() ) {
+          testLine = line;
+          continue;
+        }
+
+        // The current line is the second line of the couplet.
+        final var expected = unescapeEol( line );
+        final var actual = parser.apply( unescapeEol( testLine ) );
+        assertEquals( expected, actual );
+
+        // Start a new couplet.
+        testLine = "";
+      }
+    }
+  }
+
+  /**
+   * Replaces each escaped newline (a backslash followed by {@code n}) with
+   * a newline character, including any at the end of the text.
+   *
+   * @param s The text containing escaped newlines.
+   * @return The text with escaped newlines converted to newline characters.
+   */
+  private static String unescapeEol( final String s ) {
+    // Splitting on the escape sequence would discard trailing empty
+    // strings, losing newlines at the end of the text; a literal
+    // replacement keeps every one of them.
+    return s.replace( "\\n", "\n" );
+  }
+
+  /**
+   * Opens a resource for reading as text using this class's
+   * {@code getResourceAsStream}. Fails the test if the resource cannot be
+   * found.
+   *
+   * @param filename The name of the resource to open.
+   * @return A reader for the resource; the caller is expected to close it.
+   */
+  @SuppressWarnings( "SameParameterValue" )
+  private BufferedReader openResource( final String filename ) {
+    final var is = getClass().getResourceAsStream( filename );
+    assertNotNull( is );
+
+    // Decode with an explicit charset so that results do not depend on the
+    // platform's default encoding.
+    // NOTE(review): assumes the resource is UTF-8 (or plain ASCII); confirm.
+    return new BufferedReader(
+      new InputStreamReader( is, java.nio.charset.StandardCharsets.UTF_8 )
+    );
+  }
+}
lib/src/test/java/com/keenwrite/quotes/ParserTest.java
@Test
- void test_Conversion_Straight_Curly() {
+ void test_Conversion_StraightQuotes_ExpectedConversionCount() {
for( final var entry : TEST_CASES.entrySet() ) {
parse( entry.getKey(), entry.getValue() );
parser.parse(
- ( token ) -> actual.merge( token.getType(), 1, Integer::sum )
+ ( token ) -> actual.merge( token.getType(), 1, Integer::sum ),
+ ( lexeme ) -> {}
);
lib/src/test/java/com/keenwrite/quotes/SmartQuotesTest.java
-/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
-package com.keenwrite.quotes;
-
-import org.junit.jupiter.api.Disabled;
-import org.junit.jupiter.api.Test;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.function.Function;
-
-import static java.lang.System.out;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNotNull;
-
-/**
- * Test that English straight quotes are converted to curly quotes and
- * apostrophes.
- */
-public class SmartQuotesTest {
- /**
- * This is a single-use test that is useful for debugging.
- */
- @Test
- @Disabled
- public void test_parse_SingleLine_Parsed() {
- out.println( SmartQuotes.convert(
- "What's this '-5.5'',' '-10.2'' cm,' and another '-7.25''' thing?",
- out::println
- ) );
- }
-
- /**
- * Tests that straight quotes are converted to curly quotes.
- *
- * @throws IOException Error opening file full of fun.
- */
- @Test
- public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException {
- testConverter( );
- }
-
- /**
- * Reads a file full of couplets. The first of the pair is the input,
- * the second of the pair is the expected result. Couplets may include
- * newline characters to indicate end of lines and end of paragraphs.
- * Lines that start with {@code #} are ignored.
- *
- * @param parser The text processor capable of straight quote conversion.
- * @throws IOException Error opening file full of fun.
- */
- private void testConverter( final Function<String, String> parser )
- throws IOException {
- try( final var reader = openResource( "smartypants.txt" ) ) {
- String line;
- String testLine = "";
- String expected = "";
-
- while( ((line = reader.readLine()) != null) ) {
- if( line.startsWith( "#" ) || line.isBlank() ) { continue; }
-
- // Read the first line of the couplet.
- if( testLine.isBlank() ) {
- testLine = line;
- continue;
- }
-
- // Read the second line of the couplet.
- if( expected.isBlank() ) {
- expected = line;
- }
-
- testLine = unescapeEol( testLine );
- expected = unescapeEol( expected );
-
- final var actual = parser.apply( testLine );
- assertEquals( expected, actual );
-
- testLine = "";
- expected = "";
- }
- }
- }
-
- private static String unescapeEol( final String s ) {
- return String.join( "\n", s.split( "\\\\n" ) );
- }
-
- @SuppressWarnings( "SameParameterValue" )
- private BufferedReader openResource( final String filename ) {
- final var is = getClass().getResourceAsStream( filename );
- assertNotNull( is );
-
- return new BufferedReader( new InputStreamReader( is ) );
- }
-}
Delta: 601 lines added, 572 lines removed, 29-line increase