|
|
*/ |
|
|
private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE = |
|
|
- new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_SINGLE}; |
|
|
- |
|
|
- /** |
|
|
- * Double quotes succeeded by these {@link LexemeType}s may be closing quotes. |
|
|
- */ |
|
|
- private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE = |
|
|
- new LexemeType[]{SPACE, PUNCT, DASH, QUOTE_SINGLE, CLOSING_GROUP, EOL, EOP}; |
|
|
- |
|
|
- /** |
|
|
- * The text to parse. A reference is required as a minor optimization in |
|
|
- * memory and speed: the lexer records integer offsets, rather than new |
|
|
- * {@link String} instances, to track parsed lexemes. |
|
|
- */ |
|
|
- private final String mText; |
|
|
- |
|
|
- /** |
|
|
- * Converts a string into an iterable list of {@link Lexeme} instances. |
|
|
- */ |
|
|
- private final Lexer mLexer; |
|
|
- |
|
|
- /** |
|
|
- * Sets of contractions that help disambiguate single quotes in the text. |
|
|
- * These are effectively immutable while parsing. |
|
|
- */ |
|
|
- private final Contractions sContractions; |
|
|
- |
|
|
- /** |
|
|
- * Incremented for each opening single quote emitted. Used to help resolve |
|
|
- * ambiguities when single quote marks are balanced. |
|
|
- */ |
|
|
- private int mOpeningSingleQuote; |
|
|
- |
|
|
- /** |
|
|
- * Incremented for each closing single quote emitted. Used to help resolve |
|
|
- * ambiguities when single quote marks are balanced. |
|
|
- */ |
|
|
- private int mClosingSingleQuote; |
|
|
- |
|
|
- /** |
|
|
- * Constructs a new {@link Parser} using the default contraction sets |
|
|
- * to help resolve some ambiguous scenarios. |
|
|
- * |
|
|
- * @param text The prose to parse, containing zero or more quotation |
|
|
- * characters. |
|
|
- */ |
|
|
- public Parser( final String text ) { |
|
|
- this( text, new Contractions.Builder().build() ); |
|
|
- } |
|
|
- |
|
|
- /** |
|
|
- * Constructs a new {@link Parser} using the default contraction sets |
|
|
- * to help resolve some ambiguous scenarios. |
|
|
- * |
|
|
- * @param text The prose to parse, containing zero or more quotation |
|
|
- * characters. |
|
|
- * @param contractions Custom sets of contractions to help resolve |
|
|
- * ambiguities. |
|
|
- */ |
|
|
- public Parser( final String text, final Contractions contractions ) { |
|
|
- mText = text; |
|
|
- mLexer = new Lexer( mText ); |
|
|
- sContractions = contractions; |
|
|
- } |
|
|
- |
|
|
- /** |
|
|
- * Iterates over the entire text provided at construction, emitting |
|
|
- * {@link Token}s that can be used to convert straight quotes to curly |
|
|
- * quotes. |
|
|
- * |
|
|
- * @param tokenConsumer Receives emitted {@link Token}s. |
|
|
- */ |
|
|
- public void parse( |
|
|
- final Consumer<Token> tokenConsumer, |
|
|
- final Consumer<Lexeme> lexemeConsumer ) { |
|
|
- final var lexemes = new CircularFifoQueue<Lexeme>( 3 ); |
|
|
- |
|
|
- // Allow consuming the very first token without checking the queue size. |
|
|
- flush( lexemes ); |
|
|
- |
|
|
- final var unresolved = new ArrayList<Lexeme[]>(); |
|
|
- Lexeme lexeme; |
|
|
- |
|
|
- // Create and convert a list of all unambiguous quote characters. |
|
|
- while( (lexeme = mLexer.next()) != EOT ) { |
|
|
- if( tokenize( lexeme, lexemes, tokenConsumer, unresolved ) ) { |
|
|
- // Attempt to resolve any remaining unambiguous quotes. |
|
|
- resolve( unresolved, tokenConsumer ); |
|
|
- |
|
|
- // Notify of any unambiguous quotes that could not be resolved. |
|
|
- unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) ); |
|
|
- unresolved.clear(); |
|
|
- mOpeningSingleQuote = 0; |
|
|
- mClosingSingleQuote = 0; |
|
|
- } |
|
|
- } |
|
|
- |
|
|
- // By loop's end, the lexemes list contains tokens for all except the |
|
|
- // final two elements (from tokenizing in triplets). Tokenize the remaining |
|
|
- // unprocessed lexemes. |
|
|
- tokenize( EOT, lexemes, tokenConsumer, unresolved ); |
|
|
- tokenize( EOT, lexemes, tokenConsumer, unresolved ); |
|
|
- |
|
|
- // Attempt to resolve any remaining unambiguous quotes. |
|
|
- resolve( unresolved, tokenConsumer ); |
|
|
- |
|
|
- // Notify of any unambiguous quotes that could not be resolved. |
|
|
- unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) ); |
|
|
- } |
|
|
- |
|
|
- /** |
|
|
- * Converts {@link Lexeme}s identified as straight quotes into {@link Token}s |
|
|
- * that represent the curly equivalent. The {@link Token}s are passed to |
|
|
- * the given {@link Consumer} for further processing (e.g., replaced in |
|
|
- * the original text being parsed). |
|
|
- * |
|
|
- * @param lexeme A part of the text being parsed. |
|
|
- * @param lexemes A 3-element queue of lexemes that provide sufficient |
|
|
- * context to identify curly quotes. |
|
|
- * @param consumer Recipient of equivalent quotes. |
|
|
- * @param unresolved Rolling list of potentially ambiguous {@link Lexeme}s |
|
|
- * that could not be tokenized, yet. |
|
|
- * @return {@code true} if an end-of-paragraph is detected. |
|
|
- */ |
|
|
- private boolean tokenize( final Lexeme lexeme, |
|
|
- final CircularFifoQueue<Lexeme> lexemes, |
|
|
- final Consumer<Token> consumer, |
|
|
- final List<Lexeme[]> unresolved ) { |
|
|
- // Add the next lexeme to tokenize into the queue for immediate processing. |
|
|
- lexemes.add( lexeme ); |
|
|
- |
|
|
- final var lex1 = lexemes.get( 0 ); |
|
|
- final var lex2 = lexemes.get( 1 ); |
|
|
- final var lex3 = lexemes.get( 2 ); |
|
|
- |
|
|
- if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( WORD ) && |
|
|
- lex1.anyType( WORD, PERIOD, NUMBER ) ) { |
|
|
- // Examples: y'all, Ph.D.'ll, 20's, she's |
|
|
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); |
|
|
- flush( lexemes ); |
|
|
- } |
|
|
- else if( lex1.isType( QUOTE_SINGLE ) && lex3.isType( QUOTE_SINGLE ) && |
|
|
- "n".equalsIgnoreCase( lex2.toString( mText ) ) ) { |
|
|
- // I.e., 'n' |
|
|
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) ); |
|
|
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) ); |
|
|
- flush( lexemes ); |
|
|
- truncate( unresolved ); |
|
|
- } |
|
|
- else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) { |
|
|
- if( lex3.isType( QUOTE_SINGLE ) ) { |
|
|
- // E.g., 2'' |
|
|
- consumer.accept( |
|
|
- new Token( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ) ); |
|
|
- flush( lexemes ); |
|
|
- } |
|
|
- else { |
|
|
- // E.g., 2' |
|
|
- consumer.accept( new Token( QUOTE_PRIME_SINGLE, lex2 ) ); |
|
|
- } |
|
|
- } |
|
|
- else if( lex2.isType( QUOTE_DOUBLE ) && lex1.isType( NUMBER ) ) { |
|
|
- // E.g., 2" |
|
|
- consumer.accept( new Token( QUOTE_PRIME_DOUBLE, lex2 ) ); |
|
|
- } |
|
|
- else if( lex2.isType( WORD ) && lex3.isType( QUOTE_SINGLE ) && |
|
|
- sContractions.endedUnambiguously( lex2.toString( mText ) ) ) { |
|
|
- // E.g., thinkin' |
|
|
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) ); |
|
|
- flush( lexemes ); |
|
|
- } |
|
|
- else if( lex2.isType( NUMBER ) && lex1.isType( QUOTE_SINGLE ) ) { |
|
|
- if( lex3.anyType( SPACE, PUNCT ) || (lex3.isType( WORD ) && |
|
|
- lex3.toString( mText ).equalsIgnoreCase( "s" )) ) { |
|
|
- // Sentences must re-written to avoid starting with numerals. |
|
|
- // Examples: '20s, '02 |
|
|
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) ); |
|
|
- } |
|
|
- else { |
|
|
- // E.g., '2'' |
|
|
- consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex1 ) ); |
|
|
- mOpeningSingleQuote++; |
|
|
- } |
|
|
- |
|
|
- truncate( unresolved ); |
|
|
- } |
|
|
- else if( lex2.isType( QUOTE_SINGLE ) && |
|
|
- lex1.anyType( PUNCT, PERIOD, ELLIPSIS, DASH ) && |
|
|
- (lex3.anyType( EOL, EOP ) || lex3.isEot()) ) { |
|
|
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); |
|
|
- mClosingSingleQuote++; |
|
|
- } |
|
|
- else if( lex1.isType( ESC_SINGLE ) ) { |
|
|
- // E.g., \' |
|
|
- consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) ); |
|
|
- } |
|
|
- else if( lex1.isType( ESC_DOUBLE ) ) { |
|
|
- // E.g., \" |
|
|
- consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) ); |
|
|
- |
|
|
- if( lex2.isType( QUOTE_SINGLE ) && |
|
|
- (lex3.isEot() || lex3.anyType( SPACE, DASH, EOL, EOP )) ) { |
|
|
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); |
|
|
- mClosingSingleQuote++; |
|
|
- } |
|
|
- } |
|
|
- else if( lex2.isType( QUOTE_DOUBLE ) && |
|
|
- (lex1.isSot() || lex1.anyType( LEADING_QUOTE_OPENING_DOUBLE )) && |
|
|
- lex3.anyType( LAGGING_QUOTE_OPENING_DOUBLE ) ) { |
|
|
- // Examples: "", "..., "word, ---"word |
|
|
- consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) ); |
|
|
- } |
|
|
- else if( lex2.isType( QUOTE_DOUBLE ) && |
|
|
- lex1.anyType( LEADING_QUOTE_CLOSING_DOUBLE ) && |
|
|
- (lex3.isEot() || lex3.anyType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) { |
|
|
- // Examples: ..."', word"', ?"', word"? |
|
|
- consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) ); |
|
|
- } |
|
|
- else if( lex1.isType( QUOTE_SINGLE ) && |
|
|
- lex2.anyType( PUNCT, PERIOD, DASH ) && lex3.isType( QUOTE_DOUBLE ) ) { |
|
|
- // E.g., '," (contraction ruled out from previous conditionals) |
|
|
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex1 ) ); |
|
|
- truncate( unresolved ); |
|
|
- mClosingSingleQuote++; |
|
|
- } |
|
|
- else if( lex2.anyType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) { |
|
|
- // After tokenizing, the parser will attempt to resolve ambiguities. |
|
|
- unresolved.add( new Lexeme[]{lex1, lex2, lex3} ); |
|
|
- } |
|
|
- |
|
|
- // Suggest to the caller that resolution should be performed. This allows |
|
|
- // the algorithm to reset the opening/closing quote balance before the |
|
|
- // next paragraph is parsed. |
|
|
- return lex3.isType( EOP ); |
|
|
- } |
|
|
- |
|
|
- private void resolve( |
|
|
- final List<Lexeme[]> unresolved, final Consumer<Token> consumer ) { |
|
|
- // Some non-emitted tokenized lexemes may be ambiguous. |
|
|
- final var ambiguousLeadingQuotes = new ArrayList<Lexeme[]>( 16 ); |
|
|
- final var ambiguousLaggingQuotes = new ArrayList<Lexeme[]>( 16 ); |
|
|
- var resolvedLeadingQuotes = 0; |
|
|
- var resolvedLaggingQuotes = 0; |
|
|
- |
|
|
- // Count the number of ambiguous and non-ambiguous open single quotes. |
|
|
- for( var i = unresolved.iterator(); i.hasNext(); ) { |
|
|
- final var quotes = i.next(); |
|
|
- final var lex1 = quotes[ 0 ]; |
|
|
- final var lex2 = quotes[ 1 ]; |
|
|
- final var lex3 = quotes[ 2 ]; |
|
|
- |
|
|
- if( lex2.isType( QUOTE_SINGLE ) ) { |
|
|
- final var word1 = lex1 == SOT ? "" : lex1.toString( mText ); |
|
|
- final var word3 = lex3 == EOT ? "" : lex3.toString( mText ); |
|
|
- |
|
|
- if( sContractions.beganAmbiguously( word3 ) ) { |
|
|
- // E.g., 'Cause |
|
|
- if( lex1.isType( QUOTE_SINGLE ) ) { |
|
|
- // E.g., ''Cause |
|
|
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); |
|
|
- i.remove(); |
|
|
- } |
|
|
- else { |
|
|
- // The contraction is uncertain until a closing quote is found that |
|
|
- // may balance this single quote. |
|
|
- ambiguousLeadingQuotes.add( quotes ); |
|
|
- } |
|
|
- } |
|
|
- else if( sContractions.beganUnambiguously( word3 ) ) { |
|
|
- // The quote mark forms a word that does not stand alone from its |
|
|
- // contraction. For example, twas is not a word: it's 'twas. |
|
|
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); |
|
|
- i.remove(); |
|
|
- } |
|
|
- else if( sContractions.endedAmbiguously( word1 ) ) { |
|
|
- ambiguousLaggingQuotes.add( quotes ); |
|
|
- } |
|
|
- else if( (lex1.isSot() || lex1.anyType( LEADING_QUOTE_OPENING_SINGLE )) |
|
|
- && lex3.anyType( LAGGING_QUOTE_OPENING_SINGLE ) ) { |
|
|
- consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) ); |
|
|
- resolvedLeadingQuotes++; |
|
|
- mOpeningSingleQuote++; |
|
|
- i.remove(); |
|
|
- } |
|
|
- else if( lex1.anyType( LEADING_QUOTE_CLOSING_SINGLE ) && |
|
|
- (lex3.isEot() || lex3.anyType( LAGGING_QUOTE_CLOSING_SINGLE )) ) { |
|
|
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); |
|
|
- resolvedLaggingQuotes++; |
|
|
- mClosingSingleQuote++; |
|
|
- i.remove(); |
|
|
- } |
|
|
- else if( lex3.isType( NUMBER ) ) { |
|
|
- // E.g., '04 |
|
|
- ambiguousLeadingQuotes.add( quotes ); |
|
|
- } |
|
|
- } |
|
|
- } |
|
|
- |
|
|
- final var ambiguousLeadingCount = ambiguousLeadingQuotes.size(); |
|
|
- final var ambiguousLaggingCount = ambiguousLaggingQuotes.size(); |
|
|
- |
|
|
- if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) { |
|
|
- if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) { |
|
|
- final var balanced = mClosingSingleQuote - mOpeningSingleQuote == 0; |
|
|
- final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE; |
|
|
- final var lex = ambiguousLaggingQuotes.get( 0 ); |
|
|
- consumer.accept( new Token( quote, lex[ 1 ] ) ); |
|
|
- unresolved.remove( lex ); |
|
|
- } |
|
|
- else if( ambiguousLeadingCount == 0 && unresolved.size() == 1 ) { |
|
|
- // Must be a closing quote. |
|
|
- final var closing = unresolved.get( 0 ); |
|
|
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); |
|
|
- unresolved.remove( closing ); |
|
|
- } |
|
|
- } |
|
|
- else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) { |
|
|
- // If there are no ambiguous leading quotes then all ambiguous lagging |
|
|
- // quotes must be contractions. |
|
|
- ambiguousLaggingQuotes.forEach( |
|
|
- lex -> { |
|
|
- consumer.accept( new Token( QUOTE_APOSTROPHE, lex[ 1 ] ) ); |
|
|
- unresolved.remove( lex ); |
|
|
- } |
|
|
- ); |
|
|
- } |
|
|
- else if( ambiguousLeadingCount == 0 ) { |
|
|
- if( resolvedLaggingQuotes < resolvedLeadingQuotes ) { |
|
|
- for( final var i = unresolved.iterator(); i.hasNext(); ) { |
|
|
- final var closing = i.next()[ 1 ]; |
|
|
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing ) ); |
|
|
- i.remove(); |
|
|
- } |
|
|
- } |
|
|
- else if( mOpeningSingleQuote - mClosingSingleQuote == unresolved.size() ) { |
|
|
- for( final var i = unresolved.iterator(); i.hasNext(); ) { |
|
|
- final var closing = i.next(); |
|
|
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); |
|
|
- i.remove(); |
|
|
- } |
|
|
- } |
|
|
- else if( unresolved.size() == 2 ) { |
|
|
- final var closing = unresolved.get( 0 ); |
|
|
- final var opening = unresolved.get( 1 ); |
|
|
- consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); |
|
|
- consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) ); |
|
|
+ new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, DASH, ELLIPSIS, QUOTE_SINGLE}; |
|
|
+ |
|
|
+ /** |
|
|
+ * Double quotes succeeded by these {@link LexemeType}s may be closing quotes. |
|
|
+ */ |
|
|
+ private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE = |
|
|
+ new LexemeType[]{SPACE, PUNCT, DASH, QUOTE_SINGLE, CLOSING_GROUP, EOL, EOP}; |
|
|
+ |
|
|
+ /** |
|
|
+ * The text to parse. A reference is required as a minor optimization in |
|
|
+ * memory and speed: the lexer records integer offsets, rather than new |
|
|
+ * {@link String} instances, to track parsed lexemes. |
|
|
+ */ |
|
|
+ private final String mText; |
|
|
+ |
|
|
+ /** |
|
|
+ * Converts a string into an iterable list of {@link Lexeme} instances. |
|
|
+ */ |
|
|
+ private final Lexer mLexer; |
|
|
+ |
|
|
+ /** |
|
|
+ * Sets of contractions that help disambiguate single quotes in the text. |
|
|
+ * These are effectively immutable while parsing. |
|
|
+ */ |
|
|
+ private final Contractions sContractions; |
|
|
+ |
|
|
+ /** |
|
|
+ * Incremented for each opening single quote emitted. Used to help resolve |
|
|
+ * ambiguities when single quote marks are balanced. |
|
|
+ */ |
|
|
+ private int mOpeningSingleQuote; |
|
|
+ |
|
|
+ /** |
|
|
+ * Incremented for each closing single quote emitted. Used to help resolve |
|
|
+ * ambiguities when single quote marks are balanced. |
|
|
+ */ |
|
|
+ private int mClosingSingleQuote; |
|
|
+ |
|
|
+ /** |
|
|
+ * Constructs a new {@link Parser} using the default contraction sets |
|
|
+ * to help resolve some ambiguous scenarios. |
|
|
+ * |
|
|
+ * @param text The prose to parse, containing zero or more quotation |
|
|
+ * characters. |
|
|
+ */ |
|
|
+ public Parser( final String text ) { |
|
|
+ this( text, new Contractions.Builder().build() ); |
|
|
+ } |
|
|
+ |
|
|
+ /** |
|
|
+ * Constructs a new {@link Parser} using the given contraction sets |
|
|
+ * to help resolve some ambiguous scenarios. |
|
|
+ * |
|
|
+ * @param text The prose to parse, containing zero or more quotation |
|
|
+ * characters. |
|
|
+ * @param contractions Custom sets of contractions to help resolve |
|
|
+ * ambiguities. |
|
|
+ */ |
|
|
+ public Parser( final String text, final Contractions contractions ) { |
|
|
+ mText = text; |
|
|
+ mLexer = new Lexer( mText ); |
|
|
+ sContractions = contractions; |
|
|
+ } |
|
|
+ |
|
|
+ /** |
|
|
+ * Iterates over the entire text provided at construction, emitting |
|
|
+ * {@link Token}s that can be used to convert straight quotes to curly |
|
|
+ * quotes. |
|
|
+ * |
|
|
+ * @param tokenConsumer Receives emitted {@link Token}s. |
+ * @param lexemeConsumer Receives {@link Lexeme}s for quotes that could not be resolved. |
|
|
+ */ |
|
|
+ public void parse( |
|
|
+ final Consumer<Token> tokenConsumer, |
|
|
+ final Consumer<Lexeme> lexemeConsumer ) { |
|
|
+ final var lexemes = new CircularFifoQueue<Lexeme>( 3 ); |
|
|
+ |
|
|
+ // Allow consuming the very first token without checking the queue size. |
|
|
+ flush( lexemes ); |
|
|
+ |
|
|
+ final var unresolved = new ArrayList<Lexeme[]>(); |
|
|
+ Lexeme lexeme; |
|
|
+ |
|
|
+ // Create and convert a list of all unambiguous quote characters. |
|
|
+ while( (lexeme = mLexer.next()) != EOT ) { |
|
|
+ if( tokenize( lexeme, lexemes, tokenConsumer, unresolved ) ) { |
|
|
+ // Attempt to resolve any remaining unambiguous quotes. |
|
|
+ resolve( unresolved, tokenConsumer ); |
|
|
+ |
|
|
+ // Notify of any unambiguous quotes that could not be resolved. |
|
|
+ unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) ); |
|
|
+ unresolved.clear(); |
|
|
+ mOpeningSingleQuote = 0; |
|
|
+ mClosingSingleQuote = 0; |
|
|
+ } |
|
|
+ } |
|
|
+ |
|
|
+ // By loop's end, the lexemes list contains tokens for all except the |
|
|
+ // final two elements (from tokenizing in triplets). Tokenize the remaining |
|
|
+ // unprocessed lexemes. |
|
|
+ tokenize( EOT, lexemes, tokenConsumer, unresolved ); |
|
|
+ tokenize( EOT, lexemes, tokenConsumer, unresolved ); |
|
|
+ |
|
|
+ // Attempt to resolve any remaining unambiguous quotes. |
|
|
+ resolve( unresolved, tokenConsumer ); |
|
|
+ |
|
|
+ // Notify of any unambiguous quotes that could not be resolved. |
|
|
+ unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) ); |
|
|
+ } |
|
|
+ |
|
|
+ /** |
|
|
+ * Converts {@link Lexeme}s identified as straight quotes into {@link Token}s |
|
|
+ * that represent the curly equivalent. The {@link Token}s are passed to |
|
|
+ * the given {@link Consumer} for further processing (e.g., replaced in |
|
|
+ * the original text being parsed). |
|
|
+ * |
|
|
+ * @param lexeme A part of the text being parsed. |
|
|
+ * @param lexemes A 3-element queue of lexemes that provide sufficient |
|
|
+ * context to identify curly quotes. |
|
|
+ * @param consumer Recipient of equivalent quotes. |
|
|
+ * @param unresolved Rolling list of potentially ambiguous {@link Lexeme}s |
|
|
+ * that could not be tokenized, yet. |
|
|
+ * @return {@code true} if an end-of-paragraph is detected. |
|
|
+ */ |
|
|
+ private boolean tokenize( final Lexeme lexeme, |
|
|
+ final CircularFifoQueue<Lexeme> lexemes, |
|
|
+ final Consumer<Token> consumer, |
|
|
+ final List<Lexeme[]> unresolved ) { |
|
|
+ // Add the next lexeme to tokenize into the queue for immediate processing. |
|
|
+ lexemes.add( lexeme ); |
|
|
+ |
|
|
+ final var lex1 = lexemes.get( 0 ); |
|
|
+ final var lex2 = lexemes.get( 1 ); |
|
|
+ final var lex3 = lexemes.get( 2 ); |
|
|
+ |
|
|
+ if( lex2.isType( QUOTE_SINGLE ) && lex3.isType( WORD ) && |
|
|
+ lex1.anyType( WORD, PERIOD, NUMBER ) ) { |
|
|
+ // Examples: y'all, Ph.D.'ll, 20's, she's |
|
|
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); |
|
|
+ } |
|
|
+ else if( lex1.isType( QUOTE_SINGLE ) && lex3.isType( QUOTE_SINGLE ) && |
|
|
+ "n".equalsIgnoreCase( lex2.toString( mText ) ) ) { |
|
|
+ // I.e., 'n' |
|
|
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) ); |
|
|
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) ); |
|
|
+ flush( lexemes ); |
|
|
+ truncate( unresolved ); |
|
|
+ } |
|
|
+ else if( lex2.isType( QUOTE_SINGLE ) && lex1.isType( NUMBER ) ) { |
|
|
+ if( lex3.isType( QUOTE_SINGLE ) ) { |
|
|
+ // E.g., 2'' |
|
|
+ consumer.accept( |
|
|
+ new Token( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ) ); |
|
|
+ flush( lexemes ); |
|
|
+ } |
|
|
+ else { |
|
|
+ // E.g., 2' |
|
|
+ consumer.accept( new Token( QUOTE_PRIME_SINGLE, lex2 ) ); |
|
|
+ } |
|
|
+ } |
|
|
+ else if( lex2.isType( QUOTE_DOUBLE ) && lex1.isType( NUMBER ) ) { |
|
|
+ // E.g., 2" |
|
|
+ consumer.accept( new Token( QUOTE_PRIME_DOUBLE, lex2 ) ); |
|
|
+ } |
|
|
+ else if( lex2.isType( WORD ) && lex3.isType( QUOTE_SINGLE ) && |
|
|
+ sContractions.endedUnambiguously( lex2.toString( mText ) ) ) { |
|
|
+ // E.g., thinkin' |
|
|
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex3 ) ); |
|
|
+ flush( lexemes ); |
|
|
+ } |
|
|
+ else if( lex2.isType( NUMBER ) && lex1.isType( QUOTE_SINGLE ) ) { |
|
|
+ if( lex3.anyType( SPACE, PUNCT ) || (lex3.isType( WORD ) && |
|
|
+ lex3.toString( mText ).equalsIgnoreCase( "s" )) ) { |
|
|
+ // Sentences must be rewritten to avoid starting with numerals. |
|
|
+ // Examples: '20s, '02 |
|
|
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) ); |
|
|
+ } |
|
|
+ else { |
|
|
+ // E.g., '2'' |
|
|
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex1 ) ); |
|
|
+ mOpeningSingleQuote++; |
|
|
+ } |
|
|
+ |
|
|
+ truncate( unresolved ); |
|
|
+ } |
|
|
+ else if( lex2.isType( QUOTE_SINGLE ) && |
|
|
+ lex1.anyType( PUNCT, PERIOD, ELLIPSIS, DASH ) && |
|
|
+ (lex3.anyType( EOL, EOP ) || lex3.isEot()) ) { |
|
|
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); |
|
|
+ mClosingSingleQuote++; |
|
|
+ } |
|
|
+ else if( lex1.isType( ESC_SINGLE ) ) { |
|
|
+ // E.g., \' |
|
|
+ consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) ); |
|
|
+ } |
|
|
+ else if( lex1.isType( ESC_DOUBLE ) ) { |
|
|
+ // E.g., \" |
|
|
+ consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) ); |
|
|
+ |
|
|
+ if( lex2.isType( QUOTE_SINGLE ) && |
|
|
+ (lex3.isEot() || lex3.anyType( SPACE, DASH, EOL, EOP )) ) { |
|
|
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); |
|
|
+ mClosingSingleQuote++; |
|
|
+ } |
|
|
+ } |
|
|
+ else if( lex2.isType( QUOTE_DOUBLE ) && |
|
|
+ (lex1.isSot() || lex1.anyType( LEADING_QUOTE_OPENING_DOUBLE )) && |
|
|
+ lex3.anyType( LAGGING_QUOTE_OPENING_DOUBLE ) ) { |
|
|
+ // Examples: "", "..., "word, ---"word |
|
|
+ consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) ); |
|
|
+ } |
|
|
+ else if( lex2.isType( QUOTE_DOUBLE ) && |
|
|
+ lex1.anyType( LEADING_QUOTE_CLOSING_DOUBLE ) && |
|
|
+ (lex3.isEot() || lex3.anyType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) { |
|
|
+ // Examples: ..."', word"', ?"', word"? |
|
|
+ consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) ); |
|
|
+ } |
|
|
+ else if( lex1.isType( QUOTE_SINGLE ) && |
|
|
+ lex2.anyType( PUNCT, PERIOD, DASH ) && lex3.isType( QUOTE_DOUBLE ) ) { |
|
|
+ // E.g., '," (contraction ruled out from previous conditionals) |
|
|
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex1 ) ); |
|
|
+ truncate( unresolved ); |
|
|
+ mClosingSingleQuote++; |
|
|
+ } |
|
|
+ else if( lex2.anyType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) { |
|
|
+ // After tokenizing, the parser will attempt to resolve ambiguities. |
|
|
+ unresolved.add( new Lexeme[]{lex1, lex2, lex3} ); |
|
|
+ } |
|
|
+ |
|
|
+ // Suggest to the caller that resolution should be performed. This allows |
|
|
+ // the algorithm to reset the opening/closing quote balance before the |
|
|
+ // next paragraph is parsed. |
|
|
+ return lex3.isType( EOP ); |
|
|
+ } |
|
|
+ |
|
|
+ private void resolve( |
|
|
+ final List<Lexeme[]> unresolved, final Consumer<Token> consumer ) { |
|
|
+ // Some non-emitted tokenized lexemes may be ambiguous. |
|
|
+ final var ambiguousLeadingQuotes = new ArrayList<Lexeme[]>( 16 ); |
|
|
+ final var ambiguousLaggingQuotes = new ArrayList<Lexeme[]>( 16 ); |
|
|
+ var resolvedLeadingQuotes = 0; |
|
|
+ var resolvedLaggingQuotes = 0; |
|
|
+ |
|
|
+ // Count the number of ambiguous and non-ambiguous open single quotes. |
|
|
+ for( var i = unresolved.iterator(); i.hasNext(); ) { |
|
|
+ final var quotes = i.next(); |
|
|
+ final var lex1 = quotes[ 0 ]; |
|
|
+ final var lex2 = quotes[ 1 ]; |
|
|
+ final var lex3 = quotes[ 2 ]; |
|
|
+ |
|
|
+ if( lex2.isType( QUOTE_SINGLE ) ) { |
|
|
+ final var word1 = lex1 == SOT ? "" : lex1.toString( mText ); |
|
|
+ final var word3 = lex3 == EOT ? "" : lex3.toString( mText ); |
|
|
+ |
|
|
+ if( sContractions.beganAmbiguously( word3 ) ) { |
|
|
+ // E.g., 'Cause |
|
|
+ if( lex1.isType( QUOTE_SINGLE ) ) { |
|
|
+ // E.g., ''Cause |
|
|
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); |
|
|
+ i.remove(); |
|
|
+ } |
|
|
+ else { |
|
|
+ // The contraction is uncertain until a closing quote is found that |
|
|
+ // may balance this single quote. |
|
|
+ ambiguousLeadingQuotes.add( quotes ); |
|
|
+ } |
|
|
+ } |
|
|
+ else if( sContractions.beganUnambiguously( word3 ) ) { |
|
|
+ // The quote mark forms a word that does not stand alone from its |
|
|
+ // contraction. For example, twas is not a word: it's 'twas. |
|
|
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) ); |
|
|
+ i.remove(); |
|
|
+ } |
|
|
+ else if( sContractions.endedAmbiguously( word1 ) ) { |
|
|
+ ambiguousLaggingQuotes.add( quotes ); |
|
|
+ } |
|
|
+ else if( (lex1.isSot() || lex1.anyType( LEADING_QUOTE_OPENING_SINGLE )) |
|
|
+ && lex3.anyType( LAGGING_QUOTE_OPENING_SINGLE ) ) { |
|
|
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) ); |
|
|
+ resolvedLeadingQuotes++; |
|
|
+ mOpeningSingleQuote++; |
|
|
+ i.remove(); |
|
|
+ } |
|
|
+ else if( lex1.anyType( LEADING_QUOTE_CLOSING_SINGLE ) && |
|
|
+ (lex3.isEot() || lex3.anyType( LAGGING_QUOTE_CLOSING_SINGLE )) ) { |
|
|
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) ); |
|
|
+ resolvedLaggingQuotes++; |
|
|
+ mClosingSingleQuote++; |
|
|
+ i.remove(); |
|
|
+ } |
|
|
+ else if( lex3.isType( NUMBER ) ) { |
|
|
+ // E.g., '04 |
|
|
+ ambiguousLeadingQuotes.add( quotes ); |
|
|
+ } |
|
|
+ } |
|
|
+ } |
|
|
+ |
|
|
+ final var ambiguousLeadingCount = ambiguousLeadingQuotes.size(); |
|
|
+ final var ambiguousLaggingCount = ambiguousLaggingQuotes.size(); |
|
|
+ |
|
|
+ if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) { |
|
|
+ if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) { |
|
|
+ final var balanced = mClosingSingleQuote - mOpeningSingleQuote == 0; |
|
|
+ final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE; |
|
|
+ final var lex = ambiguousLaggingQuotes.get( 0 ); |
|
|
+ consumer.accept( new Token( quote, lex[ 1 ] ) ); |
|
|
+ unresolved.remove( lex ); |
|
|
+ } |
|
|
+ else if( ambiguousLeadingCount == 0 && unresolved.size() == 1 ) { |
|
|
+ // Must be a closing quote. |
|
|
+ final var closing = unresolved.get( 0 ); |
|
|
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); |
|
|
+ unresolved.remove( closing ); |
|
|
+ } |
|
|
+ } |
|
|
+ else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) { |
|
|
+ // If there are no ambiguous leading quotes then all ambiguous lagging |
|
|
+ // quotes must be contractions. |
|
|
+ ambiguousLaggingQuotes.forEach( |
|
|
+ lex -> { |
|
|
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex[ 1 ] ) ); |
|
|
+ unresolved.remove( lex ); |
|
|
+ } |
|
|
+ ); |
|
|
+ } |
|
|
+ else if( ambiguousLeadingCount == 0 ) { |
|
|
+ if( resolvedLaggingQuotes < resolvedLeadingQuotes ) { |
|
|
+ for( final var i = unresolved.iterator(); i.hasNext(); ) { |
|
|
+ final var closing = i.next()[ 1 ]; |
|
|
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing ) ); |
|
|
+ i.remove(); |
|
|
+ } |
|
|
+ } |
|
|
+ else if( mOpeningSingleQuote - mClosingSingleQuote == unresolved.size() ) { |
|
|
+ for( final var i = unresolved.iterator(); i.hasNext(); ) { |
|
|
+ final var closing = i.next(); |
|
|
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); |
|
|
+ i.remove(); |
|
|
+ } |
|
|
+ } |
|
|
+ else if( unresolved.size() == 2 ) { |
|
|
+ final var closing = unresolved.get( 0 ); |
|
|
+ final var opening = unresolved.get( 1 ); |
|
|
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, closing[ 1 ] ) ); |
|
|
+ consumer.accept( new Token( QUOTE_OPENING_SINGLE, opening[ 1 ] ) ); |
|
|
+ |
|
|
+ // Doesn't affect the algorithm. |
|
|
+ unresolved.clear(); |
|
|
} |
|
|
} |