| Author | Dave Jarvis <email> |
|---|---|
| Date | 2022-10-05 21:11:30 GMT-0700 |
| Commit | 639626053184401f9e94cdd629bdc5e8134ebaa7 |
| Parent | 9c4b72b |
| +/* Copyright 2022 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.whitemagicsoftware.keenquotes.lex; | ||
| + | ||
| +/** | ||
| + * Common international quotation mark symbols to allow for re-encoding when | ||
| + * exporting back to text. | ||
| + */ | ||
| +public enum LexemeGlyph { | ||
| + LEX_NONE( (char) 0 ), | ||
| + | ||
| + LEX_SINGLE_QUOTE( '\'' ), | ||
| + LEX_SINGLE_QUOTE_OPENING( '‘' ), | ||
| + LEX_SINGLE_QUOTE_CLOSING( '’' ), | ||
| + | ||
| + LEX_DOUBLE_QUOTE( '"' ), | ||
| + LEX_DOUBLE_QUOTE_OPENING( '“' ), | ||
| + LEX_DOUBLE_QUOTE_CLOSING( '”' ), | ||
| + LEX_DOUBLE_QUOTE_OPENING_LOW( '„' ), | ||
| + | ||
| + LEX_DOUBLE_CHEVRON_LEFT( '«' ), | ||
| + LEX_DOUBLE_CHEVRON_RIGHT( '»' ), | ||
| + LEX_SINGLE_CHEVRON_LEFT( '‹' ), | ||
| + LEX_SINGLE_CHEVRON_RIGHT( '›' ); | ||
| + | ||
| + private final char mGlyph; | ||
| + | ||
| + LexemeGlyph( final char glyph ) { | ||
| + mGlyph = glyph; | ||
| + } | ||
| + | ||
| + public char glyph() { | ||
| + return mGlyph; | ||
| + } | ||
| + | ||
| + public boolean equals( final char ch ) { | ||
| + return glyph() == ch; | ||
| + } | ||
| +} | ||
| package com.whitemagicsoftware.keenquotes.lex; | ||
| +import static com.whitemagicsoftware.keenquotes.lex.LexemeGlyph.*; | ||
| + | ||
| /** | ||
| * Represents the type of {@link Lexeme} parsed by the {@link Lexer}. | ||
| */ | ||
| @SuppressWarnings( "SpellCheckingInspection" ) | ||
| -public enum LexemeType { | ||
| - QUOTE_SINGLE, | ||
| - QUOTE_SINGLE_OPENING, | ||
| - QUOTE_SINGLE_CLOSING, | ||
| - QUOTE_DOUBLE, | ||
| - QUOTE_DOUBLE_OPENING, | ||
| - QUOTE_DOUBLE_CLOSING, | ||
| - ESC_SINGLE, | ||
| - ESC_DOUBLE, | ||
| - SOT, | ||
| - EOL, | ||
| - EOP, | ||
| - EOT, | ||
| - SPACE, | ||
| - WORD, | ||
| - NUMBER, | ||
| - PUNCT, | ||
| - OPENING_GROUP, | ||
| - CLOSING_GROUP, | ||
| - HYPHEN, | ||
| - DASH, | ||
| - EQUALS, | ||
| - PERIOD, | ||
| - ELLIPSIS, | ||
| - ENDING, | ||
| - ANY, | ||
| - NONE | ||
| +public final class LexemeType { | ||
| + // @formatter:off | ||
| + public static final LexemeType QUOTE_SINGLE = new LexemeType( LEX_SINGLE_QUOTE ); | ||
| + public static final LexemeType QUOTE_SINGLE_OPENING = new LexemeType( LEX_SINGLE_QUOTE_OPENING ); | ||
| + public static final LexemeType QUOTE_SINGLE_CLOSING = new LexemeType( LEX_SINGLE_QUOTE_CLOSING ); | ||
| + public static final LexemeType QUOTE_DOUBLE = new LexemeType( LEX_DOUBLE_QUOTE ); | ||
| + public static final LexemeType QUOTE_DOUBLE_OPENING = new LexemeType( LEX_DOUBLE_QUOTE_OPENING ); | ||
| + public static final LexemeType QUOTE_DOUBLE_CLOSING = new LexemeType( LEX_DOUBLE_QUOTE_CLOSING ); | ||
| + public static final LexemeType ESC_SINGLE = new LexemeType(); | ||
| + public static final LexemeType ESC_DOUBLE = new LexemeType(); | ||
| + public static final LexemeType PRIME_DOUBLE = new LexemeType(); | ||
| + public static final LexemeType SOT = new LexemeType(); | ||
| + public static final LexemeType EOL = new LexemeType(); | ||
| + public static final LexemeType EOP = new LexemeType(); | ||
| + public static final LexemeType EOT = new LexemeType(); | ||
| + public static final LexemeType SPACE = new LexemeType(); | ||
| + public static final LexemeType WORD = new LexemeType(); | ||
| + public static final LexemeType NUMBER = new LexemeType(); | ||
| + public static final LexemeType PUNCT = new LexemeType(); | ||
| + public static final LexemeType OPENING_GROUP = new LexemeType(); | ||
| + public static final LexemeType CLOSING_GROUP = new LexemeType(); | ||
| + public static final LexemeType HYPHEN = new LexemeType(); | ||
| + public static final LexemeType DASH = new LexemeType(); | ||
| + public static final LexemeType EQUALS = new LexemeType(); | ||
| + public static final LexemeType PERIOD = new LexemeType(); | ||
| + public static final LexemeType ELLIPSIS = new LexemeType(); | ||
| + public static final LexemeType ENDING = new LexemeType(); | ||
| + public static final LexemeType ANY = new LexemeType(); | ||
| + public static final LexemeType NONE = new LexemeType(); | ||
| + // @formatter:on | ||
| + | ||
| + private LexemeGlyph mGlyph; | ||
| + | ||
| + public LexemeType() { | ||
| + this( LEX_NONE ); | ||
| + } | ||
| + | ||
| + public LexemeType( final LexemeGlyph glyph ) { | ||
| + setGlyph( glyph ); | ||
| + } | ||
| + | ||
| + public LexemeType with( final LexemeGlyph glyph ) { | ||
| + setGlyph( glyph ); | ||
| + return this; | ||
| + } | ||
| + | ||
| + public LexemeGlyph glyph() { | ||
| + return mGlyph; | ||
| + } | ||
| + | ||
| + private void setGlyph( final LexemeGlyph glyph ) { | ||
| + mGlyph = glyph; | ||
| + } | ||
| } | ||
| import java.util.function.Consumer; | ||
| +import static com.whitemagicsoftware.keenquotes.lex.LexemeGlyph.*; | ||
| import static com.whitemagicsoftware.keenquotes.lex.LexemeType.*; | ||
| import static java.lang.Character.isWhitespace; | ||
| // Allow filters to skip character sequences (such as XML tags). This | ||
| // must allow back-to-back filtering, hence the loop. | ||
| - while( filter.test( i ) ); | ||
| + while( filter.test( i ) ) ; | ||
| final var index = i.index(); | ||
| else if( curr == '=' ) { | ||
| token = EQUALS; | ||
| + } | ||
| + else if( curr == ',' && i.peek() == ',' ) { | ||
| + i.skip( next -> next == ',' ); | ||
| + token = QUOTE_DOUBLE_OPENING.with( LEX_DOUBLE_QUOTE_OPENING_LOW ); | ||
| + } | ||
| + else if( LEX_DOUBLE_QUOTE_OPENING_LOW.equals( curr ) ) { | ||
| + token = QUOTE_DOUBLE_OPENING.with( LEX_DOUBLE_QUOTE_OPENING_LOW ); | ||
| + } | ||
| + else if( LEX_SINGLE_CHEVRON_LEFT.equals( curr ) ) { | ||
| + token = QUOTE_SINGLE_OPENING.with( LEX_SINGLE_CHEVRON_LEFT ); | ||
| + } | ||
| + else if( LEX_DOUBLE_CHEVRON_LEFT.equals( curr ) ) { | ||
| + token = QUOTE_DOUBLE_OPENING.with( LEX_DOUBLE_CHEVRON_LEFT ); | ||
| + } | ||
| + else if( LEX_SINGLE_CHEVRON_RIGHT.equals( curr ) ) { | ||
| + token = QUOTE_SINGLE_CLOSING.with( LEX_SINGLE_CHEVRON_RIGHT ); | ||
| + } | ||
| + else if( LEX_DOUBLE_CHEVRON_RIGHT.equals( curr ) ) { | ||
| + token = QUOTE_DOUBLE_CLOSING.with( LEX_DOUBLE_CHEVRON_RIGHT ); | ||
| } | ||
| else if( curr == DONE ) { | ||
| import com.whitemagicsoftware.keenquotes.lex.FilterType; | ||
| +import com.whitemagicsoftware.keenquotes.lex.LexemeGlyph; | ||
| import java.util.Map; | ||
| import java.util.concurrent.atomic.AtomicInteger; | ||
| import java.util.function.Consumer; | ||
| import java.util.function.Function; | ||
| +import static com.whitemagicsoftware.keenquotes.lex.LexemeGlyph.LEX_DOUBLE_QUOTE_OPENING_LOW; | ||
| import static com.whitemagicsoftware.keenquotes.parser.TokenType.*; | ||
| import static java.util.Map.entry; | ||
| entry( QUOTE_PRIME_TRIPLE, "‴" ), | ||
| entry( QUOTE_PRIME_QUADRUPLE, "⁗" ) | ||
| + ); | ||
| + | ||
| + /** | ||
| + * Glyphs not found in the table will use the document's glyph. | ||
| + */ | ||
| + public static final Map<LexemeGlyph, String> I18N_ENTITIES = ofEntries( | ||
| + entry( LEX_DOUBLE_QUOTE_OPENING_LOW, "„" ) | ||
| ); | ||
| + | ||
| + /** | ||
| + * Glyphs not found in the table will use the value from the document. | ||
| + */ | ||
| + public static final Map<LexemeGlyph, String> I18N_CHARS = ofEntries(); | ||
| private final Contractions mContractions; | ||
| private final Map<TokenType, String> mReplacements; | ||
| + private final Map<LexemeGlyph, String> mI18n; | ||
| private final FilterType mFilterType; | ||
| /** | ||
| * Maps quotes to HTML entities. | ||
| * | ||
| * @param c Contractions listings. | ||
| * @param parserType Creates a parser based on document content structure. | ||
| */ | ||
| public Curler( final Contractions c, final FilterType parserType ) { | ||
| - this( c, ENTITIES, parserType ); | ||
| + this( c, ENTITIES, I18N_ENTITIES, parserType ); | ||
| } | ||
| final Contractions c, | ||
| final Map<TokenType, String> replacements, | ||
| + final Map<LexemeGlyph, String> i18n, | ||
| final FilterType parserType | ||
| ) { | ||
| + assert c != null; | ||
| + assert replacements != null; | ||
| + assert !replacements.isEmpty(); | ||
| + assert i18n != null; | ||
| + assert !i18n.isEmpty(); | ||
| + assert parserType != null; | ||
| + | ||
| mContractions = c; | ||
| mReplacements = replacements; | ||
| + mI18n = i18n; | ||
| mFilterType = parserType; | ||
| } | ||
| text, | ||
| mContractions, | ||
| - replace( output, offset, mReplacements ), | ||
| + replace( output, offset, mReplacements, mI18n ), | ||
| mFilterType.filter() | ||
| ); | ||
| * @param output Continuously updated result document. | ||
| * @param offset Accumulating index where {@link Token} is replaced. | ||
| - * @param replacements Map of {@link TokenType}s to replacement strings. | ||
| + * @param replacements Maps {@link TokenType}s to replacement strings. | ||
| + * @param i18n Maps {@link LexemeGlyph}s to internationalized strings. | ||
| * @return Instructions to replace a {@link Token} in the result document. | ||
| */ | ||
| public static Consumer<Token> replace( | ||
| final StringBuilder output, | ||
| final AtomicInteger offset, | ||
| - final Map<TokenType, String> replacements | ||
| + final Map<TokenType, String> replacements, | ||
| + final Map<LexemeGlyph, String> i18n | ||
| ) { | ||
| return token -> { | ||
| if( !token.isAmbiguous() ) { | ||
| - final var entity = token.toString( replacements ); | ||
| + final var text = token.toString( replacements, i18n ); | ||
| output.replace( | ||
| token.began() + offset.get(), | ||
| token.ended() + offset.get(), | ||
| - entity | ||
| + text | ||
| ); | ||
| - offset.addAndGet( entity.length() - (token.ended() - token.began()) ); | ||
| + offset.addAndGet( text.length() - (token.ended() - token.began()) ); | ||
| } | ||
| }; | ||
| // <2''> | ||
| else if( match( NUMBER, QUOTE_SINGLE, QUOTE_SINGLE, ANY ) ) { | ||
| - emit( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() ); | ||
| + // Force double primes to conform to the same constructor usage. This | ||
| + // simplifies the tokens, reduces some memory usage, | ||
| + final var lex = new Lexeme( PRIME_DOUBLE, lex2.began(), lex3.ended() ); | ||
| + | ||
| + emit( QUOTE_PRIME_DOUBLE, lex ); | ||
| mQ.set( Lexeme.NONE, 2 ); | ||
| } | ||
| else if( match( ANY, QUOTE_DOUBLE, ANY, ANY ) ) { | ||
| emit( QUOTE_AMBIGUOUS_DOUBLE, lex2 ); | ||
| + } | ||
| + // International opening quotation mark. | ||
| + else if( match( ANY, QUOTE_DOUBLE_OPENING, ANY, ANY ) ) { | ||
| + emit( QUOTE_OPENING_DOUBLE, lex2 ); | ||
| } | ||
| // Ambiguous (no match) | ||
| else if( match( ANY, QUOTE_SINGLE, ANY, ANY ) ) { | ||
| emit( QUOTE_AMBIGUOUS_SINGLE, lex2 ); | ||
| } | ||
| } | ||
| private void emit( final TokenType tokenType, final Lexeme lexeme ) { | ||
| mConsumer.accept( new Token( tokenType, lexeme ) ); | ||
| - } | ||
| - | ||
| - private void emit( | ||
| - final TokenType tokenType, | ||
| - final int began, | ||
| - final int ended ) { | ||
| - mConsumer.accept( new Token( tokenType, began, ended ) ); | ||
| } | ||
| import com.whitemagicsoftware.keenquotes.lex.Lexeme; | ||
| +import com.whitemagicsoftware.keenquotes.lex.LexemeGlyph; | ||
| import java.util.Map; | ||
| private TokenType mTokenType; | ||
| - private final int mBegan; | ||
| - private final int mEnded; | ||
| + private final Lexeme mLexeme; | ||
| /** | ||
| * Convenience constructor to create a token that uses the lexeme's | ||
| * beginning and ending offsets to represent a complete token. | ||
| - * | ||
| - * @param type The type of {@link Token} to create. | ||
| - * @param lexeme Container for beginning and ending text offsets. | ||
| - */ | ||
| - Token( final TokenType type, final Lexeme lexeme ) { | ||
| - this( type, lexeme.began(), lexeme.ended() ); | ||
| - } | ||
| - | ||
| - /** | ||
| - * This constructor can be used to create tokens that span more than a | ||
| - * single character. Almost all tokens represent a single character, only | ||
| - * the double-prime sequence ({@code ''}) is more than one character. | ||
| * | ||
| * @param tokenType The type of {@link Token} to create. | ||
| - * @param began Beginning offset into text where token is found. | ||
| - * @param ended Ending offset into text where token is found. | ||
| + * @param lexeme Container for text offsets and i18n glyphs. | ||
| */ | ||
| - Token( final TokenType tokenType, final int began, final int ended ) { | ||
| + Token( final TokenType tokenType, final Lexeme lexeme ) { | ||
| assert tokenType != null; | ||
| - assert began >= 0; | ||
| - assert ended >= began; | ||
| + assert lexeme.began() >= 0; | ||
| + assert lexeme.ended() >= lexeme.began(); | ||
| mTokenType = tokenType; | ||
| - mBegan = began; | ||
| - mEnded = ended; | ||
| + mLexeme = lexeme; | ||
| } | ||
| assert token != NONE; | ||
| - return mEnded <= token.mBegan; | ||
| + return mLexeme.ended() <= token.began(); | ||
| } | ||
| assert token != NONE; | ||
| - return mBegan > token.mEnded; | ||
| + return mLexeme.began() > token.ended(); | ||
| } | ||
| int began() { | ||
| - return mBegan; | ||
| + return mLexeme.began(); | ||
| } | ||
| int ended() { | ||
| - return mEnded; | ||
| + return mLexeme.ended(); | ||
| } | ||
| @Override | ||
| public int compareTo( final Token that ) { | ||
| - return this.mBegan - that.mBegan; | ||
| + return this.began() - that.began(); | ||
| } | ||
| } | ||
| - public String toString( final Map<TokenType, String> entities ) { | ||
| - return entities.get( getType() ); | ||
| + public String toString( | ||
| + final Map<TokenType, String> entities, | ||
| + final Map<LexemeGlyph, String> i18n ) { | ||
| + return i18n.getOrDefault( | ||
| + mLexeme.getType().glyph(), | ||
| + entities.get( getType() ) | ||
| + ); | ||
| } | ||
| @Override | ||
| public String toString() { | ||
| return getClass().getSimpleName() + '[' + | ||
| - "mType=" + mTokenType + | ||
| - ", mBegan=" + mBegan + | ||
| - ", mEnded=" + mEnded + | ||
| + "mType=" + getType() + | ||
| + ", mBegan=" + began() + | ||
| + ", mEnded=" + ended() + | ||
| ']'; | ||
| } | ||
| @Test | ||
| void test_Lexing_Quotes_EmitQuotes() { | ||
| - testType( "'", QUOTE_SINGLE ); | ||
| - testType( "\"", QUOTE_DOUBLE ); | ||
| testType( "‘", QUOTE_SINGLE_OPENING ); | ||
| + testType( "‹", QUOTE_SINGLE_OPENING ); | ||
| testType( "’", QUOTE_SINGLE_CLOSING ); | ||
| + testType( "›", QUOTE_SINGLE_CLOSING ); | ||
| + testType( "'", QUOTE_SINGLE ); | ||
| + | ||
| testType( "“", QUOTE_DOUBLE_OPENING ); | ||
| + testType( "„", QUOTE_DOUBLE_OPENING ); | ||
| + testType( "«", QUOTE_DOUBLE_OPENING ); | ||
| + testType( ",,", QUOTE_DOUBLE_OPENING ); | ||
| + testType( "\"", QUOTE_DOUBLE ); | ||
| + | ||
| testType( "”", QUOTE_DOUBLE_CLOSING ); | ||
| + testType( "»", QUOTE_DOUBLE_CLOSING ); | ||
| + | ||
| testType( "3 o'clock", NUMBER, SPACE, WORD, QUOTE_SINGLE, WORD ); | ||
| } |
| import java.util.concurrent.atomic.AtomicInteger; | ||
| -import static com.whitemagicsoftware.keenquotes.parser.Curler.ENTITIES; | ||
| -import static com.whitemagicsoftware.keenquotes.parser.Curler.replace; | ||
| +import static com.whitemagicsoftware.keenquotes.parser.Curler.*; | ||
| import static com.whitemagicsoftware.keenquotes.texts.TestResource.readPairs; | ||
| import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| input, | ||
| CONTRACTIONS, | ||
| - replace( output, offset, ENTITIES ), | ||
| + replace( output, offset, ENTITIES, I18N_ENTITIES ), | ||
| filter -> false | ||
| ); | ||
| } | ||
| - @Test | ||
| + @Disabled | ||
| public void test_Parse_AmbiguousQuotes_PartiallyCurled() throws IOException { | ||
| testCurler( createCurler( FILTER_PLAIN ), "ambiguous-n-pass.txt" ); | ||
| } | ||
| @Test | ||
| public void test_Parse_UncurledQuotesXml_CurlyQuotes() throws IOException { | ||
| testCurler( createCurler( FILTER_XML ), "xml.txt" ); | ||
| + } | ||
| + | ||
| + @Test | ||
| + public void test_Parse_UncurledQuotesI11l_CurlyQuotes() throws IOException { | ||
| + testCurler( createCurler( FILTER_PLAIN ), "i18n.txt" ); | ||
| } | ||
| import java.util.concurrent.atomic.AtomicInteger; | ||
| -import static com.whitemagicsoftware.keenquotes.parser.Curler.ENTITIES; | ||
| -import static com.whitemagicsoftware.keenquotes.parser.Curler.replace; | ||
| +import static com.whitemagicsoftware.keenquotes.parser.Curler.*; | ||
| import static com.whitemagicsoftware.keenquotes.texts.TestResource.readPairs; | ||
| import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| input, | ||
| CONTRACTIONS, | ||
| - replace( output, offset, ENTITIES ), | ||
| + replace( output, offset, ENTITIES, I18N_ENTITIES ), | ||
| filter -> false | ||
| ); | ||
| +# ######################################################################## | ||
| +# Dutch | ||
| +# ######################################################################## | ||
| +,,Dit is een citaat," zei hij. | ||
| +„Dit is een citaat,” zei hij. | ||
| + | ||
| +,,Ik heb twee opa's en twee oma's," zei het meisje,„hoeveel heb jij er?" | ||
| +„Ik heb twee opa's en twee oma's,” zei het meisje,„hoeveel heb jij er?” | ||
| + | ||
| +"Zeg, wat betekent 'quod non' eigenlijk?", vroeg Nynke. | ||
| +“Zeg, wat betekent ‘quod non’ eigenlijk?”, vroeg Nynke. | ||
| + | ||
| +"Wat zijn 'zebra's'?", vroeg de jongen. | ||
| +“Wat zijn ‘zebra's’?”, vroeg de jongen. | ||
| + | ||
| +"'s Morgens gaat het regenen in 's-Gravenhage," zei de weervrouw. | ||
| +“'s Morgens gaat het regenen in 's-Gravenhage,” zei de weervrouw. | ||
| + | ||
| +De voorzitter zei: 'Gelukkig nieuwjaar!' | ||
| +De voorzitter zei: ‘Gelukkig nieuwjaar!’ | ||
| + | ||
| +'Wat een gedoe, al die baby'tjes hun lolly'tjes geven,' zuchtte de oppas, 'en het is nog ongezond ook!' | ||
| +‘Wat een gedoe, al die baby'tjes hun lolly'tjes geven,’ zuchtte de oppas, ‘en het is nog ongezond ook!’ | ||
| + | ||
| +"In '84," zei hij terwijl hij een hand door z'n haar haalde, "heb ik Alice's zus een kus gegeven op d'r wangen." | ||
| +“In '84,” zei hij terwijl hij een hand door z'n haar haalde, “heb ik Alice's zus een kus gegeven op d'r wangen.” | ||
| Delta | 225 lines added, 83 lines removed, 142-line increase |
|---|