Recognize double quote low as opening double quotation marks

Author	Dave Jarvis <email>
Date	2022-10-05 21:11:30 GMT-0700
Commit	639626053184401f9e94cdd629bdc5e8134ebaa7
Parent	9c4b72b

src/main/java/com/whitemagicsoftware/keenquotes/lex/LexemeGlyph.java

		+/* Copyright 2022 White Magic Software, Ltd. -- All rights reserved. */
		+package com.whitemagicsoftware.keenquotes.lex;
		+
		+/**
		+ * Common international quotation mark symbols to allow for re-encoding when
		+ * exporting back to text.
		+ * <p>
		+ * Each constant wraps exactly one {@code char}, retrievable via
		+ * {@link #glyph()}.
		+ * </p>
		+ */
		+public enum LexemeGlyph {
		+ /** Placeholder wrapping the NUL character ({@code (char) 0}). */
		+ LEX_NONE( (char) 0 ),
		+
		+ // Straight and curly single quotation marks.
		+ LEX_SINGLE_QUOTE( '\'' ),
		+ LEX_SINGLE_QUOTE_OPENING( '‘' ),
		+ LEX_SINGLE_QUOTE_CLOSING( '’' ),
		+
		+ // Straight, curly, and low double quotation marks.
		+ LEX_DOUBLE_QUOTE( '"' ),
		+ LEX_DOUBLE_QUOTE_OPENING( '“' ),
		+ LEX_DOUBLE_QUOTE_CLOSING( '”' ),
		+ LEX_DOUBLE_QUOTE_OPENING_LOW( '„' ),
		+
		+ // Double and single angle quotation marks (chevrons).
		+ LEX_DOUBLE_CHEVRON_LEFT( '«' ),
		+ LEX_DOUBLE_CHEVRON_RIGHT( '»' ),
		+ LEX_SINGLE_CHEVRON_LEFT( '‹' ),
		+ LEX_SINGLE_CHEVRON_RIGHT( '›' );
		+
		+ /** Character represented by this constant. */
		+ private final char mGlyph;
		+
		+ /**
		+ * Associates a constant with its character.
		+ *
		+ * @param glyph The character this constant represents.
		+ */
		+ LexemeGlyph( final char glyph ) {
		+ mGlyph = glyph;
		+ }
		+
		+ /**
		+ * Returns the character represented by this constant.
		+ *
		+ * @return The character given at construction time.
		+ */
		+ public char glyph() {
		+ return mGlyph;
		+ }
		+
		+ /**
		+ * Answers whether the given character is this constant's character.
		+ * Note that this overloads, rather than overrides,
		+ * {@link Object#equals(Object)} because the parameter is a {@code char}.
		+ *
		+ * @param ch The character to compare against {@link #glyph()}.
		+ * @return {@code true} when {@code ch} is the same character as this glyph.
		+ */
		+ public boolean equals( final char ch ) {
		+ return glyph() == ch;
		+ }
		+}

src/main/java/com/whitemagicsoftware/keenquotes/lex/LexemeType.java

		package com.whitemagicsoftware.keenquotes.lex;

		+import static com.whitemagicsoftware.keenquotes.lex.LexemeGlyph.*;
		+
		/**
		* Represents the type of {@link Lexeme} parsed by the {@link Lexer}.
		*/
		@SuppressWarnings( "SpellCheckingInspection" )
		-public enum LexemeType {
		- QUOTE_SINGLE,
		- QUOTE_SINGLE_OPENING,
		- QUOTE_SINGLE_CLOSING,
		- QUOTE_DOUBLE,
		- QUOTE_DOUBLE_OPENING,
		- QUOTE_DOUBLE_CLOSING,
		- ESC_SINGLE,
		- ESC_DOUBLE,
		- SOT,
		- EOL,
		- EOP,
		- EOT,
		- SPACE,
		- WORD,
		- NUMBER,
		- PUNCT,
		- OPENING_GROUP,
		- CLOSING_GROUP,
		- HYPHEN,
		- DASH,
		- EQUALS,
		- PERIOD,
		- ELLIPSIS,
		- ENDING,
		- ANY,
		- NONE
		+public final class LexemeType {
		+ // Shared constants that replace the former enum values. Constants built
		+ // with the no-argument constructor start with the LEX_NONE glyph.
		+ // @formatter:off
		+ public static final LexemeType QUOTE_SINGLE = new LexemeType( LEX_SINGLE_QUOTE );
		+ public static final LexemeType QUOTE_SINGLE_OPENING = new LexemeType( LEX_SINGLE_QUOTE_OPENING );
		+ public static final LexemeType QUOTE_SINGLE_CLOSING = new LexemeType( LEX_SINGLE_QUOTE_CLOSING );
		+ public static final LexemeType QUOTE_DOUBLE = new LexemeType( LEX_DOUBLE_QUOTE );
		+ public static final LexemeType QUOTE_DOUBLE_OPENING = new LexemeType( LEX_DOUBLE_QUOTE_OPENING );
		+ public static final LexemeType QUOTE_DOUBLE_CLOSING = new LexemeType( LEX_DOUBLE_QUOTE_CLOSING );
		+ public static final LexemeType ESC_SINGLE = new LexemeType();
		+ public static final LexemeType ESC_DOUBLE = new LexemeType();
		+ public static final LexemeType PRIME_DOUBLE = new LexemeType();
		+ public static final LexemeType SOT = new LexemeType();
		+ public static final LexemeType EOL = new LexemeType();
		+ public static final LexemeType EOP = new LexemeType();
		+ public static final LexemeType EOT = new LexemeType();
		+ public static final LexemeType SPACE = new LexemeType();
		+ public static final LexemeType WORD = new LexemeType();
		+ public static final LexemeType NUMBER = new LexemeType();
		+ public static final LexemeType PUNCT = new LexemeType();
		+ public static final LexemeType OPENING_GROUP = new LexemeType();
		+ public static final LexemeType CLOSING_GROUP = new LexemeType();
		+ public static final LexemeType HYPHEN = new LexemeType();
		+ public static final LexemeType DASH = new LexemeType();
		+ public static final LexemeType EQUALS = new LexemeType();
		+ public static final LexemeType PERIOD = new LexemeType();
		+ public static final LexemeType ELLIPSIS = new LexemeType();
		+ public static final LexemeType ENDING = new LexemeType();
		+ public static final LexemeType ANY = new LexemeType();
		+ public static final LexemeType NONE = new LexemeType();
		+ // @formatter:on
		+
		+ /** Glyph currently associated with this type; reassignable, not final. */
		+ private LexemeGlyph mGlyph;
		+
		+ /**
		+ * Creates a type associated with the {@code LEX_NONE} glyph.
		+ */
		+ public LexemeType() {
		+ this( LEX_NONE );
		+ }
		+
		+ /**
		+ * Creates a type associated with the given glyph.
		+ *
		+ * @param glyph The glyph to associate with this type.
		+ */
		+ public LexemeType( final LexemeGlyph glyph ) {
		+ setGlyph( glyph );
		+ }
		+
		+ /**
		+ * Reassigns the glyph of this instance and returns this same instance;
		+ * no copy is made.
		+ * <p>
		+ * NOTE(review): the public constants above are shared instances, so
		+ * calling this on a constant (e.g.,
		+ * {@code QUOTE_DOUBLE_OPENING.with( LEX_DOUBLE_QUOTE_OPENING_LOW )})
		+ * changes the glyph reported by every later use of that constant --
		+ * confirm this is intended.
		+ * </p>
		+ *
		+ * @param glyph The glyph to associate with this type from now on.
		+ * @return This instance, after its glyph has been changed.
		+ */
		+ public LexemeType with( final LexemeGlyph glyph ) {
		+ setGlyph( glyph );
		+ return this;
		+ }
		+
		+ /**
		+ * Returns the glyph most recently associated with this type.
		+ *
		+ * @return The glyph set by a constructor or by the latest call to
		+ * {@link #with(LexemeGlyph)}.
		+ */
		+ public LexemeGlyph glyph() {
		+ return mGlyph;
		+ }
		+
		+ /**
		+ * Stores the glyph for this type.
		+ *
		+ * @param glyph The glyph to store; not validated.
		+ */
		+ private void setGlyph( final LexemeGlyph glyph ) {
		+ mGlyph = glyph;
		+ }
		}

src/main/java/com/whitemagicsoftware/keenquotes/lex/Lexer.java

		import java.util.function.Consumer;

		+import static com.whitemagicsoftware.keenquotes.lex.LexemeGlyph.*;
		import static com.whitemagicsoftware.keenquotes.lex.LexemeType.*;
		import static java.lang.Character.isWhitespace;

		// Allow filters to skip character sequences (such as XML tags). This
		// must allow back-to-back filtering, hence the loop.
		- while( filter.test( i ) );
		+ while( filter.test( i ) ) ;

		final var index = i.index();

		else if( curr == '=' ) {
		token = EQUALS;
		+ }
		+ else if( curr == ',' && i.peek() == ',' ) {
		+ i.skip( next -> next == ',' );
		+ token = QUOTE_DOUBLE_OPENING.with( LEX_DOUBLE_QUOTE_OPENING_LOW );
		+ }
		+ else if( LEX_DOUBLE_QUOTE_OPENING_LOW.equals( curr ) ) {
		+ token = QUOTE_DOUBLE_OPENING.with( LEX_DOUBLE_QUOTE_OPENING_LOW );
		+ }
		+ else if( LEX_SINGLE_CHEVRON_LEFT.equals( curr ) ) {
		+ token = QUOTE_SINGLE_OPENING.with( LEX_SINGLE_CHEVRON_LEFT );
		+ }
		+ else if( LEX_DOUBLE_CHEVRON_LEFT.equals( curr ) ) {
		+ token = QUOTE_DOUBLE_OPENING.with( LEX_DOUBLE_CHEVRON_LEFT );
		+ }
		+ else if( LEX_SINGLE_CHEVRON_RIGHT.equals( curr ) ) {
		+ token = QUOTE_SINGLE_CLOSING.with( LEX_SINGLE_CHEVRON_RIGHT );
		+ }
		+ else if( LEX_DOUBLE_CHEVRON_RIGHT.equals( curr ) ) {
		+ token = QUOTE_DOUBLE_CLOSING.with( LEX_DOUBLE_CHEVRON_RIGHT );
		}
		else if( curr == DONE ) {

src/main/java/com/whitemagicsoftware/keenquotes/parser/Curler.java


		import com.whitemagicsoftware.keenquotes.lex.FilterType;
		+import com.whitemagicsoftware.keenquotes.lex.LexemeGlyph;

		import java.util.Map;
		import java.util.concurrent.atomic.AtomicInteger;
		import java.util.function.Consumer;
		import java.util.function.Function;

		+import static com.whitemagicsoftware.keenquotes.lex.LexemeGlyph.LEX_DOUBLE_QUOTE_OPENING_LOW;
		import static com.whitemagicsoftware.keenquotes.parser.TokenType.*;
		import static java.util.Map.entry;

		entry( QUOTE_PRIME_TRIPLE, "‴" ),
		entry( QUOTE_PRIME_QUADRUPLE, "⁗" )
		+ );
		+
		+ /**
		+ * Maps international glyphs to the strings that replace them.
		+ * <p>
		+ * NOTE(review): this comment previously stated that glyphs not found in
		+ * the table use the document's glyph, yet Token's two-map toString falls
		+ * back to the replacement mapped to the token's {@link TokenType} --
		+ * confirm which behaviour is intended.
		+ * </p>
		+ */
		+ public static final Map<LexemeGlyph, String> I18N_ENTITIES = ofEntries(
		+ entry( LEX_DOUBLE_QUOTE_OPENING_LOW, "„" )
		);
		+
		+ /**
		+ * Glyph-to-string table that has no entries.
		+ * <p>
		+ * NOTE(review): this comment previously stated that glyphs not found in
		+ * the table use the value from the document, yet Token's two-map toString
		+ * falls back to the replacement mapped to the token's {@link TokenType}.
		+ * Also, the constructor that accepts an i18n map asserts that the map is
		+ * not empty, so passing this table would trip that assertion when
		+ * assertions are enabled -- confirm intended usage.
		+ * </p>
		+ */
		+ public static final Map<LexemeGlyph, String> I18N_CHARS = ofEntries();

		private final Contractions mContractions;
		private final Map<TokenType, String> mReplacements;
		+ private final Map<LexemeGlyph, String> mI18n;
		private final FilterType mFilterType;

		/**
		* Maps quotes to HTML entities.
		*
		* @param c Contractions listings.
		* @param parserType Creates a parser based on document content structure.
		*/
		public Curler( final Contractions c, final FilterType parserType ) {
		- this( c, ENTITIES, parserType );
		+ this( c, ENTITIES, I18N_ENTITIES, parserType );
		}


		final Contractions c,
		final Map<TokenType, String> replacements,
		+ final Map<LexemeGlyph, String> i18n,
		final FilterType parserType
		) {
		+ assert c != null;
		+ assert replacements != null;
		+ assert !replacements.isEmpty();
		+ assert i18n != null;
		+ assert !i18n.isEmpty();
		+ assert parserType != null;
		+
		mContractions = c;
		mReplacements = replacements;
		+ mI18n = i18n;
		mFilterType = parserType;
		}

		text,
		mContractions,
		- replace( output, offset, mReplacements ),
		+ replace( output, offset, mReplacements, mI18n ),
		mFilterType.filter()
		);

		* @param output Continuously updated result document.
		* @param offset Accumulating index where {@link Token} is replaced.
		- * @param replacements Map of {@link TokenType}s to replacement strings.
		+ * @param replacements Maps {@link TokenType}s to replacement strings.
		+ * @param i18n Maps {@link LexemeGlyph}s to internationalized strings.
		* @return Instructions to replace a {@link Token} in the result document.
		*/
		public static Consumer<Token> replace(
		final StringBuilder output,
		final AtomicInteger offset,
		- final Map<TokenType, String> replacements
		+ final Map<TokenType, String> replacements,
		+ final Map<LexemeGlyph, String> i18n
		) {
		return token -> {
		if( !token.isAmbiguous() ) {
		- final var entity = token.toString( replacements );
		+ final var text = token.toString( replacements, i18n );

		output.replace(
		token.began() + offset.get(),
		token.ended() + offset.get(),
		- entity
		+ text
		);

		- offset.addAndGet( entity.length() - (token.ended() - token.began()) );
		+ offset.addAndGet( text.length() - (token.ended() - token.began()) );
		}
		};

src/main/java/com/whitemagicsoftware/keenquotes/parser/QuoteEmitter.java

		// <2''>
		else if( match( NUMBER, QUOTE_SINGLE, QUOTE_SINGLE, ANY ) ) {
		- emit( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() );
		+ // Force double primes to conform to the same constructor usage. This
		+ // simplifies the tokens and reduces some memory usage.
		+ final var lex = new Lexeme( PRIME_DOUBLE, lex2.began(), lex3.ended() );
		+
		+ emit( QUOTE_PRIME_DOUBLE, lex );
		mQ.set( Lexeme.NONE, 2 );
		}

		else if( match( ANY, QUOTE_DOUBLE, ANY, ANY ) ) {
		emit( QUOTE_AMBIGUOUS_DOUBLE, lex2 );
		+ }
		+ // International opening quotation mark.
		+ else if( match( ANY, QUOTE_DOUBLE_OPENING, ANY, ANY ) ) {
		+ emit( QUOTE_OPENING_DOUBLE, lex2 );
		}
		// Ambiguous (no match)
		else if( match( ANY, QUOTE_SINGLE, ANY, ANY ) ) {
		emit( QUOTE_AMBIGUOUS_SINGLE, lex2 );
		}
		}

		private void emit( final TokenType tokenType, final Lexeme lexeme ) {
		mConsumer.accept( new Token( tokenType, lexeme ) );
		- }
		-
		- private void emit(
		- final TokenType tokenType,
		- final int began,
		- final int ended ) {
		- mConsumer.accept( new Token( tokenType, began, ended ) );
		}

src/main/java/com/whitemagicsoftware/keenquotes/parser/Token.java


		import com.whitemagicsoftware.keenquotes.lex.Lexeme;
		+import com.whitemagicsoftware.keenquotes.lex.LexemeGlyph;

		import java.util.Map;


		private TokenType mTokenType;
		- private final int mBegan;
		- private final int mEnded;
		+ private final Lexeme mLexeme;

		/**
		* Convenience constructor to create a token that uses the lexeme's
		* beginning and ending offsets to represent a complete token.
		- *
		- * @param type The type of {@link Token} to create.
		- * @param lexeme Container for beginning and ending text offsets.
		- */
		- Token( final TokenType type, final Lexeme lexeme ) {
		- this( type, lexeme.began(), lexeme.ended() );
		- }
		-
		- /**
		- * This constructor can be used to create tokens that span more than a
		- * single character. Almost all tokens represent a single character, only
		- * the double-prime sequence ({@code ''}) is more than one character.
		*
		* @param tokenType The type of {@link Token} to create.
		- * @param began Beginning offset into text where token is found.
		- * @param ended Ending offset into text where token is found.
		+ * @param lexeme Container for text offsets and i18n glyphs.
		*/
		- Token( final TokenType tokenType, final int began, final int ended ) {
		+ Token( final TokenType tokenType, final Lexeme lexeme ) {
		assert tokenType != null;
		- assert began >= 0;
		- assert ended >= began;
		+ assert lexeme.began() >= 0;
		+ assert lexeme.ended() >= lexeme.began();

		mTokenType = tokenType;
		- mBegan = began;
		- mEnded = ended;
		+ mLexeme = lexeme;
		}


		assert token != NONE;

		- return mEnded <= token.mBegan;
		+ return mLexeme.ended() <= token.began();
		}


		assert token != NONE;

		- return mBegan > token.mEnded;
		+ return mLexeme.began() > token.ended();
		}



		int began() {
		- return mBegan;
		+ return mLexeme.began();
		}

		int ended() {
		- return mEnded;
		+ return mLexeme.ended();
		}


		@Override
		public int compareTo( final Token that ) {
		- return this.mBegan - that.mBegan;
		+ // Orders tokens by ascending beginning offset. The constructor asserts
		+ // that offsets are non-negative, in which case this subtraction cannot
		+ // overflow (assumes assertions or callers uphold that -- TODO confirm).
		+ return this.began() - that.began();
		}


		}

		- public String toString( final Map<TokenType, String> entities ) {
		- return entities.get( getType() );
		+ /**
		+ * Returns the text that replaces this token in the output document.
		+ * The glyph of the underlying lexeme's type is looked up in {@code i18n}
		+ * first; when that map has no entry for the glyph, the value mapped to
		+ * this token's type in {@code entities} is returned instead. The fallback
		+ * value is looked up on every call, even when {@code i18n} has a match.
		+ *
		+ * @param entities Maps {@link TokenType}s to replacement strings.
		+ * @param i18n Maps {@link LexemeGlyph}s to internationalized strings.
		+ * @return The replacement from {@code i18n} if present, otherwise the
		+ * result of {@code entities.get( getType() )}.
		+ */
		+ public String toString(
		+ final Map<TokenType, String> entities,
		+ final Map<LexemeGlyph, String> i18n ) {
		+ return i18n.getOrDefault(
		+ mLexeme.getType().glyph(),
		+ entities.get( getType() )
		+ );
		}

		@Override
		public String toString() {
		return getClass().getSimpleName() + '[' +
		- "mType=" + mTokenType +
		- ", mBegan=" + mBegan +
		- ", mEnded=" + mEnded +
		+ "mType=" + getType() +
		+ ", mBegan=" + began() +
		+ ", mEnded=" + ended() +
		']';
		}

src/test/java/com/whitemagicsoftware/keenquotes/lex/LexerTest.java

		@Test
		void test_Lexing_Quotes_EmitQuotes() {
		- testType( "'", QUOTE_SINGLE );
		- testType( "\"", QUOTE_DOUBLE );
		testType( "‘", QUOTE_SINGLE_OPENING );
		+ testType( "‹", QUOTE_SINGLE_OPENING );
		testType( "’", QUOTE_SINGLE_CLOSING );
		+ testType( "›", QUOTE_SINGLE_CLOSING );
		+ testType( "'", QUOTE_SINGLE );
		+
		testType( "“", QUOTE_DOUBLE_OPENING );
		+ testType( "„", QUOTE_DOUBLE_OPENING );
		+ testType( "«", QUOTE_DOUBLE_OPENING );
		+ testType( ",,", QUOTE_DOUBLE_OPENING );
		+ testType( "\"", QUOTE_DOUBLE );
		+
		testType( "”", QUOTE_DOUBLE_CLOSING );
		+ testType( "»", QUOTE_DOUBLE_CLOSING );
		+
		testType( "3 o'clock", NUMBER, SPACE, WORD, QUOTE_SINGLE, WORD );
		}

src/test/java/com/whitemagicsoftware/keenquotes/parser/AmbiguityResolverTest.java

		import java.util.concurrent.atomic.AtomicInteger;

		-import static com.whitemagicsoftware.keenquotes.parser.Curler.ENTITIES;
		-import static com.whitemagicsoftware.keenquotes.parser.Curler.replace;
		+import static com.whitemagicsoftware.keenquotes.parser.Curler.*;
		import static com.whitemagicsoftware.keenquotes.texts.TestResource.readPairs;
		import static org.junit.jupiter.api.Assertions.assertEquals;

		input,
		CONTRACTIONS,
		- replace( output, offset, ENTITIES ),
		+ replace( output, offset, ENTITIES, I18N_ENTITIES ),
		filter -> false
		);

src/test/java/com/whitemagicsoftware/keenquotes/parser/CurlerTest.java

		}

		- @Test
		+ // NOTE(review): @Disabled replaces @Test here instead of accompanying it,
		+ // and no reason is recorded. Document why the test is disabled (or
		+ // re-enable it), and confirm the runner still reports it as skipped.
		+ @Disabled
		public void test_Parse_AmbiguousQuotes_PartiallyCurled() throws IOException {
		testCurler( createCurler( FILTER_PLAIN ), "ambiguous-n-pass.txt" );
		}

		@Test
		public void test_Parse_UncurledQuotesXml_CurlyQuotes() throws IOException {
		testCurler( createCurler( FILTER_XML ), "xml.txt" );
		+ }
		+
		+ // Runs the plain-text curler against the input/expected pairs stored in
		+ // the "i18n.txt" resource.
		+ // NOTE(review): "I11l" in the method name looks like a typo for "I18n".
		+ @Test
		+ public void test_Parse_UncurledQuotesI11l_CurlyQuotes() throws IOException {
		+ testCurler( createCurler( FILTER_PLAIN ), "i18n.txt" );
		}

src/test/java/com/whitemagicsoftware/keenquotes/parser/QuoteEmitterTest.java

		import java.util.concurrent.atomic.AtomicInteger;

		-import static com.whitemagicsoftware.keenquotes.parser.Curler.ENTITIES;
		-import static com.whitemagicsoftware.keenquotes.parser.Curler.replace;
		+import static com.whitemagicsoftware.keenquotes.parser.Curler.*;
		import static com.whitemagicsoftware.keenquotes.texts.TestResource.readPairs;
		import static org.junit.jupiter.api.Assertions.assertEquals;

		input,
		CONTRACTIONS,
		- replace( output, offset, ENTITIES ),
		+ replace( output, offset, ENTITIES, I18N_ENTITIES ),
		filter -> false
		);

src/test/resources/com/whitemagicsoftware/keenquotes/texts/i18n.txt

		+# ########################################################################
		+# Dutch
		+# ########################################################################
		+,,Dit is een citaat," zei hij.
		+„Dit is een citaat,” zei hij.
		+
		+,,Ik heb twee opa's en twee oma's," zei het meisje,„hoeveel heb jij er?"
		+„Ik heb twee opa's en twee oma's,” zei het meisje,„hoeveel heb jij er?”
		+
		+"Zeg, wat betekent 'quod non' eigenlijk?", vroeg Nynke.
		+“Zeg, wat betekent ‘quod non’ eigenlijk?”, vroeg Nynke.
		+
		+"Wat zijn 'zebra's'?", vroeg de jongen.
		+“Wat zijn ‘zebra's’?”, vroeg de jongen.
		+
		+"'s Morgens gaat het regenen in 's-Gravenhage," zei de weervrouw.
		+“'s Morgens gaat het regenen in 's-Gravenhage,” zei de weervrouw.
		+
		+De voorzitter zei: 'Gelukkig nieuwjaar!'
		+De voorzitter zei: ‘Gelukkig nieuwjaar!’
		+
		+'Wat een gedoe, al die baby'tjes hun lolly'tjes geven,' zuchtte de oppas, 'en het is nog ongezond ook!'
		+‘Wat een gedoe, al die baby'tjes hun lolly'tjes geven,’ zuchtte de oppas, ‘en het is nog ongezond ook!’
		+
		+"In '84," zei hij terwijl hij een hand door z'n haar haalde, "heb ik Alice's zus een kus gegeven op d'r wangen."
		+“In '84,” zei hij terwijl hij een hand door z'n haar haalde, “heb ik Alice's zus een kus gegeven op d'r wangen.”

Delta	225 lines added, 83 lines removed, 142-line increase