Fix opening/closing single/double guillemets

Author	Dave Jarvis <email>
Date	2022-10-07 23:23:44 GMT-0700
Commit	944cce6277a98ac6a533f4f958a1a818dd6d4bbd
Parent	8509d5e

src/main/java/com/whitemagicsoftware/keenquotes/app/KeenQuotes.java

		import java.util.Properties;

		-import static com.whitemagicsoftware.keenquotes.lex.FilterType.FILTER_PLAIN;
		-import static com.whitemagicsoftware.keenquotes.lex.FilterType.FILTER_XML;
		import static java.lang.String.format;
		import static java.lang.System.*;

		else {
		try {
		- final var filter = settings.filterXml() ? FILTER_XML : FILTER_PLAIN;
		+ final var filter = settings.filterXml();
		final var c = new Curler( contractions, filter );

src/main/java/com/whitemagicsoftware/keenquotes/lex/LexemeGlyph.java

		return mGlyph == glyph;
		}
		+
		+ public String text() {
		+ return Character.toString( mGlyph );
		+ }
		}

src/main/java/com/whitemagicsoftware/keenquotes/lex/Lexer.java

		token = CLOSING_GROUP;
		}
		- else if( curr == '“' ) {
		- token = QUOTE_DOUBLE_OPENING;
		+ else if( LEX_DOUBLE_QUOTE_OPENING.equals( curr ) ) {
		+ token = QUOTE_DOUBLE_OPENING.with( LEX_DOUBLE_QUOTE_OPENING );
		}
		- else if( curr == '”' ) {
		- token = QUOTE_DOUBLE_CLOSING;
		+ else if( LEX_DOUBLE_QUOTE_CLOSING.equals( curr ) ) {
		+ token = QUOTE_DOUBLE_CLOSING.with( LEX_DOUBLE_QUOTE_CLOSING );
		}
		- else if( curr == '‘' ) {
		- token = QUOTE_SINGLE_OPENING;
		+ else if( LEX_SINGLE_QUOTE_OPENING.equals( curr ) ) {
		+ token = QUOTE_SINGLE_OPENING.with( LEX_SINGLE_QUOTE_OPENING );
		}
		- else if( curr == '’' ) {
		- token = QUOTE_SINGLE_CLOSING;
		+ else if( LEX_SINGLE_QUOTE_CLOSING.equals( curr ) ) {
		+ token = QUOTE_SINGLE_CLOSING.with( LEX_SINGLE_QUOTE_CLOSING );
		}
		else if( curr == '\\' ) {

src/main/java/com/whitemagicsoftware/keenquotes/parser/Curler.java

		package com.whitemagicsoftware.keenquotes.parser;

		-import com.whitemagicsoftware.keenquotes.lex.FilterType;
		-import com.whitemagicsoftware.keenquotes.lex.LexemeGlyph;
		-
		-import java.util.Map;
		import java.util.concurrent.atomic.AtomicInteger;
		import java.util.function.Consumer;
		import java.util.function.Function;

		-import static com.whitemagicsoftware.keenquotes.lex.LexemeGlyph.LEX_DOUBLE_QUOTE_OPENING_LOW;
		-import static com.whitemagicsoftware.keenquotes.parser.TokenType.*;
		-import static java.util.Map.entry;
		-import static java.util.Map.ofEntries;
		+import static com.whitemagicsoftware.keenquotes.lex.FilterType.FILTER_PLAIN;
		+import static com.whitemagicsoftware.keenquotes.lex.FilterType.FILTER_XML;

		/**
		* Resolves straight quotes into curly quotes throughout a document.
		*/
		@SuppressWarnings( "unused" )
		public class Curler implements Function<String, String> {
		- /**
		- * Provides an entity-based set of {@link Token} replacements.
		- */
		- public static final Map<TokenType, String> ENTITIES = ofEntries(
		- entry( QUOTE_OPENING_SINGLE, "&lsquo;" ),
		- entry( QUOTE_CLOSING_SINGLE, "&rsquo;" ),
		- entry( QUOTE_OPENING_DOUBLE, "&ldquo;" ),
		- entry( QUOTE_CLOSING_DOUBLE, "&rdquo;" ),
		- entry( QUOTE_STRAIGHT_SINGLE, "'" ),
		- entry( QUOTE_STRAIGHT_DOUBLE, "\"" ),
		- entry( QUOTE_APOSTROPHE, "&apos;" ),
		- entry( QUOTE_PRIME_SINGLE, "&prime;" ),
		- entry( QUOTE_PRIME_DOUBLE, "&Prime;" ),
		- entry( QUOTE_PRIME_TRIPLE, "&tprime;" ),
		- entry( QUOTE_PRIME_QUADRUPLE, "&qprime;" )
		- );
		-
		- /**
		- * Provides a character-based set of {@link Token} replacements.
		- */
		- public static final Map<TokenType, String> CHARS = ofEntries(
		- entry( QUOTE_OPENING_SINGLE, "‘" ),
		- entry( QUOTE_CLOSING_SINGLE, "’" ),
		- entry( QUOTE_OPENING_DOUBLE, "“" ),
		- entry( QUOTE_CLOSING_DOUBLE, "”" ),
		- entry( QUOTE_STRAIGHT_SINGLE, "'" ),
		- entry( QUOTE_STRAIGHT_DOUBLE, "\"" ),
		- entry( QUOTE_APOSTROPHE, "’" ),
		- entry( QUOTE_PRIME_SINGLE, "′" ),
		- entry( QUOTE_PRIME_DOUBLE, "″" ),
		- entry( QUOTE_PRIME_TRIPLE, "‴" ),
		- entry( QUOTE_PRIME_QUADRUPLE, "⁗" )
		- );
		-
		- /**
		- * Glyphs not found in the table will use the document's glyph.
		- */
		- public static final Map<LexemeGlyph, String> I18N_ENTITIES = ofEntries(
		- entry( LEX_DOUBLE_QUOTE_OPENING_LOW, "&bdquo;" )
		- );
		-
		- /**
		- * Glyphs not found in the table will use the value from the document.
		- */
		- public static final Map<LexemeGlyph, String> I18N_CHARS = ofEntries();

		private final Contractions mContractions;
		- private final Map<TokenType, String> mReplacements;
		- private final Map<LexemeGlyph, String> mI18n;
		- private final FilterType mFilterType;
		-
		- /**
		- * Maps quotes to HTML entities.
		- *
		- * @param c Contractions listings.
		- * @param parserType Creates a parser based on document content structure.
		- */
		- public Curler( final Contractions c, final FilterType parserType ) {
		- this( c, ENTITIES, I18N_ENTITIES, parserType );
		- }
		+ private final boolean mEntities;

		/**
		* Maps quotes to curled character equivalents.
		*
		- * @param c Contractions listings.
		- * @param replacements Map of recognized quotes to output types (entity or
		- * Unicode character).
		+ * @param c Contractions listings.
		+ * @param entities {@code true} to convert quotation marks to HTML entities.
		*/
		public Curler(
		final Contractions c,
		- final Map<TokenType, String> replacements,
		- final Map<LexemeGlyph, String> i18n,
		- final FilterType parserType
		+ final boolean entities
		) {
		assert c != null;
		- assert replacements != null;
		- assert !replacements.isEmpty();
		- assert i18n != null;
		- assert parserType != null;

		mContractions = c;
		- mReplacements = replacements;
		- mI18n = i18n;
		- mFilterType = parserType;
		+ mEntities = entities;
		}


		text,
		mContractions,
		- replace( output, offset, mReplacements, mI18n ),
		- mFilterType.filter()
		+ replace( output, offset, mEntities ),
		+ (mEntities ? FILTER_XML : FILTER_PLAIN).filter()
		);

		return output.toString();
		}

		/**
		* Replaces non-ambiguous tokens with their equivalent string representation.
		*
		- * @param output Continuously updated result document.
		- * @param offset Accumulating index where {@link Token} is replaced.
		- * @param replacements Maps {@link TokenType}s to replacement strings.
		- * @param i18n Maps {@link LexemeGlyph}s to internationalized strings.
		+ * @param output Continuously updated result document.
		+ * @param offset Accumulating index where {@link Token} is replaced.
		+ * @param entities {@code true} to convert quotation marks to HTML entities.
		* @return Instructions to replace a {@link Token} in the result document.
		*/
		public static Consumer<Token> replace(
		final StringBuilder output,
		final AtomicInteger offset,
		- final Map<TokenType, String> replacements,
		- final Map<LexemeGlyph, String> i18n
		+ final boolean entities
		) {
		return token -> {
		if( !token.isAmbiguous() ) {
		- final var text = token.toString( replacements, i18n );
		+ final var text = token.toString( entities );

		output.replace(

src/main/java/com/whitemagicsoftware/keenquotes/parser/QuoteEmitter.java

		emit( QUOTE_AMBIGUOUS_DOUBLE, lex2 );
		}
		- // International opening quotation mark.
		+ // International opening double quotation mark.
		else if( match( ANY, QUOTE_DOUBLE_OPENING, ANY, ANY ) ) {
		emit( QUOTE_OPENING_DOUBLE, lex2 );
		+ }
		+ // International opening single quotation mark.
		+ else if( match( ANY, QUOTE_SINGLE_OPENING, ANY, ANY ) ) {
		+ emit( QUOTE_OPENING_SINGLE, lex2 );
		+ }
		+ // International double closing quotation mark.
		+ else if( match( ANY, ANY, ANY, QUOTE_DOUBLE_CLOSING ) ) {
		+ emit( QUOTE_CLOSING_DOUBLE, lex4 );
		+ }
		+ // International single closing quotation mark.
		+ else if( match( ANY, ANY, ANY, QUOTE_SINGLE_CLOSING ) ) {
		+ emit( QUOTE_CLOSING_SINGLE, lex4 );
		}
		// Ambiguous (no match)

src/main/java/com/whitemagicsoftware/keenquotes/parser/Token.java

		import java.util.Map;

		+import static com.whitemagicsoftware.keenquotes.lex.LexemeGlyph.*;
		import static com.whitemagicsoftware.keenquotes.parser.TokenType.*;
		+import static java.util.Map.entry;
		+import static java.util.Map.ofEntries;

		/**
		* Represents a high-level token read from a text document.
		*/
		final class Token implements Comparable<Token>, Stem {
		+ /**
		+ * Provides an entity-based set of {@link Token} replacements.
		+ */
		+ private static final Map<TokenType, String> ENTITIES = ofEntries(
		+ entry( QUOTE_OPENING_SINGLE, "&lsquo;" ),
		+ entry( QUOTE_CLOSING_SINGLE, "&rsquo;" ),
		+ entry( QUOTE_OPENING_DOUBLE, "&ldquo;" ),
		+ entry( QUOTE_CLOSING_DOUBLE, "&rdquo;" ),
		+ entry( QUOTE_STRAIGHT_SINGLE, "'" ),
		+ entry( QUOTE_STRAIGHT_DOUBLE, "\"" ),
		+ entry( QUOTE_APOSTROPHE, "&apos;" ),
		+ entry( QUOTE_PRIME_SINGLE, "&prime;" ),
		+ entry( QUOTE_PRIME_DOUBLE, "&Prime;" ),
		+ entry( QUOTE_PRIME_TRIPLE, "&tprime;" ),
		+ entry( QUOTE_PRIME_QUADRUPLE, "&qprime;" )
		+ );
		+
		+ /**
		+ * Provides a character-based set of {@link Token} replacements.
		+ */
		+ private static final Map<TokenType, String> CHARS = ofEntries(
		+ entry( QUOTE_OPENING_SINGLE, "‘" ),
		+ entry( QUOTE_CLOSING_SINGLE, "’" ),
		+ entry( QUOTE_OPENING_DOUBLE, "“" ),
		+ entry( QUOTE_CLOSING_DOUBLE, "”" ),
		+ entry( QUOTE_STRAIGHT_SINGLE, "'" ),
		+ entry( QUOTE_STRAIGHT_DOUBLE, "\"" ),
		+ entry( QUOTE_APOSTROPHE, "’" ),
		+ entry( QUOTE_PRIME_SINGLE, "′" ),
		+ entry( QUOTE_PRIME_DOUBLE, "″" ),
		+ entry( QUOTE_PRIME_TRIPLE, "‴" ),
		+ entry( QUOTE_PRIME_QUADRUPLE, "⁗" )
		+ );
		+
		+ /**
		+ * Glyphs not found in the table will use the document's glyph.
		+ */
		+ private static final Map<LexemeGlyph, String> I18N_ENTITIES = ofEntries(
		+ entry( LEX_DOUBLE_QUOTE_OPENING_LOW, "&bdquo;" ),
		+ entry( LEX_DOUBLE_CHEVRON_LEFT, "&laquo;" ),
		+ entry( LEX_DOUBLE_CHEVRON_RIGHT, "&raquo;" ),
		+ entry( LEX_SINGLE_CHEVRON_LEFT, "&lsaquo;" ),
		+ entry( LEX_SINGLE_CHEVRON_RIGHT, "&rsaquo;" )
		+ );
		+
		/**
		* Denotes that the token does not represent a value in the parsed document.

		}

		- public String toString(
		- final Map<TokenType, String> entities,
		- final Map<LexemeGlyph, String> i18n ) {
		- return i18n.getOrDefault(
		- mLexeme.getType().glyph(),
		- entities.get( getType() )
		- );
		+ /**
		+ * Converts this token to its string representation, which will either be
		+ * an HTML entity or a character.
		+ *
		+ * @param entities {@code true} to convert quotation marks to HTML entities.
		+ * @return A plain quotation mark character or an HTML entity.
		+ */
		+ public String toString( final boolean entities ) {
		+ final var glyph = mLexeme.getType().glyph();
		+
		+ return entities
		+ ? I18N_ENTITIES.getOrDefault( glyph, ENTITIES.get( getType() ) )
		+ : CHARS.getOrDefault( getType(), glyph.text() );
		}

src/test/java/com/whitemagicsoftware/keenquotes/parser/AmbiguityResolverTest.java


		@Disabled
		+ @SuppressWarnings( "unused" )
		void test_Resolve_InvalidGrammar_AmbiguousRemain() throws IOException {
		test( "invalid-grammar.txt" );

		input,
		CONTRACTIONS,
		- replace( output, offset, ENTITIES, I18N_ENTITIES ),
		+ replace( output, offset, true ),
		filter -> false
		);

src/test/java/com/whitemagicsoftware/keenquotes/parser/CurlerTest.java

		package com.whitemagicsoftware.keenquotes.parser;

		-import com.whitemagicsoftware.keenquotes.lex.FilterType;
		import org.junit.jupiter.api.Disabled;
		import org.junit.jupiter.api.Test;
		import org.junit.jupiter.params.ParameterizedTest;
		import org.junit.jupiter.params.provider.ValueSource;

		import java.io.IOException;
		import java.util.function.Function;

		-import static com.whitemagicsoftware.keenquotes.lex.FilterType.FILTER_PLAIN;
		-import static com.whitemagicsoftware.keenquotes.lex.FilterType.FILTER_XML;
		import static com.whitemagicsoftware.keenquotes.texts.TestResource.open;
		import static com.whitemagicsoftware.keenquotes.texts.TestResource.readPairs;

		@Test
		public void test_Parse_UncurledQuotes1_CurlyQuotes() throws IOException {
		- testCurler( createCurler( FILTER_PLAIN ), "unambiguous-1-pass.txt" );
		+ testCurler( createCurler( true ), "unambiguous-1-pass.txt" );
		}

		@Test
		public void test_Parse_UncurledQuotes2_CurlyQuotes() throws IOException {
		- testCurler( createCurler( FILTER_PLAIN ), "unambiguous-2-pass.txt" );
		+ testCurler( createCurler( true ), "unambiguous-2-pass.txt" );
		}

		@Disabled
		+ @SuppressWarnings( {"unused", "JUnit3StyleTestMethodInJUnit4Class"} )
		public void test_Parse_AmbiguousQuotes_PartiallyCurled() throws IOException {
		- testCurler( createCurler( FILTER_PLAIN ), "ambiguous-n-pass.txt" );
		+ testCurler( createCurler( false ), "ambiguous-n-pass.txt" );
		}

		@Test
		public void test_Parse_UncurledQuotesXml_CurlyQuotes() throws IOException {
		- testCurler( createCurler( FILTER_XML ), "xml.txt" );
		+ testCurler( createCurler( true ), "xml.txt" );
		}

		@Test
		public void test_Parse_UncurledQuotesI11l_CurlyQuotes() throws IOException {
		- testCurler( createCurler( FILTER_PLAIN ), "i18n.txt" );
		+ testCurler( createCurler( true ), "i18n.txt" );
		}


		}

		- final var curler = createCurler( FILTER_XML );
		+ final var curler = createCurler( true );
		System.out.println( curler.apply( sb.toString() ) );
		}

		}

		- private Function<String, String> createCurler( final FilterType parserType ) {
		- return new Curler( new Contractions.Builder().build(), parserType );
		+ private Function<String, String> createCurler( final boolean entities ) {
		+ return new Curler( createContractions(), entities );
		+ }
		+
		+ private Contractions createContractions() {
		+ return new Contractions.Builder().build();
		}
		}

src/test/java/com/whitemagicsoftware/keenquotes/parser/QuoteEmitterTest.java

		import java.util.concurrent.atomic.AtomicInteger;

		-import static com.whitemagicsoftware.keenquotes.parser.Curler.*;
		+import static com.whitemagicsoftware.keenquotes.parser.Curler.replace;
		import static com.whitemagicsoftware.keenquotes.texts.TestResource.readPairs;
		import static org.junit.jupiter.api.Assertions.assertEquals;

		input,
		CONTRACTIONS,
		- replace( output, offset, ENTITIES, I18N_ENTITIES ),
		+ replace( output, offset, true ),
		filter -> false
		);

src/test/resources/com/whitemagicsoftware/keenquotes/texts/i18n.txt

		# ########################################################################
		+# French
		+# ########################################################################
		+«Ce n'est pas la mer à boire,» il me dit.
		+&laquo;Ce n&apos;est pas la mer à boire,&raquo; il me dit.
		+
		+Dit il, ‹Ce n'est pas la mer à boire?›
		+Dit il, &lsaquo;Ce n&apos;est pas la mer à boire?&rsaquo;
		+
		+# ########################################################################
		# Dutch
		# ########################################################################

src/test/resources/com/whitemagicsoftware/keenquotes/texts/unambiguous-1-pass.txt


		"’Kearney lives on the banks of Killarney—’
		-“’Kearney lives on the banks of Killarney—’
		+“’Kearney lives on the banks of Killarney—’

		# ########################################################################

Delta	128 lines added, 122 lines removed, 6-line increase