Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Fix opening/closing single/double guillemets

Author: Dave Jarvis <email>
Date: 2022-10-07 23:23:44 GMT-0700
Commit: 944cce6277a98ac6a533f4f958a1a818dd6d4bbd
Parent: 8509d5e
src/main/java/com/whitemagicsoftware/keenquotes/app/KeenQuotes.java
import java.util.Properties;
-import static com.whitemagicsoftware.keenquotes.lex.FilterType.FILTER_PLAIN;
-import static com.whitemagicsoftware.keenquotes.lex.FilterType.FILTER_XML;
import static java.lang.String.format;
import static java.lang.System.*;
else {
try {
- final var filter = settings.filterXml() ? FILTER_XML : FILTER_PLAIN;
+ final var filter = settings.filterXml();
final var c = new Curler( contractions, filter );
src/main/java/com/whitemagicsoftware/keenquotes/lex/LexemeGlyph.java
return mGlyph == glyph;
}
+
+ public String text() {
+ return Character.toString( mGlyph );
+ }
}
src/main/java/com/whitemagicsoftware/keenquotes/lex/Lexer.java
token = CLOSING_GROUP;
}
- else if( curr == '“' ) {
- token = QUOTE_DOUBLE_OPENING;
+ else if( LEX_DOUBLE_QUOTE_OPENING.equals( curr ) ) {
+ token = QUOTE_DOUBLE_OPENING.with( LEX_DOUBLE_QUOTE_OPENING );
}
- else if( curr == '”' ) {
- token = QUOTE_DOUBLE_CLOSING;
+ else if( LEX_DOUBLE_QUOTE_CLOSING.equals( curr ) ) {
+ token = QUOTE_DOUBLE_CLOSING.with( LEX_DOUBLE_QUOTE_CLOSING );
}
- else if( curr == '‘' ) {
- token = QUOTE_SINGLE_OPENING;
+ else if( LEX_SINGLE_QUOTE_OPENING.equals( curr ) ) {
+ token = QUOTE_SINGLE_OPENING.with( LEX_SINGLE_QUOTE_OPENING );
}
- else if( curr == '’' ) {
- token = QUOTE_SINGLE_CLOSING;
+ else if( LEX_SINGLE_QUOTE_CLOSING.equals( curr ) ) {
+ token = QUOTE_SINGLE_CLOSING.with( LEX_SINGLE_QUOTE_CLOSING );
}
else if( curr == '\\' ) {
src/main/java/com/whitemagicsoftware/keenquotes/parser/Curler.java
package com.whitemagicsoftware.keenquotes.parser;
-import com.whitemagicsoftware.keenquotes.lex.FilterType;
-import com.whitemagicsoftware.keenquotes.lex.LexemeGlyph;
-
-import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Consumer;
import java.util.function.Function;
-import static com.whitemagicsoftware.keenquotes.lex.LexemeGlyph.LEX_DOUBLE_QUOTE_OPENING_LOW;
-import static com.whitemagicsoftware.keenquotes.parser.TokenType.*;
-import static java.util.Map.entry;
-import static java.util.Map.ofEntries;
+import static com.whitemagicsoftware.keenquotes.lex.FilterType.FILTER_PLAIN;
+import static com.whitemagicsoftware.keenquotes.lex.FilterType.FILTER_XML;
/**
* Resolves straight quotes into curly quotes throughout a document.
*/
@SuppressWarnings( "unused" )
public class Curler implements Function<String, String> {
- /**
- * Provides an entity-based set of {@link Token} replacements.
- */
- public static final Map<TokenType, String> ENTITIES = ofEntries(
- entry( QUOTE_OPENING_SINGLE, "&lsquo;" ),
- entry( QUOTE_CLOSING_SINGLE, "&rsquo;" ),
- entry( QUOTE_OPENING_DOUBLE, "&ldquo;" ),
- entry( QUOTE_CLOSING_DOUBLE, "&rdquo;" ),
- entry( QUOTE_STRAIGHT_SINGLE, "'" ),
- entry( QUOTE_STRAIGHT_DOUBLE, "\"" ),
- entry( QUOTE_APOSTROPHE, "&apos;" ),
- entry( QUOTE_PRIME_SINGLE, "&prime;" ),
- entry( QUOTE_PRIME_DOUBLE, "&Prime;" ),
- entry( QUOTE_PRIME_TRIPLE, "&tprime;" ),
- entry( QUOTE_PRIME_QUADRUPLE, "&qprime;" )
- );
-
- /**
- * Provides a character-based set of {@link Token} replacements.
- */
- public static final Map<TokenType, String> CHARS = ofEntries(
- entry( QUOTE_OPENING_SINGLE, "‘" ),
- entry( QUOTE_CLOSING_SINGLE, "’" ),
- entry( QUOTE_OPENING_DOUBLE, "“" ),
- entry( QUOTE_CLOSING_DOUBLE, "”" ),
- entry( QUOTE_STRAIGHT_SINGLE, "'" ),
- entry( QUOTE_STRAIGHT_DOUBLE, "\"" ),
- entry( QUOTE_APOSTROPHE, "’" ),
- entry( QUOTE_PRIME_SINGLE, "′" ),
- entry( QUOTE_PRIME_DOUBLE, "″" ),
- entry( QUOTE_PRIME_TRIPLE, "‴" ),
- entry( QUOTE_PRIME_QUADRUPLE, "⁗" )
- );
-
- /**
- * Glyphs not found in the table will use the document's glyph.
- */
- public static final Map<LexemeGlyph, String> I18N_ENTITIES = ofEntries(
- entry( LEX_DOUBLE_QUOTE_OPENING_LOW, "&#8222;" )
- );
-
- /**
- * Glyphs not found in the table will use the value from the document.
- */
- public static final Map<LexemeGlyph, String> I18N_CHARS = ofEntries();
private final Contractions mContractions;
- private final Map<TokenType, String> mReplacements;
- private final Map<LexemeGlyph, String> mI18n;
- private final FilterType mFilterType;
-
- /**
- * Maps quotes to HTML entities.
- *
- * @param c Contractions listings.
- * @param parserType Creates a parser based on document content structure.
- */
- public Curler( final Contractions c, final FilterType parserType ) {
- this( c, ENTITIES, I18N_ENTITIES, parserType );
- }
+ private final boolean mEntities;
/**
* Maps quotes to curled character equivalents.
*
- * @param c Contractions listings.
- * @param replacements Map of recognized quotes to output types (entity or
- * Unicode character).
+ * @param c Contractions listings.
+ * @param entities {@code true} to convert quotation marks to HTML entities.
*/
public Curler(
final Contractions c,
- final Map<TokenType, String> replacements,
- final Map<LexemeGlyph, String> i18n,
- final FilterType parserType
+ final boolean entities
) {
assert c != null;
- assert replacements != null;
- assert !replacements.isEmpty();
- assert i18n != null;
- assert parserType != null;
mContractions = c;
- mReplacements = replacements;
- mI18n = i18n;
- mFilterType = parserType;
+ mEntities = entities;
}
text,
mContractions,
- replace( output, offset, mReplacements, mI18n ),
- mFilterType.filter()
+ replace( output, offset, mEntities ),
+ (mEntities ? FILTER_XML : FILTER_PLAIN).filter()
);
return output.toString();
}
/**
* Replaces non-ambiguous tokens with their equivalent string representation.
*
- * @param output Continuously updated result document.
- * @param offset Accumulating index where {@link Token} is replaced.
- * @param replacements Maps {@link TokenType}s to replacement strings.
- * @param i18n Maps {@link LexemeGlyph}s to internationalized strings.
+ * @param output Continuously updated result document.
+ * @param offset Accumulating index where {@link Token} is replaced.
+ * @param entities {@code true} to convert quotation marks to HTML entities.
* @return Instructions to replace a {@link Token} in the result document.
*/
public static Consumer<Token> replace(
final StringBuilder output,
final AtomicInteger offset,
- final Map<TokenType, String> replacements,
- final Map<LexemeGlyph, String> i18n
+ final boolean entities
) {
return token -> {
if( !token.isAmbiguous() ) {
- final var text = token.toString( replacements, i18n );
+ final var text = token.toString( entities );
output.replace(
src/main/java/com/whitemagicsoftware/keenquotes/parser/QuoteEmitter.java
emit( QUOTE_AMBIGUOUS_DOUBLE, lex2 );
}
- // International opening quotation mark.
+ // International opening double quotation mark.
else if( match( ANY, QUOTE_DOUBLE_OPENING, ANY, ANY ) ) {
emit( QUOTE_OPENING_DOUBLE, lex2 );
+ }
+ // International opening single quotation mark.
+ else if( match( ANY, QUOTE_SINGLE_OPENING, ANY, ANY ) ) {
+ emit( QUOTE_OPENING_SINGLE, lex2 );
+ }
+ // International double closing quotation mark.
+ else if( match( ANY, ANY, ANY, QUOTE_DOUBLE_CLOSING ) ) {
+ emit( QUOTE_CLOSING_DOUBLE, lex4 );
+ }
+ // International single closing quotation mark.
+ else if( match( ANY, ANY, ANY, QUOTE_SINGLE_CLOSING ) ) {
+ emit( QUOTE_CLOSING_SINGLE, lex4 );
}
// Ambiguous (no match)
src/main/java/com/whitemagicsoftware/keenquotes/parser/Token.java
import java.util.Map;
+import static com.whitemagicsoftware.keenquotes.lex.LexemeGlyph.*;
import static com.whitemagicsoftware.keenquotes.parser.TokenType.*;
+import static java.util.Map.entry;
+import static java.util.Map.ofEntries;
/**
* Represents a high-level token read from a text document.
*/
final class Token implements Comparable<Token>, Stem {
+ /**
+ * Provides an entity-based set of {@link Token} replacements.
+ */
+ private static final Map<TokenType, String> ENTITIES = ofEntries(
+ entry( QUOTE_OPENING_SINGLE, "&lsquo;" ),
+ entry( QUOTE_CLOSING_SINGLE, "&rsquo;" ),
+ entry( QUOTE_OPENING_DOUBLE, "&ldquo;" ),
+ entry( QUOTE_CLOSING_DOUBLE, "&rdquo;" ),
+ entry( QUOTE_STRAIGHT_SINGLE, "'" ),
+ entry( QUOTE_STRAIGHT_DOUBLE, "\"" ),
+ entry( QUOTE_APOSTROPHE, "&apos;" ),
+ entry( QUOTE_PRIME_SINGLE, "&prime;" ),
+ entry( QUOTE_PRIME_DOUBLE, "&Prime;" ),
+ entry( QUOTE_PRIME_TRIPLE, "&tprime;" ),
+ entry( QUOTE_PRIME_QUADRUPLE, "&qprime;" )
+ );
+
+ /**
+ * Provides a character-based set of {@link Token} replacements.
+ */
+ private static final Map<TokenType, String> CHARS = ofEntries(
+ entry( QUOTE_OPENING_SINGLE, "‘" ),
+ entry( QUOTE_CLOSING_SINGLE, "’" ),
+ entry( QUOTE_OPENING_DOUBLE, "“" ),
+ entry( QUOTE_CLOSING_DOUBLE, "”" ),
+ entry( QUOTE_STRAIGHT_SINGLE, "'" ),
+ entry( QUOTE_STRAIGHT_DOUBLE, "\"" ),
+ entry( QUOTE_APOSTROPHE, "’" ),
+ entry( QUOTE_PRIME_SINGLE, "′" ),
+ entry( QUOTE_PRIME_DOUBLE, "″" ),
+ entry( QUOTE_PRIME_TRIPLE, "‴" ),
+ entry( QUOTE_PRIME_QUADRUPLE, "⁗" )
+ );
+
+ /**
+ * Glyphs not found in the table will use the document's glyph.
+ */
+ private static final Map<LexemeGlyph, String> I18N_ENTITIES = ofEntries(
+ entry( LEX_DOUBLE_QUOTE_OPENING_LOW, "&#8222;" ),
+ entry( LEX_DOUBLE_CHEVRON_LEFT, "&laquo;" ),
+ entry( LEX_DOUBLE_CHEVRON_RIGHT, "&raquo;" ),
+ entry( LEX_SINGLE_CHEVRON_LEFT, "&lsaquo;" ),
+ entry( LEX_SINGLE_CHEVRON_RIGHT, "&rsaquo;" )
+ );
+
/**
* Denotes that the token does not represent a value in the parsed document.
}
- public String toString(
- final Map<TokenType, String> entities,
- final Map<LexemeGlyph, String> i18n ) {
- return i18n.getOrDefault(
- mLexeme.getType().glyph(),
- entities.get( getType() )
- );
+ /**
+ * Converts this token to its string representation, which will either be
+ * an HTML entity or a character.
+ *
+ * @param entities {@code true} to convert quotation marks to HTML entities.
+ * @return A plain quotation mark character or an HTML entity.
+ */
+ public String toString( final boolean entities ) {
+ final var glyph = mLexeme.getType().glyph();
+
+ return entities
+ ? I18N_ENTITIES.getOrDefault( glyph, ENTITIES.get( getType() ) )
+ : CHARS.getOrDefault( getType(), glyph.text() );
}
src/test/java/com/whitemagicsoftware/keenquotes/parser/AmbiguityResolverTest.java
@Disabled
+ @SuppressWarnings( "unused" )
void test_Resolve_InvalidGrammar_AmbiguousRemain() throws IOException {
test( "invalid-grammar.txt" );
input,
CONTRACTIONS,
- replace( output, offset, ENTITIES, I18N_ENTITIES ),
+ replace( output, offset, true ),
filter -> false
);
src/test/java/com/whitemagicsoftware/keenquotes/parser/CurlerTest.java
package com.whitemagicsoftware.keenquotes.parser;
-import com.whitemagicsoftware.keenquotes.lex.FilterType;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
import java.io.IOException;
import java.util.function.Function;
-import static com.whitemagicsoftware.keenquotes.lex.FilterType.FILTER_PLAIN;
-import static com.whitemagicsoftware.keenquotes.lex.FilterType.FILTER_XML;
import static com.whitemagicsoftware.keenquotes.texts.TestResource.open;
import static com.whitemagicsoftware.keenquotes.texts.TestResource.readPairs;
@Test
public void test_Parse_UncurledQuotes1_CurlyQuotes() throws IOException {
- testCurler( createCurler( FILTER_PLAIN ), "unambiguous-1-pass.txt" );
+ testCurler( createCurler( true ), "unambiguous-1-pass.txt" );
}
@Test
public void test_Parse_UncurledQuotes2_CurlyQuotes() throws IOException {
- testCurler( createCurler( FILTER_PLAIN ), "unambiguous-2-pass.txt" );
+ testCurler( createCurler( true ), "unambiguous-2-pass.txt" );
}
@Disabled
+ @SuppressWarnings( {"unused", "JUnit3StyleTestMethodInJUnit4Class"} )
public void test_Parse_AmbiguousQuotes_PartiallyCurled() throws IOException {
- testCurler( createCurler( FILTER_PLAIN ), "ambiguous-n-pass.txt" );
+ testCurler( createCurler( false ), "ambiguous-n-pass.txt" );
}
@Test
public void test_Parse_UncurledQuotesXml_CurlyQuotes() throws IOException {
- testCurler( createCurler( FILTER_XML ), "xml.txt" );
+ testCurler( createCurler( true ), "xml.txt" );
}
@Test
public void test_Parse_UncurledQuotesI11l_CurlyQuotes() throws IOException {
- testCurler( createCurler( FILTER_PLAIN ), "i18n.txt" );
+ testCurler( createCurler( true ), "i18n.txt" );
}
}
- final var curler = createCurler( FILTER_XML );
+ final var curler = createCurler( true );
System.out.println( curler.apply( sb.toString() ) );
}
}
- private Function<String, String> createCurler( final FilterType parserType ) {
- return new Curler( new Contractions.Builder().build(), parserType );
+ private Function<String, String> createCurler( final boolean entities ) {
+ return new Curler( createContractions(), entities );
+ }
+
+ private Contractions createContractions() {
+ return new Contractions.Builder().build();
}
}
src/test/java/com/whitemagicsoftware/keenquotes/parser/QuoteEmitterTest.java
import java.util.concurrent.atomic.AtomicInteger;
-import static com.whitemagicsoftware.keenquotes.parser.Curler.*;
+import static com.whitemagicsoftware.keenquotes.parser.Curler.replace;
import static com.whitemagicsoftware.keenquotes.texts.TestResource.readPairs;
import static org.junit.jupiter.api.Assertions.assertEquals;
input,
CONTRACTIONS,
- replace( output, offset, ENTITIES, I18N_ENTITIES ),
+ replace( output, offset, true ),
filter -> false
);
src/test/resources/com/whitemagicsoftware/keenquotes/texts/i18n.txt
# ########################################################################
+# French
+# ########################################################################
+«Ce n'est pas la mer à boire,» il me dit.
+&laquo;Ce n&apos;est pas la mer à boire,&raquo; il me dit.
+
+Dit il, ‹Ce n'est pas la mer à boire?›
+Dit il, &lsaquo;Ce n&apos;est pas la mer à boire?&rsaquo;
+
+# ########################################################################
# Dutch
# ########################################################################
src/test/resources/com/whitemagicsoftware/keenquotes/texts/unambiguous-1-pass.txt
"’Kearney lives on the banks of Killarney—’
-&ldquo;’Kearney lives on the banks of Killarney—’
+&ldquo;’Kearney lives on the banks of Killarney—&rsquo;
# ########################################################################
Delta: 128 lines added, 122 lines removed, 6-line increase