Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Silently ignore XML elements

Author: Dave Jarvis <email>
Date: 2021-07-02 20:10:52 GMT-0700
Commit: 3a2ec71735df42e81a6e7ae96282ef4115ab5de3
Parent: d7274c4
src/main/java/com/whitemagicsoftware/keenquotes/Converter.java
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
package com.whitemagicsoftware.keenquotes;
+
+import com.whitemagicsoftware.keenquotes.ParserFactory.ParserType;
import java.util.ArrayList;
private final Contractions mContractions;
private final Map<TokenType, String> mReplacements;
+ private final ParserFactory mFactory;
/**
* Maps quotes to HTML entities.
*
* @param unresolved Consumes {@link Lexeme}s that could not be converted
* into HTML entities.
+ * @param parserType Creates a parser based on document content structure.
*/
- public Converter( final Consumer<Lexeme> unresolved ) {
- this( unresolved, new Contractions.Builder().build() );
+ public Converter(
+ final Consumer<Lexeme> unresolved, final ParserType parserType ) {
+ this( unresolved, new Contractions.Builder().build(), parserType );
}
/**
* Maps quotes to HTML entities.
*
* @param unresolved Consumes {@link Lexeme}s that could not be converted
* into HTML entities.
+ * @param parserType Creates a parser based on document content structure.
*/
public Converter(
final Consumer<Lexeme> unresolved,
- final Map<TokenType, String> replacements ) {
- this( unresolved, new Contractions.Builder().build(), replacements );
+ final Map<TokenType, String> replacements,
+ final ParserType parserType ) {
+ this(
+ unresolved, new Contractions.Builder().build(), replacements, parserType
+ );
}
/**
* Maps quotes to HTML entities.
*
* @param unresolved Consumes {@link Lexeme}s that could not be converted
* into HTML entities.
* @param c Contractions listings.
+ * @param parserType Creates a parser based on document content structure.
*/
- public Converter( final Consumer<Lexeme> unresolved, final Contractions c ) {
- this( unresolved, c, ENTITIES );
+ public Converter(
+ final Consumer<Lexeme> unresolved,
+ final Contractions c,
+ final ParserType parserType ) {
+ this( unresolved, c, ENTITIES, parserType );
}
final Consumer<Lexeme> unresolved,
final Contractions c,
- final Map<TokenType, String> replacements ) {
+ final Map<TokenType, String> replacements,
+ final ParserType parserType ) {
mUnresolved = unresolved;
mContractions = c;
mReplacements = replacements;
+ mFactory = new ParserFactory( parserType );
}
@Override
public String apply( final String text ) {
- final var parser = new Parser( text, mContractions );
+ final var parser = mFactory.createParser( text, mContractions );
final var tokens = new ArrayList<Token>();
src/main/java/com/whitemagicsoftware/keenquotes/KeenQuotes.java
import java.util.Properties;
+import static com.whitemagicsoftware.keenquotes.ParserFactory.ParserType.PARSER_PLAIN;
import static java.lang.String.format;
import static java.lang.System.*;
else {
try {
- out.print( convert( new Converter( err::println, contractions ) ) );
+ final var c = new Converter( err::println, contractions, PARSER_PLAIN );
+ out.print( convert( c ) );
} catch( final Exception ex ) {
ex.printStackTrace( err );
src/main/java/com/whitemagicsoftware/keenquotes/Lexer.java
int began = i.getIndex();
boolean isWord = false;
- Lexeme lexeme;
+ Lexeme lexeme = null;
do {
- if( (lexeme = preprocess( i )) != null ) {
- return lexeme;
+ // Allow subclasses to skip character sequences. This allows XML tags
+ // to be skipped.
+ if( skip( i ) ) {
+ began = i.getIndex();
}
}
- Lexeme preprocess( final CharacterIterator i ) {
- return null;
+ boolean skip( final CharacterIterator i ) {
+ return false;
}
src/main/java/com/whitemagicsoftware/keenquotes/ParserFactory.java
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.whitemagicsoftware.keenquotes;
+
+/**
+ * Responsible for creating new {@link Parser} instances based on the
+ * {@link ParserType}. The document content format must be known in advance.
+ */
+public class ParserFactory {
+ public enum ParserType {
+ PARSER_PLAIN,
+ PARSER_XML
+ }
+
+ private final ParserType mParserType;
+
+ public ParserFactory( final ParserType parserType ) {
+ mParserType = parserType;
+ }
+
+ public Parser createParser(
+ final String text, final Contractions contractions ) {
+
+ return mParserType == ParserType.PARSER_PLAIN
+ ? new Parser( text, contractions )
+ : new XmlParser( text, contractions );
+ }
+}
src/main/java/com/whitemagicsoftware/keenquotes/XmlLexer.java
import java.text.CharacterIterator;
-import static com.whitemagicsoftware.keenquotes.Lexeme.createLexeme;
-import static com.whitemagicsoftware.keenquotes.LexemeType.TAG;
-
/**
* Responsible for lexing text while ignoring XML elements. The document must
}
+ /**
+ * Skip (do not emit) XML tags found within the prose. This effectively hides
+ * the element.
+ *
+ * @param i The {@link CharacterIterator} used to scan through the text, one
+ * character at a time.
+ */
@Override
- Lexeme preprocess( final CharacterIterator i ) {
- final var began = i.getIndex();
- final var curr = i.current();
- Lexeme lexeme = null;
+ boolean skip( final CharacterIterator i ) {
+ final boolean match = i.current() == '<';
- if( curr == '<' ) {
+ if( match ) {
slurp( i, ( next, ci ) -> next != '>' );
+
+ // Swallow the trailing greater than symbol.
i.next();
- lexeme = createLexeme( TAG, began, i.getIndex() );
+
+ // Skip to the character following the greater than symbol.
i.next();
}
- return lexeme;
+ return match;
}
}
src/test/java/com/whitemagicsoftware/keenquotes/KeenQuotesTest.java
package com.whitemagicsoftware.keenquotes;
+import com.whitemagicsoftware.keenquotes.ParserFactory.ParserType;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
import org.junit.jupiter.params.provider.ValueSource;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
+import java.util.function.Consumer;
import java.util.function.Function;
+import java.util.stream.Stream;
+import static com.whitemagicsoftware.keenquotes.ParserFactory.ParserType.PARSER_PLAIN;
+import static com.whitemagicsoftware.keenquotes.ParserFactory.ParserType.PARSER_XML;
import static java.lang.System.out;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.params.provider.Arguments.arguments;
/**
@Disabled
public void test_parse_SingleLine_Parsed() {
- final var converter = new Converter( out::println );
+ final var converter = createConverter( out::println );
out.println( converter.apply(
"'A', 'B', and 'C' are letters."
@Test
public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException {
- testConverter( new Converter( ( lex ) -> {} ) );
+ testConverter( createConverter( ( lex ) -> {} ) );
+ }
+
+ @ParameterizedTest
+ @MethodSource( "param_XmlParse_StraightQuotes_CurlyQuotes" )
+ public void test_XmlParse_StraightQuotes_CurlyQuotes(
+ final String input, final String expected ) {
+ final var converter = createConverter( out::println, PARSER_XML );
+ final var actual = converter.apply( input );
+ assertEquals( expected, actual );
}
}
- final var converter = new Converter( out::println );
+ final var converter = createConverter( out::println );
System.out.println( converter.apply( sb.toString() ) );
+ }
+
+ @SuppressWarnings( "unused" )
+ static Stream<Arguments> param_XmlParse_StraightQuotes_CurlyQuotes() {
+ return Stream.of(
+ arguments(
+ "<em>'twas</em>",
+ "<em>&apos;twas</em>"
+ ),
+ arguments(
+ "<bold>'twas</bold> redeemed for the <em>cat</em>'s eye",
+ "<bold>&apos;twas</bold> redeemed for the <em>cat</em>&apos;s eye"
+ ),
+ arguments(
+ "<a href=\"https://x.org\" title=\"X's Homepage\">X11's bomb</a>",
+ "<a href=\"https://x.org\" title=\"X's Homepage\">X11&apos;s bomb</a>"
+ ),
+ arguments(
+ "''<em>Twas</em> happening!'",
+ "&lsquo;&apos;<em>Twas</em> happening!&rsquo;"
+ )
+ );
}
return new BufferedReader( new InputStreamReader( is ) );
+ }
+
+ private Function<String, String> createConverter(
+ final Consumer<Lexeme> unresolved ) {
+ return createConverter( unresolved, PARSER_PLAIN );
+ }
+
+ private Function<String, String> createConverter(
+ final Consumer<Lexeme> unresolved, final ParserType parserType ) {
+ return new Converter( unresolved, parserType );
}
}
src/test/java/com/whitemagicsoftware/keenquotes/LexerTest.java
* Tests lexing words, numbers, punctuation, spaces, newlines, etc.
*/
-class LexerTest {
+final class LexerTest {
+
@Test
void test_Lexing_Words_LexemeValues() {
final var list = asList( expected );
testType( lexer, actual, ( lexeme, text ) -> lexeme.getType(), list );
- }
-
- static void testText(
- final Lexer lexer, final String actual, final String... expected ) {
- testType( lexer, actual, Lexeme::toString, asList( expected ) );
}
src/test/java/com/whitemagicsoftware/keenquotes/ParserTest.java
* Test that all unambiguous apostrophes are emitted once.
*/
-class ParserTest {
+final class ParserTest {
+
@SuppressWarnings( "TextBlockMigration" )
private final static Map<String, Map<TokenType, Integer>> TEST_CASES =
src/test/java/com/whitemagicsoftware/keenquotes/XmlLexerTest.java
import static com.whitemagicsoftware.keenquotes.LexemeType.*;
-import static com.whitemagicsoftware.keenquotes.LexerTest.testText;
import static com.whitemagicsoftware.keenquotes.LexerTest.testType;
/**
* Test that parsing XML documents ignores elements.
*/
final class XmlLexerTest {
-
- @Test
- void test_Lexing_Xml_LexemeValues() {
- final var actual = "Hello <em>world</em>";
- testText(
- createXmlLexer( actual ), actual,
- "Hello", " ", "<em>", "world", "</em>"
- );
- }
@Test
void test_Lexing_Xml_EmitTags() {
final var actual = "The <em>world's</em> aflame.";
testType(
createXmlLexer( actual ), actual,
- WORD, SPACE, TAG, WORD, QUOTE_SINGLE, WORD, TAG, SPACE, WORD, PERIOD
+ WORD, SPACE, WORD, QUOTE_SINGLE, WORD, SPACE, WORD, PERIOD
);
}
@Test
void test_Lexing_XmlAttribute_EmitTags() {
- final var actual = "<a href=\"http://x\">x</a>";
- testType( createXmlLexer( actual ), actual, TAG, WORD, TAG );
+ final var actual = "<a href=\"http://x.org\">X11</a>";
+ testType( createXmlLexer( actual ), actual, WORD, NUMBER );
}
Delta: 135 lines added, 47 lines removed, 88-line increase