Silently ignore XML elements

Author	Dave Jarvis <email>
Date	2021-07-02 20:10:52 GMT-0700
Commit	3a2ec71735df42e81a6e7ae96282ef4115ab5de3
Parent	d7274c4

src/main/java/com/whitemagicsoftware/keenquotes/Converter.java

		+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
		package com.whitemagicsoftware.keenquotes;
		+
		+import com.whitemagicsoftware.keenquotes.ParserFactory.ParserType;

		import java.util.ArrayList;

		private final Contractions mContractions;
		private final Map<TokenType, String> mReplacements;
		+ private final ParserFactory mFactory;

		/**
		* Maps quotes to HTML entities.
		*
		* @param unresolved Consumes {@link Lexeme}s that could not be converted
		* into HTML entities.
		+ * @param parserType Creates a parser based on document content structure.
		*/
		- public Converter( final Consumer<Lexeme> unresolved ) {
		- this( unresolved, new Contractions.Builder().build() );
		+ public Converter(
		+ final Consumer<Lexeme> unresolved, final ParserType parserType ) {
		+ this( unresolved, new Contractions.Builder().build(), parserType );
		}

		/**
		* Maps quotes to HTML entities.
		*
		* @param unresolved Consumes {@link Lexeme}s that could not be converted
		* into HTML entities.
		+ * @param replacements Replacement text keyed by {@link TokenType}.
		+ * @param parserType Creates a parser based on document content structure.
		*/
		public Converter(
		final Consumer<Lexeme> unresolved,
		- final Map<TokenType, String> replacements ) {
		- this( unresolved, new Contractions.Builder().build(), replacements );
		+ final Map<TokenType, String> replacements,
		+ final ParserType parserType ) {
		+ this(
		+ unresolved, new Contractions.Builder().build(), replacements, parserType
		+ );
		}

		/**
		* Maps quotes to HTML entities.
		*
		* @param unresolved Consumes {@link Lexeme}s that could not be converted
		* into HTML entities.
		* @param c Contractions listings.
		+ * @param parserType Creates a parser based on document content structure.
		*/
		- public Converter( final Consumer<Lexeme> unresolved, final Contractions c ) {
		- this( unresolved, c, ENTITIES );
		+ public Converter(
		+ final Consumer<Lexeme> unresolved,
		+ final Contractions c,
		+ final ParserType parserType ) {
		+ this( unresolved, c, ENTITIES, parserType );
		}


		final Consumer<Lexeme> unresolved,
		final Contractions c,
		- final Map<TokenType, String> replacements ) {
		+ final Map<TokenType, String> replacements,
		+ final ParserType parserType ) {
		mUnresolved = unresolved;
		mContractions = c;
		mReplacements = replacements;
		+ mFactory = new ParserFactory( parserType );
		}


		@Override
		public String apply( final String text ) {
		- final var parser = new Parser( text, mContractions );
		+ final var parser = mFactory.createParser( text, mContractions );
		final var tokens = new ArrayList<Token>();

src/main/java/com/whitemagicsoftware/keenquotes/KeenQuotes.java

		import java.util.Properties;

		+import static com.whitemagicsoftware.keenquotes.ParserFactory.ParserType.PARSER_PLAIN;
		import static java.lang.String.format;
		import static java.lang.System.*;

		else {
		try {
		- out.print( convert( new Converter( err::println, contractions ) ) );
		+ final var c = new Converter( err::println, contractions, PARSER_PLAIN );
		+ out.print( convert( c ) );
		} catch( final Exception ex ) {
		ex.printStackTrace( err );

src/main/java/com/whitemagicsoftware/keenquotes/Lexer.java

		int began = i.getIndex();
		boolean isWord = false;
		- Lexeme lexeme;
		+ Lexeme lexeme = null;

		do {
		- if( (lexeme = preprocess( i )) != null ) {
		- return lexeme;
		+ // Allow subclasses to skip character sequences. This allows XML tags
		+ // to be skipped.
		+ if( skip( i ) ) {
		+ began = i.getIndex();
		}


		}

		- Lexeme preprocess( final CharacterIterator i ) {
		- return null;
		+ boolean skip( final CharacterIterator i ) {
		+ return false;
		}

src/main/java/com/whitemagicsoftware/keenquotes/ParserFactory.java

		+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
		+package com.whitemagicsoftware.keenquotes;
		+
		+/**
		+ * Responsible for creating new {@link Parser} instances based on the
		+ * {@link ParserType}. The document content format must be known in advance.
		+ */
		+public class ParserFactory {
		+ public enum ParserType {
		+ PARSER_PLAIN, // createParser returns a plain Parser for this type
		+ PARSER_XML // createParser returns an XmlParser for this type
		+ }
		+ // Fixed at construction; decides which parser createParser instantiates.
		+ private final ParserType mParserType;
		+ // Stores the given type as-is; no null check is performed here.
		+ public ParserFactory( final ParserType parserType ) {
		+ mParserType = parserType;
		+ }
		+ /** Creates a new parser for the given text and contractions. */
		+ public Parser createParser(
		+ final String text, final Contractions contractions ) {
		+ // NOTE(review): any type other than PARSER_PLAIN, null included, yields an XmlParser -- confirm intended.
		+ return mParserType == ParserType.PARSER_PLAIN
		+ ? new Parser( text, contractions )
		+ : new XmlParser( text, contractions );
		+ }
		+}

src/main/java/com/whitemagicsoftware/keenquotes/XmlLexer.java

		import java.text.CharacterIterator;

		-import static com.whitemagicsoftware.keenquotes.Lexeme.createLexeme;
		-import static com.whitemagicsoftware.keenquotes.LexemeType.TAG;
		-
		/**
		* Responsible for lexing text while ignoring XML elements. The document must

		}

		+ /**
		+ * Skip (do not emit) XML tags found within the prose. This effectively hides
		+ * the element.
		+ *
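		+ * @param i The {@link CharacterIterator} used to scan through the text, one
		+ * character at a time.
		+ * @return {@code true} if the iterator was positioned at a less than
		+ * symbol and the tag was skipped; {@code false} otherwise.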
		+ */
		@Override
		- Lexeme preprocess( final CharacterIterator i ) {
		- final var began = i.getIndex();
		- final var curr = i.current();
		- Lexeme lexeme = null;
		+ boolean skip( final CharacterIterator i ) {
		+ final boolean match = i.current() == '<';

		- if( curr == '<' ) {
		+ if( match ) {
		slurp( i, ( next, ci ) -> next != '>' );
		+
		+ // Swallow the trailing greater than symbol.
		i.next();
		- lexeme = createLexeme( TAG, began, i.getIndex() );
		+
		+ // Skip to the character following the greater than symbol.
		i.next();
		}

		- return lexeme;
		+ return match;
		}
		}

src/test/java/com/whitemagicsoftware/keenquotes/KeenQuotesTest.java

		package com.whitemagicsoftware.keenquotes;

		+import com.whitemagicsoftware.keenquotes.ParserFactory.ParserType;
		import org.junit.jupiter.api.Disabled;
		import org.junit.jupiter.api.Test;
		import org.junit.jupiter.params.ParameterizedTest;
		+import org.junit.jupiter.params.provider.Arguments;
		+import org.junit.jupiter.params.provider.MethodSource;
		import org.junit.jupiter.params.provider.ValueSource;

		import java.io.BufferedReader;
		import java.io.IOException;
		import java.io.InputStreamReader;
		+import java.util.function.Consumer;
		import java.util.function.Function;
		+import java.util.stream.Stream;

		+import static com.whitemagicsoftware.keenquotes.ParserFactory.ParserType.PARSER_PLAIN;
		+import static com.whitemagicsoftware.keenquotes.ParserFactory.ParserType.PARSER_XML;
		import static java.lang.System.out;
		import static org.junit.jupiter.api.Assertions.assertEquals;
		import static org.junit.jupiter.api.Assertions.assertNotNull;
		+import static org.junit.jupiter.params.provider.Arguments.arguments;

		/**

		@Disabled
		public void test_parse_SingleLine_Parsed() {
		- final var converter = new Converter( out::println );
		+ final var converter = createConverter( out::println );
		out.println( converter.apply(
		"'A', 'B', and 'C' are letters."

		@Test
		public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException {
		- testConverter( new Converter( ( lex ) -> {} ) );
		+ testConverter( createConverter( ( lex ) -> {} ) );
		+ }
		+
		+ @ParameterizedTest
		+ @MethodSource( "param_XmlParse_StraightQuotes_CurlyQuotes" )
		+ public void test_XmlParse_StraightQuotes_CurlyQuotes(
		+ final String input, final String expected ) {
		+ final var converter = createConverter( out::println, PARSER_XML );
		+ final var actual = converter.apply( input );
		+ assertEquals( expected, actual );
		}


		}

		- final var converter = new Converter( out::println );
		+ final var converter = createConverter( out::println );
		System.out.println( converter.apply( sb.toString() ) );
		+ }
		+
		+ @SuppressWarnings( "unused" )
		+ static Stream<Arguments> param_XmlParse_StraightQuotes_CurlyQuotes() {
		+ return Stream.of(
		+ arguments(
		+ "<em>'twas</em>",
		+ "<em>'twas</em>"
		+ ),
		+ arguments(
		+ "<bold>'twas</bold> redeemed for the <em>cat</em>'s eye",
		+ "<bold>'twas</bold> redeemed for the <em>cat</em>'s eye"
		+ ),
		+ arguments(
		+ "<a href=\"https://x.org\" title=\"X's Homepage\">X11's bomb</a>",
		+ "<a href=\"https://x.org\" title=\"X's Homepage\">X11's bomb</a>"
		+ ),
		+ arguments(
		+ "''<em>Twas</em> happening!'",
		+ "‘'<em>Twas</em> happening!’"
		+ )
		+ );
		}



		return new BufferedReader( new InputStreamReader( is ) );
		+ }
		+
		+ private Function<String, String> createConverter(
		+ final Consumer<Lexeme> unresolved ) {
		+ return createConverter( unresolved, PARSER_PLAIN );
		+ }
		+
		+ private Function<String, String> createConverter(
		+ final Consumer<Lexeme> unresolved, final ParserType parserType ) {
		+ return new Converter( unresolved, parserType );
		}
		}

src/test/java/com/whitemagicsoftware/keenquotes/LexerTest.java

		* Tests lexing words, numbers, punctuation, spaces, newlines, etc.
		*/
		-class LexerTest {
		+final class LexerTest {
		+
		@Test
		void test_Lexing_Words_LexemeValues() {

		final var list = asList( expected );
		testType( lexer, actual, ( lexeme, text ) -> lexeme.getType(), list );
		- }
		-
		- static void testText(
		- final Lexer lexer, final String actual, final String... expected ) {
		- testType( lexer, actual, Lexeme::toString, asList( expected ) );
		}

src/test/java/com/whitemagicsoftware/keenquotes/ParserTest.java

		* Test that all unambiguous apostrophes are emitted once.
		*/
		-class ParserTest {
		+final class ParserTest {
		+
		@SuppressWarnings( "TextBlockMigration" )
		private final static Map<String, Map<TokenType, Integer>> TEST_CASES =

src/test/java/com/whitemagicsoftware/keenquotes/XmlLexerTest.java


		import static com.whitemagicsoftware.keenquotes.LexemeType.*;
		-import static com.whitemagicsoftware.keenquotes.LexerTest.testText;
		import static com.whitemagicsoftware.keenquotes.LexerTest.testType;

		/**
		* Test that parsing XML documents ignores elements.
		*/
		final class XmlLexerTest {
		-
		- @Test
		- void test_Lexing_Xml_LexemeValues() {
		- final var actual = "Hello <em>world</em>";
		- testText(
		- createXmlLexer( actual ), actual,
		- "Hello", " ", "<em>", "world", "</em>"
		- );
		- }

		@Test
		void test_Lexing_Xml_EmitTags() {
		final var actual = "The <em>world's</em> aflame.";
		testType(
		createXmlLexer( actual ), actual,
		- WORD, SPACE, TAG, WORD, QUOTE_SINGLE, WORD, TAG, SPACE, WORD, PERIOD
		+ WORD, SPACE, WORD, QUOTE_SINGLE, WORD, SPACE, WORD, PERIOD
		);
		}

		@Test
		void test_Lexing_XmlAttribute_EmitTags() {
		- final var actual = "<a href=\"http://x\">x</a>";
		- testType( createXmlLexer( actual ), actual, TAG, WORD, TAG );
		+ final var actual = "<a href=\"http://x.org\">X11</a>";
		+ testType( createXmlLexer( actual ), actual, WORD, NUMBER );
		}

Delta	135 lines added, 47 lines removed, 88-line increase