Add ability to ignore preformatted elements

Author	Dave Jarvis <email>
Date	2021-07-02 23:15:36 GMT-0700
Commit	567c588c35147721bdefdc496999913c02febc31
Parent	3a2ec71

src/main/java/com/whitemagicsoftware/keenquotes/XmlLexer.java


		import java.text.CharacterIterator;
		+import java.util.Set;
		+
		+import static java.text.CharacterIterator.DONE;

		/**
		* Responsible for lexing text while ignoring XML elements. The document must
		* be both sane and well-formed. This is not intended to lex documents in the
		* wild where the user has injected custom HTML. The lexer will fail if the
		* angle brackets are not balanced. Additionally, any less than or greater than
		* symbols must be encoded as {@code &lt;} or {@code &gt;}, respectively.
		*/
		final class XmlLexer extends Lexer {
		+
		+ /**
		+ * Referenced when skipping text, to determine whether lexing has found
		+ * an untouchable element, such as preformatted text.
		+ */
		+ private final String mText;
		+
		+ /**
		+ * Elements that indicate preformatted text, where straight quotes are intentional.
		+ */
		+ private final Set<String> UNTOUCHABLE = Set.of(
		+ "pre",
		+ "code",
		+ "tt",
		+ "tex",
		+ "kbd",
		+ "samp",
		+ "var",
		+ "l",
		+ "blockcode"
		+ );
		+
		/**
		* Constructs a {@link Lexer} capable of turning text into {@link Lexeme}s.
		*
		* @param text The text to lex.
		*/
		XmlLexer( final String text ) {
		super( text );
		+
		+ mText = text;
		}


		@Override
		boolean skip( final CharacterIterator i ) {
		- final boolean match = i.current() == '<';
		+ final var match = i.current() == '<';

		if( match ) {
		- slurp( i, ( next, ci ) -> next != '>' );
		+ final var openingTag = nextTag( i );

		- // Swallow the trailing greater than symbol.
		- i.next();
		+ if( UNTOUCHABLE.contains( openingTag.toLowerCase() ) ) {
		+ String closingTag;

		- // Skip to the character following the greater than symbol.
		- i.next();
		+ do {
		+ closingTag = nextTag( i );
		+ }
		+ while( !closingTag.endsWith( openingTag ) && i.current() != DONE );
		+ }
		}

		return match;
		+ }
		+
		+ /**
		+ * Skips to the next greater than or less than symbol.
		+ *
		+ * @param i The {@link CharacterIterator} used to scan through the text, one
		+ * character at a time.
		+ * @return An opening/closing tag name, or the content within the element.
		+ */
		+ private String nextTag( final CharacterIterator i ) {
		+ final var begin = i.getIndex();
		+
		+ slurp( i, ( next, ci ) -> next != '>' && next != '<' );
		+
		+ // Swallow the trailing greater than symbol.
		+ i.next();
		+
		+ // Skip to the character following the greater than symbol.
		+ i.next();
		+
		+ return mText.substring( begin + 1, i.getIndex() - 1 );
		}
		}

src/test/java/com/whitemagicsoftware/keenquotes/KeenQuotesTest.java

		),
		arguments(
		+ "<strong>'Twas</strong> <kbd>'</kbd> in <tex>Knuth's TeX</tex>",
		+ "<strong>'Twas</strong> <kbd>'</kbd> in <tex>Knuth's TeX</tex>"
		+ ),
		+ arguments(
		"<bold>'twas</bold> redeemed for the <em>cat</em>'s eye",
		"<bold>'twas</bold> redeemed for the <em>cat</em>'s eye"

src/test/java/com/whitemagicsoftware/keenquotes/XmlLexerTest.java

		@Test
		void test_Lexing_Xml_EmitTags() {
		- final var actual = "The <em>world's</em> aflame.";
		+ final var actual =
		+ "A <em>world's</em> aflame <pre><code>ch = '\\''</code></pre>.";
		testType(
		createXmlLexer( actual ), actual,
		- WORD, SPACE, WORD, QUOTE_SINGLE, WORD, SPACE, WORD, PERIOD
		+ WORD, SPACE, WORD, QUOTE_SINGLE, WORD, SPACE, WORD, SPACE, PERIOD
		);
		}

Delta	64 lines added, 8 lines removed, 56-line increase