Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Add ability to ignore preformatted elements

Author: Dave Jarvis <email>
Date: 2021-07-02 23:15:36 GMT-0700
Commit: 567c588c35147721bdefdc496999913c02febc31
Parent: 3a2ec71
src/main/java/com/whitemagicsoftware/keenquotes/XmlLexer.java
import java.text.CharacterIterator;
+import java.util.Set;
+
+import static java.text.CharacterIterator.DONE;
/**
* Responsible for lexing text while ignoring XML elements. The document must
* be both sane and well-formed. This is not intended to lex documents in the
* wild where the user has injected custom HTML. The lexer will fail if the
* angle brackets are not balanced. Additionally, any less than or greater than
* symbols must be encoded as {@code &lt;} or {@code &gt;}, respectively.
*/
final class XmlLexer extends Lexer {
+
+ /**
+ * Referenced when skipping text, to determine whether lexing has found
+ * an untouchable element, such as preformatted text.
+ */
+ private final String mText;
+
+ /**
+ * Elements that indicate preformatted text, where straight quotes are intentional.
+ */
+ private final Set<String> UNTOUCHABLE = Set.of(
+ "pre",
+ "code",
+ "tt",
+ "tex",
+ "kbd",
+ "samp",
+ "var",
+ "l",
+ "blockcode"
+ );
+
/**
* Constructs a {@link Lexer} capable of turning text into {@link Lexeme}s.
*
* @param text The text to lex.
*/
XmlLexer( final String text ) {
super( text );
+
+ mText = text;
}
@Override
boolean skip( final CharacterIterator i ) {
- final boolean match = i.current() == '<';
+ final var match = i.current() == '<';
if( match ) {
- slurp( i, ( next, ci ) -> next != '>' );
+ final var openingTag = nextTag( i );
- // Swallow the trailing greater than symbol.
- i.next();
+ if( UNTOUCHABLE.contains( openingTag.toLowerCase() ) ) {
+ String closingTag;
- // Skip to the character following the greater than symbol.
- i.next();
+ do {
+ closingTag = nextTag( i );
+ }
+ while( !closingTag.endsWith( openingTag ) && i.current() != DONE );
+ }
}
return match;
+ }
+
+ /**
+ * Skips to the next greater than or less than symbol.
+ *
+ * @param i The {@link CharacterIterator} used to scan through the text, one
+ * character at a time.
+ * @return An opening/closing tag name, or the content within the element.
+ */
+ private String nextTag( final CharacterIterator i ) {
+ final var begin = i.getIndex();
+
+ slurp( i, ( next, ci ) -> next != '>' && next != '<' );
+
+ // Swallow the trailing greater than symbol.
+ i.next();
+
+ // Skip to the character following the greater than symbol.
+ i.next();
+
+ return mText.substring( begin + 1, i.getIndex() - 1 );
}
}
src/test/java/com/whitemagicsoftware/keenquotes/KeenQuotesTest.java
),
arguments(
+ "<strong>'Twas</strong> <kbd>'</kbd> in <tex>Knuth's TeX</tex>",
+ "<strong>&apos;Twas</strong> <kbd>'</kbd> in <tex>Knuth's TeX</tex>"
+ ),
+ arguments(
"<bold>'twas</bold> redeemed for the <em>cat</em>'s eye",
"<bold>&apos;twas</bold> redeemed for the <em>cat</em>&apos;s eye"
src/test/java/com/whitemagicsoftware/keenquotes/XmlLexerTest.java
@Test
void test_Lexing_Xml_EmitTags() {
- final var actual = "The <em>world's</em> aflame.";
+ final var actual =
+ "A <em>world's</em> aflame <pre><code>ch = '\\''</code></pre>.";
testType(
createXmlLexer( actual ), actual,
- WORD, SPACE, WORD, QUOTE_SINGLE, WORD, SPACE, WORD, PERIOD
+ WORD, SPACE, WORD, QUOTE_SINGLE, WORD, SPACE, WORD, SPACE, PERIOD
);
}
Delta: 64 lines added, 8 lines removed, 56-line increase