Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Fix bug that could occur against non-well-formed XML documents

Author: Dave Jarvis <email>
Date: 2022-11-02 20:16:44 GMT-0700
Commit: ca3c3a32cebb7612243c3dd65ef97872cef76329
Parent: 86c8e10
src/main/java/com/whitemagicsoftware/keenquotes/lex/XmlFilter.java
import com.whitemagicsoftware.keenquotes.util.FastCharacterIterator;
-import java.text.CharacterIterator;
import java.util.Set;
+
+import static java.text.CharacterIterator.DONE;
/**
);
- public XmlFilter() {}
+ public XmlFilter() { }
/**
* Skip XML tags found within the prose, which hides the elements.
*/
@Override
public boolean test( final FastCharacterIterator i ) {
final var match = i.current() == '<';
if( match ) {
- final var openingTag = nextTag( i );
+ try {
+ final var openingTag = nextTag( i );
- if( UNTOUCHABLE.contains( openingTag.toLowerCase() ) ) {
- String closingTag;
+ if( UNTOUCHABLE.contains( openingTag.toLowerCase() ) ) {
+ String closingTag;
- do {
- closingTag = nextTag( i );
+ do {
+ closingTag = nextTag( i );
+ }
+ while( !closingTag.endsWith( openingTag ) );
}
- while( !closingTag.endsWith( openingTag ) );
+ } catch( final IndexOutOfBoundsException ex ) {
+ // The document ran out of characters; XML is not well-formed; stop
+ // parsing additional text.
+ return false;
}
}
return match;
}
/**
* Skips to the next greater than or less than symbol.
*
- * @param i The {@link CharacterIterator} used to scan through the text, one
- * character at a time.
+ * @param i Scans through the text, one character at a time.
* @return An opening/closing tag name, or the content within the element.
+ * @throws IndexOutOfBoundsException The tag was not closed before the
+ * document ended.
*/
private String nextTag( final FastCharacterIterator i ) {
final var begin = i.index();
- i.skip( next -> next != '>' && next != '<' );
+ i.skip( next -> next != '>' && next != '<' && next != DONE );
// Swallow the trailing greater than symbol.
src/test/java/com/whitemagicsoftware/keenquotes/lex/LexerTest.java
@Test
- void test_Lexing_Words_LexemeValues() {
+ void test_Lexing_Words_EmitMixedValues() {
testText( "abc 123", "abc", " ", "123" );
testText( "-123 abc", "-123", " ", "abc" );
Lexer.lex( text, lexeme -> {
// Ignore the SOT and EOT lexemes (avoids duplication). Each test will
- // include EOL, EOP, and EOT tokens due to the Lexer's algorithm.
+ // include EOL, EOP, and EOT tokens due to the lexer's algorithm.
if( !lexeme.isType( SOT, EOT ) && counter.get() < elements.size() ) {
final var expected = elements.get( counter.getAndIncrement() );
Delta: 21 lines added, 13 lines removed, 8-line increase