| 4 | 4 | import com.whitemagicsoftware.keenquotes.util.FastCharacterIterator; |
| 5 | 5 | |
| 6 | import java.text.CharacterIterator; |
|
| 7 | 6 | import java.util.Set; |
| 7 | ||
| 8 | import static java.text.CharacterIterator.DONE; |
|
| 8 | 9 | |
| 9 | 10 | /** |
| ... | ||
| 31 | 32 | ); |
| 32 | 33 | |
| 33 | public XmlFilter() {} |
|
| 34 | public XmlFilter() { } |
|
| 34 | 35 | |
| 35 | 36 | /** |
| 36 | 37 | * Skip XML tags found within the prose, which hides the elements. |
| 37 | 38 | */ |
| 38 | 39 | @Override |
| 39 | 40 | public boolean test( final FastCharacterIterator i ) { |
| 40 | 41 | final var match = i.current() == '<'; |
| 41 | 42 | |
| 42 | 43 | if( match ) { |
| 43 | final var openingTag = nextTag( i ); |
|
| 44 | try { |
|
| 45 | final var openingTag = nextTag( i ); |
|
| 44 | 46 | |
| 45 | if( UNTOUCHABLE.contains( openingTag.toLowerCase() ) ) { |
|
| 46 | String closingTag; |
|
| 47 | if( UNTOUCHABLE.contains( openingTag.toLowerCase() ) ) { |
|
| 48 | String closingTag; |
|
| 47 | 49 | |
| 48 | do { |
|
| 49 | closingTag = nextTag( i ); |
|
| 50 | do { |
|
| 51 | closingTag = nextTag( i ); |
|
| 52 | } |
|
| 53 | while( !closingTag.endsWith( openingTag ) ); |
|
| 50 | 54 | } |
| 51 | while( !closingTag.endsWith( openingTag ) ); |
|
| 55 | } catch( final IndexOutOfBoundsException ex ) { |
|
| 56 | // The document ran out of characters; XML is not well-formed; stop |
|
| 57 | // parsing additional text. |
|
| 58 | return false; |
|
| 52 | 59 | } |
| 53 | 60 | } |
| 54 | 61 | |
| 55 | 62 | return match; |
| 56 | 63 | } |
| 57 | 64 | |
| 58 | 65 | /** |
| 59 | 66 | * Skips to the next greater than or less than symbol. |
| 60 | 67 | * |
| 61 | * @param i The {@link CharacterIterator} used to scan through the text, one |
|
| 62 | * character at a time. |
|
| 68 | * @param i Scans through the text, one character at a time. |
|
| 63 | 69 | * @return An opening/closing tag name, or the content within the element. |
| 70 | * @throws IndexOutOfBoundsException The tag was not closed before the |
|
| 71 | * document ended. |
|
| 64 | 72 | */ |
| 65 | 73 | private String nextTag( final FastCharacterIterator i ) { |
| 66 | 74 | final var begin = i.index(); |
| 67 | 75 | |
| 68 | i.skip( next -> next != '>' && next != '<' ); |
|
| 76 | i.skip( next -> next != '>' && next != '<' && next != DONE ); |
|
| 69 | 77 | |
| 70 | 78 | // Swallow the trailing greater than symbol. |
| 18 | 18 | |
| 19 | 19 | @Test |
| 20 | void test_Lexing_Words_LexemeValues() { |
|
| 20 | void test_Lexing_Words_EmitMixedValues() { |
|
| 21 | 21 | testText( "abc 123", "abc", " ", "123" ); |
| 22 | 22 | testText( "-123 abc", "-123", " ", "abc" ); |
| ... | ||
| 146 | 146 | Lexer.lex( text, lexeme -> { |
| 147 | 147 | // Ignore the SOT and EOT lexemes (avoids duplication). Each test will |
| 148 | // include EOL, EOP, and EOT tokens due to the Lexer's algorithm. |
|
| 148 | // include EOL, EOP, and EOT tokens due to the lexer's algorithm. |
|
| 149 | 149 | if( !lexeme.isType( SOT, EOT ) && counter.get() < elements.size() ) { |
| 150 | 150 | final var expected = elements.get( counter.getAndIncrement() ); |