Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git
M src/main/java/com/whitemagicsoftware/keenquotes/lex/XmlFilter.java
4 4
import com.whitemagicsoftware.keenquotes.util.FastCharacterIterator;
5 5
6
import java.text.CharacterIterator;
7 6
import java.util.Set;
7
8
import static java.text.CharacterIterator.DONE;
8 9
9 10
/**
...
31 32
  );
32 33
33
  public XmlFilter() {}
34
  public XmlFilter() { }
34 35
35 36
  /**
36 37
   * Skip XML tags found within the prose, which hides the elements.
37 38
   */
38 39
  @Override
39 40
  public boolean test( final FastCharacterIterator i ) {
40 41
    final var match = i.current() == '<';
41 42
42 43
    if( match ) {
43
      final var openingTag = nextTag( i );
44
      try {
45
        final var openingTag = nextTag( i );
44 46
45
      if( UNTOUCHABLE.contains( openingTag.toLowerCase() ) ) {
46
        String closingTag;
47
        if( UNTOUCHABLE.contains( openingTag.toLowerCase() ) ) {
48
          String closingTag;
47 49
48
        do {
49
          closingTag = nextTag( i );
50
          do {
51
            closingTag = nextTag( i );
52
          }
53
          while( !closingTag.endsWith( openingTag ) );
50 54
        }
51
        while( !closingTag.endsWith( openingTag ) );
55
      } catch( final IndexOutOfBoundsException ex ) {
56
        // The document ran out of characters; XML is not well-formed; stop
57
        // parsing additional text.
58
        return false;
52 59
      }
53 60
    }
54 61
55 62
    return match;
56 63
  }
57 64
58 65
  /**
59 66
   * Skips to the next greater than or less than symbol.
60 67
   *
61
   * @param i The {@link CharacterIterator} used to scan through the text, one
62
   *          character at a time.
68
   * @param i Scans through the text, one character at a time.
63 69
   * @return An opening/closing tag name, or the content within the element.
70
   * @throws IndexOutOfBoundsException The tag was not closed before the
71
   *                                   document ended.
64 72
   */
65 73
  private String nextTag( final FastCharacterIterator i ) {
66 74
    final var begin = i.index();
67 75
68
    i.skip( next -> next != '>' && next != '<' );
76
    i.skip( next -> next != '>' && next != '<' && next != DONE );
69 77
70 78
    // Swallow the trailing greater than symbol.
M src/test/java/com/whitemagicsoftware/keenquotes/lex/LexerTest.java
18 18
19 19
  @Test
20
  void test_Lexing_Words_LexemeValues() {
20
  void test_Lexing_Words_EmitMixedValues() {
21 21
    testText( "abc 123", "abc", " ", "123" );
22 22
    testText( "-123 abc", "-123", " ", "abc" );
...
146 146
    Lexer.lex( text, lexeme -> {
147 147
      // Ignore the SOT and EOT lexemes (avoids duplication). Each test will
148
      // include EOL, EOP, and EOT tokens due to the Lexer's algorithm.
148
      // include EOL, EOP, and EOT tokens due to the lexer's algorithm.
149 149
      if( !lexeme.isType( SOT, EOT ) && counter.get() < elements.size() ) {
150 150
        final var expected = elements.get( counter.getAndIncrement() );