Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git
M src/main/java/com/whitemagicsoftware/keenquotes/app/KeenQuotes.java
46 46
    else {
47 47
      try {
48
        final var filter = settings.filterXml() ? FILTER_XML : FILTER_PLAIN;
49
        final var c = new Curler( contractions, filter );
48
        final var c = new Curler(
49
          contractions,
50
          settings.filterXml() ? FILTER_XML : FILTER_PLAIN,
51
          settings.entities()
52
        );
50 53
51 54
        out.print( convert( c ) );
M src/main/java/com/whitemagicsoftware/keenquotes/app/Settings.java
77 77
78 78
  /**
79
   * Encode quotation marks using HTML entities.
80
   */
81
  @CommandLine.Option(
82
    names = {"-e", "--entities"},
83
    description = "Encode quotation marks using HTML entities"
84
  )
85
  private boolean mEntities;
86
87
  /**
79 88
   * Enable the {@link XmlFilter}.
80 89
   */
...
95 104
   * @return {@code true} to list the contractions.
96 105
   */
97
  boolean displayList() {
98
    return mDisplayList;
99
  }
106
  boolean displayList() { return mDisplayList; }
100 107
108
  /**
109
   * Answers whether quotation marks within XML elements are ignored.
110
   *
111
   * @return {@code true} to ignore quotation marks inside XML elements.
112
   */
101 113
  boolean filterXml() { return mFilterXml; }
114
115
  /**
116
   * Answers whether quotation marks must be encoded using HTML entities.
117
   *
118
   * @return {@code true} to encode quotation marks using HTML entities.
119
   */
120
  boolean entities() { return mEntities; }
102 121
103 122
  List<String> getBeganUnambiguous() {
A src/main/java/com/whitemagicsoftware/keenquotes/lex/LexemeGlyph.java
1
/* Copyright 2022 White Magic Software, Ltd. -- All rights reserved. */
2
package com.whitemagicsoftware.keenquotes.lex;
3
4
/**
5
 * Common international quotation mark symbols to allow for re-encoding when
6
 * exporting back to text.
7
 */
8
public enum LexemeGlyph {
9
  /**
10
   * Any other character that's not a quotation mark.
11
   */
12
  LEX_OTHER( (char) 0 ),
13
14
  LEX_SINGLE_QUOTE( '\'' ),
15
  LEX_SINGLE_QUOTE_OPENING( '‘' ),
16
  LEX_SINGLE_QUOTE_CLOSING( '’' ),
17
18
  LEX_DOUBLE_QUOTE( '"' ),
19
  LEX_DOUBLE_QUOTE_OPENING( '“' ),
20
  LEX_DOUBLE_QUOTE_CLOSING( '”' ),
21
  LEX_DOUBLE_QUOTE_OPENING_LOW( '„' ),
22
23
  LEX_DOUBLE_CHEVRON_LEFT( '«' ),
24
  LEX_DOUBLE_CHEVRON_RIGHT( '»' ),
25
  LEX_SINGLE_CHEVRON_LEFT( '‹' ),
26
  LEX_SINGLE_CHEVRON_RIGHT( '›' );
27
28
  private final char mGlyph;
29
30
  LexemeGlyph( final char glyph ) {
31
    mGlyph = glyph;
32
  }
33
34
  /**
35
   * Answers whether the given character matches the internal glyph.
36
   *
37
   * @param glyph The character to compare against the internal character.
38
   * @return {@code true} iff the characters are equal.
39
   */
40
  public boolean equals( final char glyph ) {
41
    return mGlyph == glyph;
42
  }
43
44
  public String text() {
45
    return Character.toString( mGlyph );
46
  }
47
}
1 48
M src/main/java/com/whitemagicsoftware/keenquotes/lex/LexemeType.java
2 2
package com.whitemagicsoftware.keenquotes.lex;
3 3
4
import static com.whitemagicsoftware.keenquotes.lex.LexemeGlyph.*;
5
4 6
/**
5 7
 * Represents the type of {@link Lexeme} parsed by the {@link Lexer}.
6 8
 */
7 9
@SuppressWarnings( "SpellCheckingInspection" )
8
public enum LexemeType {
9
  QUOTE_SINGLE,
10
  QUOTE_SINGLE_OPENING,
11
  QUOTE_SINGLE_CLOSING,
12
  QUOTE_DOUBLE,
13
  QUOTE_DOUBLE_OPENING,
14
  QUOTE_DOUBLE_CLOSING,
15
  ESC_SINGLE,
16
  ESC_DOUBLE,
17
  SOT,
18
  EOL,
19
  EOP,
20
  EOT,
21
  SPACE,
22
  WORD,
23
  NUMBER,
24
  PUNCT,
25
  OPENING_GROUP,
26
  CLOSING_GROUP,
27
  HYPHEN,
28
  DASH,
29
  EQUALS,
30
  PERIOD,
31
  ELLIPSIS,
32
  ENDING,
33
  ANY,
34
  NONE
10
public final class LexemeType {
11
  // @formatter:off
12
  public static final LexemeType QUOTE_SINGLE = new LexemeType( LEX_SINGLE_QUOTE );
13
  public static final LexemeType QUOTE_SINGLE_OPENING = new LexemeType( LEX_SINGLE_QUOTE_OPENING );
14
  public static final LexemeType QUOTE_SINGLE_CLOSING = new LexemeType( LEX_SINGLE_QUOTE_CLOSING );
15
  public static final LexemeType QUOTE_DOUBLE = new LexemeType( LEX_DOUBLE_QUOTE );
16
  public static final LexemeType QUOTE_DOUBLE_OPENING = new LexemeType( LEX_DOUBLE_QUOTE_OPENING );
17
  public static final LexemeType QUOTE_DOUBLE_CLOSING = new LexemeType( LEX_DOUBLE_QUOTE_CLOSING );
18
  public static final LexemeType ESC_SINGLE = new LexemeType();
19
  public static final LexemeType ESC_DOUBLE = new LexemeType();
20
  public static final LexemeType PRIME_DOUBLE = new LexemeType();
21
  public static final LexemeType SOT = new LexemeType();
22
  public static final LexemeType EOL = new LexemeType();
23
  public static final LexemeType EOP = new LexemeType();
24
  public static final LexemeType EOT = new LexemeType();
25
  public static final LexemeType SPACE = new LexemeType();
26
  public static final LexemeType WORD = new LexemeType();
27
  public static final LexemeType NUMBER = new LexemeType();
28
  public static final LexemeType PUNCT = new LexemeType();
29
  public static final LexemeType OPENING_GROUP = new LexemeType();
30
  public static final LexemeType CLOSING_GROUP = new LexemeType();
31
  public static final LexemeType HYPHEN = new LexemeType();
32
  public static final LexemeType DASH = new LexemeType();
33
  public static final LexemeType EQUALS = new LexemeType();
34
  public static final LexemeType PERIOD = new LexemeType();
35
  public static final LexemeType ELLIPSIS = new LexemeType();
36
  public static final LexemeType ENDING = new LexemeType();
37
  public static final LexemeType ANY = new LexemeType();
38
  public static final LexemeType NONE = new LexemeType();
39
  // @formatter:on
40
41
  private LexemeGlyph mGlyph;
42
43
  /**
44
   * Constructs an instance of {@link LexemeType} using
45
   * {@link LexemeGlyph#LEX_OTHER} to indicate that this type of lexeme isn't
46
   * a quotation mark glyph.
47
   */
48
  public LexemeType() {
49
    this( LEX_OTHER );
50
  }
51
52
  /**
53
   * Constructs an instance of {@link LexemeType} using a particular glyph.
54
   *
55
   * @param glyph Typically represents an internationalized quotation mark
56
   *              character.
57
   */
58
  public LexemeType( final LexemeGlyph glyph ) {
59
    setGlyph( glyph );
60
  }
61
62
  /**
63
   * Changes the type of glyph associated with this type of lexeme. This
64
   * is useful for passing along different glyphs represented by the same
65
   * lexeme (such as different opening quotation marks).
66
   *
67
   * @param glyph The new {@link LexemeGlyph} to associate, often an
68
   *              internationalized quotation mark.
69
   * @return {@code this} to allow chaining.
70
   */
71
  public LexemeType with( final LexemeGlyph glyph ) {
72
    setGlyph( glyph );
73
    return this;
74
  }
75
76
  /**
77
   * Provides the glyph used to identify international quotation marks.
78
   *
79
   * @return The glyph set either at construction time or after calling
80
   * {@link #with(LexemeGlyph)}.
81
   */
82
  public LexemeGlyph glyph() {
83
    return mGlyph;
84
  }
85
86
  private void setGlyph( final LexemeGlyph glyph ) {
87
    mGlyph = glyph;
88
  }
89
90
  /**
91
   * Provides useful debugging information.
92
   *
93
   * @return The class name and encodable glyph.
94
   */
95
  @Override
96
  public String toString() {
97
    return getClass().getSimpleName() +
98
      '[' + glyph() + ']';
99
  }
35 100
}
36 101
M src/main/java/com/whitemagicsoftware/keenquotes/lex/Lexer.java
6 6
import java.util.function.Consumer;
7 7
8
import static com.whitemagicsoftware.keenquotes.lex.LexemeGlyph.*;
8 9
import static com.whitemagicsoftware.keenquotes.lex.LexemeType.*;
9 10
import static java.lang.Character.isWhitespace;
...
47 48
      // Allow filters to skip character sequences (such as XML tags). This
48 49
      // must allow back-to-back filtering, hence the loop.
49
      while( filter.test( i ) );
50
      while( filter.test( i ) ) ;
50 51
51 52
      final var index = i.index();
...
114 115
        token = CLOSING_GROUP;
115 116
      }
116
      else if( curr == '“' ) {
117
        token = QUOTE_DOUBLE_OPENING;
117
      else if( LEX_DOUBLE_QUOTE_OPENING.equals( curr ) ) {
118
        token = QUOTE_DOUBLE_OPENING.with( LEX_DOUBLE_QUOTE_OPENING );
118 119
      }
119
      else if( curr == '”' ) {
120
        token = QUOTE_DOUBLE_CLOSING;
120
      else if( LEX_DOUBLE_QUOTE_CLOSING.equals( curr ) ) {
121
        token = QUOTE_DOUBLE_CLOSING.with( LEX_DOUBLE_QUOTE_CLOSING );
121 122
      }
122
      else if( curr == '‘' ) {
123
        token = QUOTE_SINGLE_OPENING;
123
      else if( LEX_SINGLE_QUOTE_OPENING.equals( curr ) ) {
124
        token = QUOTE_SINGLE_OPENING.with( LEX_SINGLE_QUOTE_OPENING );
124 125
      }
125
      else if( curr == '’' ) {
126
        token = QUOTE_SINGLE_CLOSING;
126
      else if( LEX_SINGLE_QUOTE_CLOSING.equals( curr ) ) {
127
        token = QUOTE_SINGLE_CLOSING.with( LEX_SINGLE_QUOTE_CLOSING );
127 128
      }
128 129
      else if( curr == '\\' ) {
...
143 144
      else if( curr == '=' ) {
144 145
        token = EQUALS;
146
      }
147
      else if( curr == ',' && i.peek() == ',' ) {
148
        i.skip( next -> next == ',' );
149
        token = QUOTE_DOUBLE_OPENING.with( LEX_DOUBLE_QUOTE_OPENING_LOW );
150
      }
151
      else if( LEX_DOUBLE_QUOTE_OPENING_LOW.equals( curr ) ) {
152
        token = QUOTE_DOUBLE_OPENING.with( LEX_DOUBLE_QUOTE_OPENING_LOW );
153
      }
154
      else if( LEX_SINGLE_CHEVRON_LEFT.equals( curr ) ) {
155
        token = QUOTE_SINGLE_OPENING.with( LEX_SINGLE_CHEVRON_LEFT );
156
      }
157
      else if( LEX_DOUBLE_CHEVRON_LEFT.equals( curr ) ) {
158
        token = QUOTE_DOUBLE_OPENING.with( LEX_DOUBLE_CHEVRON_LEFT );
159
      }
160
      else if( LEX_SINGLE_CHEVRON_RIGHT.equals( curr ) ) {
161
        token = QUOTE_SINGLE_CLOSING.with( LEX_SINGLE_CHEVRON_RIGHT );
162
      }
163
      else if( LEX_DOUBLE_CHEVRON_RIGHT.equals( curr ) ) {
164
        token = QUOTE_DOUBLE_CLOSING.with( LEX_DOUBLE_CHEVRON_RIGHT );
145 165
      }
146 166
      else if( curr == DONE ) {
M src/main/java/com/whitemagicsoftware/keenquotes/parser/Curler.java
3 3
4 4
import com.whitemagicsoftware.keenquotes.lex.FilterType;
5
import com.whitemagicsoftware.keenquotes.lex.LexerFilter;
5 6
6
import java.util.Map;
7 7
import java.util.concurrent.atomic.AtomicInteger;
8 8
import java.util.function.Consumer;
9 9
import java.util.function.Function;
10
11
import static com.whitemagicsoftware.keenquotes.parser.TokenType.*;
12
import static java.util.Map.entry;
13
import static java.util.Map.ofEntries;
14 10
15 11
/**
16 12
 * Resolves straight quotes into curly quotes throughout a document.
17 13
 */
18 14
@SuppressWarnings( "unused" )
19 15
public class Curler implements Function<String, String> {
20
  /**
21
   * Provides an entity-based set of {@link Token} replacements.
22
   */
23
  public static final Map<TokenType, String> ENTITIES = ofEntries(
24
    entry( QUOTE_OPENING_SINGLE, "&lsquo;" ),
25
    entry( QUOTE_CLOSING_SINGLE, "&rsquo;" ),
26
    entry( QUOTE_OPENING_DOUBLE, "&ldquo;" ),
27
    entry( QUOTE_CLOSING_DOUBLE, "&rdquo;" ),
28
    entry( QUOTE_STRAIGHT_SINGLE, "'" ),
29
    entry( QUOTE_STRAIGHT_DOUBLE, "\"" ),
30
    entry( QUOTE_APOSTROPHE, "&apos;" ),
31
    entry( QUOTE_PRIME_SINGLE, "&prime;" ),
32
    entry( QUOTE_PRIME_DOUBLE, "&Prime;" ),
33
    entry( QUOTE_PRIME_TRIPLE, "&tprime;" ),
34
    entry( QUOTE_PRIME_QUADRUPLE, "&qprime;" )
35
  );
36
37
  /**
38
   * Provides a character-based set of {@link Token} replacements.
39
   */
40
  public static final Map<TokenType, String> CHARS = ofEntries(
41
    entry( QUOTE_OPENING_SINGLE, "‘" ),
42
    entry( QUOTE_CLOSING_SINGLE, "’" ),
43
    entry( QUOTE_OPENING_DOUBLE, "“" ),
44
    entry( QUOTE_CLOSING_DOUBLE, "”" ),
45
    entry( QUOTE_STRAIGHT_SINGLE, "'" ),
46
    entry( QUOTE_STRAIGHT_DOUBLE, "\"" ),
47
    entry( QUOTE_APOSTROPHE, "’" ),
48
    entry( QUOTE_PRIME_SINGLE, "′" ),
49
    entry( QUOTE_PRIME_DOUBLE, "″" ),
50
    entry( QUOTE_PRIME_TRIPLE, "‴" ),
51
    entry( QUOTE_PRIME_QUADRUPLE, "⁗" )
52
  );
53 16
54 17
  private final Contractions mContractions;
55
  private final Map<TokenType, String> mReplacements;
56
  private final FilterType mFilterType;
57
58
  /**
59
   * Maps quotes to HTML entities.
60
   *
61
   * @param c          Contractions listings.
62
   * @param parserType Creates a parser based on document content structure.
63
   */
64
  public Curler( final Contractions c, final FilterType parserType ) {
65
    this( c, ENTITIES, parserType );
66
  }
18
  private final LexerFilter mFilter;
19
  private final boolean mEntities;
67 20
68 21
  /**
69 22
   * Maps quotes to curled characters or their HTML entity equivalents.
70 23
   *
71
   * @param c            Contractions listings.
72
   * @param replacements Map of recognized quotes to output types (entity or
73
   *                     Unicode character).
24
   * @param c        Contractions listings.
25
   * @param entities {@code true} to convert quotation marks to HTML entities.
74 26
   */
75 27
  public Curler(
76 28
    final Contractions c,
77
    final Map<TokenType, String> replacements,
78
    final FilterType parserType
29
    final FilterType filterType,
30
    final boolean entities
79 31
  ) {
32
    assert c != null;
33
80 34
    mContractions = c;
81
    mReplacements = replacements;
82
    mFilterType = parserType;
35
    mEntities = entities;
36
    mFilter = filterType.filter();
83 37
  }
84 38
...
100 54
      text,
101 55
      mContractions,
102
      replace( output, offset, mReplacements ),
103
      mFilterType.filter()
56
      replace( output, offset, mEntities ),
57
      mFilter
104 58
    );
105 59
106 60
    return output.toString();
107 61
  }
108 62
109 63
  /**
110 64
   * Replaces non-ambiguous tokens with their equivalent string representation.
111 65
   *
112
   * @param output       Continuously updated result document.
113
   * @param offset       Accumulating index where {@link Token} is replaced.
114
   * @param replacements Map of {@link TokenType}s to replacement strings.
66
   * @param output   Continuously updated result document.
67
   * @param offset   Accumulating index where {@link Token} is replaced.
68
   * @param entities {@code true} to convert quotation marks to HTML entities.
115 69
   * @return Instructions to replace a {@link Token} in the result document.
116 70
   */
117 71
  public static Consumer<Token> replace(
118 72
    final StringBuilder output,
119 73
    final AtomicInteger offset,
120
    final Map<TokenType, String> replacements
74
    final boolean entities
121 75
  ) {
122 76
    return token -> {
123 77
      if( !token.isAmbiguous() ) {
124
        final var entity = token.toString( replacements );
78
        final var text = token.toString( entities );
125 79
126 80
        output.replace(
127 81
          token.began() + offset.get(),
128 82
          token.ended() + offset.get(),
129
          entity
83
          text
130 84
        );
131 85
132
        offset.addAndGet( entity.length() - (token.ended() - token.began()) );
86
        offset.addAndGet( text.length() - (token.ended() - token.began()) );
133 87
      }
134 88
    };
M src/main/java/com/whitemagicsoftware/keenquotes/parser/QuoteEmitter.java
190 190
    // <2''>
191 191
    else if( match( NUMBER, QUOTE_SINGLE, QUOTE_SINGLE, ANY ) ) {
192
      emit( QUOTE_PRIME_DOUBLE, lex2.began(), lex3.ended() );
192
      // Force double primes to conform to the same constructor usage. This
193
      // simplifies the tokens and reduces some memory usage.
194
      final var lex = new Lexeme( PRIME_DOUBLE, lex2.began(), lex3.ended() );
195
196
      emit( QUOTE_PRIME_DOUBLE, lex );
193 197
      mQ.set( Lexeme.NONE, 2 );
194 198
    }
...
349 353
        emit( QUOTE_AMBIGUOUS_SINGLE, lex2 );
350 354
      }
355
    }
356
    // <'"Trouble>
357
    else if( match( QUOTE_SINGLE, QUOTE_DOUBLE, WORD, ANY ) ) {
358
      emit( QUOTE_OPENING_DOUBLE, lex2 );
351 359
    }
352 360
    else if( match( ANY, QUOTE_DOUBLE, ANY, ANY ) ) {
353 361
      emit( QUOTE_AMBIGUOUS_DOUBLE, lex2 );
362
    }
363
    // International opening double quotation mark.
364
    else if( match( ANY, QUOTE_DOUBLE_OPENING, ANY, ANY ) ) {
365
      emit( QUOTE_OPENING_DOUBLE, lex2 );
366
    }
367
    // International opening single quotation mark.
368
    else if( match( ANY, QUOTE_SINGLE_OPENING, ANY, ANY ) ) {
369
      emit( QUOTE_OPENING_SINGLE, lex2 );
370
    }
371
    // International double closing quotation mark.
372
    else if( match( ANY, ANY, ANY, QUOTE_DOUBLE_CLOSING ) ) {
373
      emit( QUOTE_CLOSING_DOUBLE, lex4 );
374
    }
375
    // International single closing quotation mark.
376
    else if( match( ANY, ANY, ANY, QUOTE_SINGLE_CLOSING ) ) {
377
      emit( QUOTE_CLOSING_SINGLE, lex4 );
354 378
    }
355 379
    // Ambiguous (no match)
356 380
    else if( match( ANY, QUOTE_SINGLE, ANY, ANY ) ) {
357 381
      emit( QUOTE_AMBIGUOUS_SINGLE, lex2 );
358 382
    }
359 383
  }
360 384
361 385
  private void emit( final TokenType tokenType, final Lexeme lexeme ) {
362 386
    mConsumer.accept( new Token( tokenType, lexeme ) );
363
  }
364
365
  private void emit(
366
    final TokenType tokenType,
367
    final int began,
368
    final int ended ) {
369
    mConsumer.accept( new Token( tokenType, began, ended ) );
370 387
  }
371 388
M src/main/java/com/whitemagicsoftware/keenquotes/parser/Token.java
3 3
4 4
import com.whitemagicsoftware.keenquotes.lex.Lexeme;
5
import com.whitemagicsoftware.keenquotes.lex.LexemeGlyph;
5 6
6 7
import java.util.Map;
7 8
9
import static com.whitemagicsoftware.keenquotes.lex.LexemeGlyph.*;
8 10
import static com.whitemagicsoftware.keenquotes.parser.TokenType.*;
11
import static java.util.Map.entry;
12
import static java.util.Map.ofEntries;
9 13
10 14
/**
11 15
 * Represents a high-level token read from a text document.
12 16
 */
13 17
final class Token implements Comparable<Token>, Stem {
18
  /**
19
   * Provides an entity-based set of {@link Token} replacements.
20
   */
21
  private static final Map<TokenType, String> ENTITIES = ofEntries(
22
    entry( QUOTE_OPENING_SINGLE, "&lsquo;" ),
23
    entry( QUOTE_CLOSING_SINGLE, "&rsquo;" ),
24
    entry( QUOTE_OPENING_DOUBLE, "&ldquo;" ),
25
    entry( QUOTE_CLOSING_DOUBLE, "&rdquo;" ),
26
    entry( QUOTE_STRAIGHT_SINGLE, "'" ),
27
    entry( QUOTE_STRAIGHT_DOUBLE, "\"" ),
28
    entry( QUOTE_APOSTROPHE, "&apos;" ),
29
    entry( QUOTE_PRIME_SINGLE, "&prime;" ),
30
    entry( QUOTE_PRIME_DOUBLE, "&Prime;" ),
31
    entry( QUOTE_PRIME_TRIPLE, "&tprime;" ),
32
    entry( QUOTE_PRIME_QUADRUPLE, "&qprime;" )
33
  );
34
35
  /**
36
   * Provides a character-based set of {@link Token} replacements.
37
   */
38
  private static final Map<TokenType, String> CHARS = ofEntries(
39
    entry( QUOTE_OPENING_SINGLE, "‘" ),
40
    entry( QUOTE_CLOSING_SINGLE, "’" ),
41
    entry( QUOTE_OPENING_DOUBLE, "“" ),
42
    entry( QUOTE_CLOSING_DOUBLE, "”" ),
43
    entry( QUOTE_STRAIGHT_SINGLE, "'" ),
44
    entry( QUOTE_STRAIGHT_DOUBLE, "\"" ),
45
    entry( QUOTE_APOSTROPHE, "’" ),
46
    entry( QUOTE_PRIME_SINGLE, "′" ),
47
    entry( QUOTE_PRIME_DOUBLE, "″" ),
48
    entry( QUOTE_PRIME_TRIPLE, "‴" ),
49
    entry( QUOTE_PRIME_QUADRUPLE, "⁗" )
50
  );
51
52
  /**
53
   * Glyphs not found in the table will use the document's glyph.
54
   */
55
  private static final Map<LexemeGlyph, String> I18N_ENTITIES = ofEntries(
56
    entry( LEX_DOUBLE_QUOTE_OPENING_LOW, "&#8222;" ),
57
    entry( LEX_DOUBLE_CHEVRON_LEFT, "&laquo;" ),
58
    entry( LEX_DOUBLE_CHEVRON_RIGHT, "&raquo;" ),
59
    entry( LEX_SINGLE_CHEVRON_LEFT, "&lsaquo;" ),
60
    entry( LEX_SINGLE_CHEVRON_RIGHT, "&rsaquo;" )
61
  );
62
14 63
  /**
15 64
   * Denotes that the token does not represent a value in the parsed document.
16 65
   */
17 66
  public static final Token NONE = new Token( TokenType.NONE, Lexeme.NONE );
18 67
19 68
  private TokenType mTokenType;
20
  private final int mBegan;
21
  private final int mEnded;
69
  private final Lexeme mLexeme;
22 70
23 71
  /**
24 72
   * Convenience constructor to create a token that uses the lexeme's
25 73
   * beginning and ending offsets to represent a complete token.
26
   *
27
   * @param type   The type of {@link Token} to create.
28
   * @param lexeme Container for beginning and ending text offsets.
29
   */
30
  Token( final TokenType type, final Lexeme lexeme ) {
31
    this( type, lexeme.began(), lexeme.ended() );
32
  }
33
34
  /**
35
   * This constructor can be used to create tokens that span more than a
36
   * single character. Almost all tokens represent a single character, only
37
   * the double-prime sequence ({@code ''}) is more than one character.
38 74
   *
39 75
   * @param tokenType The type of {@link Token} to create.
40
   * @param began     Beginning offset into text where token is found.
41
   * @param ended     Ending offset into text where token is found.
76
   * @param lexeme    Container for text offsets and i18n glyphs.
42 77
   */
43
  Token( final TokenType tokenType, final int began, final int ended ) {
78
  Token( final TokenType tokenType, final Lexeme lexeme ) {
44 79
    assert tokenType != null;
45
    assert began >= 0;
46
    assert ended >= began;
80
    assert lexeme.began() >= 0;
81
    assert lexeme.ended() >= lexeme.began();
47 82
48 83
    mTokenType = tokenType;
49
    mBegan = began;
50
    mEnded = ended;
84
    mLexeme = lexeme;
51 85
  }
52 86
...
63 97
    assert token != NONE;
64 98
65
    return mEnded <= token.mBegan;
99
    return mLexeme.ended() <= token.began();
66 100
  }
67 101
...
79 113
    assert token != NONE;
80 114
81
    return mBegan > token.mEnded;
115
    return mLexeme.began() > token.ended();
82 116
  }
83 117
...
93 127
94 128
  int began() {
95
    return mBegan;
129
    return mLexeme.began();
96 130
  }
97 131
98 132
  int ended() {
99
    return mEnded;
133
    return mLexeme.ended();
100 134
  }
101 135
...
125 159
  @Override
126 160
  public int compareTo( final Token that ) {
127
    return this.mBegan - that.mBegan;
161
    return this.began() - that.began();
128 162
  }
129 163
...
137 171
  }
138 172
139
  public String toString( final Map<TokenType, String> entities ) {
140
    return entities.get( getType() );
173
  /**
174
   * Converts this token to its string representation, which will either be
175
   * an HTML entity or a character.
176
   *
177
   * @param entities {@code true} to convert quotation marks to HTML entities.
178
   * @return A plain quotation mark character or an HTML entity.
179
   */
180
  public String toString( final boolean entities ) {
181
    final var glyph = mLexeme.getType().glyph();
182
183
    return entities
184
      ? I18N_ENTITIES.getOrDefault( glyph, ENTITIES.get( getType() ) )
185
      : CHARS.getOrDefault( getType(), glyph.text() );
141 186
  }
142 187
143 188
  @Override
144 189
  public String toString() {
145 190
    return getClass().getSimpleName() + '[' +
146
      "mType=" + mTokenType +
147
      ", mBegan=" + mBegan +
148
      ", mEnded=" + mEnded +
191
      "mType=" + getType() +
192
      ", mBegan=" + began() +
193
      ", mEnded=" + ended() +
149 194
      ']';
150 195
  }
M src/test/java/com/whitemagicsoftware/keenquotes/lex/LexerTest.java
68 68
  @Test
69 69
  void test_Lexing_Quotes_EmitQuotes() {
70
    testType( "'", QUOTE_SINGLE );
71
    testType( "\"", QUOTE_DOUBLE );
72 70
    testType( "‘", QUOTE_SINGLE_OPENING );
71
    testType( "‹", QUOTE_SINGLE_OPENING );
73 72
    testType( "’", QUOTE_SINGLE_CLOSING );
73
    testType( "›", QUOTE_SINGLE_CLOSING );
74
    testType( "'", QUOTE_SINGLE );
75
74 76
    testType( "“", QUOTE_DOUBLE_OPENING );
77
    testType( "„", QUOTE_DOUBLE_OPENING );
78
    testType( "«", QUOTE_DOUBLE_OPENING );
79
    testType( ",,", QUOTE_DOUBLE_OPENING );
80
    testType( "\"", QUOTE_DOUBLE );
81
75 82
    testType( "”", QUOTE_DOUBLE_CLOSING );
83
    testType( "»", QUOTE_DOUBLE_CLOSING );
84
76 85
    testType( "3 o'clock", NUMBER, SPACE, WORD, QUOTE_SINGLE, WORD );
77 86
  }
M src/test/java/com/whitemagicsoftware/keenquotes/parser/AmbiguityResolverTest.java
8 8
import java.util.concurrent.atomic.AtomicInteger;
9 9
10
import static com.whitemagicsoftware.keenquotes.parser.Curler.ENTITIES;
11
import static com.whitemagicsoftware.keenquotes.parser.Curler.replace;
10
import static com.whitemagicsoftware.keenquotes.parser.Curler.*;
12 11
import static com.whitemagicsoftware.keenquotes.texts.TestResource.readPairs;
13 12
import static org.junit.jupiter.api.Assertions.assertEquals;
...
26 25
  }
27 26
28
  @Test
29 27
  @Disabled
28
  @SuppressWarnings( "unused" )
30 29
  void test_Resolve_InvalidGrammar_AmbiguousRemain() throws IOException {
31 30
    test( "invalid-grammar.txt" );
...
44 43
        input,
45 44
        CONTRACTIONS,
46
        replace( output, offset, ENTITIES ),
45
        replace( output, offset, true ),
47 46
        filter -> false
48 47
      );
M src/test/java/com/whitemagicsoftware/keenquotes/parser/CurlerTest.java
29 29
  @Test
30 30
  public void test_Parse_UncurledQuotes1_CurlyQuotes() throws IOException {
31
    testCurler( createCurler( FILTER_PLAIN ), "unambiguous-1-pass.txt" );
31
    testCurler( createCurler( FILTER_PLAIN, true ), "unambiguous-1-pass.txt" );
32 32
  }
33 33
34 34
  @Test
35 35
  public void test_Parse_UncurledQuotes2_CurlyQuotes() throws IOException {
36
    testCurler( createCurler( FILTER_PLAIN ), "unambiguous-2-pass.txt" );
36
    testCurler( createCurler( FILTER_PLAIN, true ), "unambiguous-2-pass.txt" );
37 37
  }
38 38
39
  @Test
39
  @Disabled
40
  @SuppressWarnings( {"unused", "JUnit3StyleTestMethodInJUnit4Class"} )
40 41
  public void test_Parse_AmbiguousQuotes_PartiallyCurled() throws IOException {
41
    testCurler( createCurler( FILTER_PLAIN ), "ambiguous-n-pass.txt" );
42
    testCurler( createCurler( FILTER_PLAIN, false ), "ambiguous-n-pass.txt" );
42 43
  }
43 44
44 45
  @Test
45 46
  public void test_Parse_UncurledQuotesXml_CurlyQuotes() throws IOException {
46
    testCurler( createCurler( FILTER_XML ), "xml.txt" );
47
    testCurler( createCurler( FILTER_XML, true ), "xml.txt" );
48
  }
49
50
  @Test
51
  public void test_Parse_UncurledQuotesI11l_CurlyQuotes() throws IOException {
52
    testCurler( createCurler( FILTER_PLAIN, true ), "i18n.txt" );
47 53
  }
48 54
...
68 74
    }
69 75
70
    final var curler = createCurler( FILTER_XML );
76
    final var curler = createCurler( FILTER_XML, true );
71 77
    System.out.println( curler.apply( sb.toString() ) );
72 78
  }
...
95 101
  }
96 102
97
  private Function<String, String> createCurler( final FilterType parserType ) {
98
    return new Curler( new Contractions.Builder().build(), parserType );
103
  private Function<String, String> createCurler(
104
    final FilterType filterType,
105
    final boolean entities ) {
106
    return new Curler( createContractions(), filterType, entities );
107
  }
108
109
  private Contractions createContractions() {
110
    return new Contractions.Builder().build();
99 111
  }
100 112
}
M src/test/java/com/whitemagicsoftware/keenquotes/parser/QuoteEmitterTest.java
7 7
import java.util.concurrent.atomic.AtomicInteger;
8 8
9
import static com.whitemagicsoftware.keenquotes.parser.Curler.ENTITIES;
10 9
import static com.whitemagicsoftware.keenquotes.parser.Curler.replace;
11 10
import static com.whitemagicsoftware.keenquotes.texts.TestResource.readPairs;
12 11
import static org.junit.jupiter.api.Assertions.assertEquals;
13 12
14 13
class QuoteEmitterTest {
15 14
  private final Contractions CONTRACTIONS = new Contractions.Builder().build();
16 15
17 16
  @Test
18 17
  void test_Emit_MultipleInputs_QuotesEmitted() throws IOException {
19
    final var couplets = readPairs(
20
      "unambiguous-1-pass.txt" );
18
    final var couplets = readPairs( "unambiguous-1-pass.txt" );
21 19
22 20
    couplets.forEach( couplet -> {
...
29 27
        input,
30 28
        CONTRACTIONS,
31
        replace( output, offset, ENTITIES ),
29
        replace( output, offset, true ),
32 30
        filter -> false
33 31
      );
A src/test/resources/com/whitemagicsoftware/keenquotes/texts/i18n.txt
1
# ########################################################################
2
# French
3
# ########################################################################
4
«Ce n'est pas la mer à boire,» il me dit.
5
&laquo;Ce n&apos;est pas la mer à boire,&raquo; il me dit.
6
7
Dit il, ‹Ce n'est pas la mer à boire?›
8
Dit il, &lsaquo;Ce n&apos;est pas la mer à boire?&rsaquo;
9
10
# ########################################################################
11
# Dutch
12
# ########################################################################
13
,,Dit is een citaat," zei hij.
14
&#8222;Dit is een citaat,&rdquo; zei hij.
15
16
,,Ik heb twee opa's en twee oma's," zei het meisje,„hoeveel heb jij er?"
17
&#8222;Ik heb twee opa&apos;s en twee oma&apos;s,&rdquo; zei het meisje,&#8222;hoeveel heb jij er?&rdquo;
18
19
"Zeg, wat betekent 'quod non' eigenlijk?", vroeg Nynke.
20
&ldquo;Zeg, wat betekent &lsquo;quod non&rsquo; eigenlijk?&rdquo;, vroeg Nynke.
21
22
"Wat zijn 'zebra's'?", vroeg de jongen.
23
&ldquo;Wat zijn &lsquo;zebra&apos;s&rsquo;?&rdquo;, vroeg de jongen.
24
25
"'s Morgens gaat het regenen in 's-Gravenhage," zei de weervrouw.
26
&ldquo;&apos;s Morgens gaat het regenen in &apos;s-Gravenhage,&rdquo; zei de weervrouw.
27
28
De voorzitter zei: 'Gelukkig nieuwjaar!'
29
De voorzitter zei: &lsquo;Gelukkig nieuwjaar!&rsquo;
30
31
'Wat een gedoe, al die baby'tjes hun lolly'tjes geven,' zuchtte de oppas, 'en het is nog ongezond ook!'
32
&lsquo;Wat een gedoe, al die baby&apos;tjes hun lolly&apos;tjes geven,&rsquo; zuchtte de oppas, &lsquo;en het is nog ongezond ook!&rsquo;
33
34
"In '84," zei hij terwijl hij een hand door z'n haar haalde, "heb ik Alice's zus een kus gegeven op d'r wangen."
35
&ldquo;In &apos;84,&rdquo; zei hij terwijl hij een hand door z&apos;n haar haalde, &ldquo;heb ik Alice&apos;s zus een kus gegeven op d&apos;r wangen.&rdquo;
1 36
M src/test/resources/com/whitemagicsoftware/keenquotes/texts/invalid-grammar.txt
1
# """wtf"""
2
# &ldquo;&ldquo;&ldquo;wtf&rdquo;&rdquo;&rdquo;
1
"""insane"""
2
&ldquo;&rdquo;&ldquo;insane&rdquo;&ldquo;&rdquo;
3 3
4
# '''wtf'''
5
# &lsquo;&lsquo;&lsquo;wtf&rsquo;&rsquo;&rsquo;
4
'''wtf'''
5
&lsquo;&lsquo;&lsquo;wtf&rsquo;&rsquo;&rsquo;
6 6
M src/test/resources/com/whitemagicsoftware/keenquotes/texts/unambiguous-1-pass.txt
180 180
181 181
"’Kearney lives on the banks of Killarney—’
182
&ldquo;’Kearney lives on the banks of Killarney—’
182
&ldquo;’Kearney lives on the banks of Killarney—&rsquo;
183 183
184 184
# ########################################################################