Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git
M src/main/java/com/whitemagicsoftware/keenquotes/Contractions.java
4 4
import java.util.ArrayList;
5 5
import java.util.HashSet;
6
import java.util.List;
6 7
import java.util.Set;
7 8
...
32 33
    private final Set<String> mEndedAmbiguous = new HashSet<>();
33 34
34
    public void withBeganUnambiguous( final Set<String> words ) {
35
    public void withBeganUnambiguous( final List<String> words ) {
35 36
      mBeganUnambiguous.addAll( words );
36 37
    }
37 38
38
    public void withEndedUnambiguous( final Set<String> words ) {
39
    public void withEndedUnambiguous( final List<String> words ) {
39 40
      mEndedUnambiguous.addAll( words );
40 41
    }
41 42
42
    public void withBeganAmbiguous( final Set<String> words ) {
43
    public void withBeganAmbiguous( final List<String> words ) {
43 44
      mBeganAmbiguous.addAll( words );
44 45
    }
45 46
46
    public void withEndedAmbiguous( final Set<String> words ) {
47
    public void withEndedAmbiguous( final List<String> words ) {
47 48
      mEndedAmbiguous.addAll( words );
48 49
    }
...
55 56
     */
56 57
    public Contractions build() {
57
      mBeganUnambiguous.addAll( from( mBeganUnambiguous, BEGAN_UNAMBIGUOUS ) );
58
      mEndedUnambiguous.addAll( from( mEndedUnambiguous, ENDED_UNAMBIGUOUS ) );
59
      mBeganAmbiguous.addAll( from( mBeganAmbiguous, BEGAN_AMBIGUOUS ) );
60
      mEndedAmbiguous.addAll( from( mEndedAmbiguous, ENDED_AMBIGUOUS ) );
58
      mBeganUnambiguous.addAll( BEGAN_UNAMBIGUOUS );
59
      mEndedUnambiguous.addAll( ENDED_UNAMBIGUOUS );
60
      mBeganAmbiguous.addAll( BEGAN_AMBIGUOUS );
61
      mEndedAmbiguous.addAll( ENDED_AMBIGUOUS );
61 62
62 63
      return new Contractions( this );
...
180 181
    "neath",
181 182
    "nother",
183
    "nuff",
182 184
    "onna",
183 185
    "onna'",
M src/main/java/com/whitemagicsoftware/keenquotes/Converter.java
4 4
import java.util.Map;
5 5
import java.util.function.Consumer;
6
import java.util.function.Function;
6 7
7 8
import static com.whitemagicsoftware.keenquotes.TokenType.*;
8 9
import static java.util.Collections.sort;
9 10
10 11
/**
11 12
 * Responsible for converting straight quotes to curly quote HTML entities throughout a
12 13
 * text string.
13 14
 */
14
public class Converter {
15
public class Converter implements Function<String, String> {
15 16
  private static final Map<TokenType, String> REPLACEMENTS = Map.of(
16 17
    QUOTE_OPENING_SINGLE, "&lsquo;",
...
24 25
    QUOTE_PRIME_DOUBLE, "&Prime;"
25 26
  );
27
28
  private final Consumer<Lexeme> mUnresolved;
29
  private final Contractions mContractions;
30
31
  public Converter( final Consumer<Lexeme> unresolved ) {
32
    this( unresolved, new Contractions.Builder().build() );
33
  }
34
35
  public Converter( final Consumer<Lexeme> unresolved, final Contractions c ) {
36
    mUnresolved = unresolved;
37
    mContractions = c;
38
  }
26 39
27 40
  /**
28 41
   * Converts straight quotes to curly quotes and primes. Any quotation marks
29
   * that cannot be converted are passed to the {@link Consumer}.
42
   * that cannot be converted are passed to the {@link Consumer}. This method
43
   * is re-entrant, but not tested to be thread-safe.
30 44
   *
31
   * @param text       The text to parse.
32
   * @param unresolved Recipient for ambiguous {@link Lexeme}s.
45
   * @param text The text to parse.
33 46
   * @return The given text string with as many straight quotes converted to
34 47
   * curly quotes as is feasible.
35 48
   */
36
  public static String convert(
37
    final String text, final Consumer<Lexeme> unresolved ) {
38
    final var parser = new Parser( text );
49
  @Override
50
  public String apply( final String text ) {
51
    final var parser = new Parser( text, mContractions );
39 52
    final var tokens = new ArrayList<Token>();
40 53
41 54
    // Parse the tokens and consume all unresolved lexemes.
42
    parser.parse( tokens::add, unresolved );
55
    parser.parse( tokens::add, mUnresolved );
43 56
44 57
    // The parser may emit tokens in any order.
M src/main/java/com/whitemagicsoftware/keenquotes/KeenQuotes.java
34 34
35 35
  public void run() {
36
    if( getSettings().displayList() ) {
37
      displayList();
36
    final var settings = getSettings();
37
    final var contractions = createContractions( settings );
38
39
    if( settings.displayList() ) {
40
      System.out.println( contractions.toString() );
38 41
    }
39 42
    else {
40
      convert();
43
      convert( new Converter( System.err::println, contractions ) );
41 44
    }
42 45
  }
43 46
44
  private void displayList() {
45
    System.out.println( new Contractions.Builder().build().toString() );
47
  private Contractions createContractions( final Settings settings ) {
48
    final var builder = new Contractions.Builder();
49
50
    builder.withBeganUnambiguous( settings.getBeganUnambiguous() );
51
    builder.withEndedUnambiguous( settings.getEndedUnambiguous() );
52
    builder.withBeganAmbiguous( settings.getBeganAmbiguous() );
53
    builder.withEndedAmbiguous( settings.getEndedAmbiguous() );
54
55
    return builder.build();
46 56
  }
47 57
48
  private void convert() {
49
    final StringBuilder sb = new StringBuilder();
58
  private void convert( final Converter converter ) {
59
    final var sb = new StringBuilder();
50 60
51
    try( final BufferedReader reader = open( System.in ) ) {
61
    try( final var reader = open( System.in ) ) {
52 62
      String line;
53 63
      final var sep = System.lineSeparator();
54 64
55 65
      while( (line = reader.readLine()) != null ) {
56 66
        sb.append( line );
57 67
        sb.append( sep );
58 68
      }
59 69
60
      System.out.println(
61
        Converter.convert( sb.toString(), System.err::println )
62
      );
70
      System.out.println( converter.apply( sb.toString() ) );
63 71
    } catch( final Exception ex ) {
64 72
      ex.printStackTrace( System.err );
...
112 120
  }
113 121
122
  /**
123
   * Main application entry point.
124
   *
125
   * @param args Command-line arguments.
126
   */
114 127
  public static void main( final String[] args ) {
115 128
    final var app = new KeenQuotes();
M src/main/java/com/whitemagicsoftware/keenquotes/Lexer.java
65 65
        lexeme = createLexeme( QUOTE_DOUBLE, began, i.getIndex() );
66 66
      }
67
      else if( curr == '‘') {
67
      else if( curr == '‘' ) {
68 68
        lexeme = createLexeme( QUOTE_SINGLE_OPENING, began, i.getIndex() );
69 69
      }
70
      else if( curr == '’') {
70
      else if( curr == '’' ) {
71 71
        lexeme = createLexeme( QUOTE_SINGLE_CLOSING, began, i.getIndex() );
72
      }
73
      else if( curr == '-' && peek( i ) == '-' || curr == '—' ) {
74
        slurp( i, ( next, ci ) -> next == '-' || next == '—' );
75
76
        lexeme = createLexeme( DASH, began, i.getIndex() );
77 72
      }
78 73
      else if( isDigit( curr ) || isNumeric( curr ) && isDigit( peek( i ) ) ) {
...
85 80
        lexeme = createLexeme( isWord ? WORD : NUMBER, began, i.getIndex() );
86 81
      }
87
      else if( curr == '-' ) {
82
      else if( curr == '-' && peek( i ) != '-' ) {
88 83
        lexeme = createLexeme( HYPHEN, began, i.getIndex() );
84
      }
85
      else if( isDash( curr ) ) {
86
        slurp( i, ( next, ci ) -> isDash( next ) );
87
88
        lexeme = createLexeme( DASH, began, i.getIndex() );
89 89
      }
90 90
      else if( curr == '.' ) {
91
        // Parse all consecutive periods into an ellipsis lexeme. This will
92
        // not capture space-separated ellipsis (such as ". . .").
93 91
        lexeme = createLexeme(
94
          slurp( i, ( next, ci ) -> next == '.' ) == 0 ? PERIOD : ELLIPSIS,
92
          slurp( i, ( next, ci ) ->
93
            next == '.' || (next == ' ' && peek( ci ) == '.') ) == 0
94
            ? PERIOD
95
            : ELLIPSIS,
95 96
          began, i.getIndex()
96 97
        );
...
177 178
    return
178 179
      curr == '.' || curr == ',' || curr == '-' || curr == '+' || curr == '^';
180
  }
181
182
  /**
183
   * Answers whether the given character may be part of an en- or em-dash.
184
   * This must be called after it is known that the character isn't a lone
185
   * hyphen.
186
   *
187
   * @param curr The character to check as being a dash.
188
   * @return {@code true} if the given character is part of a dash.
189
   */
190
  private boolean isDash( final char curr ) {
191
    return curr == '-' || curr == '–' || curr == '—';
179 192
  }
180 193
M src/main/java/com/whitemagicsoftware/keenquotes/Parser.java
44 44
   */
45 45
  private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE =
46
    new LexemeType[]{SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL, EOP};
46
    new LexemeType[]{SPACE, DASH, EQUALS, QUOTE_SINGLE, OPENING_GROUP, EOL,
47
      EOP};
47 48
48 49
  /**
...
96 97
   */
97 98
  private int mClosingSingleQuote;
98
99
  /**
100
   * Constructs a new {@link Parser} using the default contraction sets
101
   * to help resolve some ambiguous scenarios.
102
   *
103
   * @param text The prose to parse, containing zero or more quotation
104
   *             characters.
105
   */
106
  public Parser( final String text ) {
107
    this( text, new Contractions.Builder().build() );
108
  }
109 99
110 100
  /**
M src/main/java/com/whitemagicsoftware/keenquotes/Settings.java
4 4
import picocli.CommandLine;
5 5
6
import java.util.List;
6 7
import java.util.concurrent.Callable;
8
9
import static java.util.Arrays.asList;
10
import static java.util.Collections.emptyList;
7 11
8 12
@CommandLine.Command(
...
18 22
  private final KeenQuotes mMain;
19 23
20
//  /**
21
//   * List of unambiguous contractions having leading apostrophes.
22
//   */
23
//  @CommandLine.Option(
24
//    names = {"-ub", "--unamb-began"},
25
//    description =
26
//      "Contractions to treat as unambiguous (e.g., cause,bout)",
27
//    paramLabel = "words"
28
//  )
29
//  private String[] mUnambiguousBegan;
30
//
31
//  /**
32
//   * List of unambiguous contractions having lagging apostrophes.
33
//   */
34
//  @CommandLine.Option(
35
//    names = {"-ue", "--unamb-ended"},
36
//    description =
37
//      "Contractions to treat as unambiguous (e.g., frien,thinkin)",
38
//    paramLabel = "words"
39
//  )
40
//  private String[] mUnambiguousEnded;
41
//
42
//  /**
43
//   * List of ambiguous contractions having leading apostrophes.
44
//   */
45
//  @CommandLine.Option(
46
//    names = {"-ab", "--amb-began"},
47
//    description =
48
//      "Contractions to treat as ambiguous (e.g., sup,kay)",
49
//    paramLabel = "words"
50
//  )
51
//  private String[] mAmbiguousBegan;
52
//
53
//  /**
54
//   * List of ambiguous contractions having lagging apostrophes.
55
//   */
56
//  @CommandLine.Option(
57
//    names = {"-ae", "--amb-ended"},
58
//    description =
59
//      "Contractions to treat as ambiguous (e.g., gi,o)",
60
//    paramLabel = "words"
61
//  )
62
//  private String[] mAmbiguousEnded;
63
//
24
  /**
25
   * List of unambiguous contractions having leading apostrophes.
26
   */
27
  @CommandLine.Option(
28
    names = {"-ub", "--unamb-began"},
29
    description =
30
      "Contractions to treat as unambiguous (e.g., cause,bout)",
31
    paramLabel = "words"
32
  )
33
  private String[] mBeganUnambiguous;
34
35
  /**
36
   * List of unambiguous contractions having lagging apostrophes.
37
   */
38
  @CommandLine.Option(
39
    names = {"-ue", "--unamb-ended"},
40
    description =
41
      "Contractions to treat as unambiguous (e.g., frien,thinkin)",
42
    paramLabel = "words"
43
  )
44
  private String[] mEndedUnambiguous;
45
46
  /**
47
   * List of ambiguous contractions having leading apostrophes.
48
   */
49
  @CommandLine.Option(
50
    names = {"-ab", "--amb-began"},
51
    description =
52
      "Contractions to treat as ambiguous (e.g., sup,kay)",
53
    paramLabel = "words"
54
  )
55
  private String[] mBeganAmbiguous;
56
57
  /**
58
   * List of ambiguous contractions having lagging apostrophes.
59
   */
60
  @CommandLine.Option(
61
    names = {"-ae", "--amb-ended"},
62
    description =
63
      "Contractions to treat as ambiguous (e.g., gi,o)",
64
    paramLabel = "words"
65
  )
66
  private String[] mEndedAmbiguous;
67
64 68
  /**
65 69
   * Display default values.
...
81 85
   * @return {@code true} to list the contractions.
82 86
   */
83
  public boolean displayList() {
87
  boolean displayList() {
84 88
    return mDisplayList;
89
  }
90
91
  List<String> getBeganUnambiguous() {
92
    return nullSafe( mBeganUnambiguous );
93
  }
94
95
  List<String> getEndedUnambiguous() {
96
    return nullSafe( mEndedUnambiguous );
97
  }
98
99
  List<String> getBeganAmbiguous() {
100
    return nullSafe( mBeganAmbiguous );
101
  }
102
103
  List<String> getEndedAmbiguous() {
104
    return nullSafe( mEndedAmbiguous );
105
  }
106
107
  private List<String> nullSafe( final String[] words ) {
108
    return words == null ? emptyList() : asList( words );
85 109
  }
86 110
M src/test/java/com/whitemagicsoftware/keenquotes/KeenQuotesTest.java
12 12
import java.util.function.Function;
13 13
14
import static com.whitemagicsoftware.keenquotes.Converter.convert;
15 14
import static java.lang.System.out;
16 15
import static org.junit.jupiter.api.Assertions.assertEquals;
...
28 27
  @Disabled
29 28
  public void test_parse_SingleLine_Parsed() {
30
    out.println( convert(
31
      "\"’Kearney lives on the banks of Killarney—’",
32
      out::println
29
    final var converter = new Converter( out::println );
30
    out.println( converter.apply(
31
      "\"’Kearney lives on the banks of Killarney—’"
33 32
    ) );
34 33
  }
...
41 40
  @Test
42 41
  public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException {
43
    testConverter( text -> convert( text, ( lexeme ) -> {} ) );
42
    testConverter( new Converter( ( lex ) -> {} ) );
44 43
  }
45 44
...
65 64
    }
66 65
67
    System.out.println( convert( sb.toString(), out::println ) );
66
    final var converter = new Converter( out::println );
67
    System.out.println( converter.apply( sb.toString() ) );
68 68
  }
69 69
M src/test/java/com/whitemagicsoftware/keenquotes/LexerTest.java
50 50
    testType( "--", DASH );
51 51
    testType( "---", DASH );
52
    testType( "–", DASH );
53
    testType( "—", DASH );
54
    testType( "—-—", DASH );
52 55
    testType( "...", ELLIPSIS );
56
    testType( ". .", ELLIPSIS );
57
    testType( ". . .", ELLIPSIS );
58
    testType( ".. ... ....", ELLIPSIS );
53 59
  }
54 60
M src/test/java/com/whitemagicsoftware/keenquotes/ParserTest.java
14 14
 */
15 15
class ParserTest {
16
  @SuppressWarnings( "TextBlockMigration" )
16 17
  private final static Map<String, Map<TokenType, Integer>> TEST_CASES =
17 18
    Map.of(
...
26 27
      "'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx.",
27 28
      Map.of( QUOTE_APOSTROPHE, 5 ),
28
      """
29
             But I must leave the proofs to those who 've seen 'em;
30
             But this I heard her say, and can't be wrong
31
             And all may think which way their judgments lean 'em,
32
             ''T is strange---the Hebrew noun which means "I am,"
33
             The English always use to govern d--n.'
34
        """,
29
      "But I must leave the proofs to those who 've seen 'em;\n" +
30
        "But this I heard her say, and can't be wrong\n" +
31
        "And all may think which way their judgments lean 'em,\n" +
32
        "''T is strange---the Hebrew noun which means \"I am,\"\n" +
33
        "The English always use to govern d--n.'",
35 34
      Map.of( QUOTE_APOSTROPHE, 5 )
36 35
    );
...
44 43
45 44
  private void parse( final String text, final Map<TokenType, Integer> tally ) {
46
    final var parser = new Parser( text );
45
    final var contractions = new Contractions.Builder().build();
46
    final var parser = new Parser( text, contractions );
47 47
    final var actual = new HashMap<TokenType, Integer>();
48 48