Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Add bobbin as ambiguous contraction

Author: Dave Jarvis <email>
Date: 2022-08-10 22:34:48 GMT-0700
Commit: 93c7eff6a01e83ec1528a20a140987e233968afc
Parent: 32765e7
src/main/java/com/whitemagicsoftware/keenquotes/Contractions.java
// are|regarding
"re",
+ // circular|around
+ "round",
// what's up|to eat
"sup",
private static final Set<String> ENDED_AMBIGUOUS = Set.of(
+ // pin|head dunk
+ "bobbin",
// give|martial arts garment
"gi",
// in|I
"i",
// of|letter o
- "o"
+ "o",
);
src/main/java/com/whitemagicsoftware/keenquotes/Lexer.java
* @return The next token in the sequence.
*/
- Lexeme parse( final CharacterIterator i ) {
+ private Lexeme parse( final CharacterIterator i ) {
int began = i.getIndex();
boolean isWord = false;
isWord = true;
- final var next = peek(i);
+ final var next = peek( i );
if( !isLetter( next ) && !isDigit( next ) ) {
}
+ /**
+ * @param i The {@link CharacterIterator} used to scan through the text, one
+ * character at a time.
+ * @return {@code true} if any characters were skipped.
+ */
boolean skip( final CharacterIterator i ) {
return false;
return Character.isDigit( curr ) ||
"¼½¾⅐⅑⅒⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞".indexOf( curr ) > -1;
+ }
+
+ /**
+ * Answers whether the given character may be part of an en- or em-dash.
+ * This must be called after it is known that the character isn't a lone
+ * hyphen.
+ *
+ * @param curr The character to check as being a dash.
+ * @return {@code true} if the given character is part of a dash.
+ */
+ private static boolean isDash( final char curr ) {
+ return curr == '-' || curr == '–' || curr == '—' || curr == '―';
}
curr == '.' || curr == ',' || curr == '-' || curr == '+' ||
curr == '^' || curr == '⅟' || curr == '⁄';
- }
-
- /**
- * Answers whether the given character may be part of an en- or em-dash.
- * This must be called after it is known that the character isn't a lone
- * hyphen.
- *
- * @param curr The character to check as being a dash.
- * @return {@code true} if the given character is part of a dash.
- */
- private boolean isDash( final char curr ) {
- return curr == '-' || curr == '–' || curr == '—' || curr == '―';
}
- static char peek( final CharacterIterator ci ) {
+ private static char peek( final CharacterIterator ci ) {
final var ch = ci.next();
ci.previous();
* @return The number of characters parsed.
*/
- static int slurp(
+ protected static int slurp(
final CharacterIterator ci,
final BiFunction<Character, CharacterIterator, Boolean> f ) {
while( f.apply( next, ci ) );
- // The loop above will overshoot the tally by one character.
+ // The loop will have overshot the tally by one character.
ci.previous();
src/main/java/com/whitemagicsoftware/keenquotes/XmlLexer.java
* Skip (do not emit) XML tags found within the prose. This effectively hides
* the element.
- *
- * @param i The {@link CharacterIterator} used to scan through the text, one
- * character at a time.
*/
@Override
src/test/resources/com/whitemagicsoftware/keenquotes/smartypants.txt
Sam&apos;s Sams&apos; and the Ross&apos;s roses&apos; thorns were prickly.
+"You 'cause---" all fifteen years' worth.
+&ldquo;You &apos;cause---&rdquo; all fifteen years&apos; worth.
+
# ########################################################################
# Inside contractions (no leading/trailing apostrophes)
Delta: 30 lines added, 21 lines removed, 9-line increase