Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Update test strings, resolve contraction numerals, resolve limited ambiguities

Author: Dave Jarvis <email>
Date: 2021-06-06 17:19:09 GMT-0700
Commit: a74d89eb350c38254091b2c575b99cb7f3c37e5e
Parent: 5749c98
lib/src/main/java/com/keenwrite/quotes/Contractions.java
/**
* Placeholder for various types of contractions.
- * <p>
- * TODO: Read from an external resource file.
- * </p>
*/
public class Contractions {
// about|boxing match
"bout",
+ // because|causal
+ "cause",
// what you|choo choo train
"choo",
// afro|to and fro
"fro",
- // whore|stop
+ // whore|ho ho!
"ho",
// okay|letter K
public static boolean contractionBeganUnambiguously( final String word ) {
assert word != null;
+
return BEGAN_UNAMBIGUOUS.contains( word.toLowerCase() );
}
public static boolean contractionBeganAmbiguously( final String word ) {
assert word != null;
+
return BEGAN_AMBIGUOUS.contains( word.toLowerCase() );
}
public static boolean contractionEndedAmbiguously( final String word ) {
- return ENDED_AMBIGUOUS.contains( word ) || word.endsWith( "s" ) ||
- word.endsWith( "n" ) || word.endsWith( "z" ) ||
- word.endsWith( "x" ) || word.endsWith( "ch" );
+ assert word != null;
+
+ final var check = word.toLowerCase();
+
+ return ENDED_AMBIGUOUS.contains( check ) || check.endsWith( "s" ) ||
+ check.endsWith( "n" ) || check.endsWith( "z" ) ||
+ check.endsWith( "x" ) || check.endsWith( "ch" );
}
public static boolean contractionEndedUnambiguously( final String word ) {
- return ENDED_UNAMBIGUOUS.contains( word );
+ assert word != null;
+
+ return ENDED_UNAMBIGUOUS.contains( word.toLowerCase() );
}
}
lib/src/main/java/com/keenwrite/quotes/Parser.java
* Single quotes preceded by these {@link LexemeType}s may be opening quotes.
*/
- private static final LexemeType[] LEADING_OPENING_QUOTE_SINGLE =
+ private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE =
new LexemeType[]{SPACE, HYPHEN, QUOTE_DOUBLE};
/**
* Single quotes succeeded by these {@link LexemeType}s may be opening quotes.
*/
- private static final LexemeType[] TRAILING_OPENING_QUOTE_SINGLE =
- new LexemeType[]{WORD, NUMBER, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE};
+ private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE =
+ new LexemeType[]{WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE};
/**
* Single quotes preceded by these {@link LexemeType}s may be closing quotes.
*/
- private static final LexemeType[] LEADING_CLOSING_QUOTE_SINGLE =
+ private static final LexemeType[] LEADING_QUOTE_CLOSING_SINGLE =
new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_DOUBLE};
/**
* Single quotes succeeded by these {@link LexemeType}s may be closing quotes.
*/
- private static final LexemeType[] TRAILING_CLOSING_QUOTE_SINGLE =
- new LexemeType[]{SPACE, HYPHEN, EOL};
+ private static final LexemeType[] LAGGING_QUOTE_CLOSING_SINGLE =
+ new LexemeType[]{SPACE, HYPHEN, QUOTE_DOUBLE, EOL};
/**
* Double quotes preceded by these {@link LexemeType}s may be opening quotes.
*/
- private static final LexemeType[] LEADING_OPENING_QUOTE_DOUBLE =
+ private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE =
new LexemeType[]{SPACE, HYPHEN, QUOTE_SINGLE};
/**
* Double quotes succeeded by these {@link LexemeType}s may be opening quotes.
*/
- private static final LexemeType[] TRAILING_OPENING_QUOTE_DOUBLE =
+ private static final LexemeType[] LAGGING_QUOTE_OPENING_DOUBLE =
new LexemeType[]{WORD, NUMBER, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE};
/**
* Double quotes preceded by these {@link LexemeType}s may be closing quotes.
*/
- private static final LexemeType[] LEADING_CLOSING_QUOTE_DOUBLE =
+ private static final LexemeType[] LEADING_QUOTE_CLOSING_DOUBLE =
new LexemeType[]{WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_SINGLE};
/**
* Double quotes succeeded by these {@link LexemeType}s may be closing quotes.
*/
- private static final LexemeType[] TRAILING_CLOSING_QUOTE_DOUBLE =
+ private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE =
new LexemeType[]{SPACE, HYPHEN, QUOTE_SINGLE, EOL};
// Some non-emitted tokenized lexemes may be ambiguous.
- final var ambiguousOpeningQuotes = new ArrayList<Lexeme>( 16 );
- final var ambiguousClosingQuotes = new ArrayList<Lexeme>( 16 );
+ final var ambiguousLeadingQuotes = new ArrayList<Lexeme>( 16 );
+ final var ambiguousLaggingQuotes = new ArrayList<Lexeme>( 16 );
+ var resolvedLeadingQuotes = 0;
+ var resolvedLaggingQuotes = 0;
// Count the number of ambiguous and non-ambiguous open single quotes.
if( contractionBeganAmbiguously( word3 ) ) {
- // The contraction is uncertain until closing quote is found that
- // balances this single quote.
- ambiguousOpeningQuotes.add( lex2 );
+ // E.g., ''Cause
+ if( lex1.isType( QUOTE_SINGLE ) ) {
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
+ i.remove();
+ }
+ else {
+ // The contraction is uncertain until a closing quote is found that
+ // balances this single quote.
+ ambiguousLeadingQuotes.add( lex2 );
+ }
}
else if( contractionBeganUnambiguously( word3 ) ) {
// The quote mark forms a word that does not stand alone from its
// contraction. For example, twas is not a word: it's 'twas.
consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
i.remove();
}
else if( contractionEndedAmbiguously( word1 ) ) {
- ambiguousClosingQuotes.add( lex2 );
+ ambiguousLaggingQuotes.add( lex2 );
}
else if( contractionEndedUnambiguously( word1 ) ) {
consumer.accept( new Token( QUOTE_APOSTROPHE, lex2 ) );
i.remove();
}
- else if( (lex1.isSot() || lex1.anyType( LEADING_OPENING_QUOTE_SINGLE ))
- && lex3.anyType( TRAILING_OPENING_QUOTE_SINGLE ) ) {
+ else if( (lex1.isSot() || lex1.anyType( LEADING_QUOTE_OPENING_SINGLE ))
+ && lex3.anyType( LAGGING_QUOTE_OPENING_SINGLE ) ) {
+ resolvedLeadingQuotes++;
consumer.accept( new Token( QUOTE_OPENING_SINGLE, lex2 ) );
+ i.remove();
}
- else if( lex1.anyType( LEADING_CLOSING_QUOTE_SINGLE ) &&
- (lex3.isEot() || lex3.anyType( TRAILING_CLOSING_QUOTE_SINGLE )) ) {
+ else if( lex1.anyType( LEADING_QUOTE_CLOSING_SINGLE ) &&
+ (lex3.isEot() || lex3.anyType( LAGGING_QUOTE_CLOSING_SINGLE )) ) {
+ resolvedLaggingQuotes++;
consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
+ i.remove();
+ }
+ else if( lex3.isType( NUMBER ) ) {
+ // E.g., '04
+ ambiguousLeadingQuotes.add( lex2 );
}
}
}
- if( ambiguousOpeningQuotes.size() == 0 ) {
- ambiguousClosingQuotes.forEach(
+ System.out.println( "ambig leading: " + ambiguousLeadingQuotes.size() );
+ System.out.println( "ambig lagging: " + ambiguousLaggingQuotes.size() );
+ System.out.println( "unambig leading: " + resolvedLeadingQuotes );
+ System.out.println( "unambig lagging: " + resolvedLaggingQuotes );
+
+ if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) {
+ if( ambiguousLeadingQuotes.size() == 0 && ambiguousLaggingQuotes.size() == 1 ) {
+ consumer.accept(
+ new Token( QUOTE_CLOSING_SINGLE, ambiguousLaggingQuotes.get( 0 ) )
+ );
+ }
+ }
+ else if( ambiguousLeadingQuotes.size() > 0 && ambiguousLaggingQuotes.size() == 0 ) {
+ // If there are no ambiguous lagging quotes then all ambiguous leading
+ // quotes must be contractions.
+ ambiguousLeadingQuotes.forEach(
lex -> consumer.accept( new Token( QUOTE_APOSTROPHE, lex ) )
);
}
- else {
- System.out.println( "ambig opening: " + ambiguousOpeningQuotes.size() );
- System.out.println( "ambig closing: " + ambiguousClosingQuotes.size() );
-
- // No opening quotes, then there are no closing quotes, which
- // means the remaining single quotes are all apostrophes.
- if( ambiguousOpeningQuotes == ambiguousClosingQuotes ) {
- System.out.println( "MATCHING OPENING/CLOSING SINGLE QUOTES" );
- }
+ else if( ambiguousLeadingQuotes.size() == 0 && ambiguousLaggingQuotes.size() > 0 ) {
+ // If there are no ambiguous leading quotes then all ambiguous lagging
+ // quotes must be contractions.
+ ambiguousLaggingQuotes.forEach(
+ lex -> consumer.accept( new Token( QUOTE_APOSTROPHE, lex ) )
+ );
+ }
+ else if( resolvedLeadingQuotes == 1 && ambiguousLaggingQuotes.size() == 1 ) {
+ consumer.accept(
+ new Token( QUOTE_CLOSING_SINGLE, ambiguousLaggingQuotes.get( 0 ) )
+ );
}
}
}
else if( lex1.isType( QUOTE_SINGLE ) && lex2.isType( WORD ) &&
- // E.g., 'twas
contractionBeganUnambiguously( lex2.toString( mText ) ) ) {
+ // E.g., 'twas
+ consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
+ }
+ else if( lex1.isType( QUOTE_SINGLE ) && lex2.isType( NUMBER ) &&
+ lex3.isType( WORD ) &&
+ lex3.toString( mText ).equalsIgnoreCase( "s" ) ) {
+ // E.g., '70s
+ // Sentences are re-written to avoid starting with numerals.
consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
}
}
else if( lex2.isType( QUOTE_DOUBLE ) &&
- (lex1.isSot() || lex1.anyType( LEADING_OPENING_QUOTE_DOUBLE )) &&
- lex3.anyType( TRAILING_OPENING_QUOTE_DOUBLE ) ) {
+ (lex1.isSot() || lex1.anyType( LEADING_QUOTE_OPENING_DOUBLE )) &&
+ lex3.anyType( LAGGING_QUOTE_OPENING_DOUBLE ) ) {
// Examples: "'Twas, "", "...
consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) );
}
else if( lex2.isType( QUOTE_DOUBLE ) &&
- lex1.anyType( LEADING_CLOSING_QUOTE_DOUBLE ) &&
- (lex3.isEot() || lex3.anyType( TRAILING_CLOSING_QUOTE_DOUBLE )) ) {
+ lex1.anyType( LEADING_QUOTE_CLOSING_DOUBLE ) &&
+ (lex3.isEot() || lex3.anyType( LAGGING_QUOTE_CLOSING_DOUBLE )) ) {
consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) );
}
lib/src/main/java/com/keenwrite/quotes/SmartQuotes.java
);
- public String replace( final String text ) {
+ public static String replace( final String text ) {
final var parser = new Parser( text );
final var tokens = new ArrayList<Token>();
for( final var token : tokens ) {
- assert position <= token.began();
-
- result.append( text, position, token.began() );
- result.append( REPLACEMENTS.get( token.getType() ) );
+ if( position <= token.began() ) {
+ result.append( text, position, token.began() );
+ result.append( REPLACEMENTS.get( token.getType() ) );
+ }
position = token.ended();
lib/src/test/java/com/keenwrite/quotes/SmartQuotesTest.java
@Test
public void test_parse_SingleLine_Parsed() {
- final var fixer = new SmartQuotes();
- out.println( fixer.replace( "\"Didn' get th' message.\"" ) );
+ out.println( SmartQuotes.replace( "Took place in '04, yes'm!"));
out.println( "-------" );
-
- out.println( fixer.replace( "'Bout that time I says, 'Boys! I been " +
+ out.println( SmartQuotes.replace( "'Bout that time I says, 'Boys! I been " +
"thinkin' 'bout th' Universe.'"));
out.println( "-------" );
- out.println( fixer.replace( "\"John asked, 'What are you, beyond " +
+ out.println( SmartQuotes.replace( "\"John asked, 'What are you, beyond " +
"\"somethin' shiny?\"'\" said Fred.\n" ) );
}
@Test
public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException {
- final var fixer = new SmartQuotes();
- testParser( fixer::replace );
+ testParser( SmartQuotes::replace );
}
final var actual = parser.apply( testLine );
+ System.out.println( "ACTUAL:" + actual );
assertEquals( expected, actual );
lib/src/test/resources/com/keenwrite/quotes/smartypants.txt
''Cause I don't like it, 's why,' said Pat.
-&lsquo;&apos;Cause I don't like it, &apos;s why,&rsquo; said Pat.
+&lsquo;&apos;Cause I don&apos;t like it, &apos;s why,&rsquo; said Pat.
'It's a beautiful day!'
Took place in '04, yes'm!
Took place in &apos;04, yes&apos;m!
-
-# ########################################################################
-# Backticks (left and right double quotes)
-# ########################################################################
-``I am Sam''
-&ldquo;I am Sam&rdquo;
-
-``Sam's away today''
-&ldquo;Sam&apos;s away today&rdquo;
-
-``Sam's gone!
-&ldquo;Sam&apos;s gone!
-
-``5'10" tall 'e was!''
-&ldquo;5&prime;10&Prime; tall &apos;e was!&rdquo;
# ########################################################################
# Consecutive quotes
# ########################################################################
"'I'm trouble.'"
&ldquo;&lsquo;I&apos;m trouble.&rsquo;&rdquo;
'"Trouble's my name."'
-&lsquo;&ldquo;Trouble&apos;s my name.&ldquo;&lsquo;
+&lsquo;&ldquo;Trouble&apos;s my name.&rdquo;&rsquo;
# ########################################################################
# Nested quotations
# ########################################################################
"'Here I am,' said Sam"
&ldquo;&lsquo;Here I am,&rsquo; said Sam&rdquo;
'"Here I am," said Sam'
-&lsquo;&ldquo;Here I am,&rdquo;, said Sam&rsquo;
+&lsquo;&ldquo;Here I am,&rdquo; said Sam&rsquo;
'Hello, "Dr. Brown," what's your real name?'
-&lsquo;Hello, &ldquo;Dr. Brown,&rdquo; what's your real name?&rsquo;
+&lsquo;Hello, &ldquo;Dr. Brown,&rdquo; what&apos;s your real name?&rsquo;
"'Twas, t'wasn't thy name, 'twas it?" said Jim "the Barber" Brown.
&ldquo;She said, &lsquo;Llamas&apos;ll languish, they&apos;ll--
-'70s are my favorite numbers,' she said.
-&lsquo;70s are my favorite numbers,&rsquo; she said.
+'The 70s are my favorite numbers,' she said.
+&lsquo;The 70s are my favorite numbers,&rsquo; she said.
'70s fashion was weird.
Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season.
Book &apos;em, Danno. Rock &apos;n&apos; roll. &apos;Cause &apos;twas the season.
-
-'27 was a good year. (The entire '20s were.)
-&apos;27 was a good year. (The entire &apos;20s were.)
Delta: 107 lines added, 80 lines removed, 27-line increase