Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Distinguish newlines from new paragraphs

Author Dave Jarvis <email>
Date 2021-06-08 21:13:54 GMT-0700
Commit ca8bd102a691e2716b90e1b6661d103ff2dbd32e
Parent 72021e7
Delta 58 lines added, 32 lines removed, 26-line increase
lib/src/main/java/com/keenwrite/quotes/Contractions.java
private static final Set<String> ENDED_AMBIGUOUS = Set.of(
- // he|a
- "a",
// and|an
"an",
return ENDED_AMBIGUOUS.contains( check ) || check.endsWith( "s" ) ||
- check.endsWith( "n" ) || check.endsWith( "z" ) ||
- check.endsWith( "x" ) || check.endsWith( "ch" );
+ check.endsWith( "n" ) || check.endsWith( "z" ) || check.endsWith( "x" );
}
lib/src/main/java/com/keenwrite/quotes/LexemeType.java
ESC_DOUBLE,
EOL,
+ EOP,
SPACE,
WORD,
lib/src/main/java/com/keenwrite/quotes/Lexer.java
*/
public class Lexer {
-
+ /**
+ * Iterates over the entire string of text to help produce lexemes.
+ */
private final CharacterIterator mIterator;
);
}
- else if( curr == '\r' ) {
- lexeme = createLexeme( EOL, began, i.getIndex() );
+ else if( curr == '\r' || curr == '\n' ) {
+ final var cr = new int[]{curr == '\r' ? 1 : 0};
+ final var lf = new int[]{curr == '\n' ? 1 : 0};
- // Swallow the LF in CRLF; peeking won't work here.
- if( i.next() != '\n' ) {
- // Push back the non-LF char.
- i.previous();
- }
- }
- else if( curr == '\n' ) {
- lexeme = createLexeme( EOL, began, i.getIndex() );
+ // Swallow all consecutive CR (Mac), CRLF (Windows), and/or LF (Unix).
+ slurp(
+ i, ( next, ci ) -> {
+ cr[ 0 ] += next == '\r' ? 1 : 0;
+ lf[ 0 ] += next == '\n' ? 1 : 0;
+ return next == '\r' || next == '\n';
+ }
+ );
+
+ final var eol = cr[ 0 ] + lf[ 0 ] == 1 || cr[ 0 ] == 1 && lf[ 0 ] == 1;
+ lexeme = createLexeme( eol ? EOL : EOP, began, i.getIndex() );
}
else if( isWhitespace( curr ) ) {
ci.previous();
- return count;
+ return --count;
}
}
lib/src/main/java/com/keenwrite/quotes/Parser.java
*/
private static final LexemeType[] LAGGING_QUOTE_OPENING_SINGLE =
- new LexemeType[]{WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE};
+ new LexemeType[]{WORD, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE, EOP};
/**
*/
private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE =
- new LexemeType[]{SPACE, HYPHEN, QUOTE_SINGLE, OPENING_GROUP};
+ new LexemeType[]{SPACE, HYPHEN, QUOTE_SINGLE, OPENING_GROUP, EOP};
/**
}
+ System.out.println( "--------------------" );
System.out.println( "ambig leading: " + ambiguousLeadingQuotes.size() );
System.out.println( "ambig lagging: " + ambiguousLaggingQuotes.size() );
System.out.println( "resolv leading: " + resolvedLeadingQuotes );
System.out.println( "resolv lagging: " + resolvedLaggingQuotes );
+
+ final var ambiguousLeadingCount = ambiguousLeadingQuotes.size();
+ final var ambiguousLaggingCount = ambiguousLaggingQuotes.size();
if( resolvedLeadingQuotes == 1 && resolvedLaggingQuotes == 0 ) {
- if( ambiguousLeadingQuotes.size() == 0 && ambiguousLaggingQuotes.size() == 1 ) {
+ if( ambiguousLeadingCount == 0 && ambiguousLaggingCount == 1 ) {
final var balanced = mClosingSingleQuote - mOpeningSingleQuote == 0;
final var quote = balanced ? QUOTE_APOSTROPHE : QUOTE_CLOSING_SINGLE;
consumer.accept( new Token( quote, ambiguousLaggingQuotes.get( 0 ) ) );
}
}
- else if( ambiguousLeadingQuotes.size() > 0 && ambiguousLaggingQuotes.size() == 0 ) {
+ else if( ambiguousLeadingCount > 0 && ambiguousLaggingCount == 0 ) {
// If there are no ambiguous lagging quotes then all ambiguous leading
// quotes must be contractions.
ambiguousLeadingQuotes.forEach(
lex -> consumer.accept( new Token( QUOTE_APOSTROPHE, lex ) )
);
}
- else if( ambiguousLeadingQuotes.size() == 0 && ambiguousLaggingQuotes.size() > 0 ) {
+ else if( ambiguousLeadingCount == 0 && ambiguousLaggingCount > 0 ) {
// If there are no ambiguous leading quotes then all ambiguous lagging
// quotes must be contractions.
ambiguousLaggingQuotes.forEach(
lex -> consumer.accept( new Token( QUOTE_APOSTROPHE, lex ) )
);
}
- else if( resolvedLeadingQuotes == 1 && ambiguousLaggingQuotes.size() == 1 ) {
+ else if( ambiguousLeadingCount == 0 ) {
+ if( resolvedLaggingQuotes < resolvedLeadingQuotes ) {
+ for( final var mark : mQuotationMarks ) {
+ consumer.accept( new Token( QUOTE_CLOSING_SINGLE, mark[ 1 ] ) );
+ }
+ }
+ }
+ else if( resolvedLeadingQuotes == 1 && ambiguousLaggingCount == 1 ) {
consumer.accept(
new Token( QUOTE_CLOSING_SINGLE, ambiguousLaggingQuotes.get( 0 ) )
if( lex2.isType( QUOTE_SINGLE ) &&
- (lex3.isEot() || lex3.anyType( SPACE, HYPHEN, EOL )) ) {
+ (lex3.isEot() || lex3.anyType( SPACE, HYPHEN, EOL, EOP )) ) {
consumer.accept( new Token( QUOTE_CLOSING_SINGLE, lex2 ) );
mClosingSingleQuote++;
lib/src/test/java/com/keenwrite/quotes/SmartQuotesTest.java
public void test_parse_SingleLine_Parsed() {
out.println( SmartQuotes.replace(
- "'Bout that time I says, 'Boys! I been thinkin' 'bout th' Universe.'" )
- );
+ "'Oak,' 'elm,' and 'beech' are names of trees. So is 'pine.'"
+ ) );
+
+ System.out.println(" ------------------------ ");
+
+ out.println( SmartQuotes.replace(
+ "'Bout that time I says, 'Boys! I been thinkin' 'bout th' Universe.'"
+ ) );
}
}
- System.out.println( "EXPECT:" + expected );
+ testLine = unescapeEol( testLine );
+ expected = unescapeEol( expected );
+ System.out.println( testLine );
final var actual = parser.apply( testLine );
- System.out.println( "ACTUAL:" + actual );
assertEquals( expected, actual );
testLine = "";
expected = "";
}
}
+ }
+
+ private static String unescapeEol( final String s) {
+ return String.join( "\n", s.split( "\\\\n" ) );
}
lib/src/test/resources/com/keenwrite/quotes/smartypants.txt
Model "T2000"
-iPad 3's battery life is not great.
-iPad 3&apos;s battery life is not great.
+The 2's complement.\n\n"Yar, binary!"
+The 2&apos;s complement.\n\n&ldquo;Yar, binary!&rdquo;
'Oak,' 'elm,' and 'beech' are names of trees. So is 'pine.'
&lsquo;Oak,&rsquo; &lsquo;elm,&rsquo; and &lsquo;beech&rsquo; are names of trees. So is &lsquo;pine.&rsquo;
+
+'A', 'B', and 'C' are letters.
+&lsquo;A&rsquo;, &lsquo;B&rsquo;, and &lsquo;C&rsquo; are letters.
Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season.
Book &apos;em, Danno. Rock &apos;n&apos; roll. &apos;Cause &apos;twas the season.
'He said, "Thinkin'."'
&lsquo;He said, &ldquo;Thinkin&apos;.&rdquo;&rsquo;
-
-'A', 'B', and 'C' are letters.
-&lsquo;A&rsquo; &lsquo;B&rsquo; and &lsquo;C&rsquo; are letters.