Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Emit unambiguous double quotes

Author: Dave Jarvis <email>
Date: 2021-06-02 22:02:37 GMT-0700
Commit: 45f5d443fbe94dab1f5d613fc94d26970947860d
Parent: 1238740
lib/src/main/java/com/keenwrite/quotes/Lexeme.java
/**
* Denotes there are no more lexemes: the end of text (EOT) has been reached.
+ * The beginning index differentiates between EOT and SOT.
*/
- public static final Lexeme EOT = new Lexeme();
+ public static final Lexeme EOT = new Lexeme( FLAG, -1, -2 );
/**
- * Denotes parsing at the start of text. This is useful to avoid branching
- * conditions while iterating.
+ * Denotes parsing at the start of text (SOT). This is useful to avoid
+ * branching conditions while iterating. The beginning index differentiates
+ * between EOT and SOT.
*/
- public static final Lexeme SOT = new Lexeme();
+ public static final Lexeme SOT = new Lexeme( FLAG, 0, -2 );
private final LexemeType mType;
private final int mBegan;
private final int mEnded;
-
- /**
- * Create a flag that indicates a stream marker (start or end).
- */
- private Lexeme() {
- this( FLAG, -1, -1 );
- }
/**
int ended() {
return mEnded;
+ }
+
+ boolean isEot() {
+ return mBegan == -1;
+ }
+
+ boolean isSot() {
+ return mBegan == 0;
}
lib/src/main/java/com/keenwrite/quotes/LexemeType.java
ESC_SINGLE,
ESC_DOUBLE,
+ EOL,
+ SPACE,
WORD,
NUMBER,
- NEWLINE,
- SPACE,
PUNCT,
+ HYPHEN,
PERIOD,
+ ELLIPSIS,
FLAG
}
lib/src/main/java/com/keenwrite/quotes/Lexer.java
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
+import java.util.function.BiFunction;
import static com.keenwrite.quotes.Lexeme.createLexeme;
import static com.keenwrite.quotes.LexemeType.*;
import static java.lang.Character.*;
import static java.text.CharacterIterator.DONE;
/**
* Turns text into words, numbers, punctuation, spaces, and more.
*/
-public class Lexer {
+public class Lexer {
private final CharacterIterator mIterator;
else if( curr == '"' ) {
lexeme = createLexeme( QUOTE_DOUBLE, began, i.getIndex() );
+ }
+ else if( curr == '-' ) {
+ lexeme = createLexeme( HYPHEN, began, i.getIndex() );
}
else if( isDigit( curr ) || isNumeric( curr ) && isDigit( peek( i ) ) ) {
- // Tokenize all consecutive number characters at prevent the main
- // loop from switching back to word tokens.
- char next;
-
- do {
- next = i.next();
- }
- while( isDigit( next ) || isNumeric( next ) && isDigit( peek( i ) ) );
-
- // The loop above will overshoot the number by one character.
- i.previous();
+ // Parse all consecutive number characters to prevent the main loop
+ // from switching back to word tokens.
+ slurp( i, ( next, ci ) ->
+ isDigit( next ) || isNumeric( next ) && isDigit( peek( ci ) )
+ );
lexeme = createLexeme( isWord ? WORD : NUMBER, began, i.getIndex() );
}
else if( curr == '.' ) {
- lexeme = createLexeme( PERIOD, began, i.getIndex() );
+ // Parse all consecutive periods into an ellipsis lexeme. This will
+ // not capture space-separated ellipsis (such as ". . .").
+ lexeme = createLexeme(
+ slurp( i, ( next, ci ) -> next == '.' ) == 1 ? PERIOD : ELLIPSIS,
+ began, i.getIndex()
+ );
}
else if( curr == '\r' ) {
- lexeme = createLexeme( NEWLINE, began, i.getIndex() );
+ lexeme = createLexeme( EOL, began, i.getIndex() );
// Swallow the LF in CRLF; peeking won't work here.
if( i.next() != '\n' ) {
// Push back the non-LF char.
i.previous();
}
}
else if( curr == '\n' ) {
- lexeme = createLexeme( NEWLINE, began, i.getIndex() );
+ lexeme = createLexeme( EOL, began, i.getIndex() );
}
else if( isWhitespace( curr ) ) {
ci.previous();
return ch;
+ }
+
+ /**
+ * Advances the iterator past every consecutive character accepted by the
+ * given function. The iterator is always advanced at least once before the
+ * function is consulted, then stepped back once after the first rejected
+ * character.
+ *
+ * @param ci The iterator containing characters to parse.
+ * @param f The function that returns {@code true} to continue slurping and
+ * {@code false} to stop; it receives the character just read and
+ * the iterator itself.
+ * @return The number of times the iterator was advanced, which includes
+ * the advance onto the character that stopped the slurping.
+ */
+ private static int slurp(
+ final CharacterIterator ci,
+ final BiFunction<Character, CharacterIterator, Boolean> f ) {
+ char next;
+ int count = 0;
+
+ do {
+ next = ci.next();
+ count++;
+ }
+ while( f.apply( next, ci ) );
+
+ // The loop above reads one character too far; step back over it.
+ ci.previous();
+
+ return count;
}
}
lib/src/main/java/com/keenwrite/quotes/Parser.java
import static com.keenwrite.quotes.Contractions.beginsUnambiguously;
import static com.keenwrite.quotes.Lexeme.EOT;
+import static com.keenwrite.quotes.Lexeme.SOT;
import static com.keenwrite.quotes.LexemeType.*;
import static com.keenwrite.quotes.TokenType.*;
-import static java.util.Comparator.comparingInt;
/**
while( (lexeme = mLexer.next()) != EOT ) {
- parse( lexeme, consumer );
+ tokenize( lexeme, consumer );
}
- // Parse the remaining lexemes because the EOT lexeme will terminate the
- // loop above without having examined the last lexemes.
- for( int i = 0; i < mLexemes.size(); i++ ) {
- parse( mLexemes.get( i ), consumer );
- }
+ // By loop's end, the lexemes list contains tokens for all except the
+ // final two elements (from tokenizing in triplets). Try emitting the last
+ // two unprocessed lexemes as tokens.
+ tokenize( EOT, consumer );
+ tokenize( EOT, consumer );
- mQuotationMarks.sort( comparingInt( lexemes -> lexemes[ 0 ].began() ) );
+ // Lexemes not emitted as tokens may be ambiguous.
+
+ //mQuotationMarks.sort( comparingInt( lexemes -> lexemes[ 0 ].began() ) );
+
+ //
+
for( final var unparsed : mQuotationMarks ) {
}
- private void parse( final Lexeme lexeme, final Consumer<Token> consumer ) {
+ private void tokenize( final Lexeme lexeme, final Consumer<Token> consumer ) {
mLexemes.add( lexeme );
+ tokenize( consumer );
+ }
+ private void tokenize( final Consumer<Token> consumer ) {
final var lex1 = mLexemes.get( 0 );
final var lex2 = mLexemes.get( 1 );
// E.g., \"
consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) );
+ }
+ else if( lex2.isType( QUOTE_DOUBLE ) &&
+ (lex1.isSot() || lex1.anyType( SPACE, HYPHEN, QUOTE_SINGLE )) &&
+ lex3.anyType( WORD, NUMBER, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
+ // Examples: "'Twas, "", "...
+ consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) );
+ }
+ else if( lex2.isType( QUOTE_DOUBLE ) &&
+ lex1.anyType( WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_SINGLE ) &&
+ (lex3.anyType( SPACE, HYPHEN, QUOTE_SINGLE, EOL ) || lex3.isEot()) ) {
+ consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) );
}
else if( lex2.anyType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
mQuotationMarks.add( new Lexeme[]{lex1, lex2, lex3} );
}
}
private void flush( final CircularFifoQueue<Lexeme> lexemes ) {
- lexemes.add( Lexeme.SOT );
- lexemes.add( Lexeme.SOT );
- lexemes.add( Lexeme.SOT );
+ lexemes.add( SOT );
+ lexemes.add( SOT );
+ lexemes.add( SOT );
}
}
lib/src/main/java/com/keenwrite/quotes/SmartQuotes.java
/**
- * Responsible for converting straight quotes into smart quotes.
+ * Responsible for replacing {@link Token} instances with equivalent smart
+ * quotes (or straight quotes).
*/
public class SmartQuotes {
private static final Map<TokenType, String> REPLACEMENTS = Map.of(
+ QUOTE_OPENING_SINGLE, "&lsquo;",
+ QUOTE_CLOSING_SINGLE, "&rsquo;",
+ QUOTE_OPENING_DOUBLE, "&ldquo;",
+ QUOTE_CLOSING_DOUBLE, "&rdquo;",
QUOTE_STRAIGHT_SINGLE, "'",
QUOTE_STRAIGHT_DOUBLE, "\"",
final var tokens = new ArrayList<Token>();
- // Store all parsed quotation marks; the parser may emit them in any order.
+ // Store all parsed quotation marks.
parser.parse( tokens::add );
+
+ // The parser may emit tokens in any order.
sort( tokens );
lib/src/test/java/com/keenwrite/quotes/LexerTest.java
void test_Lexing_Numbers_EmitNumbers() {
testType( ".123", NUMBER );
- testType( "-123.", PUNCT, NUMBER, PERIOD );
+ testType( "-123.", HYPHEN, NUMBER, PERIOD );
testType( " 123.123.123", SPACE, NUMBER );
testType( "123 123\"", NUMBER, SPACE, NUMBER, QUOTE_DOUBLE );
- testType( "-123,123.123", PUNCT, NUMBER );
+ testType( "-123,123.123", HYPHEN, NUMBER );
+ testType( "...1,023...", ELLIPSIS, NUMBER, ELLIPSIS );
}
@Test
void test_Lexing_Words_EmitWords() {
testType( "abc", WORD );
testType( "abc abc", WORD, SPACE, WORD );
+ testType( "abc...", WORD, ELLIPSIS );
testType( "abc123", WORD, NUMBER );
- testType( "-123abc", PUNCT, NUMBER, WORD );
- testType( "abc-o'-abc", WORD, PUNCT, WORD, QUOTE_SINGLE, PUNCT, WORD );
+ testType( "-123abc", HYPHEN, NUMBER, WORD );
+ testType( "abc-o'-abc", WORD, HYPHEN, WORD, QUOTE_SINGLE, HYPHEN, WORD );
}
@Test
void test_Lexing_PunctuationMarks_EmitPunctuationMarks() {
testType( "!", PUNCT );
testType( ";", PUNCT );
testType( ".", PERIOD );
+ testType( "-", HYPHEN );
+ testType( "...", ELLIPSIS );
}
@Test
void test_Lexing_Newlines_EmitNewlines() {
- testType( "\r", NEWLINE );
- testType( "\n", NEWLINE );
- testType( "\r\n", NEWLINE );
- testType( "\r\n\r\n", NEWLINE, NEWLINE );
- testType( "\r\n\n\r", NEWLINE, NEWLINE, NEWLINE );
- testType( "abc \r\nabc\n", WORD, SPACE, NEWLINE, WORD, NEWLINE );
+ testType( "\r", EOL );
+ testType( "\n", EOL );
+ testType( "\r\n", EOL );
+ testType( "\r\n\r\n", EOL, EOL );
+ testType( "\r\n\n\r", EOL, EOL, EOL );
+ testType( "abc \r\nabc\n", WORD, SPACE, EOL, WORD, EOL );
}
lib/src/test/java/com/keenwrite/quotes/SmartQuotesTest.java
import java.util.function.Function;
+import static java.lang.System.out;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
public void test_parse_SingleLine_Parsed() {
final var fixer = new SmartQuotes();
- final var output = fixer.replace( "Didn' get th' message." );
- System.out.println( output );
+ out.println( fixer.replace( "\"Didn' get th' message.\"" ) );
+ out.println( fixer.replace( "\"John asked, 'What are you, beyond " +
+ "\"somethin' shiny?\"'\" said Fred.\n" ) );
}
lib/src/test/resources/com/keenwrite/quotes/smartypants.txt
# Primes (single, double)
# ########################################################################
-
-# No space after the feet sign.
-It's 4'11" away.
-It&apos;s 4&prime;11&Prime; away.
-
Alice's friend is 6'3" tall.
Alice&apos;s friend is 6&prime;3&Prime; tall.
Angular measure 3° 5' 30" means 3 degs, 5 arcmins, & 30 arcsecs.
Angular measure 3° 5&prime; 30&Prime; means 3 degs, 5 arcmins, & 30 arcsecs.
+
+# ########################################################################
+# Double quotes
+# ########################################################################
+"I am Sam"
+&ldquo;I am Sam&rdquo;
+
+"...even better!"
+&ldquo;...even better!&rdquo;
+
+"It was so," said he.
+&ldquo;It was so,&rdquo; said he.
+
+With "air quotes" in the middle.
+With &ldquo;air quotes&rdquo; in the middle.
+
+With--"air quotes"--and dashes.
+With--&ldquo;air quotes&rdquo;--and dashes.
+
+"Not "quite" what you expected?"
+&ldquo;Not &ldquo;quite&rdquo; what you expected?&rdquo;
+
+"Not all open quotes are closed...
+&ldquo;Not all open quotes are closed...
+
+# ########################################################################
+# Single quotes
+# ########################################################################
+'I am Sam'
+&lsquo;I am Sam&rsquo;
+
+'It was so,' said he.
+&lsquo;It was so,&rsquo; said he.
+
+'...even better!'
+&lsquo;...even better!&rsquo;
+
+With 'quotes' in the middle.
+With &lsquo;quotes&rsquo; in the middle.
+
+With--'imaginary'--dashes.
+With--&lsquo;imaginary&rsquo;--dashes.
+
+'Not 'quite' what you expected?'
+&lsquo;Not &lsquo;quite&rsquo; what you expected?&rsquo;
+
+''Cause I don't like it, 's why,' said Pat.
+&lsquo;&apos;Cause I don't like it, &apos;s why,&rsquo; said Pat.
+
+'It's a beautiful day!'
+&lsquo;It&apos;s a beautiful day!&rsquo;
+
+'He said, "Thinkin'."'
+&lsquo;He said, &ldquo;Thinkin&rsquo;.&rdquo;&rsquo;
+
+"She said, 'Llamas'll languish, they'll--
+&ldquo;She said, &lsquo;Llamas&apos;ll languish, they&apos;ll--
+
+# ########################################################################
+# Possessives
+# ########################################################################
+Sam's Sams' and the Ross's roses' thorns were prickly.
+Sam&apos;s Sams&apos; and the Ross&apos;s roses&apos; thorns were prickly.
# ########################################################################
# Outside contractions (leading and trailing, no middle)
# ########################################################################
-Salt 'n' vinegar, fish-'n'-chips, sugar 'n' spice!
-Salt &apos;n&apos; vinegar, fish-&apos;n&apos;-chips, sugar &apos;n&apos; spice!
+Salt 'n' vinegar, fish-'n'-chips, sugar 'n spice!
+Salt &apos;n&apos; vinegar, fish-&apos;n&apos;-chips, sugar &apos;n spice!
# ########################################################################
'"Trouble's my name."'
&lsquo;&ldquo;Trouble&apos;s my name.&rdquo;&rsquo;
-
-# ########################################################################
-# Double quotes
-# ########################################################################
-"I am Sam"
-&ldquo;I am Sam&rdquo;
-
-"...even better!"
-&ldquo;...even better!&rdquo;
-
-"It was so," said he.
-&ldquo;It was so,&rdquo; said he.
-
-"She said, 'Llamas'll languish, they'll--
-&ldquo;She said, &lsquo;Llamas&apos;ll languish, they&apos;ll--
-
-With "air quotes" in the middle.
-With &ldquo;air quotes&rdquo; in the middle.
-
-With--"air quotes"--and dashes.
-With--&ldquo;air quotes&rdquo;--and dashes.
-
-"Not "quite" what you expected?"
-&ldquo;Not &ldquo;quite&rdquo; what you expected?&rdquo;
# ########################################################################
"'Twas, t'wasn't thy name, 'twas it?" said Jim "the Barber" Brown.
&ldquo;&apos;Twas, t&apos;wasn&apos;t thy name, &apos;twas it?&rdquo; said Jim &ldquo;the Barber&rdquo; Brown.
-
-# ########################################################################
-# Single quotes
-# ########################################################################
-'I am Sam'
-&lsquo;I am Sam&rsquo;
-
-'It was so,' said he.
-&lsquo;It was so,&rsquo; said he.
-
-'...even better!'
-&lsquo;...even better!&rsquo;
-
-With 'quotes' in the middle.
-With &lsquo;quotes&rsquo; in the middle.
-
-With--'imaginary'--dashes.
-With--&lsquo;imaginary&rsquo;--dashes.
-
-'Not 'quite' what you expected?'
-&lsquo;Not &lsquo;quite&rsquo; what you expected?&rsquo;
-
-''Cause I don't like it, 's why,' said Pat.
-&lsquo;&apos;Cause I don't like it, &apos;s why,&rsquo; said Pat.
-
-'It's a beautiful day!'
-&lsquo;It&apos;s a beautiful day!&rsquo;
-
-'He said, "Thinkin'."'
-&lsquo;He said, &ldquo;Thinkin&rsquo;.&rdquo;&rsquo;
-
-# ########################################################################
-# Possessives
-# ########################################################################
-Sam's Sams' and the Ross's roses' thorns were prickly.
-Sam&apos;s Sams&apos; and the Ross&apos;s roses&apos; thorns were prickly.
# ########################################################################
Book &apos;em, Danno. Rock &apos;n&apos; roll. &apos;Cause &apos;twas the season.
-'85 was a good year. (The entire '80s were.)
-&apos;85 was a good year. (The entire &apos;80s were.)
+'27 was a good year. (The entire '20s were.)
+&apos;27 was a good year. (The entire &apos;20s were.)
Delta: 186 lines added, 123 lines removed, 63-line increase