Dave Jarvis' Repositories

Author	Dave Jarvis <email>
Date	2021-06-02 22:02:37 GMT-0700
Commit	45f5d443fbe94dab1f5d613fc94d26970947860d
Parent	1238740

lib/src/main/java/com/keenwrite/quotes/Lexeme.java

		/**
		* Denotes there are no more lexemes: the end of text (EOT) has been reached.
		+ * The beginning index differentiates between EOT and SOT.
		*/
		- public static final Lexeme EOT = new Lexeme();
		+ public static final Lexeme EOT = new Lexeme( FLAG, -1, -2 );

		/**
		- * Denotes parsing at the start of text. This is useful to avoid branching
		- * conditions while iterating.
		+ * Denotes parsing at the start of text (SOT). This is useful to avoid
		+ * branching conditions while iterating. The beginning index differentiates
		+ * between EOT and SOT.
		*/
		- public static final Lexeme SOT = new Lexeme();
		+ public static final Lexeme SOT = new Lexeme( FLAG, 0, -2 );

		private final LexemeType mType;
		private final int mBegan;
		private final int mEnded;
		-
		- /**
		- * Create a flag that indicates a stream marker (start or end).
		- */
		- private Lexeme() {
		- this( FLAG, -1, -1 );
		- }

		/**

		int ended() {
		return mEnded;
		+ }
		+
		+ /**
		+ * Answers whether this lexeme is the end-of-text (EOT) marker.
		+ *
		+ * @return {@code true} only for a FLAG lexeme whose beginning index is -1.
		+ */
		+ boolean isEot() {
		+ return mType == FLAG && mBegan == -1;
		+ }
		+
		+ /**
		+ * Answers whether this lexeme is the start-of-text (SOT) marker. The type
		+ * is checked along with the beginning index because the first lexeme of
		+ * any text also begins at index 0 and must not be mistaken for the marker.
		+ *
		+ * @return {@code true} only for a FLAG lexeme whose beginning index is 0.
		+ */
		+ boolean isSot() {
		+ return mType == FLAG && mBegan == 0;
		}

lib/src/main/java/com/keenwrite/quotes/LexemeType.java

		ESC_SINGLE,
		ESC_DOUBLE,
		+ EOL,
		+ SPACE,
		WORD,
		NUMBER,
		- NEWLINE,
		- SPACE,
		PUNCT,
		+ HYPHEN,
		PERIOD,
		+ ELLIPSIS,
		FLAG
		}

lib/src/main/java/com/keenwrite/quotes/Lexer.java

		import java.text.CharacterIterator;
		import java.text.StringCharacterIterator;
		+import java.util.function.BiFunction;

		import static com.keenwrite.quotes.Lexeme.createLexeme;
		import static com.keenwrite.quotes.LexemeType.*;
		import static java.lang.Character.*;
		import static java.text.CharacterIterator.DONE;

		/**
		* Turns text into words, numbers, punctuation, spaces, and more.
		*/
		-public class Lexer {
		+public class Lexer {

		private final CharacterIterator mIterator;

		else if( curr == '"' ) {
		lexeme = createLexeme( QUOTE_DOUBLE, began, i.getIndex() );
		+ }
		+ else if( curr == '-' ) {
		+ lexeme = createLexeme( HYPHEN, began, i.getIndex() );
		}
		else if( isDigit( curr ) \|\| isNumeric( curr ) && isDigit( peek( i ) ) ) {
		- // Tokenize all consecutive number characters at prevent the main
		- // loop from switching back to word tokens.
		- char next;
		-
		- do {
		- next = i.next();
		- }
		- while( isDigit( next ) \|\| isNumeric( next ) && isDigit( peek( i ) ) );
		-
		- // The loop above will overshoot the number by one character.
		- i.previous();
		+ // Parse all consecutive number characters to prevent the main loop
		+ // from switching back to word tokens.
		+ slurp( i, ( next, ci ) ->
		+ isDigit( next ) \|\| isNumeric( next ) && isDigit( peek( ci ) )
		+ );

		lexeme = createLexeme( isWord ? WORD : NUMBER, began, i.getIndex() );
		}
		else if( curr == '.' ) {
		- lexeme = createLexeme( PERIOD, began, i.getIndex() );
		+ // Parse all consecutive periods into an ellipsis lexeme. This will
		+ // not capture space-separated ellipsis (such as ". . .").
		+ lexeme = createLexeme(
		+ slurp( i, ( next, ci ) -> next == '.' ) == 1 ? PERIOD : ELLIPSIS,
		+ began, i.getIndex()
		+ );
		}
		else if( curr == '\r' ) {
		- lexeme = createLexeme( NEWLINE, began, i.getIndex() );
		+ lexeme = createLexeme( EOL, began, i.getIndex() );

		// Swallow the LF in CRLF; peeking won't work here.
		if( i.next() != '\n' ) {
		// Push back the non-LF char.
		i.previous();
		}
		}
		else if( curr == '\n' ) {
		- lexeme = createLexeme( NEWLINE, began, i.getIndex() );
		+ lexeme = createLexeme( EOL, began, i.getIndex() );
		}
		else if( isWhitespace( curr ) ) {

		ci.previous();
		return ch;
		+ }
		+
		+ /**
		+ * Advances the iterator for as long as the characters following the
		+ * current one satisfy the given function, then steps back once so the
		+ * iterator rests on the last character that matched (or on the starting
		+ * character when nothing matched).
		+ *
		+ * @param ci The iterator containing characters to parse.
		+ * @param f The function that determines when slurping stops; it receives
		+ * the character just read and the iterator, and returns false to stop.
		+ * @return The number of times the iterator was advanced, which is one
		+ * more than the number of following characters that matched. A return
		+ * value of 1 therefore means no following character matched.
		+ */
		+ private static int slurp(
		+ final CharacterIterator ci,
		+ final BiFunction<Character, CharacterIterator, Boolean> f ) {
		+ char next;
		+ int count = 0;
		+
		+ do {
		+ next = ci.next();
		+ count++;
		+ }
		+ while( f.apply( next, ci ) );
		+
		+ // The loop overshoots by one character; step back onto the last match.
		+ ci.previous();
		+
		+ return count;
		}
		}

lib/src/main/java/com/keenwrite/quotes/Parser.java

		import static com.keenwrite.quotes.Contractions.beginsUnambiguously;
		import static com.keenwrite.quotes.Lexeme.EOT;
		+import static com.keenwrite.quotes.Lexeme.SOT;
		import static com.keenwrite.quotes.LexemeType.*;
		import static com.keenwrite.quotes.TokenType.*;
		-import static java.util.Comparator.comparingInt;

		/**


		while( (lexeme = mLexer.next()) != EOT ) {
		- parse( lexeme, consumer );
		+ tokenize( lexeme, consumer );
		}

		- // Parse the remaining lexemes because the EOT lexeme will terminate the
		- // loop above without having examined the last lexemes.
		- for( int i = 0; i < mLexemes.size(); i++ ) {
		- parse( mLexemes.get( i ), consumer );
		- }
		+ // By loop's end, the lexemes list contains tokens for all except the
		+ // final two elements (from tokenizing in triplets). Try emitting the last
		+ // two unprocessed lexemes as tokens.
		+ tokenize( EOT, consumer );
		+ tokenize( EOT, consumer );

		- mQuotationMarks.sort( comparingInt( lexemes -> lexemes[ 0 ].began() ) );
		+ // Lexemes not emitted as tokens may be ambiguous.
		+
		+ //mQuotationMarks.sort( comparingInt( lexemes -> lexemes[ 0 ].began() ) );
		+
		+ //
		+

		for( final var unparsed : mQuotationMarks ) {

		}

		- private void parse( final Lexeme lexeme, final Consumer<Token> consumer ) {
		+ private void tokenize( final Lexeme lexeme, final Consumer<Token> consumer ) {
		mLexemes.add( lexeme );
		+ tokenize( consumer );
		+ }

		+ private void tokenize( final Consumer<Token> consumer ) {
		final var lex1 = mLexemes.get( 0 );
		final var lex2 = mLexemes.get( 1 );

		// E.g., \"
		consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) );
		+ }
		+ else if( lex2.isType( QUOTE_DOUBLE ) &&
		+ (lex1.isSot() \|\| lex1.anyType( SPACE, HYPHEN, QUOTE_SINGLE )) &&
		+ lex3.anyType( WORD, NUMBER, ELLIPSIS, QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
		+ // Examples: "'Twas, "", "...
		+ consumer.accept( new Token( QUOTE_OPENING_DOUBLE, lex2 ) );
		+ }
		+ else if( lex2.isType( QUOTE_DOUBLE ) &&
		+ lex1.anyType( WORD, NUMBER, PERIOD, PUNCT, ELLIPSIS, QUOTE_SINGLE ) &&
		+ (lex3.anyType( SPACE, HYPHEN, QUOTE_SINGLE, EOL ) \|\| lex3.isEot()) ) {
		+ consumer.accept( new Token( QUOTE_CLOSING_DOUBLE, lex2 ) );
		}
		else if( lex2.anyType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
		mQuotationMarks.add( new Lexeme[]{lex1, lex2, lex3} );
		}
		}

		private void flush( final CircularFifoQueue<Lexeme> lexemes ) {
		- lexemes.add( Lexeme.SOT );
		- lexemes.add( Lexeme.SOT );
		- lexemes.add( Lexeme.SOT );
		+ lexemes.add( SOT );
		+ lexemes.add( SOT );
		+ lexemes.add( SOT );
		}
		}

lib/src/main/java/com/keenwrite/quotes/SmartQuotes.java


		/**
		- * Responsible for converting straight quotes into smart quotes.
		+ * Responsible for replacing {@link Token} instances with equivalent smart
		+ * quotes (or straight quotes).
		*/
		public class SmartQuotes {
		private static final Map<TokenType, String> REPLACEMENTS = Map.of(
		+ QUOTE_OPENING_SINGLE, "‘",
		+ QUOTE_CLOSING_SINGLE, "’",
		+ QUOTE_OPENING_DOUBLE, "“",
		+ QUOTE_CLOSING_DOUBLE, "”",
		QUOTE_STRAIGHT_SINGLE, "'",
		QUOTE_STRAIGHT_DOUBLE, "\"",

		final var tokens = new ArrayList<Token>();

		- // Store all parsed quotation marks; the parser may emit them in any order.
		+ // Store all parsed quotation marks.
		parser.parse( tokens::add );
		+
		+ // The parser may emit tokens in any order.
		sort( tokens );

lib/src/test/java/com/keenwrite/quotes/LexerTest.java

		void test_Lexing_Numbers_EmitNumbers() {
		testType( ".123", NUMBER );
		- testType( "-123.", PUNCT, NUMBER, PERIOD );
		+ testType( "-123.", HYPHEN, NUMBER, PERIOD );
		testType( " 123.123.123", SPACE, NUMBER );
		testType( "123 123\"", NUMBER, SPACE, NUMBER, QUOTE_DOUBLE );
		- testType( "-123,123.123", PUNCT, NUMBER );
		+ testType( "-123,123.123", HYPHEN, NUMBER );
		+ testType( "...1,023...", ELLIPSIS, NUMBER, ELLIPSIS );
		}

		@Test
		void test_Lexing_Words_EmitWords() {
		testType( "abc", WORD );
		testType( "abc abc", WORD, SPACE, WORD );
		+ testType( "abc...", WORD, ELLIPSIS );
		testType( "abc123", WORD, NUMBER );
		- testType( "-123abc", PUNCT, NUMBER, WORD );
		- testType( "abc-o'-abc", WORD, PUNCT, WORD, QUOTE_SINGLE, PUNCT, WORD );
		+ testType( "-123abc", HYPHEN, NUMBER, WORD );
		+ testType( "abc-o'-abc", WORD, HYPHEN, WORD, QUOTE_SINGLE, HYPHEN, WORD );
		}

		@Test
		void test_Lexing_PunctuationMarks_EmitPunctuationMarks() {
		testType( "!", PUNCT );
		testType( ";", PUNCT );
		testType( ".", PERIOD );
		+ testType( "-", HYPHEN );
		+ testType( "...", ELLIPSIS );
		}


		@Test
		void test_Lexing_Newlines_EmitNewlines() {
		- testType( "\r", NEWLINE );
		- testType( "\n", NEWLINE );
		- testType( "\r\n", NEWLINE );
		- testType( "\r\n\r\n", NEWLINE, NEWLINE );
		- testType( "\r\n\n\r", NEWLINE, NEWLINE, NEWLINE );
		- testType( "abc \r\nabc\n", WORD, SPACE, NEWLINE, WORD, NEWLINE );
		+ testType( "\r", EOL );
		+ testType( "\n", EOL );
		+ testType( "\r\n", EOL );
		+ testType( "\r\n\r\n", EOL, EOL );
		+ testType( "\r\n\n\r", EOL, EOL, EOL );
		+ testType( "abc \r\nabc\n", WORD, SPACE, EOL, WORD, EOL );
		}

lib/src/test/java/com/keenwrite/quotes/SmartQuotesTest.java

		import java.util.function.Function;

		+import static java.lang.System.out;
		import static org.junit.jupiter.api.Assertions.assertEquals;
		import static org.junit.jupiter.api.Assertions.assertNotNull;

		public void test_parse_SingleLine_Parsed() {
		final var fixer = new SmartQuotes();
		- final var output = fixer.replace( "Didn' get th' message." );
		- System.out.println( output );
		+ out.println( fixer.replace( "\"Didn' get th' message.\"" ) );
		+ out.println( fixer.replace( "\"John asked, 'What are you, beyond " +
		+ "\"somethin' shiny?\"'\" said Fred.\n" ) );
		}

lib/src/test/resources/com/keenwrite/quotes/smartypants.txt

		# Primes (single, double)
		# ########################################################################
		-
		-# No space after the feet sign.
		-It's 4'11" away.
		-It's 4′11″ away.
		-
		Alice's friend is 6'3" tall.
		Alice's friend is 6′3″ tall.

		Angular measure 3° 5' 30" means 3 degs, 5 arcmins, & 30 arcsecs.
		Angular measure 3° 5′ 30″ means 3 degs, 5 arcmins, & 30 arcsecs.
		+
		+# ########################################################################
		+# Double quotes
		+# ########################################################################
		+"I am Sam"
		+“I am Sam”
		+
		+"...even better!"
		+“...even better!”
		+
		+"It was so," said he.
		+“It was so,” said he.
		+
		+With "air quotes" in the middle.
		+With “air quotes” in the middle.
		+
		+With--"air quotes"--and dashes.
		+With--“air quotes”--and dashes.
		+
		+"Not "quite" what you expected?"
		+“Not “quite” what you expected?”
		+
		+"Not all open quotes are closed...
		+“Not all open quotes are closed...
		+
		+# ########################################################################
		+# Single quotes
		+# ########################################################################
		+'I am Sam'
		+‘I am Sam’
		+
		+'It was so,' said he.
		+‘It was so,’ said he.
		+
		+'...even better!'
		+‘...even better!’
		+
		+With 'quotes' in the middle.
		+With ‘quotes’ in the middle.
		+
		+With--'imaginary'--dashes.
		+With--‘imaginary’--dashes.
		+
		+'Not 'quite' what you expected?'
		+‘Not ‘quite’ what you expected?’
		+
		+''Cause I don't like it, 's why,' said Pat.
		+‘'Cause I don't like it, 's why,’ said Pat.
		+
		+'It's a beautiful day!'
		+‘It's a beautiful day!’
		+
		+'He said, "Thinkin'."'
		+‘He said, “Thinkin’.”’
		+
		+"She said, 'Llamas'll languish, they'll--
		+“She said, ‘Llamas'll languish, they'll--
		+
		+# ########################################################################
		+# Possessives
		+# ########################################################################
		+Sam's Sams' and the Ross's roses' thorns were prickly.
		+Sam's Sams' and the Ross's roses' thorns were prickly.

		# ########################################################################

		# Outside contractions (leading and trailing, no middle)
		# ########################################################################
		-Salt 'n' vinegar, fish-'n'-chips, sugar 'n' spice!
		-Salt 'n' vinegar, fish-'n'-chips, sugar 'n' spice!
		+Salt 'n' vinegar, fish-'n'-chips, sugar 'n spice!
		+Salt 'n' vinegar, fish-'n'-chips, sugar 'n spice!

		# ########################################################################

		'"Trouble's my name."'
		‘“Trouble's my name.”’
		-
		-# ########################################################################
		-# Double quotes
		-# ########################################################################
		-"I am Sam"
		-“I am Sam”
		-
		-"...even better!"
		-“...even better!”
		-
		-"It was so," said he.
		-“It was so,” said he.
		-
		-"She said, 'Llamas'll languish, they'll--
		-“She said, ‘Llamas'll languish, they'll--
		-
		-With "air quotes" in the middle.
		-With “air quotes” in the middle.
		-
		-With--"air quotes"--and dashes.
		-With--“air quotes”--and dashes.
		-
		-"Not "quite" what you expected?"
		-“Not “quite” what you expected?”

		# ########################################################################

		"'Twas, t'wasn't thy name, 'twas it?" said Jim "the Barber" Brown.
		“'Twas, t'wasn't thy name, 'twas it?” said Jim “the Barber” Brown.
		-
		-# ########################################################################
		-# Single quotes
		-# ########################################################################
		-'I am Sam'
		-‘I am Sam’
		-
		-'It was so,' said he.
		-‘It was so,’ said he.
		-
		-'...even better!'
		-‘...even better!’
		-
		-With 'quotes' in the middle.
		-With ‘quotes’ in the middle.
		-
		-With--'imaginary'--dashes.
		-With--‘imaginary’--dashes.
		-
		-'Not 'quite' what you expected?'
		-‘Not ‘quite’ what you expected?’
		-
		-''Cause I don't like it, 's why,' said Pat.
		-‘'Cause I don't like it, 's why,’ said Pat.
		-
		-'It's a beautiful day!'
		-‘It's a beautiful day!’
		-
		-'He said, "Thinkin'."'
		-‘He said, “Thinkin’.”’
		-
		-# ########################################################################
		-# Possessives
		-# ########################################################################
		-Sam's Sams' and the Ross's roses' thorns were prickly.
		-Sam's Sams' and the Ross's roses' thorns were prickly.

		# ########################################################################

		Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season.

		-'85 was a good year. (The entire '80s were.)
		-'85 was a good year. (The entire '80s were.)
		+'27 was a good year. (The entire '20s were.)
		+'27 was a good year. (The entire '20s were.)

Emit unambiguous double quotes