Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Add lexemes, emit apostrophes and primes

Author: Dave Jarvis <email>
Date: 2021-05-31 18:28:39 GMT-0700
Commit: f40e32cbe72ac0af42e55bde67c9204ab88cf840
Parent: 5aaef29
lib/src/main/java/com/keenwrite/quotes/Contractions.java
+package com.keenwrite.quotes;
+
+import java.util.Set;
+
+public class Contractions {
+ /**
+ * Words having a straight apostrophe that cannot be mistaken for an
+ * opening single quote. Entries are written in lower case and without the
+ * leading apostrophe (e.g., "twas" stands for 'twas), because lookups
+ * lowercase the word that follows the apostrophe.
+ * <p>
+ * NOTE(review): some entries contain an apostrophe themselves (e.g.,
+ * "tain't"); a caller that passes words split at apostrophes would never
+ * match them -- confirm against the caller.
+ * </p>
+ */
+ private static final Set<String> BEGAN_UNAMBIGUOUS = Set.of(
+ "aporth", "boutcha", "boutchu", "cept", "dillo", "e'll", "fraid",
+ "gainst", "n", "neath", "nother", "onna", "onna'", "pon", "s", "sblood",
+ "scuse", "sfar", "sfoot", "t", "taint", "tain't", "til", "tis", "tisn't",
+ "tshall", "twas", "twasn't", "tween", "twere", "tweren't", "twixt",
+ "twon't", "twou'd", "twou'dn't", "twould", "twouldn't", "ve"
+ );
+
+ /**
+ * Words having a straight apostrophe that may be either part of a
+ * contraction or a word that stands alone beside an opening single quote.
+ * Entries are written in lower case and without the leading apostrophe.
+ */
+ private static final Set<String> BEGAN_AMBIGUOUS = Set.of(
+ "bout", "choo", "ere", "e", "e's", "fro", "ho", "kay", "lo",
+ "re", "sup", "twill", "um", "zat"
+ );
+
+ /**
+ * Answers whether the given word is a contraction that always starts
+ * with an apostrophe. The comparison is case insensitive. This must
+ * only be called when a straight quote is followed by a word.
+ * <p>
+ * The null check is an assertion, so it is only enforced when assertions
+ * are enabled.
+ * </p>
+ * <p>
+ * NOTE(review): the word is lowercased using the default locale; under a
+ * Turkish locale an upper case "I" does not become "i", so words such as
+ * "TIS" would not match. Consider a locale-independent conversion.
+ * </p>
+ *
+ * @param word The word to compare against the list of known unambiguous
+ * contractions, without its leading apostrophe; must not be
+ * {@code null}.
+ * @return {@code true} when the given word is in the set of unambiguous
+ * contractions.
+ */
+ public static boolean beginsUnambiguously( final String word ) {
+ assert word != null;
+ return BEGAN_UNAMBIGUOUS.contains( word.toLowerCase() );
+ }
+
+ /**
+ * Answers whether the given word could be a contraction but is also a
+ * valid word in non-contracted form. The comparison is case insensitive
+ * and, like {@link #beginsUnambiguously(String)}, lowercases the word
+ * using the default locale.
+ *
+ * @param word The word to compare against the list of known ambiguous
+ * contractions, without its leading apostrophe; must not be
+ * {@code null}.
+ * @return {@code true} when the given word is in the set of ambiguous
+ * contractions.
+ */
+ public static boolean beginsAmbiguously( final String word ) {
+ assert word != null;
+ return BEGAN_AMBIGUOUS.contains( word.toLowerCase() );
+ }
+}
lib/src/main/java/com/keenwrite/quotes/Lexeme.java
+/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.keenwrite.quotes;
+
+import static com.keenwrite.quotes.LexemeType.INVALID;
+
+/**
+ * Responsible for tracking the beginning and ending offsets of a token within
+ * a text string. Tracking the beginning and ending indices should use less
+ * memory than duplicating the entire text of Unicode characters (i.e., using
+ * a similar approach to run-length encoding).
+ * <p>
+ * All constructors are private: instances are obtained from
+ * {@link #createLexeme} or by using the shared {@link #EOT} sentinel.
+ * </p>
+ */
+public class Lexeme {
+ /**
+ * Denotes there are no more tokens: the end of text (EOT) has been reached.
+ * Its type is {@code INVALID} and {@link #hasNext()} answers {@code false}.
+ */
+ public static final Lexeme EOT = new Lexeme();
+
+ private final LexemeType mType;
+ private final int mBegan; // Inclusive offset of the first character.
+ private final int mEnded; // Exclusive offset: last character index + 1.
+
+ /**
+ * Set to {@code true} if there are more tokens to parse.
+ */
+ private final boolean mHasNext;
+
+ /**
+ * Create an end of text token: an {@code INVALID} lexeme constructed with
+ * offsets of -1 that reports no further tokens.
+ */
+ private Lexeme() {
+ this( INVALID, -1, -1, false );
+ }
+
+ /**
+ * Create a token that represents a section of the text and indicates that
+ * more tokens may follow (i.e., {@link #hasNext()} answers {@code true}).
+ */
+ private Lexeme( final LexemeType type, final int began, final int ended ) {
+ this( type, began, ended, true );
+ }
+
+ /**
+ * Create a token that represents a section of the text. The given ending
+ * index is inclusive (the index of the token's last character); it is
+ * stored incremented by one so that it can be used as an exclusive bound.
+ */
+ private Lexeme(
+ final LexemeType type, final int began, final int ended,
+ final boolean hasNext ) {
+ mType = type;
+ mBegan = began;
+ mEnded = ended + 1;
+ mHasNext = hasNext;
+ }
+
+ /**
+ * Extracts a sequence of characters from the given text at the offsets
+ * captured by this token. This must not be called on {@link #EOT}, whose
+ * beginning offset is -1 and therefore not a valid substring index.
+ *
+ * @param text The text that was parsed using this class.
+ * @return The character string captured by the token.
+ */
+ public String toString( final String text ) {
+ return text.substring( mBegan, mEnded );
+ }
+
+ public boolean hasNext() {
+ return mHasNext;
+ }
+
+ public boolean isType( final LexemeType type ) {
+ return mType == type;
+ }
+
+ public boolean anyType( final LexemeType... types ) {
+ for( final var type : types ) {
+ if( mType == type ) {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ LexemeType getType() {
+ return mType;
+ }
+
+ @Override
+ public String toString() {
+ return getClass().getSimpleName() + "{" +
+ "mType=" + mType +
+ ", mBegan=" + mBegan +
+ ", mEnded=" + mEnded +
+ '}';
+ }
+
+ static Lexeme createLexeme(
+ final LexemeType token, final int began, final int ended ) {
+ return new Lexeme( token, began, ended );
+ }
+}
lib/src/main/java/com/keenwrite/quotes/LexemeType.java
+/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.keenwrite.quotes;
+
+public enum LexemeType {
+ QUOTE_SINGLE, // Straight single quote (').
+ QUOTE_DOUBLE, // Straight double quote (").
+ WORD, // Letters, possibly followed by a run of digits.
+ NUMBER, // Digits; may include '.' or ',' when a digit follows.
+ NEWLINE, // CR, LF, or a CRLF pair.
+ SPACE, // A space or other whitespace that is neither CR nor LF.
+ PUNCT, // Any other character that matches none of the above.
+ PERIOD, // A '.' that is not immediately followed by a digit.
+ INVALID // Type carried by the end-of-text lexeme.
+}
lib/src/main/java/com/keenwrite/quotes/Lexer.java
+/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.keenwrite.quotes;
+
+import java.text.CharacterIterator;
+import java.text.StringCharacterIterator;
+import java.util.function.Consumer;
+
+import static com.keenwrite.quotes.Lexeme.createLexeme;
+import static com.keenwrite.quotes.LexemeType.*;
+import static java.lang.Character.*;
+import static java.text.CharacterIterator.DONE;
+
+/**
+ * Turns text into words, numbers, punctuation, spaces, and more. Each lexeme
+ * records offsets into the text rather than a copy of the characters.
+ */
+public class Lexer {
+ /**
+ * Default constructor, no state.
+ */
+ public Lexer() {
+ }
+
+ /**
+ * Emits a series of tokens that represent information about text that is
+ * needed to convert straight quotes to curly quotes. Lexemes are delivered
+ * in the order they appear in the text; the end-of-text lexeme terminates
+ * the loop and is not passed to the consumer.
+ *
+ * @param text The text to split into tokens.
+ * @param consumer Receives each token as a separate event.
+ */
+ public void parse( final String text, final Consumer<Lexeme> consumer ) {
+ final var iterator = new StringCharacterIterator( text );
+ Lexeme lex;
+
+ while( (lex = parse( iterator )).hasNext() ) {
+ consumer.accept( lex );
+ }
+ }
+
+ /**
+ * Tokenizes a sequence of characters. The order of comparisons is optimized
+ * towards probability of the occurrence of a character in regular English
+ * prose: letters, space, quotation marks, numbers, periods, new lines,
+ * then end of text.
+ * <p>
+ * The iterator's index on entry marks where the lexeme begins. Letters are
+ * accumulated until the following character is neither a letter nor a
+ * digit. Once a letter has been seen, a run of digits that follows is
+ * emitted as part of the same WORD lexeme instead of as a NUMBER. A period
+ * or comma counts as part of a number only when a digit comes directly
+ * after it. On return, the iterator is positioned at the first character
+ * after the lexeme.
+ * </p>
+ *
+ * @param i The sequence of characters to tokenize.
+ * @return The next token in the sequence, or the end-of-text lexeme when
+ * the iterator is exhausted.
+ */
+ private Lexeme parse( final CharacterIterator i ) {
+ int began = i.getIndex();
+ boolean isWord = false;
+ Lexeme lexeme = null;
+
+ do {
+ final var curr = i.current();
+
+ if( isLetter( curr ) ) {
+ isWord = true;
+
+ if( !isLetterOrDigit( peek( i ) ) ) {
+ lexeme = createLexeme( WORD, began, i.getIndex() );
+ }
+ }
+ else if( curr == ' ' ) {
+ lexeme = createLexeme( SPACE, began, i.getIndex() );
+ }
+ else if( curr == '\'' ) {
+ lexeme = createLexeme( QUOTE_SINGLE, began, i.getIndex() );
+ }
+ else if( curr == '"' ) {
+ lexeme = createLexeme( QUOTE_DOUBLE, began, i.getIndex() );
+ }
+ else if( isDigit( curr ) || isNumeric( curr ) && isDigit( peek( i ) ) ) {
+ // Tokenize all consecutive number characters to prevent the main
+ // loop from switching back to word tokens.
+ char next;
+
+ do {
+ next = i.next();
+ }
+ while( isDigit( next ) || isNumeric( next ) && isDigit( peek( i ) ) );
+
+ // The loop above will overshoot the number by one character.
+ i.previous();
+
+ lexeme = createLexeme( isWord ? WORD : NUMBER, began, i.getIndex() );
+ }
+ else if( curr == '.' ) {
+ lexeme = createLexeme( PERIOD, began, i.getIndex() );
+ }
+ else if( curr == '\r' ) {
+ lexeme = createLexeme( NEWLINE, began, i.getIndex() );
+
+ // Swallow the LF in CRLF; peeking won't work here.
+ if( i.next() != '\n' ) {
+ // Push back the non-LF char.
+ i.previous();
+ }
+ }
+ else if( curr == '\n' ) {
+ lexeme = createLexeme( NEWLINE, began, i.getIndex() );
+ }
+ else if( isWhitespace( curr ) ) {
+ lexeme = createLexeme( SPACE, began, i.getIndex() );
+ }
+ else if( curr != DONE ) {
+ lexeme = createLexeme( PUNCT, began, i.getIndex() );
+ }
+ else {
+ lexeme = Lexeme.EOT;
+ }
+
+ i.next();
+ }
+ while( lexeme == null );
+
+ return lexeme;
+ }
+
+ private static boolean isNumeric( final char curr ) {
+ return curr == '.' || curr == ',';
+ }
+
+ private static char peek( final CharacterIterator ci ) {
+ final var ch = ci.next();
+ ci.previous();
+ return ch;
+ }
+}
lib/src/main/java/com/keenwrite/quotes/Parser.java
import java.util.function.Consumer;
+import static com.keenwrite.quotes.Contractions.beginsUnambiguously;
+import static com.keenwrite.quotes.LexemeType.*;
import static com.keenwrite.quotes.TokenType.*;
* </ol>
*/
-public class Parser implements Consumer<Token> {
+public class Parser implements Consumer<Lexeme> {
private final String mText;
- private final CircularFifoQueue<Token> mTokens = new CircularFifoQueue<>( 3 );
+ private final CircularFifoQueue<Lexeme> mTokens =
+ new CircularFifoQueue<>( 3 );
- private final Deque<Token> mStack = new ArrayDeque<>();
+ private final Deque<Lexeme> mStack = new ArrayDeque<>();
+ private final Consumer<Token> mConsumer;
- public Parser( final String text ) {
+ public Parser( final String text, final Consumer<Token> consumer ) {
mText = text;
- mTokens.add( Token.EOT );
- mTokens.add( Token.EOT );
- mTokens.add( Token.EOT );
+
+ // Allow consuming the very first token without checking the queue size.
+ mTokens.add( Lexeme.EOT );
+ mTokens.add( Lexeme.EOT );
+ mTokens.add( Lexeme.EOT );
+
+ mConsumer = consumer;
}
public void parse() {
- final var tokenizer = new Tokenizer();
- tokenizer.tokenize( mText, this );
+ final var tokenizer = new Lexer();
+ tokenizer.parse( mText, this );
+ System.out.println(" DUH DONE!" );
}
@Override
- public void accept( final Token token ) {
+ public void accept( final Lexeme token ) {
mTokens.add( token );
final var token1 = mTokens.get( 0 );
final var token2 = mTokens.get( 1 );
final var token3 = mTokens.get( 2 );
- if( token2.isType( QUOTE_SINGLE ) &&
- token3.isType( WORD ) &&
+ if( token2.isType( QUOTE_SINGLE ) && token3.isType( WORD ) &&
token1.anyType( WORD, PERIOD, NUMBER ) ) {
- System.out.println( "APOSTROPHE: " + token2 );
+ mConsumer.accept( new Token( QUOTE_APOSTROPHE, token2 ) );
}
- else if( token1.isType( QUOTE_SINGLE ) &&
- "n".equalsIgnoreCase( token2.toString( mText ) ) &&
- token3.isType( QUOTE_SINGLE ) ) {
- System.out.printf( "APOSTROPHES: %s %s%n", token1, token3 );
+ else if( token1.isType( QUOTE_SINGLE ) && token3.isType( QUOTE_SINGLE ) &&
+ "n".equalsIgnoreCase( token2.toString( mText ) ) ) {
+ mConsumer.accept( new Token( QUOTE_APOSTROPHE, token1 ) );
+ mConsumer.accept( new Token( QUOTE_APOSTROPHE, token3 ) );
}
else if( token1.isType( NUMBER ) && token2.isType( QUOTE_SINGLE ) ) {
- System.out.println( "PRIME: " + token2 );
+ mConsumer.accept( new Token( QUOTE_PRIME_SINGLE, token2 ) );
}
else if( token1.isType( NUMBER ) && token2.isType( QUOTE_DOUBLE ) ) {
- System.out.println( "DOUBLE PRIME: " + token2 );
+ mConsumer.accept( new Token( QUOTE_PRIME_DOUBLE, token2 ) );
+ }
+ else if( token1.isType( QUOTE_SINGLE ) && token2.isType( WORD ) &&
+ beginsUnambiguously( token2.toString( mText ) ) ) {
+ mConsumer.accept( new Token( QUOTE_APOSTROPHE, token1 ) );
}
else if( token.anyType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
}
}
-
-/*
- private enum TokenType {
- QUOTE_OPENING_SINGLE,
- QUOTE_OPENING_DOUBLE,
- QUOTE_CLOSING_SINGLE,
- QUOTE_CLOSING_DOUBLE,
- QUOTE_APOSTROPHE,
- QUOTE_PRIME_SINGLE,
- QUOTE_PRIME_DOUBLE,
- TEXT
- }
-*/
lib/src/main/java/com/keenwrite/quotes/Token.java
package com.keenwrite.quotes;
-import static com.keenwrite.quotes.TokenType.INVALID;
-
/**
- * Responsible for tracking the beginning and ending offsets of a token within
- * a text string.
+ * Represents a high-level token read from the text.
*/
public class Token {
- /**
- * Denotes there are no more tokens: the end of text (EOT) has been reached.
- */
- public static final Token EOT = new Token();
-
- private final TokenType mType;
- private final int mBegan;
- private final int mEnded;
-
- /**
- * Set to {@code true} if there are more tokens to parse.
- */
- private final boolean mHasNext;
-
- /**
- * Create an end of text token.
- */
- private Token() {
- this( INVALID, -1, -1, false );
- }
+ private TokenType mType;
+ private Lexeme mLexeme;
- /**
- * Create a token that indicates there are no more tokens.
- */
- private Token( final TokenType type, final int began, final int ended ) {
- this( type, began, ended, true );
- }
+ public Token( final TokenType type, final Lexeme lexeme ) {
+ assert type != null;
+ assert lexeme != null;
- /**
- * Create a token
- */
- private Token(
- final TokenType type, final int began, final int ended,
- final boolean hasNext ) {
mType = type;
- mBegan = began;
- mEnded = ended + 1;
- mHasNext = hasNext;
- }
-
- /**
- * Extracts a sequence of characters from the given text at the offsets
- * captured by this token.
- *
- * @param text The text that was parsed using this class.
- * @return The character string captured by the token.
- */
- public String toString( final String text ) {
- return text.substring( mBegan, mEnded );
- }
-
- public boolean hasNext() {
- return mHasNext;
- }
-
- public boolean isType( final TokenType type ) {
- return mType == type;
- }
-
- public boolean anyType( final TokenType... types ) {
- for( final var type : types ) {
- if( mType == type ) {
- return true;
- }
- }
-
- return false;
- }
-
- TokenType getType() {
- return mType;
+ mLexeme = lexeme;
}
@Override
public String toString() {
return getClass().getSimpleName() + "{" +
"mType=" + mType +
- ", mBegan=" + mBegan +
- ", mEnded=" + mEnded +
+ ", mLexeme=" + mLexeme +
'}';
- }
-
- static Token createToken(
- final TokenType token, final int began, final int ended ) {
- return new Token( token, began, ended );
}
}
lib/src/main/java/com/keenwrite/quotes/TokenType.java
package com.keenwrite.quotes;
+/**
+ * The {@link Parser} emits these token types.
+ */
public enum TokenType {
- QUOTE_SINGLE,
- QUOTE_DOUBLE,
- WORD,
- NUMBER,
- NEWLINE,
- SPACE,
- PUNCT,
- PERIOD,
- INVALID
+ QUOTE_OPENING_SINGLE,
+ QUOTE_OPENING_DOUBLE,
+ QUOTE_CLOSING_SINGLE,
+ QUOTE_CLOSING_DOUBLE,
+ QUOTE_APOSTROPHE,
+ QUOTE_PRIME_SINGLE,
+ QUOTE_PRIME_DOUBLE,
+ TEXT,
}
lib/src/main/java/com/keenwrite/quotes/Tokenizer.java
-/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
-package com.keenwrite.quotes;
-
-import java.text.CharacterIterator;
-import java.text.StringCharacterIterator;
-import java.util.function.Consumer;
-
-import static com.keenwrite.quotes.Token.createToken;
-import static com.keenwrite.quotes.TokenType.*;
-import static java.lang.Character.*;
-import static java.text.CharacterIterator.DONE;
-
-/**
- * Tokenizes text into words, numbers, punctuation, spaces, and more.
- */
-public class Tokenizer {
- /**
- * Default constructor, no state.
- */
- public Tokenizer() {
- }
-
- /**
- * Emits a series of tokens that represent information about text that is
- * needed to convert straight quotes to curly quotes.
- *
- * @param text The text to split into tokens.
- * @param consumer Receives each token as a separate event.
- */
- public void tokenize( final String text, final Consumer<Token> consumer ) {
- final var iterator = new StringCharacterIterator( text );
- Token token;
-
- while( (token = tokenize( iterator )).hasNext() ) {
- consumer.accept( token );
- }
- }
-
- /**
- * Tokenizes a sequence of characters. The order of comparisons is optimized
- * towards probability of the occurrence of a character in regular English
- * prose: letters, space, quotation marks, numbers, periods, new lines,
- * then end of text.
- *
- * @param i The sequence of characters to tokenize.
- * @return The next token in the sequence.
- */
- private Token tokenize( final CharacterIterator i ) {
- int began = i.getIndex();
- boolean isWord = false;
- Token token = null;
-
- do {
- final var curr = i.current();
-
- if( isLetter( curr ) ) {
- isWord = true;
-
- if( !isLetterOrDigit( peek( i ) ) ) {
- token = createToken( WORD, began, i.getIndex() );
- }
- }
- else if( curr == ' ' ) {
- token = createToken( SPACE, began, i.getIndex() );
- }
- else if( curr == '\'' ) {
- token = createToken( QUOTE_SINGLE, began, i.getIndex() );
- }
- else if( curr == '"' ) {
- token = createToken( QUOTE_DOUBLE, began, i.getIndex() );
- }
- else if( isDigit( curr ) || isNumeric( curr ) && isDigit( peek( i ) ) ) {
- // Tokenize all consecutive number characters at prevent the main
- // loop from switching back to word tokens.
- char next;
-
- do {
- next = i.next();
- }
- while( isDigit( next ) || isNumeric( next ) && isDigit( peek( i ) ) );
-
- // The loop above will overshoot the number by one character.
- i.previous();
-
- token = createToken( isWord ? WORD : NUMBER, began, i.getIndex() );
- }
- else if( curr == '.' ) {
- token = createToken( PERIOD, began, i.getIndex() );
- }
- else if( curr == '\r' ) {
- token = createToken( NEWLINE, began, i.getIndex() );
-
- // Swallow the LF in CRLF; peeking won't work here.
- if( i.next() != '\n' ) {
- // Push back the non-LF char.
- i.previous();
- }
- }
- else if( curr == '\n' ) {
- token = createToken( NEWLINE, began, i.getIndex() );
- }
- else if( isWhitespace( curr ) ) {
- token = createToken( SPACE, began, i.getIndex() );
- }
- else if( curr != DONE ) {
- token = createToken( PUNCT, began, i.getIndex() );
- }
- else {
- token = Token.EOT;
- }
-
- i.next();
- }
- while( token == null );
-
- return token;
- }
-
- private static boolean isNumeric( final char curr ) {
- return curr == '.' || curr == ',';
- }
-
- private static char peek( final CharacterIterator ci ) {
- final var ch = ci.next();
- ci.previous();
- return ch;
- }
-}
lib/src/test/java/com/keenwrite/quotes/LexerTest.java
+/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.keenwrite.quotes;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static com.keenwrite.quotes.LexemeType.*;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+/**
+ * Tests lexing words, numbers, punctuation, spaces, and newlines. Each test
+ * feeds text to a {@link Lexer} and compares the emitted lexemes, in order,
+ * against either the expected lexeme types or the expected substrings. Note
+ * that the helper parameter named {@code actual} is the input text to lex.
+ */
+class LexerTest {
+ @Test
+ void test_Lexing_Words_TokenValues() {
+ testText( "abc 123", "abc", " ", "123" );
+ testText( "-123 abc", "-", "123", " ", "abc" );
+ }
+
+ @Test
+ void test_Lexing_Numbers_EmitNumbers() {
+ testType( ".123", NUMBER );
+ testType( "-123.", PUNCT, NUMBER, PERIOD );
+ testType( " 123.123.123", SPACE, NUMBER );
+ testType( "123 123\"", NUMBER, SPACE, NUMBER, QUOTE_DOUBLE );
+ testType( "-123,123.123", PUNCT, NUMBER );
+ }
+
+ @Test
+ void test_Lexing_Words_EmitWords() {
+ testType( "abc", WORD );
+ testType( "abc abc", WORD, SPACE, WORD );
+ testType( "abc123", WORD );
+ testType( "-123abc", PUNCT, NUMBER, WORD );
+ testType( "abc-o'-abc", WORD, PUNCT, WORD, QUOTE_SINGLE, PUNCT, WORD );
+ }
+
+ @Test
+ void test_Lexing_PunctuationMarks_EmitPunctuationMarks() {
+ testType( "!", PUNCT );
+ testType( ";", PUNCT );
+ testType( ".", PERIOD );
+ }
+
+ @Test
+ void test_Lexing_Quotes_EmitQuotes() {
+ testType( "'", QUOTE_SINGLE );
+ testType( "\"", QUOTE_DOUBLE );
+ testType( "3 o'clock", NUMBER, SPACE, WORD, QUOTE_SINGLE, WORD );
+ }
+
+ @Test
+ void test_Lexing_Newlines_EmitNewlines() {
+ testType( "\r", NEWLINE );
+ testType( "\n", NEWLINE );
+ testType( "\r\n", NEWLINE );
+ testType( "\r\n\r\n", NEWLINE, NEWLINE );
+ testType( "\r\n\n\r", NEWLINE, NEWLINE, NEWLINE );
+ testType( "abc \r\nabc\n", WORD, SPACE, NEWLINE, WORD, NEWLINE );
+ }
+
+ private void testType(
+ final String actual, final LexemeType... expected ) {
+ final var tokenizer = new Lexer();
+ final var counter = new AtomicInteger();
+
+ tokenizer.parse( actual, ( token ) -> {
+ final var expectedType = expected[ counter.getAndIncrement() ];
+ final var actualType = token.getType();
+ assertEquals( expectedType, actualType );
+ } );
+
+ // Ensure all expected tokens are matched (verify end of text reached):
+ // fewer emitted lexemes than expected would otherwise pass unnoticed.
+ assertEquals( expected.length, counter.get() );
+ }
+
+ private void testText( final String actual, final String... expected ) {
+ final var tokenizer = new Lexer();
+ final var counter = new AtomicInteger();
+
+ tokenizer.parse( actual, ( token ) -> {
+ final var expectedText = expected[ counter.getAndIncrement() ];
+ final var actualText = token.toString( actual );
+ assertEquals( expectedText, actualText );
+ } );
+
+ // Ensure all expected tokens are matched (verify end of text reached):
+ // fewer emitted lexemes than expected would otherwise pass unnoticed.
+ assertEquals( expected.length, counter.get() );
+ }
+}
lib/src/test/java/com/keenwrite/quotes/ParserTest.java
@Test
void test_Conversion_Straight_Curly() {
- new Parser( "\"It's the 70's jack-o'-lantern\"").parse();
- new Parser( "Fish-'n'-chips!").parse();
- new Parser( "That's a 35' x 10\" yacht!").parse();
- //new Parser( "'70s are Sams' faves.'").parse();
+ parse( "\"It's the 70's jack-o'-lantern\"" );
+ parse( "Fish-'n'-chips!" );
+ parse( "That's a 35' x 10\" yacht!" );
+ parse( "He's a kinda' cat ya'll couldn't've known!" );
+ parse( "'70s are Sams' faves.'" );
+ parse( "'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx." );
+ parse( """
+ But I must leave the proofs to those who 've seen 'em;
+ But this I heard her say, and can't be wrong
+ And all may think which way their judgments lean 'em,
+ ''T is strange—the Hebrew noun which means "I am,"
+ The English always use to govern d--n.'
+ """ );
+ }
+
+ private void parse( final String text ) {
+ final var parser = new Parser( text, System.out::println );
+ parser.parse();
}
}
lib/src/test/java/com/keenwrite/quotes/TokenizerTest.java
-/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
-package com.keenwrite.quotes;
-
-import org.junit.jupiter.api.Test;
-
-import java.util.concurrent.atomic.AtomicInteger;
-
-import static com.keenwrite.quotes.TokenType.*;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-/**
- * Tests tokenizing words, numbers, punctuation, spaces, and newlines.
- */
-class TokenizerTest {
- @Test
- void test_Tokenize_Words_TokenValues() {
- testText( "abc 123", "abc", " ", "123" );
- testText( "-123 abc", "-", "123", " ", "abc" );
- }
-
- @Test
- void test_Tokenize_Numbers_EmitNumbers() {
- testType( ".123", NUMBER );
- testType( "-123.", PUNCT, NUMBER, PERIOD );
- testType( " 123.123.123", SPACE, NUMBER );
- testType( "123 123\"", NUMBER, SPACE, NUMBER, QUOTE_DOUBLE );
- testType( "-123,123.123", PUNCT, NUMBER );
- }
-
- @Test
- void test_Tokenize_Words_EmitWords() {
- testType( "abc", WORD );
- testType( "abc abc", WORD, SPACE, WORD );
- testType( "abc123", WORD );
- testType( "-123abc", PUNCT, NUMBER, WORD );
- testType( "abc-o'-abc", WORD, PUNCT, WORD, QUOTE_SINGLE, PUNCT, WORD );
- }
-
- @Test
- void test_Tokenize_PunctuationMarks_EmitPunctuationMarks() {
- testType( "!", PUNCT );
- testType( ";", PUNCT );
- testType( ".", PERIOD );
- }
-
- @Test
- void test_Tokenize_Quotes_EmitQuotes() {
- testType( "'", QUOTE_SINGLE );
- testType( "\"", QUOTE_DOUBLE );
- testType( "3 o'clock", NUMBER, SPACE, WORD, QUOTE_SINGLE, WORD );
- }
-
- @Test
- void test_Tokenize_Newlines_EmitNewlines() {
- testType( "\r", NEWLINE );
- testType( "\n", NEWLINE );
- testType( "\r\n", NEWLINE );
- testType( "\r\n\r\n", NEWLINE, NEWLINE );
- testType( "\r\n\n\r", NEWLINE, NEWLINE, NEWLINE );
- testType( "abc \r\nabc\n", WORD, SPACE, NEWLINE, WORD, NEWLINE );
- }
-
- private void testType(
- final String actual, final TokenType... expected ) {
- final var tokenizer = new Tokenizer();
- final var counter = new AtomicInteger();
-
- tokenizer.tokenize( actual, ( token ) -> {
- final var expectedType = expected[ counter.getAndIncrement() ];
- final var actualType = token.getType();
- assertEquals( expectedType, actualType );
- } );
-
- // Ensure all expected tokens are matched (verify end of text reached).
- assertEquals( expected.length, counter.get() );
- }
-
- private void testText( final String actual, final String... expected ) {
- final var tokenizer = new Tokenizer();
- final var counter = new AtomicInteger();
-
- tokenizer.tokenize( actual, ( token ) -> {
- final var expectedText = expected[ counter.getAndIncrement() ];
- final var actualText = token.toString( actual );
- assertEquals( expectedText, actualText );
- } );
-
- // Ensure all expected tokens are matched (verify end of text reached).
- assertEquals( expected.length, counter.get() );
- }
-}
Delta: 455 lines added, 343 lines removed, 112-line increase