Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Fix lexer tests

Author Dave Jarvis <email>
Date 2021-06-01 20:48:24 GMT-0700
Commit 2a21d6df7f12fb8eb056dbb2931044f65e30a772
Parent 6f11949
Delta 123 lines added, 306 lines removed, 183-line decrease
lib/src/main/java/com/keenwrite/quotes/Lexeme.java
-/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
package com.keenwrite.quotes;
* a similar approach to run-length encoding).
*/
-public class Lexeme {
+public class Lexeme implements Comparable<Lexeme> {
/**
* Denotes there are no more lexemes: the end of text (EOT) has been reached.
*/
- public static final Lexeme EOT = new Lexeme( false );
+ public static final Lexeme EOT = new Lexeme();
/**
* Denotes parsing at the start of text. This is useful to avoid branching
* conditions while iterating.
*/
- public static final Lexeme SOT = new Lexeme( true );
+ public static final Lexeme SOT = new Lexeme();
private final LexemeType mType;
private final int mBegan;
private final int mEnded;
-
- /**
- * Set to {@code true} if there are more lexemes to parse.
- */
- private final boolean mHasNext;
-
- /**
- * Create a
- */
- private Lexeme( final boolean next ) {
- this( FLAG, -1, -1, next );
- }
/**
- * Create a lexeme that indicates there are no more lexemes.
+ * Create a flag that indicates a stream marker (start or end).
*/
- private Lexeme( final LexemeType type, final int began, final int ended ) {
- this( type, began, ended, true );
+ private Lexeme() {
+ this( FLAG, -1, -1 );
}
/**
* Create a lexeme that represents a section of the text.
*
* @param type The kind of lexeme this instance represents.
* @param began Index into the text where the lexeme starts.
* @param ended Index into the text of the lexeme's last character; it is
* stored incremented by one so that it can be used as the exclusive end
* index when extracting the lexeme's text with a substring call.
*/
- private Lexeme(
- final LexemeType type, final int began, final int ended,
- final boolean hasNext ) {
+ private Lexeme( final LexemeType type, final int began, final int ended ) {
mType = type;
mBegan = began;
mEnded = ended + 1;
- mHasNext = hasNext;
}
public String toString( final String text ) {
return text.substring( mBegan, mEnded );
- }
-
- public boolean hasNext() {
- return mHasNext;
}
LexemeType getType() {
return mType;
+ }
+
/**
* Returns the index into the text where this lexeme begins.
*/
+ int began() {
+ return mBegan;
+ }
+
/**
* Returns the index into the text one past this lexeme's last character
* (the constructor stores its {@code ended} argument incremented by one).
*/
+ int ended() {
+ return mEnded;
+ }
+
+ /**
+ * Orders lexemes by the index where each begins in the text.
+ *
+ * @param that The lexeme to compare against this one.
+ * @return A negative, zero, or positive value when this lexeme begins
+ * before, at the same index as, or after the given lexeme, respectively.
+ */
+ @Override
+ public int compareTo( final Lexeme that ) {
+ // Integer.compare cannot overflow, unlike subtracting the two indexes.
+ return Integer.compare( this.mBegan, that.mBegan );
}
lib/src/main/java/com/keenwrite/quotes/LexemeType.java
-/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
package com.keenwrite.quotes;
public enum LexemeType {
QUOTE_SINGLE,
QUOTE_DOUBLE,
+ ESC_SINGLE,
+ ESC_DOUBLE,
WORD,
NUMBER,
lib/src/main/java/com/keenwrite/quotes/Lexer.java
-/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
package com.keenwrite.quotes;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
-import java.util.Iterator;
import static com.keenwrite.quotes.Lexeme.createLexeme;
import static com.keenwrite.quotes.LexemeType.*;
import static java.lang.Character.*;
import static java.text.CharacterIterator.DONE;
/**
* Turns text into words, numbers, punctuation, spaces, and more.
*/
-public class Lexer implements Iterator<Lexeme> {
+public class Lexer {
private final CharacterIterator mIterator;
- private Lexeme mLexeme = Lexeme.SOT;
/**
* Creates a lexer that reads lexemes from the given text.
*/
public Lexer( final String text ) {
mIterator = new StringCharacterIterator( text );
- }
-
- @Override
- public boolean hasNext() {
- return mLexeme.hasNext();
}
- @Override
public Lexeme next() {
- return mLexeme = parse( mIterator );
+ return parse( mIterator );
}
else if( isWhitespace( curr ) ) {
lexeme = createLexeme( SPACE, began, i.getIndex() );
+ }
+ else if( curr == '\\' ) {
// A backslash may introduce an escaped straight quote; advance the
// iterator by one character to find out.
+ final var next = i.next();
+
+ if( next == '\'' ) {
// Backslash followed by a straight single quote.
+ lexeme = createLexeme( ESC_SINGLE, began, i.getIndex() );
+ }
+ else if( next == '\"' ) {
// Backslash followed by a straight double quote.
+ lexeme = createLexeme( ESC_DOUBLE, began, i.getIndex() );
+ }
+ else {
+ // Push back the escaped character, which wasn't a straight quote.
// NOTE(review): no lexeme is assigned on this path; confirm that the
// code following this chain handles a lone backslash (including one at
// the very end of the text) as intended.
+ i.previous();
+ }
}
else if( curr != DONE ) {
lib/src/main/java/com/keenwrite/quotes/Parser.java
* First, handle single quotes as apostrophes, which include:
* <ol>
+ * <li>Escaped single quote (BACKSLASH ' ) -- \'</li>
+ * <li>Escaped double quote (BACKSLASH " ) -- \"</li>
* <li>Inner contractions (WORD ' WORD) -- you'd've</li>
* <li>Inner contractions (PERIOD ' WORD) -- Ph.d.'ll</li>
public void parse( final Consumer<Token> consumer ) {
// Create/convert a list of all unambiguous quote characters.
- while( mLexer.hasNext() ) {
- parse( mLexer.next(), consumer );
+ Lexeme lexeme;
+
+ while( (lexeme = mLexer.next()) != EOT ) {
+ parse( lexeme, consumer );
}
beginsUnambiguously( lex2.toString( mText ) ) ) {
consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) );
+ }
+ else if( lex1.isType( ESC_SINGLE ) ) {
+ // An escaped single quote (\') is emitted as a straight single quote.
+ consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) );
+ }
+ else if( lex1.isType( ESC_DOUBLE ) ) {
+ // An escaped double quote (\") is emitted as a straight double quote. The
+ // token must wrap the lexeme that was tested (lex1), not its successor,
+ // otherwise the replacement would be applied at the wrong offsets.
+ consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) );
}
else if( lex1.anyType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) {
mQuotations.push( lex1 );
- System.out.println( "FOUND QUOTE: " + lex1 + " " + lex1.toString(mText));
- }
- else {
- System.out.println( lex1 );
+ System.out.println( "FOUND QUOTE: " + lex1 + " " + lex1.toString( mText ) );
}
}
lib/src/main/java/com/keenwrite/quotes/SmartQuotes.java
package com.keenwrite.quotes;
+import java.util.ArrayList;
+import java.util.Map;
+
+import static com.keenwrite.quotes.TokenType.*;
+import static java.util.Collections.sort;
+
/**
* Responsible for converting straight quotes into smart quotes.
*/
public class SmartQuotes {
- public String replace( final String text ) {
- final StringBuilder sb = new StringBuilder( text );
+ /**
+ * Maps token types to the text that replaces the token's lexeme. Token
+ * types without an entry are left as they appear in the original text.
+ */
+ private static final Map<TokenType, String> REPLACEMENTS = Map.of(
+ QUOTE_STRAIGHT_SINGLE, "'",
+ QUOTE_STRAIGHT_DOUBLE, "\"",
+ QUOTE_APOSTROPHE, "&apos;",
+ QUOTE_PRIME_SINGLE, "&prime;",
+ QUOTE_PRIME_DOUBLE, "&Prime;"
+ );
+ /**
+ * Parses the given text into tokens and substitutes the replacement text
+ * mapped to each token's type.
+ *
+ * @param text The text containing straight quotes to convert.
+ * @return The text with each mapped token replaced; all other characters
+ * are copied unchanged.
+ */
+ public String replace( final String text ) {
final var parser = new Parser( text );
- parser.parse( (token) -> {
+ final var tokens = new ArrayList<Token>();
- } );
+ // Collect every token, then order them by starting index so that the
+ // text can be rebuilt in a single left-to-right pass.
+ parser.parse( tokens::add );
+ sort( tokens );
- return sb.toString();
+ final var result = new StringBuilder( text.length() );
+ var position = 0;
+
+ for( final var token : tokens ) {
+ final var replacement = REPLACEMENTS.get( token.getType() );
+
+ // Copy the untouched text between the previous token and this one.
+ result.append( text, position, token.began() );
+
+ if( replacement == null ) {
+ // No replacement is mapped for this type; keep the original characters
+ // rather than appending the literal string "null".
+ result.append( text, token.began(), token.ended() );
+ }
+ else {
+ result.append( replacement );
+ }
+
+ position = token.ended();
+ }
+
+ // Copy whatever text trails the final token.
+ return result.append( text, position, text.length() ).toString();
}
}
lib/src/main/java/com/keenwrite/quotes/Token.java
-/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
package com.keenwrite.quotes;
/**
* Represents a high-level token read from the text.
*/
-public class Token {
- private TokenType mType;
- private Lexeme mLexeme;
+public class Token implements Comparable<Token> {
+ private final TokenType mType;
+ private final Lexeme mLexeme;
public Token( final TokenType type, final Lexeme lexeme ) {
TokenType getType() {
return mType;
+ }
+
/**
* Returns the value of {@code began()} from the lexeme wrapped by this token.
*/
+ int began() {
+ return mLexeme.began();
+ }
+
/**
* Returns the value of {@code ended()} from the lexeme wrapped by this token.
*/
+ int ended() {
+ return mLexeme.ended();
+ }
+
/**
* Orders tokens by comparing the lexemes they wrap.
*
* @param that The token to compare against this one.
* @return The result of comparing this token's lexeme to the given token's
* lexeme.
*/
+ @Override
+ public int compareTo( final Token that ) {
+ return mLexeme.compareTo( that.mLexeme );
}
lib/src/main/java/com/keenwrite/quotes/TokenType.java
-/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
package com.keenwrite.quotes;
QUOTE_CLOSING_DOUBLE,
QUOTE_APOSTROPHE,
+ QUOTE_STRAIGHT_SINGLE,
+ QUOTE_STRAIGHT_DOUBLE,
QUOTE_PRIME_SINGLE,
QUOTE_PRIME_DOUBLE,
lib/src/test/java/com/keenwrite/quotes/LexerTest.java
import java.util.function.BiFunction;
+import static com.keenwrite.quotes.Lexeme.EOT;
import static com.keenwrite.quotes.LexemeType.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
testType( "\"", QUOTE_DOUBLE );
testType( "3 o'clock", NUMBER, SPACE, WORD, QUOTE_SINGLE, WORD );
+ }
+
+ @Test
+ void test_Lexing_Escapes_EmitEscapedQuotes() {
+ testType( "123\\'456\\\"", NUMBER, ESC_SINGLE, NUMBER, ESC_DOUBLE );
+ testText( "123\\'456\\\"", "123", "\\'", "456", "\\\"" );
}
private void testType( final String actual, final LexemeType... expected ) {
- testType(
- actual, ( lexeme, text ) -> lexeme.getType(), Arrays.asList( expected ) );
+ final var list = Arrays.asList( expected );
+ testType( actual, ( lexeme, text ) -> lexeme.getType(), list );
}
var counter = 0;
- do {
- final var lexeme = lexer.next();
+ Lexeme lexeme;
+
+ while( (lexeme = lexer.next()) != EOT ) {
final var expected = elements.get( counter++ );
final var actual = f.apply( lexeme, text );
+
assertEquals( expected, actual );
}
- while( lexer.hasNext() );
// Ensure all expected values are matched (verify end of text reached).
lib/src/test/java/com/keenwrite/quotes/SmartQuotesTest.java
-/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
+/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */
package com.keenwrite.quotes;
import org.junit.jupiter.api.Test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
+import java.util.function.Function;
import static org.junit.jupiter.api.Assertions.assertEquals;
public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException {
final var fixer = new SmartQuotes();
+ testParser( fixer::replace );
+ }
+ public void testParser( final Function<String, String> parser )
+ throws IOException {
try( final var reader = openResource( "smartypants.txt" ) ) {
String line;
}
- final var actual = fixer.replace( testLine );
- assertEquals(expected, actual);
+ final var actual = parser.apply( testLine );
+ assertEquals( expected, actual );
testLine = "";
lib/src/test/resources/com.keenwrite.quotes/smartypants.txt
-# ########################################################################
-# Primes (single, double)
-# ########################################################################
-She stood 5\'7\".
-She stood 5&prime;7&Prime;.
-
-# No space after the feet sign.
-It's 4'11" away.
-It&apos;s 4&prime;11&Prime; away.
-
-Alice's friend is 6'3" tall.
-Alice&apos;s friend is 6&prime;3&Prime; tall.
-
-Bob's table is 5'' × 4''.
-Bob&apos;s table is 5&Prime; × 4&Prime;.
-
-Bob's table is 5'×4'.
-Bob&apos;s table is 5&prime;×4&prime;.
-
-What's this -5.5'' all about?
-What&apos;s this -5.5&Prime; all about?
-
-+7.9'' is weird.
-+7.9&Prime; is weird.
-
-Foolscap? Naw, I use 11.5"x14.25" paper!
-Foolscap? Naw, I use 11.5&Prime;x14.25&Prime; paper!
-
-An angular measurement, 3° 5' 30" means 3 degs, 5 arcmins, and 30 arcsecs.
-An angular measurement, 3° 5&prime; 30&Prime; means 3 degs, 5 arcmins, and 30 arcsecs.
-
-# ########################################################################
-# Inside contractions (no leading/trailing apostrophes)
-# ########################################################################
-I don't like it: I love's it!
-I don&apos;t like it: I love&apos;s it!
-
-We'd've thought that pancakes'll be sweeter there.
-We&apos;d&apos;ve thought that pancakes&apos;ll be sweeter there.
-
-She'd be coming o'er when the horse'd gone to pasture...
-She&apos;d be coming o&apos;er when the horse&apos;d gone to pasture...
-
-# ########################################################################
-# Beginning contractions (leading apostrophes)
-# ########################################################################
-'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx.
-&apos;Twas and &apos;tis whate&apos;er lay &apos;twixt dawn and dusk &apos;n River Styx.
-
-# ########################################################################
-# Outside contractions (leading and trailing, no middle)
-# ########################################################################
-Salt 'n' vinegar, fish-'n'-chips, sugar 'n' spice!
-Salt &apos;n&apos; vinegar, fish-&apos;n&apos;-chips, sugar &apos;n&apos; spice!
-
-# ########################################################################
-# Ending contractions (trailing apostrophes)
-# ########################################################################
-Didn' get th' message.
-Didn&apos; get th&apos; message.
-
-Namsayin', y'know what I'ma sayin'?
-Namsayin&apos;, y&apos;know what I&apos;ma sayin&apos;?
-
-# ########################################################################
-# Decades
-# ########################################################################
-The Roaring '20s had the best music, no?
-The Roaring &apos;20s had the best music, no?
-
-Took place in '04, yes'm!
-Took place in &apos;04, yes&apos;m!
-
-# ########################################################################
-# Backticks (left and right double quotes)
-# ########################################################################
-``I am Sam''
-&ldquo;I am Sam&rdquo;
-
-``Sam's away today''
-&ldquo;Sam&apos;s away today&rdquo;
-
-``Sam's gone!
-&ldquo;Sam&apos;s gone!
-
-``5'10" tall 'e was!''
-&ldquo;5&prime;10&Prime; tall &apos;e was!&rdquo;
-
-# ########################################################################
-# Consecutive quotes
-# ########################################################################
-"'I'm trouble.'"
-&ldquo;&lsquo;I&apos;m trouble.&rsquo;&rdquo;
-
-'"Trouble's my name."'
-&lsquo;&ldquo;Trouble&apos;s my name.&ldquo;&lsquo;
-
-# ########################################################################
-# Escaped quotes
-# ########################################################################
-\"What?\"
-&ldquo;What?&rdquo;
-
-# ########################################################################
-# Double quotes
-# ########################################################################
-"I am Sam"
-&ldquo;I am Sam&rdquo;
-
-"...even better!"
-&ldquo;...even better!&rdquo;
-
-"It was so," said he.
-&ldquo;It was so,&rdquo; said he.
-
-"She said, 'Llamas'll languish, they'll--
-&ldquo;She said, &lsquo;Llamas&apos;ll languish, they&apos;ll--
-
-With "air quotes" in the middle.
-With &ldquo;air quotes&rdquo; in the middle.
-
-With--"air quotes"--and dashes.
-With--&ldquo;air quotes&rdquo;--and dashes.
-
-"Not "quite" what you expected?"
-&ldquo;Not &ldquo;quite&rdquo; what you expected?&rdquo;
-
-# ########################################################################
-# Nested quotations
-# ########################################################################
-"'Here I am,' said Sam"
-&ldquo;&lsquo;Here I am,&rsquo; said Sam&rdquo;
-
-'"Here I am," said Sam'
-&lsquo;&ldquo;Here I am,&rdquo;, said Sam&rsquo;
-
-'Hello, "Dr. Brown," what's your real name?'
-&lsquo;Hello, &ldquo;Dr. Brown,&rdquo; what's your real name?&rsquo;
-
-"'Twas, t'wasn't thy name, 'twas it?" said Jim "the Barber" Brown.
-&ldquo;&apos;Twas, t&apos;wasn&apos;t thy name, &apos;twas it?&rdquo; said Jim &ldquo;the Barber&rdquo; Brown.
-
-# ########################################################################
-# Single quotes
-# ########################################################################
-'I am Sam'
-&lsquo;I am Sam&rsquo;
-
-'It was so,' said he.
-&lsquo;It was so,&rsquo; said he.
-
-'...even better!'
-&lsquo;...even better!&rsquo;
-
-With 'quotes' in the middle.
-With &lsquo;quotes&rsquo; in the middle.
-
-With--'imaginary'--dashes.
-With--&lsquo;imaginary&rsquo;--dashes.
-
-'Not 'quite' what you expected?'
-&lsquo;Not &lsquo;quite&rsquo; what you expected?&rsquo;
-
-''Cause I don't like it, 's why,' said Pat.
-&lsquo;&apos;Cause I don't like it, &apos;s why,&rsquo; said Pat.
-
-'It's a beautiful day!'
-&lsquo;It&apos;s a beautiful day!&rsquo;
-
-'He said, "Thinkin'."'
-&lsquo;He said, &ldquo;Thinkin&rsquo;.&rdquo;&rsquo;
-
-# ########################################################################
-# Possessives
-# ########################################################################
-Sam's Sams' and the Ross's roses' thorns were prickly.
-Sam&apos;s Sams&apos; and the Ross&apos;s roses&apos; thorns were prickly.
-
-# ########################################################################
-# Mixed
-# ########################################################################
-"I heard she said, 'That's Sam's'," said the Sams' cat.
-&ldquo;I heard she said, &lsquo;That&apos;s Sam&apos;s&rsquo;,&rdquo; said the Sams&apos; cat.
-
-"'Janes' said, ''E'll be spooky, Sam's son with the jack-o'-lantern!'" said the O'Mally twins'---y'know---ghosts in unison.
-&ldquo;&lsquo;Janes&apos; said, &lsquo;&apos;E&apos;ll be spooky, Sam&apos;s son with the jack-o&apos;-lantern!&rsquo;&rdquo; said the O&apos;Mally twins&apos;---y&apos;know---ghosts in unison.
-
-'He's at Sams'
-&lsquo;He&apos; at Sams&rsquo;
-
-\"Hello!\"
-&ldquo;Hello!&rdquo;
-
-ma'am
-ma&apos;am
-
-'Twas midnight
-&apos;Twas midnight
-
-\"Hello,\" said the spider. \"'Shelob' is my name.\"
-&ldquo;Hello,&rdquo; said the spider. &ldquo;&lsquo;Shelob&rsquo; is my name.&rdquo;
-
-'A', 'B', and 'C' are letters.
-&lsquo;A&rsquo; &lsquo;B&rsquo; and &lsquo;C&rsquo; are letters.
-
-'Oak,' 'elm,' and 'beech' are names of trees. So is 'pine.'
-&lsquo;Oak,&rsquo; &lsquo;elm,&rsquo; and &lsquo;beech&rsquo; are names of trees. So is &lsquo;pine.&rsquo;
-
-'He said, \"I want to go.\"' Were you alive in the 70's?
-&lsquo;He said, &ldquo;I want to go.&rdquo;&rsquo; Were you alive in the 70&apos;s?
-
-\"That's a 'magic' sock.\"
-&ldquo;That&apos;s a &lsquo;magic&rsquo; sock.&rdquo;
-
-Website! Company Name, Inc. (\"Company Name\" or \"Company\") recommends reading the following terms and conditions, carefully:
-Website! Company Name, Inc. (&ldquo;Company Name&rdquo; or &ldquo;Company&rdquo;) recommends reading the following terms and conditions, carefully:
-
-Website! Company Name, Inc. ('Company Name' or 'Company') recommends reading the following terms and conditions, carefully:
-Website! Company Name, Inc. (&lsquo;Company Name&rsquo; or &lsquo;Company&rsquo;) recommends reading the following terms and conditions, carefully:
-
-Workin' hard
-Workin&apos; hard
-
-'70s are my favorite numbers,' she said.
-&lsquo;70s are my favorite numbers,&rsquo; she said.
-
-'70s fashion was weird.
-&apos;70s fashion was weird.
-
-12\" record, 5'10\" height
-12&Prime; record, 5&prime;10&Prime; height
-
-Model \"T2000\"
-Model &ldquo;T2000&rdquo;
-
-iPad 3's battery life is not great.
-iPad 3&apos;s battery life is not great.
-
-Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season.
-Book &apos;em, Danno. Rock &apos;n&apos; roll. &apos;Cause &apos;twas the season.
-
-'85 was a good year. (The entire '80s were.)
-&apos;85 was a good year. (The entire &apos;80s were.)