| Author | Dave Jarvis <email> |
|---|---|
| Date | 2021-06-01 20:48:24 GMT-0700 |
| Commit | 2a21d6df7f12fb8eb056dbb2931044f65e30a772 |
| Parent | 6f11949 |
| Delta | 123 lines added, 306 lines removed, 183-line decrease |
|---|
| -/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| package com.keenwrite.quotes; | ||
| * a similar approach to run-length encoding). | ||
| */ | ||
| -public class Lexeme { | ||
| +public class Lexeme implements Comparable<Lexeme> { | ||
| /** | ||
| * Denotes there are no more lexemes: the end of text (EOT) has been reached. | ||
| */ | ||
| - public static final Lexeme EOT = new Lexeme( false ); | ||
| + public static final Lexeme EOT = new Lexeme(); | ||
| /** | ||
| * Denotes parsing at the start of text. This is useful to avoid branching | ||
| * conditions while iterating. | ||
| */ | ||
| - public static final Lexeme SOT = new Lexeme( true ); | ||
| + public static final Lexeme SOT = new Lexeme(); | ||
| private final LexemeType mType; | ||
| private final int mBegan; | ||
| private final int mEnded; | ||
| - | ||
| - /** | ||
| - * Set to {@code true} if there are more lexemes to parse. | ||
| - */ | ||
| - private final boolean mHasNext; | ||
| - | ||
| - /** | ||
| - * Create a | ||
| - */ | ||
| - private Lexeme( final boolean next ) { | ||
| - this( FLAG, -1, -1, next ); | ||
| - } | ||
| /** | ||
| - * Create a lexeme that indicates there are no more lexemes. | ||
| + * Create a flag that indicates a stream marker (start or end). | ||
| */ | ||
| - private Lexeme( final LexemeType type, final int began, final int ended ) { | ||
| - this( type, began, ended, true ); | ||
| + private Lexeme() { | ||
| + this( FLAG, -1, -1 ); | ||
| } | ||
| /** | ||
| * Create a lexeme that represents a section of the text. | ||
| */ | ||
| - private Lexeme( | ||
| - final LexemeType type, final int began, final int ended, | ||
| - final boolean hasNext ) { | ||
| + private Lexeme( final LexemeType type, final int began, final int ended ) { | ||
| mType = type; | ||
| mBegan = began; | ||
| mEnded = ended + 1; | ||
| - mHasNext = hasNext; | ||
| } | ||
| public String toString( final String text ) { | ||
| return text.substring( mBegan, mEnded ); | ||
| - } | ||
| - | ||
| - public boolean hasNext() { | ||
| - return mHasNext; | ||
| } | ||
| LexemeType getType() { | ||
| return mType; | ||
| + } | ||
| + | ||
| + int began() { | ||
| + return mBegan; | ||
| + } | ||
| + | ||
| + int ended() { | ||
| + return mEnded; | ||
| + } | ||
| + | ||
| + @Override | ||
| + public int compareTo( final Lexeme that ) { | ||
| + return Integer.compare( this.mBegan, that.mBegan ); // Order by starting offset; avoids int-subtraction overflow. | ||
| } | ||
| -/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| package com.keenwrite.quotes; | ||
| public enum LexemeType { | ||
| QUOTE_SINGLE, | ||
| QUOTE_DOUBLE, | ||
| + ESC_SINGLE, | ||
| + ESC_DOUBLE, | ||
| WORD, | ||
| NUMBER, |
| -/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| package com.keenwrite.quotes; | ||
| import java.text.CharacterIterator; | ||
| import java.text.StringCharacterIterator; | ||
| -import java.util.Iterator; | ||
| import static com.keenwrite.quotes.Lexeme.createLexeme; | ||
| import static com.keenwrite.quotes.LexemeType.*; | ||
| import static java.lang.Character.*; | ||
| import static java.text.CharacterIterator.DONE; | ||
| /** | ||
| * Turns text into words, numbers, punctuation, spaces, and more. | ||
| */ | ||
| -public class Lexer implements Iterator<Lexeme> { | ||
| +public class Lexer { | ||
| private final CharacterIterator mIterator; | ||
| - private Lexeme mLexeme = Lexeme.SOT; | ||
| /** | ||
| * Creates a lexer that iterates over the characters of the given text. | ||
| */ | ||
| public Lexer( final String text ) { | ||
| mIterator = new StringCharacterIterator( text ); | ||
| - } | ||
| - | ||
| - @Override | ||
| - public boolean hasNext() { | ||
| - return mLexeme.hasNext(); | ||
| } | ||
| - @Override | ||
| public Lexeme next() { | ||
| - return mLexeme = parse( mIterator ); | ||
| + return parse( mIterator ); | ||
| } | ||
| else if( isWhitespace( curr ) ) { | ||
| lexeme = createLexeme( SPACE, began, i.getIndex() ); | ||
| + } | ||
| + else if( curr == '\\' ) { | ||
| + final var next = i.next(); | ||
| + | ||
| + if( next == '\'' ) { | ||
| + lexeme = createLexeme( ESC_SINGLE, began, i.getIndex() ); | ||
| + } | ||
| + else if( next == '\"' ) { | ||
| + lexeme = createLexeme( ESC_DOUBLE, began, i.getIndex() ); | ||
| + } | ||
| + else { | ||
| + // Push back the escaped character, which wasn't a straight quote. | ||
| + i.previous(); | ||
| + } | ||
| } | ||
| else if( curr != DONE ) { | ||
| * First, handle single quotes as apostrophes, which include: | ||
| * <ol> | ||
| + * <li>Escaped single quote (BACKSLASH ' ) -- \'</li> | ||
| + * <li>Escaped double quote (BACKSLASH " ) -- \"</li> | ||
| * <li>Inner contractions (WORD ' WORD) -- you'd've</li> | ||
| * <li>Inner contractions (PERIOD ' WORD) -- Ph.d.'ll</li> | ||
| public void parse( final Consumer<Token> consumer ) { | ||
| // Create/convert a list of all unambiguous quote characters. | ||
| - while( mLexer.hasNext() ) { | ||
| - parse( mLexer.next(), consumer ); | ||
| + Lexeme lexeme; | ||
| + | ||
| + while( (lexeme = mLexer.next()) != EOT ) { | ||
| + parse( lexeme, consumer ); | ||
| } | ||
| beginsUnambiguously( lex2.toString( mText ) ) ) { | ||
| consumer.accept( new Token( QUOTE_APOSTROPHE, lex1 ) ); | ||
| + } | ||
| + else if( lex1.isType( ESC_SINGLE ) ) { | ||
| + consumer.accept( new Token( QUOTE_STRAIGHT_SINGLE, lex1 ) ); | ||
| + } | ||
| + else if( lex1.isType( ESC_DOUBLE ) ) { | ||
| + consumer.accept( new Token( QUOTE_STRAIGHT_DOUBLE, lex1 ) ); // The escaped quote is lex1 itself, as in the ESC_SINGLE branch. | ||
| } | ||
| else if( lex1.anyType( QUOTE_SINGLE, QUOTE_DOUBLE ) ) { | ||
| mQuotations.push( lex1 ); | ||
| - System.out.println( "FOUND QUOTE: " + lex1 + " " + lex1.toString(mText)); | ||
| - } | ||
| - else { | ||
| - System.out.println( lex1 ); | ||
| + System.out.println( "FOUND QUOTE: " + lex1 + " " + lex1.toString( mText ) ); | ||
| } | ||
| } | ||
| package com.keenwrite.quotes; | ||
| +import java.util.ArrayList; | ||
| +import java.util.Map; | ||
| + | ||
| +import static com.keenwrite.quotes.TokenType.*; | ||
| +import static java.util.Collections.sort; | ||
| + | ||
| /** | ||
| * Responsible for converting straight quotes into smart quotes. | ||
| */ | ||
| public class SmartQuotes { | ||
| - public String replace( final String text ) { | ||
| - final StringBuilder sb = new StringBuilder( text ); | ||
| + private static final Map<TokenType, String> REPLACEMENTS = Map.of( | ||
| + QUOTE_STRAIGHT_SINGLE, "'", | ||
| + QUOTE_STRAIGHT_DOUBLE, "\"", | ||
| + QUOTE_APOSTROPHE, "'", | ||
| + QUOTE_PRIME_SINGLE, "′", | ||
| + QUOTE_PRIME_DOUBLE, "″" | ||
| + ); | ||
| + public String replace( final String text ) { | ||
| final var parser = new Parser( text ); | ||
| - parser.parse( (token) -> { | ||
| + final var tokens = new ArrayList<Token>(); | ||
| - } ); | ||
| + parser.parse( tokens::add ); | ||
| + sort( tokens ); | ||
| - return sb.toString(); | ||
| + final var result = new StringBuilder( text.length() ); | ||
| + var position = 0; | ||
| + | ||
| + for( final var token : tokens ) { | ||
| + result.append( text, position, token.began() ); | ||
| + result.append( REPLACEMENTS.get( token.getType() ) ); | ||
| + | ||
| + position = token.ended(); | ||
| + } | ||
| + | ||
| + return result.append( text.substring( position ) ).toString(); | ||
| } | ||
| } |
| -/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| package com.keenwrite.quotes; | ||
| /** | ||
| * Represents a high-level token read from the text. | ||
| */ | ||
| -public class Token { | ||
| - private TokenType mType; | ||
| - private Lexeme mLexeme; | ||
| +public class Token implements Comparable<Token> { | ||
| + private final TokenType mType; | ||
| + private final Lexeme mLexeme; | ||
| public Token( final TokenType type, final Lexeme lexeme ) { | ||
| TokenType getType() { | ||
| return mType; | ||
| + } | ||
| + | ||
| + int began() { | ||
| + return mLexeme.began(); | ||
| + } | ||
| + | ||
| + int ended() { | ||
| + return mLexeme.ended(); | ||
| + } | ||
| + | ||
| + @Override | ||
| + public int compareTo( final Token that ) { | ||
| + return mLexeme.compareTo( that.mLexeme ); | ||
| } | ||
| -/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| package com.keenwrite.quotes; | ||
| QUOTE_CLOSING_DOUBLE, | ||
| QUOTE_APOSTROPHE, | ||
| + QUOTE_STRAIGHT_SINGLE, | ||
| + QUOTE_STRAIGHT_DOUBLE, | ||
| QUOTE_PRIME_SINGLE, | ||
| QUOTE_PRIME_DOUBLE, | ||
| import java.util.function.BiFunction; | ||
| +import static com.keenwrite.quotes.Lexeme.EOT; | ||
| import static com.keenwrite.quotes.LexemeType.*; | ||
| import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| testType( "\"", QUOTE_DOUBLE ); | ||
| testType( "3 o'clock", NUMBER, SPACE, WORD, QUOTE_SINGLE, WORD ); | ||
| + } | ||
| + | ||
| + @Test | ||
| + void test_Lexing_Escapes_EmitEscapedQuotes() { | ||
| + testType( "123\\'456\\\"", NUMBER, ESC_SINGLE, NUMBER, ESC_DOUBLE ); | ||
| + testText( "123\\'456\\\"", "123", "\\'", "456", "\\\"" ); | ||
| } | ||
| private void testType( final String actual, final LexemeType... expected ) { | ||
| - testType( | ||
| - actual, ( lexeme, text ) -> lexeme.getType(), Arrays.asList( expected ) ); | ||
| + final var list = Arrays.asList( expected ); | ||
| + testType( actual, ( lexeme, text ) -> lexeme.getType(), list ); | ||
| } | ||
| var counter = 0; | ||
| - do { | ||
| - final var lexeme = lexer.next(); | ||
| + Lexeme lexeme; | ||
| + | ||
| + while( (lexeme = lexer.next()) != EOT ) { | ||
| final var expected = elements.get( counter++ ); | ||
| final var actual = f.apply( lexeme, text ); | ||
| + | ||
| assertEquals( expected, actual ); | ||
| } | ||
| - while( lexer.hasNext() ); | ||
| // Ensure all expected values are matched (verify end of text reached). | ||
| -/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| package com.keenwrite.quotes; | ||
| import org.junit.jupiter.api.Test; | ||
| import java.io.BufferedReader; | ||
| import java.io.IOException; | ||
| import java.io.InputStreamReader; | ||
| +import java.util.function.Function; | ||
| import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException { | ||
| final var fixer = new SmartQuotes(); | ||
| + testParser( fixer::replace ); | ||
| + } | ||
| + public void testParser( final Function<String, String> parser ) | ||
| + throws IOException { | ||
| try( final var reader = openResource( "smartypants.txt" ) ) { | ||
| String line; | ||
| } | ||
| - final var actual = fixer.replace( testLine ); | ||
| - assertEquals(expected, actual); | ||
| + final var actual = parser.apply( testLine ); | ||
| + assertEquals( expected, actual ); | ||
| testLine = ""; | ||
| -# ######################################################################## | ||
| -# Primes (single, double) | ||
| -# ######################################################################## | ||
| -She stood 5\'7\". | ||
| -She stood 5′7″. | ||
| - | ||
| -# No space after the feet sign. | ||
| -It's 4'11" away. | ||
| -It's 4′11″ away. | ||
| - | ||
| -Alice's friend is 6'3" tall. | ||
| -Alice's friend is 6′3″ tall. | ||
| - | ||
| -Bob's table is 5'' × 4''. | ||
| -Bob's table is 5″ × 4″. | ||
| - | ||
| -Bob's table is 5'×4'. | ||
| -Bob's table is 5′×4′. | ||
| - | ||
| -What's this -5.5'' all about? | ||
| -What's this -5.5″ all about? | ||
| - | ||
| -+7.9'' is weird. | ||
| -+7.9″ is weird. | ||
| - | ||
| -Foolscap? Naw, I use 11.5"x14.25" paper! | ||
| -Foolscap? Naw, I use 11.5″x14.25″ paper! | ||
| - | ||
| -An angular measurement, 3° 5' 30" means 3 degs, 5 arcmins, and 30 arcsecs. | ||
| -An angular measurement, 3° 5′ 30″ means 3 degs, 5 arcmins, and 30 arcsecs. | ||
| - | ||
| -# ######################################################################## | ||
| -# Inside contractions (no leading/trailing apostrophes) | ||
| -# ######################################################################## | ||
| -I don't like it: I love's it! | ||
| -I don't like it: I love's it! | ||
| - | ||
| -We'd've thought that pancakes'll be sweeter there. | ||
| -We'd've thought that pancakes'll be sweeter there. | ||
| - | ||
| -She'd be coming o'er when the horse'd gone to pasture... | ||
| -She'd be coming o'er when the horse'd gone to pasture... | ||
| - | ||
| -# ######################################################################## | ||
| -# Beginning contractions (leading apostrophes) | ||
| -# ######################################################################## | ||
| -'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx. | ||
| -'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx. | ||
| - | ||
| -# ######################################################################## | ||
| -# Outside contractions (leading and trailing, no middle) | ||
| -# ######################################################################## | ||
| -Salt 'n' vinegar, fish-'n'-chips, sugar 'n' spice! | ||
| -Salt 'n' vinegar, fish-'n'-chips, sugar 'n' spice! | ||
| - | ||
| -# ######################################################################## | ||
| -# Ending contractions (trailing apostrophes) | ||
| -# ######################################################################## | ||
| -Didn' get th' message. | ||
| -Didn' get th' message. | ||
| - | ||
| -Namsayin', y'know what I'ma sayin'? | ||
| -Namsayin', y'know what I'ma sayin'? | ||
| - | ||
| -# ######################################################################## | ||
| -# Decades | ||
| -# ######################################################################## | ||
| -The Roaring '20s had the best music, no? | ||
| -The Roaring '20s had the best music, no? | ||
| - | ||
| -Took place in '04, yes'm! | ||
| -Took place in '04, yes'm! | ||
| - | ||
| -# ######################################################################## | ||
| -# Backticks (left and right double quotes) | ||
| -# ######################################################################## | ||
| -``I am Sam'' | ||
| -“I am Sam” | ||
| - | ||
| -``Sam's away today'' | ||
| -“Sam's away today” | ||
| - | ||
| -``Sam's gone! | ||
| -“Sam's gone! | ||
| - | ||
| -``5'10" tall 'e was!'' | ||
| -“5′10″ tall 'e was!” | ||
| - | ||
| -# ######################################################################## | ||
| -# Consecutive quotes | ||
| -# ######################################################################## | ||
| -"'I'm trouble.'" | ||
| -“‘I'm trouble.’” | ||
| - | ||
| -'"Trouble's my name."' | ||
| -‘“Trouble's my name.“‘ | ||
| - | ||
| -# ######################################################################## | ||
| -# Escaped quotes | ||
| -# ######################################################################## | ||
| -\"What?\" | ||
| -“What?” | ||
| - | ||
| -# ######################################################################## | ||
| -# Double quotes | ||
| -# ######################################################################## | ||
| -"I am Sam" | ||
| -“I am Sam” | ||
| - | ||
| -"...even better!" | ||
| -“...even better!” | ||
| - | ||
| -"It was so," said he. | ||
| -“It was so,” said he. | ||
| - | ||
| -"She said, 'Llamas'll languish, they'll-- | ||
| -“She said, ‘Llamas'll languish, they'll-- | ||
| - | ||
| -With "air quotes" in the middle. | ||
| -With “air quotes” in the middle. | ||
| - | ||
| -With--"air quotes"--and dashes. | ||
| -With--“air quotes”--and dashes. | ||
| - | ||
| -"Not "quite" what you expected?" | ||
| -“Not “quite” what you expected?” | ||
| - | ||
| -# ######################################################################## | ||
| -# Nested quotations | ||
| -# ######################################################################## | ||
| -"'Here I am,' said Sam" | ||
| -“‘Here I am,’ said Sam” | ||
| - | ||
| -'"Here I am," said Sam' | ||
| -‘“Here I am,”, said Sam’ | ||
| - | ||
| -'Hello, "Dr. Brown," what's your real name?' | ||
| -‘Hello, “Dr. Brown,” what's your real name?’ | ||
| - | ||
| -"'Twas, t'wasn't thy name, 'twas it?" said Jim "the Barber" Brown. | ||
| -“'Twas, t'wasn't thy name, 'twas it?” said Jim “the Barber” Brown. | ||
| - | ||
| -# ######################################################################## | ||
| -# Single quotes | ||
| -# ######################################################################## | ||
| -'I am Sam' | ||
| -‘I am Sam’ | ||
| - | ||
| -'It was so,' said he. | ||
| -‘It was so,’ said he. | ||
| - | ||
| -'...even better!' | ||
| -‘...even better!’ | ||
| - | ||
| -With 'quotes' in the middle. | ||
| -With ‘quotes’ in the middle. | ||
| - | ||
| -With--'imaginary'--dashes. | ||
| -With--‘imaginary’--dashes. | ||
| - | ||
| -'Not 'quite' what you expected?' | ||
| -‘Not ‘quite’ what you expected?’ | ||
| - | ||
| -''Cause I don't like it, 's why,' said Pat. | ||
| -‘'Cause I don't like it, 's why,’ said Pat. | ||
| - | ||
| -'It's a beautiful day!' | ||
| -‘It's a beautiful day!’ | ||
| - | ||
| -'He said, "Thinkin'."' | ||
| -‘He said, “Thinkin’.”’ | ||
| - | ||
| -# ######################################################################## | ||
| -# Possessives | ||
| -# ######################################################################## | ||
| -Sam's Sams' and the Ross's roses' thorns were prickly. | ||
| -Sam's Sams' and the Ross's roses' thorns were prickly. | ||
| - | ||
| -# ######################################################################## | ||
| -# Mixed | ||
| -# ######################################################################## | ||
| -"I heard she said, 'That's Sam's'," said the Sams' cat. | ||
| -“I heard she said, ‘That's Sam's’,” said the Sams' cat. | ||
| - | ||
| -"'Janes' said, ''E'll be spooky, Sam's son with the jack-o'-lantern!'" said the O'Mally twins'---y'know---ghosts in unison. | ||
| -“‘Janes' said, ‘'E'll be spooky, Sam's son with the jack-o'-lantern!’” said the O'Mally twins'---y'know---ghosts in unison. | ||
| - | ||
| -'He's at Sams' | ||
| -‘He' at Sams’ | ||
| - | ||
| -\"Hello!\" | ||
| -“Hello!” | ||
| - | ||
| -ma'am | ||
| -ma'am | ||
| - | ||
| -'Twas midnight | ||
| -'Twas midnight | ||
| - | ||
| -\"Hello,\" said the spider. \"'Shelob' is my name.\" | ||
| -“Hello,” said the spider. “‘Shelob’ is my name.” | ||
| - | ||
| -'A', 'B', and 'C' are letters. | ||
| -‘A’ ‘B’ and ‘C’ are letters. | ||
| - | ||
| -'Oak,' 'elm,' and 'beech' are names of trees. So is 'pine.' | ||
| -‘Oak,’ ‘elm,’ and ‘beech’ are names of trees. So is ‘pine.’ | ||
| - | ||
| -'He said, \"I want to go.\"' Were you alive in the 70's? | ||
| -‘He said, “I want to go.”’ Were you alive in the 70's? | ||
| - | ||
| -\"That's a 'magic' sock.\" | ||
| -“That's a ‘magic’ sock.” | ||
| - | ||
| -Website! Company Name, Inc. (\"Company Name\" or \"Company\") recommends reading the following terms and conditions, carefully: | ||
| -Website! Company Name, Inc. (“Company Name” or “Company”) recommends reading the following terms and conditions, carefully: | ||
| - | ||
| -Website! Company Name, Inc. ('Company Name' or 'Company') recommends reading the following terms and conditions, carefully: | ||
| -Website! Company Name, Inc. (‘Company Name’ or ‘Company’) recommends reading the following terms and conditions, carefully: | ||
| - | ||
| -Workin' hard | ||
| -Workin' hard | ||
| - | ||
| -'70s are my favorite numbers,' she said. | ||
| -‘70s are my favorite numbers,’ she said. | ||
| - | ||
| -'70s fashion was weird. | ||
| -'70s fashion was weird. | ||
| - | ||
| -12\" record, 5'10\" height | ||
| -12″ record, 5′10″ height | ||
| - | ||
| -Model \"T2000\" | ||
| -Model “T2000” | ||
| - | ||
| -iPad 3's battery life is not great. | ||
| -iPad 3's battery life is not great. | ||
| - | ||
| -Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season. | ||
| -Book 'em, Danno. Rock 'n' roll. 'Cause 'twas the season. | ||
| - | ||
| -'85 was a good year. (The entire '80s were.) | ||
| -'85 was a good year. (The entire '80s were.) | ||