| Author | Dave Jarvis <email> |
|---|---|
| Date | 2021-07-02 20:10:52 GMT-0700 |
| Commit | 3a2ec71735df42e81a6e7ae96282ef4115ab5de3 |
| Parent | d7274c4 |
| +/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| package com.whitemagicsoftware.keenquotes; | ||
| + | ||
| +import com.whitemagicsoftware.keenquotes.ParserFactory.ParserType; | ||
| import java.util.ArrayList; | ||
| private final Contractions mContractions; | ||
| private final Map<TokenType, String> mReplacements; | ||
| + private final ParserFactory mFactory; | ||
| /** | ||
| * Maps quotes to HTML entities. | ||
| * | ||
| * @param unresolved Consumes {@link Lexeme}s that could not be converted | ||
| * into HTML entities. | ||
| + * @param parserType Kind of parser to create, based on document content structure. | ||
| */ | ||
| - public Converter( final Consumer<Lexeme> unresolved ) { | ||
| - this( unresolved, new Contractions.Builder().build() ); | ||
| + public Converter( | ||
| + final Consumer<Lexeme> unresolved, final ParserType parserType ) { | ||
| + this( unresolved, new Contractions.Builder().build(), parserType ); | ||
| } | ||
| /** | ||
| * Maps quotes to HTML entities. | ||
| * | ||
| * @param unresolved Consumes {@link Lexeme}s that could not be converted | ||
| * into HTML entities. | ||
| + * @param parserType Kind of parser to create, based on document content structure. | ||
| */ | ||
| public Converter( | ||
| final Consumer<Lexeme> unresolved, | ||
| - final Map<TokenType, String> replacements ) { | ||
| - this( unresolved, new Contractions.Builder().build(), replacements ); | ||
| + final Map<TokenType, String> replacements, | ||
| + final ParserType parserType ) { | ||
| + this( | ||
| + unresolved, new Contractions.Builder().build(), replacements, parserType | ||
| + ); | ||
| } | ||
| /** | ||
| * Maps quotes to HTML entities. | ||
| * | ||
| * @param unresolved Consumes {@link Lexeme}s that could not be converted | ||
| * into HTML entities. | ||
| * @param c Contractions listings. | ||
| + * @param parserType Kind of parser to create, based on document content structure. | ||
| */ | ||
| - public Converter( final Consumer<Lexeme> unresolved, final Contractions c ) { | ||
| - this( unresolved, c, ENTITIES ); | ||
| + public Converter( | ||
| + final Consumer<Lexeme> unresolved, | ||
| + final Contractions c, | ||
| + final ParserType parserType ) { | ||
| + this( unresolved, c, ENTITIES, parserType ); | ||
| } | ||
| final Consumer<Lexeme> unresolved, | ||
| final Contractions c, | ||
| - final Map<TokenType, String> replacements ) { | ||
| + final Map<TokenType, String> replacements, | ||
| + final ParserType parserType ) { | ||
| mUnresolved = unresolved; | ||
| mContractions = c; | ||
| mReplacements = replacements; | ||
| + mFactory = new ParserFactory( parserType ); | ||
| } | ||
| @Override | ||
| public String apply( final String text ) { | ||
| - final var parser = new Parser( text, mContractions ); | ||
| + final var parser = mFactory.createParser( text, mContractions ); | ||
| final var tokens = new ArrayList<Token>(); | ||
| import java.util.Properties; | ||
| +import static com.whitemagicsoftware.keenquotes.ParserFactory.ParserType.PARSER_PLAIN; | ||
| import static java.lang.String.format; | ||
| import static java.lang.System.*; | ||
| else { | ||
| try { | ||
| - out.print( convert( new Converter( err::println, contractions ) ) ); | ||
| + final var c = new Converter( err::println, contractions, PARSER_PLAIN ); | ||
| + out.print( convert( c ) ); | ||
| } catch( final Exception ex ) { | ||
| ex.printStackTrace( err ); | ||
| int began = i.getIndex(); | ||
| boolean isWord = false; | ||
| - Lexeme lexeme; | ||
| + Lexeme lexeme = null; | ||
| do { | ||
| - if( (lexeme = preprocess( i )) != null ) { | ||
| - return lexeme; | ||
| + // Allow subclasses to skip character sequences. This allows XML tags | ||
| + // to be skipped. | ||
| + if( skip( i ) ) { | ||
| + began = i.getIndex(); | ||
| } | ||
| } | ||
| - Lexeme preprocess( final CharacterIterator i ) { | ||
| - return null; | ||
| + boolean skip( final CharacterIterator i ) { | ||
| + return false; | ||
| } | ||
| +/* Copyright 2021 White Magic Software, Ltd. -- All rights reserved. */ | ||
| +package com.whitemagicsoftware.keenquotes; | ||
| + | ||
| +/** | ||
| + * Responsible for creating new {@link Parser} instances based on the | ||
| + * {@link ParserType}. The document content format must be known in advance. | ||
| + */ | ||
| +public class ParserFactory { | ||
| + public enum ParserType { | ||
| + PARSER_PLAIN, | ||
| + PARSER_XML | ||
| + } | ||
| + /** Content type consulted by createParser to pick the parser class. */ | ||
| + private final ParserType mParserType; | ||
| + /** Constructs a factory for the given parser type; the value is stored as given, without a null check. */ | ||
| + public ParserFactory( final ParserType parserType ) { | ||
| + mParserType = parserType; | ||
| + } | ||
| + /** Creates a new parser over the text: a Parser when the type is PARSER_PLAIN, an XmlParser for any other value. */ | ||
| + public Parser createParser( | ||
| + final String text, final Contractions contractions ) { | ||
| + // A null type is not PARSER_PLAIN, so it also yields an XmlParser. | ||
| + return mParserType == ParserType.PARSER_PLAIN | ||
| + ? new Parser( text, contractions ) | ||
| + : new XmlParser( text, contractions ); | ||
| + } | ||
| +} | ||
| import java.text.CharacterIterator; | ||
| -import static com.whitemagicsoftware.keenquotes.Lexeme.createLexeme; | ||
| -import static com.whitemagicsoftware.keenquotes.LexemeType.TAG; | ||
| - | ||
| /** | ||
| * Responsible for lexing text while ignoring XML elements. The document must | ||
| } | ||
| + /** | ||
| + * Skip (do not emit) XML tags found within the prose. This effectively hides | ||
| + * the element. | ||
| + * | ||
| + * @param i The {@link CharacterIterator} used to scan through the text, one | ||
| + * character at a time. | ||
| + */ | ||
| @Override | ||
| - Lexeme preprocess( final CharacterIterator i ) { | ||
| - final var began = i.getIndex(); | ||
| - final var curr = i.current(); | ||
| - Lexeme lexeme = null; | ||
| + boolean skip( final CharacterIterator i ) { | ||
| + final boolean match = i.current() == '<'; | ||
| - if( curr == '<' ) { | ||
| + if( match ) { | ||
| slurp( i, ( next, ci ) -> next != '>' ); | ||
| + | ||
| + // Swallow the trailing greater than symbol. | ||
| i.next(); | ||
| - lexeme = createLexeme( TAG, began, i.getIndex() ); | ||
| + | ||
| + // Skip to the character following the greater than symbol. | ||
| i.next(); | ||
| } | ||
| - return lexeme; | ||
| + return match; | ||
| } | ||
| } | ||
| package com.whitemagicsoftware.keenquotes; | ||
| +import com.whitemagicsoftware.keenquotes.ParserFactory.ParserType; | ||
| import org.junit.jupiter.api.Disabled; | ||
| import org.junit.jupiter.api.Test; | ||
| import org.junit.jupiter.params.ParameterizedTest; | ||
| +import org.junit.jupiter.params.provider.Arguments; | ||
| +import org.junit.jupiter.params.provider.MethodSource; | ||
| import org.junit.jupiter.params.provider.ValueSource; | ||
| import java.io.BufferedReader; | ||
| import java.io.IOException; | ||
| import java.io.InputStreamReader; | ||
| +import java.util.function.Consumer; | ||
| import java.util.function.Function; | ||
| +import java.util.stream.Stream; | ||
| +import static com.whitemagicsoftware.keenquotes.ParserFactory.ParserType.PARSER_PLAIN; | ||
| +import static com.whitemagicsoftware.keenquotes.ParserFactory.ParserType.PARSER_XML; | ||
| import static java.lang.System.out; | ||
| import static org.junit.jupiter.api.Assertions.assertEquals; | ||
| import static org.junit.jupiter.api.Assertions.assertNotNull; | ||
| +import static org.junit.jupiter.params.provider.Arguments.arguments; | ||
| /** | ||
| @Disabled | ||
| public void test_parse_SingleLine_Parsed() { | ||
| - final var converter = new Converter( out::println ); | ||
| + final var converter = createConverter( out::println ); | ||
| out.println( converter.apply( | ||
| "'A', 'B', and 'C' are letters." | ||
| @Test | ||
| public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException { | ||
| - testConverter( new Converter( ( lex ) -> {} ) ); | ||
| + testConverter( createConverter( ( lex ) -> {} ) ); | ||
| + } | ||
| + | ||
| + @ParameterizedTest | ||
| + @MethodSource( "param_XmlParse_StraightQuotes_CurlyQuotes" ) | ||
| + public void test_XmlParse_StraightQuotes_CurlyQuotes( | ||
| + final String input, final String expected ) { | ||
| + final var converter = createConverter( out::println, PARSER_XML ); | ||
| + final var actual = converter.apply( input ); | ||
| + assertEquals( expected, actual ); | ||
| } | ||
| } | ||
| - final var converter = new Converter( out::println ); | ||
| + final var converter = createConverter( out::println ); | ||
| System.out.println( converter.apply( sb.toString() ) ); | ||
| + } | ||
| + | ||
| + @SuppressWarnings( "unused" ) | ||
| + static Stream<Arguments> param_XmlParse_StraightQuotes_CurlyQuotes() { | ||
| + return Stream.of( | ||
| + arguments( | ||
| + "<em>'twas</em>", | ||
| + "<em>'twas</em>" | ||
| + ), | ||
| + arguments( | ||
| + "<bold>'twas</bold> redeemed for the <em>cat</em>'s eye", | ||
| + "<bold>'twas</bold> redeemed for the <em>cat</em>'s eye" | ||
| + ), | ||
| + arguments( | ||
| + "<a href=\"https://x.org\" title=\"X's Homepage\">X11's bomb</a>", | ||
| + "<a href=\"https://x.org\" title=\"X's Homepage\">X11's bomb</a>" | ||
| + ), | ||
| + arguments( | ||
| + "''<em>Twas</em> happening!'", | ||
| + "‘'<em>Twas</em> happening!’" | ||
| + ) | ||
| + ); | ||
| } | ||
| return new BufferedReader( new InputStreamReader( is ) ); | ||
| + } | ||
| + | ||
| + private Function<String, String> createConverter( | ||
| + final Consumer<Lexeme> unresolved ) { | ||
| + return createConverter( unresolved, PARSER_PLAIN ); | ||
| + } | ||
| + | ||
| + private Function<String, String> createConverter( | ||
| + final Consumer<Lexeme> unresolved, final ParserType parserType ) { | ||
| + return new Converter( unresolved, parserType ); | ||
| } | ||
| } | ||
| * Tests lexing words, numbers, punctuation, spaces, newlines, etc. | ||
| */ | ||
| -class LexerTest { | ||
| +final class LexerTest { | ||
| + | ||
| @Test | ||
| void test_Lexing_Words_LexemeValues() { | ||
| final var list = asList( expected ); | ||
| testType( lexer, actual, ( lexeme, text ) -> lexeme.getType(), list ); | ||
| - } | ||
| - | ||
| - static void testText( | ||
| - final Lexer lexer, final String actual, final String... expected ) { | ||
| - testType( lexer, actual, Lexeme::toString, asList( expected ) ); | ||
| } | ||
| * Test that all unambiguous apostrophes are emitted once. | ||
| */ | ||
| -class ParserTest { | ||
| +final class ParserTest { | ||
| + | ||
| @SuppressWarnings( "TextBlockMigration" ) | ||
| private final static Map<String, Map<TokenType, Integer>> TEST_CASES = |
| import static com.whitemagicsoftware.keenquotes.LexemeType.*; | ||
| -import static com.whitemagicsoftware.keenquotes.LexerTest.testText; | ||
| import static com.whitemagicsoftware.keenquotes.LexerTest.testType; | ||
| /** | ||
| * Test that parsing XML documents ignores elements. | ||
| */ | ||
| final class XmlLexerTest { | ||
| - | ||
| - @Test | ||
| - void test_Lexing_Xml_LexemeValues() { | ||
| - final var actual = "Hello <em>world</em>"; | ||
| - testText( | ||
| - createXmlLexer( actual ), actual, | ||
| - "Hello", " ", "<em>", "world", "</em>" | ||
| - ); | ||
| - } | ||
| @Test | ||
| void test_Lexing_Xml_EmitTags() { | ||
| final var actual = "The <em>world's</em> aflame."; | ||
| testType( | ||
| createXmlLexer( actual ), actual, | ||
| - WORD, SPACE, TAG, WORD, QUOTE_SINGLE, WORD, TAG, SPACE, WORD, PERIOD | ||
| + WORD, SPACE, WORD, QUOTE_SINGLE, WORD, SPACE, WORD, PERIOD | ||
| ); | ||
| } | ||
| @Test | ||
| void test_Lexing_XmlAttribute_EmitTags() { | ||
| - final var actual = "<a href=\"http://x\">x</a>"; | ||
| - testType( createXmlLexer( actual ), actual, TAG, WORD, TAG ); | ||
| + final var actual = "<a href=\"http://x.org\">X11</a>"; | ||
| + testType( createXmlLexer( actual ), actual, WORD, NUMBER ); | ||
| } | ||
| Delta | 135 lines added, 47 lines removed, 88-line increase |
|---|