Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Resolve ambiguous lagging quotes ahead of first ambiguous leading quotes

Author: Dave Jarvis <email>
Date: 2022-08-20 23:12:23 GMT-0700
Commit: e1b113c04f0691926a7aee14058b6025b7f48577
Parent: e5d7b11
src/main/java/com/whitemagicsoftware/keenquotes/parser/AmbiguityResolver.java
import java.util.ArrayList;
import java.util.Collections;
+import java.util.List;
import java.util.function.Consumer;
Collections.sort( tokens );
+
+ // All laggards appearing before the first leader are apostrophes.
+ resolve( tokens );
+
+ // Replacing laggards may have made leaders resolvable.
+ mTree.visit( this::disambiguate );
// Relay the tokens, in order, for updating the parsed document.
tokens.forEach( mConsumer );
+ }
+
+ /**
+ * Converts all laggards into apostrophes up until the first leader is found.
+ *
+ * @param tokens The list of sorted {@link Token}s to convert.
+ */
+ private void resolve( final List<Token> tokens ) {
+ assert tokens != null;
+
+ for( final var token : tokens ) {
+ if( token.isType( QUOTE_AMBIGUOUS_LEADING ) ) {
+ // Once a leader quote is found, any laggard could be a closing quote.
+ break;
+ }
+ else if( token.isType( QUOTE_AMBIGUOUS_LAGGING ) ) {
+ token.setTokenType( QUOTE_APOSTROPHE );
+ }
+ }
}
src/main/java/com/whitemagicsoftware/keenquotes/parser/Contractions.java
// what's up|to eat
"sup",
- // cannot (tan't)|melanin
+ // cannot (tan't)|colour
"tan",
// still|turn soil
src/main/java/com/whitemagicsoftware/keenquotes/parser/Curler.java
* @param parserType Creates a parser based on document content structure.
*/
- public Curler(
- final Contractions c,
- final FilterType parserType
- ) {
+ public Curler( final Contractions c, final FilterType parserType ) {
this( c, ENTITIES, parserType );
}
src/test/java/com/whitemagicsoftware/keenquotes/parser/CurlerTest.java
import com.whitemagicsoftware.keenquotes.lex.FilterType;
+import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
*/
@ParameterizedTest
- @ValueSource( strings = {"habberton"} )
- //@Disabled
+ @ValueSource( strings = {"autonoma"} )
+ @Disabled
void test_Parse_Story_Converted( final String filename ) throws IOException {
final var sb = new StringBuilder( 2 ^ 20 );
- try( final var reader = open( filename + ".txt" ) ) {
+ try( final var reader = open( filename + ".html" ) ) {
String line;
while( (line = reader.readLine()) != null ) {
sb.append( line ).append( SEP );
}
}
- final var converter = createCurler( FILTER_PLAIN );
- System.out.println( converter.apply( sb.toString() ) );
+ final var curler = createCurler( FILTER_XML );
+ System.out.println( curler.apply( sb.toString() ) );
}
src/test/resources/com/whitemagicsoftware/keenquotes/texts/unambiguous-2-pass.txt
# Mixed
# ########################################################################
-#"Granpa Harry 'spects you, Miss Mayton."
-#&ldquo;Granpa Harry &apos;spects you, Miss Mayton.&rdquo;
+"Granpa Harry 'spects you, Miss Mayton."
+&ldquo;Granpa Harry &apos;spects you, Miss Mayton.&rdquo;
"She said, 'That's Sam's'," said the Sams' cat.
src/test/resources/com/whitemagicsoftware/keenquotes/texts/xml.txt
<a href="https://x.org" title="X's Homepage">X11&apos;s bomb</a>
+<body><p>Test lungs' capacity; shovel's handle hit his head's electronics.</p><p>Hoped to bring him 'round.</p></body>
+<body><p>Test lungs&apos; capacity; shovel&apos;s handle hit his head&apos;s electronics.</p><p>Hoped to bring him &apos;round.</p></body>
+
Delta: 39 lines added, 12 lines removed, 27-line increase