Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenquotes.git

Add contractions, address EOP issue, reset tokenizer at EOP

Author: Dave Jarvis <email>
Date: 2021-06-16 20:19:21 GMT-0700
Commit: 93631b87e26e26c58ede8667433ef6ad9be16ffa
Parent: 3d0f249
lib/src/main/java/com/keenwrite/quotes/Contractions.java
// the
"th",
- // Top ~500 common -ing words as English contractions.
- "acceptin",
- "accompanyin",
- "accordin",
- "accountin",
- "achievin",
- "acquirin",
- "actin",
- "addin",
- "addressin",
- "adjoinin",
- "adoptin",
- "advancin",
- "advertisin",
- "affectin",
- "agin",
- "allowin",
- "amazin",
- "analyzin",
- "answerin",
- "anythin",
- "appearin",
- "applyin",
- "approachin",
- "arguin",
- "arisin",
- "arrivin",
- "askin",
- "assessin",
- "assumin",
- "attackin",
- "attemptin",
- "attendin",
- "avoidin",
- "bankin",
- "bargainin",
- "bearin",
- "beatin",
- "becomin",
- "beginnin",
- "bein",
- "believin",
- "belongin",
- "bendin",
- "bindin",
- "bleedin",
- "blessin",
- "blowin",
- "boilin",
- "borrowin",
- "breakin",
- "breathin",
- "breedin",
- "bringin",
- "broadcastin",
- "buildin",
- "burnin",
- "buyin",
- "calculatin",
- "callin",
- "carryin",
- "castin",
- "causin",
- "ceilin",
- "challengin",
- "changin",
- "checkin",
- "choosin",
- "claimin",
- "cleanin",
- "clearin",
- "climbin",
- "closin",
- "clothin",
- "collectin",
- "combinin",
- "comin",
- "commandin",
- "comparin",
- "compellin",
- "competin",
- "computin",
- "concernin",
- "concludin",
- "conditionin",
- "conductin",
- "conflictin",
- "connectin",
- "considerin",
- "consistin",
- "constructin",
- "consultin",
- "consumin",
- "containin",
- "continuin",
- "contractin",
- "contributin",
- "controllin",
- "convincin",
- "cookin",
- "coolin",
- "copin",
- "correspondin",
- "counselin",
- "countin",
- "couplin",
- "coverin",
- "creatin",
- "crossin",
- "cryin",
- "cuttin",
- "dancin",
- "darlin",
- "datin",
- "dealin",
- "decidin",
- "declarin",
- "declinin",
- "decreasin",
- "definin",
- "demandin",
- "denyin",
- "dependin",
- "descendin",
- "describin",
- "designin",
- "destroyin",
- "determinin",
- "developin",
- "differin",
- "dinin",
- "directin",
- "discussin",
- "distinguishin",
- "disturbin",
- "dividin",
- "doin",
- "drawin",
- "dressin",
- "drinkin",
- "drivin",
- "droppin",
- "dryin",
- "durin",
- "dwellin",
- "dyin",
- "eatin",
- "editin",
- "emergin",
- "employin",
- "enablin",
- "encouragin",
- "endin",
- "engagin",
- "engineerin",
- "enjoyin",
- "enterin",
- "establishin",
- "evaluatin",
- "evenin",
- "everythin",
- "examinin",
- "exceedin",
- "excitin",
- "excludin",
- "existin",
- "expandin",
- "expectin",
- "experiencin",
- "explainin",
- "explorin",
- "expressin",
- "extendin",
- "facin",
- "failin",
- "fallin",
- "farmin",
- "fascinatin",
- "feedin",
- "feelin",
- "fightin",
- "filin",
- "fillin",
- "financin",
- "findin",
- "firin",
- "fishin",
- "fittin",
- "fixin",
- "floatin",
- "flowin",
- "flyin",
- "focusin",
- "followin",
- "forcin",
- "foregoin",
- "formin",
- "forthcomin",
- "foundin",
- "freezin",
- "fuckin",
- "functionin",
- "fundin",
- "gainin",
- "gatherin",
- "generatin",
- "gettin",
- "givin",
- "goin",
- "governin",
- "grantin",
- "growin",
- "hackin",
- "handlin",
- "hangin",
- "happenin",
- "havin",
- "headin",
- "healin",
- "hearin",
- "heatin",
- "helpin",
- "hidin",
- "holdin",
- "hopin",
- "housin",
- "huntin",
- "identifyin",
- "imagin",
- "implementin",
- "imposin",
- "improvin",
- "includin",
- "increasin",
- "indicatin",
- "interestin",
- "interpretin",
- "introducin",
- "involvin",
- "joinin",
- "judgin",
- "keepin",
- "killin",
- "knowin",
- "lackin",
- "landin",
- "lastin",
- "laughin",
- "layin",
- "leadin",
- "leanin",
- "learnin",
- "leavin",
- "lettin",
- "liftin",
- "lightin",
- "lightnin",
- "limitin",
- "listenin",
- "listin",
- "livin",
- "loadin",
- "lookin",
- "losin",
- "lovin",
- "lowerin",
- "lyin",
- "maintainin",
- "makin",
- "managin",
- "manufacturin",
- "mappin",
- "marketin",
- "markin",
- "matchin",
- "meanin",
- "measurin",
- "meetin",
- "meltin",
- "minin",
- "misleadin",
- "missin",
- "mixin",
- "modelin",
- "monitorin",
- "mornin",
- "movin",
- "neighborin",
- "nothin",
- "notin",
- "notwithstandin",
- "nursin",
- "observin",
- "obtainin",
- "occurrin",
- "offerin",
- "offsprin",
- "ongoin",
- "openin",
- "operatin",
- "opposin",
- "orderin",
- "organizin",
- "outstandin",
- "overwhelmin",
- "packin",
- "paintin",
- "parkin",
- "participatin",
- "passin",
- "payin",
- "pendin",
- "performin",
- "pickin",
- "pissin",
- "placin",
- "plannin",
- "plantin",
- "playin",
- "pleasin",
- "pointin",
- "possessin",
- "preachin",
- "precedin",
- "preparin",
- "presentin",
- "preservin",
- "pressin",
- "prevailin",
- "preventin",
- "pricin",
- "printin",
- "proceedin",
- "processin",
- "producin",
- "programmin",
- "promisin",
- "promotin",
- "protectin",
- "providin",
- "provin",
- "publishin",
- "pullin",
- "purchasin",
- "pursuin",
- "pushin",
- "puttin",
- "questionin",
- "rangin",
- "ratin",
- "reachin",
- "readin",
- "reasonin",
- "receivin",
- "recognizin",
- "recordin",
- "reducin",
- "referrin",
- "reflectin",
- "refusin",
- "regardin",
- "regulatin",
- "relatin",
- "remainin",
- "rememberin",
- "removin",
- "renderin",
- "repeatin",
- "replacin",
- "reportin",
- "representin",
- "requirin",
- "respectin",
- "respondin",
- "restin",
- "resultin",
- "returnin",
- "revealin",
- "ridin",
- "risin",
- "rulin",
- "runnin",
- "sailin",
- "samplin",
- "satisfyin",
- "savin",
- "sayin",
- "scatterin",
- "schoolin",
- "screenin",
- "searchin",
- "securin",
- "seein",
- "seekin",
- "selectin",
- "sellin",
- "sendin",
- "separatin",
- "servin",
- "settin",
- "settlin",
- "shakin",
- "shapin",
- "sharin",
- "shiftin",
- "shinin",
- "shippin",
+ // what
+ "wha",
+ // for/before
+ "fo",
+ // San (Francisco)
+ "Sa",
+ // Top ~500 common -ing words as English contractions.
+ "acceptin",
+ "accompanyin",
+ "accordin",
+ "accountin",
+ "achievin",
+ "acquirin",
+ "actin",
+ "addin",
+ "addressin",
+ "adjoinin",
+ "adoptin",
+ "advancin",
+ "advertisin",
+ "affectin",
+ "agin",
+ "allowin",
+ "amazin",
+ "analyzin",
+ "answerin",
+ "anythin",
+ "appearin",
+ "applyin",
+ "approachin",
+ "arguin",
+ "arisin",
+ "arrivin",
+ "askin",
+ "assessin",
+ "assumin",
+ "attackin",
+ "attemptin",
+ "attendin",
+ "avoidin",
+ "bankin",
+ "bargainin",
+ "bearin",
+ "beatin",
+ "becomin",
+ "beginnin",
+ "bein",
+ "believin",
+ "belongin",
+ "bendin",
+ "bindin",
+ "bleedin",
+ "blessin",
+ "blowin",
+ "boilin",
+ "borrowin",
+ "breakin",
+ "breathin",
+ "breedin",
+ "bringin",
+ "broadcastin",
+ "buildin",
+ "burnin",
+ "buyin",
+ "calculatin",
+ "callin",
+ "carryin",
+ "castin",
+ "causin",
+ "ceilin",
+ "challengin",
+ "changin",
+ "checkin",
+ "choosin",
+ "claimin",
+ "cleanin",
+ "clearin",
+ "climbin",
+ "closin",
+ "clothin",
+ "collectin",
+ "combinin",
+ "comin",
+ "commandin",
+ "comparin",
+ "compellin",
+ "competin",
+ "computin",
+ "concernin",
+ "concludin",
+ "conditionin",
+ "conductin",
+ "conflictin",
+ "connectin",
+ "considerin",
+ "consistin",
+ "constructin",
+ "consultin",
+ "consumin",
+ "containin",
+ "continuin",
+ "contractin",
+ "contributin",
+ "controllin",
+ "convincin",
+ "cookin",
+ "coolin",
+ "copin",
+ "correspondin",
+ "counselin",
+ "countin",
+ "couplin",
+ "coverin",
+ "creatin",
+ "crossin",
+ "cryin",
+ "cuttin",
+ "dancin",
+ "darlin",
+ "datin",
+ "dealin",
+ "decidin",
+ "declarin",
+ "declinin",
+ "decreasin",
+ "definin",
+ "demandin",
+ "denyin",
+ "dependin",
+ "descendin",
+ "describin",
+ "designin",
+ "destroyin",
+ "determinin",
+ "developin",
+ "differin",
+ "dinin",
+ "directin",
+ "discussin",
+ "distinguishin",
+ "disturbin",
+ "dividin",
+ "doin",
+ "drawin",
+ "dressin",
+ "drinkin",
+ "drivin",
+ "droppin",
+ "dryin",
+ "durin",
+ "dwellin",
+ "dyin",
+ "eatin",
+ "editin",
+ "emergin",
+ "employin",
+ "enablin",
+ "encouragin",
+ "endin",
+ "engagin",
+ "engineerin",
+ "enjoyin",
+ "enterin",
+ "establishin",
+ "evaluatin",
+ "evenin",
+ "everythin",
+ "examinin",
+ "exceedin",
+ "excitin",
+ "excludin",
+ "existin",
+ "expandin",
+ "expectin",
+ "experiencin",
+ "explainin",
+ "explorin",
+ "expressin",
+ "extendin",
+ "facin",
+ "failin",
+ "fallin",
+ "farmin",
+ "fascinatin",
+ "feedin",
+ "feelin",
+ "fightin",
+ "filin",
+ "fillin",
+ "financin",
+ "findin",
+ "firin",
+ "fishin",
+ "fittin",
+ "fixin",
+ "floatin",
+ "flowin",
+ "flyin",
+ "focusin",
+ "followin",
+ "forcin",
+ "foregoin",
+ "formin",
+ "forthcomin",
+ "foundin",
+ "freezin",
+ "fuckin",
+ "functionin",
+ "fundin",
+ "gainin",
+ "gatherin",
+ "generatin",
+ "gettin",
+ "givin",
+ "goin",
+ "governin",
+ "grantin",
+ "growin",
+ "hackin",
+ "handlin",
+ "hangin",
+ "happenin",
+ "havin",
+ "headin",
+ "healin",
+ "hearin",
+ "heatin",
+ "helpin",
+ "hidin",
+ "holdin",
+ "hopin",
+ "housin",
+ "huntin",
+ "identifyin",
+ "imagin",
+ "implementin",
+ "imposin",
+ "improvin",
+ "includin",
+ "increasin",
+ "indicatin",
+ "interestin",
+ "interpretin",
+ "introducin",
+ "involvin",
+ "joinin",
+ "judgin",
+ "keepin",
+ "killin",
+ "knowin",
+ "lackin",
+ "landin",
+ "lastin",
+ "laughin",
+ "layin",
+ "leadin",
+ "leanin",
+ "learnin",
+ "leavin",
+ "lettin",
+ "liftin",
+ "lightin",
+ "lightnin",
+ "limitin",
+ "listenin",
+ "listin",
+ "livin",
+ "loadin",
+ "lookin",
+ "losin",
+ "lovin",
+ "lowerin",
+ "lyin",
+ "maintainin",
+ "makin",
+ "managin",
+ "manufacturin",
+ "mappin",
+ "marketin",
+ "markin",
+ "matchin",
+ "meanin",
+ "measurin",
+ "meetin",
+ "meltin",
+ "minin",
+ "misleadin",
+ "missin",
+ "mixin",
+ "modelin",
+ "monitorin",
+ "mornin",
+ "movin",
+ "neighborin",
+ "nothin",
+ "notin",
+ "notwithstandin",
+ "nursin",
+ "observin",
+ "obtainin",
+ "occurrin",
+ "offerin",
+ "offsprin",
+ "ongoin",
+ "openin",
+ "operatin",
+ "opposin",
+ "orderin",
+ "organizin",
+ "outstandin",
+ "overwhelmin",
+ "packin",
+ "paintin",
+ "parkin",
+ "participatin",
+ "passin",
+ "payin",
+ "pendin",
+ "performin",
+ "pickin",
+ "pissin",
+ "placin",
+ "plannin",
+ "plantin",
+ "playin",
+ "pleasin",
+ "pointin",
+ "possessin",
+ "preachin",
+ "precedin",
+ "preparin",
+ "presentin",
+ "preservin",
+ "pressin",
+ "prevailin",
+ "preventin",
+ "pricin",
+ "printin",
+ "proceedin",
+ "processin",
+ "producin",
+ "programmin",
+ "promisin",
+ "promotin",
+ "protectin",
+ "providin",
+ "provin",
+ "publishin",
+ "pullin",
+ "purchasin",
+ "pursuin",
+ "pushin",
+ "puttin",
+ "questionin",
+ "rangin",
+ "ratin",
+ "reachin",
+ "readin",
+ "reasonin",
+ "receivin",
+ "recognizin",
+ "recordin",
+ "reducin",
+ "referrin",
+ "reflectin",
+ "refusin",
+ "regardin",
+ "regulatin",
+ "relatin",
+ "remainin",
+ "rememberin",
+ "removin",
+ "renderin",
+ "repeatin",
+ "replacin",
+ "reportin",
+ "representin",
+ "requirin",
+ "respectin",
+ "respondin",
+ "restin",
+ "resultin",
+ "returnin",
+ "revealin",
+ "ridin",
+ "risin",
+ "rulin",
+ "runnin",
+ "sailin",
+ "samplin",
+ "satisfyin",
+ "savin",
+ "sayin",
+ "scatterin",
+ "schoolin",
+ "screenin",
+ "searchin",
+ "securin",
+ "seein",
+ "seekin",
+ "selectin",
+ "sellin",
+ "sendin",
+ "separatin",
+ "servin",
+ "settin",
+ "settlin",
+ "shakin",
+ "shapin",
+ "sharin",
+ "shiftin",
+ "shinin",
+ "shippin",
+ "shittin",
"shootin",
"shoppin",
lib/src/main/java/com/keenwrite/quotes/Lexeme.java
}
+ /**
+ * Builds a debugging representation of this instance: the simple class
+ * name followed by the values of the {@code mType}, {@code mBegan}, and
+ * {@code mEnded} fields, enclosed in braces.
+ *
+ * @return The simple class name and field values as a single string.
+ */
+ @Override
+ public String toString() {
+ final var text = new StringBuilder( getClass().getSimpleName() );
+
+ text.append( '{' );
+ text.append( "mType=" ).append( mType );
+ text.append( ", mBegan=" ).append( mBegan );
+ text.append( ", mEnded=" ).append( mEnded );
+ text.append( '}' );
+
+ return text.toString();
+ }
+
/**
* Answers whether the given {@link LexemeType} is the same as this
lib/src/main/java/com/keenwrite/quotes/Lexer.java
import static com.keenwrite.quotes.Lexeme.createLexeme;
import static com.keenwrite.quotes.LexemeType.*;
-import static java.lang.Character.*;
+import static java.lang.Character.isDigit;
+import static java.lang.Character.isWhitespace;
import static java.text.CharacterIterator.DONE;
return lexeme;
+ }
+
+ /**
+ * Determines whether a character counts as part of a word. Besides
+ * letters, this accepts {@code _} and {@code *} because plain text
+ * formats often use those characters to emphasize a word.
+ *
+ * @param curr The character to classify.
+ * @return {@code true} if the given character is a letter, an underscore,
+ * or an asterisk.
+ */
+ private static boolean isLetter( final char curr ) {
+ return curr == '_' || curr == '*' || Character.isLetter( curr );
}
+ /**
+ * Answers whether the given character can be considered part of a number
+ * or not. This does not include digits, which are checked independently
+ * from this method.
+ *
+ * @param curr The character to check as being related to numbers.
+ * @return {@code true} if the given character can be considered part of
+ * a number (e.g., -2,000.2^2 is considered a single number).
+ */
private static boolean isNumeric( final char curr ) {
- return curr == '.' || curr == ',' || curr == '-' || curr == '+';
+ return
+ curr == '.' || curr == ',' || curr == '-' || curr == '+' || curr == '^';
}
lib/src/main/java/com/keenwrite/quotes/Parser.java
*/
private static final LexemeType[] LEADING_QUOTE_OPENING_SINGLE =
- new LexemeType[]{SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOP};
+ new LexemeType[]{SPACE, DASH, QUOTE_DOUBLE, OPENING_GROUP, EOL, EOP};
/**
*/
private static final LexemeType[] LEADING_QUOTE_OPENING_DOUBLE =
- new LexemeType[]{SPACE, DASH, QUOTE_SINGLE, OPENING_GROUP, EOP};
+ new LexemeType[]{SPACE, DASH, QUOTE_SINGLE, OPENING_GROUP, EOL, EOP};
/**
*/
private static final LexemeType[] LAGGING_QUOTE_CLOSING_DOUBLE =
- new LexemeType[]{SPACE, DASH, QUOTE_SINGLE, CLOSING_GROUP, EOL};
+ new LexemeType[]{SPACE, DASH, QUOTE_SINGLE, CLOSING_GROUP, EOL, EOP};
/**
// Create and convert a list of all unambiguous quote characters.
while( (lexeme = mLexer.next()) != EOT ) {
- tokenize( lexeme, lexemes, tokenConsumer, unresolved );
+ if( tokenize( lexeme, lexemes, tokenConsumer, unresolved ) ) {
+ // Attempt to resolve any remaining unambiguous quotes.
+ resolve( unresolved, tokenConsumer );
+
+ // Notify of any unambiguous quotes that could not be resolved.
+ unresolved.forEach( ( lex ) -> lexemeConsumer.accept( lex[ 1 ] ) );
+ unresolved.clear();
+ mOpeningSingleQuote = 0;
+ mClosingSingleQuote = 0;
+ }
}
}
- private void tokenize( final Lexeme lexeme,
- final CircularFifoQueue<Lexeme> lexemes,
- final Consumer<Token> consumer,
- final List<Lexeme[]> unresolved ) {
+ /**
+ * Converts {@link Lexeme}s identified as straight quotes into {@link Token}s
+ * that represent the curly equivalent. The {@link Token}s are passed to
+ * the given {@link Consumer} for further processing (e.g., replaced in
+ * the original text being parsed).
+ *
+ * @param lexeme A part of the text being parsed.
+ * @param lexemes A 3-element queue of lexemes that provide sufficient
+ * context to identify curly quotes.
+ * @param consumer Recipient of equivalent quotes.
+ * @param unresolved Rolling list of potentially ambiguous {@link Lexeme}s
+ * that could not be tokenized, yet.
+ * @return {@code true} if an end-of-paragraph is detected.
+ */
+ private boolean tokenize( final Lexeme lexeme,
+ final CircularFifoQueue<Lexeme> lexemes,
+ final Consumer<Token> consumer,
+ final List<Lexeme[]> unresolved ) {
// Add the next lexeme to tokenize into the queue for immediate processing.
lexemes.add( lexeme );
unresolved.add( new Lexeme[]{lex1, lex2, lex3} );
}
+
+ // Suggest to the caller that resolution should be performed. This allows
+ // the algorithm to reset the opening/closing quote balance before the
+ // next paragraph is parsed.
+ return lex3.isType( EOP );
}
lib/src/main/java/com/keenwrite/quotes/TokenType.java
* The {@link Parser} emits these token types.
*/
-public enum TokenType {
+enum TokenType {
QUOTE_OPENING_SINGLE,
QUOTE_OPENING_DOUBLE,
lib/src/test/java/com/keenwrite/quotes/KeenQuotesTest.java
package com.keenwrite.quotes;
-import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
*/
@Test
- @Disabled
+ //@Disabled
public void test_parse_SingleLine_Parsed() {
out.println( KeenQuotes.convert(
public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException {
testConverter( text -> KeenQuotes.convert( text, ( lexeme ) -> {} ) );
+ }
+
+ /**
+ * Reads every line of the {@code foote.txt} resource into memory, joins
+ * the lines with newline characters, passes the text through
+ * {@code KeenQuotes.convert}, then prints the converted text. The
+ * {@code out::println} callback prints whatever the converter hands to
+ * its second argument.
+ *
+ * @throws IOException Could not read the resource.
+ */
+ @Test
+ void test_Parse_Story_Converted() throws IOException {
+ // Reserve 2^20 characters up front. In Java, ^ is bitwise XOR (2 ^ 20
+ // evaluates to 22), so the power of two is expressed as a shift.
+ final var sb = new StringBuilder( 1 << 20 );
+
+ try( final var reader = open( "foote.txt" ) ) {
+ String line;
+
+ while( (line = reader.readLine()) != null ) {
+ sb.append( line ).append( '\n' );
+ }
+ }
+
+ final var s = KeenQuotes.convert( sb.toString(), out::println );
+ System.out.println(s);
}
private void testConverter( final Function<String, String> parser )
throws IOException {
- try( final var reader = openResource( "smartypants.txt" ) ) {
+ try( final var reader = open( "smartypants.txt" ) ) {
String line;
String testLine = "";
}
- @SuppressWarnings( "SameParameterValue" )
- private BufferedReader openResource( final String filename ) {
+ /**
+ * Opens a text file for reading. Callers are responsible for closing.
+ *
+ * @param filename The file to open.
+ * @return An instance of {@link BufferedReader} that can be used to
+ * read all the lines in the file.
+ */
+ private BufferedReader open( final String filename ) {
final var is = getClass().getResourceAsStream( filename );
assertNotNull( is );
lib/src/test/java/com/keenwrite/quotes/StoryTest.java
-package com.keenwrite.quotes;
-
-public class StoryTest {
-
-}
lib/src/test/resources/com/keenwrite/quotes/smartypants.txt
&ldquo;And this&rdquo; and &ldquo;and this&rdquo; and &ldquo;and this&rdquo; and &ldquo;and another.&rdquo;
+"Will'll invite?" Mrs. Thorne asked;\n"because he's coming--he's at the gate."
+&ldquo;Will&apos;ll invite?&rdquo; Mrs. Thorne asked;\n&ldquo;because he&apos;s coming--he&apos;s at the gate.&rdquo;
+
# ########################################################################
# Single quotes
"'Twas, t'wasn't thy name, 'twas it?" said Jim "the Barber" Brown.
&ldquo;&apos;Twas, t&apos;wasn&apos;t thy name, &apos;twas it?&rdquo; said Jim &ldquo;the Barber&rdquo; Brown.
+
+"'I'm in danger. Help? Take me to--' an address on Van Ness Avenue.
+&ldquo;&lsquo;I&apos;m in danger. Help? Take me to--&rsquo; an address on Van Ness Avenue.
+
+"Ah," she said, "you knew! He told you--and you said 'my dear'! How could you?!"
+&ldquo;Ah,&rdquo; she said, &ldquo;you knew! He told you--and you said &lsquo;my dear&rsquo;! How could you?!&rdquo;
# ########################################################################
Delta: 521 lines added, 428 lines removed, 93-line increase