Dave Jarvis' Repositories

git clone https://repo.autonoma.ca/repo/keenwrite.git

Add EBNF for English straight quotes

Author DaveJarvis <email>
Date 2021-04-23 21:29:04 GMT-0700
Commit ee38c120ae2f1d78fb9091bee176762056d840b6
Parent ab09a63
build.gradle
}
- testImplementation "org.junit.jupiter:junit-jupiter-engine:${v_junit}"
testImplementation "org.junit.jupiter:junit-jupiter-api:${v_junit}"
+ testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
+
testImplementation "org.testfx:testfx-junit5:4.0.16-alpha"
}
'--add-exports', "javafx.base/com.sun.javafx.event=ALL-UNNAMED",
]
-
- classpath = files()
+ //classpath = files()
}
docs/development/PROGUARD.md
-From https://github.com/greenrobot/EventBus#r8-proguard
-
- -keepattributes *Annotation*
- -keepclassmembers class * {
- @org.greenrobot.eventbus.Subscribe <methods>;
- }
- -keep enum org.greenrobot.eventbus.ThreadMode { *; }
-
docs/development/proguard/README.md
+From https://github.com/greenrobot/EventBus#r8-proguard
+
+ -keepattributes *Annotation*
+ -keepclassmembers class * {
+ @org.greenrobot.eventbus.Subscribe <methods>;
+ }
+ -keep enum org.greenrobot.eventbus.ThreadMode { *; }
+
docs/development/quotes/english.ebnf
+(* Extended Backus--Naur form definition for English quotes and quotations *)
+
+(* Word characters, not including straight quotes *)
+(* #, $, %, &, *, +, -, /, 0-9, <, =, >, @, A-Z, \, ^, _, a-z, |, ~, unicode *)
+c ::= [#x23-#x26] | #x2A | #x2B | #x2D |
+ [#x2F-#x39] | [#x3C-#x3E] | [#x40-#x5A] | #x5C | #x5E | #x5F |
+ [#x61-#x7A] | #x7C |
+ [#x7E-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] ;
+
+(* Word separator characters *)
+s ::= (#x20 | #x9)+ ;
+
+(* Quotation mark and apostrophe characters *)
+single_quote ::= '\'' ;
+double_quote ::= '"' ;
+backtick ::= '`' ;
+op_quote ::= backtick backtick ;
+cl_quote ::= single_quote single_quote ;
+
+(* Line breaks, used as paragraph terminators *)
+(* Carriage return is U+000D and line feed is U+000A, so (cr lf) is CRLF *)
+cr ::= #xD ;
+lf ::= #xA ;
+eol ::= cr | lf | (cr lf) ;
+
+(* Terminal punctuation marks *)
+period ::= '.' ;
+colon ::= ':' ;
+comma ::= ',' ;
+semicolon ::= ';' ;
+exclamation ::= '!' ;
+question ::= '?' ;
+aposiopesis ::= s* ('-' '-' '-'? | period period period) s*;
+
+(* Phrase and sentence terminators *)
+terminator ::= period | exclamation | question | aposiopesis |
+ comma | colon | semicolon ;
+
+(* A word is one or more consecutive characters *)
+word ::= { c }+ ;
+
+(* TODO: DEFINE WORDS IN TERMS OF CONTRACTION CLASSES (inner, outer, etc.) *)
+began ::= single_quote word ;
+ended ::= word single_quote ;
+inner ::= word (single_quote word)+ ;
+outer ::= single_quote word single_quote ;
+
+(* Ensure quotes can be found in the parse tree *)
+quote ::= '"' words '"' ;
+(* Single definition: bracketed words, a quote, or a run of separated words *)
+words ::= '(' words ')' | '[' words ']' | '{' words '}' | quote |
+ word (s word)* ;
+
+(* Stixi is Greek, meaning punctuation *)
+stixi ::= words terminator ;
+stixi_d_quote ::= '"' stixi '"' ;
+sentence ::= '(' stixi ')'
+ | '[' stixi ']'
+ | '{' stixi '}'
+ | stixi_d_quote
+ | stixi ;
+sentences ::= sentence (s sentence)* ;
+paragraph ::= sentences eol eol+ ;
+
+(* Multiline quotation: each paragraph opens with a double quote, and only *)
+(* the final paragraph is followed by a closing double quote *)
+quotation ::= sentences* ('"' paragraph)+ '"' ;
+
src/main/java/com/keenwrite/quotes/SmartQuotes.java
+/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.keenwrite.quotes;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.AbstractMap.SimpleEntry;
+import java.util.ArrayList;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Objects;
+import java.util.function.Function;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static java.util.Arrays.binarySearch;
+import static java.util.Collections.sort;
+
+/**
+ * Responsible for converting straight quotes into smart quotes. This must be
+ * used on plain text. The class will not parse HTML, TeX, or non-English text.
+ */
+public class SmartQuotes {
+
+ /**
+ * Named, pre-compiled regular expressions, keyed by the name used to look
+ * them up. The terms inner, outer, began, and ended define where the
+ * apostrophes can be found in a particular word:
+ *
+ * <ul>
+ * <li>inner: runs of letters joined by apostrophes, with no leading or
+ * trailing apostrophe, and not adjacent to another letter or
+ * apostrophe;</li>
+ * <li>began: one or more groups of an apostrophe followed by letters, not
+ * preceded by a letter and not followed by a letter or apostrophe;</li>
+ * <li>ended: one or more groups of letters followed by an apostrophe, not
+ * preceded by a letter or apostrophe and not followed by a letter;</li>
+ * <li>outer: letters wrapped in a leading and a trailing apostrophe, not
+ * adjacent to another letter;</li>
+ * <li>years: an apostrophe that is followed by two digits and an optional
+ * "s"; only the apostrophe itself is part of the match;</li>
+ * <li>+ings: two or more letters followed by "in'" and at most one
+ * whitespace character;</li>
+ * <li>prime: a number (optional minus sign, digits, optional decimal
+ * fraction) in one of three forms: number, apostrophe, number, double
+ * quote (each mark optionally preceded by a backslash); two numbers each
+ * followed by two apostrophes or a double quote and separated by "x" or
+ * the multiplication sign; or a single number followed by two
+ * apostrophes;</li>
+ * <li>texop: two consecutive backticks;</li>
+ * <li>texcl: two consecutive apostrophes;</li>
+ * <li>white: a straight double quote; no lookup of this key is visible in
+ * this class;</li>
+ * <li>slash: a backslash followed by a straight double quote.</li>
+ * </ul>
+ *
+ * The following text contains 3 word matches against the "inner" pattern:
+ *
+ * <p>
+ * 'Janes' said, ''E'll be spooky, Sam's son with the jack-o'-lantern!',"
+ * said the O'Mally twins'---y'know---ghosts in unison.'
+ * </p>
+ */
+ private static final Map<String, Pattern> PATTERNS = Map.ofEntries(
+ // @formatter:off
+ createEntry( "inner", "(?<![\\p{L}'])(?:\\p{L}+')+\\p{L}+(?![\\p{L}'])" ),
+ createEntry( "began", "(?<!\\p{L})(?:'\\p{L}+)+(?![\\p{L}'])" ),
+ createEntry( "ended", "(?<![\\p{L}'])(?:\\p{L}+')+(?!\\p{L})" ),
+ createEntry( "outer", "(?<!\\p{L})'\\p{L}+'(?!\\p{L})" ),
+ createEntry( "years", "'(?=\\d{2}s?)" ),
+ createEntry( "+ings", "[\\p{L}]{2,}in'\\s?" ),
+ createEntry( "prime", "((-?[0-9]\\d*(\\.\\d+)?)\\\\?'\\s?(-?[0-9]\\d*(\\.\\d+)?)\\\\?\")|((-?[0-9]\\d*(\\.\\d+)?)(''|\")\\s?(x|×)\\s?(-?[0-9]\\d*(\\.\\d+)?)(''|\"))|((-?[0-9]\\d*(\\.\\d+)?)'')" ),
+ createEntry( "texop", "``" ),
+ createEntry( "texcl", "''" ),
+ createEntry( "white", "(?!\\s+)\"|\"(?!\\s+)" ),
+ createEntry( "slash", "\\\\\"" )
+ // @formatter:on
+ );
+
+ private static SimpleEntry<String, Pattern> createEntry(
+ final String key, final String regex ) {
+ return new SimpleEntry<>( key, Pattern.compile( regex ) );
+ }
+
+ /**
+ * Left single quote replacement text.
+ */
+ private static final String QUOTE_SINGLE_LEFT = "&lsquo;";
+
+ /**
+ * Right single quote replacement text.
+ */
+ private static final String QUOTE_SINGLE_RIGHT = "&rsquo;";
+
+ /**
+ * Left double quote replacement text.
+ */
+ private static final String QUOTE_DOUBLE_LEFT = "&ldquo;";
+
+ /**
+ * Right double quote replacement text.
+ */
+ private static final String QUOTE_DOUBLE_RIGHT = "&rdquo;";
+
+ /**
+ * Apostrophe replacement text.
+ */
+ private static final String APOSTROPHE = "&apos;";
+
+ /**
+ * Prime replacement text.
+ */
+ private static final String SINGLE_PRIME = "&prime;";
+
+ /**
+ * Double prime replacement text.
+ */
+ private static final String DOUBLE_PRIME = "&Prime;";
+
+ /**
+ * Temporary single quote marker near end of Unicode private use area.
+ */
+ private static final String SQ = "\uF8FE";
+
+ /**
+ * Temporary double quote marker near end of Unicode private use area.
+ */
+ private static final String DQ = "\uF8FD";
+
+ /**
+ * Word lists keyed by resource name prefix. Each value is the sorted array
+ * of lines that {@link #load(String)} reads from the resource named by the
+ * prefix with ".txt" appended. This is an instance field, so the lists are
+ * read again for every new instance of this class.
+ */
+ private final Map<String, String[]> CONTRACTIONS = Map.ofEntries(
+ load( "inner" ),
+ load( "began" ),
+ load( "ended" ),
+ load( "outer" ),
+ load( "verbs" )
+ );
+
+ public SmartQuotes() {
+ }
+
+  /**
+   * Runs the given text through every replacement pass, in a fixed order:
+   * known contractions, verb endings, primes, decades, words ending in
+   * "in'", double backticks, and finally escaped double quotes. Each pass
+   * receives the output of the pass before it.
+   *
+   * @param text The text that may contain straight single or double quotes.
+   * @return The text after all replacement passes have been applied.
+   */
+  public String replace( String text ) {
+    // Known contractions first, then miscellaneous verb contractions.
+    final var contracted = verbs( contractions( text ) );
+
+    // Primes and double-primes (e.g., 5'4").
+    final var measured = primes( contracted );
+
+    // Decade contractions, then words ending in "ing" (e.g., washin').
+    final var abbreviated = suffixes( decades( measured ) );
+
+    // Double backticks, then unescape straight double quotes.
+    return escapes( backticks( abbreviated ) );
+  }
+
+  /**
+   * Replaces all strings in the given text that match the given pattern,
+   * provided the filter answers {@code true} to the matched text. Matches
+   * that the filter rejects are left in the text unchanged.
+   *
+   * @param text    The text to perform a replacement.
+   * @param pattern The regular expression pattern to match.
+   * @param filter  Controls whether a text replacement is made.
+   * @param subst   Produces the replacement text for an accepted match; the
+   *                result is inserted literally.
+   * @return The given text with matching patterns replaced, conditionally.
+   */
+  private String replace( final String text,
+                          final Pattern pattern,
+                          final Function<String, Boolean> filter,
+                          final Function<String, String> subst ) {
+    final var sb = new StringBuilder( text.length() * 2 );
+    final var matcher = pattern.matcher( text );
+
+    while( matcher.find() ) {
+      final var match = matcher.group();
+
+      if( filter.apply( match ) ) {
+        // Quote the substitution so that any '$' or '\' it contains is
+        // inserted as-is, rather than interpreted as a group reference.
+        final var literal = Matcher.quoteReplacement( subst.apply( match ) );
+        matcher.appendReplacement( sb, literal );
+      }
+    }
+
+    // Copies everything after the last replacement, including any matches
+    // that the filter rejected.
+    matcher.appendTail( sb );
+    return sb.toString();
+  }
+
+ /**
+ * Convenience method that always performs string replacement upon a match,
+ * unconditionally.
+ */
+ private String apostrophize( final String text, final Pattern pattern ) {
+ return apostrophize( text, pattern, ( match ) -> true );
+ }
+
+ private String apostrophize( final String text, final String pattern ) {
+ return apostrophize( text, PATTERNS.get( pattern ) );
+ }
+
+  /**
+   * Replaces the straight quote in every match of the "years" pattern.
+   */
+  private String decades( final String text ) {
+    return apostrophize( text, PATTERNS.get( "years" ) );
+  }
+
+  /**
+   * Replaces the straight quote in every match of the "+ings" pattern.
+   */
+  private String suffixes( final String text ) {
+    return apostrophize( text, PATTERNS.get( "+ings" ) );
+  }
+
+ /**
+ * Convenience method that replaces each straight quote in the given {@code
+ * text} that passes through the given filter with an {@link #APOSTROPHE}.
+ */
+ private String apostrophize(
+ final String text,
+ final Pattern pattern,
+ final Function<String, Boolean> filter ) {
+ return replace(
+ text,
+ pattern,
+ filter,
+ ( match ) -> match.replaceAll( "'", APOSTROPHE ) );
+ }
+
+  /**
+   * Replaces the straight quotes in words that appear in the contraction
+   * lists. The "inner", "began", "ended", and "outer" patterns are applied
+   * in that order; a match is only changed when its lowercase form is found
+   * in the list of the same name.
+   *
+   * @param text The text to replace.
+   * @return The given text with listed contractions replaced.
+   */
+  private String contractions( String text ) {
+    final var elements = new String[]{"inner", "began", "ended", "outer"};
+
+    for( final var item : elements ) {
+      final var pattern = PATTERNS.get( item );
+      final var contractions = CONTRACTIONS.get( item );
+
+      // Lowercase with a fixed locale so that the lookup does not depend on
+      // the platform's default locale (e.g., Turkish dotless i).
+      text = apostrophize(
+        text,
+        pattern,
+        ( match ) ->
+          binarySearch( contractions, match.toLowerCase( Locale.ROOT ) ) >= 0
+      );
+    }
+
+    return text;
+  }
+
+  /**
+   * Replaces verb endings, such as 'll and 've, on words not explicitly
+   * listed as contractions in the dictionary sources. Each entry of the
+   * "verbs" list is appended, as regular expression text, to a pattern that
+   * matches one or more letters.
+   *
+   * @param text The text to replace.
+   * @return The given text with matching patterns replaced.
+   */
+  private String verbs( String text ) {
+    final var endings = CONTRACTIONS.get( "verbs" );
+    var result = text;
+
+    for( var i = 0; i < endings.length; i++ ) {
+      final var regex = "[\\p{L}]+" + endings[ i ];
+      result = apostrophize( result, Pattern.compile( regex ) );
+    }
+
+    return result;
+  }
+
+  /**
+   * Replaces the quote marks in every match of the "prime" pattern: two
+   * consecutive straight single quotes and straight double quotes become
+   * {@link #DOUBLE_PRIME}, remaining straight single quotes become
+   * {@link #SINGLE_PRIME}, and backslashes are removed.
+   *
+   * @param text The text that may contain measurements (e.g., 5'4").
+   * @return The given text with measurement marks replaced.
+   */
+  private String primes( final String text ) {
+    return replace(
+      text,
+      PATTERNS.get( "prime" ),
+      ( match ) -> true,
+      // Pairs of single quotes must be replaced before lone single quotes.
+      ( match ) -> match.replaceAll( "''", DOUBLE_PRIME )
+                        .replaceAll( "\"", DOUBLE_PRIME )
+                        .replaceAll( "'", SINGLE_PRIME )
+                        .replaceAll( "\\\\", "" )
+    );
+  }
+
+  /**
+   * Replaces every pair of backticks with an opening double quote. When at
+   * least one pair was found, pairs of straight single quotes are then
+   * replaced with a closing double quote, at most once per opening quote
+   * that was inserted.
+   */
+  private String backticks( String text ) {
+    final var opened = new StringBuilder( text.length() * 2 );
+    final var opener = PATTERNS.get( "texop" ).matcher( text );
+    var remaining = 0;
+
+    while( opener.find() ) {
+      opener.appendReplacement( opened, QUOTE_DOUBLE_LEFT );
+      remaining++;
+    }
+
+    opener.appendTail( opened );
+
+    // Without an opening quote there is nothing to close.
+    if( remaining == 0 ) {
+      return opened.toString();
+    }
+
+    final var closed = new StringBuilder( text.length() * 2 );
+    final var closer = PATTERNS.get( "texcl" ).matcher( opened.toString() );
+
+    for( ; remaining > 0 && closer.find(); remaining-- ) {
+      closer.appendReplacement( closed, QUOTE_DOUBLE_RIGHT );
+    }
+
+    closer.appendTail( closed );
+    return closed.toString();
+  }
+
+ private String escapes( final String text ) {
+ return replace(
+ text,
+ PATTERNS.get( "slash" ),
+ ( match ) -> true,
+ ( match ) -> match.replaceAll( "\\\\", "" )
+ );
+ }
+
+  /**
+   * Reads every line of the resource named by the given prefix with ".txt"
+   * appended, and sorts the lines.
+   *
+   * @param prefix The resource file name without its extension.
+   * @return An entry mapping the prefix to the sorted lines.
+   * @throws RuntimeException Wraps any exception raised while reading.
+   */
+  @SuppressWarnings( "SameParameterValue" )
+  private SimpleEntry<String, String[]> load( final String prefix ) {
+    // Initial capacity intended to hold all the contractions.
+    final var words = new ArrayList<String>( 1024 );
+
+    try( final var reader = openResource( prefix + ".txt" ) ) {
+      var line = reader.readLine();
+
+      while( line != null ) {
+        words.add( line );
+        line = reader.readLine();
+      }
+
+      sort( words );
+    } catch( final Exception ex ) {
+      throw new RuntimeException( ex );
+    }
+
+    final var sorted = words.toArray( new String[ 0 ] );
+    return new SimpleEntry<>( prefix, sorted );
+  }
+
+  /**
+   * Opens a resource, located relative to this class, for reading as UTF-8
+   * text.
+   *
+   * @param filename The name of the resource to open.
+   * @return A buffered reader over the resource; the caller must close it.
+   * @throws NullPointerException The resource could not be found.
+   */
+  private BufferedReader openResource( final String filename ) {
+    // Checked explicitly because assertions are disabled by default.
+    final var in = Objects.requireNonNull(
+      getClass().getResourceAsStream( filename ),
+      "Missing resource: " + filename );
+
+    // Decode with a fixed charset rather than the platform default.
+    return new BufferedReader(
+      new InputStreamReader( in, StandardCharsets.UTF_8 ) );
+  }
+}
src/test/java/com/keenwrite/quotes/SmartQuotesTest.java
+/* Copyright 2020-2021 White Magic Software, Ltd. -- All rights reserved. */
+package com.keenwrite.quotes;
+
+import org.junit.jupiter.api.Test;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+/**
+ * Test that English straight quotes are converted to curly quotes and
+ * apostrophes.
+ */
+public class SmartQuotesTest {
+  /**
+   * Reads the fixture as couplets: the first line of each couplet is the
+   * input and the second is the expected output. Blank lines and lines that
+   * start with "#" are skipped.
+   */
+  @Test
+  public void test_Parse_StraightQuotes_CurlyQuotes() throws IOException {
+    final var fixer = new SmartQuotes();
+
+    try( final var reader = openResource( "smartypants.txt" ) ) {
+      // Holds the first line of a couplet until its second line is read.
+      String input = null;
+
+      for( var line = reader.readLine();
+           line != null;
+           line = reader.readLine() ) {
+        if( line.isBlank() || line.startsWith( "#" ) ) {
+          continue;
+        }
+
+        if( input == null ) {
+          input = line;
+        }
+        else {
+          // The current line is the expected output for the pending input.
+          assertEquals( line, fixer.replace( input ) );
+          input = null;
+        }
+      }
+    }
+  }
+
+  /**
+   * Opens a test resource, located relative to this class, for reading as
+   * UTF-8 text. Fails the test if the resource cannot be found.
+   *
+   * @param filename The name of the resource to open.
+   * @return A buffered reader over the resource; the caller must close it.
+   */
+  @SuppressWarnings( "SameParameterValue" )
+  private BufferedReader openResource( final String filename ) {
+    final var is = getClass().getResourceAsStream( filename );
+    assertNotNull( is );
+
+    // The fixture contains non-ASCII characters, so decode with a fixed
+    // charset rather than the platform default.
+    return new BufferedReader( new InputStreamReader(
+      is, java.nio.charset.StandardCharsets.UTF_8 ) );
+  }
+}
src/test/resources/com/keenwrite/quotes/smartypants.txt
+# ########################################################################
+# Decades
+# ########################################################################
+The Roaring '20s had the best music, no?
+The Roaring &apos;20s had the best music, no?
+
+Took place in '04, yes'm!
+Took place in &apos;04, yes&apos;m!
+
+# ########################################################################
+# Inside contractions (no leading/trailing apostrophes)
+# ########################################################################
+I don't like it: I love's it!
+I don&apos;t like it: I love&apos;s it!
+
+We'd've thought that pancakes'll be sweeter there.
+We&apos;d&apos;ve thought that pancakes&apos;ll be sweeter there.
+
+She'd be coming o'er when the horse'd gone to pasture...
+She&apos;d be coming o&apos;er when the horse&apos;d gone to pasture...
+
+# ########################################################################
+# Beginning contractions (leading apostrophes)
+# ########################################################################
+'Twas and 'tis whate'er lay 'twixt dawn and dusk 'n River Styx.
+&apos;Twas and &apos;tis whate&apos;er lay &apos;twixt dawn and dusk &apos;n River Styx.
+
+# ########################################################################
+# Ending contractions (trailing apostrophes)
+# ########################################################################
+Didn' get th' message.
+Didn&apos; get th&apos; message.
+
+Namsayin', y'know what I'ma sayin'?
+Namsayin&apos;, y&apos;know what I&apos;ma sayin&apos;?
+
+# ########################################################################
+# Outside contractions (leading and trailing, no middle)
+# ########################################################################
+Salt 'n' vinegar, fish-'n'-chips, sugar 'n' spice!
+Salt &apos;n&apos; vinegar, fish-&apos;n&apos;-chips, sugar &apos;n&apos; spice!
+
+# ########################################################################
+# Primes (single, double)
+# ########################################################################
+She stood 5\'7\".
+She stood 5&prime;7&Prime;.
+
+# No space after the feet sign.
+It's 4'11" away.
+It&apos;s 4&prime;11&Prime; away.
+
+Alice's friend is 6'3" tall.
+Alice&apos;s friend is 6&prime;3&Prime; tall.
+
+Bob's table is 5'' × 4''.
+Bob&apos;s table is 5&Prime; × 4&Prime;.
+
+What's this -5.5'' all about?
+What&apos;s this -5.5&Prime; all about?
+
++7.9'' is weird.
++7.9&Prime; is weird.
+
+Foolscap? Naw, I use 11.5"x14.25" paper!
+Foolscap? Naw, I use 11.5&Prime;x14.25&Prime; paper!
+
+An angular measurement, 3° 5' 30" means 3 degs, 5 arcmins, and 30 arcsecs.
+An angular measurement, 3° 5&prime; 30&Prime; means 3 degs, 5 arcmins, and 30 arcsecs.
+
+# ########################################################################
+# Backticks (left and right double quotes)
+# ########################################################################
+``I am Sam''
+&ldquo;I am Sam&rdquo;
+
+``Sam's away today''
+&ldquo;Sam&apos;s away today&rdquo;
+
+``Sam's gone!
+&ldquo;Sam&apos;s gone!
+
+``5'10" tall 'e was!''
+&ldquo;5&prime;10&Prime; tall &apos;e was!&rdquo;
+
+# ########################################################################
+# Consecutive quotes
+# ########################################################################
+"'I'm trouble.'"
+&ldquo;&lsquo;I&apos;m trouble.&rsquo;&rdquo;
+
+'"Trouble's my name."'
+&lsquo;&ldquo;Trouble&apos;s my name.&rdquo;&rsquo;
+
+# ########################################################################
+# Escaped quotes
+# ########################################################################
+\"What?\"
+&ldquo;What?&rdquo;
+
+# ########################################################################
+# Double quotes
+# ########################################################################
+"I am Sam"
+&ldquo;I am Sam&rdquo;
+
+"...even better!"
+&ldquo;...even better!&rdquo;
+
+"It was so," said he.
+&ldquo;It was so,&rdquo; said he.
+
+"She said, 'Llamas'll languish, they'll--
+&ldquo;She said, &lsquo;Llamas&apos;ll languish, they&apos;ll--
+
+With "air quotes" in the middle.
+With &ldquo;air quotes&rdquo; in the middle.
+
+With--"air quotes"--and dashes.
+With--&ldquo;air quotes&rdquo;--and dashes.
+
+"Not "quite" what you expected?"
+&ldquo;Not &ldquo;quite&rdquo; what you expected?&rdquo;
+
+# ########################################################################
+# Nested quotations
+# ########################################################################
+"'Here I am,' said Sam"
+&ldquo;&lsquo;Here I am,&rsquo; said Sam&rdquo;
+
+'"Here I am," said Sam'
+&lsquo;&ldquo;Here I am,&rdquo; said Sam&rsquo;
+
+'Hello, "Dr. Brown," what's your real name?'
+&lsquo;Hello, &ldquo;Dr. Brown,&rdquo; what&apos;s your real name?&rsquo;
+
+"'Twas, t'wasn't thy name, 'twas it?" said Jim "the Barber" Brown.
+&ldquo;&apos;Twas, t&apos;wasn&apos;t thy name, &apos;twas it?&rdquo; said Jim &ldquo;the Barber&rdquo; Brown.
+
+# ########################################################################
+# Single quotes
+# ########################################################################
+'I am Sam'
+&lsquo;I am Sam&rsquo;
+
+'It was so,' said he.
+&lsquo;It was so,&rsquo; said he.
+
+'...even better!'
+&lsquo;...even better!&rsquo;
+
+With 'quotes' in the middle.
+With &lsquo;quotes&rsquo; in the middle.
+
+With--'imaginary'--dashes.
+With--&lsquo;imaginary&rsquo;--dashes.
+
+'Not 'quite' what you expected?'
+&lsquo;Not &lsquo;quite&rsquo; what you expected?&rsquo;
+
+''Cause I don't like it, 's why,' said Pat.
+&lsquo;&apos;Cause I don&apos;t like it, &apos;s why,&rsquo; said Pat.
+
+'It's a beautiful day!'
+&lsquo;It&apos;s a beautiful day!&rsquo;
+
+'He said, 'Thinkin'.'
+&lsquo;He said, &lsquo;Thinkin&rsquo;.&rsquo;
+
+# ########################################################################
+# Possessives
+# ########################################################################
+Sam's Sams' and the Ross's roses' thorns were prickly.
+Sam&apos;s Sams&apos; and the Ross&apos;s roses&apos; thorns were prickly.
+
+# ########################################################################
+# Mixed
+# ########################################################################
+"I heard she said, 'That's Sam's'," said the Sams' cat.
+&ldquo;I heard she said, &lsquo;That&apos;s Sam&apos;s&rsquo;,&rdquo; said the Sams&apos; cat.
+
+"'Janes' said, ''E'll be spooky, Sam's son with the jack-o'-lantern!'" said the O'Mally twins'---y'know---ghosts in unison.
+&ldquo;&lsquo;Janes&apos; said, &lsquo;&apos;E&apos;ll be spooky, Sam&apos;s son with the jack-o&apos;-lantern!&rsquo;&rdquo; said the O&apos;Mally twins&apos;---y&apos;know---ghosts in unison.
+
+
+He said, 'Hello, Ross' friend who likes saying to Ross' cat, 'That's Ross' and then pretends to sleep.
+
+'He's at Sams'
+'Oh crud, the cat just jumped out the window!'
+Open quote always preceded by whitespace or quote.
Delta 642 lines added, 11 lines removed, 631-line increase