Dave Jarvis' Repositories

Author	djarvis <email>
Date	2018-12-17 23:57:46 GMT-0800
Commit	1030ac51e5a9c9657b0d133769be8fc5b3191fb7

Delta	1188 lines added, 1 line removed, 1187-line increase

LICENSE.txt

		+ Apache License
		+ Version 2.0, January 2004
		+ http://www.apache.org/licenses/
		+
		+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
		+
		+ 1. Definitions.
		+
		+ "License" shall mean the terms and conditions for use, reproduction,
		+ and distribution as defined by Sections 1 through 9 of this document.
		+
		+ "Licensor" shall mean the copyright owner or entity authorized by
		+ the copyright owner that is granting the License.
		+
		+ "Legal Entity" shall mean the union of the acting entity and all
		+ other entities that control, are controlled by, or are under common
		+ control with that entity. For the purposes of this definition,
		+ "control" means (i) the power, direct or indirect, to cause the
		+ direction or management of such entity, whether by contract or
		+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
		+ outstanding shares, or (iii) beneficial ownership of such entity.
		+
		+ "You" (or "Your") shall mean an individual or Legal Entity
		+ exercising permissions granted by this License.
		+
		+ "Source" form shall mean the preferred form for making modifications,
		+ including but not limited to software source code, documentation
		+ source, and configuration files.
		+
		+ "Object" form shall mean any form resulting from mechanical
		+ transformation or translation of a Source form, including but
		+ not limited to compiled object code, generated documentation,
		+ and conversions to other media types.
		+
		+ "Work" shall mean the work of authorship, whether in Source or
		+ Object form, made available under the License, as indicated by a
		+ copyright notice that is included in or attached to the work
		+ (an example is provided in the Appendix below).
		+
		+ "Derivative Works" shall mean any work, whether in Source or Object
		+ form, that is based on (or derived from) the Work and for which the
		+ editorial revisions, annotations, elaborations, or other modifications
		+ represent, as a whole, an original work of authorship. For the purposes
		+ of this License, Derivative Works shall not include works that remain
		+ separable from, or merely link (or bind by name) to the interfaces of,
		+ the Work and Derivative Works thereof.
		+
		+ "Contribution" shall mean any work of authorship, including
		+ the original version of the Work and any modifications or additions
		+ to that Work or Derivative Works thereof, that is intentionally
		+ submitted to Licensor for inclusion in the Work by the copyright owner
		+ or by an individual or Legal Entity authorized to submit on behalf of
		+ the copyright owner. For the purposes of this definition, "submitted"
		+ means any form of electronic, verbal, or written communication sent
		+ to the Licensor or its representatives, including but not limited to
		+ communication on electronic mailing lists, source code control systems,
		+ and issue tracking systems that are managed by, or on behalf of, the
		+ Licensor for the purpose of discussing and improving the Work, but
		+ excluding communication that is conspicuously marked or otherwise
		+ designated in writing by the copyright owner as "Not a Contribution."
		+
		+ "Contributor" shall mean Licensor and any individual or Legal Entity
		+ on behalf of whom a Contribution has been received by Licensor and
		+ subsequently incorporated within the Work.
		+
		+ 2. Grant of Copyright License. Subject to the terms and conditions of
		+ this License, each Contributor hereby grants to You a perpetual,
		+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
		+ copyright license to reproduce, prepare Derivative Works of,
		+ publicly display, publicly perform, sublicense, and distribute the
		+ Work and such Derivative Works in Source or Object form.
		+
		+ 3. Grant of Patent License. Subject to the terms and conditions of
		+ this License, each Contributor hereby grants to You a perpetual,
		+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
		+ (except as stated in this section) patent license to make, have made,
		+ use, offer to sell, sell, import, and otherwise transfer the Work,
		+ where such license applies only to those patent claims licensable
		+ by such Contributor that are necessarily infringed by their
		+ Contribution(s) alone or by combination of their Contribution(s)
		+ with the Work to which such Contribution(s) was submitted. If You
		+ institute patent litigation against any entity (including a
		+ cross-claim or counterclaim in a lawsuit) alleging that the Work
		+ or a Contribution incorporated within the Work constitutes direct
		+ or contributory patent infringement, then any patent licenses
		+ granted to You under this License for that Work shall terminate
		+ as of the date such litigation is filed.
		+
		+ 4. Redistribution. You may reproduce and distribute copies of the
		+ Work or Derivative Works thereof in any medium, with or without
		+ modifications, and in Source or Object form, provided that You
		+ meet the following conditions:
		+
		+ (a) You must give any other recipients of the Work or
		+ Derivative Works a copy of this License; and
		+
		+ (b) You must cause any modified files to carry prominent notices
		+ stating that You changed the files; and
		+
		+ (c) You must retain, in the Source form of any Derivative Works
		+ that You distribute, all copyright, patent, trademark, and
		+ attribution notices from the Source form of the Work,
		+ excluding those notices that do not pertain to any part of
		+ the Derivative Works; and
		+
		+ (d) If the Work includes a "NOTICE" text file as part of its
		+ distribution, then any Derivative Works that You distribute must
		+ include a readable copy of the attribution notices contained
		+ within such NOTICE file, excluding those notices that do not
		+ pertain to any part of the Derivative Works, in at least one
		+ of the following places: within a NOTICE text file distributed
		+ as part of the Derivative Works; within the Source form or
		+ documentation, if provided along with the Derivative Works; or,
		+ within a display generated by the Derivative Works, if and
		+ wherever such third-party notices normally appear. The contents
		+ of the NOTICE file are for informational purposes only and
		+ do not modify the License. You may add Your own attribution
		+ notices within Derivative Works that You distribute, alongside
		+ or as an addendum to the NOTICE text from the Work, provided
		+ that such additional attribution notices cannot be construed
		+ as modifying the License.
		+
		+ You may add Your own copyright statement to Your modifications and
		+ may provide additional or different license terms and conditions
		+ for use, reproduction, or distribution of Your modifications, or
		+ for any such Derivative Works as a whole, provided Your use,
		+ reproduction, and distribution of the Work otherwise complies with
		+ the conditions stated in this License.
		+
		+ 5. Submission of Contributions. Unless You explicitly state otherwise,
		+ any Contribution intentionally submitted for inclusion in the Work
		+ by You to the Licensor shall be under the terms and conditions of
		+ this License, without any additional terms or conditions.
		+ Notwithstanding the above, nothing herein shall supersede or modify
		+ the terms of any separate license agreement you may have executed
		+ with Licensor regarding such Contributions.
		+
		+ 6. Trademarks. This License does not grant permission to use the trade
		+ names, trademarks, service marks, or product names of the Licensor,
		+ except as required for reasonable and customary use in describing the
		+ origin of the Work and reproducing the content of the NOTICE file.
		+
		+ 7. Disclaimer of Warranty. Unless required by applicable law or
		+ agreed to in writing, Licensor provides the Work (and each
		+ Contributor provides its Contributions) on an "AS IS" BASIS,
		+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
		+ implied, including, without limitation, any warranties or conditions
		+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
		+ PARTICULAR PURPOSE. You are solely responsible for determining the
		+ appropriateness of using or redistributing the Work and assume any
		+ risks associated with Your exercise of permissions under this License.
		+
		+ 8. Limitation of Liability. In no event and under no legal theory,
		+ whether in tort (including negligence), contract, or otherwise,
		+ unless required by applicable law (such as deliberate and grossly
		+ negligent acts) or agreed to in writing, shall any Contributor be
		+ liable to You for damages, including any direct, indirect, special,
		+ incidental, or consequential damages of any character arising as a
		+ result of this License or out of the use or inability to use the
		+ Work (including but not limited to damages for loss of goodwill,
		+ work stoppage, computer failure or malfunction, or any and all
		+ other commercial damages or losses), even if such Contributor
		+ has been advised of the possibility of such damages.
		+
		+ 9. Accepting Warranty or Additional Liability. While redistributing
		+ the Work or Derivative Works thereof, You may choose to offer,
		+ and charge a fee for, acceptance of support, warranty, indemnity,
		+ or other liability obligations and/or rights consistent with this
		+ License. However, in accepting such obligations, You may act only
		+ on Your own behalf and on Your sole responsibility, not on behalf
		+ of any other Contributor, and only if You agree to indemnify,
		+ defend, and hold each Contributor harmless for any liability
		+ incurred by, or claims asserted against, such Contributor by reason
		+ of your accepting any such warranty or additional liability.
		+
		+ END OF TERMS AND CONDITIONS
		+

README.txt

		+ ================================================
		+ Word Split: Text Segmentation Tool
		+ ================================================
		+
		+Word Split is a Java application for Java software developers. The application
		+provides a way to split conjoined words that lack punctuation into separate
		+words. The software is intended to split database column names into their
		+human-readable equivalent text.
		+
		+Word Split takes the following input files:
		+ - a probability lexicon, one word and probability per line (CSV format)
		+ - a list of conjoined phrases, one per line
		+
		+Word Split will use the lexicon to separate the list of conjoined phrases.
		+The resulting segmented phrases are written to standard output.
		+
		+ ---------------------------------------------
		+ Contents
		+ ---------------------------------------------
		+
		+This release includes:
		+
		+ - README.txt This file
		+ - LICENSE.txt License for Word Split
		+
		+ - version.properties Build version
		+ - build.xml Build instructions for Ant
		+ - demos Example lexicons and conjoined files
		+ - scripts Corpus and lexicon helper scripts
		+ - scripts/tally-corpus.sh Creates a tally lexicon from a corpus.
		+ - scripts/probability.awk Creates a probability lexicon from tallies
		+
		+ ---------------------------------------------
		+ Requirements
		+ ---------------------------------------------
		+
		+The following software packages are required to compile and run Word Split:
		+
		+ - Java version 1.6 (or greater)
		+ - Ant version 1.8.2 (or greater)
		+
		+ ---------------------------------------------
		+ Installation
		+ ---------------------------------------------
		+
		+Installation is completed by unzipping the archive.
		+

build.xml

		+<?xml version="1.0" encoding="UTF-8"?>
		+<project name="project" default="build"> <!-- Ant build for Word Split; "build" is the default target. -->
		+
		+<property file="version.properties" /> <!-- Supplies build.major.number, build.minor.number and build.revision.number. -->
		+<property
		+ name="build.number"
		+ value="${build.major.number}.${build.minor.number}.${build.revision.number}" />
		+<property name="dir.build" value="build" />
		+<property name="dir.source" value="src" />
		+
		+<property name="file.jar" value="wordsplit.jar" />
		+
		+<property name="compile.debug" value="true"/>
		+<property name="compile.deprecation" value="false"/>
		+<property name="compile.optimize" value="false"/>
		+
		+<target name="version"> <!-- Echoes the build number assembled from version.properties. -->
		+ <echo>Build: ${build.number}</echo>
		+</target>
		+
		+<target name="build"> <!-- Runs clean, compile and jar, in that order. -->
		+ <antcall target="clean" />
		+ <antcall target="compile" />
		+ <antcall target="jar" />
		+</target>
		+
		+<target name="clean"> <!-- Deletes the build directory. -->
		+ <delete dir="${dir.build}" />
		+</target>
		+
		+<target name="compile"> <!-- Bumps the revision number, then compiles ${dir.source} into ${dir.build}. -->
		+ <antcall target="revision"></antcall>
		+ <mkdir dir="${dir.build}" />
		+ <javac
		+ includeantruntime="false"
		+ srcdir="${dir.source}"
		+ destdir="${dir.build}"
		+ debug="${compile.debug}"
		+ deprecation="${compile.deprecation}"
		+ optimize="${compile.optimize}"
		+ />
		+</target>
		+
		+<target name="jar"> <!-- Packages the build directory into ${file.jar} with Main as the entry point. -->
		+ <jar destfile="${dir.build}/${file.jar}" basedir="${dir.build}">
		+ <manifest>
		+ <attribute
		+ name="Main-Class"
		+ value="com.whitemagicsoftware.wordsplit.Main" />
		+ </manifest>
		+ </jar>
		+</target>
		+
		+<target name="dist"> <!-- Only bumps the minor version; no archive is produced here. -->
		+ <antcall target="minor"></antcall>
		+</target>
		+
		+<target name="revision"> <!-- Adds one to build.revision.number in version.properties. -->
		+ <propertyfile file="version.properties">
		+ <entry key="build.revision.number" type="int" operation="+" value="1" pattern="00" />
		+ </propertyfile>
		+</target>
		+
		+<target name="minor"> <!-- Adds one to the minor number and resets the revision number to zero. -->
		+ <propertyfile file="version.properties">
		+ <entry key="build.minor.number" type="int" operation="+" value="1" pattern="00" />
		+ <entry key="build.revision.number" type="int" value="0" pattern="00" />
		+ </propertyfile>
		+</target>
		+
		+<target name="major"> <!-- Adds one to the major number and resets the minor and revision numbers to zero. -->
		+ <propertyfile file="version.properties">
		+ <entry key="build.major.number" type="int" operation="+" value="1" pattern="00" />
		+ <entry key="build.minor.number" type="int" value="0" pattern="00" />
		+ <entry key="build.revision.number" type="int" value="0" pattern="00" />
		+ </propertyfile>
		+</target>
		+
		+<target name="all"> <!-- Adds one to each of the major, minor and revision numbers. -->
		+ <propertyfile file="version.properties">
		+ <entry key="build.major.number" type="int" operation="+" value="1" pattern="00" />
		+ <entry key="build.minor.number" type="int" operation="+" value="1" pattern="00" />
		+ <entry key="build.revision.number" type="int" operation="+" value="1" pattern="00" />
		+ </propertyfile>
		+</target>
		+
		+</project>
		+

demos/en/conjoined.txt

		+payperiodmatchcode
		+labordistributioncodedesc
		+dependentpsrelationships
		+actionendoption
		+actionendoptiondesc
		+addresstype
		+addresstypedesc
		+historytype
		+psaddresstype
		+rolename
		+bankaccountstatus
		+bankaccountstatusdesc
		+bankaccounttype
		+bankaccounttypedesc
		+beneficiaryamount
		+beneficiaryclass
		+beneficiarypercent
		+benefitsubclass
		+beneficiaryclass
		+beneficiaryclassdesc
		+benefitactioncode
		+benefitactioncodedesc
		+benefitagecontrol
		+benefitagecontroldesc
		+ageconrolagelimit
		+ageconrolnoticeperiod

demos/en/lexicon.csv

		+account,1.0
		+action,0.99
		+address,0.98
		+age,0.97
		+amount,0.96
		+bank,0.95
		+beneficiary,0.94
		+benefit,0.93
		+class,0.92
		+subclass,0.905
		+code,0.91
		+control,0.905
		+depend,0.75
		+dependent,0.8
		+desc,0.88
		+distribution,0.87
		+end,0.555
		+history,0.85
		+labor,0.84
		+limit,0.83
		+match,0.82
		+name,0.81
		+notice,0.8
		+option,0.79
		+pay,0.78
		+percent,0.77
		+period,0.76
		+relationship,0.89
		+relationships,0.9
		+role,0.74
		+status,0.73
		+type,0.72
		+ent,0.1

demos/kh/conjoined.txt

		-
		+វាជាការសំខាន់
		+ណាស់សំរាប់
		+បើអ្នកណាឮ
		+សំលេងអញ
		+ហើយមិនដែល
		+ត្រូវជំនុះជំរះ
		+ឡើយគឺបាន
		+កន្លងហួស
		+ពីសេចក្តី
		+នៅពេលព្រះអង្គ
		+បានប្រទាន
		+ជីវិតនេះឱ្យ
		+អ្នកនៅក្នុង
		+អង្គនោះអ្នក
		+បានកើតក្នុង
		+គ្រួសាររបស់
		+ព្រះខាងឯព្រលឹងវិញ្ញាណ
		+ព្រះអង្គជាបិតា
		+របស់អ្នក

demos/kh/lexicon.csv

Binary files differ

run.sh

		+#!/bin/bash
		+
		+MEM_MIN=1024m
		+MEM_MAX=1024m
		+ENCODING=UTF-8
		+
		+if [ -e build/wordsplit.jar ]; then
		+ java -Xmx$MEM_MAX -Xms$MEM_MIN -Dfile.encoding=$ENCODING \
		+ -jar build/wordsplit.jar $1 $2
		+else
		+ echo "To compile Word Split type: ant"
		+fi
		+

scripts/probability.awk

		+# Given a CSV file of the form <text,natural number>, this writes a CSV
		+# file of relative probabilities, based on the maximum natural number found
		+# in the CSV file.
		+#
		+# Based on code by Dennis Williamson.
		+#
		+# Usage: awk -f probability.awk < filename.csv | sort -t, -k2,2n -k1,1
		+#
		+BEGIN {
		+  OFS = FS = ","
		+}
		+
		+# Remember the tally for each word and count the records read.
		+{ a[$1] = $2; n++ }
		+
		+# Track the largest tally. Adding zero forces a numeric comparison even
		+# when the field does not look like a number (for example, a header line).
		+$2 + 0 > max { max = $2 + 0 }
		+
		+END {
		+  # Every probability is relative to the largest tally, so a lexicon with
		+  # no positive tally cannot be scaled; report it instead of dividing by
		+  # zero.
		+  if( n > 0 && max <= 0 ) {
		+    print "probability.awk: no positive tally found" > "/dev/stderr"
		+    exit 1
		+  }
		+
		+  for( word in a ) print word, a[word] / max
		+}

scripts/tally-corpus.sh

		+#!/bin/bash
		+
		+echo Creating word frequency tallies...
		+sed -e 's/ /\n/g' -e 's/[^a-zA-Z\n]//g' corpus.txt \| \
		+ tr [:upper:] [:lower:] \| \
		+ sort \| \
		+ uniq -c \| \
		+ sort -rn > frequency.txt
		+
		+echo Creating lexicon...
		+grep -Fwf dictionary.txt frequency.txt \| awk '{print $2 "," $1}' > lexicon.csv
		+
		+echo Creating lexicon without single-occurrence 3-letter words...
		+grep -v "^[a-z][a-z][a-z],1" lexicon.csv > lexicon-3.csv

src/com/whitemagicsoftware/wordsplit/Combinations.java

		+package com.whitemagicsoftware.wordsplit;
		+
		+import java.util.ArrayList;
		+import java.util.List;
		+import java.util.Map;
		+
		+/**
		+ * An almost generic class for generating all possible combinations of
		+ * values in a list as a list.
		+ */
		+@SuppressWarnings("unchecked")
		+public class Combinations {
		+ private Visitor visitor;
		+ private List<SegmentAnalysis> analysis = new ArrayList<SegmentAnalysis>();
		+
		+ private final static int MAX_DEPTH = 22;
		+
		+ /**
		+ * @param visitor - The class used to examine each possible text segment.
		+ */
		+ public Combinations( Visitor visitor ) {
		+ setVisitor( visitor );
		+ }
		+
		+ /**
		+ * Entry point.
		+ *
		+ * @param initial - The list of possible words that could constitute the
		+ * solution.
		+ */
		+ public List<SegmentAnalysis> root( List initial ) {
		+ clearAnalysis();
		+ root( new ArrayList(), initial, 0 );
		+ return getAnalysis();
		+ }
		+
		+ /**
		+ * Print all subsets of the remaining elements, with given prefix.
		+ */
		+ private void root( List prefix, List remain, int depth ) {
		+ if( remain.size() > 0 && depth < MAX_DEPTH ) {
		+ List combination = new ArrayList( prefix.size() + 1 );
		+ combination.addAll( prefix );
		+ combination.add( remain.get( 0 ) );
		+
		+ addAnalysis( getVisitor().visit( combination ) );
		+
		+ List r = new ArrayList( remain.size() );
		+ r.addAll( remain.subList( 1, remain.size() ) );
		+
		+ root( combination, r, depth + 1 );
		+ root( prefix, r, depth + 1 );
		+ }
		+ }
		+
		+ private void setVisitor( Visitor visitor ) {
		+ this.visitor = visitor;
		+ }
		+
		+ private Visitor getVisitor() {
		+ return this.visitor;
		+ }
		+
		+ private void clearAnalysis() {
		+ getAnalysis().clear();
		+ }
		+
		+ private List<SegmentAnalysis> getAnalysis() {
		+ return this.analysis;
		+ }
		+
		+ private void addAnalysis( SegmentAnalysis sa ) {
		+ getAnalysis().add( sa );
		+ }
		+
		+ /**
		+ * Tests the class.
		+ */
		+ public static void main( String[] args ) {
		+ List<String> list = new ArrayList<String>();
		+ PrintVisitor pv = new PrintVisitor();
		+
		+ list.add( "a" );
		+ list.add( "b" );
		+ list.add( "c" );
		+ list.add( "d" );
		+
		+ Combinations combinations = new Combinations( pv );
		+ combinations.root( list );
		+ }
		+}

src/com/whitemagicsoftware/wordsplit/Main.java

		+package com.whitemagicsoftware.wordsplit;
		+
		+import java.util.AbstractMap;
		+import java.util.ArrayList;
		+import java.util.List;
		+import java.util.Map;
		+import java.util.TreeMap;
		+
		+import java.io.BufferedReader;
		+import java.io.BufferedWriter;
		+import java.io.File;
		+import java.io.FileInputStream;
		+import java.io.InputStreamReader;
		+import java.io.IOException;
		+
		+/**
		+ * Splits concatenated text into a sentence.
		+ */
		+@SuppressWarnings("unchecked")
		+public class Main {
		+  /**
		+   * Default constructor.
		+   */
		+  public Main() {
		+  }
		+
		+  /**
		+   * Writes a line of text to standard output.
		+   */
		+  private static void out( String s ) {
		+    System.out.println( s );
		+  }
		+
		+  /**
		+   * Command-line entry point. Expects exactly two arguments, the lexicon
		+   * file and the file of conjoined phrases, and writes the split phrases
		+   * to standard output. Any other argument count prints the usage text.
		+   * Failures while splitting are reported on standard error.
		+   */
		+  public static void main( String args[] )
		+    throws IOException {
		+    TextSegmenter segmenter = new TextSegmenter();
		+
		+    if( args.length != 2 ) {
		+      out( "com.whitemagicsoftware.wordsplit.Main <lexicon> <conjoined>" );
		+      out( "<lexicon> - CSV file: word,probability" );
		+      out( "<conjoined> - Text file" );
		+      return;
		+    }
		+
		+    File lexicon = new File( args[0] );
		+    File conjoined = new File( args[1] );
		+
		+    try {
		+      segmenter.split( lexicon, conjoined );
		+    }
		+    catch( Exception e ) {
		+      System.err.println( "Error: " + e.getMessage() );
		+      e.printStackTrace();
		+    }
		+  }
		+}
		+

src/com/whitemagicsoftware/wordsplit/PrintVisitor.java

		+package com.whitemagicsoftware.wordsplit;
		+
		+import java.util.List;
		+import java.util.Map;
		+
		+/**
		+ * Responsible for writing out a list of words.
		+ */
		+public class PrintVisitor implements Visitor {
		+ /**
		+ * Default constructor.
		+ */
		+ public PrintVisitor() {
		+ }
		+
		+ /**
		+ * Writes the given parameter to standard output.
		+ *
		+ * @param list - The list of values to write to stdout.
		+ */
		+ public SegmentAnalysis visit( List list ) {
		+ System.out.println( list );
		+ return null;
		+ }
		+}

src/com/whitemagicsoftware/wordsplit/SegmentAnalysis.java

		+package com.whitemagicsoftware.wordsplit;
		+
		+import java.util.List;
		+import java.util.Map;
		+
		+/**
		+ * Stores the details about a possible solution to a concatenated phrase.
		+ * These details allow the TextSegmenter class to determine whether or not
		+ * the solution is the most likely.
		+ */
		+@SuppressWarnings("unchecked")
		+public class SegmentAnalysis {
		+  /** Number of words from the list that were found in the phrase. */
		+  private int wordsUsed;
		+
		+  /** Candidate words (key) with their probabilities (value). */
		+  private List<Map.Entry> words;
		+
		+  /** Text left over after removing the words; empty until it is set. */
		+  private String remaining = "";
		+
		+  /**
		+   * @param words - The candidate words; each key must be a String and
		+   * each value a Double.
		+   */
		+  public SegmentAnalysis( List<Map.Entry> words ) {
		+    setWords( words );
		+  }
		+
		+  /**
		+   * Splits the given word (concatenated text) into multiple words, with
		+   * spaces to separate each word. Each candidate word is matched as
		+   * literal text, so characters that are special in regular expressions
		+   * (such as "." or "$") have no special meaning.
		+   *
		+   * @param concat - The words to split.
		+   * @return The given parameter with spaces in between each word.
		+   */
		+  public StringBuilder apply( String concat ) {
		+    for( Map.Entry entry : getWords() ) {
		+      String word = (String)(entry.getKey());
		+      int index = concat.indexOf( word );
		+
		+      // Surround the first literal occurrence of the word with spaces.
		+      if( index >= 0 ) {
		+        concat = concat.substring( 0, index ) + " " + word + " " +
		+          concat.substring( index + word.length() );
		+      }
		+    }
		+
		+    return new StringBuilder( normalise( concat ) );
		+  }
		+
		+  /**
		+   * @return true if every candidate word was found in the phrase.
		+   */
		+  public boolean matchedAllWords() {
		+    return getWordCount() == getWordsUsed();
		+  }
		+
		+  /**
		+   * @return The number of characters in the normalised remaining text;
		+   * zero if no remaining text has been set.
		+   */
		+  public int length() {
		+    return getRemaining().length();
		+  }
		+
		+  /**
		+   * Removes multiple spaces from inside a string, as well as trimming white
		+   * space from both ends of the string.
		+   *
		+   * @return The value of s with its whitespace normalised.
		+   */
		+  private String normalise( String s ) {
		+    return s.replaceAll( "\\b\\s{2,}\\b", " " ).trim();
		+  }
		+
		+  /**
		+   * @param remaining - The text left over once the candidate words have
		+   * been removed; it is stored with its whitespace normalised.
		+   */
		+  public void setRemaining( String remaining ) {
		+    this.remaining = normalise( remaining );
		+  }
		+
		+  private String getRemaining() {
		+    return this.remaining;
		+  }
		+
		+  private double getWordCount() {
		+    return getWords().size();
		+  }
		+
		+  /**
		+   * @param wordsUsed - How many of the candidate words were found.
		+   */
		+  public void setWordsUsed( int wordsUsed ) {
		+    this.wordsUsed = wordsUsed;
		+  }
		+
		+  private double getWordsUsed() {
		+    return this.wordsUsed;
		+  }
		+
		+  /**
		+   * Returns the product of the probability of each word in this potential
		+   * solution, scaled by the fraction of the words that were used.
		+   *
		+   * @return A number between 0 and 1; zero when there are no words.
		+   */
		+  public double getProbability() {
		+    double count = getWordCount();
		+
		+    // Avoid 0/0 (NaN) when the candidate list is empty.
		+    if( count == 0 ) {
		+      return 0;
		+    }
		+
		+    double probability = 1;
		+
		+    for( Map.Entry entry : getWords() ) {
		+      probability *= ((Double)(entry.getValue())).doubleValue();
		+    }
		+
		+    return probability * (getWordsUsed() / count);
		+  }
		+
		+  private void setWords( List<Map.Entry> words ) {
		+    this.words = words;
		+  }
		+
		+  private List<Map.Entry> getWords() {
		+    return this.words;
		+  }
		+}

src/com/whitemagicsoftware/wordsplit/SegmentVisitor.java

		+package com.whitemagicsoftware.wordsplit;
		+
		+import java.util.List;
		+import java.util.Map;
		+
		+/**
		+ * Called by the Combinations class when a new combination of words has been
		+ * defined (recursively). This class gathers statistics about the list of
		+ * words that are a possible contender for being the solution.
		+ */
		+@SuppressWarnings("unchecked")
		+public class SegmentVisitor implements Visitor {
		+  /** The concatenated phrase against which word lists are checked. */
		+  private String concat;
		+
		+  /**
		+   * @param concat - The concatenated string that was analysed.
		+   */
		+  public SegmentVisitor( String concat ) {
		+    setConcatenated( concat );
		+  }
		+
		+  /**
		+   * Determines the following statistics with respect to the list.
		+   * <ul>
		+   * <li>The number of words used in the list versus in the string.</li>
		+   * <li>The popularity of proposed solution words.</li>
		+   * <li>The number of remaining characters (and words) after removing the
		+   * word list from the concatenated string.</li>
		+   * </ul>
		+   * Words are matched as literal text; the first occurrence of each word
		+   * that is found is replaced by a single space.
		+   *
		+   * @param list - The list of words to examine; each element must be a
		+   * Map.Entry whose key is the word.
		+   * @return The analysis of the list against the concatenated string.
		+   */
		+  public SegmentAnalysis visit( List list ) {
		+    String result = getConcatenated();
		+    int wordsUsed = 0;
		+
		+    for( Object o : list ) {
		+      String word = (String)(((Map.Entry)o).getKey());
		+      int index = result.indexOf( word );
		+
		+      if( index >= 0 ) {
		+        wordsUsed++;
		+
		+        // Cut out exactly the text that indexOf located. A regular
		+        // expression replacement could match different text, or fail
		+        // outright, when the word contains regex metacharacters.
		+        result = result.substring( 0, index ) + " " +
		+          result.substring( index + word.length() );
		+      }
		+    }
		+
		+    SegmentAnalysis analysis = new SegmentAnalysis( list );
		+
		+    analysis.setWordsUsed( wordsUsed );
		+    analysis.setRemaining( result );
		+
		+    return analysis;
		+  }
		+
		+  private void setConcatenated( String concat ) {
		+    this.concat = concat;
		+  }
		+
		+  private String getConcatenated() {
		+    return this.concat;
		+  }
		+}
		+

src/com/whitemagicsoftware/wordsplit/SortableValueMap.java

		+package com.whitemagicsoftware.wordsplit;
		+
		+import java.util.Collections;
		+import java.util.Comparator;
		+import java.util.HashMap;
		+import java.util.Iterator;
		+import java.util.LinkedList;
		+import java.util.LinkedHashMap;
		+import java.util.List;
		+import java.util.Map;
		+
		+/**
		+ * A hashmap that can be sorted by its values.
		+ */
		+public class SortableValueMap<K, V extends Comparable<? super V>>
		+  extends LinkedHashMap<K, V> {
		+  /**
		+   * Creates an empty map.
		+   */
		+  public SortableValueMap() { }
		+
		+  /**
		+   * Creates a map holding the same mappings as the given map.
		+   */
		+  public SortableValueMap( Map<K, V> map ) {
		+    super( map );
		+  }
		+
		+  /**
		+   * Reorders the entries of this map so that iteration runs from the
		+   * largest value to the smallest. Entries with equal values keep their
		+   * relative order.
		+   */
		+  public void sortByValue() {
		+    List<Map.Entry<K, V>> sorted =
		+      new LinkedList<Map.Entry<K, V>>( entrySet() );
		+    Comparator<Map.Entry<K, V>> largestFirst =
		+      new Comparator<Map.Entry<K, V>>() {
		+        public int compare( Map.Entry<K, V> a, Map.Entry<K, V> b ) {
		+          V left = a.getValue();
		+          V right = b.getValue();
		+
		+          return right.compareTo( left );
		+        }
		+      };
		+
		+    Collections.sort( sorted, largestFirst );
		+
		+    // Re-insert in sorted order; insertion order is iteration order.
		+    clear();
		+
		+    Iterator<Map.Entry<K, V>> it = sorted.iterator();
		+
		+    while( it.hasNext() ) {
		+      Map.Entry<K, V> entry = it.next();
		+      put( entry.getKey(), entry.getValue() );
		+    }
		+  }
		+
		+  /**
		+   * Used for debugging: writes a heading followed by each key and value.
		+   */
		+  private static void print( String text, Map<String, Double> map ) {
		+    System.out.println( text );
		+
		+    for( Map.Entry<String, Double> entry : map.entrySet() ) {
		+      System.out.println(
		+        "key/value: " + entry.getKey() + "/" + entry.getValue() );
		+    }
		+  }
		+
		+  /**
		+   * Tests the class by printing a small map before and after sorting.
		+   */
		+  public static void main(String[] args) {
		+    SortableValueMap<String, Double> scores =
		+      new SortableValueMap<String, Double>();
		+
		+    scores.put( "A", 67.5 );
		+    scores.put( "B", 99.5 );
		+    scores.put( "C", 82.4 );
		+    scores.put( "D", 42.0 );
		+
		+    print( "Unsorted map", scores );
		+    scores.sortByValue();
		+    print( "Sorted map", scores );
		+  }
		+}

src/com/whitemagicsoftware/wordsplit/TextSegmenter.java

		+package com.whitemagicsoftware.wordsplit;
		+
		+import java.util.AbstractMap;
		+import java.util.ArrayList;
		+import java.util.List;
		+import java.util.Map;
		+import java.util.TreeMap;
		+
		+import java.io.BufferedReader;
		+import java.io.BufferedWriter;
		+import java.io.File;
		+import java.io.FileInputStream;
		+import java.io.InputStreamReader;
		+import java.io.IOException;
		+
		+/**
		+ * Splits concatenated text into a sentence.
		+ */
		+@SuppressWarnings("unchecked")
		+public class TextSegmenter {
		+ /** Lexical and concatenated entries must be at least 2 characters. */
		+ private static final int MIN_LEX_LENGTH = 2;
		+
		+ /** Words mapped to their probabilities; the tree map keeps them sorted by word. */
		+ private Map<String, Double> dictionary = new TreeMap<String, Double>();
		+
		+ /** List of concatenated words to split, in the order they were added. */
		+ private List<String> concat = new ArrayList<String>();
		+
		+ /**
		+ * Default constructor. The dictionary and the list of concatenated words
		+ * start out empty.
		+ */
		+ public TextSegmenter() {
		+ }
		+
		+ /**
		+ * Helper method.
		+ */
		+ public void split( File lexicon, File concat )
		+ throws IOException {
		+ BufferedReader lex = new BufferedReader(
		+ new InputStreamReader( new FileInputStream( lexicon ) ) );
		+ BufferedReader col = new BufferedReader(
		+ new InputStreamReader( new FileInputStream( concat ) ) );
		+
		+ split( lex, col );
		+
		+ lex.close();
		+ col.close();
		+ }
		+
		+ /**
		+ * Splits the text. Callers must close the streams.
		+ */
		+ public void split( BufferedReader lexicon, BufferedReader concat )
		+ throws IOException {
		+ loadLexicon( lexicon ); // Defined outside this view; presumably fills the dictionary -- TODO confirm.
		+ loadConcat( concat ); // Defined outside this view; presumably fills the list of phrases -- TODO confirm.
		+ split(); // Writes one "phrase,segmented phrase" line per phrase to standard output.
		+ }
		+
		+ /**
		+ * Iterates over all of the concatenated text, splitting each concatenated
		+ * String into English words.
		+ */
		+ private void split() {
		+   // Emit one CSV line per phrase: the original text, then its split form.
		+   for( String phrase : getConcat() ) {
		+     String segmented = segments( phrase );
		+
		+     System.out.printf( "%s,%s\n", phrase, segmented );
		+   }
		+ }
		+
		+ /**
		+ * Returns a number between 0 and 1 that represents how often the word is
		+ * used relative to all the other words in the lexicon.
		+ */
		+ private double getProbability( String s ) {
		+   // A null word is never in the lexicon. (A TreeMap with natural
		+   // ordering rejects null keys, so do not pass one to it.)
		+   if( s == null ) {
		+     return 0.0;
		+   }
		+
		+   // Test for a missing word explicitly rather than unboxing null and
		+   // catching the resulting exception; most lookups are misses.
		+   Double probability = getDictionary().get( s );
		+
		+   return probability == null ? 0.0 : probability.doubleValue();
		+ }
		+
		+ /**
		+ * Splits a concatenated phrase into its constituent words. This will look
		+ * up the words in a dictionary and find the most likely combination that
		+ * satisfies the word segmentation.
		+ *
		+ * @param concat - The phrase without spaces to split into words.
		+ * @return The concat text with spaces.
		+ */
		+ private String segments( String concat ) {
		+ int length = concat.length();
		+ List<Map.Entry<String, Double>> words =
		+ new ArrayList<Map.Entry<String, Double>>();
		+
		+ // Put all the words that exist in the string into a map.
		+ //
		+ for( int i = 0; i < length; i++ ) {
		+ for( int j = 0; j < length - i; j++ ) {
		+ // Word and probability from the lexicon.
		+ //
		+ String w = concat.substring( j, length - i );
		+ double p = getProbability( w );
		+
		+ // Retain words that comprise the concatenated string in order.
		+ //
		+ if( p > 0 ) {
		+ words.add( 0, new AbstractMap.SimpleEntry<String, Double>( w, p ) );
		+ }
		+ }
		+ }
		+
		+ StringBuilder result = new StringBuilder( length * 2 );
		+ StringBuffer joined = new StringBuffer( concat );
		+ int wordCount = words.size();
		+ int wordsUsed = 0;
		+
		+ // If all the words can be accounted for, then the problem is solved.
		+ // If not, then a more complex analysis is required.
		+ //
		+ for( Map.Entry<String, Double> word : words ) {
		+ String w = word.getKey();
		+ int wlen = w.length();
		+ int index = joined.indexOf( w );
		+
		+ wordsUsed++;
		+
		+ if( index == 0 ) {
		+ // The word (w) from the lexicon matched the beginning of
		+ // the concatenated string. Track the word within "result".
		+ //
		+ result.append( w ).append( ' ' );
		+ joined = joined.delete( 0, wlen );
		+ }
		+ else if( index > 0 ) {
		+ // The word (w) from the lexicon matched the concatenated string,
		+ // but not at the beginning.
		+ //
		+ result.append( joined.substring( 0, index ) ).append( ' ' );
		+ joined = joined.delete( 0, index );
		+ }
		+ else {
		+ // The word could not be found within the string, so lower the
		+ // count of the number of words (from the list) that were used
		+ // in this potential solution. The number of words used will be
		+ // checked against the number of words found. If they are not
		+ // equal then a deeper analysis must be performed.
		+ //
		+ wordsUsed--;
		+ }
		+ }
		+
		+ // Tack on the last word that was not accounted for in the loop.
		+ //
		+ result.append( joined );
		+
		+ // The 80% case is when there was a 1:1 match between the concatenated
		+ // text and having found all the suggested words in said text. If there
		+ // was only one possible match, then there is no point performing any
		+ // further analysis.
		+ //
		+ boolean solved = wordCount == wordsUsed;
		+
		+ if( !solved ) {
		+ result.setLength( 0 );
		+
		+ List<SegmentAnalysis> saList = combinations( concat, words );
		+ List<SegmentAnalysis> candidates = new ArrayList<SegmentAnalysis>();
		+
		+ int minLength = Integer.MAX_VALUE;
		+
		+ // Record the candidates with the shortest remaining character
		+ // count (after splitting and removing the most probable words).
		+ // This loop primarily reduces the candidates based on whether all
		+ // the words in one particular combination of words were used and
		+ // each of those words exists in the lexicon.
		+ //
		+ for( SegmentAnalysis sa : saList ) {
		+ if( sa.matchedAllWords() ) {
		+ int saLength = sa.length();
		+
		+ if( saLength < minLength ) {
		+ minLength = saLength;
		+ }
		+
		+ candidates.add( sa );
		+ }
		+ }
		+
		+ // Swap the segment analysis list for the candidate list. This
		+ // step isn't necessary, but it makes the previous loop and any
		+ // subsequent loops operate on the same variables with the same
		+ // meaning: the "candidates" list will shrink until there is only
		+ // one element -- the solution.
		+ //
		+ swap( saList, candidates );
		+
		+ // The solutions that have the fewest remaining letters are the
		+ // ones to keep. The winning solution will be decided by probability.
		+ //
		+ for( SegmentAnalysis sa : saList ) {
		+ if( sa.length() == minLength ) {
		+ candidates.add( sa );
		+ }
		+ }
		+
		+ swap( saList, candidates );
		+
		+ SegmentAnalysis solution = saList.get( 0 );
		+ double maxProbability = Double.MIN_VALUE;
		+
		+ // Find the solution with the highest probability. The probability
		+ // is calculated using the probabilities from the lexicon (which
		+ // are, in turn, used by the SegmentAnalysis instance).
		+ //
		+ for( SegmentAnalysis sa : saList ) {
		+ double probability = sa.getProbability();
		+
		+ if( probability > maxProbability ) {
		+ solution = sa;
		+ maxProbability = probability;
		+ }
		+ }
		+
		+ result = solution.apply( concat );
		+ }
		+
		+ return result.toString().trim();
		+ }
		+
		+ /**
		+  * Replaces the contents of the first list with the elements of the
		+  * second list, then clears the second list. Whatever the first list held
		+  * beforehand is discarded. This method is used so that the candidates
		+  * variable in the 'segments' method always whittles down to the most
		+  * likely solution.
		+  *
		+  * @param l1 - The list that receives the elements.
		+  * @param l2 - The list whose elements are moved; empty on return.
		+  */
		+ private <T> void swap( List<T> l1, List<? extends T> l2 ) {
		+   l1.clear();
		+   l1.addAll( l2 );
		+   l2.clear();
		+ }
		+
		+ /**
		+  * Builds the analyses for the word combinations that can be formed from
		+  * the given words. A SegmentVisitor created for the concatenated text is
		+  * handed to a Combinations instance, whose root method produces the
		+  * list that is returned.
		+  *
		+  * @param concat - The concatenated text being segmented.
		+  * @param words - The lexicon words (with probabilities) to combine.
		+  * @return The analyses returned by Combinations.root for the words.
		+  */
		+ private List<SegmentAnalysis> combinations(
		+   String concat, List<Map.Entry<String, Double>> words ) {
		+   Visitor visitor = new SegmentVisitor( concat );
		+
		+   return new Combinations( visitor ).root( words );
		+ }
		+
		+ /**
		+  * Loads all the words and word probabilities from the dictionary,
		+  * replacing any previously loaded entries. Each line holds a word and
		+  * its probability separated by a comma; lines are lower-cased before
		+  * being parsed. Words shorter than MIN_LEX_LENGTH are skipped, as are
		+  * lines that contain no word at all. A missing or unparseable
		+  * probability is replaced by the default probability.
		+  *
		+  * @param lexiconData - The source of lexicon lines.
		+  * @throws IOException The lexicon could not be read.
		+  */
		+ private void loadLexicon( BufferedReader lexiconData )
		+   throws IOException {
		+   String line = null;
		+   Map<String, Double> dictionary = getDictionary();
		+
		+   dictionary.clear();
		+
		+   while( (line = lexiconData.readLine()) != null ) {
		+     String[] lex = line.toLowerCase().split( "," );
		+
		+     // String.split drops trailing empty fields, so a line made up solely
		+     // of commas yields an empty array: there is no word to record.
		+     //
		+     if( lex.length == 0 || lex[0].length() < MIN_LEX_LENGTH ) {
		+       continue;
		+     }
		+
		+     Double probability;
		+
		+     if( lex.length > 1 ) {
		+       try {
		+         probability = Double.valueOf( lex[1] );
		+       }
		+       catch( NumberFormatException e ) {
		+         // The probability field is not a number; this is likely an
		+         // error in the lexicon, so use the default.
		+         //
		+         probability = getDefaultProbability();
		+       }
		+     }
		+     else {
		+       // The line has a word but no probability field.
		+       //
		+       probability = getDefaultProbability();
		+     }
		+
		+     dictionary.put( lex[0], probability );
		+   }
		+ }
		+
		+ /**
		+  * Replaces the internal list of concatenated text with the lines read
		+  * from the given reader. Each line is lower-cased before being stored;
		+  * lines shorter than MIN_LEX_LENGTH are ignored.
		+  *
		+  * @param concatData - The source of concatenated text, one per line.
		+  * @throws IOException The text could not be read.
		+  */
		+ private void loadConcat( BufferedReader concatData )
		+   throws IOException {
		+   List<String> entries = getConcat();
		+
		+   entries.clear();
		+
		+   for( String text = concatData.readLine();
		+        text != null;
		+        text = concatData.readLine() ) {
		+     if( text.length() < MIN_LEX_LENGTH ) {
		+       continue;
		+     }
		+
		+     entries.add( text.toLowerCase() );
		+   }
		+ }
		+
		+ /**
		+  * Returns the internal list of concatenated strings (such as database
		+  * column names) held by this instance. The list itself is returned,
		+  * not a copy.
		+  *
		+  * @return The list stored in the concat field.
		+  */
		+ private List<String> getConcat() {
		+   return concat;
		+ }
		+
		+ /**
		+  * Returns the map of unique words to their probabilities held by this
		+  * instance. The map itself is returned, not a copy. (Per the original
		+  * notes, a probability is the relative frequency of the word, with the
		+  * most frequent word in the source corpus having a probability of 1;
		+  * this depends on the lexicon that was loaded.)
		+  *
		+  * @return The map stored in the dictionary field.
		+  */
		+ private Map<String, Double> getDictionary() {
		+   return dictionary;
		+ }
		+
		+ /**
		+  * Returns the probability to use for a lexicon word that has no usable
		+  * value of its own. Needing this value likely indicates an error in the
		+  * lexicon that should be fixed.
		+  *
		+  * @return Zero.
		+  */
		+ private Double getDefaultProbability() {
		+   return Double.valueOf( 0.0 );
		+ }
		+}
		+

src/com/whitemagicsoftware/wordsplit/Visitor.java

		+package com.whitemagicsoftware.wordsplit;
		+
		+import java.util.List;
		+import java.util.Map;
		+
		+/**
		+ * Contract through which a combination of words is examined as a possible
		+ * solution to the text splitting. The SegmentVisitor uses this mechanism
		+ * to collect statistics on each combination.
		+ */
		+public interface Visitor {
		+  /**
		+   * Analyses the given list of split words and reports how likely it is
		+   * that they solve the text splitting problem.
		+   *
		+   * @param list - The list of split words.
		+   * @return Details about the likelihood that the words are a solution.
		+   */
		+  SegmentAnalysis visit( List list );
		+}

version.properties

		+#Thu, 03 Feb 2011 22:09:43 -0800
		+build.major.number=00
		+build.minor.number=00
		+build.revision.number=26
		+

Initial commit.