| Author | djarvis <email> |
|---|---|
| Date | 2018-12-17 23:57:46 GMT-0800 |
| Commit | 1030ac51e5a9c9657b0d133769be8fc5b3191fb7 |
| Delta | 1188 lines added, 1 line removed, 1187-line increase |
|---|
| + Apache License | ||
| + Version 2.0, January 2004 | ||
| + http://www.apache.org/licenses/ | ||
| + | ||
| + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION | ||
| + | ||
| + 1. Definitions. | ||
| + | ||
| + "License" shall mean the terms and conditions for use, reproduction, | ||
| + and distribution as defined by Sections 1 through 9 of this document. | ||
| + | ||
| + "Licensor" shall mean the copyright owner or entity authorized by | ||
| + the copyright owner that is granting the License. | ||
| + | ||
| + "Legal Entity" shall mean the union of the acting entity and all | ||
| + other entities that control, are controlled by, or are under common | ||
| + control with that entity. For the purposes of this definition, | ||
| + "control" means (i) the power, direct or indirect, to cause the | ||
| + direction or management of such entity, whether by contract or | ||
| + otherwise, or (ii) ownership of fifty percent (50%) or more of the | ||
| + outstanding shares, or (iii) beneficial ownership of such entity. | ||
| + | ||
| + "You" (or "Your") shall mean an individual or Legal Entity | ||
| + exercising permissions granted by this License. | ||
| + | ||
| + "Source" form shall mean the preferred form for making modifications, | ||
| + including but not limited to software source code, documentation | ||
| + source, and configuration files. | ||
| + | ||
| + "Object" form shall mean any form resulting from mechanical | ||
| + transformation or translation of a Source form, including but | ||
| + not limited to compiled object code, generated documentation, | ||
| + and conversions to other media types. | ||
| + | ||
| + "Work" shall mean the work of authorship, whether in Source or | ||
| + Object form, made available under the License, as indicated by a | ||
| + copyright notice that is included in or attached to the work | ||
| + (an example is provided in the Appendix below). | ||
| + | ||
| + "Derivative Works" shall mean any work, whether in Source or Object | ||
| + form, that is based on (or derived from) the Work and for which the | ||
| + editorial revisions, annotations, elaborations, or other modifications | ||
| + represent, as a whole, an original work of authorship. For the purposes | ||
| + of this License, Derivative Works shall not include works that remain | ||
| + separable from, or merely link (or bind by name) to the interfaces of, | ||
| + the Work and Derivative Works thereof. | ||
| + | ||
| + "Contribution" shall mean any work of authorship, including | ||
| + the original version of the Work and any modifications or additions | ||
| + to that Work or Derivative Works thereof, that is intentionally | ||
| + submitted to Licensor for inclusion in the Work by the copyright owner | ||
| + or by an individual or Legal Entity authorized to submit on behalf of | ||
| + the copyright owner. For the purposes of this definition, "submitted" | ||
| + means any form of electronic, verbal, or written communication sent | ||
| + to the Licensor or its representatives, including but not limited to | ||
| + communication on electronic mailing lists, source code control systems, | ||
| + and issue tracking systems that are managed by, or on behalf of, the | ||
| + Licensor for the purpose of discussing and improving the Work, but | ||
| + excluding communication that is conspicuously marked or otherwise | ||
| + designated in writing by the copyright owner as "Not a Contribution." | ||
| + | ||
| + "Contributor" shall mean Licensor and any individual or Legal Entity | ||
| + on behalf of whom a Contribution has been received by Licensor and | ||
| + subsequently incorporated within the Work. | ||
| + | ||
| + 2. Grant of Copyright License. Subject to the terms and conditions of | ||
| + this License, each Contributor hereby grants to You a perpetual, | ||
| + worldwide, non-exclusive, no-charge, royalty-free, irrevocable | ||
| + copyright license to reproduce, prepare Derivative Works of, | ||
| + publicly display, publicly perform, sublicense, and distribute the | ||
| + Work and such Derivative Works in Source or Object form. | ||
| + | ||
| + 3. Grant of Patent License. Subject to the terms and conditions of | ||
| + this License, each Contributor hereby grants to You a perpetual, | ||
| + worldwide, non-exclusive, no-charge, royalty-free, irrevocable | ||
| + (except as stated in this section) patent license to make, have made, | ||
| + use, offer to sell, sell, import, and otherwise transfer the Work, | ||
| + where such license applies only to those patent claims licensable | ||
| + by such Contributor that are necessarily infringed by their | ||
| + Contribution(s) alone or by combination of their Contribution(s) | ||
| + with the Work to which such Contribution(s) was submitted. If You | ||
| + institute patent litigation against any entity (including a | ||
| + cross-claim or counterclaim in a lawsuit) alleging that the Work | ||
| + or a Contribution incorporated within the Work constitutes direct | ||
| + or contributory patent infringement, then any patent licenses | ||
| + granted to You under this License for that Work shall terminate | ||
| + as of the date such litigation is filed. | ||
| + | ||
| + 4. Redistribution. You may reproduce and distribute copies of the | ||
| + Work or Derivative Works thereof in any medium, with or without | ||
| + modifications, and in Source or Object form, provided that You | ||
| + meet the following conditions: | ||
| + | ||
| + (a) You must give any other recipients of the Work or | ||
| + Derivative Works a copy of this License; and | ||
| + | ||
| + (b) You must cause any modified files to carry prominent notices | ||
| + stating that You changed the files; and | ||
| + | ||
| + (c) You must retain, in the Source form of any Derivative Works | ||
| + that You distribute, all copyright, patent, trademark, and | ||
| + attribution notices from the Source form of the Work, | ||
| + excluding those notices that do not pertain to any part of | ||
| + the Derivative Works; and | ||
| + | ||
| + (d) If the Work includes a "NOTICE" text file as part of its | ||
| + distribution, then any Derivative Works that You distribute must | ||
| + include a readable copy of the attribution notices contained | ||
| + within such NOTICE file, excluding those notices that do not | ||
| + pertain to any part of the Derivative Works, in at least one | ||
| + of the following places: within a NOTICE text file distributed | ||
| + as part of the Derivative Works; within the Source form or | ||
| + documentation, if provided along with the Derivative Works; or, | ||
| + within a display generated by the Derivative Works, if and | ||
| + wherever such third-party notices normally appear. The contents | ||
| + of the NOTICE file are for informational purposes only and | ||
| + do not modify the License. You may add Your own attribution | ||
| + notices within Derivative Works that You distribute, alongside | ||
| + or as an addendum to the NOTICE text from the Work, provided | ||
| + that such additional attribution notices cannot be construed | ||
| + as modifying the License. | ||
| + | ||
| + You may add Your own copyright statement to Your modifications and | ||
| + may provide additional or different license terms and conditions | ||
| + for use, reproduction, or distribution of Your modifications, or | ||
| + for any such Derivative Works as a whole, provided Your use, | ||
| + reproduction, and distribution of the Work otherwise complies with | ||
| + the conditions stated in this License. | ||
| + | ||
| + 5. Submission of Contributions. Unless You explicitly state otherwise, | ||
| + any Contribution intentionally submitted for inclusion in the Work | ||
| + by You to the Licensor shall be under the terms and conditions of | ||
| + this License, without any additional terms or conditions. | ||
| + Notwithstanding the above, nothing herein shall supersede or modify | ||
| + the terms of any separate license agreement you may have executed | ||
| + with Licensor regarding such Contributions. | ||
| + | ||
| + 6. Trademarks. This License does not grant permission to use the trade | ||
| + names, trademarks, service marks, or product names of the Licensor, | ||
| + except as required for reasonable and customary use in describing the | ||
| + origin of the Work and reproducing the content of the NOTICE file. | ||
| + | ||
| + 7. Disclaimer of Warranty. Unless required by applicable law or | ||
| + agreed to in writing, Licensor provides the Work (and each | ||
| + Contributor provides its Contributions) on an "AS IS" BASIS, | ||
| + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | ||
| + implied, including, without limitation, any warranties or conditions | ||
| + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A | ||
| + PARTICULAR PURPOSE. You are solely responsible for determining the | ||
| + appropriateness of using or redistributing the Work and assume any | ||
| + risks associated with Your exercise of permissions under this License. | ||
| + | ||
| + 8. Limitation of Liability. In no event and under no legal theory, | ||
| + whether in tort (including negligence), contract, or otherwise, | ||
| + unless required by applicable law (such as deliberate and grossly | ||
| + negligent acts) or agreed to in writing, shall any Contributor be | ||
| + liable to You for damages, including any direct, indirect, special, | ||
| + incidental, or consequential damages of any character arising as a | ||
| + result of this License or out of the use or inability to use the | ||
| + Work (including but not limited to damages for loss of goodwill, | ||
| + work stoppage, computer failure or malfunction, or any and all | ||
| + other commercial damages or losses), even if such Contributor | ||
| + has been advised of the possibility of such damages. | ||
| + | ||
| + 9. Accepting Warranty or Additional Liability. While redistributing | ||
| + the Work or Derivative Works thereof, You may choose to offer, | ||
| + and charge a fee for, acceptance of support, warranty, indemnity, | ||
| + or other liability obligations and/or rights consistent with this | ||
| + License. However, in accepting such obligations, You may act only | ||
| + on Your own behalf and on Your sole responsibility, not on behalf | ||
| + of any other Contributor, and only if You agree to indemnify, | ||
| + defend, and hold each Contributor harmless for any liability | ||
| + incurred by, or claims asserted against, such Contributor by reason | ||
| + of your accepting any such warranty or additional liability. | ||
| + | ||
| + END OF TERMS AND CONDITIONS | ||
| + | ||
| + ================================================ | ||
| + Word Split: Text Segmentation Tool | ||
| + ================================================ | ||
| + | ||
| +Word Split is a Java application for Java software developers. The application | ||
| +provides a way to split conjoined words that lack punctuation into separate | ||
| +words. The software is intended to split database column names into their | ||
| +human-readable equivalent text. | ||
| + | ||
| +Word Split takes the following input files: | ||
| + - a probability lexicon, one word and probability per line (CSV format) | ||
| + - a list of conjoined phrases, one per line | ||
| + | ||
| +Word Split will use the lexicon to separate the list of conjoined phrases. | ||
| +The resulting segmented phrases are written to standard output. | ||
| + | ||
| + --------------------------------------------- | ||
| + Contents | ||
| + --------------------------------------------- | ||
| + | ||
| +This release includes: | ||
| + | ||
| + - README.txt This file | ||
| + - LICENSE.txt License for Word Split | ||
| + | ||
| + - version.properties Build version | ||
| + - build.xml Build instructions for Ant | ||
| + - demos Example lexicons and conjoined files | ||
| + - scripts Corpus and lexicon helper scripts | ||
| + - scripts/tally-corpus.sh Creates a tally lexicon from a corpus. | ||
| + - scripts/probability.awk Creates a probability lexicon from tallies | ||
| + | ||
| + --------------------------------------------- | ||
| + Requirements | ||
| + --------------------------------------------- | ||
| + | ||
| +The following software packages are required to compile and run Word Split: | ||
| + | ||
| + - Java version 1.6 (or greater) | ||
| + - Ant version 1.8.2 (or greater) | ||
| + | ||
| + --------------------------------------------- | ||
| + Installation | ||
| + --------------------------------------------- | ||
| + | ||
| +Installation is completed by unzipping the archive. | ||
| + | ||
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Ant build for Word Split.

  The default target ("build") cleans the build directory, compiles the
  sources and packages them into an executable jar. The remaining targets
  maintain the counters kept in version.properties.
-->
<project name="project" default="build">

  <!-- Version counters are read from version.properties when Ant starts;
       build.number is therefore assembled from the values present at
       start-up, before any target in this file increments them. -->
  <property file="version.properties" />
  <property
    name="build.number"
    value="${build.major.number}.${build.minor.number}.${build.revision.number}" />

  <!-- Directory layout. -->
  <property name="dir.build" value="build" />
  <property name="dir.source" value="src" />

  <!-- Name of the jar written into ${dir.build}. -->
  <property name="file.jar" value="wordsplit.jar" />

  <!-- Flags handed to javac. -->
  <property name="compile.debug" value="true" />
  <property name="compile.deprecation" value="false" />
  <property name="compile.optimize" value="false" />

  <!-- Prints the build number. -->
  <target name="version">
    <echo>Build: ${build.number}</echo>
  </target>

  <!-- Default target: clean, compile, jar (in that order). -->
  <target name="build">
    <antcall target="clean" />
    <antcall target="compile" />
    <antcall target="jar" />
  </target>

  <!-- Removes everything that the build produced. -->
  <target name="clean">
    <delete dir="${dir.build}" />
  </target>

  <!-- Increments the revision number, then compiles the sources. -->
  <target name="compile">
    <antcall target="revision" />
    <mkdir dir="${dir.build}" />
    <javac
      includeantruntime="false"
      srcdir="${dir.source}"
      destdir="${dir.build}"
      debug="${compile.debug}"
      deprecation="${compile.deprecation}"
      optimize="${compile.optimize}" />
  </target>

  <!-- Packages the compiled classes. The archive is written into the same
       directory that is being archived, so it is excluded explicitly; without
       the exclusion a second "jar" run (with no "clean" in between) would try
       to add the existing archive to itself. -->
  <target name="jar">
    <jar
      destfile="${dir.build}/${file.jar}"
      basedir="${dir.build}"
      excludes="${file.jar}">
      <manifest>
        <attribute
          name="Main-Class"
          value="com.whitemagicsoftware.wordsplit.Main" />
      </manifest>
    </jar>
  </target>

  <!-- Distribution: currently only increments the minor number. -->
  <target name="dist">
    <antcall target="minor" />
  </target>

  <!-- Increments the revision number. -->
  <target name="revision">
    <propertyfile file="version.properties">
      <entry key="build.revision.number" type="int" operation="+" value="1" pattern="00" />
    </propertyfile>
  </target>

  <!-- Increments the minor number and resets the revision number. -->
  <target name="minor">
    <propertyfile file="version.properties">
      <entry key="build.minor.number" type="int" operation="+" value="1" pattern="00" />
      <entry key="build.revision.number" type="int" value="0" pattern="00" />
    </propertyfile>
  </target>

  <!-- Increments the major number and resets minor and revision numbers. -->
  <target name="major">
    <propertyfile file="version.properties">
      <entry key="build.major.number" type="int" operation="+" value="1" pattern="00" />
      <entry key="build.minor.number" type="int" value="0" pattern="00" />
      <entry key="build.revision.number" type="int" value="0" pattern="00" />
    </propertyfile>
  </target>

  <!-- Increments the major, minor and revision numbers together. -->
  <target name="all">
    <propertyfile file="version.properties">
      <entry key="build.major.number" type="int" operation="+" value="1" pattern="00" />
      <entry key="build.minor.number" type="int" operation="+" value="1" pattern="00" />
      <entry key="build.revision.number" type="int" operation="+" value="1" pattern="00" />
    </propertyfile>
  </target>

</project>
| + | ||
| +payperiodmatchcode | ||
| +labordistributioncodedesc | ||
| +dependentpsrelationships | ||
| +actionendoption | ||
| +actionendoptiondesc | ||
| +addresstype | ||
| +addresstypedesc | ||
| +historytype | ||
| +psaddresstype | ||
| +rolename | ||
| +bankaccountstatus | ||
| +bankaccountstatusdesc | ||
| +bankaccounttype | ||
| +bankaccounttypedesc | ||
| +beneficiaryamount | ||
| +beneficiaryclass | ||
| +beneficiarypercent | ||
| +benefitsubclass | ||
| +beneficiaryclass | ||
| +beneficiaryclassdesc | ||
| +benefitactioncode | ||
| +benefitactioncodedesc | ||
| +benefitagecontrol | ||
| +benefitagecontroldesc | ||
| +ageconrolagelimit | ||
| +ageconrolnoticeperiod | ||
| +account,1.0 | ||
| +action,0.99 | ||
| +address,0.98 | ||
| +age,0.97 | ||
| +amount,0.96 | ||
| +bank,0.95 | ||
| +beneficiary,0.94 | ||
| +benefit,0.93 | ||
| +class,0.92 | ||
| +subclass,0.905 | ||
| +code,0.91 | ||
| +control,0.905 | ||
| +depend,0.75 | ||
| +dependent,0.8 | ||
| +desc,0.88 | ||
| +distribution,0.87 | ||
| +end,0.555 | ||
| +history,0.85 | ||
| +labor,0.84 | ||
| +limit,0.83 | ||
| +match,0.82 | ||
| +name,0.81 | ||
| +notice,0.8 | ||
| +option,0.79 | ||
| +pay,0.78 | ||
| +percent,0.77 | ||
| +period,0.76 | ||
| +relationship,0.89 | ||
| +relationships,0.9 | ||
| +role,0.74 | ||
| +status,0.73 | ||
| +type,0.72 | ||
| +ent,0.1 | ||
| - | ||
| +វាជាការសំខាន់ | ||
| +ណាស់សំរាប់ | ||
| +បើអ្នកណាឮ | ||
| +សំលេងអញ | ||
| +ហើយមិនដែល | ||
| +ត្រូវជំនុះជំរះ | ||
| +ឡើយគឺបាន | ||
| +កន្លងហួស | ||
| +ពីសេចក្តី | ||
| +នៅពេលព្រះអង្គ | ||
| +បានប្រទាន | ||
| +ជីវិតនេះឱ្យ | ||
| +អ្នកនៅក្នុង | ||
| +អង្គនោះអ្នក | ||
| +បានកើតក្នុង | ||
| +គ្រួសាររបស់ | ||
| +ព្រះខាងឯព្រលឹងវិញ្ញាណ | ||
| +ព្រះអង្គជាបិតា | ||
| +របស់អ្នក |
#!/bin/bash
#
# Launches Word Split from the jar produced by the Ant build.
#
#   $1 - lexicon (CSV file: word,probability)
#   $2 - text file of conjoined phrases
#
# Must be started from the directory that contains build/wordsplit.jar.

MEM_MIN=1024m
MEM_MAX=1024m
ENCODING=UTF-8

if [ -e build/wordsplit.jar ]; then
  # Each argument is quoted so that a path containing spaces or glob
  # characters reaches Java as a single, unmodified argument. The ${N:+...}
  # form drops a missing (or empty) argument altogether, exactly as the
  # unquoted expansion did, so the program still receives fewer than two
  # arguments and prints its usage text.
  java -Xmx$MEM_MAX -Xms$MEM_MIN -Dfile.encoding=$ENCODING \
    -jar build/wordsplit.jar ${1:+"$1"} ${2:+"$2"}
else
  echo "To compile Word Split type: ant"
fi
| + | ||
# Given a CSV file of the form <text,natural number>, this writes a CSV
# file of relative probabilities, based on the maximum natural number found
# in the CSV file.
#
# Based on code by Dennis Williamson.
#
# Usage: awk -f probability.awk < filename.csv | sort -t, -k2,2n -k1,1
#
# (sort needs -t, to split its keys on the comma; without it the whole line
# is a single field and the output is never ordered by probability.)
#
BEGIN {
  OFS = FS = ","
}

# Remember the tally of every word; a repeated word keeps its last tally.
{ a[$1] = $2 }

# Track the largest tally. Adding zero forces a numeric comparison so that a
# non-numeric second column (such as a header row) cannot become the maximum.
($2 + 0) > max { max = $2 + 0 }

END {
  # Without a positive maximum there is nothing to scale against; dividing
  # would abort awk with a division-by-zero error.
  if( !( max > 0 ) ) {
    if( NR > 0 ) {
      print "probability.awk: no positive tally found" > "/dev/stderr"
      exit 1
    }

    exit
  }

  for( word in a ) print word, a[word] / max
}
#!/bin/bash
#
# Creates word tallies and lexicon files from a corpus.
#
# Reads (from the current directory):
#   corpus.txt      - the text to tally
#   dictionary.txt  - words that may appear in the lexicon, one per line
#
# Writes (to the current directory):
#   frequency.txt   - "count word" lines, highest count first
#   lexicon.csv     - "word,count" lines for words found in dictionary.txt
#   lexicon-3.csv   - lexicon.csv minus three-letter words that occur once

echo Creating word frequency tallies...
# Put each space-separated token on its own line, strip everything except
# letters, fold to lower case, then count identical lines.
# NOTE(review): the \n in the sed replacement is understood by GNU sed;
# confirm before running this with another sed implementation.
# The character classes are quoted so that the shell can never expand them
# as glob patterns against file names in the current directory.
sed -e 's/ /\n/g' -e 's/[^a-zA-Z\n]//g' corpus.txt | \
  tr '[:upper:]' '[:lower:]' | \
  sort | \
  uniq -c | \
  sort -rn > frequency.txt

echo Creating lexicon...
# Keep the tallies whose word appears in the dictionary; emit "word,count".
grep -Fwf dictionary.txt frequency.txt | awk '{print $2 "," $1}' > lexicon.csv

echo Creating lexicon without single-occurrence 3-letter words...
# The trailing $ matters: without it the pattern also removes three-letter
# words whose tally merely starts with 1 (10, 11, 100, ...).
grep -v '^[a-z][a-z][a-z],1$' lexicon.csv > lexicon-3.csv
| +package com.whitemagicsoftware.wordsplit; | ||
| + | ||
| +import java.util.ArrayList; | ||
| +import java.util.List; | ||
| +import java.util.Map; | ||
| + | ||
| +/** | ||
| + * An almost generic class for generating all possible combinations of | ||
| + * values in a list as a list. | ||
| + */ | ||
| +@SuppressWarnings("unchecked") | ||
| +public class Combinations { | ||
| + private Visitor visitor; | ||
| + private List<SegmentAnalysis> analysis = new ArrayList<SegmentAnalysis>(); | ||
| + | ||
| + private final static int MAX_DEPTH = 22; | ||
| + | ||
| + /** | ||
| + * @param visitor - The class used to examine each possible text segment. | ||
| + */ | ||
| + public Combinations( Visitor visitor ) { | ||
| + setVisitor( visitor ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Entry point. | ||
| + * | ||
| + * @param initial - The list of possible words that could constitute the | ||
| + * solution. | ||
| + */ | ||
| + public List<SegmentAnalysis> root( List initial ) { | ||
| + clearAnalysis(); | ||
| + root( new ArrayList(), initial, 0 ); | ||
| + return getAnalysis(); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Print all subsets of the remaining elements, with given prefix. | ||
| + */ | ||
| + private void root( List prefix, List remain, int depth ) { | ||
| + if( remain.size() > 0 && depth < MAX_DEPTH ) { | ||
| + List combination = new ArrayList( prefix.size() + 1 ); | ||
| + combination.addAll( prefix ); | ||
| + combination.add( remain.get( 0 ) ); | ||
| + | ||
| + addAnalysis( getVisitor().visit( combination ) ); | ||
| + | ||
| + List r = new ArrayList( remain.size() ); | ||
| + r.addAll( remain.subList( 1, remain.size() ) ); | ||
| + | ||
| + root( combination, r, depth + 1 ); | ||
| + root( prefix, r, depth + 1 ); | ||
| + } | ||
| + } | ||
| + | ||
| + private void setVisitor( Visitor visitor ) { | ||
| + this.visitor = visitor; | ||
| + } | ||
| + | ||
| + private Visitor getVisitor() { | ||
| + return this.visitor; | ||
| + } | ||
| + | ||
| + private void clearAnalysis() { | ||
| + getAnalysis().clear(); | ||
| + } | ||
| + | ||
| + private List<SegmentAnalysis> getAnalysis() { | ||
| + return this.analysis; | ||
| + } | ||
| + | ||
| + private void addAnalysis( SegmentAnalysis sa ) { | ||
| + getAnalysis().add( sa ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Tests the class. | ||
| + */ | ||
| + public static void main( String[] args ) { | ||
| + List<String> list = new ArrayList<String>(); | ||
| + PrintVisitor pv = new PrintVisitor(); | ||
| + | ||
| + list.add( "a" ); | ||
| + list.add( "b" ); | ||
| + list.add( "c" ); | ||
| + list.add( "d" ); | ||
| + | ||
| + Combinations combinations = new Combinations( pv ); | ||
| + combinations.root( list ); | ||
| + } | ||
| +} | ||
| +package com.whitemagicsoftware.wordsplit; | ||
| + | ||
| +import java.util.AbstractMap; | ||
| +import java.util.ArrayList; | ||
| +import java.util.List; | ||
| +import java.util.Map; | ||
| +import java.util.TreeMap; | ||
| + | ||
| +import java.io.BufferedReader; | ||
| +import java.io.BufferedWriter; | ||
| +import java.io.File; | ||
| +import java.io.FileInputStream; | ||
| +import java.io.InputStreamReader; | ||
| +import java.io.IOException; | ||
| + | ||
| +/** | ||
| + * Splits concatenated text into a sentence. | ||
| + */ | ||
| +@SuppressWarnings("unchecked") | ||
| +public class Main { | ||
| + /** | ||
| + * Default constructor. | ||
| + */ | ||
| + public Main() { | ||
| + } | ||
| + | ||
| + private static void out( String s ) { | ||
| + System.out.println( s ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Main application. Takes a lexicon (with probabilities) and list of | ||
| + * concatenated strings. Writes the split strings to standard output. | ||
| + */ | ||
| + public static void main( String args[] ) | ||
| + throws IOException { | ||
| + TextSegmenter ts = new TextSegmenter(); | ||
| + | ||
| + if( args.length == 2 ) { | ||
| + try { | ||
| + ts.split( new File( args[0] ), new File( args[1] ) ); | ||
| + } | ||
| + catch( Exception e ) { | ||
| + System.err.println( "Error: " + e.getMessage() ); | ||
| + e.printStackTrace(); | ||
| + } | ||
| + } | ||
| + else { | ||
| + out( "com.whitemagicsoftware.wordsplit.Main <lexicon> <conjoined>" ); | ||
| + out( "<lexicon> - CSV file: word,probability" ); | ||
| + out( "<conjoined> - Text file" ); | ||
| + } | ||
| + } | ||
| +} | ||
| + | ||
| +package com.whitemagicsoftware.wordsplit; | ||
| + | ||
| +import java.util.List; | ||
| +import java.util.Map; | ||
| + | ||
| +/** | ||
| + * Responsible for writing out a list of words. | ||
| + */ | ||
| +public class PrintVisitor implements Visitor { | ||
| + /** | ||
| + * Default constructor. | ||
| + */ | ||
| + public PrintVisitor() { | ||
| + } | ||
| + | ||
| + /** | ||
| + * Writes the given parameter to standard output. | ||
| + * | ||
| + * @param list - The list of values to write to stdout. | ||
| + */ | ||
| + public SegmentAnalysis visit( List list ) { | ||
| + System.out.println( list ); | ||
| + return null; | ||
| + } | ||
| +} | ||
| +package com.whitemagicsoftware.wordsplit; | ||
| + | ||
| +import java.util.List; | ||
| +import java.util.Map; | ||
| + | ||
/**
 * Stores the details about a possible solution to a concatenated phrase.
 * These details allow the TextSegmenter class to determine whether or not
 * the solution is the most likely.
 */
@SuppressWarnings("unchecked")
public class SegmentAnalysis {
  /** Number of candidate words that were found in the phrase. */
  private int wordsUsed;

  /** Candidate words; each key is the word, each value its probability. */
  private List<Map.Entry> words;

  /** What is left of the phrase after removing the words; null until set. */
  private String remaining;

  /**
   * @param words - The candidate words (key) and probabilities (value) that
   * make up this possible solution.
   */
  public SegmentAnalysis( List<Map.Entry> words ) {
    setWords( words );
  }

  /**
   * Splits the given word (concatenated text) into multiple words, with
   * spaces to separate each word. For every candidate word, in list order,
   * the first occurrence in the text is surrounded by spaces.
   *
   * Words are matched literally. They come from a lexicon file, so they
   * must not be interpreted as regular expressions: a word such as "a.b"
   * would otherwise match unrelated text, and a word such as "(" would
   * raise an exception.
   *
   * @param concat - The words to split.
   * @return The given parameter with spaces in between each word.
   */
  public StringBuilder apply( String concat ) {
    for( Map.Entry entry : getWords() ) {
      String word = (String)(entry.getKey());
      int index = concat.indexOf( word );

      if( index >= 0 ) {
        concat = concat.substring( 0, index )
          + " " + word + " "
          + concat.substring( index + word.length() );
      }
    }

    return new StringBuilder( normalise( concat ) );
  }

  /**
   * @return true when every candidate word was found in the phrase.
   */
  public boolean matchedAllWords() {
    return getWordCount() == getWordsUsed();
  }

  /**
   * Returns the number of characters left over after the words were removed.
   * The remaining text must have been set beforehand.
   */
  public int length() {
    return getRemaining().length();
  }

  /**
   * Replaces each run of two or more whitespace characters that lies between
   * two word characters with a single space, then trims white space from
   * both ends of the string.
   *
   * @return The value of s with its whitespace normalised.
   */
  private String normalise( String s ) {
    return s.replaceAll( "\\b\\s{2,}\\b", " " ).trim();
  }

  /**
   * @param remaining - The text left over after removing the words; stored
   * with its whitespace normalised.
   */
  public void setRemaining( String remaining ) {
    this.remaining = normalise( remaining );
  }

  private String getRemaining() {
    return this.remaining;
  }

  // Returned as a double so that the ratio in getProbability() is computed
  // with floating point division.
  private double getWordCount() {
    return getWords().size();
  }

  /**
   * @param wordsUsed - How many of the candidate words occur in the phrase.
   */
  public void setWordsUsed( int wordsUsed ) {
    this.wordsUsed = wordsUsed;
  }

  private double getWordsUsed() {
    return this.wordsUsed;
  }

  /**
   * Returns the product of the probability of each word in this potential
   * solution, scaled by the fraction of the words that were actually used.
   * A solution without any words has a probability of zero (rather than the
   * NaN that dividing zero by zero would yield).
   *
   * @return A number between 0 and 1.
   */
  public double getProbability() {
    double wordCount = getWordCount();

    if( wordCount == 0 ) {
      return 0;
    }

    double probability = 1;

    for( Map.Entry entry : getWords() ) {
      // Any numeric value is accepted, not only Double.
      probability *= ((Number)(entry.getValue())).doubleValue();
    }

    return probability * (getWordsUsed() / wordCount);
  }

  private void setWords( List<Map.Entry> words ) {
    this.words = words;
  }

  private List<Map.Entry> getWords() {
    return this.words;
  }
}
| +package com.whitemagicsoftware.wordsplit; | ||
| + | ||
| +import java.util.List; | ||
| +import java.util.Map; | ||
| + | ||
| +/** | ||
| + * Called by the Combinations class when a new combination of words has been | ||
| + * defined (recursively). This class gathers statistics about the list of | ||
| + * words that are a possible contender for being the solution. | ||
| + */ | ||
| +@SuppressWarnings("unchecked") | ||
| +public class SegmentVisitor implements Visitor { | ||
| + private String concat; | ||
| + private SegmentAnalysis analysis; | ||
| + | ||
| + /** | ||
| + * @param concat - The concatenated string that was analysed. | ||
| + */ | ||
| + public SegmentVisitor( String concat ) { | ||
| + setConcatenated( concat ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Determines the following statistics with respect to the list. | ||
| + * <ul> | ||
| + * <li>The number of words used in the list versus in the string.</li> | ||
| + * <li>The popularity of proposed solution words.</li> | ||
| + * <li>The number of remaining characters (and words) after removing the | ||
| + * word list from the concatenated string.</li> | ||
| + * </ul> | ||
| + * | ||
| + * @param list - The list of words to examine. | ||
| + */ | ||
| + public SegmentAnalysis visit( List list ) { | ||
| + String result = getConcatenated().toString(); | ||
| + int wordsUsed = 0; | ||
| + | ||
| + for( Object o : list ) { | ||
| + String word = (String)(((Map.Entry)o).getKey()); | ||
| + | ||
| + if( result.indexOf( word ) >= 0 ) { | ||
| + wordsUsed++; | ||
| + result = result.replaceFirst( word, " " ); | ||
| + } | ||
| + } | ||
| + | ||
| + SegmentAnalysis analysis = new SegmentAnalysis( list ); | ||
| + | ||
| + analysis.setWordsUsed( wordsUsed ); | ||
| + analysis.setRemaining( result ); | ||
| + | ||
| + return analysis; | ||
| + } | ||
| + | ||
| + private void setConcatenated( String concat ) { | ||
| + this.concat = concat; | ||
| + } | ||
| + | ||
| + private String getConcatenated() { | ||
| + return this.concat; | ||
| + } | ||
| +} | ||
| + | ||
| +package com.whitemagicsoftware.wordsplit; | ||
| + | ||
| +import java.util.Collections; | ||
| +import java.util.Comparator; | ||
| +import java.util.HashMap; | ||
| +import java.util.Iterator; | ||
| +import java.util.LinkedList; | ||
| +import java.util.LinkedHashMap; | ||
| +import java.util.List; | ||
| +import java.util.Map; | ||
| + | ||
/**
 * A hashmap that can be sorted by its values. Iteration follows insertion
 * order until {@link #sortByValue()} is called, which re-inserts the
 * mappings from the highest value to the lowest.
 */
public class SortableValueMap<K, V extends Comparable<? super V>>
  extends LinkedHashMap<K, V> {
  /**
   * Creates an empty map.
   */
  public SortableValueMap() { }

  /**
   * Populates this instance based on the given map.
   */
  public SortableValueMap( Map<K, V> map ) {
    super( map );
  }

  /**
   * Reorders this map so that iteration runs from the highest value to the
   * lowest. Mappings with equal values keep their relative order.
   */
  public void sortByValue() {
    List<Map.Entry<K, V>> list = new LinkedList<Map.Entry<K, V>>( entrySet() );

    Collections.sort( list, new Comparator<Map.Entry<K, V>>() {
      public int compare( Map.Entry<K, V> entry1, Map.Entry<K, V> entry2 ) {
        // Arguments are swapped to obtain a descending order.
        return entry2.getValue().compareTo( entry1.getValue() );
      }
    });

    // Copy the keys and values out while the entries are still backed by
    // this map: the state of a Map.Entry is undefined once its map has been
    // modified, so nothing may be read from them after clear().
    Map<K, V> sorted = new LinkedHashMap<K, V>();

    for( Map.Entry<K, V> entry : list ) {
      sorted.put( entry.getKey(), entry.getValue() );
    }

    clear();

    for( Map.Entry<K, V> entry : sorted.entrySet() ) {
      put( entry.getKey(), entry.getValue() );
    }
  }

  /**
   * Used for debugging: prints a heading followed by every mapping.
   */
  private static void print( String text, Map<String, Double> map ) {
    System.out.println( text );

    for( String key : map.keySet() ) {
      System.out.println( "key/value: " + key + "/" + map.get( key ) );
    }
  }

  /**
   * Tests the class.
   */
  public static void main(String[] args) {
    SortableValueMap<String, Double> map =
      new SortableValueMap<String, Double>();

    map.put( "A", 67.5 );
    map.put( "B", 99.5 );
    map.put( "C", 82.4 );
    map.put( "D", 42.0 );

    print( "Unsorted map", map );
    map.sortByValue();
    print( "Sorted map", map );
  }
}
| +package com.whitemagicsoftware.wordsplit; | ||
| + | ||
| +import java.util.AbstractMap; | ||
| +import java.util.ArrayList; | ||
| +import java.util.List; | ||
| +import java.util.Map; | ||
| +import java.util.TreeMap; | ||
| + | ||
| +import java.io.BufferedReader; | ||
| +import java.io.BufferedWriter; | ||
| +import java.io.File; | ||
| +import java.io.FileInputStream; | ||
| +import java.io.InputStreamReader; | ||
| +import java.io.IOException; | ||
| + | ||
| +/** | ||
| + * Splits concatenated text into a sentence. | ||
| + */ | ||
| +@SuppressWarnings("unchecked") | ||
| +public class TextSegmenter { | ||
| + /** Lexical and concatenated entries must be at least 2 characters. */ | ||
| + private static final int MIN_LEX_LENGTH = 2; | ||
| + | ||
| + /** Words and frequencies. */ | ||
| + private Map<String, Double> dictionary = new TreeMap<String, Double>(); | ||
| + | ||
| + /** List of concatenated words to split. */ | ||
| + private List<String> concat = new ArrayList<String>(); | ||
| + | ||
| + /** | ||
| + * Default constructor. | ||
| + */ | ||
| + public TextSegmenter() { | ||
| + } | ||
| + | ||
| + /** | ||
| + * Helper method. | ||
| + */ | ||
| + public void split( File lexicon, File concat ) | ||
| + throws IOException { | ||
| + BufferedReader lex = new BufferedReader( | ||
| + new InputStreamReader( new FileInputStream( lexicon ) ) ); | ||
| + BufferedReader col = new BufferedReader( | ||
| + new InputStreamReader( new FileInputStream( concat ) ) ); | ||
| + | ||
| + split( lex, col ); | ||
| + | ||
| + lex.close(); | ||
| + col.close(); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Splits the text. Callers must close the streams. | ||
| + */ | ||
| + public void split( BufferedReader lexicon, BufferedReader concat ) | ||
| + throws IOException { | ||
| + loadLexicon( lexicon ); | ||
| + loadConcat( concat ); | ||
| + split(); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Iterates over all of the contatenated text, splitting each concatenated | ||
| + * String into English words. | ||
| + */ | ||
| + private void split() { | ||
| + for( String concat : getConcat() ) { | ||
| + System.out.printf( "%s,%s\n", concat, segments( concat ) ); | ||
| + } | ||
| + } | ||
| + | ||
| + /** | ||
| + * Returns a number between 0 and 1 that represents how often the word is | ||
| + * used relative to all the other words in the lexicon. | ||
| + */ | ||
| + private double getProbability( String s ) { | ||
| + try { | ||
| + return getDictionary().get( s ); | ||
| + } | ||
| + catch( Exception e ) { | ||
| + return 0.0; | ||
| + } | ||
| + } | ||
| + | ||
| + /** | ||
| + * Splits a concatenated phrase into its constituent words. This will look | ||
| + * up the words in a dictionary and find the most likely combination that | ||
| + * satisifies the word segmentation. | ||
| + * | ||
| + * @param concat - The phrase without spaces to split into words. | ||
| + * @return The concat text with spaces. | ||
| + */ | ||
| + private String segments( String concat ) { | ||
| + int length = concat.length(); | ||
| + List<Map.Entry<String, Double>> words = | ||
| + new ArrayList<Map.Entry<String, Double>>(); | ||
| + | ||
| + // Put all the words that exist in the string into a map. | ||
| + // | ||
| + for( int i = 0; i < length; i++ ) { | ||
| + for( int j = 0; j < length - i; j++ ) { | ||
| + // Word and probability from the lexicon. | ||
| + // | ||
| + String w = concat.substring( j, length - i ); | ||
| + double p = getProbability( w ); | ||
| + | ||
| + // Retain words that comprise the concatenated string in order. | ||
| + // | ||
| + if( p > 0 ) { | ||
| + words.add( 0, new AbstractMap.SimpleEntry<String, Double>( w, p ) ); | ||
| + } | ||
| + } | ||
| + } | ||
| + | ||
| + StringBuilder result = new StringBuilder( length * 2 ); | ||
| + StringBuffer joined = new StringBuffer( concat ); | ||
| + int wordCount = words.size(); | ||
| + int wordsUsed = 0; | ||
| + | ||
| + // If all the words can be accounted for, then the problem is solved. | ||
| + // If not, then a more complex analsyis is required. | ||
| + // | ||
| + for( Map.Entry<String, Double> word : words ) { | ||
| + String w = word.getKey(); | ||
| + int wlen = w.length(); | ||
| + int index = joined.indexOf( w ); | ||
| + | ||
| + wordsUsed++; | ||
| + | ||
| + if( index == 0 ) { | ||
| + // The word (w) from the lexicon matched the beginning of | ||
| + // the concatenated string. Track the word within "result". | ||
| + // | ||
| + result.append( w ).append( ' ' ); | ||
| + joined = joined.delete( 0, wlen ); | ||
| + } | ||
| + else if( index > 0 ) { | ||
| + // The word (w) from the lexicon matched the concatenated string, | ||
| + // but not at the beginning. | ||
| + // | ||
| + result.append( joined.substring( 0, index ) ).append( ' ' ); | ||
| + joined = joined.delete( 0, index ); | ||
| + } | ||
| + else { | ||
| + // The word could not be found within the string, so lower the | ||
| + // count of the number of words (from the list) that were used | ||
| + // in this potential solution. The number of words used will be | ||
| + // checked against the number of words found. If they are not | ||
| + // equal then a deeper analysis must be performed. | ||
| + // | ||
| + wordsUsed--; | ||
| + } | ||
| + } | ||
| + | ||
| + // Tack on the last word that was not accounted for in the loop. | ||
| + // | ||
| + result.append( joined ); | ||
| + | ||
| + // The 80% case is when there was a 1:1 match between the concatenated | ||
| + // text and having found all the suggested words in said text. If there | ||
| + // was only one possible match, then there is no point performing any | ||
| + // further analysis. | ||
| + // | ||
| + boolean solved = wordCount == wordsUsed; | ||
| + | ||
| + if( !solved ) { | ||
| + result.setLength( 0 ); | ||
| + | ||
| + List<SegmentAnalysis> saList = combinations( concat, words ); | ||
| + List<SegmentAnalysis> candidates = new ArrayList<SegmentAnalysis>(); | ||
| + | ||
| + int minLength = Integer.MAX_VALUE; | ||
| + | ||
| + // Record the candidates with the shortest remaining character | ||
| + // count (after splitting and removing the most probable words). | ||
| + // This loop primarily reduces the candidates based on whether all | ||
| + // the words in one particular combination of words were used and | ||
| + // each of those words exists in the lexicon. | ||
| + // | ||
| + for( SegmentAnalysis sa : saList ) { | ||
| + if( sa.matchedAllWords() ) { | ||
| + int saLength = sa.length(); | ||
| + | ||
| + if( saLength < minLength ) { | ||
| + minLength = saLength; | ||
| + } | ||
| + | ||
| + candidates.add( sa ); | ||
| + } | ||
| + } | ||
| + | ||
| + // Swap the segment analysis list for the candidate list. This | ||
| + // step isn't necessary, but it makes the previous loop and any | ||
| + // subsequent loops operate on the same variables with the same | ||
| + // meaning: the "candidates" list will shrink until there is only | ||
| + // one element -- the solution. | ||
| + // | ||
| + swap( saList, candidates ); | ||
| + | ||
| + // The solutions that have the fewest remaining letters are the | ||
| + // ones to keep. The winning solution will be decided by probability. | ||
| + // | ||
| + for( SegmentAnalysis sa : saList ) { | ||
| + if( sa.length() == minLength ) { | ||
| + candidates.add( sa ); | ||
| + } | ||
| + } | ||
| + | ||
| + swap( saList, candidates ); | ||
| + | ||
| + SegmentAnalysis solution = saList.get( 0 ); | ||
| + double maxProbability = Double.MIN_VALUE; | ||
| + | ||
| + // Find the solution with the highest probability. The probability | ||
| + // is calculated using the probabilities from the lexicon (which | ||
| + // are, in turn, used by the SegmentAnalysis instance). | ||
| + // | ||
| + for( SegmentAnalysis sa : saList ) { | ||
| + double probability = sa.getProbability(); | ||
| + | ||
| + if( probability > maxProbability ) { | ||
| + solution = sa; | ||
| + maxProbability = probability; | ||
| + } | ||
| + } | ||
| + | ||
| + result = solution.apply( concat ); | ||
| + } | ||
| + | ||
| + return result.toString().trim(); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Copies the elements from the second list into the first list, then | ||
| + * clears the second list. This method is used so that the candidates | ||
| + * variable in the 'segments' method always whittles down to the most | ||
| + * likely solution. | ||
| + */ | ||
| + private void swap( List l1, List l2 ) { | ||
| + l1.clear(); | ||
| + l1.addAll( l2 ); | ||
| + l2.clear(); | ||
| + } | ||
| + | ||
| + /** | ||
| + * This method recursively generates a list of all possible word | ||
| + * combinations from a list of words. The result is an analysis of each | ||
| + * combination, containing details like probability, relative word | ||
| + * lengths, and so forth. | ||
| + */ | ||
| + private List<SegmentAnalysis> combinations( | ||
| + String concat, List<Map.Entry<String, Double>> words ) { | ||
| + Visitor v = new SegmentVisitor( concat ); | ||
| + | ||
| + Combinations combinations = new Combinations( v ); | ||
| + return combinations.root( words ); | ||
| + } | ||
| + | ||
| + /** | ||
| + * Loads all the words and word probability from the dictionary. Words | ||
| + * are separated from the probability by a comma. | ||
| + */ | ||
| + private void loadLexicon( BufferedReader lexiconData ) | ||
| + throws IOException { | ||
| + String line = null; | ||
| + Map<String, Double> dictionary = getDictionary(); | ||
| + | ||
| + dictionary.clear(); | ||
| + | ||
| + while( (line = lexiconData.readLine()) != null ) { | ||
| + String[] lex = line.toLowerCase().split( "," ); | ||
| + | ||
| + if( lex[0].length() >= MIN_LEX_LENGTH ) { | ||
| + try { | ||
| + dictionary.put( lex[0], Double.parseDouble( lex[1] ) ); | ||
| + } | ||
| + catch( Exception e ) { | ||
| + dictionary.put( lex[0], getDefaultProbability() ); | ||
| + } | ||
| + } | ||
| + } | ||
| + } | ||
| + | ||
| + /** | ||
| + * Inserts the lines of concatenated text into the internal list. | ||
| + */ | ||
| + private void loadConcat( BufferedReader concatData ) | ||
| + throws IOException { | ||
| + String line = null; | ||
| + List<String> concat = getConcat(); | ||
| + | ||
| + concat.clear(); | ||
| + | ||
| + while( (line = concatData.readLine()) != null ) { | ||
| + if( line.length() >= MIN_LEX_LENGTH ) { | ||
| + concat.add( line.toLowerCase() ); | ||
| + } | ||
| + } | ||
| + } | ||
| + | ||
| + /** | ||
| + * Returns the list of strings that have been concatenated together (such | ||
| + * as those in a database column name). | ||
| + */ | ||
| + private List<String> getConcat() { | ||
| + return this.concat; | ||
| + } | ||
| + | ||
| + /** | ||
| + * Returns a unique set of words, each having a probability calculated | ||
| + * using the relative frequency of the word. (The word that appears most | ||
| + * often in the dictionary's source corpus has a probability of 1.) | ||
| + */ | ||
| + private Map<String, Double> getDictionary() { | ||
| + return this.dictionary; | ||
| + } | ||
| + | ||
| + /** | ||
| + * Returns the default probability when no value is given. This is | ||
| + * likely an error in the lexicon that should be fixed. | ||
| + */ | ||
| + private Double getDefaultProbability() { | ||
| + return 0.0; | ||
| + } | ||
| +} | ||
| + | ||
| +package com.whitemagicsoftware.wordsplit; | ||
| + | ||
| +import java.util.List; | ||
| +import java.util.Map; | ||
| + | ||
| +/** | ||
| + * Defines the mechanism that allows the SegmentVisitor to collect statistics | ||
| + * on the combination of words that forms a possible solution to the | ||
| + * text splitting. | ||
| + */ | ||
| +public interface Visitor { | ||
| + /** | ||
| + * Returns details about the likelihood that the given list of words will | ||
| + * solve the text splitting problem. | ||
| + * | ||
| + * @param list - The list of split words. | ||
| + */ | ||
| + public SegmentAnalysis visit( List list ); | ||
| +} | ||
| +#Thu, 03 Feb 2011 22:09:43 -0800 | ||
| +build.major.number=00 | ||
| +build.minor.number=00 | ||
| +build.revision.number=26 | ||
| + | ||