forked from shuyo/language-detection
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
git-svn-id: http://language-detection.googlecode.com/svn/trunk@2 5bf80810-9c81-6ef6-3109-74533bb38634
- Loading branch information
nakatani.shuyo
committed
Aug 24, 2010
1 parent
b8acc6b
commit 7e1acb3
Showing
16 changed files
with
999 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<classpath> | ||
<classpathentry kind="src" path="src"/> | ||
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/> | ||
<classpathentry kind="lib" path="C:/Users/shuyo/workspace/langdetect/jsonic-1.2.0/jsonic-1.2.0.jar"/> | ||
<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/> | ||
<classpathentry kind="output" path="bin"/> | ||
</classpath> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<projectDescription> | ||
<name>langdetect</name> | ||
<comment></comment> | ||
<projects> | ||
</projects> | ||
<buildSpec> | ||
<buildCommand> | ||
<name>org.eclipse.jdt.core.javabuilder</name> | ||
<arguments> | ||
</arguments> | ||
</buildCommand> | ||
</buildSpec> | ||
<natures> | ||
<nature>org.eclipse.jdt.core.javanature</nature> | ||
</natures> | ||
</projectDescription> |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
#Wed Aug 11 11:58:22 JST 2010 | ||
eclipse.preferences.version=1 | ||
formatter_profile=_gaia | ||
formatter_settings_version=11 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
#Wed Aug 18 20:01:31 JST 2010 | ||
eclipse.preferences.version=1 | ||
org.eclipse.ltk.core.refactoring.enable.project.refactoring.history=false |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
package com.cybozu.labs.langdetect; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.File; | ||
import java.io.FileInputStream; | ||
import java.io.FileOutputStream; | ||
import java.io.IOException; | ||
import java.io.InputStreamReader; | ||
import java.util.ArrayList; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
|
||
import com.cybozu.labs.langdetect.util.LangProfile; | ||
|
||
import net.arnx.jsonic.JSON; | ||
import net.arnx.jsonic.JSONException; | ||
|
||
public class Command { | ||
private static final double ALPHA = 1; | ||
private HashMap<String, String> opt_with_value = new HashMap<String, String>(); | ||
private HashMap<String, String> values = new HashMap<String, String>(); | ||
private HashSet<String> opt_without_value = new HashSet<String>(); | ||
private ArrayList<String> arglist = new ArrayList<String>(); | ||
|
||
private void add(String opt, String key, String value) { | ||
opt_with_value.put(opt, key); | ||
values.put(key, value); | ||
} | ||
private String get(String key) { | ||
return values.get(key); | ||
} | ||
private void parse(String[] args) { | ||
for(int i=0;i<args.length;++i) { | ||
if (opt_with_value.containsKey(args[i])) { | ||
String key = opt_with_value.get(args[i]); | ||
values.put(key, args[i+1]); | ||
++i; | ||
} else if (args[i].startsWith("-")) { | ||
opt_without_value.add(args[i]); | ||
} else { | ||
arglist.add(args[i]); | ||
} | ||
} | ||
} | ||
|
||
private boolean hasOpt(String opt) { | ||
return opt_without_value.contains(opt); | ||
} | ||
|
||
private void generateProfile() { | ||
String directory = get("directory") + "/"; | ||
for (String lang: arglist) { | ||
String filename = directory + lang + "wiki-latest-abstract.xml.gz"; | ||
|
||
LangProfile profile = GenProfile.load(lang, filename); | ||
profile.omitLessFreq(); | ||
|
||
File file = new File(directory + "profiles/" + lang); | ||
FileOutputStream os = null; | ||
try { | ||
os = new FileOutputStream(file); | ||
JSON.encode(profile, os); | ||
} catch (JSONException e) { | ||
e.printStackTrace(); | ||
} catch (IOException e) { | ||
e.printStackTrace(); | ||
} finally { | ||
try { | ||
if (os!=null) os.close(); | ||
} catch (IOException e) {} | ||
} | ||
} | ||
} | ||
|
||
private void detect() { | ||
String profileDirectory = get("directory") + "/"; | ||
DetectorFactory.loadProfile(profileDirectory); | ||
for (String filename: arglist) { | ||
Detector detector = DetectorFactory.create(ALPHA); | ||
BufferedReader is = null; | ||
try { | ||
is = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "utf-8")); | ||
char[] buf = new char[1024]; | ||
while (is.ready()) { | ||
int length = is.read(buf); | ||
for(int i=0;i<length;++i) { | ||
detector.append(buf[i]); | ||
if (detector.isConvergence()) break; | ||
} | ||
} | ||
} catch (IOException e) { | ||
// TODO Auto-generated catch block | ||
e.printStackTrace(); | ||
} finally { | ||
try { | ||
if (is!=null) is.close(); | ||
} catch (IOException e) {} | ||
} | ||
System.out.println(filename + ":" + detector.getProbabilities()); | ||
|
||
} | ||
} | ||
|
||
private void batchtest() { | ||
String profileDirectory = get("directory") + "/"; | ||
DetectorFactory.loadProfile(profileDirectory); | ||
HashMap<String, ArrayList<String>> result = new HashMap<String, ArrayList<String>>(); | ||
for (String filename: arglist) { | ||
BufferedReader is = null; | ||
try { | ||
is = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "utf-8")); | ||
while (is.ready()) { | ||
String line = is.readLine(); | ||
int i = line.indexOf('\t'); | ||
if (i<=0) continue; | ||
String correctLang = line.substring(0, i); | ||
String text = line.substring(i+1); | ||
|
||
Detector detector = DetectorFactory.create(ALPHA); | ||
for(int j=0;j<text.length();++j) { | ||
detector.append(text.charAt(j)); | ||
if (detector.isConvergence()) break; | ||
} | ||
String lang = detector.detect(); | ||
if (!result.containsKey(correctLang)) result.put(correctLang, new ArrayList<String>()); | ||
result.get(correctLang).add(lang); | ||
} | ||
|
||
} catch (IOException e) { | ||
// TODO Auto-generated catch block | ||
e.printStackTrace(); | ||
} finally { | ||
try { | ||
if (is!=null) is.close(); | ||
} catch (IOException e) {} | ||
} | ||
System.out.println(result); | ||
|
||
} | ||
|
||
} | ||
|
||
/** | ||
* @param args | ||
*/ | ||
public static void main(String[] args) { | ||
Command command = new Command(); | ||
command.add("-d", "directory", "./"); | ||
command.parse(args); | ||
|
||
if (command.hasOpt("-gp")) { | ||
command.generateProfile(); | ||
} else if (command.hasOpt("-ld")) { | ||
command.detect(); | ||
} else if (command.hasOpt("-bt")) { | ||
command.batchtest(); | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
package com.cybozu.labs.langdetect; | ||
|
||
import java.util.ArrayList; | ||
import java.util.HashMap; | ||
|
||
import com.cybozu.labs.langdetect.util.NGram; | ||
|
||
public class Detector { | ||
private static final double PROB_THRESHOLD = 0.1; | ||
private static final double CONV_THRESHOLD = 0.99999; | ||
public static final int BASE = 1000; | ||
private final HashMap<String, HashMap<String, Double>> p_ik; | ||
private final ArrayList<String> langlist; | ||
private HashMap<String, Double> prob; | ||
private NGram ngram; | ||
private double alpha; | ||
private boolean convergence; | ||
|
||
public Detector(DetectorFactory instance_) { | ||
p_ik = instance_.p_ik; | ||
langlist = instance_.langlist; | ||
prob = new HashMap<String, Double>(); | ||
for(String lang: langlist) { | ||
prob.put(lang, 1.0); | ||
} | ||
ngram = new NGram(); | ||
alpha = 1.0; | ||
convergence = false; | ||
} | ||
|
||
public void append(char ch) { | ||
ngram.addChar(ch); | ||
for(int length=1;length<=NGram.N_GRAM;++length){ | ||
String word = ngram.get(length); | ||
if (word == null || !p_ik.containsKey(word)) continue; | ||
HashMap<String, Double> wordprob = p_ik.get(word); | ||
double amount = 0; | ||
for(String lang: prob.keySet()) { | ||
double p = prob.get(lang); | ||
if (wordprob.containsKey(lang)) { | ||
p *= alpha + wordprob.get(lang) * BASE; | ||
} else { | ||
p *= alpha; | ||
} | ||
//p /= p_ik.size() * alpha + BASE; | ||
p /= alpha + BASE; | ||
prob.put(lang, p); | ||
amount += p; | ||
} | ||
// normalization & maximun probability | ||
double maxProb = 0.0; | ||
for(String lang: prob.keySet()) { | ||
double p = prob.get(lang) / amount; | ||
if (maxProb < p) maxProb = p; | ||
prob.put(lang, p); | ||
} | ||
if (maxProb > CONV_THRESHOLD) convergence = true; | ||
} | ||
} | ||
|
||
public ArrayList<Language> getProbabilities() { | ||
ArrayList<Language> list = new ArrayList<Language>(); | ||
for(String lang: prob.keySet()) { | ||
double p = prob.get(lang); | ||
if (p > PROB_THRESHOLD) { | ||
for (int i=0;i<=list.size();++i) { | ||
if (i==list.size() || list.get(i).prob < p) { | ||
list.add(i, new Language(lang, p)); | ||
break; | ||
} | ||
} | ||
|
||
} | ||
} | ||
return list ; | ||
} | ||
|
||
public String detect() { | ||
return getProbabilities().get(0).lang; | ||
} | ||
|
||
public void setAlpha(double alpha) { | ||
this.alpha = alpha; | ||
} | ||
public boolean isConvergence() { | ||
return convergence; | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
package com.cybozu.labs.langdetect; | ||
|
||
import java.io.File; | ||
import java.io.FileInputStream; | ||
import java.io.IOException; | ||
import java.util.ArrayList; | ||
import java.util.HashMap; | ||
|
||
import net.arnx.jsonic.JSON; | ||
import net.arnx.jsonic.JSONException; | ||
|
||
import com.cybozu.labs.langdetect.util.LangProfile; | ||
|
||
public class DetectorFactory { | ||
public HashMap<String, HashMap<String, Double>> p_ik; | ||
public ArrayList<String> langlist; | ||
private DetectorFactory() { | ||
p_ik = new HashMap<String, HashMap<String, Double>>(); | ||
langlist = new ArrayList<String>(); | ||
} | ||
static private DetectorFactory instance_ = new DetectorFactory(); | ||
|
||
public static void loadProfile(String profileDirectory) { | ||
File dir = new File(profileDirectory); | ||
for (File file: dir.listFiles()) { | ||
FileInputStream is = null; | ||
try { | ||
is = new FileInputStream(file); | ||
LangProfile profile = JSON.decode(is, LangProfile.class); | ||
addProfile(profile); | ||
} catch (JSONException e) { | ||
// TODO Auto-generated catch block | ||
System.out.println(file.getName()); | ||
e.printStackTrace(); | ||
} catch (IOException e) { | ||
// TODO Auto-generated catch block | ||
System.out.println(file.getName()); | ||
e.printStackTrace(); | ||
} finally { | ||
try { | ||
if (is!=null) | ||
is.close(); | ||
} catch (IOException e) {} | ||
} | ||
} | ||
} | ||
|
||
static public void addProfile(LangProfile profile) { | ||
String lang = profile.name; | ||
if (instance_.langlist.contains(lang)) { | ||
// TODO: | ||
throw new RuntimeException(); | ||
} | ||
instance_.langlist.add(lang); | ||
for (String word: profile.freq.keySet()) { | ||
if (!instance_.p_ik.containsKey(word)) { | ||
instance_.p_ik.put(word, new HashMap<String, Double>()); | ||
} | ||
double prob = profile.freq.get(word).doubleValue() / profile.n_words[word.length()-1]; | ||
instance_.p_ik.get(word).put(lang, prob); | ||
} | ||
} | ||
public static Detector create() { | ||
return new Detector(instance_); | ||
} | ||
|
||
public static Detector create(double alpha) { | ||
Detector detector = new Detector(instance_); | ||
detector.setAlpha(alpha); | ||
return detector; | ||
} | ||
} |
Oops, something went wrong.