Skip to content

Commit

Permalink
initialize
Browse files Browse the repository at this point in the history
git-svn-id: http://language-detection.googlecode.com/svn/trunk@2 5bf80810-9c81-6ef6-3109-74533bb38634
  • Loading branch information
nakatani.shuyo committed Aug 24, 2010
1 parent b8acc6b commit 7e1acb3
Show file tree
Hide file tree
Showing 16 changed files with 999 additions and 0 deletions.
8 changes: 8 additions & 0 deletions .classpath
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" path="src"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
<classpathentry kind="lib" path="C:/Users/shuyo/workspace/langdetect/jsonic-1.2.0/jsonic-1.2.0.jar"/>
<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
<classpathentry kind="output" path="bin"/>
</classpath>
17 changes: 17 additions & 0 deletions .project
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>langdetect</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.jdt.core.javanature</nature>
</natures>
</projectDescription>
280 changes: 280 additions & 0 deletions .settings/org.eclipse.jdt.core.prefs

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions .settings/org.eclipse.jdt.ui.prefs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#Wed Aug 11 11:58:22 JST 2010
eclipse.preferences.version=1
formatter_profile=_gaia
formatter_settings_version=11
3 changes: 3 additions & 0 deletions .settings/org.eclipse.ltk.core.refactoring.prefs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#Wed Aug 18 20:01:31 JST 2010
eclipse.preferences.version=1
org.eclipse.ltk.core.refactoring.enable.project.refactoring.history=false
159 changes: 159 additions & 0 deletions src/com/cybozu/labs/langdetect/Command.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
package com.cybozu.labs.langdetect;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;

import com.cybozu.labs.langdetect.util.LangProfile;

import net.arnx.jsonic.JSON;
import net.arnx.jsonic.JSONException;

/**
 * Command-line front end for the language detector.
 *
 * <p>Exactly one mode flag selects the action:
 * <ul>
 * <li>{@code -gp} generate language profiles for the languages given as arguments,</li>
 * <li>{@code -ld} detect the language of each file given as an argument,</li>
 * <li>{@code -bt} run a batch test over tab-separated "language&lt;TAB&gt;text" files.</li>
 * </ul>
 * {@code -d <directory>} sets the working/profile directory (default {@code ./}).
 */
public class Command {
    /** Alpha value passed to {@link DetectorFactory#create(double)} for every detector. */
    private static final double ALPHA = 1;

    /** Option flag that takes a value (e.g. "-d") -> key under which the value is stored. */
    private HashMap<String, String> opt_with_value = new HashMap<String, String>();
    /** Key -> current value (default, or the value supplied on the command line). */
    private HashMap<String, String> values = new HashMap<String, String>();
    /** Flags seen on the command line that take no value (anything starting with "-"). */
    private HashSet<String> opt_without_value = new HashSet<String>();
    /** Remaining positional arguments, in command-line order. */
    private ArrayList<String> arglist = new ArrayList<String>();

    /**
     * Registers an option that takes a value.
     *
     * @param opt   flag as typed on the command line
     * @param key   name used to look the value up with {@link #get(String)}
     * @param value default used when the flag is absent
     */
    private void add(String opt, String key, String value) {
        opt_with_value.put(opt, key);
        values.put(key, value);
    }

    /**
     * @param key option key registered through {@link #add(String, String, String)}
     * @return the current value, or {@code null} for an unknown key
     */
    private String get(String key) {
        return values.get(key);
    }

    /**
     * Sorts the raw arguments into valued options, bare flags and positional arguments.
     *
     * @param args command-line arguments
     * @throws IllegalArgumentException if a valued option is the last argument and so has no value
     */
    private void parse(String[] args) {
        for (int i = 0; i < args.length; ++i) {
            String arg = args[i];
            if (opt_with_value.containsKey(arg)) {
                // Previously args[i+1] was read unchecked and overran the array for a trailing flag.
                if (i + 1 >= args.length) {
                    throw new IllegalArgumentException("Option " + arg + " requires a value");
                }
                values.put(opt_with_value.get(arg), args[i + 1]);
                ++i; // the value has been consumed
            } else if (arg.startsWith("-")) {
                opt_without_value.add(arg);
            } else {
                arglist.add(arg);
            }
        }
    }

    /**
     * @param opt flag to test
     * @return {@code true} if the value-less flag was present on the command line
     */
    private boolean hasOpt(String opt) {
        return opt_without_value.contains(opt);
    }

    /**
     * Builds one profile per language argument from
     * {@code <directory>/<lang>wiki-latest-abstract.xml.gz} and writes it as JSON to
     * {@code <directory>/profiles/<lang>}. Failures for one language are reported and the
     * remaining languages are still processed.
     */
    private void generateProfile() {
        String directory = get("directory") + "/";
        for (String lang : arglist) {
            File file = new File(directory + "profiles/" + lang);

            // Make sure the output directory exists before doing the expensive load.
            File parent = file.getParentFile();
            if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
                System.err.println("Cannot create profile directory: " + parent);
                continue;
            }

            String filename = directory + lang + "wiki-latest-abstract.xml.gz";
            LangProfile profile = GenProfile.load(lang, filename);
            profile.omitLessFreq();

            FileOutputStream os = null;
            try {
                os = new FileOutputStream(file);
                JSON.encode(profile, os);
            } catch (JSONException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // A failed close on an output stream can mean lost data, so report it.
                closeAndReport(os, file.getPath());
            }
        }
    }

    /**
     * Detects the language of every file argument (read as UTF-8) and prints
     * {@code <filename>:<probabilities>} for each. Reading stops as soon as the detector
     * reports convergence.
     */
    private void detect() {
        String profileDirectory = get("directory") + "/";
        DetectorFactory.loadProfile(profileDirectory);
        for (String filename : arglist) {
            Detector detector = DetectorFactory.create(ALPHA);
            BufferedReader is = null;
            try {
                is = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "utf-8"));
                char[] buf = new char[1024];
                int length;
                // read() returning -1 is the end-of-stream signal; ready() is not.
                // The convergence test guards both loops so no further input is consumed.
                while (!detector.isConvergence() && (length = is.read(buf)) != -1) {
                    for (int i = 0; i < length && !detector.isConvergence(); ++i) {
                        detector.append(buf[i]);
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                closeAndReport(is, filename);
            }
            // Printed even after a read error, using whatever was consumed so far.
            System.out.println(filename + ":" + detector.getProbabilities());
        }
    }

    /**
     * Runs the detector over every line of every file argument. Each line must be
     * {@code <correct language>\t<text>}; lines without a tab, or with an empty language,
     * are skipped. The cumulative map "correct language -> detected languages" is printed
     * after each file.
     */
    private void batchtest() {
        String profileDirectory = get("directory") + "/";
        DetectorFactory.loadProfile(profileDirectory);
        HashMap<String, ArrayList<String>> result = new HashMap<String, ArrayList<String>>();
        for (String filename : arglist) {
            BufferedReader is = null;
            try {
                is = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "utf-8"));
                String line;
                // readLine() returns null at end of stream.
                while ((line = is.readLine()) != null) {
                    int tab = line.indexOf('\t');
                    if (tab <= 0) {
                        continue;
                    }
                    String correctLang = line.substring(0, tab);
                    String text = line.substring(tab + 1);

                    Detector detector = DetectorFactory.create(ALPHA);
                    for (int j = 0; j < text.length() && !detector.isConvergence(); ++j) {
                        detector.append(text.charAt(j));
                    }
                    // Guard here so one undecidable line cannot abort the whole batch.
                    String lang = detector.getProbabilities().isEmpty() ? "unknown" : detector.detect();

                    ArrayList<String> detected = result.get(correctLang);
                    if (detected == null) {
                        detected = new ArrayList<String>();
                        result.put(correctLang, detected);
                    }
                    detected.add(lang);
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                closeAndReport(is, filename);
            }
            System.out.println(result);
        }
    }

    /** Closes {@code stream} if non-null, reporting (not swallowing) a failure on stderr. */
    private static void closeAndReport(java.io.Closeable stream, String name) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException e) {
            System.err.println("Failed to close " + name + ": " + e.getMessage());
        }
    }

    /**
     * Program entry point; see the class description for the accepted flags.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) {
        Command command = new Command();
        command.add("-d", "directory", "./");
        command.parse(args);

        if (command.hasOpt("-gp")) {
            command.generateProfile();
        } else if (command.hasOpt("-ld")) {
            command.detect();
        } else if (command.hasOpt("-bt")) {
            command.batchtest();
        } else {
            System.err.println("usage: Command (-gp|-ld|-bt) [-d directory] args...");
        }
    }
}
89 changes: 89 additions & 0 deletions src/com/cybozu/labs/langdetect/Detector.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package com.cybozu.labs.langdetect;

import java.util.ArrayList;
import java.util.HashMap;

import com.cybozu.labs.langdetect.util.NGram;

/**
 * Incremental language detector.
 *
 * <p>Characters are fed one at a time through {@link #append(char)}. After each character,
 * every n-gram (length 1..{@code NGram.N_GRAM}) that is present in the profile table updates
 * the per-language probabilities, which are then renormalized to sum to 1.
 */
public class Detector {
    /** Languages at or below this probability are left out of {@link #getProbabilities()}. */
    private static final double PROB_THRESHOLD = 0.1;
    /** Once the best language exceeds this probability the detector is flagged as converged. */
    private static final double CONV_THRESHOLD = 0.99999;
    /** Scale applied to an n-gram's profile probability before alpha is added. */
    public static final int BASE = 1000;
    /** Returned by {@link #detect()} when no language exceeds {@link #PROB_THRESHOLD}. */
    private static final String UNKNOWN_LANG = "unknown";

    /** n-gram -> (language -> probability of that n-gram in the language); shared with the factory. */
    private final HashMap<String, HashMap<String, Double>> p_ik;
    /** Names of all loaded languages; shared with the factory. */
    private final ArrayList<String> langlist;
    /** Current probability of each language. */
    private HashMap<String, Double> prob;
    private NGram ngram;
    /** Additive term applied to every language on each update (see {@link #append(char)}). */
    private double alpha;
    private boolean convergence;

    /**
     * Creates a detector over the profiles currently held by the factory.
     *
     * @param instance_ factory supplying the n-gram table and the language list
     */
    public Detector(DetectorFactory instance_) {
        p_ik = instance_.p_ik;
        langlist = instance_.langlist;
        prob = new HashMap<String, Double>();
        // Uniform prior that sums to 1, so the values are a real distribution (and the
        // threshold in getProbabilities() is meaningful) even before any n-gram is seen.
        double prior = 1.0 / langlist.size();
        for (String lang : langlist) {
            prob.put(lang, prior);
        }
        ngram = new NGram();
        alpha = 1.0;
        convergence = false;
    }

    /**
     * Feeds one character to the detector and updates the language probabilities.
     *
     * @param ch next character of the text
     */
    public void append(char ch) {
        ngram.addChar(ch);
        for (int length = 1; length <= NGram.N_GRAM; ++length) {
            String word = ngram.get(length);
            if (word == null) {
                continue;
            }
            HashMap<String, Double> wordprob = p_ik.get(word);
            if (wordprob == null) {
                continue; // n-gram unknown to every profile
            }

            // First pass: total of the unnormalized updated probabilities.
            double amount = 0;
            for (java.util.Map.Entry<String, Double> entry : prob.entrySet()) {
                amount += updated(entry.getValue(), wordprob.get(entry.getKey()));
            }
            // A zero (or NaN) total, e.g. with alpha set to 0, would turn every probability
            // into NaN on division; ignore this n-gram instead of corrupting the state.
            if (!(amount > 0)) {
                continue;
            }

            // Second pass: store the normalized values and track the maximum probability.
            double maxProb = 0.0;
            for (java.util.Map.Entry<String, Double> entry : prob.entrySet()) {
                double p = updated(entry.getValue(), wordprob.get(entry.getKey())) / amount;
                if (maxProb < p) {
                    maxProb = p;
                }
                entry.setValue(p);
            }
            if (maxProb > CONV_THRESHOLD) {
                convergence = true;
            }
        }
    }

    /**
     * Applies one n-gram observation to a language's probability (before normalization).
     *
     * @param current  the language's current probability
     * @param wordProb the n-gram's probability in that language, or {@code null} if the
     *                 language's profile does not contain the n-gram
     */
    private double updated(double current, Double wordProb) {
        double likelihood = (wordProb == null) ? alpha : alpha + wordProb * BASE;
        return current * likelihood / (alpha + BASE);
    }

    /**
     * @return the languages whose probability exceeds {@link #PROB_THRESHOLD}, ordered from
     *         most to least probable; empty if none qualifies
     */
    public ArrayList<Language> getProbabilities() {
        ArrayList<Language> list = new ArrayList<Language>();
        for (java.util.Map.Entry<String, Double> entry : prob.entrySet()) {
            double p = entry.getValue();
            if (p <= PROB_THRESHOLD) {
                continue;
            }
            // Insert before the first less-probable entry to keep the list in descending order.
            int position = 0;
            while (position < list.size() && !(list.get(position).prob < p)) {
                ++position;
            }
            list.add(position, new Language(entry.getKey(), p));
        }
        return list;
    }

    /**
     * @return the name of the most probable language, or {@code "unknown"} when no language
     *         exceeds {@link #PROB_THRESHOLD} (previously this case threw
     *         {@link IndexOutOfBoundsException})
     */
    public String detect() {
        ArrayList<Language> probabilities = getProbabilities();
        if (probabilities.isEmpty()) {
            return UNKNOWN_LANG;
        }
        return probabilities.get(0).lang;
    }

    /**
     * @param alpha additive term used by subsequent {@link #append(char)} calls
     */
    public void setAlpha(double alpha) {
        this.alpha = alpha;
    }

    /**
     * @return {@code true} once some language's probability has exceeded the convergence threshold
     */
    public boolean isConvergence() {
        return convergence;
    }

}
72 changes: 72 additions & 0 deletions src/com/cybozu/labs/langdetect/DetectorFactory.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package com.cybozu.labs.langdetect;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;

import net.arnx.jsonic.JSON;
import net.arnx.jsonic.JSONException;

import com.cybozu.labs.langdetect.util.LangProfile;

/**
 * Holds the loaded language profiles in a single shared instance and creates
 * {@link Detector}s that read from them.
 */
public class DetectorFactory {
    /** n-gram -> (language -> relative frequency of that n-gram in the language). */
    public HashMap<String, HashMap<String, Double>> p_ik;
    /** Names of the languages loaded so far, in load order. */
    public ArrayList<String> langlist;

    private DetectorFactory() {
        p_ik = new HashMap<String, HashMap<String, Double>>();
        langlist = new ArrayList<String>();
    }

    /** The single shared instance that all static methods operate on. */
    static private final DetectorFactory instance_ = new DetectorFactory();

    /**
     * Loads every regular file in {@code profileDirectory} as a JSON-encoded
     * {@link LangProfile}. A file that cannot be read or decoded is reported and skipped.
     *
     * @param profileDirectory directory containing one profile file per language
     * @throws IllegalArgumentException if {@code profileDirectory} is not a listable directory
     * @throws RuntimeException if two files declare the same language (see {@link #addProfile})
     */
    public static void loadProfile(String profileDirectory) {
        File dir = new File(profileDirectory);
        // listFiles() returns null (not an empty array) for a missing or non-directory path.
        File[] files = dir.listFiles();
        if (files == null) {
            throw new IllegalArgumentException("Not a readable profile directory: " + profileDirectory);
        }
        for (File file : files) {
            if (!file.isFile()) {
                continue; // sub-directories cannot be profiles
            }
            FileInputStream is = null;
            try {
                is = new FileInputStream(file);
                LangProfile profile = JSON.decode(is, LangProfile.class);
                addProfile(profile);
            } catch (JSONException e) {
                System.out.println(file.getName());
                e.printStackTrace();
            } catch (IOException e) {
                System.out.println(file.getName());
                e.printStackTrace();
            } finally {
                if (is != null) {
                    try {
                        is.close();
                    } catch (IOException ignored) {
                        // Nothing useful can be done if closing a fully-read input stream fails.
                    }
                }
            }
        }
    }

    /**
     * Registers a language profile and merges its n-gram frequencies into the shared table.
     * Each n-gram's count is divided by the profile's total for n-grams of that length.
     *
     * @param profile profile to add
     * @throws IllegalArgumentException (a {@link RuntimeException}) if a profile with the
     *         same language name has already been added
     */
    static public void addProfile(LangProfile profile) {
        String lang = profile.name;
        if (instance_.langlist.contains(lang)) {
            throw new IllegalArgumentException("Duplicate language profile: " + lang);
        }
        instance_.langlist.add(lang);
        for (String word : profile.freq.keySet()) {
            int length = word.length();
            // Skip n-grams whose length has no total (empty key, or longer than n_words
            // covers) instead of failing the whole profile with an index error.
            if (length < 1 || length > profile.n_words.length) {
                continue;
            }
            double total = profile.n_words[length - 1];
            if (total <= 0) {
                continue; // a zero total would store an Infinity/NaN probability
            }
            HashMap<String, Double> langProbs = instance_.p_ik.get(word);
            if (langProbs == null) {
                langProbs = new HashMap<String, Double>();
                instance_.p_ik.put(word, langProbs);
            }
            double prob = profile.freq.get(word).doubleValue() / total;
            langProbs.put(lang, prob);
        }
    }

    /**
     * @return a new detector over the currently loaded profiles, using the detector's default alpha
     */
    public static Detector create() {
        return new Detector(instance_);
    }

    /**
     * @param alpha value passed to {@link Detector#setAlpha(double)} on the new detector
     * @return a new detector over the currently loaded profiles
     */
    public static Detector create(double alpha) {
        Detector detector = new Detector(instance_);
        detector.setAlpha(alpha);
        return detector;
    }
}
Loading

0 comments on commit 7e1acb3

Please sign in to comment.