这里提供我开发的中文分词器源代码。代码分为三个部分:
(一)状态矩阵元素对象GNode。 动态规划中,状态矩阵的元素需要记录当前最大概率和前一个最优匹配状态,这个矩阵是一个二维数组,每一个元素就是一个GNode对象。
(二)统计学习使用的嵌套哈希表THash。 用于统计学习时记录结果的HashMap,这个哈希表有三层嵌套,对于输入数据有一些必要操作。为了简单起见,我将这个哈希表进行了封装,并添加了几个方法以方便程序的编写。
(三)分词器对象CGSegmenter。 其中封装了学习方法,标注器行为等,是分词器的主要功能对象。
注:本文只分享代码,不对代码进行分析解释。如需了解原理和代码分析,请参考博文《【中文分词】亲手开发一款中文分词器——原理》。
源代码如下:
GNode
/**
 * One cell of the dynamic-programming state matrix used by the segmenter.
 *
 * Per the accompanying article, each cell records the best probability found
 * so far together with the previous best-matching state.
 */
public class GNode {
    /** Best probability recorded for this cell; a freshly created node holds 0.0. */
    public Double MaxPos = 0.0;

    /** Tag of this cell (presumably the tag assigned to the current character -- confirm against the segmenter). */
    public char CurTag;

    /** Tag of the preceding state (presumably the back-pointer used when tracing the best path -- confirm). */
    public char PreTag;

    /** Creates a node with probability 0.0 and both tags left at the default char value. */
    public GNode() {
    }
}
THash
/**
 * Nested hash table used while learning transition statistics.
 *
 * Layout: character -> tag -> next character -> next tag -> count, i.e. how
 * often a character carrying a given tag was followed by a given next
 * character carrying a given next tag. Counts are turned into probabilities
 * only when written out by
 * {@link #calculatePossibilityForAllCombinations(String, String)}.
 */
public class THash {
    // Nested table: character -> tag -> next character -> next tag -> observation count (stored as Double).
    private HashMap<Character, HashMap<Character, HashMap<Character, HashMap<Character, Double>>>> _myHashMap;
    // Count stored the first time a combination is seen.
    private static final Double INITIAL_VALUE = 1.0;
    // Factor applied to each group's total before dividing, so the emitted
    // probabilities of a group sum to slightly less than 1.
    private static final Double POSSIBILITY_INTERPOLATE_VALUE = 1.02;

    /** Creates an empty table. */
    public THash() {
        this._myHashMap = new HashMap<>();
    }

    /**
     * Records one observation of the transition
     * (pri_key tagged pri_tag) -> (sec_key tagged sec_tag).
     *
     * The first observation stores the initial count; each later observation
     * of the same combination increases the stored count by one.
     *
     * @param pri_key the first character
     * @param pri_tag the tag of the first character
     * @param sec_key the following character
     * @param sec_tag the tag of the following character
     */
    public void PutValue(char pri_key, char pri_tag, char sec_key, char sec_tag) {
        HashMap<Character, HashMap<Character, HashMap<Character, Double>>> prihash = this._myHashMap.get(pri_key);
        if (prihash == null) {
            prihash = new HashMap<>();
            this._myHashMap.put(pri_key, prihash);
        }

        HashMap<Character, HashMap<Character, Double>> seccharhash = prihash.get(pri_tag);
        if (seccharhash == null) {
            seccharhash = new HashMap<>();
            prihash.put(pri_tag, seccharhash);
        }

        HashMap<Character, Double> sectaghash = seccharhash.get(sec_key);
        if (sectaghash == null) {
            sectaghash = new HashMap<>();
            seccharhash.put(sec_key, sectaghash);
        }

        Double count = sectaghash.get(sec_tag);
        if (count == null) {
            sectaghash.put(sec_tag, THash.INITIAL_VALUE);
        } else {
            // Double is immutable: incrementing a local copy does not change the
            // map entry, so the new count must be stored back explicitly.
            sectaghash.put(sec_tag, count + 1.0);
        }
    }

    /**
     * Writes the probability of every recorded combination to a file, one per line.
     *
     * Each line is the four characters pri_key, pri_tag, sec_key, sec_tag
     * immediately followed by the probability, with no separators. The
     * probability is the combination's count divided by the sum of counts over
     * all next tags for the same (pri_key, pri_tag, sec_key), after that sum
     * has been multiplied by the interpolation factor.
     *
     * The file is created if missing and overwritten otherwise. I/O failures
     * are reported on standard error and not rethrown.
     *
     * @param path   path of the output file
     * @param format name of the charset used to encode the output, e.g. "UTF-8"
     */
    public void calculatePossibilityForAllCombinations(String path, String format) {
        // PrintWriter(path, charset) creates or truncates the file itself;
        // try-with-resources closes it even if writing fails part-way.
        try (PrintWriter writer = new PrintWriter(path, format)) {
            for (Character pri_key : this._myHashMap.keySet()) {
                HashMap<Character, HashMap<Character, HashMap<Character, Double>>> _pritaghash =
                        this._myHashMap.get(pri_key);
                for (Character pri_tag : _pritaghash.keySet()) {
                    HashMap<Character, HashMap<Character, Double>> _sechash = _pritaghash.get(pri_tag);
                    for (Character sec_key : _sechash.keySet()) {
                        HashMap<Character, Double> _sectaghash = _sechash.get(sec_key);

                        // Normalisation denominator for this (pri_key, pri_tag, sec_key) group.
                        double total = 0.0;
                        for (Double count : _sectaghash.values()) {
                            total += count;
                        }
                        total *= THash.POSSIBILITY_INTERPOLATE_VALUE;

                        for (Character sec_tag : _sectaghash.keySet()) {
                            StringBuilder sb = new StringBuilder();
                            sb.append(pri_key);
                            sb.append(pri_tag);
                            sb.append(sec_key);
                            sb.append(sec_tag);
                            sb.append(_sectaghash.get(sec_tag) / total);
                            writer.println(sb.toString());
                        }
                    }
                }
            }
        } catch (IOException e) {
            System.err
                    .println("Error in method calculatePossibilityForAllCombinations()");
        }
    }
}
CGSegmenter
public class CGSegmenter {
private String TrainingMaterialPath;
private String TaggedTrainingMaterialPath;
private String FinalTagFilePathForRelation;
private String FinalTagFilePathForSingle;
private String DefaultFileFormat = "UTF-