package com.searchkiller;

import java.util.ArrayList;
import java.util.List;
/**
 * Simple bigram ("binary") word splitter for text that mixes Chinese characters
 * with other characters such as Latin letters.
 *
 * <p>The input is cut into maximal runs of Chinese characters (U+4E00..U+9FA5)
 * and maximal runs of everything else. Each Chinese run is emitted as its
 * overlapping two-character windows (a run of a single character is emitted
 * as-is); each non-Chinese run is emitted whole.
 *
 * <p>Example: {@code "中华人ABC民共DE和国"} is split into
 * {@code "中华", "华人", "ABC", "民共", "DE", "和国"}.
 */
public class BinarySplit {

    /** First code unit of the range treated as Chinese (U+4E00). */
    private static final char CHINESE_FIRST = 0x4E00;

    /** Last code unit of the range treated as Chinese (U+9FA5). */
    private static final char CHINESE_LAST = 0x9FA5;

    /** Text to split; {@code null} is treated like an empty string. */
    String splitString;

    /**
     * Creates a splitter for the given text.
     *
     * @param splitString the text to split; may be {@code null}
     */
    public BinarySplit(String splitString) {
        this.splitString = splitString;
    }

    /**
     * Splits the text and prints every token on its own line to {@code System.out},
     * in the order the tokens occur in the text.
     */
    public void splitIt() {
        for (String token : split()) {
            System.out.println(token);
        }
    }

    /**
     * Splits the text into tokens.
     *
     * @return a new mutable list of tokens in order of occurrence; empty when the
     *         text is {@code null} or empty
     */
    public List<String> split() {
        List<String> tokens = new ArrayList<String>();
        if (splitString == null) {
            return tokens;
        }
        int length = splitString.length();
        int runStart = 0;
        while (runStart < length) {
            boolean chineseRun = isChinese(splitString.charAt(runStart));
            // Extend the run while the following characters are of the same kind.
            int runEnd = runStart + 1;
            while (runEnd < length && isChinese(splitString.charAt(runEnd)) == chineseRun) {
                runEnd++;
            }
            if (chineseRun) {
                addBigrams(tokens, runStart, runEnd);
            } else {
                // A non-Chinese run is always emitted, including one that ends the
                // text (the previous implementation silently dropped that case).
                tokens.add(splitString.substring(runStart, runEnd));
            }
            runStart = runEnd;
        }
        return tokens;
    }

    /** Appends the overlapping two-character windows of the Chinese run [start, end). */
    private void addBigrams(List<String> tokens, int start, int end) {
        if (end - start < 2) {
            // A lone Chinese character has no bigram; emit it unchanged.
            tokens.add(splitString.substring(start, end));
            return;
        }
        for (int i = start; i + 2 <= end; i++) {
            tokens.add(splitString.substring(i, i + 2));
        }
    }

    /** Returns whether {@code c} lies in the range U+4E00..U+9FA5. */
    private static boolean isChinese(char c) {
        return c >= CHINESE_FIRST && c <= CHINESE_LAST;
    }

    /** Demo entry point: splits a sample string and prints the tokens. */
    public static void main(String[] args) {
        BinarySplit bs = new BinarySplit("中华人ABC民共DE和国人大代表");
        bs.splitIt();
    }
}
简单分词算法(二分法,java实现)
最新推荐文章于 2021-06-13 20:54:06 发布