简单分词算法(二分法,Java 实现)


package com.searchkiller;

/**
 * Simple bigram ("binary") word segmenter for text that mixes Chinese characters with
 * other characters such as English letters.
 *
 * <p>The input is viewed as a sequence of maximal runs:
 * <ul>
 *   <li>A run of Chinese characters (U+4E00 through U+9FA5) is cut into overlapping
 *       two-character tokens. A run made of a single Chinese character is emitted as-is.</li>
 *   <li>A run of any other characters is emitted whole as one token.</li>
 * </ul>
 *
 * <p>Example: {@code "中华人ABC民共DE和国"} produces
 * {@code "中华", "华人", "ABC", "民共", "DE", "和国"}.
 */
public class BinarySplit {

    /** First code unit of the range treated as Chinese (inclusive). */
    private static final char CHINESE_FIRST = '\u4e00';

    /** Last code unit of the range treated as Chinese (inclusive). */
    private static final char CHINESE_LAST = '\u9fa5';

    /** The text to segment. Kept package-private for compatibility with existing callers. */
    String splitString;

    /**
     * Creates a segmenter for the given text.
     *
     * @param splitString the text to segment; {@code null} is treated like an empty string
     *     by {@link #splitIt()}
     */
    public BinarySplit(String splitString) {
        this.splitString = splitString;
    }

    /**
     * Segments the text and prints every token on its own line to {@code System.out},
     * in the order the tokens occur in the text.
     *
     * <p>Prints nothing when the text is {@code null} or empty.
     */
    public void splitIt() {
        final String text = splitString;
        if (text == null) {
            return;
        }
        final int length = text.length();
        int runStart = 0;
        while (runStart < length) {
            final boolean chineseRun = isChinese(text.charAt(runStart));
            // Extend the run while the following characters are of the same kind.
            int runEnd = runStart + 1;
            while (runEnd < length && isChinese(text.charAt(runEnd)) == chineseRun) {
                runEnd++;
            }
            if (chineseRun) {
                printChineseRun(text, runStart, runEnd);
            } else {
                // Non-Chinese runs are emitted whole. Because the run is closed by the
                // scan above, a run that reaches the end of the text is emitted as well.
                System.out.println(text.substring(runStart, runEnd));
            }
            runStart = runEnd;
        }
    }

    /**
     * Prints the tokens of one Chinese run {@code text[start, end)}: the single character
     * when the run has length one, otherwise every overlapping pair of adjacent characters.
     */
    private static void printChineseRun(String text, int start, int end) {
        if (end - start == 1) {
            System.out.println(text.substring(start, end));
            return;
        }
        for (int i = start; i + 2 <= end; i++) {
            System.out.println(text.substring(i, i + 2));
        }
    }

    /** Returns whether {@code c} lies in the range U+4E00 through U+9FA5. */
    private static boolean isChinese(char c) {
        return c >= CHINESE_FIRST && c <= CHINESE_LAST;
    }

    /** Demo entry point: segments a sample string and prints the tokens. */
    public static void main(String[] args) {
        BinarySplit bs = new BinarySplit("中华人ABC民共DE和国人大代表");
        bs.splitIt();
    }

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值