下面是一种不需要词表支持的简单分词方法,主要适用于规模比较小的应用:中文部分采用二元切分,英文部分按空格和标点切分。
代码如下:
import java.util.*;
/**
 * Minimal dictionary-free segmenter intended for small applications.
 *
 * <p>Characters above code point 255 are treated as Chinese text and emitted as
 * overlapping two-character tokens (bigrams). All other characters are treated as
 * English/numeric text and split on ASCII punctuation and whitespace.
 *
 * <p>Instances hold the text to segment as mutable state and are not synchronized.
 */
public class Segment2
{
    /** Delimiter for the English/numeric part: one ASCII punctuation char or a run of whitespace. */
    private static final java.util.regex.Pattern EN_DELIMITER =
        java.util.regex.Pattern.compile("\\p{Punct}|\\s+");

    /**
     * Characters that are skipped when collecting Chinese text.
     * Only characters above 255 are ever checked against this table, so the
     * ASCII/Latin-1 entries are kept for compatibility and their full-width
     * forms are listed explicitly below.
     */
    char[] stopWords = new char[]
    {
        '。', ',', ';', ':', '“', '”', '(', ')', '!', '?', '◎', '#',
        '¥', '%', '…','※', '×', '【', '】', '『', '』', '《', '》', '、',
        '\uFF0C', // full-width comma
        '\uFF1B', // full-width semicolon
        '\uFF1A', // full-width colon
        '\uFF08', // full-width left parenthesis
        '\uFF09', // full-width right parenthesis
        '\uFF01', // full-width exclamation mark
        '\uFF1F', // full-width question mark
        '\uFF03', // full-width number sign
        '\uFFE5', // full-width yen sign
        '\uFF05'  // full-width percent sign
    };

    /** Text to be segmented; never null. */
    private String sourceText = "";

    /** Creates a segmenter with empty text. */
    public Segment2()
    {}

    /**
     * Sets the text to be segmented by the next call to {@link #segString()}.
     *
     * @param text the text to segment; {@code null} is treated as the empty string
     */
    public void setText(String text)
    {
        this.sourceText = (text == null) ? "" : text;
    }

    /**
     * Segments the current text.
     *
     * <p>The result lists all Chinese bigrams first (in text order), followed by all
     * English/numeric words (in text order). Bigrams are built over the Chinese
     * characters only, so they may span intervening English text, whitespace or
     * stop characters. A text with fewer than two Chinese characters yields no bigram.
     *
     * @return the tokens; an empty array if nothing could be extracted, never {@code null}
     */
    public String[] segString()
    {
        StringBuilder enWord = new StringBuilder(); // English words and digits
        StringBuilder cnWord = new StringBuilder(); // Chinese characters
        for (int i = 0; i < sourceText.length(); i++)
        {
            char c = sourceText.charAt(i);
            if (c > 255)
            {
                if (isWord(c))
                {
                    cnWord.append(c);
                }
                // Keep English words on both sides of non-ASCII text apart.
                enWord.append(' ');
            }
            else
            {
                enWord.append(c);
            }
        }

        List<String> tokens = new ArrayList<String>();

        // Bigram segmentation: every pair of adjacent Chinese characters is a token.
        for (int i = 0; i + 1 < cnWord.length(); i++)
        {
            tokens.add(cnWord.substring(i, i + 2));
        }

        // split() may yield empty strings (leading delimiter, adjacent delimiters,
        // empty input); those are not words.
        for (String word : EN_DELIMITER.split(enWord))
        {
            if (!word.isEmpty())
            {
                tokens.add(word);
            }
        }
        return tokens.toArray(new String[0]);
    }

    /** Prints each element of the array on its own line to standard output. */
    private void printArray(Object[] os)
    {
        for (Object o : os)
        {
            System.out.println(o);
        }
    }

    /**
     * Tells whether a character may be part of a Chinese token.
     *
     * @param c the character to check
     * @return {@code false} if {@code c} is listed in {@link #stopWords}, {@code true} otherwise
     */
    private boolean isWord(char c)
    {
        for (char stop : stopWords)
        {
            if (c == stop)
            {
                return false;
            }
        }
        return true;
    }

    /** Demo: segments a mixed Chinese/English sample and prints one token per line. */
    public static void main(String args[])
    {
        Segment2 seg = new Segment2();
        seg.setText("北京BeiJing 天安门 Tian An Men");
        seg.printArray(seg.segString());
    }
}
输出结果为:
北京
京天
天安
安门
BeiJing
Tian
An
Men