关于如何使用庖丁解牛分词工具可参考:http://www.letiantian.me/2014-11-26-word-segmentation-paoding-analysis/
该工具可实现自定义词典,对于有些特殊的词,比如明星名字林心如霍建华等,可构建词典以.dic为后缀,放入paoding-analysis-2.0.4-beta\dic目录下,然后新建Java工程即可~
对于一些停用词,也可以处理~
主要代码如下:
import java.io.IOException;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.*;
public class fenciMain3 {
    /** Path to the stop-word table: one stop word per line, UTF-8 encoded. */
    public static final String stopWordTable = "." + File.separator + "srcFile" + File.separator + "StopWordTable_all.txt";

    public static void main(String[] args) throws IOException {
        String srcFile = "." + File.separator + "srcFile" + File.separator + "user_tag_query.txt";
        String destFile = "." + File.separator + "destFile" + File.separator + "fileExcludeStopWord2.0.txt";
        new fenciMain3().fenciMain3(srcFile, destFile);
        System.out.println("OVER DONE!!!!!!!!!!");
    }

    /**
     * Segments each line of {@code srcFile} with the Paoding analyzer, removes
     * stop words and any token containing a digit or an ASCII letter (checked
     * from the 5th token onward), and writes the surviving tokens, space
     * separated, to {@code destFile} — one output line per input line.
     *
     * NOTE(review): despite the name this is an ordinary method, not a
     * constructor (it declares a {@code void} return type). Kept as-is so
     * existing callers keep working.
     *
     * @param srcFile  path of the input text file (one record per line)
     * @param destFile path of the output file; created/overwritten
     */
    public void fenciMain3(String srcFile, String destFile) {
        // FIX: try-with-resources closes all three streams on every path.
        // Previously StopWordFileBr was never closed at all, and the other
        // two leaked whenever an exception was thrown mid-processing.
        // FIX: the source reader and output writer now declare UTF-8
        // explicitly, matching the stop-word reader; with the platform
        // default charset, stop words could fail to match segmented text.
        try (BufferedReader srcFileBr = new BufferedReader(new InputStreamReader(new FileInputStream(new File(srcFile)), "UTF-8"));
             BufferedReader stopWordFileBr = new BufferedReader(new InputStreamReader(new FileInputStream(stopWordTable), "UTF-8"));
             BufferedWriter destFileBw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(destFile)), "UTF-8"))) {
            // Load the stop-word table into a set for O(1) membership tests.
            Set<String> stopWordSet = new HashSet<String>();
            String stopWord;
            while ((stopWord = stopWordFileBr.readLine()) != null) {
                stopWordSet.add(stopWord);
            }
            paodingfenci pd = new paodingfenci();
            String paragraph;
            while ((paragraph = srcFileBr.readLine()) != null) {
                // fenci01 returns the segmented line as space-separated tokens.
                String[] resultArray = pd.fenci01(paragraph).split(" ");
                // Filtering starts at index 4: the first 4 tokens are kept
                // unconditionally — presumably per-record metadata in
                // user_tag_query.txt (TODO: confirm against the file format).
                for (int i = 4; i < resultArray.length; i++) {
                    if (stopWordSet.contains(resultArray[i])) {
                        resultArray[i] = null;
                        continue;
                    }
                    // Drop any token containing a digit or an ASCII letter.
                    for (int j = resultArray[i].length() - 1; j >= 0; j--) {
                        char c = resultArray[i].charAt(j);
                        if (Character.isDigit(c)
                                || (c >= 'a' && c <= 'z')
                                || (c >= 'A' && c <= 'Z')) {
                            resultArray[i] = null;
                            break;
                        }
                    }
                }
                // Re-join the surviving tokens, each followed by a space.
                // StringBuilder replaces StringBuffer: no synchronization needed.
                StringBuilder finalStr = new StringBuilder();
                for (int i = 0; i < resultArray.length; i++) {
                    if (resultArray[i] != null) {
                        finalStr.append(resultArray[i]).append(" ");
                    }
                }
                destFileBw.write(finalStr.toString());
                destFileBw.newLine();
            }
        } catch (FileNotFoundException e) {
            // Preserved original behavior: report and continue (best-effort tool).
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}