package SimilarityCompution;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.HashSet;
import java.util.Set;
import ICTCLAS.I3S.AC.ICTCLAS50;
public class FileExcludeStopWord {
//停用词词表
public static final String stopWordTable = "." + File.separator + "srcFile" + File.separator + "StopWordTable.txt";
public static void main(String[] args) {
//源文件和目的文件
String srcFile = "." + File.separator + "srcFile" + File.separator + "如何正确的使用化妆品效.txt";
String destFile = "." + File.separator + "destFile" + File.separator + "如何正确的使用化妆品效.txt";
new FileExcludeStopWord().fileExcludeStopWord(srcFile, destFile);
}
public void fileExcludeStopWord(String srcFile,StringdestFile){
try {
//读取原文件和停用词表
BufferedReadersrcFileBr = new BufferedReader(new InputStreamReader(new FileInputStream(new File(srcFile))));
BufferedReaderStopWordFileBr = new BufferedReader(new InputStreamReader(new FileInputStream(new File(stopWordTable))));
//将去除停用词的文本信息存入输出文件
BufferedWriterdestFileBw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(destFile))));
//用来存放停用词的集合 Set stopWordSet = new HashSet<String>(); //初如化停用词集 String stopWord = null; for(; (stopWord = StopWordFileBr.readLine()) != null;){ stopWordSet.add(stopWord); } //分词工具 ICTCLAS50 ICTCLAS = new ICTCLAS50();
// 初始化分词所用库的路径
String argu = ".";
if (ICTCLAS.ICTCLAS_Init(argu.getBytes("gb2312")) == false) {
System.out.println("分词所用库初始化失败。");
return;
}
String paragraph = null;
for(; (paragraph = srcFileBr.readLine()) != null;){
//对读入的文本进行分词
byte[] spiltResult = ICTCLAS.ICTCLAS_ParagraphProcess(paragraph.getBytes("gb2312"), 2, 0);
String spiltResultStr = new String(spiltResult,0,spiltResult.length,"gb2312");
//得到分词后的词汇数组,以便后续比较
String[] resultArray = spiltResultStr.split(" ");
//过滤停用词
for(int i = 0; i<resultArray.length; i++){
if(stopWordSet.contains(resultArray[i])){
resultArray[i] = null;
}
}
//把过滤后的字符串数组存入到一个字符串中
StringBufferfinalStr = new StringBuffer();
for(int i = 0; i<resultArray.length; i++){
if(resultArray[i] != null){
finalStr = finalStr.append(resultArray[i]).append(" ");
}
}
} } }
//将过滤后的文本信息写入到指定文件中
destFileBw.write(finalStr.toString());
destFileBw.newLine();
//关闭输入流
destFileBw.close();
StopWordFileBr.close(); srcFileBr.close(); }
catch (FileNotFoundException e) {
e.printStackTrace();
} catch(Exception e){
e.printStackTrace();