package com.yxcx.gettrain;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;
public class DataPreProcess {
public void doProcess(String strDir) throws IOException {
// strDir 要进行分词的网页的路径
System.out.println(strDir);
File fileDir = new File(strDir);
if (!fileDir.exists()) {
System.out.println("File not exist:" + strDir);
return;
}
File[] srcFiles = fileDir.listFiles();
String[] stemFileNames = new String[srcFiles.length];
for (int i = 0; i < srcFiles.length; i++) {
String fileFullName =
srcFiles[i].getCanonicalPath();
// 文件名称
String fileShortName = srcFiles[i].getName();
if (!new File(fileFullName).isDirectory()) {
System.out.println("Begin preprocess:" +
fileFullName);
StringBuilder stringBuilder = new
StringBuilder();
// 路径/名称
stringBuilder.append(strDir + "/" +
fileShortName);
createProcessFile(fileFullName,
stringBuilder.toString());
stemFileNames[i] =
stringBuilder.toString();
} else {
doProcess(fileFullName);
}
}
}
public static void createProcessFile(String fileFullName, String
toFile) {
BufferedWriter bw = null;
BufferedReader br = null;
FileWriter fileWriter = null;
FileReader fileReader = null;
String line, resLine = null;
try {
StringBuilder sb = new StringBuilder();
fileReader = new FileReader(fileFullName);
br = new BufferedReader(fileReader);
// readLine 方法:一行一行的读取文件
// 去掉 a 标签和 html 标签
while ((line = br.readLine()) != null) {
String conts = "";
line = line.replaceAll("<a(.*?)</a>.",
"")
.replaceAll("</?[^>]+>",
"")
.replaceAll("[^\\u4e00-\\u9fa5]", "");
conts += line;
// 开始进行中文分词,用/n 分割
if (!conts.isEmpty()) {
IKAnalyzer analyzer = new IKAnalyzer(true);
// 通过分析器 Analyzer 将一个字符串创建成 Taken 流,第一个参数是一个名
字没有实际作用
TokenStream tokenStream = analyzer.tokenStream("content",
new StringReader(conts));
while (tokenStream.incrementToken()) {
// 保存相应词汇
CharTermAttribute charTermAttribute = tokenStream
.getAttribute(CharTermAttribute.class);
sb.append(charTermAttribute.toString());
sb.append("\n");
}
resLine = sb.toString();
analyzer.close();
}
}
fileWriter = new FileWriter(toFile);
bw = new BufferedWriter(fileWriter);
bw.write(resLine);
} catch (Exception e) {
e.printStackTrace();
} finally {
try {
bw.close();
br.close();
fileWriter.close();
fileReader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
/**
* @param args
* @throws IOException
*/
public void BPPMain(String[] args) throws IOException {
// TODO Auto-generated method stub
DataPreProcess dataPrePro = new DataPreProcess();
dataPrePro.doProcess("/data/classifydata/traindata/");
}
}
// --- Scrape artifact (CSDN blog footer), not part of the source code: ---
// "Latest related article published 2024-10-10 09:24:58.
//  This code implements a class named DataPreProcess that pre-processes the
//  files in a given directory: it strips HTML tags, performs Chinese word
//  segmentation with the IKAnalyzer tokenizer, and saves the processed
//  result to new files. (Abstract auto-generated by CSDN.)"