package com.yxcx.gettrain;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.SortedMap;
import java.util.TreeMap;
//创建训练集合与测试集合
public class CreateTrainAndTestSample {
// 根据包含非特征词的文档集生成只包含特征词的文档集到 trainSpecial 目
录下
ComputeWordsVector cwv = new ComputeWordsVector();
void trainSpecialWords() throws IOException {
File file2 = new
File("/data/classifydata/trainSpecial/" );
if(!file2.exists()){
file2.mkdir();
}
File file3 = new File("/data/classifydata/docVector/" );
if(!file3.exists()){
file3.mkdir();
}
// TODO Auto-generated method stub
String word;
String fileDir = "/data/classifydata/traindata/";
SortedMap<String, Double> wordMap = new TreeMap<String,
Double>();
wordMap = cwv.countWords(fileDir, wordMap);
// 把 wordMap 输出到文件
cwv.printWordMap(wordMap);
File[] sampleDir = new File(fileDir).listFiles();
for (int i = 0; i < sampleDir.length; i++) {
File[] sample = sampleDir[i].listFiles();
String targetDir =
"/data/classifydata/trainSpecial/"
+ sampleDir[i].getName();
File targetDirFile = new File(targetDir);
if (!targetDirFile.exists()) {
targetDirFile.mkdir();
}
for (int j = 0; j < sample.length; j++) {
String fileShortName =
sample[j].getName();
targetDir =
"/data/classifydata/trainSpecial/"
+ sampleDir[i].getName()
+ "/" + fileShortName;
FileWriter tgWriter = new
FileWriter(targetDir);
FileReader samReader = new
FileReader(sample[j]);
BufferedReader samBR = new
BufferedReader(samReader);
while ((word = samBR.readLine()) !=
null) {
if (wordMap.containsKey(word)) {
tgWriter.append(word +
"\n");
}
}
tgWriter.flush();
tgWriter.close();
}
}
}
public void NaiveBayesianClassifierMain(String[] args) throws
Exception {
CreateTrainAndTestSample ctt = new
CreateTrainAndTestSample();
ctt.trainSpecialWords();
}
public static void main(String[] args) throws Exception {
DataPreProcess DataPP = new DataPreProcess();
DataPP.BPPMain(args);
CreateTrainAndTestSample ctt = new
CreateTrainAndTestSample();
ctt.trainSpecialWords();
}
}
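// NOTE: the following is residue from the web page this code was copied from, not Java source.
// 333333333
// "Latest recommended article", published 2024-10-08 21:30:52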