package tfidf;

import java.io.*;
import java.util.*;

import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Computes term-frequency (TF), inverse-document-frequency (IDF) and TF-IDF
 * statistics for every text file under a directory tree.
 *
 * <p>Files are read as GBK-encoded text and segmented with IKAnalyzer
 * (Chinese word segmentation). Results are printed to stdout.
 *
 * <p>NOTE: {@link #idf} and {@link #tf_idf} read the shared {@link #FileList},
 * so {@link #readDirs} (via {@link #tfAllFiles}) must run first.
 */
public class ReadFiles {

    // Absolute paths of every regular file discovered by readDirs().
    // Accumulates across calls; also serves as the document list for idf()/tf_idf().
    private static List<String> FileList = new ArrayList<String>();

    /**
     * Recursively collects the absolute paths of all regular files under
     * {@code filepath} into {@link #FileList}.
     *
     * @param filepath root directory to scan
     * @return the shared {@link #FileList} (note: accumulates across calls)
     * @throws IOException declared for callers; FileNotFoundException is
     *                     currently caught and logged rather than propagated
     */
    public static List<String> readDirs(String filepath) throws FileNotFoundException, IOException {
        try {
            File file = new File(filepath);
            if (!file.isDirectory()) {
                // Original message was garbled ("输入的[]"); report the problem clearly.
                System.out.println("The input path is not a directory:");
                System.out.println("filepath:" + file.getAbsolutePath());
            } else {
                String[] flist = file.list();
                if (flist != null) { // list() returns null on I/O error
                    for (String name : flist) {
                        // Use the platform separator instead of a hard-coded "\\".
                        File newfile = new File(filepath + File.separator + name);
                        if (!newfile.isDirectory()) {
                            FileList.add(newfile.getAbsolutePath());
                        } else {
                            // Sub-directory: recurse.
                            readDirs(filepath + File.separator + name);
                        }
                    }
                }
            }
        } catch (FileNotFoundException e) {
            // Best-effort: log and continue with whatever was collected so far.
            System.out.println(e.getMessage());
        }
        return FileList;
    }

    /**
     * Reads an entire file as GBK-encoded text.
     *
     * @param file path of the file to read
     * @return the file contents with lines rejoined by CRLF
     * @throws IOException if the file cannot be opened or read
     */
    public static String readFile(String file) throws FileNotFoundException, IOException {
        StringBuilder sb = new StringBuilder();
        // try-with-resources: the original leaked the reader on every call.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), "gbk"))) {
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append("\r\n");
            }
        }
        return sb.toString();
    }

    /**
     * Segments the contents of {@code file} into a word list.
     *
     * @param file path of the GBK text file to segment
     * @return the list of segmented words
     * @throws IOException if the file cannot be read
     */
    public static ArrayList<String> cutWords(String file) throws IOException {
        String text = ReadFiles.readFile(file);
        IKAnalyzer analyzer = new IKAnalyzer();
        // NOTE(review): the official IKAnalyzer API exposes no split(String);
        // presumably this project ships a patched/wrapped jar — verify.
        return analyzer.split(text);
    }

    /**
     * Raw term frequency: number of occurrences of each word.
     *
     * @param cutwords segmented words of one document
     * @return map word -> occurrence count
     */
    public static HashMap<String, Integer> normalTF(ArrayList<String> cutwords) {
        HashMap<String, Integer> resTF = new HashMap<String, Integer>();
        for (String word : cutwords) {
            Integer count = resTF.get(word);
            resTF.put(word, count == null ? 1 : count + 1);
            System.out.println(word); // original echoed every word in both branches
        }
        return resTF;
    }

    /**
     * Normalized term frequency: occurrences divided by document length.
     *
     * @param cutwords segmented words of one document
     * @return map word -> relative frequency in [0, 1]
     */
    public static HashMap<String, Float> tf(ArrayList<String> cutwords) {
        HashMap<String, Float> resTF = new HashMap<String, Float>();
        int wordLen = cutwords.size();
        HashMap<String, Integer> intTF = ReadFiles.normalTF(cutwords);
        for (Map.Entry<String, Integer> entry : intTF.entrySet()) {
            // float division, as in the original (count widened to float).
            float freq = entry.getValue() / (float) wordLen;
            resTF.put(entry.getKey(), freq);
            System.out.println(entry.getKey() + " = " + freq);
        }
        return resTF;
    }

    /**
     * Raw TF for every file under {@code dirc}.
     *
     * @param dirc root directory of the corpus
     * @return map file path -> (word -> occurrence count)
     * @throws IOException if any file cannot be read
     */
    public static HashMap<String, HashMap<String, Integer>> normalTFAllFiles(String dirc) throws IOException {
        HashMap<String, HashMap<String, Integer>> allNormalTF =
                new HashMap<String, HashMap<String, Integer>>();
        List<String> filelist = ReadFiles.readDirs(dirc);
        for (String file : filelist) {
            ArrayList<String> cutwords = ReadFiles.cutWords(file); // segment one file
            allNormalTF.put(file, ReadFiles.normalTF(cutwords));
        }
        return allNormalTF;
    }

    /**
     * Normalized TF for every file under {@code dirc}.
     *
     * @param dirc root directory of the corpus
     * @return map file path -> (word -> relative frequency)
     * @throws IOException if any file cannot be read
     */
    public static HashMap<String, HashMap<String, Float>> tfAllFiles(String dirc) throws IOException {
        HashMap<String, HashMap<String, Float>> allTF =
                new HashMap<String, HashMap<String, Float>>();
        List<String> filelist = ReadFiles.readDirs(dirc);
        for (String file : filelist) {
            ArrayList<String> cutwords = ReadFiles.cutWords(file); // segment one file
            allTF.put(file, ReadFiles.tf(cutwords));
        }
        return allTF;
    }

    /**
     * Inverse document frequency: idf(word) = ln(docNum / documentFrequency).
     *
     * <p>Requires {@link #FileList} to be populated (run {@link #readDirs} first).
     *
     * @param all_tf map file path -> (word -> TF) for the whole corpus
     * @return map word -> IDF value
     */
    public static HashMap<String, Float> idf(HashMap<String, HashMap<String, Float>> all_tf) {
        HashMap<String, Float> resIdf = new HashMap<String, Float>();
        // word -> number of documents containing it
        HashMap<String, Integer> dict = new HashMap<String, Integer>();
        int docNum = FileList.size();
        for (int i = 0; i < docNum; i++) {
            HashMap<String, Float> temp = all_tf.get(FileList.get(i));
            for (String word : temp.keySet()) {
                Integer df = dict.get(word);
                dict.put(word, df == null ? 1 : df + 1);
            }
        }
        System.out.println("IDF for every word is:");
        for (Map.Entry<String, Integer> entry : dict.entrySet()) {
            // docNum / df is float division (df widened), matching the original.
            float value = (float) Math.log(docNum / entry.getValue().floatValue());
            resIdf.put(entry.getKey(), value);
            System.out.println(entry.getKey() + " = " + value);
        }
        return resIdf;
    }

    /**
     * Computes TF-IDF per file (tf * idf for every word) and prints it.
     *
     * <p>Requires {@link #FileList} to be populated (run {@link #readDirs} first).
     *
     * @param all_tf map file path -> (word -> TF)
     * @param idfs   map word -> IDF, as produced by {@link #idf}
     */
    public static void tf_idf(HashMap<String, HashMap<String, Float>> all_tf,
                              HashMap<String, Float> idfs) {
        HashMap<String, HashMap<String, Float>> resTfIdf =
                new HashMap<String, HashMap<String, Float>>();
        int docNum = FileList.size();
        for (int i = 0; i < docNum; i++) {
            String filepath = FileList.get(i);
            HashMap<String, Float> tfidf = new HashMap<String, Float>();
            HashMap<String, Float> temp = all_tf.get(filepath);
            for (Map.Entry<String, Float> entry : temp.entrySet()) {
                String word = entry.getKey();
                // NPE here would mean idfs was built from a different corpus.
                tfidf.put(word, entry.getValue() * idfs.get(word));
            }
            resTfIdf.put(filepath, tfidf);
        }
        System.out.println("TF-IDF for Every file is :");
        DisTfIdf(resTfIdf);
    }

    /**
     * Pretty-prints per-file TF-IDF maps as {@code FileName: {w = v, ...}}.
     *
     * @param tfidf map file path -> (word -> tf-idf value)
     */
    public static void DisTfIdf(HashMap<String, HashMap<String, Float>> tfidf) {
        for (Map.Entry<String, HashMap<String, Float>> fileEntry : tfidf.entrySet()) {
            System.out.println("FileName: " + fileEntry.getKey());
            System.out.print("{");
            for (Map.Entry<String, Float> entry : fileEntry.getValue().entrySet()) {
                System.out.print(entry.getKey() + " = " + entry.getValue() + ", ");
            }
            System.out.println("}");
        }
    }

    /**
     * Demo entry point: computes and prints TF, IDF and TF-IDF for every
     * file under a hard-coded corpus directory.
     */
    public static void main(String[] args) throws IOException {
        String file = "D:/testfiles";
        HashMap<String, HashMap<String, Float>> all_tf = tfAllFiles(file);
        System.out.println();
        HashMap<String, Float> idfs = idf(all_tf);
        System.out.println();
        tf_idf(all_tf, idfs);
    }
}