Understanding TF-IDF and Its Java Implementation
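Before the code, the quantities it computes. For a word $w$ in a document $d$, drawn from a corpus of $N$ documents, the implementation below uses the standard unsmoothed definitions:

\[
\mathrm{tf}(w, d) = \frac{n_{w,d}}{\sum_{w'} n_{w',d}}, \qquad
\mathrm{idf}(w) = \log \frac{N}{\mathrm{df}(w)}, \qquad
\text{tf-idf}(w, d) = \mathrm{tf}(w, d) \cdot \mathrm{idf}(w)
\]

where $n_{w,d}$ is the number of occurrences of $w$ in $d$ and $\mathrm{df}(w)$ is the number of documents containing $w$. Because the IDF is unsmoothed, a word that appears in every document gets $\log 1 = 0$ and therefore a TF-IDF of zero everywhere.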

package tfidf;

import java.io.*;
import java.util.*;

import org.wltea.analyzer.lucene.IKAnalyzer;

public class ReadFiles {

    // Absolute paths of every file found by readDirs. Note that the field is
    // static and never cleared, so repeated calls keep accumulating paths.
    private static ArrayList<String> FileList = new ArrayList<String>();

    // Collect the files under a directory, including its sub-directories.

    public static List<String> readDirs(String filepath) throws FileNotFoundException, IOException {
        try {
            File file = new File(filepath);
            if (!file.isDirectory()) {
                System.out.println("The given path is not a directory.");
                System.out.println("filepath: " + file.getAbsolutePath());
            } else {
                String[] flist = file.list();
                for (int i = 0; i < flist.length; i++) {
                    File newfile = new File(filepath + File.separator + flist[i]);
                    if (!newfile.isDirectory()) {
                        FileList.add(newfile.getAbsolutePath());
                    } else {
                        // The entry is a directory: recurse into it.
                        readDirs(filepath + File.separator + flist[i]);
                    }
                }
            }
        } catch (FileNotFoundException e) {
            System.out.println(e.getMessage());
        }
        return FileList;
    }

    // Read a whole file into one String; the corpus is assumed to be GBK-encoded.
    public static String readFile(String file) throws FileNotFoundException, IOException {
        StringBuilder strSb = new StringBuilder(); // String is immutable, so accumulate in a builder
        // InputStreamReader decodes the byte stream into characters.
        BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), "gbk"));
        String line = br.readLine();
        while (line != null) {
            strSb.append(line).append("\r\n");
            line = br.readLine();
        }
        br.close();
        return strSb.toString();
    }

    // Word segmentation with IKAnalyzer.
    public static ArrayList<String> cutWords(String file) throws IOException {
        String text = ReadFiles.readFile(file);
        IKAnalyzer analyzer = new IKAnalyzer();
        return analyzer.split(text);
    }
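    // Caveat: split(String) is not part of the stock IK Analyzer distribution;
    // it appears to be a helper from the original author's local build. With the
    // standard org.wltea.analyzer API, segmentation usually goes through
    // IKSegmenter instead -- see the sketch after this listing.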

    // Raw term frequency: the number of occurrences of each word in one file.
    public static HashMap<String, Integer> normalTF(ArrayList<String> cutwords) {
        HashMap<String, Integer> resTF = new HashMap<String, Integer>();
        for (String word : cutwords) {
            if (resTF.get(word) == null) {
                resTF.put(word, 1);
            } else {
                resTF.put(word, resTF.get(word) + 1);
            }
            System.out.println(word);
        }
        return resTF;
    }

    // Normalized term frequency: occurrences of each word divided by the
    // total number of words in the file.
    public static HashMap<String, Float> tf(ArrayList<String> cutwords) {
        HashMap<String, Float> resTF = new HashMap<String, Float>();
        int wordLen = cutwords.size();
        HashMap<String, Integer> intTF = ReadFiles.normalTF(cutwords);
        for (Map.Entry<String, Integer> entry : intTF.entrySet()) {
            float freq = entry.getValue() / (float) wordLen;
            resTF.put(entry.getKey(), freq);
            System.out.println(entry.getKey() + " = " + freq);
        }
        return resTF;
    }
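    // For example: a word that occurs 3 times in a 100-word document
    // gets tf = 3 / 100 = 0.03.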

    // Raw term frequencies for every file under a directory.
    public static HashMap<String, HashMap<String, Integer>> normalTFAllFiles(String dirc) throws IOException {
        HashMap<String, HashMap<String, Integer>> allNormalTF =
                new HashMap<String, HashMap<String, Integer>>();
        List<String> filelist = ReadFiles.readDirs(dirc);
        for (String file : filelist) {
            ArrayList<String> cutwords = ReadFiles.cutWords(file); // segment one file
            allNormalTF.put(file, ReadFiles.normalTF(cutwords));
        }
        return allNormalTF;
    }

    // Normalized term frequencies for every file under a directory.
    public static HashMap<String, HashMap<String, Float>> tfAllFiles(String dirc) throws IOException {
        HashMap<String, HashMap<String, Float>> allTF =
                new HashMap<String, HashMap<String, Float>>();
        List<String> filelist = ReadFiles.readDirs(dirc);
        for (String file : filelist) {
            ArrayList<String> cutwords = ReadFiles.cutWords(file); // segment one file
            allTF.put(file, ReadFiles.tf(cutwords));
        }
        return allTF;
    }
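    // Note: the IDF below is unsmoothed, idf = log(N / df), matching the
    // formulas above. A common variant that avoids zero weights for words
    // present in every document is log(N / (1 + df)) or log(1 + N / df).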

    // Inverse document frequency: idf(w) = log(N / df(w)), where N is the
    // number of documents and df(w) the number of documents containing w.
    public static HashMap<String, Float> idf(HashMap<String, HashMap<String, Float>> all_tf) {
        HashMap<String, Float> resIdf = new HashMap<String, Float>();
        HashMap<String, Integer> dict = new HashMap<String, Integer>(); // document frequency of each word
        int docNum = FileList.size();
        for (int i = 0; i < docNum; i++) {
            HashMap<String, Float> temp = all_tf.get(FileList.get(i));
            for (String word : temp.keySet()) {
                if (dict.get(word) == null) {
                    dict.put(word, 1);
                } else {
                    dict.put(word, dict.get(word) + 1);
                }
            }
        }
        System.out.println("IDF for every word is:");
        for (Map.Entry<String, Integer> entry : dict.entrySet()) {
            float value = (float) Math.log(docNum / (float) entry.getValue());
            resIdf.put(entry.getKey(), value);
            System.out.println(entry.getKey() + " = " + value);
        }
        return resIdf;
    }

    // TF-IDF for every word of every file: tf-idf(w, d) = tf(w, d) * idf(w).
    public static void tf_idf(HashMap<String, HashMap<String, Float>> all_tf,
                              HashMap<String, Float> idfs) {
        HashMap<String, HashMap<String, Float>> resTfIdf =
                new HashMap<String, HashMap<String, Float>>();
        int docNum = FileList.size();
        for (int i = 0; i < docNum; i++) {
            String filepath = FileList.get(i);
            HashMap<String, Float> tfidf = new HashMap<String, Float>();
            HashMap<String, Float> temp = all_tf.get(filepath);
            for (Map.Entry<String, Float> entry : temp.entrySet()) {
                String word = entry.getKey();
                tfidf.put(word, entry.getValue() * idfs.get(word));
            }
            resTfIdf.put(filepath, tfidf);
        }
        System.out.println("TF-IDF for every file is:");
        DisTfIdf(resTfIdf);
    }

    // Print the TF-IDF map of every file.
    public static void DisTfIdf(HashMap<String, HashMap<String, Float>> tfidf) {
        for (Map.Entry<String, HashMap<String, Float>> entrys : tfidf.entrySet()) {
            System.out.println("FileName: " + entrys.getKey());
            System.out.print("{");
            for (Map.Entry<String, Float> entry : entrys.getValue().entrySet()) {
                System.out.print(entry.getKey() + " = " + entry.getValue() + ", ");
            }
            System.out.println("}");
        }
    }

    public static void main(String[] args) throws IOException {
        String file = "D:/testfiles"; // directory holding the corpus
        HashMap<String, HashMap<String, Float>> all_tf = tfAllFiles(file);
        System.out.println();
        HashMap<String, Float> idfs = idf(all_tf);
        System.out.println();
        tf_idf(all_tf, idfs);
    }
}
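If your copy of IK Analyzer does not provide the split(String) helper used in cutWords above, here is a minimal sketch of the same segmentation step using IKSegmenter, the entry point the standard IK Analyzer 2012 distribution exposes. The class and method names below are from that distribution; treat them as an assumption to verify against the jar you actually have:

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IKCutWords {

    // Segment a piece of text into words with IKSegmenter. The second
    // constructor argument enables "smart" (coarse-grained) segmentation.
    public static ArrayList<String> cutWords(String text) throws IOException {
        ArrayList<String> words = new ArrayList<String>();
        IKSegmenter seg = new IKSegmenter(new StringReader(text), true);
        Lexeme lex = seg.next(); // next() returns null once the text is exhausted
        while (lex != null) {
            words.add(lex.getLexemeText());
            lex = seg.next();
        }
        return words;
    }
}

With this in place, ReadFiles.cutWords(file) could simply return IKCutWords.cutWords(ReadFiles.readFile(file)).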
