TFIDF算法,java代码实现
TF-IDF算法全称为term frequency–inverse document frequency。TF就是term frequency的缩写,意为词频。IDF则是inverse document frequency的缩写,意为逆文档频率。
package com.aawant.nlp.featureExtraction.tfidf;
import java.io.BufferedReader;
import java.io.File;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
public class TFIDF {
/**
*
* @param p_path1 文章路径,文件格式
* @param p_path2 文档路径
* @return 词,TFIDF值
* @throws Exception
*/
public static Map<?, ?> getTFIDT(String p_path1, String p_path2)
throws Exception {
File file1 = new File(p_path1);
File file2 = new File(p_path2);
Map<String, Double> term_freq_map = new HashMap<String, Double>();// 词,该词在该文档中出现的频率
Map<String, Double> doc_freq_map = new HashMap<String, Double>(); // 词,文档频率
int doc_numbers_total = 0; // 总的文档数目
Double doc_numbers_term = 0.0;// 包含该词条的文档数目
//词频
BufferedReader br_doc = null;
br_doc = StringUtils.getBr(file1, "UTF-8");
String line = "";
while ((line = br_doc.readLine()) != null) {
String[] temp_words = line.split(" ");
for (String w : temp_words) {
if (term_freq_map.containsKey(w)) {
Double value = term_freq_map.get(w) + 1.0;
term_freq_map.put(w, value);
} else {
term_freq_map.put(w, 1.0);
}
}
}
br_doc.close();
// 文档频率
Set<String> set = term_freq_map.keySet();
Iterator<String> iter = set.iterator();
File[] files = file2.listFiles();
doc_numbers_total = files.length;
BufferedReader br_file = null;
String temp2 = "";
while (iter.hasNext()) {
temp2 = (String) iter.next();
String temp = "";
for (File f : files) {
StringBuilder sb = new StringBuilder();
br_file = StringUtils.getBr(f, "utf-8");
while ((temp = br_file.readLine()) != null) {
String words = temp;
sb.append(words);
}
String words_adoc = sb.toString();
if (words_adoc.contains(temp2)) {
doc_numbers_term += 1.0;
doc_freq_map.put(temp2, doc_numbers_term);
}
br_file.close();
}
doc_numbers_term = 0.0;
}
/*
* 计算TFIDF值
* w = 词频*log(总的文档数目/文档频率)
*/
Map<String, Double> re_map = new HashMap<String, Double>();
String term = "";
Set<String> set_t = term_freq_map.keySet();
Iterator<String> iter_t = set_t.iterator();
while(iter_t.hasNext()){
term = (String) iter_t.next();
Double A = term_freq_map.get(term);
Double B = doc_freq_map.get(term);
Double result = A*Math.log(doc_numbers_total/B)/Math.log(2.0);
re_map.put(term, result);
}
return re_map;
}
}