用Java实现特征提取计算tf-idf
(1)计算反文档频次(IDF),公式如下: $\mathrm{idf} = \log_2 \dfrac{D}{\mathrm{df} + 1}$,其中 $D$ 为文档总数,$\mathrm{df}$ 为包含该词的文档数。
(2)计算TF-IDF公式如下:
$\text{tf-idf} = \mathrm{tf} \times \mathrm{idf}$
(3)Java代码实现
package com.panguoyuan.datamining.first;
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * Computes tf-idf for every (article, word) pair in a whitespace-tokenized
 * corpus (one article per line) and prints the results sorted by score.
 *
 * tf-idf = tf * log2(D / (df + 1)), where D is the number of articles and
 * df the number of articles containing the word.
 */
public class TestTFIDF {

	/** One map per article: word -> raw term frequency within that article. */
	public static List<Map> articleList = new ArrayList<Map>();
	/** Flattened (article, word, tf-idf) results; sorted descending before printing. */
	public static List<Result> listResult = new ArrayList<Result>();

	public static void main(String[] args) throws Exception {
		BufferedReader fr = new BufferedReader(
				new FileReader("F:\\workspace1\\datamining\\data\\test_sohu_news_data"));
		try {
			initD(fr);
		} finally {
			fr.close(); // the original leaked the reader
		}
		calculateTF_IDF();
		print();
	}

	/** Sorts by tf-idf (descending, per Result#compareTo) and prints every entry. */
	private static void print() {
		Collections.sort(listResult);
		for (Result r : listResult) {
			System.out.println("第【" + r.getArticleId() + "】篇文章 :" + r.getWord() + " " + r.getTFIDF());
		}
	}

	/**
	 * Scores every (article, word) pair and appends a Result for each.
	 *
	 * Fixes two defects in the original: d / (df + 1) was integer division
	 * (truncating the ratio before the log), and entries were recovered by
	 * parsing Map.Entry.toString() on "=", which breaks for any word that
	 * itself contains '='.
	 */
	public static void calculateTF_IDF() throws Exception {
		int d = articleList.size();
		for (int i = 0; i < articleList.size(); i++) {
			Map<String, Integer> wordMap = articleList.get(i);
			for (Map.Entry<String, Integer> entry : wordMap.entrySet()) {
				String word = entry.getKey();
				int tf = entry.getValue();
				// +1 in the denominator avoids division by zero / log of infinity.
				double idf = log((double) d / (getDF(word) + 1), 2);
				listResult.add(new Result(i, word, tf * idf));
			}
		}
	}

	/**
	 * Reads one article per line and stores its word-count map.
	 *
	 * The original tested br.read() in the loop condition, which consumed —
	 * and silently dropped — the first character of every line before
	 * readLine() saw it, corrupting the first token of each article.
	 *
	 * @param br reader over the corpus; the caller owns and closes it
	 */
	public static void initD(BufferedReader br) throws Exception {
		String line;
		while ((line = br.readLine()) != null) {
			articleList.add(getWordCountMap(line));
		}
	}

	/** Document frequency: the number of articles whose map contains the word. */
	public static int getDF(String word) {
		int count = 0;
		for (int i = 0; i < articleList.size(); i++) {
			Map<String, Integer> map = articleList.get(i);
			if (map.containsKey(word)) {
				count++;
			}
		}
		return count;
	}

	/**
	 * Splits an article on single spaces and counts each token.
	 *
	 * The original compared an Integer against "" — a type-confused check
	 * that always behaved as a plain null test; this makes it explicit.
	 *
	 * @param article one line of the corpus
	 * @return word -> occurrence count within the article
	 */
	public static Map<String, Integer> getWordCountMap(String article) throws Exception {
		Map<String, Integer> wordMap = new HashMap<String, Integer>();
		for (String word : article.split(" ")) {
			Integer prev = wordMap.get(word);
			wordMap.put(word, prev == null ? 1 : prev + 1);
		}
		return wordMap;
	}

	/** Logarithm of {@code value} in the given base (change-of-base via natural log). */
	public static double log(double value, double base) {
		return Math.log(value) / Math.log(base);
	}
}
package com.panguoyuan.datamining.first;
public class Result implements Comparable {
private int articleId;
private String word;
private double TFIDF;
public Result(int articleId, String word, double tFIDF) {
super();
this.articleId = articleId;
this.word = word;
TFIDF = tFIDF;
}
public int getArticleId() {
return articleId;
}
public void setArticleId(int articleId) {
this.articleId = articleId;
}
public String getWord() {
return word;
}
public void setWord(String word) {
this.word = word;
}
public double getTFIDF() {
return TFIDF;
}
public void setTFIDF(double tFIDF) {
TFIDF = tFIDF;
}
@Override
public int compareTo(Object o) {
Result r = (Result)o;
if(r.getTFIDF() > this.TFIDF){
return 1;
}else if(r.getTFIDF()<this.TFIDF){
return -1;
}else{
return 0;
}
}
}
(4)排序后输出结果