// 代码模板:
// jar包下载:https://download.csdn.net/download/dreamzuora/10853842
/**
*
*/
package TFIDF;
import java.util.Arrays;
import java.util.List;
/**
 * Computes TF-IDF scores, i.e. how important a term is to one document
 * within a document collection (corpus).
 *
 * <p>Documents are represented as lists of already-tokenized words. All term
 * comparisons are case-insensitive.
 *
 * @author weijie
 * 2018-12-15
 */
public class TfidfUtils {

    /**
     * Term frequency (TF): occurrences of {@code term} in {@code doc} divided
     * by the total number of words in {@code doc}.
     *
     * @param doc  the tokenized document
     * @param term the term to count (compared case-insensitively)
     * @return the term frequency in the range [0, 1]; {@code 0.0} for an empty
     *         document (instead of the NaN that 0/0 would produce)
     */
    public double tf(List<String> doc, String term) {
        // Guard against 0.0 / 0, which would yield NaN and poison tfIdf().
        if (doc.isEmpty()) {
            return 0.0;
        }
        int occurrences = 0;
        for (String word : doc) {
            if (word.equalsIgnoreCase(term)) {
                occurrences++;
            }
        }
        return (double) occurrences / doc.size();
    }

    /**
     * Document frequency (DF): the number of documents in {@code docs} that
     * contain {@code term} at least once.
     *
     * @param docs the corpus, one tokenized word list per document
     * @param term the term to look for (compared case-insensitively)
     * @return the number of documents containing the term; {@code 0} (after
     *         printing a message to standard output) if the term is
     *         {@code null} or empty
     */
    public int df(List<List<String>> docs, String term) {
        // isEmpty() checks content; the former `term != ""` only compared references.
        if (term == null || term.isEmpty()) {
            System.out.println("term can not null or hava not content!");
            return 0;
        }
        int matchingDocs = 0;
        for (List<String> doc : docs) {
            for (String word : doc) {
                if (term.equalsIgnoreCase(word)) {
                    matchingDocs++;
                    // One hit is enough: each document is counted at most once.
                    break;
                }
            }
        }
        return matchingDocs;
    }

    /**
     * Inverse document frequency (IDF) = log(N / (df + 1)), where N is the
     * number of documents in the corpus and df is the document frequency of
     * the term.
     *
     * <p>The {@code + 1} belongs to the denominator so that a term appearing
     * in no document does not cause a division by zero. As a consequence the
     * result is negative when the term occurs in every document.
     *
     * @param docs the corpus, one tokenized word list per document
     * @param term the term to score
     * @return the natural-log IDF value; {@code 0.0} for an empty corpus
     */
    public double idf(List<List<String>> docs, String term) {
        // log(0) would be -Infinity; an empty corpus carries no information.
        if (docs.isEmpty()) {
            return 0.0;
        }
        return Math.log(docs.size() / (double) (df(docs, term) + 1));
    }

    /**
     * TF-IDF = term frequency (tf) * inverse document frequency (idf).
     *
     * @param doc  the document the term is scored against
     * @param docs the corpus used to compute the IDF
     * @param term the term to score
     * @return the TF-IDF weight of {@code term} in {@code doc}
     */
    public double tfIdf(List<String> doc, List<List<String>> docs, String term) {
        return tf(doc, term) * idf(docs, term);
    }

    /**
     * Small demo: prints the TF, DF and TF-IDF of one term over a four-document corpus.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<String> doc1 = Arrays.asList("人工", "智能", "成为", "互联网", "大会", "焦点");
        List<String> doc2 = Arrays.asList("谷歌", "推出", "开源", "人工", "智能", "系统", "工具");
        List<String> doc3 = Arrays.asList("互联网", "的", "未来", "在", "人工", "智能");
        List<String> doc4 = Arrays.asList("谷歌", "开源", "机器", "学习", "工具");
        List<List<String>> documents = Arrays.asList(doc1, doc2, doc3, doc4);
        TfidfUtils calculator = new TfidfUtils();
        System.out.println(calculator.tf(doc2, "谷歌"));
        System.out.println(calculator.df(documents, "谷歌"));
        double tfidf = calculator.tfIdf(doc2, documents, "谷歌");
        System.out.println("TF-IDF (谷歌) = " + tfidf);
    }
}
// 优秀博客:https://www.cnblogs.com/ywl925/archive/2013/08/26/3275878.html