1、倒排索引模型
2、布尔检索类型
3、TF-IDF权重计算
下面是 TF-IDF 的 Java 代码实现。
/**
 * Computes TF-IDF weights for terms over tokenized documents.
 *
 * <p>A document is a list of tokens and a corpus is a list of documents. All term
 * matching is case-insensitive.
 */
public class TFIDF {

    /**
     * Returns the term frequency of {@code term} in {@code doc}: the number of tokens
     * equal to {@code term} (ignoring case) divided by the total number of tokens.
     *
     * @param doc  tokenized document
     * @param term term to count
     * @return the relative frequency in {@code [0, 1]}; {@code 0.0} when {@code doc} is
     *         {@code null} or empty, or when {@code term} is {@code null}
     */
    public double tf(List<String> doc, String term) {
        // An empty document would otherwise produce 0.0 / 0 == NaN.
        if (doc == null || doc.isEmpty() || term == null) {
            return 0.0;
        }
        int occurrences = 0;
        for (String word : doc) {
            // Invoke on term (known non-null) so that a null token is simply a non-match.
            if (term.equalsIgnoreCase(word)) {
                occurrences++;
            }
        }
        return (double) occurrences / doc.size();
    }

    /**
     * Returns the document frequency of {@code term}: the number of documents in
     * {@code docs} that contain it at least once (ignoring case).
     *
     * <p>A {@code null} or empty term is reported on standard output and counted as
     * appearing in no document.
     *
     * @param docs corpus of tokenized documents
     * @param term term to look for
     * @return the number of documents containing {@code term}, as a double
     */
    public double df(List<List<String>> docs, String term) {
        // Compare by content: ==/!= on strings compares references, so an empty
        // string created at runtime would slip past a `term != ""` check.
        if (term == null || term.isEmpty()) {
            System.out.println("term不能为null或空串");
            return 0;
        }
        int matchingDocs = 0;
        for (List<String> doc : docs) {
            for (String word : doc) {
                if (term.equalsIgnoreCase(word)) {
                    matchingDocs++;
                    break; // one hit is enough; count each document at most once
                }
            }
        }
        return matchingDocs;
    }

    /**
     * Returns the inverse document frequency of {@code term}, computed as
     * {@code ln(N / df + 1)} where {@code N} is the corpus size and {@code df} the
     * document frequency.
     *
     * @param docs corpus of tokenized documents
     * @param term term to weigh
     * @return the IDF weight; {@code 0.0} when no document contains {@code term}
     */
    public double idf(List<List<String>> docs, String term) {
        double documentFrequency = df(docs, term);
        // Without this guard N / 0 is Infinity, and tf * idf becomes 0 * Infinity == NaN.
        if (documentFrequency == 0) {
            return 0.0;
        }
        return Math.log(docs.size() / documentFrequency + 1);
    }

    /**
     * Returns the TF-IDF weight of {@code term} for {@code doc} within {@code docs}.
     *
     * @param doc  tokenized document the weight is computed for
     * @param docs corpus of tokenized documents
     * @param term term to weigh
     * @return {@code tf(doc, term) * idf(docs, term)}
     */
    public double tfidf(List<String> doc, List<List<String>> docs, String term) {
        return tf(doc, term) * idf(docs, term);
    }

    /** Demo: prints TF, DF and TF-IDF of one term over a four-document corpus. */
    public static void main(String[] args) {
        List<String> doc1 = Arrays.asList("人工", "智能", "成为", "互联网", "大会", "焦点");
        List<String> doc2 = Arrays.asList("谷歌", "推出", "开源", "人工", "智能", "系统", "工具");
        List<String> doc3 = Arrays.asList("互联网", "的", "未来", "在", "人工", "智能");
        List<String> doc4 = Arrays.asList("谷歌", "开源", "机器", "学习", "工具");
        List<List<String>> documents = Arrays.asList(doc1, doc2, doc3, doc4);
        TFIDF calculator = new TFIDF();
        System.out.println(calculator.tf(doc2, "谷歌"));
        System.out.println(calculator.df(documents, "谷歌"));
        double tfidf = calculator.tfidf(doc2, documents, "谷歌");
        System.out.println("TF-IDF(谷歌) = " + tfidf);
    }
}
结果:
0.14285714285714285
2.0
TF-IDF(谷歌) = 0.15694461266687282
4、向量空间模型
下面是余弦相似度的 Java 代码实现。
/**
 * Vector space model helper: cosine similarity between sparse term-weight vectors.
 */
public class Vsm {

    /**
     * Computes the cosine similarity of two sparse vectors given as term-to-weight maps.
     *
     * <p>Terms missing from a map are treated as weight 0, so only terms present in
     * both maps contribute to the dot product. The shared terms, the dot product, the
     * two squared norms and the result are printed to standard output.
     *
     * @param v1 first vector (term to weight)
     * @param v2 second vector (term to weight)
     * @return the cosine of the angle between the vectors; {@code 0.0} when either
     *         vector is empty or has zero magnitude
     */
    public static double calCosSim(Map<String, Double> v1, Map<String, Double> v2) {
        // Terms that appear in both vectors.
        Set<String> both = new HashSet<>();
        both.addAll(v1.keySet());
        both.retainAll(v2.keySet());
        System.out.println(both);

        double dotProduct = 0.0;
        for (String term : both) {
            dotProduct += v1.get(term) * v2.get(term);
        }

        // norm1 / norm2 hold the squared Euclidean norms.
        double norm1 = 0.0;
        for (double weight : v1.values()) {
            norm1 += weight * weight;
        }
        double norm2 = 0.0;
        for (double weight : v2.values()) {
            norm2 += weight * weight;
        }

        // A zero-magnitude vector would give 0 / 0 == NaN; report "no similarity" instead.
        double denominator = Math.sqrt(norm1 * norm2);
        double similarity = denominator == 0.0 ? 0.0 : dotProduct / denominator;

        System.out.println("sclar: " + dotProduct);
        System.out.println("norm1: " + norm1);
        System.out.println("norm2: " + norm2);
        System.out.println("similarity : " + similarity);
        return similarity;
    }

    /** Demo: prints the cosine similarity of two small sample vectors. */
    public static void main(String[] args) {
        Map<String, Double> m1 = new HashMap<>();
        m1.put("Hello", 1.0);
        m1.put("css", 2.0);
        m1.put("Lucene", 3.0);
        Map<String, Double> m2 = new HashMap<>();
        m2.put("Hello", 1.0);
        m2.put("Word", 2.0);
        m2.put("Hadoop", 3.0);
        m2.put("java", 4.0);
        m2.put("html", 1.0);
        m2.put("css", 2.0);
        calCosSim(m1, m2);
    }
}
结果:
[css, Hello]
sclar: 5.0
norm1: 14.0
norm2: 35.0
similarity : 0.22587697572631282