准备
在 pom.xml 中引入中文分词器 HanLP 的 Maven 依赖(jar 包):
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.6.5</version>
</dependency>
需求
比较两个文本是否相似:先对文本分词并统计词频,再计算两个词频向量的余弦相似度;相似度达到 75% 及以上(即 ≥ 0.75)时,认为是相似文本。
代码实现
package com.isoft.test;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Demo: decides whether two Chinese texts are similar by segmenting them with
 * HanLP, building a term-frequency vector for each text, and comparing the
 * vectors by cosine similarity.
 *
 * @author th
 */
public class Test1 {

    /** Minimum cosine similarity at which two texts are reported as similar. */
    private static final double SIMILARITY_THRESHOLD = 0.75;

    /**
     * Compares one sample pair and prints the similarity and the verdict.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // First sample pair; pass these to cosineSimilarity(...) below to compare them instead.
        String text1 = "我爱北京天安门。";
        String text2 = "天安门上太阳升。";
        // Second sample pair; this is the pair that is actually compared.
        String text3 = "我买了一条裙子。";
        String text4 = "我买了一条漂亮的碎花裙子。";

        double similarity = cosineSimilarity(text3, text4);
        System.out.println("相似度:" + similarity);
        if (similarity >= SIMILARITY_THRESHOLD) {
            System.out.println("两个文本相似");
        } else {
            System.out.println("两个文本不相似");
        }
    }

    /**
     * Computes the cosine similarity of the term-frequency vectors of two texts.
     *
     * <p>Special cases: if both texts yield no terms the result is {@code 1};
     * if exactly one of them yields no terms the result is {@code 0}. A
     * {@code null} text is treated like an empty one.
     *
     * @param text1 first text, may be {@code null}
     * @param text2 second text, may be {@code null}
     * @return similarity in the range {@code [0, 1]}
     */
    public static double cosineSimilarity(String text1, String text2) {
        Map<String, Integer> vector1 = getWordCount(text1);
        Map<String, Integer> vector2 = getWordCount(text2);

        if (vector1.isEmpty() || vector2.isEmpty()) {
            // Two empty texts count as identical; an empty and a non-empty text share nothing.
            return vector1.isEmpty() && vector2.isEmpty() ? 1.0 : 0.0;
        }
        if (vector1.equals(vector2)) {
            // Identical vectors: return exactly 1 instead of a value that is off by
            // one rounding step because sqrt(n) * sqrt(n) is not always exactly n.
            return 1.0;
        }

        double dotProduct = 0.0;
        double norm1 = 0.0;
        for (Map.Entry<String, Integer> entry : vector1.entrySet()) {
            // Widen to double before multiplying so large counts cannot overflow int.
            double count = entry.getValue();
            Integer otherCount = vector2.get(entry.getKey());
            if (otherCount != null) {
                dotProduct += count * otherCount;
            }
            norm1 += count * count;
        }

        double norm2 = 0.0;
        for (int count : vector2.values()) {
            norm2 += (double) count * count;
        }

        double similarity = dotProduct / (Math.sqrt(norm1) * Math.sqrt(norm2));
        // Counts are non-negative, so the true value never exceeds 1; clamp away
        // floating-point overshoot (e.g. for proportional vectors).
        return Math.min(1.0, similarity);
    }

    /**
     * Segments {@code text} with {@link HanLP#segment(String)} and counts how
     * often each term occurs.
     *
     * <p>Only terms that are empty after {@link String#trim()} are dropped;
     * every other term returned by the segmenter is counted as-is.
     *
     * @param text text to segment; {@code null} or empty yields an empty map
     * @return a new mutable map from term to occurrence count, never {@code null}
     */
    public static Map<String, Integer> getWordCount(String text) {
        Map<String, Integer> wordCount = new HashMap<>();
        if (text == null || text.isEmpty()) {
            // Nothing to segment; also avoids handing null to the segmenter.
            return wordCount;
        }
        List<Term> terms = HanLP.segment(text);
        for (Term term : terms) {
            String word = term.word;
            if (word != null && !word.trim().isEmpty()) {
                wordCount.merge(word, 1, Integer::sum);
            }
        }
        return wordCount;
    }
}
结果
text1与text2输出结果(将 main 方法中的调用改为 cosineSimilarity(text1, text2) 后运行):
相似度:0.3999999999999999
两个文本不相似
text3与text4输出结果:
相似度:0.7977240352174656
两个文本相似