中文相关度计算
最近需要研究 ES 相关性打分,query 返回的 score 或 function_score 无法满足客户需求,故借助 GPT 找了一个相关性计算的方法(分词后计算词频向量的余弦相似度),可在查询结果返回之后再对结果进行处理。感觉还不错,分享记录一下。
引入maven依赖
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.8.2</version>
</dependency>
详细代码
/**
 * Demo entry point: prints the cosine similarity of two sample Chinese
 * strings as a percentage with two decimal places.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    double similarity = calculateCosineSimilarity("中国武汉", "我爱中国武汉");
    String percentage = String.format("%.2f%%", similarity * 100);
    System.out.println(percentage);
}
/**
 * Computes the cosine similarity between two strings, based on the term
 * frequency vectors produced by {@code getTermFrequencyVector}.
 *
 * <p>Term counts are never negative, so the mathematical result lies in
 * {@code [0, 1]}. Floating-point rounding of the square roots can push the
 * raw quotient marginally above 1 (e.g. three distinct terms compared with
 * themselves), so the result is clamped to 1.0.
 *
 * @param str1 the first text
 * @param str2 the second text
 * @return the similarity in {@code [0, 1]}; 0.0 when either text yields no terms
 */
public static double calculateCosineSimilarity(String str1, String str2) {
    Map<String, Integer> vector1 = getTermFrequencyVector(str1);
    Map<String, Integer> vector2 = getTermFrequencyVector(str2);
    double magnitude1 = calculateMagnitude(vector1);
    double magnitude2 = calculateMagnitude(vector2);
    if (magnitude1 == 0 || magnitude2 == 0) {
        // Avoid division by zero: an empty vector is similar to nothing.
        return 0.0;
    }
    double cosine = calculateDotProduct(vector1, vector2) / (magnitude1 * magnitude2);
    // Clamp rounding overshoot such as 1.0000000000000002.
    return Math.min(1.0, cosine);
}
/**
 * Segments the text with {@code BasicTokenizer} and counts how many times
 * each segmented word occurs.
 *
 * @param str the text to segment
 * @return a map from word to its number of occurrences in {@code str}
 */
private static Map<String, Integer> getTermFrequencyVector(String str) {
    Map<String, Integer> frequencies = new HashMap<>();
    List<Term> segmented = BasicTokenizer.segment(str);
    for (Term token : segmented) {
        // First occurrence stores 1; later occurrences add 1 to the count.
        frequencies.merge(token.word, 1, Integer::sum);
    }
    return frequencies;
}
/**
 * Computes the dot product of two sparse term frequency vectors.
 *
 * <p>Only terms present in both maps contribute. The product of each pair
 * of counts is computed in {@code double} so that large counts cannot
 * overflow {@code int} arithmetic.
 *
 * @param vector1 term counts of the first text
 * @param vector2 term counts of the second text
 * @return the sum over shared terms of the product of their counts
 */
private static double calculateDotProduct(Map<String, Integer> vector1, Map<String, Integer> vector2) {
    double dotProduct = 0;
    for (Map.Entry<String, Integer> entry : vector1.entrySet()) {
        int count1 = entry.getValue();
        int count2 = vector2.getOrDefault(entry.getKey(), 0);
        // Widen before multiplying: int * int would wrap around silently.
        dotProduct += (double) count1 * count2;
    }
    return dotProduct;
}
/**
 * Computes the Euclidean length of a term frequency vector: the square
 * root of the sum of the squared counts.
 *
 * @param vector term counts
 * @return the vector's magnitude; 0.0 for an empty map
 */
private static double calculateMagnitude(Map<String, Integer> vector) {
    // Sequential left fold starting at 0.0, adding each squared count in turn.
    double sumOfSquares = vector.values().stream()
            .mapToDouble(count -> Math.pow(count, 2))
            .reduce(0.0, Double::sum);
    return Math.sqrt(sumOfSquares);
}