# 文本或向量相似度比较
# 余弦相似度、欧氏距离、曼哈顿距离、杰卡德相似系数
# 余弦相似度最常用;欧氏距离、曼哈顿距离、杰卡德相似系数不常用。
# API_KEY 和 SECRET_KEY 从百度的 ErnieBot 获取。
# Vector Similarity
# Cao Jinhao
# 20240729
import requests
import json
import numpy as np
import jieba
# Credentials sent as client_id / client_secret when requesting the Baidu
# OAuth access token. The values below are placeholders and must be replaced
# with real credentials before the script can run; avoid committing real ones.
API_KEY = 'xxxxxx'
SECRET_KEY = 'xxxxx'
def get_access_token(timeout=30):
    """Request an OAuth access token from Baidu using API_KEY / SECRET_KEY.

    Args:
        timeout: Seconds to wait for the token endpoint before giving up.

    Returns:
        str: The ``access_token`` field of the endpoint's JSON reply.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-2xx HTTP status.
        RuntimeError: If the reply contains no ``access_token``.
    """
    url = 'https://aip.baidubce.com/oauth/2.0/token'
    params = {
        'grant_type': 'client_credentials',
        'client_id': API_KEY,
        'client_secret': SECRET_KEY,
    }
    # Without a timeout, requests can block indefinitely on a stalled server.
    response = requests.post(url, params=params, timeout=timeout)
    response.raise_for_status()
    token = response.json().get('access_token')
    if not token:
        # Fail here with a clear message instead of returning the literal
        # string 'None' and letting the next API call fail obscurely.
        raise RuntimeError(
            'Baidu token endpoint returned no access_token; '
            'check API_KEY and SECRET_KEY'
        )
    return str(token)
def get_text_vector(text, timeout=30):
    """Fetch the embedding vector for *text* from Baidu's embedding-v1 endpoint.

    Args:
        text: The string to embed; it is sent as a single-element ``input`` list.
        timeout: Seconds to wait for the embedding request before giving up.

    Returns:
        The ``embedding`` entry of the first item in the reply's ``data`` list.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-2xx HTTP status.
        RuntimeError: If the reply carries an ``error_code`` instead of data.
    """
    url = ('https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/'
           'embeddings/embedding-v1')
    # Passing the token via params lets requests URL-encode it.
    query = {'access_token': get_access_token()}
    payload = json.dumps({'input': [text]})
    headers = {'Content-Type': 'application/json'}
    response = requests.post(url, params=query, headers=headers,
                             data=payload, timeout=timeout)
    response.raise_for_status()
    result = response.json()
    if 'error_code' in result:
        # Surface the service's own error rather than a bare KeyError on 'data'.
        raise RuntimeError('Embedding request failed: {} {}'.format(
            result.get('error_code'), result.get('error_msg')))
    return result['data'][0]['embedding']
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Args:
        vec1: First vector (any sequence accepted by ``np.dot``).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        A value in [-1, 1]: 1 for the same direction, 0 for orthogonal
        vectors, -1 for opposite directions. Returns 0.0 when either vector
        has zero length, because the angle is undefined there.
    """
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    if norm_vec1 == 0 or norm_vec2 == 0:
        return 0.0
    # Keep the sign of the dot product: taking its absolute value would
    # report opposite vectors as perfectly similar.
    return np.dot(vec1, vec2) / (norm_vec1 * norm_vec2)
def euclidean_distance(vec1, vec2):
    """Return the Euclidean (L2) distance between two vectors.

    Both inputs are converted with ``np.array``; the result is the square
    root of the summed squared element-wise differences.
    """
    delta = np.array(vec1) - np.array(vec2)
    squared = delta ** 2
    return np.sqrt(squared.sum())
def manhattan_distance(vec1, vec2):
    """Return the Manhattan (L1) distance between two vectors.

    Both inputs are converted with ``np.array``; the result is the sum of
    the absolute element-wise differences.
    """
    delta = np.array(vec1) - np.array(vec2)
    return np.abs(delta).sum()
def jaccard_similarity(words1, words2):
    """Return the Jaccard similarity coefficient |A ∩ B| / |A ∪ B|.

    Args:
        words1: First collection of hashable items (a set or any iterable).
        words2: Second collection of hashable items.

    Returns:
        float: A value in [0, 1]. Two empty collections are treated as
        identical and yield 1.0 instead of dividing by zero.
    """
    first, second = set(words1), set(words2)
    # The union must cover both inputs; using only one of them makes the
    # denominator too small whenever the other has extra elements.
    union = first | second
    if not union:
        return 1.0
    return len(first & second) / len(union)
# Two sentences built from the same words in a different order.
text1 = '我的硕士研究方向是自然语言处理和大语言模型'
text2 = '自然语言处理和大语言模型是我硕士的研究方向'

# Embed each sentence through the remote embedding API (text1 first).
vector1, vector2 = get_text_vector(text1), get_text_vector(text2)

# Tokenise with jieba; the word sets feed the Jaccard coefficient.
words1, words2 = set(jieba.cut(text1)), set(jieba.cut(text2))
# print(words1, words2)

# Values noted by the author, presumably from an earlier run:
#   cosine similarity:   0.9903323287208232
#   Euclidean distance:  0.1390515819809643
#   Manhattan distance:  1.8267254046149901
#   Jaccard coefficient: 1.0
cosine_sim = cosine_similarity(vector1, vector2)
euclidean_dis = euclidean_distance(vector1, vector2)
manhattan_dis = manhattan_distance(vector1, vector2)
jaccard_sim = jaccard_similarity(words1, words2)

# One metric per line, in the order computed above.
print(cosine_sim, euclidean_dis, manhattan_dis, jaccard_sim, sep='\n')