# Convert the texts to BERT vectors, then compute their similarity.
from transformers import BertTokenizer, BertModel
import torch

# Load the pretrained BERT model and its tokenizer.
# Fix: the checkpoint id was misspelled 'bert-base-chinesed' (trailing 'd'),
# which is not a real Hugging Face model id and raises at download time;
# the correct id is 'bert-base-chinese'.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese')

# Input texts to compare.
# NOTE(review): the texts are English but the checkpoint is Chinese BERT;
# embeddings will still be produced, but a model matched to the input
# language (e.g. 'bert-base-uncased') would likely give better similarity.
text1 = "I am happy"
text2 = "I am glad"

# Tokenize each text into BERT input tensors (input_ids, attention_mask, ...).
encoded_input1 = tokenizer(text1, return_tensors='pt')
encoded_input2 = tokenizer(text2, return_tensors='pt')

# Run both forward passes without building autograd graphs.
# Fix: in the original, these two lines were not indented under the
# `with torch.no_grad():` statement, which is a SyntaxError (empty with-body).
with torch.no_grad():
    output1 = model(**encoded_input1)
    output2 = model(**encoded_input2)

# Sentence embedding: mean-pool the last hidden states over the token axis,
# yielding one (hidden_size,) vector per text.
bert_vector1 = output1.last_hidden_state.mean(dim=1).squeeze()
bert_vector2 = output2.last_hidden_state.mean(dim=1).squeeze()

# Cosine similarity between the two sentence vectors (scalar in [-1, 1]).
cos_sim = torch.cosine_similarity(bert_vector1, bert_vector2, dim=0)

print("文本1的BERT向量:", bert_vector1)
print("文本2的BERT向量:", bert_vector2)
print("文本相似度:", cos_sim.item())