计算文本矩阵相似度

Sentence-Transformers 库

from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model shared by every corpus encoded below.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Rebind each corpus name to its embedding tensor. The corpora are encoded in
# the order train_used, test_used, train_unused, test_unused.
# NOTE(review): the four corpora are defined outside this snippet; presumably
# they are lists of sentences -- confirm against the caller.
train_used, test_used, train_unused, test_unused = (
    model.encode(corpus, convert_to_tensor=True)
    for corpus in (train_used, test_used, train_unused, test_unused)
)

# Cosine-similarity matrix of test_used against train_unused, reduced with a
# max over dim 1: one best-match score per test_used embedding.
similarities_used2unused = (
    util.cos_sim(test_used, train_unused)
    .max(dim=1)
    .values
)

spaCy

import spacy
# Load the 'zh_core_web_lg' spaCy pipeline into the module-level name `nlp`,
# which sentence_embed() reads when embedding text.
nlp = spacy.load('zh_core_web_lg')
import os

import numpy as np
import spacy
import torch
import torch.nn.functional as F
from spacy.language import Language
from spacy.tokens import Doc
# NOTE(review): this repeats the identical spacy.load() call made a few lines
# above, so the same pipeline is loaded a second time and `nlp` is rebound.
nlp = spacy.load('zh_core_web_lg')
def sentence_embed(input_list):
    """Embed every text in ``input_list`` with the module-level spaCy pipeline.

    Args:
        input_list: Iterable of texts; it is passed straight to ``nlp.pipe``.

    Returns:
        A 2-D ``float32`` ``torch.Tensor`` with one row per input text, where
        each row is that document's ``Doc.vector``. Empty input yields a
        tensor of shape ``(0, nlp.vocab.vectors_length)`` so that row-wise
        operations such as ``F.normalize(..., dim=1)`` still work.
    """
    docs = list(nlp.pipe(input_list, n_process=1))
    if not docs:
        # Keep the result 2-D even when there is nothing to embed.
        return torch.empty((0, nlp.vocab.vectors_length), dtype=torch.float32)

    # Stack into a single ndarray first: building a tensor directly from a
    # Python list of ndarrays is the slow path PyTorch warns about.
    stacked = np.stack([doc.vector for doc in docs])
    return torch.from_numpy(stacked).to(torch.float32)
def cos_sim(test, feature):
    """Return the pairwise cosine similarity between rows of two 2-D tensors.

    Args:
        test: Tensor of shape ``(m, d)``.
        feature: Tensor of shape ``(n, d)``.

    Returns:
        Tensor of shape ``(m, n)`` whose ``[i, j]`` entry is the cosine
        similarity between ``test[i]`` and ``feature[j]``.
    """
    # Normalize along dim=1 so every row has unit length
    # (dim=0 would normalize the columns instead).
    unit_test = F.normalize(test, dim=1)
    unit_feature_t = F.normalize(feature, dim=1).T
    # Dot products of unit vectors are cosine similarities.
    return torch.mm(unit_test, unit_feature_t)

# Process the sentences of one file from the file list.
# NOTE(review): `content`, `train_used` and `train_unused` are defined outside
# this snippet; the reference tensors must have the same embedding width as
# the output of sentence_embed() -- confirm against the caller.

# Only the first 500 sentences are considered.
head_sentences = content["sentence_list"][:500]
test_embeded = sentence_embed(head_sentences)

# Best (maximum) cosine similarity of each sentence against each reference set.
unused_val = cos_sim(test_embeded, train_unused).max(dim=1).values
used_val = cos_sim(test_embeded, train_used).max(dim=1).values

# Keep a sentence when it is not too close to the "unused" set and close
# enough to the "used" set; everything else is rejected.
# NOTE(review): the origin of the 0.9408 / 0.8428 thresholds is not shown here.
use = (unused_val < 0.9408) & (used_val > 0.8428)
unuse = ~use

# Boolean-mask the sentences through a NumPy array, then store both partitions.
sentence_arr = np.array(head_sentences)
use_sen = list(sentence_arr[use.numpy()])
unuse_sen = list(sentence_arr[unuse.numpy()])
content["use_sen"] = use_sen
content["unuse_sen"] = unuse_sen

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值