Sentence-Transformers library
from sentence_transformers import SentenceTransformer, util

# Lightweight multilingual sentence encoder; encodes each corpus once.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Capture the raw text corpora, then rebind each name to its embedding tensor.
_corpora = (train_used, test_used, train_unused, test_unused)
train_used, test_used, train_unused, test_unused = [
    model.encode(texts, convert_to_tensor=True) for texts in _corpora
]

# For every "used" test sentence, keep its best cosine score against the
# "unused" training sentences.
sim_matrix = util.cos_sim(test_used, train_unused)
similarities_used2unused = sim_matrix.max(dim=1).values
spaCy
# Setup: imports for the spaCy-based similarity pipeline below.
import os

import numpy as np          # needed by the sentence-mask indexing below
import torch                # `import torch.nn.functional as F` alone does NOT bind `torch`
import torch.nn.functional as F

import spacy
from spacy.language import Language
from spacy.tokens import Doc

# zh_core_web_lg supplies word vectors (Doc.vector). Loading it is expensive,
# so do it exactly once (the original loaded the model twice).
nlp = spacy.load('zh_core_web_lg')
def sentence_embed(input_list):
    """Embed a list of sentences with the module-level spaCy pipeline.

    Args:
        input_list: iterable of raw sentence strings.

    Returns:
        torch.Tensor of shape (len(input_list), vector_dim), one row per
        sentence, built from each Doc's averaged word vectors (Doc.vector).
    """
    # nlp.pipe streams the texts through the pipeline; n_process=1 keeps it
    # single-process (multiprocess pipe can deadlock in notebooks).
    docs = list(nlp.pipe(input_list, n_process=1))
    # Stack the per-document numpy vectors into one float tensor.
    sentence_vector = torch.Tensor([x.vector for x in docs])
    return sentence_vector
def cos_sim(test, feature):
    """Pairwise cosine similarity between two batches of row vectors.

    Args:
        test: tensor of shape (n, d).
        feature: tensor of shape (m, d).

    Returns:
        Tensor of shape (n, m) where out[i, j] is the cosine similarity
        between test[i] and feature[j].
    """
    # L2-normalize rows so the matrix product yields cosine similarities.
    test = F.normalize(test, dim=1)
    feature = F.normalize(feature, dim=1)
    out = torch.mm(test, feature.T)
    return out
# Score the first 500 candidate sentences against both reference banks and
# split them into "use" / "unuse" buckets stored back on `content`.
candidates = content["sentence_list"][:500]
test_embeded = sentence_embed(candidates)

# Best cosine match of each candidate against the unused/used training sets.
unused_val = cos_sim(test_embeded, train_unused).max(dim=1).values
used_val = cos_sim(test_embeded, train_used).max(dim=1).values

# Keep a sentence when it is close enough to the "used" bank (> 0.8428)
# while not too close to the "unused" bank (< 0.9408); thresholds were
# presumably tuned empirically — confirm against the training notebook.
use = torch.logical_and(unused_val < 0.9408, used_val > 0.8428)
unuse = torch.logical_not(use)

# Boolean-mask the candidate list via numpy fancy indexing.
candidate_arr = np.array(candidates)
use_sen = list(candidate_arr[use.numpy()])
unuse_sen = list(candidate_arr[unuse.numpy()])

content["use_sen"] = use_sen
content["unuse_sen"] = unuse_sen