# --- BERT sentence-similarity demo ---
import torch
from transformers import BertModel, BertTokenizer

# Embed three Chinese sentences with a pretrained Chinese BERT and compare
# their pooled [CLS] representations via cosine similarity.
tokenizer = BertTokenizer.from_pretrained('chinese_model')
model = BertModel.from_pretrained('chinese_model')
model.eval()  # inference only: disable dropout for deterministic embeddings


def _sentence_embedding(sentence):
    """Return the pooled BERT output for *sentence* as a 1-D (hidden,) tensor."""
    enc = tokenizer.encode_plus(sentence, add_special_tokens=True,
                                return_attention_mask=True)
    input_ids = torch.tensor(enc['input_ids']).unsqueeze(0)
    token_type_ids = torch.tensor(enc['token_type_ids']).unsqueeze(0)
    attention_mask = torch.tensor(enc['attention_mask']).unsqueeze(0)
    # No gradients needed for similarity scoring — saves memory and time.
    with torch.no_grad():
        out = model(input_ids, attention_mask=attention_mask,
                    token_type_ids=token_type_ids)
    # out[1] is the pooler output of shape (1, hidden); drop the batch dim.
    return out[1].squeeze(0)


sentenceA = '我是一名学生,我喜欢学习'
sentenceB = "我是大学生,我平常喜欢看书"
sentenceC = "比特币在最近的市场中有剧烈的波动"

afterA = _sentence_embedding(sentenceA)
afterB = _sentence_embedding(sentenceB)
afterC = _sentence_embedding(sentenceC)

# Pairwise cosine similarities: A/B should score high (both about studying),
# A/C and B/C low (C is about bitcoin markets).
print(torch.cosine_similarity(afterA, afterB, dim=0))
print(torch.cosine_similarity(afterA, afterC, dim=0))
print(torch.cosine_similarity(afterB, afterC, dim=0))
# --- word2vec baseline ---
# Download the pretrained word-vector model before running this section.
import jieba
import numpy as np
from math import sqrt

# Segment three Chinese sentences with jieba (precise mode).
seg1 = jieba.lcut("我是西南财经大学的学生", cut_all=False)
seg2 = jieba.lcut("我是来自四川成都的学生", cut_all=False)
seg3 = jieba.lcut("财经新闻属于新闻的一个细分类目,侧重点是采集、报道、发布财经领域的新闻", cut_all=False)

# Load the pretrained financial word-vector model: one "word v1 ... vN"
# whitespace-separated line per entry.
embeddings_index = {}
with open(r'F:\b_student\研究生\研究\多维信息融合\词向量模型\sgns.financial.bigram-char',
          encoding='utf-8', errors='ignore') as f:
    # Stream line by line instead of f.readlines(): the vector file is large
    # and does not need to be materialized in memory twice.
    for line in f:
        values = line.split()
        if not values:
            continue  # blank line — values[0] would raise IndexError
        word = values[0]
        try:
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
        except ValueError as e:
            # Malformed line (non-numeric token); report it and keep loading.
            print(e)
            print(line)
def _mean_vector(words, dim=300):
    """Average the *dim*-dimensional embeddings of *words*.

    Out-of-vocabulary words are printed and skipped, matching the original
    script's diagnostics. Returns the zero vector when no word is known.
    """
    vec = np.zeros(dim)
    count = 0
    for word in words:
        try:
            vec += embeddings_index[word]
            count += 1
        except KeyError:
            print(word)  # out-of-vocabulary word
    if count != 0:
        vec /= count
    return vec


# Sentence vector = mean of its in-vocabulary word vectors.
sen_vec1 = _mean_vector(seg1)
sen_vec2 = _mean_vector(seg2)
sen_vec3 = _mean_vector(seg3)
def similarity(v1, v2):
    """Return the cosine similarity of *v1* and *v2*.

    Returns -1 as a sentinel when either vector has zero norm, since the
    cosine is undefined in that case.
    """
    dot12 = np.dot(v1, v2)
    len1, len2 = sqrt(np.dot(v1, v1)), sqrt(np.dot(v2, v2))
    if len1 == 0 or len2 == 0:
        return -1
    return dot12 / (len1 * len2)
# Report pairwise cosine similarities between the three sentence vectors,
# in the same order as the BERT section above: (1,2), (1,3), (2,3).
for left, right in ((sen_vec1, sen_vec2), (sen_vec1, sen_vec3), (sen_vec2, sen_vec3)):
    print(similarity(left, right))