# --- BERT sentence-similarity demo ---
import torch
from transformers import BertModel, BertTokenizer

# Embed three Chinese sentences with a pretrained Chinese BERT and compare
# their pooled [CLS] representations via cosine similarity.
tokenizer = BertTokenizer.from_pretrained('chinese_model')
model = BertModel.from_pretrained('chinese_model')
model.eval()  # inference only: disable dropout for deterministic embeddings


def _sentence_embedding(sentence):
    """Return the pooled BERT output for *sentence* as a 1-D (hidden,) tensor."""
    enc = tokenizer.encode_plus(sentence, add_special_tokens=True,
                                return_attention_mask=True)
    input_ids = torch.tensor(enc['input_ids']).unsqueeze(0)
    token_type_ids = torch.tensor(enc['token_type_ids']).unsqueeze(0)
    attention_mask = torch.tensor(enc['attention_mask']).unsqueeze(0)
    # No gradients needed for similarity scoring — saves memory and time.
    with torch.no_grad():
        out = model(input_ids, attention_mask=attention_mask,
                    token_type_ids=token_type_ids)
    # out[1] is the pooler output of shape (1, hidden); drop the batch dim.
    return out[1].squeeze(0)


sentenceA = '我是一名学生,我喜欢学习'
sentenceB = "我是大学生,我平常喜欢看书"
sentenceC = "比特币在最近的市场中有剧烈的波动"

afterA = _sentence_embedding(sentenceA)
afterB = _sentence_embedding(sentenceB)
afterC = _sentence_embedding(sentenceC)

# Pairwise cosine similarities: A/B should score high (both about studying),
# A/C and B/C low (C is about bitcoin markets).
print(torch.cosine_similarity(afterA, afterB, dim=0))
print(torch.cosine_similarity(afterA, afterC, dim=0))
print(torch.cosine_similarity(afterB, afterC, dim=0))
# --- word2vec baseline ---
# Download the pretrained word-vector model before running this section.
import jieba
import numpy as np
from math import sqrt

# Segment three Chinese sentences with jieba (precise mode).
seg1 = jieba.lcut("我是西南财经大学的学生", cut_all=False)
seg2 = jieba.lcut("我是来自四川成都的学生", cut_all=False)
seg3 = jieba.lcut("财经新闻属于新闻的一个细分类目,侧重点是采集、报道、发布财经领域的新闻", cut_all=False)

# Load the pretrained financial word-vector model: one "word v1 ... vN"
# whitespace-separated line per entry.
embeddings_index = {}
with open(r'F:\b_student\研究生\研究\多维信息融合\词向量模型\sgns.financial.bigram-char',
          encoding='utf-8', errors='ignore') as f:
    # Stream line by line instead of f.readlines(): the vector file is large
    # and does not need to be materialized in memory twice.
    for line in f:
        values = line.split()
        if not values:
            continue  # blank line — values[0] would raise IndexError
        word = values[0]
        try:
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
        except ValueError as e:
            # Malformed line (non-numeric token); report it and keep loading.
            print(e)
            print(line)
def _mean_vector(words, dim=300):
    """Average the *dim*-dimensional embeddings of *words*.

    Out-of-vocabulary words are printed and skipped, matching the original
    script's diagnostics. Returns the zero vector when no word is known.
    """
    vec = np.zeros(dim)
    count = 0
    for word in words:
        try:
            vec += embeddings_index[word]
            count += 1
        except KeyError:
            print(word)  # out-of-vocabulary word
    if count != 0:
        vec /= count
    return vec


# Sentence vector = mean of its in-vocabulary word vectors.
sen_vec1 = _mean_vector(seg1)
sen_vec2 = _mean_vector(seg2)
sen_vec3 = _mean_vector(seg3)
def similarity(v1, v2):
    """Return the cosine similarity of *v1* and *v2*.

    Returns -1 as a sentinel when either vector has zero norm, since the
    cosine is undefined in that case.
    """
    dot12 = np.dot(v1, v2)
    len1, len2 = sqrt(np.dot(v1, v1)), sqrt(np.dot(v2, v2))
    if len1 == 0 or len2 == 0:
        return -1
    return dot12 / (len1 * len2)
# Report pairwise cosine similarities between the three sentence vectors,
# in the same order as the BERT section above: (1,2), (1,3), (2,3).
for left, right in ((sen_vec1, sen_vec2), (sen_vec1, sen_vec3), (sen_vec2, sen_vec3)):
    print(similarity(left, right))