NLP生成句子向量的两种方式

BERT

import torch
from transformers import BertModel, BertTokenizer

# Load the tokenizer and the encoder weights from the local 'chinese_model' directory.
tokenizer = BertTokenizer.from_pretrained('chinese_model')
model = BertModel.from_pretrained('chinese_model', )

sentenceA = '我是一名学生,我喜欢学习'
sentenceB = "我是大学生,我平常喜欢看书"
sentenceC = "比特币在最近的市场中有剧烈的波动"
# Alternative third sentence; swap it in to compare three closely related sentences.
# sentenceC = "我是一名中学生,我喜欢阅读文字"


def _bert_sentence_vector(sentence):
    """Encode one sentence with BERT and return its sentence vector.

    Args:
        sentence: The raw text to encode.

    Returns:
        A 1-D tensor: element 1 of the model output (the pooler output in the
        Hugging Face ``BertModel`` API) with the batch dimension removed.
    """
    encoded = tokenizer.encode_plus(sentence, add_special_tokens=True, return_attention_mask=True)
    # The tokenizer returns plain Python lists; add a leading batch dimension of 1.
    input_ids = torch.tensor(encoded['input_ids']).unsqueeze(0)
    token_type_ids = torch.tensor(encoded['token_type_ids']).unsqueeze(0)
    attention_mask = torch.tensor(encoded['attention_mask']).unsqueeze(0)
    # Inference only: skip autograd bookkeeping so no graph is built or retained.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    return outputs[1].squeeze(0)


afterA = _bert_sentence_vector(sentenceA)
afterB = _bert_sentence_vector(sentenceB)
afterC = _bert_sentence_vector(sentenceC)

# Pairwise cosine similarity between the three sentence vectors.
print(torch.cosine_similarity(afterA, afterB, dim=0))
print(torch.cosine_similarity(afterA, afterC, dim=0))
print(torch.cosine_similarity(afterB, afterC, dim=0))

word2vec

词向量模型下载(需预先下载中文预训练词向量文件 sgns.financial.bigram-char,下面的代码会从本地路径读取该文件)

import jieba
import numpy as np
from math import sqrt


# Split the three example sentences into word lists with jieba (cut_all=False).
seg1 = jieba.lcut("我是西南财经大学的学生", cut_all=False)
seg2 = jieba.lcut("我是来自四川成都的学生", cut_all=False)
seg3 = jieba.lcut("财经新闻属于新闻的一个细分类目,侧重点是采集、报道、发布财经领域的新闻", cut_all=False)

# Maps each word in the embedding file to its float32 vector.
embeddings_index = {}

with open(r'F:\b_student\研究生\研究\多维信息融合\词向量模型\sgns.financial.bigram-char', encoding='utf-8',errors='ignore') as f:
    # Stream the file line by line; readlines() would hold the whole
    # embedding file in memory at once.
    for line in f:
        values = line.split()
        # Skip blank lines (values[0] would raise IndexError) and lines with
        # fewer than two numbers -- presumably the "<vocab size> <dimension>"
        # header of the word2vec text format. Storing such a line would create
        # a 1-element vector that silently broadcasts when summed later.
        if len(values) < 3:
            continue
        word = values[0]
        try:
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
        except ValueError as e:
            # A field could not be parsed as a float; report the line and go on.
            print(e)
            print(line)


def _mean_word_vector(words, embeddings, dim=300):
    """Average the embeddings of the words that have a known vector.

    Args:
        words: Iterable of tokens making up one sentence.
        embeddings: Mapping from token to its embedding vector.
        dim: Length of the embedding vectors.

    Returns:
        A NumPy array of length ``dim`` holding the mean of the vectors that
        were found, or all zeros when no token is in ``embeddings``.
    """
    total = np.zeros(dim)
    found = 0
    for word in words:
        try:
            total += embeddings[word]
        except KeyError:
            # Out-of-vocabulary token: report it and leave it out of the mean.
            print(word)
        else:
            found += 1
    # Divide once, after the loop. Dividing inside the loop rescales the
    # running sum on every iteration, so earlier words are down-weighted and
    # the result is not the mean.
    if found:
        total /= found
    return total


sen_vec1 = _mean_word_vector(seg1, embeddings_index)
sen_vec2 = _mean_word_vector(seg2, embeddings_index)
sen_vec3 = _mean_word_vector(seg3, embeddings_index)

def similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Args:
        v1: First vector (anything ``np.dot`` accepts as a 1-D vector).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``-1`` when either vector has zero
        length. Note that ``-1`` is also the value for exactly opposite vectors.
    """
    norm_first = sqrt(np.dot(v1, v1))
    norm_second = sqrt(np.dot(v2, v2))

    # Cosine is undefined for a zero vector; signal that with the sentinel.
    if not (norm_first != 0 and norm_second != 0):
        return -1

    inner = np.dot(v1, v2)
    return inner / (norm_second * norm_first)

# Print the cosine similarity for each pair of sentence vectors:
# (1, 2), then (1, 3), then (2, 3).
for left_vec, right_vec in (
    (sen_vec1, sen_vec2),
    (sen_vec1, sen_vec3),
    (sen_vec2, sen_vec3),
):
    print(similarity(left_vec, right_vec))
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值