python-pytorch skip-gram implementation 0.5.000 [runnable as-is]

References

https://blog.csdn.net/Metal1/article/details/132886936

https://blog.csdn.net/L_goodboy/article/details/136347947

Import packages

import jieba
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from tqdm import tqdm, trange
torch.manual_seed(1)

Load the data and tokenize

# Load the stop-word list
def load_stop_words():
    """
        停用词是指在信息检索中,
        为节省存储空间和提高搜索效率,
        在处理自然语言数据(或文本)之前或之后
        会自动过滤掉某些字或词
    """
    with open('data/stopwords.txt', "r", encoding="utf-8") as f:
        return f.read().split("\n")
 
 
# Load the text file and tokenize it
def cut_words():
    stop_words = load_stop_words()
    with open('data/zh.txt', encoding='utf8') as f:
        allData = f.readlines()
    result = []
    for words in allData:
        c_words = jieba.lcut(words)
        for word in c_words:
            if word not in stop_words and word != "\n":
                result.append(word)
    return result

# Tokenize a single sentence (stop words removed)
def cut_sentense(sentence):
    stop_words = load_stop_words()
    result = []
    for word in jieba.lcut(sentence):
        if word not in stop_words and word != "\n":
            result.append(word)
    return result
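
A quick usage sketch of the two helpers (hedged: the sample sentence is made up, and the actual tokens depend on data/zh.txt, data/stopwords.txt and jieba's segmentation):

# Example usage of the tokenization helpers above (the sample sentence is hypothetical)
tokens = cut_words()                      # all tokens from data/zh.txt, stop words removed
sample = cut_sentense("打印服务的参数值")  # tokenize a single sentence
print(len(tokens), tokens[:10])
print(sample)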

Build wordList and raw_text

wordList = []
data = cut_words()
data

for words in data:
    if words not in wordList:
        wordList.append(words)
print("wordList=", wordList)
 
raw_text = wordList
print("raw_text=", raw_text)
# Hyperparameters (note: the optimizer below actually uses lr=0.001 and the training loop runs 2000 epochs)
learning_rate = 0.003
# Device to run on (CUDA or CPU)
device = torch.device('cpu')
# Context window: the n words before and the n words after the center word
context_size = 2
# Embedding dimension: how many floats represent one word, e.g. the=[10.2323,12.132133,4.1219774]...
embedding_dim = 100
epoch = 10
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)
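
make_context_vector just maps a list of context words to a LongTensor of indices; a minimal sketch with a made-up word_to_ix (the real word_to_idx is built below):

# Toy word_to_ix for illustration only; the real word_to_idx is defined later
toy_word_to_ix = {"读取": 0, "文件": 1, "打印": 2, "结果": 3}
print(make_context_vector(["读取", "结果"], toy_word_to_ix))  # tensor([0, 3])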

Build vocab and vocab_size

# Collect all distinct words into a set
vocab = set(wordList)
vocab_size = len(vocab)
vocab,vocab_size

Build word_to_idx and idx_to_word

word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {i: word for i, word in enumerate(vocab)}

Prepare the training data

data3 = []
window_size1 = 2
for i, word in enumerate(raw_text):
    target = raw_text[i]  # center word
    # Words inside the window around position i (the center word itself is skipped below)
    contexts = raw_text[max(i - window_size1, 0): min(i + window_size1 + 1, len(raw_text))]
    for context in contexts:
        if target != context:
            data3.append((context, target))  # each training pair is (context word, center word)
data3, len(data3)
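
To make the windowing concrete, here is a small sketch with a made-up token list showing which (context, center) pairs the same loop produces:

# Toy illustration of the pair-building loop above (the tokens are hypothetical)
toy_text = ["读取", "文件", "打印", "结果"]
toy_pairs = []
for i in range(len(toy_text)):
    center = toy_text[i]
    window = toy_text[max(i - 2, 0): min(i + 2 + 1, len(toy_text))]
    for ctx in window:
        if ctx != center:
            toy_pairs.append((ctx, center))
print(toy_pairs)
# [('文件', '读取'), ('打印', '读取'), ('读取', '文件'), ('打印', '文件'), ('结果', '文件'), ...]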
     

Define the model and parameters

class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)
        
    def forward(self, center_word):
        embedded = self.embedding(center_word)
        output = self.linear(embedded)
        return output
 
model = SkipGramModel(vocab_size, embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
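
A quick sanity check of the forward pass before training: one word index in, one score per vocabulary word out.

# Feed a single word index through the untrained model and check the output shape
with torch.no_grad():
    dummy_idx = torch.tensor(0, dtype=torch.long)  # any valid word index
    print(model(dummy_idx).shape)                  # torch.Size([vocab_size])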

Train the model

# Training
for epoch in tqdm(range(2000)):
    loss_sum = 0
    # Each pair is (context word, center word); the center word is the model input
    for context, center in data3:

        targetidx = word_to_idx[context]
        inputidx = word_to_idx[center]

        output = model(torch.tensor(inputidx, dtype=torch.long))
        loss = criterion(output, torch.tensor(targetidx))

        optimizer.zero_grad()  # clear gradients
        loss.backward()        # backpropagate
        optimizer.step()       # update parameters

        loss_sum += loss.item()
    if (epoch+1) % 10 == 0:
        print("loss is ", loss_sum/len(data3), loss.item())
   

Save the model

torch.save(model.state_dict(),"skipgram.pth")
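
To reuse the saved weights later, a minimal loading sketch (it assumes the same vocab_size and embedding_dim as at save time):

# Rebuild the architecture and load the saved weights
model2 = SkipGramModel(vocab_size, embedding_dim)
model2.load_state_dict(torch.load("skipgram.pth"))
model2.eval()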

Simple prediction

inputidx=word_to_idx["refresh"]

output=model(torch.tensor(inputidx,dtype=torch.long))
print(output.topk(4))
cc,index=output.topk(4)
idx_to_word[index[0].item()],idx_to_word[index[1].item()],idx_to_word[index[2].item()],idx_to_word[index[3].item()]

def predict(centerword):
    inputidx = word_to_idx[centerword]
    output = model(torch.tensor(inputidx, dtype=torch.long))
    print(output.topk(4))
    cc, index = output.topk(4)
    return (idx_to_word[index[0].item()], idx_to_word[index[1].item()],
            idx_to_word[index[2].item()], idx_to_word[index[3].item()])
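
Example call, reusing the same word as above:

print(predict("refresh"))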
    

Extract the trained word vectors

trained_vector_dic = {}
for word, idx in word_to_idx.items():  # look up each word's embedding vector
    trained_vector_dic[word] = model.embedding.weight[idx].detach()

trained_vector_dic

Plot the distribution

# The embeddings are 100-dimensional, so project them to 2D with PCA before plotting
coords = PCA(n_components=2).fit_transform(model.embedding.weight.detach().numpy())
fig, ax = plt.subplots()
for word, idx in word_to_idx.items():
    vec = coords[idx]           # 2D coordinates of this word's embedding
    ax.scatter(vec[0], vec[1])  # plot the point
    ax.annotate(word, (vec[0], vec[1]), fontsize=12)  # label the point with the word
plt.title('2D word embeddings')   # title
plt.xlabel('dimension 1')         # X axis label
plt.ylabel('dimension 2')         # Y axis label
plt.show()                        # show the figure

Compute similarity with the word vectors

Cosine similarity

# https://blog.csdn.net/qq_41487299/article/details/106299882
import torch
import torch.nn.functional as F

# Cosine similarity between two word vectors
x = trained_vector_dic["保持数据"]
y = trained_vector_dic["打印信息"]
cosine_similarity = F.cosine_similarity(x.unsqueeze(0), y.unsqueeze(0))

print(cosine_similarity)

cosine_similarity1 = F.cosine_similarity(trained_vector_dic["保持数据"].unsqueeze(0), trained_vector_dic["打印信息"].unsqueeze(0))
print(cosine_similarity1)

Dot product

dot_product = torch.dot(trained_vector_dic["保持数据"], trained_vector_dic["打印信息"])
x_length = torch.norm(trained_vector_dic["保持数据"])
y_length = torch.norm(trained_vector_dic["打印信息"])
similarity = dot_product / (x_length * y_length)

print(similarity)
trained_vector_dic["参数值"], len(trained_vector_dic)

cos = nn.CosineSimilarity(dim=0)  # cosine similarity along dim 0 for two 1-D vectors
c1 = cos(trained_vector_dic["删除"], trained_vector_dic["服务"])
print(c1)
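
Building on the pairwise similarities above, a sketch that ranks the whole vocabulary by cosine similarity to one query word (the query "服务" is just an example word from the corpus, and most_similar is a helper introduced here, not part of the original code):

# Rank all words by cosine similarity to a query word, using the trained embedding matrix
def most_similar(query, topn=5):
    query_vec = model.embedding.weight[word_to_idx[query]].detach()
    all_vecs = model.embedding.weight.detach()
    sims = F.cosine_similarity(query_vec.unsqueeze(0), all_vecs)  # shape: (vocab_size,)
    best = sims.topk(topn + 1).indices.tolist()                   # +1: the query itself ranks first
    return [idx_to_word[i] for i in best if idx_to_word[i] != query][:topn]

print(most_similar("服务"))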