A PyTorch Implementation of Word2Vec (Part 1)

import torch
import numpy as np
import torch.nn as nn
import torch.utils.data as Data
import matplotlib.pyplot as plt
import torch.optim as optimizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.FloatTensor

sentences = ["jack like dog", "jack like cat", "jack like animal",
  "dog cat animal", "banana apple cat dog like", "dog fish milk like",
  "dog cat animal like", "jack like apple", "apple like", "jack like banana",
  "apple banana jack movie book music like", "cat dog hate", "cat dog like"]

sentence_list = " ".join(sentences).split() # ['jack', 'like', 'dog', 'jack', 'like', 'cat',...]

vocab = list(set(sentence_list)) # ['like', 'animal', 'apple', 'movie', 'music', 'hate', 'jack', 'cat', 'milk', 'fish', 'book', 'dog', 'banana']
word2idx = {w:i for i,w in enumerate(vocab)} # word -> index, e.g. {'like': 0, 'animal': 1, 'apple': 2, ...}
vocab_size = len(vocab)

# model hyperparameters
C = 2 # window size: 2 context words on each side of the center word
batch_size = 8
m = 2 # word embedding dim (kept at 2 so the vectors can be plotted directly)

skip_grams = [] # list of [center, context] index pairs
for idx in range(C, len(sentence_list) - C): # only positions with a full window on both sides serve as centers
    center = word2idx[sentence_list[idx]] # index of the center word
    context_idx = list(range(idx - C, idx)) + list(range(idx + 1, idx + C + 1)) # positions of the 2*C surrounding words
    context = [word2idx[sentence_list[i]] for i in context_idx] # indices of the context words

    for w in context:
        skip_grams.append([center, w])
# print(skip_grams) # [[2, 11], [2, 10], [2, 11], [2, 10], [11, 10], [11, 2], [11, 10], [11, 5], [10, 2], [10, 11], [10, 5],...]
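# Optional sanity check (not required by the rest of the script): map a few
# (center, context) index pairs back to words to confirm the pairs look right.
# idx2word = {i: w for w, i in word2idx.items()}
# for c, o in skip_grams[:5]:
#     print(idx2word[c], "->", idx2word[o])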

# np.eye(vocab_size) is a vocab_size x vocab_size identity matrix;
# row i of it is the one-hot vector for the word with index i.
def make_data(skip_grams):
    input_data = []
    output_data = []
    for i in range(len(skip_grams)):
        input_data.append(np.eye(vocab_size)[skip_grams[i][0]]) # one-hot vector of the center word (row of the identity matrix)
        output_data.append(skip_grams[i][1]) # index of the context word (the classification target)
    return input_data,output_data

input_data, output_data = make_data(skip_grams)
input_data, output_data = torch.Tensor(np.array(input_data)), torch.LongTensor(output_data)
dataset = Data.TensorDataset(input_data, output_data)
loader = Data.DataLoader(dataset, batch_size, shuffle=True)
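# Optional shape check (not part of the original output): each batch holds
# batch_size one-hot inputs and batch_size target indices.
# xb, yb = next(iter(loader))
# print(xb.shape, yb.shape)  # torch.Size([8, 13]) torch.Size([8])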

# build the model
class Word2Vec(nn.Module):
    def __init__(self):
        super(Word2Vec, self).__init__()
        self.W = nn.Parameter(torch.randn(vocab_size, m).type(dtype)) # input embedding matrix: one m-dimensional vector per word
        self.V = nn.Parameter(torch.randn(m, vocab_size).type(dtype)) # output projection matrix

    def forward(self, X):
        # X: [batch_size, vocab_size] one-hot rows
        hidden = torch.mm(X, self.W) # [batch_size, m]
        output = torch.mm(hidden, self.V) # [batch_size, vocab_size]

        return output

model = Word2Vec().to(device)
loss_fn = nn.CrossEntropyLoss().to(device)
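# nn.CrossEntropyLoss applies log-softmax internally, so the model returns raw
# scores over the vocabulary and the target is simply the context word's index.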
optim = optimizer.Adam(model.parameters(), lr=1e-3)

# training loop: 168 (center, context) pairs / batch_size 8 = 21 batches per epoch
for epoch in range(2000):
    for i, (batch_x, batch_y) in enumerate(loader):
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        pred = model(batch_x)
        loss = loss_fn(pred, batch_y)

        if (epoch + 1) % 1000 == 0:
            print(epoch + 1, i, loss.item())

        optim.zero_grad()
        loss.backward()
        optim.step()

# visualize the learned 2-D embeddings (rows of W)
for i, label in enumerate(vocab):
    W, WT = model.parameters()
    x, y = float(W[i][0]), float(W[i][1])
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
plt.show()


D:\dev\anaconda\python.exe E:/DL-Pytorch/Word2Vec/Word2Vec-Torch.py
1000 0 2.1735804080963135
1000 1 2.381221294403076
1000 2 2.744288921356201
1000 3 2.117478847503662
1000 4 2.412742853164673
1000 5 1.8749442100524902
1000 6 1.8011806011199951
1000 7 2.073594093322754
1000 8 1.9987781047821045
1000 9 2.042032241821289
1000 10 1.908738613128662
1000 11 2.047450065612793
1000 12 2.151552677154541
1000 13 1.8343603610992432
1000 14 2.034298896789551
1000 15 1.9653769731521606
1000 16 1.861008882522583
1000 17 1.7325068712234497
1000 18 2.021176815032959
1000 19 2.501958131790161
1000 20 2.10615873336792
2000 0 2.2089803218841553
2000 1 2.4472451210021973
2000 2 2.1905694007873535
2000 3 1.72980535030365
2000 4 2.0529072284698486
2000 5 1.999685287475586
2000 6 2.0909345149993896
2000 7 1.9278661012649536
2000 8 1.7663590908050537
2000 9 2.3422372341156006
2000 10 2.340832471847534
2000 11 2.1134681701660156
2000 12 2.3707051277160645
2000 13 2.1404480934143066
2000 14 1.9689396619796753
2000 15 2.090712547302246
2000 16 2.0929083824157715
2000 17 2.022451162338257
2000 18 1.5936014652252197
2000 19 1.5769505500793457
2000 20 2.144016981124878

[Figure: 2-D scatter plot of the learned word embeddings, one labeled point per vocabulary word]
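As a possible follow-up (not part of the original script), the rows of W can be treated as the word vectors and compared with cosine similarity; the word choices "cat" and "dog" below are just taken from the toy corpus above.

```
W, WT = model.parameters()
emb = W.detach().cpu()          # [vocab_size, m] input embedding matrix
a = emb[word2idx["cat"]]
b = emb[word2idx["dog"]]
# cosine similarity between the two 2-D word vectors
print(torch.cosine_similarity(a.unsqueeze(0), b.unsqueeze(0)).item())
```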

The basic steps for implementing Word2Vec with PyTorch are:

1. Data preprocessing: convert the raw text into a form the network can consume; libraries such as NLTK can handle tokenization, stop-word removal, and so on.
2. Build the network: Word2Vec commonly uses one of two architectures, CBOW (Continuous Bag of Words) or Skip-gram. CBOW predicts the center word from its context, while Skip-gram predicts the context from the center word; Skip-gram is used here.
3. Define the loss: the objective is to make related word vectors similar, which can be trained with a softmax over the vocabulary and a cross-entropy loss.
4. Train: update the parameters with backpropagation and stochastic gradient descent to minimize the loss.

A simple PyTorch implementation:

```
import torch
import torch.nn as nn
import torch.optim as optim

class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, center_word):
        center_embed = self.embeddings(center_word)
        out = self.linear(center_embed)
        log_probs = nn.functional.log_softmax(out, dim=1)
        return log_probs

# data and hyperparameters
data = ['I', 'love', 'NLP', 'and', 'deep', 'learning']
vocab_size = len(set(data))
embedding_dim = 10
batch_size = 1
learning_rate = 0.001

# model, loss function, and optimizer
model = SkipGram(vocab_size, embedding_dim)
criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# train the model
for epoch in range(100):
    for i in range(len(data)):
        center_word = torch.tensor([i])
        log_probs = model(center_word)
        loss = criterion(log_probs, torch.tensor([i]))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# extract the word vectors
word_embeddings = model.embeddings.weight.detach().numpy()
```

This implementation is deliberately simple and only handles individual words; processing real text requires more sophisticated methods. Note also that Word2Vec training needs a large amount of data and compute, especially when training on large corpora.
