pytorch实现skip gram

最新推荐文章于 2024-04-09 17:40:37 发布

Hello 摸鱼达人

最新推荐文章于 2024-04-09 17:40:37 发布

阅读量239

点赞数

文章标签：自然语言处理

本文链接：https://blog.csdn.net/weixin_40440610/article/details/116570236

版权

import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import torch.utils.data as Data

dtype = torch.FloatTensor
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

sentences = ["I love china","I love xtu","china is a beautiful country","I love sports",
             "my research interesting is deep learning and marchine learning ",
             "I love neural langrage processing"]

word_sequence = " ".join(sentences).split()  # ['jack', 'like', 'dog', 'jack', 'like', 'cat', 'animal',...]
vocab = list(set(word_sequence))  # build words vocabulary
word2idx = {w: i for i, w in enumerate(vocab)}  # {'jack':0, 'like':1,...}

# Word2Vec Parameters
batch_size = 8
embedding_size = 2  # 2 dim vector represent one word
C = 2  # window size
voc_size = len(vocab)

skip_grams = []
for idx in range(C, len(word_sequence) - C):
    center = word2idx[word_sequence[idx]]  # center word
    context_idx = list(range(idx - C, idx)) + list(range(idx + 1, idx + C + 1))  # context word idx
    context = [word2idx[word_sequence[i]] for i in context_idx]
    for w in context:
        skip_grams.append([center, w])


def make_data(skip_grams):
    input_data = []
    output_data = []
    for i in range(len(skip_grams)):
        input_data.append(np.eye(voc_size)[skip_grams[i][0]])
        output_data.append(skip_grams[i][1])
    return input_data, output_data


input_data, output_data = make_data(skip_grams)
input_data, output_data = torch.Tensor(input_data), torch.LongTensor(output_data)
dataset = Data.TensorDataset(input_data, output_data)
loader = Data.DataLoader(dataset, batch_size, True)


# Model
class Word2Vec(nn.Module):
    def __init__(self):
        super(Word2Vec, self).__init__()

        # W and V is not Traspose relationship
        self.W = nn.Parameter(torch.randn(voc_size, embedding_size).type(dtype))
        self.V = nn.Parameter(torch.randn(embedding_size, voc_size).type(dtype))

    def forward(self, X):
        # X : [batch_size, voc_size] one-hot
        # torch.mm only for 2 dim matrix, but torch.matmul can use to any dim
        hidden_layer = torch.matmul(X, self.W)  # hidden_layer : [batch_size, embedding_size]
        output_layer = torch.matmul(hidden_layer, self.V)  # output_layer : [batch_size, voc_size]
        return output_layer


model = Word2Vec().to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training
for epoch in range(2000):
    for i, (batch_x, batch_y) in enumerate(loader):
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        pred = model(batch_x)
        loss = criterion(pred, batch_y)
        #if (epoch + 1) % 1000 == 0:
        print(epoch + 1, i, loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

for i, label in enumerate(vocab):
    W, WT = model.parameters()
    x, y = float(W[i][0]), float(W[i][1])
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
plt.show()

可视化结果：
在这里插入图片描述