Implementing Word2Vec in PyTorch

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @version: v1.0
# @Author   : Meng Li
# @contact: 925762221@qq.com
# @FILE     : torch_word2vec.py
# @Time     : 2022/7/21 14:12
# @Software : PyCharm
# @site: 
# @Description : A from-scratch Word2Vec pretraining language model based on the skip-gram algorithm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import numpy as np

sentences = ["jack like dog", "jack like cat", "jack like animal",
             "dog cat animal", "banana apple cat dog like", "dog fish milk like",
             "dog cat animal like", "jack like apple", "apple like", "jack like banana",
             "apple banana jack movie book music like", "cat dog hate", "cat dog like"]
sentences_list = " ".join(sentences).split()
vocab = list(set(sentences_list))
word2idx = {j: i for i, j in enumerate(vocab)}
idx2word = {i: j for i, j in enumerate(vocab)}
vocab_size = len(vocab)
window_size = 2
embedding_size = 2


def make_data(seq_data):
    """Build skip-gram training pairs: (one-hot vector of the center word, vocab index of a context word)."""
    skip_gram = []
    tokens = " ".join(seq_data).split()  # flatten the corpus into one token sequence
    for step in range(window_size, len(tokens) - window_size):
        center = word2idx[tokens[step]]
        # positions of the context words inside the window around the center word
        context_pos = list(range(step - window_size, step)) + list(range(step + 1, step + window_size + 1))
        for pos in context_pos:
            skip_gram.append([np.eye(vocab_size)[center], word2idx[tokens[pos]]])
    input_data = []
    target_data = []
    for a, b in skip_gram:
        input_data.append(a)
        target_data.append(b)
    return torch.FloatTensor(np.array(input_data)), torch.LongTensor(target_data)


class my_dataset(Dataset):
    def __init__(self, input_data, target_data):
        super(my_dataset, self).__init__()
        self.input_data = input_data
        self.target_data = target_data

    def __getitem__(self, index):
        return self.input_data[index], self.target_data[index]

    def __len__(self):
        return self.input_data.size(0)  # number of samples = size of the first dimension


class SkipGram(nn.Module):
    def __init__(self, embedding_size):
        super(SkipGram, self).__init__()
        self.embedding_size = embedding_size
        self.fc1 = nn.Linear(vocab_size, self.embedding_size)   # vocab_size -> embedding_size (embedding lookup)
        self.fc2 = nn.Linear(self.embedding_size, vocab_size)   # embedding_size -> vocab_size (output scores)
        self.loss = nn.CrossEntropyLoss()

    def forward(self, center, context):
        """
        :param center:  [batch_size, vocab_size]  one-hot vectors of the center words
        :param context: [batch_size]              vocabulary indices of the context words
        :return: cross-entropy loss between the predicted distribution and the context words
        """
        hidden = self.fc1(center)   # [batch_size, embedding_size]  embedding of the center word
        logits = self.fc2(hidden)   # [batch_size, vocab_size]      scores over the vocabulary
        loss = self.loss(logits, context)
        return loss


batch_size = 2
center_data, context_data = make_data(sentences)
train_data = my_dataset(center_data, context_data)
train_iter = DataLoader(train_data, batch_size, shuffle=True)
epochs = 2000
model = SkipGram(embedding_size=embedding_size)
model.train()
optim = torch.optim.Adam(model.parameters(), lr=1e-3)
for epoch in range(epochs):
    for center, context in train_iter:
        loss = model(center, context)
        optim.zero_grad()
        loss.backward()
        optim.step()
    if epoch % 100 == 0:
        print("epoch {0} loss {1}".format(epoch, loss.detach().numpy()))

The code above is a simple PyTorch implementation of a Word2Vec pretrained language model.
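
Since the end product we actually care about is the embedding matrix rather than the classifier itself, the learned word vectors can be read directly out of fc1 once training has finished. The snippet below is a minimal sketch of one way to do this with the model defined above (my own addition, not part of the original post); it treats the columns of fc1.weight as the word vectors, which follows from feeding one-hot inputs into nn.Linear(vocab_size, embedding_size).

# Sketch (not part of the original code): extract the learned embeddings from fc1.
# For a one-hot input with a 1 at position i, fc1 returns column i of fc1.weight
# (plus the bias), so each column can be taken as that word's vector.
embedding_matrix = model.fc1.weight.detach().T  # [vocab_size, embedding_size]
for word in ["jack", "dog", "apple"]:
    vec = embedding_matrix[word2idx[word]]
    print(word, vec.numpy())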

Two tricks are commonly used to speed up training: negative sampling and hierarchical softmax.

The model's output is a fully connected layer followed by a softmax, and the dimension of that layer equals the size of the vocabulary.

In practice the vocabulary can be very large, so normalizing with a softmax over the whole vocabulary is too slow. Moreover, the main purpose of training Word2Vec is not the word-prediction task itself, but to obtain an embedding matrix that encodes each token as a dense vector. With negative sampling, we therefore construct a (typically larger) number of negative samples alongside the positive samples and train the model to tell them apart, which avoids the full softmax.

E.g. take the sentence "I like china very much" with a window size of 1.

The positive pairs built around the word "china" are then (china, like) and (china, very).

Negative pairs such as (china, I) and (china, much) pair the center word with words outside its window (in practice negatives are usually sampled from the whole vocabulary). Training on negative pairs as well helps the model converge faster.
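
As a concrete illustration, here is a minimal sketch of skip-gram with negative sampling, reusing the vocabulary variables (vocab_size, word2idx, embedding_size) built above. The class NegSamplingSkipGram, the helper sample_negatives, and the constant neg_k are all made up for this example; it replaces the full softmax with a handful of sigmoid (binary) classifications per pair, not the original post's implementation.

# Hedged sketch of skip-gram with negative sampling (illustrative only).
import random
import torch
import torch.nn as nn
import torch.nn.functional as F

neg_k = 3  # number of negative words drawn per positive pair (assumed value)


class NegSamplingSkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super().__init__()
        self.in_embed = nn.Embedding(vocab_size, embedding_size)   # center-word vectors
        self.out_embed = nn.Embedding(vocab_size, embedding_size)  # context-word vectors

    def forward(self, center, pos, neg):
        """
        center: [batch]         indices of center words
        pos:    [batch]         indices of true context words
        neg:    [batch, neg_k]  indices of sampled negative words
        """
        v = self.in_embed(center)                                  # [batch, emb]
        u_pos = self.out_embed(pos)                                # [batch, emb]
        u_neg = self.out_embed(neg)                                # [batch, neg_k, emb]
        pos_score = torch.sum(v * u_pos, dim=-1)                   # [batch]
        neg_score = torch.bmm(u_neg, v.unsqueeze(-1)).squeeze(-1)  # [batch, neg_k]
        # push positive pairs towards 1 and negative pairs towards 0 with sigmoids,
        # instead of normalizing over the whole vocabulary
        return -(F.logsigmoid(pos_score).mean() + F.logsigmoid(-neg_score).mean())


def sample_negatives(pos):
    """For each positive context index, draw neg_k random words (uniform here for simplicity)."""
    neg = []
    for p in pos.tolist():
        candidates = [i for i in range(vocab_size) if i != p]
        neg.append(random.sample(candidates, neg_k))
    return torch.LongTensor(neg)


# Toy usage with words from the corpus above
center = torch.LongTensor([word2idx["jack"], word2idx["dog"]])
pos = torch.LongTensor([word2idx["like"], word2idx["cat"]])
ns_model = NegSamplingSkipGram(vocab_size, embedding_size)
print(ns_model(center, pos, sample_negatives(pos)))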
