PyTorch code practice: word2vec (skip-gram)

A PyTorch implementation of the word2vec skip-gram model with negative sampling, trained on the text8 corpus.

Import the required packages

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as tud
from torch.utils.data import Dataset
from collections import Counter
import numpy as np
import random
import tqdm
import math
import pandas as pd
import scipy
import sklearn
from sklearn.metrics.pairwise import cosine_similarity


np.random.seed(1)
random.seed(1)
torch.manual_seed(1)

C = 3                     # context window size: C words on each side of the center word
K = 100                   # number of negative samples drawn per positive (context) word
num_epoch = 2
max_vocab_size = 30000
batch_size = 128
learning_rate = 0.01
embedding_size = 100

def word_tokenize(text):
    # simple whitespace tokenizer (the preprocessing below calls text.split() directly)
    return text.split()

with open('./text8/text8.train.txt','r') as fin:
    text = fin.read()

text = text.split()
vocab = dict(Counter(text).most_common(max_vocab_size-1))
vocab['<unk>'] = len(text)-np.sum(list(vocab.values()))
idx_to_word = [word for word in vocab.keys()]
word_to_idx = {word:i for i, word in enumerate(idx_to_word)}
word_counts = np.array([count for count in vocab.values()], dtype=np.float32)
word_freqs = word_counts/np.sum(word_counts)
word_freqs = word_freqs**(3./4.)
word_freqs = word_freqs/np.sum(word_freqs)      # the 3/4 power breaks normalization, so renormalize to get a probability distribution
vocab_size = len(idx_to_word)
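
The 3/4 power is the standard word2vec smoothing of the unigram distribution used for negative sampling: it flattens the distribution slightly, so rare words get sampled a bit more often and very frequent words a bit less. A toy illustration of the effect (not part of the training code; the numbers are made up):

# Toy illustration of the 3/4-power smoothing (illustrative only).
toy_freqs = np.array([0.90, 0.09, 0.01])        # a frequent, a medium, and a rare word
smoothed = toy_freqs ** (3. / 4.)
smoothed = smoothed / smoothed.sum()            # ≈ [0.83, 0.15, 0.03]: the rare word's sampling probability roughly triples
print(smoothed)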

Implement the DataLoader

# Dataset yielding (center word, positive context words, negative samples)
class wordEmbeddingDataset(Dataset):
    def __init__(self,word_to_idx, word_freqs, text):
        super(wordEmbeddingDataset, self).__init__()
        self.text_encoded = [word_to_idx.get(word, word_to_idx['<unk>']) for word in text]       # dict.get(key, default) returns the value for key if present, otherwise the default; unseen words map to the <unk> index
        self.text_encoded = torch.LongTensor(self.text_encoded)
        self.word_to_idx = word_to_idx
        # self.idx_to_word = idx_to_word
        self.word_freqs = torch.Tensor(word_freqs)
        # self.word_counts = torch.Tensor(word_counts)

    def __len__(self):
        return len(self.text_encoded)

    def __getitem__(self, idx):
        center_word = self.text_encoded[idx]
        pos_indices = list(range(idx-C,idx)) + list(range(idx+1,idx+C+1))
        pos_indices = [i % len(self.text_encoded) for i in pos_indices]
        pos_words = self.text_encoded[pos_indices]
        neg_words = torch.multinomial(self.word_freqs, K * pos_words.shape[0], True)  # sample K negatives per positive word, with replacement, from the smoothed unigram distribution
        return center_word, pos_words, neg_words
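
For reference, each item the Dataset returns is a (center word, positive context words, negative samples) triple. A quick shape check, assuming the word_to_idx, word_freqs and text built above (this snippet is only illustrative):

# Sketch: inspect one training example (illustrative only).
_ds = wordEmbeddingDataset(word_to_idx, word_freqs, text)
_center, _pos, _neg = _ds[100]
print(_center.shape, _pos.shape, _neg.shape)
# torch.Size([]) torch.Size([6]) torch.Size([600])  -> scalar center word, 2*C positives, 2*C*K negatives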


Define the model

# Skip-gram model with negative sampling: separate input (center) and output (context) embedding tables
class EmbeddingModel(nn.Module):

    def __init__(self,vocab_size,embed_size):
        super(EmbeddingModel, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.in_embed = nn.Embedding(self.vocab_size,self.embed_size)
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size)

    def forward(self, input_labels, pos_labels, neg_labels):
        # input_labels: [batch_size], pos_labels: [batch_size, 2*C], neg_labels: [batch_size, 2*C*K]
        input_embedding = self.in_embed(input_labels)         # [batch_size, embed_size]
        pos_embedding = self.out_embed(pos_labels)            # [batch_size, 2*C, embed_size]
        neg_embedding = self.out_embed(neg_labels)            # [batch_size, 2*C*K, embed_size]

        input_embedding = input_embedding.unsqueeze(2)                          # [batch_size, embed_size, 1]
        pos_dot = torch.bmm(pos_embedding, input_embedding).squeeze(2)          # [batch_size, 2*C]
        neg_dot = torch.bmm(neg_embedding, -input_embedding).squeeze(2)         # [batch_size, 2*C*K]

        log_pos = torch.log(torch.sigmoid(pos_dot)).sum(1)
        log_neg = torch.log(torch.sigmoid(neg_dot)).sum(1)
        loss = log_pos + log_neg
        return -loss            # negative-sampling loss, one value per example in the batch
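
The forward pass above computes the usual skip-gram negative-sampling objective: for a center word $w$ with input vector $v_w$, context words with output vectors $u_c$ and sampled negatives with output vectors $u_n$, the per-example loss returned as -loss is

$$ L = -\sum_{c \in \mathrm{context}(w)} \log \sigma(u_c^\top v_w) \;-\; \sum_{n \in \mathrm{neg}(w)} \log \sigma(-u_n^\top v_w) $$

As a side note, torch.log(torch.sigmoid(x)) can underflow for very negative x; F.logsigmoid(x) from torch.nn.functional is a numerically safer equivalent if that ever becomes an issue.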


Train the model

mydataset = wordEmbeddingDataset(word_to_idx,word_freqs,text)
myloader = tud.DataLoader(mydataset,batch_size,shuffle=True,num_workers=0)

model = EmbeddingModel(vocab_size,embedding_size)

optimizer = torch.optim.SGD(model.parameters(),lr=learning_rate)

for e in range(num_epoch):
    for i, (input_labels, pos_labels, neg_labels) in enumerate(myloader):
        input_labels = input_labels.long()
        pos_labels = pos_labels.long()
        neg_labels = neg_labels.long()
        optimizer.zero_grad()
        loss = model(input_labels, pos_labels, neg_labels).mean()   # per-example losses averaged over the batch
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print('epoch:', e, 'iteration:', i, 'loss:', loss.item())
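
The cosine_similarity import at the top is never used during training; a natural next step is to pull the learned input embeddings out of the model and query nearest neighbours with it. A minimal sketch, assuming training has finished and the model is on the CPU (the helper name find_nearest and the query word 'two' are just illustrative choices):

# Sketch: nearest neighbours in the learned embedding space (illustrative only).
embedding_weights = model.in_embed.weight.detach().numpy()     # [vocab_size, embedding_size]

def find_nearest(word, top_k=10):
    vec = embedding_weights[word_to_idx[word]].reshape(1, -1)
    sims = cosine_similarity(vec, embedding_weights)[0]         # similarity to every vocabulary word
    return [idx_to_word[i] for i in sims.argsort()[::-1][:top_k]]

print(find_nearest('two'))   # the query word itself should come back first, followed by similar words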
