Import the required packages
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as tud
from torch.utils.data import Dataset
from collections import Counter
import numpy as np
import random
import tqdm
import math
import pandas as pd
import scipy
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
np.random.seed(1)
random.seed(1)
torch.manual_seed(1)
C = 3                   # context window: C words on each side of the center word
K = 100                 # negative samples drawn per positive word
num_epoch = 2
max_vocab_size = 30000
batch_size = 128
learning_rate = 0.01
embedding_size = 100
def word_tokenize(text):
    return text.split()

with open('./text8/text8.train.txt', 'r') as fin:
    text = fin.read()
text = word_tokenize(text)
vocab = dict(Counter(text).most_common(max_vocab_size - 1))
vocab['<unk>'] = len(text) - np.sum(list(vocab.values()))  # every word outside the top 29999 is counted as <unk>
idx_to_word = [word for word in vocab.keys()]
word_to_idx = {word:i for i, word in enumerate(idx_to_word)}
word_counts = np.array([count for count in vocab.values()], dtype=np.float32)
word_freqs = word_counts/np.sum(word_counts)
word_freqs = word_freqs**(3./4.)
word_freqs = word_freqs / np.sum(word_freqs)  # renormalize: after the 3/4 power the frequencies no longer sum to 1
vocab_size = len(idx_to_word)
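As a quick aside (not in the original script), the 3/4 power flattens the unigram distribution used for negative sampling: frequent words are down-weighted and rare words get sampled relatively more often. A minimal sketch with made-up counts:

# Toy counts, purely illustrative.
toy_counts = np.array([1000., 100., 10.], dtype=np.float32)
raw = toy_counts / toy_counts.sum()        # approx [0.901, 0.090, 0.009]
smoothed = toy_counts ** (3. / 4.)
smoothed = smoothed / smoothed.sum()       # approx [0.827, 0.147, 0.026]
# The rarest word's sampling probability roughly triples.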
Implement the dataloader
class WordEmbeddingDataset(Dataset):
    def __init__(self, word_to_idx, word_freqs, text):
        super(WordEmbeddingDataset, self).__init__()
        # dict.get(key, default) returns the value for key, or the default if the
        # key is missing; here the default is the id of <unk>
        self.text_encoded = [word_to_idx.get(word, word_to_idx['<unk>']) for word in text]
        self.text_encoded = torch.LongTensor(self.text_encoded)
        self.word_to_idx = word_to_idx
        self.word_freqs = torch.Tensor(word_freqs)

    def __len__(self):
        return len(self.text_encoded)

    def __getitem__(self, idx):
        center_word = self.text_encoded[idx]
        # the C words on each side of the center word are the positive samples
        pos_indices = list(range(idx - C, idx)) + list(range(idx + 1, idx + C + 1))
        pos_indices = [i % len(self.text_encoded) for i in pos_indices]  # wrap around at both ends of the corpus
        pos_words = self.text_encoded[pos_indices]
        # torch.multinomial draws K negatives per positive word from the smoothed unigram distribution
        neg_words = torch.multinomial(self.word_freqs, K * pos_words.shape[0], True)
        return center_word, pos_words, neg_words
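A quick sanity check, assuming the dataset is built as above: each sample should be a scalar center word, 2*C positive ids, and K negatives per positive, i.e. 2*C*K ids in total (index 100 below is arbitrary).

check_dataset = WordEmbeddingDataset(word_to_idx, word_freqs, text)
center, pos, neg = check_dataset[100]
print(center.shape, pos.shape, neg.shape)  # torch.Size([]) torch.Size([6]) torch.Size([600])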
Define the model
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(EmbeddingModel, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size)   # center-word embeddings
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size)  # context-word embeddings

    def forward(self, input_labels, pos_labels, neg_labels):
        input_embedding = self.in_embed(input_labels)   # [batch_size, embed_size]
        # context and negative words use the output embedding, not the input one
        pos_embedding = self.out_embed(pos_labels)      # [batch_size, 2*C, embed_size]
        neg_embedding = self.out_embed(neg_labels)      # [batch_size, 2*C*K, embed_size]
        input_embedding = input_embedding.unsqueeze(2)  # [batch_size, embed_size, 1]
        pos_dot = torch.bmm(pos_embedding, input_embedding).squeeze(2)   # [batch_size, 2*C]
        neg_dot = torch.bmm(neg_embedding, -input_embedding).squeeze(2)  # [batch_size, 2*C*K]
        # F.logsigmoid is numerically stabler than torch.log(torch.sigmoid(...))
        log_pos = F.logsigmoid(pos_dot).sum(1)
        log_neg = F.logsigmoid(neg_dot).sum(1)
        loss = log_pos + log_neg
        return -loss  # one loss per example; the caller takes the mean
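The returned value is the negated skip-gram negative-sampling objective: log sigmoid(u_pos · v_c) summed over the 2*C positive words, plus log sigmoid(-u_neg · v_c) summed over the negatives. A minimal shape check with random ids (a sketch, not part of the original training code):

shape_model = EmbeddingModel(vocab_size, embedding_size)
fake_center = torch.randint(0, vocab_size, (4,))           # batch of 4 center words
fake_pos = torch.randint(0, vocab_size, (4, 2 * C))        # 2*C positives each
fake_neg = torch.randint(0, vocab_size, (4, 2 * C * K))    # 2*C*K negatives each
print(shape_model(fake_center, fake_pos, fake_neg).shape)  # torch.Size([4])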
Train the model
mydataset = WordEmbeddingDataset(word_to_idx, word_freqs, text)
myloader = tud.DataLoader(mydataset, batch_size, shuffle=True, num_workers=0)
model = EmbeddingModel(vocab_size, embedding_size)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
for e in range(num_epoch):
    for i, (input_labels, pos_labels, neg_labels) in enumerate(myloader):
        input_labels = input_labels.long()
        pos_labels = pos_labels.long()
        neg_labels = neg_labels.long()
        optimizer.zero_grad()
        loss = model(input_labels, pos_labels, neg_labels).mean()
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print('epoch:', e, 'iteration:', i, 'loss:', loss.item())
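After training, model.in_embed holds the word vectors that are normally kept. A minimal sketch (the name find_nearest is illustrative, not from the original) that uses the cosine_similarity import to look up a word's nearest neighbors:

embedding_weights = model.in_embed.weight.detach().numpy()  # [vocab_size, embedding_size]

def find_nearest(word, topn=10):
    # illustrative helper: returns the topn vocabulary words most similar to word
    vec = embedding_weights[word_to_idx[word]].reshape(1, -1)
    sims = cosine_similarity(vec, embedding_weights)[0]
    return [idx_to_word[i] for i in sims.argsort()[::-1][:topn]]

print(find_nearest('one'))  # the query word itself will rank first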