Dataset link: https://pan.baidu.com/s/1tFeK3mXuVXEy3EMarfeWvg  password: v2z5
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as tud
from collections import Counter
import numpy as np
import random,math
import pandas as pd
import scipy
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
# Fix the random seeds for reproducibility
random.seed(1)
np.random.seed(1)
torch.manual_seed(1)
# Hyperparameters
C = 3                 # context window: 3 words on each side of the center word
K = 100               # 100 negative samples for every positive (context) word
NUM_EPOCHS = 2
MAX_VOCAB_SIZE = 30000
BATCH_SIZE = 128
LEARNING_RATE = 0.2
EMBEDDING_SIZE = 100
def word_tokenize(text):
    return text.split()  # kept deliberately simple
with open(r'D:\各种编译器的代码\pythonProject12\机器学习\NLP自然语言处理\datas\text8\text8.train.txt', 'r') as fin:
    text = fin.read()
text = text.split()  # split the corpus into individual words
vocab = dict(Counter(text).most_common(MAX_VOCAB_SIZE - 1))   # count words, keep the most frequent
vocab['<unk>'] = len(text) - np.sum(list(vocab.values()))      # everything else maps to <unk>
idx_to_word = [word for word in vocab.keys()]                  # index -> word
word_to_idx = {word: i for i, word in enumerate(idx_to_word)}  # word -> index
word_counts = np.array([count for count in vocab.values()], dtype=np.float32)
word_freqs = word_counts / np.sum(word_counts)
word_freqs = word_freqs ** (3. / 4.)           # raise to the 3/4 power, as in the word2vec paper
word_freqs = word_freqs / np.sum(word_freqs)   # renormalize so the sampling distribution sums to 1
VOCAB_SIZE = len(idx_to_word)
print(VOCAB_SIZE)
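Raising the unigram frequencies to the 3/4 power flattens the negative-sampling distribution: very frequent words are sampled a little less often than their raw frequency suggests, and rare words a little more often. A toy illustration of the effect (the counts below are made up, not taken from the corpus):

# Toy counts for illustration only
toy_counts = np.array([1000., 100., 10.], dtype=np.float32)
raw = toy_counts / toy_counts.sum()        # about [0.90, 0.09, 0.01]
smoothed = toy_counts ** (3. / 4.)
smoothed = smoothed / smoothed.sum()       # about [0.83, 0.15, 0.03]
# The most frequent word loses probability mass; the rare word gains some.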
class WordEmbeddingDataset(tud.Dataset):
    def __init__(self, text, word_to_idx, idx_to_word, word_freqs, word_counts):
        super(WordEmbeddingDataset, self).__init__()
        # Encode every word as its index; unknown words map to <unk>
        self.text_encoded = [word_to_idx.get(word, word_to_idx['<unk>']) for word in text]
        self.text_encoded = torch.LongTensor(self.text_encoded)
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word
        self.word_freqs = torch.Tensor(word_freqs)    # torch.multinomial needs a tensor
        self.word_counts = torch.Tensor(word_counts)

    def __len__(self):
        return len(self.text_encoded)

    def __getitem__(self, idx):
        center_word = self.text_encoded[idx]
        pos_indices = list(range(idx - C, idx)) + list(range(idx + 1, idx + C + 1))  # indices of the surrounding words
        pos_indices = [i % len(self.text_encoded) for i in pos_indices]  # wrap around so no index goes out of range
        pos_words = self.text_encoded[pos_indices]  # the surrounding (positive) words
        neg_words = torch.multinomial(self.word_freqs, K * pos_words.shape[0], True)  # negative samples
        return center_word, pos_words, neg_words
Create the dataset and dataloader
dataset = WordEmbeddingDataset(text, word_to_idx, idx_to_word, word_freqs, word_counts)
dataloader = tud.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)  # 4 worker processes
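As a quick sanity check, you can pull a single example from the dataset (index 100 here is arbitrary): every center word comes with 2*C positive words and 2*C*K negative samples, and the DataLoader then adds the batch dimension on top.

center_word, pos_words, neg_words = dataset[100]
print(pos_words.shape)   # torch.Size([6])    -> 2*C context words
print(neg_words.shape)   # torch.Size([600])  -> 2*C*K negative samples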
Define the PyTorch model
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(EmbeddingModel, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size)   # embeddings for center words
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size)  # embeddings for context words

    def forward(self, input_labels, pos_labels, neg_labels):
        # input_labels: [batch_size]
        # pos_labels:   [batch_size, (window_size * 2)]
        # neg_labels:   [batch_size, (window_size * 2 * K)]
        input_embedding = self.in_embed(input_labels)   # [batch_size, embed_size]
        pos_embedding = self.out_embed(pos_labels)      # [batch_size, (window_size * 2), embed_size]
        neg_embedding = self.out_embed(neg_labels)      # [batch_size, (window_size * 2 * K), embed_size]
        input_embedding = input_embedding.unsqueeze(2)  # [batch_size, embed_size, 1] so bmm works
        pos_dot = torch.bmm(pos_embedding, input_embedding).squeeze(2)   # [batch_size, (window_size * 2)]
        neg_dot = torch.bmm(neg_embedding, -input_embedding).squeeze(2)  # [batch_size, (window_size * 2 * K)]
        log_pos = F.logsigmoid(pos_dot).sum(1)  # sum over dim 1
        log_neg = F.logsigmoid(neg_dot).sum(1)
        loss = log_pos + log_neg
        return -loss  # one loss value per batch element

    def input_embeddings(self):
        return self.in_embed.weight.data
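For each training example the model rewards a large dot product between the center word and its real context words (log sigmoid(center · context)) and a small dot product with the sampled negatives (log sigmoid(-center · negative)); returning the negated sum gives a loss to minimize. A minimal forward-pass sanity check on random indices (illustrative only, the values will differ from run to run):

# Check the forward pass with random indices
tmp_model = EmbeddingModel(VOCAB_SIZE, EMBEDDING_SIZE)
tmp_center = torch.randint(0, VOCAB_SIZE, (4,))          # 4 fake center words
tmp_pos = torch.randint(0, VOCAB_SIZE, (4, 2 * C))       # 2*C context words each
tmp_neg = torch.randint(0, VOCAB_SIZE, (4, 2 * C * K))   # 2*C*K negatives each
print(tmp_model(tmp_center, tmp_pos, tmp_neg).shape)     # torch.Size([4]): one loss per example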
# Instantiate the model and optimizer
model = EmbeddingModel(VOCAB_SIZE, EMBEDDING_SIZE)
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
for e in range(NUM_EPOCHS):
    for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
        input_labels = input_labels.long()
        pos_labels = pos_labels.long()
        neg_labels = neg_labels.long()
        optimizer.zero_grad()
        loss = model(input_labels, pos_labels, neg_labels).mean()
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print('epoch', e, 'iteration', i, loss.item(), sep='\t')
The earlier blocks all ran, but this last one would not. The most likely culprit is the DataLoader: on Windows, num_workers > 0 requires the training loop to be started from inside an if __name__ == '__main__': guard (or you can simply set num_workers=0), otherwise the worker processes fail to spawn and the loop never starts.
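A minimal sketch of that fix, together with a small helper for inspecting the trained vectors using the cosine_similarity import at the top of the script (find_nearest is a hypothetical name, not part of the original code):

def find_nearest(word, embedding_weights):
    # Return the 10 words whose vectors are closest (by cosine similarity) to the given word
    index = word_to_idx[word]
    embedding = embedding_weights[index]
    sims = cosine_similarity(embedding.reshape(1, -1), embedding_weights)[0]
    return [idx_to_word[i] for i in sims.argsort()[::-1][:10]]

if __name__ == '__main__':
    # ... put the training loop above inside this guard when num_workers > 0 on Windows ...
    embedding_weights = model.input_embeddings().numpy()
    print(find_nearest('two', embedding_weights))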