# 1. Build the dictionary and the corpus
import torch
import os


class Dictionary(object):  # word <-> id dictionary
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __len__(self):
        # Invoked by len(dictionary)
        return len(self.word2idx)
class Corpus(object):  # corpus
    def __init__(self):
        self.dictionary = Dictionary()

    def get_data(self, path, batch_size=20):
        # First pass: add every word in the file to the dictionary
        with open(path, 'r') as f:
            tokens = 0  # total count of words plus <eos> markers
            for line in f:
                words = line.split() + ['<eos>']  # '<eos>' marks the end of a line, like a period
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)  # the dictionary now includes <eos>
        # Second pass: encode the text file as a tensor of word ids
        ids = torch.LongTensor(tokens)
        token = 0
        with open(path, 'r') as f:
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    ids[token] = self.dictionary.word2idx[word]  # ids[position in file] = id in the dictionary
                    token += 1
        num_batches = ids.size(0) // batch_size  # total token count (incl. <eos>) // batch_size
        ids = ids[:num_batches * batch_size]  # trim the corpus to a multiple of batch_size
        return ids.view(batch_size, -1)  # the whole corpus, one row per batch lane
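# A minimal sanity check for the two classes above (a sketch; the toy file
# name and its contents are hypothetical, not part of the original tutorial):
#
#   with open('toy.txt', 'w') as f:
#       f.write('the cat sat\nthe dog ran\n')
#   toy = Corpus()
#   toy_ids = toy.get_data('toy.txt', batch_size=2)
#   print(toy_ids.shape)        # torch.Size([2, 4]) -- 8 tokens split into 2 rows
#   print(len(toy.dictionary))  # 6 unique tokens: the, cat, sat, <eos>, dog, ran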
# 2. Train the network
import torch.nn as nn
import numpy as np
from torch.nn.utils import clip_grad_norm_  # gradient clipping

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

embed_size = 128       # every word is embedded as a 128-dimensional vector
seq_length = 30        # the model reads 30 words at a time
batch_size = 20
num_layers = 1
hidden_size = 1024
num_epochs = 1
learning_rate = 0.002

corpus = Corpus()
ids = corpus.get_data('data/train.txt', batch_size)  # the whole corpus, arranged into batch_size rows
vocab_size = len(corpus.dictionary)  # size of the dictionary (including <eos>), via Dictionary.__len__
num_batches = ids.size(1) // seq_length  # number of mini-batches, i.e. training steps per epoch


class RNNLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(RNNLM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)  # map every dictionary word to a 128-dim embedding
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)  # project each hidden state onto the vocabulary

    def forward(self, x, h):  # h is the initial (hidden, cell) state
        x = self.embed(x)
        out, (h, c) = self.lstm(x, h)
        out = out.reshape(out.size(0) * out.size(1), out.size(2))  # (batch_size*seq_length, hidden_size)
        out = self.linear(out)
        return out, (h, c)
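# Shape walkthrough for the forward pass above (using the hyper-parameters
# defined earlier; a sketch for orientation, not extra functionality):
#   x: (20, 30) word ids -> embed  -> (20, 30, 128)
#   lstm output:                      (20, 30, 1024)
#   reshape:                          (600, 1024)
#   linear:                           (600, vocab_size)
# i.e. one next-word score vector per position in the mini-batch, which is
# exactly the shape nn.CrossEntropyLoss expects against targets.reshape(-1).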
model = RNNLM(vocab_size, embed_size, hidden_size, num_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


def detach(states):
    # Truncated backpropagation through time: cut the graph so gradients
    # do not flow back across mini-batch boundaries
    return [state.detach() for state in states]


for epoch in range(num_epochs):
    states = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
              torch.zeros(num_layers, batch_size, hidden_size).to(device))
    for i in range(0, ids.size(1) - seq_length, seq_length):  # start index of each training window
        inputs = ids[:, i:i + seq_length].to(device)               # words i .. i+29
        targets = ids[:, (i + 1):(i + 1) + seq_length].to(device)  # words i+1 .. i+30, shifted by one
        states = detach(states)
        outputs, states = model(inputs, states)
        loss = criterion(outputs, targets.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 0.5)  # gradient clipping
        optimizer.step()

        step = (i + 1) // seq_length
        if step % 100 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
                  .format(epoch + 1, num_epochs, step, num_batches,
                          loss.item(), np.exp(loss.item())))
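# Optional: compute perplexity over a token tensor with the trained model
# (a sketch, not part of the original tutorial; it reuses the training tensor
# `ids`, since a separate validation file would need its own encoding pass):
#
#   def evaluate(eval_ids):
#       model.eval()
#       total_loss, count = 0.0, 0
#       states = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
#                 torch.zeros(num_layers, batch_size, hidden_size).to(device))
#       with torch.no_grad():
#           for i in range(0, eval_ids.size(1) - seq_length, seq_length):
#               inputs = eval_ids[:, i:i + seq_length].to(device)
#               targets = eval_ids[:, (i + 1):(i + 1) + seq_length].to(device)
#               outputs, states = model(inputs, states)
#               total_loss += criterion(outputs, targets.reshape(-1)).item()
#               count += 1
#       return np.exp(total_loss / count)  # perplexity = exp(mean cross-entropy)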
# Sample text from the trained model
num_samples = 100
with torch.no_grad():
    with open('sample.txt', 'w') as f:
        state = (torch.zeros(num_layers, 1, hidden_size).to(device),
                 torch.zeros(num_layers, 1, hidden_size).to(device))
        # Pick the first word id uniformly at random
        prob = torch.ones(vocab_size)
        inputs = torch.multinomial(prob, num_samples=1).unsqueeze(1).to(device)
        for i in range(num_samples):
            output, state = model(inputs, state)
            # exp of the logits gives unnormalized probabilities;
            # torch.multinomial normalizes them before sampling
            prob = output.exp()
            word_id = torch.multinomial(prob, num_samples=1).item()
            inputs.fill_(word_id)  # feed the sampled word back as the next input
            word = corpus.dictionary.idx2word[word_id]
            word = '\n' if word == '<eos>' else word + ' '
            f.write(word)
            if (i + 1) % 100 == 0:
                print('Sampled [{}/{}] words and saved to {}'.format(i + 1, num_samples, 'sample.txt'))

torch.save(model.state_dict(), 'model.ckpt')
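# Reloading the saved weights later (a sketch): rebuild the model with the
# same hyper-parameters, then restore the state dict.
#
#   model = RNNLM(vocab_size, embed_size, hidden_size, num_layers).to(device)
#   model.load_state_dict(torch.load('model.ckpt'))
#   model.eval()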