import torch
import math
import time
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
#1. Build the dataset
datas = np.load('tang.npz', allow_pickle=True)  # allow_pickle is required because word2ix/ix2word are stored as pickled dict objects
data, word2ix, ix2word = datas['data'], datas['word2ix'].item(), datas['ix2word'].item()
'''
tang.npz contains three objects:
data: a numpy array of shape (57580, 125) - 57580 poems, each 125 characters long
      (poems shorter than 125 are padded with spaces in front; anything beyond 125 is cut off;
       each poem begins with <start> and ends with <end>, stored as their corresponding indices,
       so an over-length poem may keep only its start marker)
word2ix: maps a character to its index
ix2word: maps an index back to its character
'''
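# A minimal sketch (illustrative only, not used below): decode the first poem back into text with
# ix2word to verify the layout described above. Whether padding shows up as spaces or as a dedicated
# pad token depends on the vocabulary, so the output is printed as-is.
def show_first_poem():
    chars = [ix2word[ix] for ix in data[0]]  # map each index of poem 0 back to its character
    print(''.join(chars))                    # should contain <start> ... <end>, with any padding before the poem
# show_first_poem()  # uncomment to inspect one sample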
#print(len(word2ix))  # 8293
#print(len(ix2word))  # 8293
#So the vocabulary has 8293 characters, i.e. input_size = 8293
#This project does not really need a test set
#dataset: a Dataset only has to satisfy two conditions: 1. report its size - print(len(data)) gives 57580  2. return a sample by index - print(data[0]) (see the wrapper sketch below)
data = torch.from_numpy(data)  # convert the numpy array to a tensor
dataloader = DataLoader(dataset=data, batch_size=32, shuffle=True, num_workers=0)
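# A minimal sketch of the custom Dataset hinted at above. Wrapping the tensor directly
# (dataset=data) already works because a tensor supports len() and indexing, so this class is
# illustrative and left unused.
class PoemDataset(Dataset):
    def __init__(self, poems):
        self.poems = poems
    def __len__(self):               # condition 1: return the number of samples (57580)
        return len(self.poems)
    def __getitem__(self, idx):      # condition 2: return one sample by its index
        return self.poems[idx]
# dataloader = DataLoader(PoemDataset(data), batch_size=32, shuffle=True, num_workers=0)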
#2. Build the network
class PoetryModel(nn.Module):  # vocab_size = input_size = num_class
def __init__(self, vocab_size, embedding_size, hidden_size):
super(PoetryModel, self).__init__()
self.hidden_size = hidden_size
self.embeddings = nn.Embedding(vocab_size, embedding_size) #(seq_len,batch_size)->(seq_len,batch_size,embedding_size)
        self.lstm = nn.LSTM(embedding_size, self.hidden_size, num_layers=2)  # (input_size, hidden_size, num_layers); here input_size = embedding_size
        self.linear = nn.Linear(self.hidden_size, vocab_size)
        # LSTM shapes: input (seq_len, batch_size, input_size), h0 (num_layers, batch_size, hidden_size), c0 likewise;
        #              output (seq_len, batch_size, hidden_size), hn (num_layers, batch_size, hidden_size), cn likewise
    def forward(self, input, hidden=None):  # data has size (57580, 125), so each batch has to be transposed first
        # input = input.t()  # the transpose is done later, in the training loop
seq_len, batch_size = input.size() #(seq_len,batch_size)
if hidden is None:
# h_0 = 0.01*torch.Tensor(2, batch_size, self.hidden_size).normal_().cuda()
# c_0 = 0.01*torch.Tensor(2, batch_size, self.hidden_size).normal_().cuda()
h_0 = input.data.new(2, batch_size, self.hidden_size).fill_(0).float()
c_0 = input.data.new(2, batch_size, self.hidden_size).fill_(0).float()
else:
h_0, c_0 = hidden
        # size: (seq_len, batch_size, embedding_size) - the output shape of the embedding layer
embeds = self.embeddings(input)
# output size: (seq_len,batch_size,hidden_size)
output, hidden = self.lstm(embeds, (h_0, c_0))
# size: (seq_len*batch_size,vocab_size) #(seq_len*batch_size,hidden_size)->(seq_len*batch_size,vocab_size)
output = self.linear(output.view(seq_len * batch_size, -1))
return output, hidden
#PoetryModel(vocab_size, embedding_size, hidden_size); forward returns output (seq_len*batch_size, vocab_size) and hidden = (hn, cn)
model = PoetryModel(len(word2ix), 128, 256)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # "cuda:1" would be the second GPU
model.to(device)  # move the model to the GPU when one is available
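# Optional shape sanity check (a sketch with arbitrary dummy sizes seq_len=5, batch_size=4):
# it confirms the shapes noted above - output (seq_len*batch_size, vocab_size), hn/cn (num_layers, batch_size, hidden_size).
with torch.no_grad():
    dummy = torch.randint(0, len(word2ix), (5, 4), device=device)  # fake batch of character indices, shape (seq_len, batch_size)
    out_, (hn_, cn_) = model(dummy)
    print(out_.shape, hn_.shape, cn_.shape)  # expected: (20, 8293), (2, 4, 256), (2, 4, 256)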
#3. Define the loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
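# CrossEntropyLoss expects logits of shape (N, num_classes) and integer targets of shape (N,);
# this is why forward() flattens its output to (seq_len*batch_size, vocab_size) and the training
# loop below flattens the labels with target.view(-1).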
def time_since(since):
s = time.time() - since
m = math.floor(s / 60)
s -= m * 60
return '%dm %ds' % (m, s)
def train(epoch):
    running_loss = 0.0
    for ii, data_ in enumerate(dataloader):
        # transpose(1, 0) swaps the two axes, just like a matrix transpose (axis 0 = rows, axis 1 = columns):
        # (batch_size, seq_len) -> (seq_len, batch_size); contiguous() then lays the result out contiguously in memory
        data_ = data_.long().transpose(1, 0).contiguous()
        data_ = data_.to(device)
optimizer.zero_grad()
        input_, target = data_[:-1, :], data_[1:, :]  # shift the target by one step: the label for x1 is x2, i.e. each character is trained to predict the next character of the poem
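        # Concrete example of the shift (illustrative): if one column of data_ is [<start>, 床, 前, 明, 月, <end>],
        # then input_ is [<start>, 床, 前, 明, 月] and target is [床, 前, 明, 月, <end>],
        # so position i of the input learns to predict position i+1 of the poem.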
output, _ = model(input_)
        loss = criterion(output, target.view(-1))  # all (seq_len*batch_size) positions are flattened into one dimension, so the per-step sequence losses need no manual accumulation
loss.backward()
optimizer.step()
running_loss += loss.item()
if ii % 500 == 499:
print(time_since(since=start),end=' ')
            print('[%d,%5d] loss: %.3f' % (epoch + 1, ii + 1, running_loss / 500))  # average loss over the last 500 batches
loss_data=running_loss/500
running_loss=0.0
return loss_data
#4. Train the model
if __name__ == '__main__':
    start = time.time()  # start time in seconds
loss_list = []
for epoch in range(20):
loss_data=train(epoch)
loss_list.append(loss_data)
torch.save(model,'model.pt')
    # plot the loss curve
    epochs = np.arange(1, len(loss_list) + 1, 1)  # x-axis: epoch index (avoids shadowing the time module)
    plt.plot(epochs, loss_list)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.grid()  # grid lines
plt.show()
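# A minimal generation sketch (an addition, not part of the training script above): reload the saved
# model and greedily sample characters starting from <start> until <end> appears or max_len is reached.
# The literal token names '<start>'/'<end>' follow the comment block at the top of this file and may
# need adjusting to match the actual keys in word2ix.
def generate(max_len=125):
    net = torch.load('model.pt', map_location=device)         # the full model object was saved with torch.save(model, ...)
    net.eval()
    ix = torch.tensor([[word2ix['<start>']]], device=device)  # shape (seq_len=1, batch_size=1)
    hidden, result = None, []
    with torch.no_grad():
        for _ in range(max_len):
            output, hidden = net(ix, hidden)                   # output: (1, vocab_size)
            ix = output.argmax(dim=1).view(1, 1)               # greedy choice of the next character
            word = ix2word[ix.item()]
            if word == '<end>':
                break
            result.append(word)
    return ''.join(result)
# print(generate())  # uncomment after training to sample a poem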