PyTorch docs: https://pytorch.org/docs/stable/nn.html?highlight=lstm#torch.nn.LSTM
torchtext docs: https://torchtext.readthedocs.io/en/latest/#
Reference papers:
Neural Nets and Neural Language Models. Dan Jurafsky and James H. Martin. Speech and Language Processing (3rd edition draft).
The Unreasonable Effectiveness of Recurrent Neural Networks. Andrej Karpathy. Blog post, 2015.
A Neural Probabilistic Language Model (longer JMLR version). Yoshua Bengio, Réjean Ducharme, Pascal Vincent, and Christian Jauvin. Journal of Machine Learning Research, 2003.
Other references: xxx
Model code
import torch
import torch.nn as nn

class RNNModel(nn.Module):
    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)  # vocab_size -> ninp (embedding size)
        if rnn_type in ["LSTM", "GRU"]:
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError("""An invalid option for '--model' was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)  # final layer projects back to the 50002-word vocab (<unk> and <pad> were added)
        self.init_weights()
        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))    # (seq_len, batch) -> (seq_len, batch, ninp)
        output, hidden = self.rnn(emb, hidden)  # output: (seq_len, batch, nhid)
        output = self.drop(output)
        # Flatten to (seq_len * batch, nhid) so the linear decoder is applied
        # to every time step at once, then restore the sequence layout.
        decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz, requires_grad=True):
        # Grab an arbitrary parameter so new_zeros creates tensors with the
        # same dtype and device as the model weights.
        weight = next(self.parameters())
        if self.rnn_type == 'LSTM':  # an LSTM carries two states (h, c) forward
            return (weight.new_zeros((self.nlayers, bsz, self.nhid), requires_grad=requires_grad),
                    weight.new_zeros((self.nlayers, bsz, self.nhid), requires_grad=requires_grad))
        else:
            return weight.new_zeros((self.nlayers, bsz, self.nhid), requires_grad=requires_grad)
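evaluate() below calls repackage_hidden, which does not appear in this section. A minimal sketch, following the helper used in the PyTorch word_language_model example:

def repackage_hidden(h):
    # Detach the hidden state from its history so that backprop through time
    # stops at the batch boundary instead of spanning the whole corpus.
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)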
import numpy as np

def evaluate(model, data):
    model.eval()
    total_loss = 0.
    total_count = 0.
    it = iter(data)
    with torch.no_grad():
        hidden = model.init_hidden(BATCH_SIZE, requires_grad=False)
        for i, batch in enumerate(it):
            data, target = batch.text, batch.target
            hidden = repackage_hidden(hidden)  # detach state between batches
            output, hidden = model(data, hidden)
            loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
            # Weight each batch's loss by its token count (seq_len * batch_size)
            # so the final average is per token.
            total_count += np.multiply(*data.size())
            total_loss += loss.item() * np.multiply(*data.size())
    loss = total_loss / total_count
    model.train()
    return loss
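The code below assumes a model instance already exists. A minimal construction sketch; only VOCAB_SIZE = 50002 is stated above (vocab plus <unk> and <pad>), the other hyperparameter values are assumptions:

BATCH_SIZE = 32     # assumption: the batch size is not stated in this section
VOCAB_SIZE = 50002  # vocab size including <unk> and <pad>, per the decoder comment
EMBED_SIZE = 650    # assumption
HIDDEN_SIZE = 650   # assumption
model = RNNModel("LSTM", VOCAB_SIZE, EMBED_SIZE, HIDDEN_SIZE, nlayers=2, dropout=0.5)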
loss_fn = nn.CrossEntropyLoss()
learning_rate = 0.0001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Multiply the learning rate by 0.5 each time scheduler.step() is called.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5)
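The training loop that produced the log below is not included in this section. A minimal sketch consistent with the setup above; NUM_EPOCHS, GRAD_CLIP, and the torchtext iterator train_iter are assumptions:

NUM_EPOCHS = 2   # assumption
GRAD_CLIP = 5.0  # assumption: gradient clipping threshold
for epoch in range(NUM_EPOCHS):
    model.train()
    hidden = model.init_hidden(BATCH_SIZE)
    for i, batch in enumerate(train_iter):  # train_iter: assumed torchtext iterator
        data, target = batch.text, batch.target
        hidden = repackage_hidden(hidden)   # truncate backprop at the batch boundary
        optimizer.zero_grad()
        output, hidden = model(data, hidden)
        loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()
        if i % 10 == 0:
            print("epoch", epoch, "iter", i, "loss", loss.item())

The loss printed every 10 iterations looks like this: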
epoch 0 iter 0 loss 8.5132417678833
epoch 0 iter 10 loss 8.426973342895508
epoch 0 iter 20 loss 8.063802719116211
epoch 0 iter 30 loss 8.000456809997559
epoch 0 iter 40 loss 6.623159885406494
epoch 0 iter 50 loss 6.447385311126709
epoch 0 iter 60 loss 6.258299827575684
epoch 0 iter 70 loss 6.241183280944824
epoch 0 iter 80 loss 6.147849082946777
epoch 0 iter 90 loss 6.017308712005615
epoch 0 iter 100 loss 6.1595139503479
epoch 0 iter 110 loss 5.974198341369629
epoch 0 iter 120 loss 5.985989570617676
epoch 0 iter 130 loss 5.666098594665527