1. Learn about language models and how to train one
2. Learn the basics of using torchtext
3. Build a vocabulary
   3.1 word-to-index and index-to-word
4. Learn some basic torch.nn modules
   4.1 Linear
   4.2 RNN
   4.3 LSTM
   4.4 GRU
5. RNN training tricks
   5.1 Gradient clipping
6. How to save and load a model
import torchtext
from torchtext.vocab import Vectors
import torch
import numpy as np
import random, os

USE_CUDA = torch.cuda.is_available()

# Fix all random seeds so results are reproducible.
random.seed(1000)
np.random.seed(1000)
torch.manual_seed(1000)
if USE_CUDA:
    torch.cuda.manual_seed_all(1000)
device = torch.device('cuda' if USE_CUDA else 'cpu')
BATCH_SIZE = 32
EMBEDDING_SIZE = 650
MAX_VOCAB_SIZE = 50000
DATA_PATH = r'./data/demo10_pytorch_skip-Gram'
TRAIN_DATA = 'text8.train.txt'
TEST_DATA = 'text8.test.txt'
VALI_DATA = 'text8.dev.txt'
SAVE_MODEL = DATA_PATH + os.sep + 'lossm.pth'
# Field defines how raw text is processed; lower=True lowercases every token.
TEXT = torchtext.data.Field(lower=True)
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path=DATA_PATH,
    text_field=TEXT,
    train=TRAIN_DATA,
    validation=VALI_DATA,
    test=TEST_DATA)
# Keep at most the MAX_VOCAB_SIZE most frequent words; torchtext adds <unk> and <pad> on top.
TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)
VOCAB_SIZE = len(TEXT.vocab)
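
# Outline item 3.1, word-to-index and index-to-word: TEXT.vocab.stoi maps a
# word to its index and TEXT.vocab.itos maps an index back to the word. A
# minimal sanity check (the word 'the' is just an illustrative choice):
print(TEXT.vocab.itos[:5])        # most frequent tokens, after <unk> and <pad>
print(TEXT.vocab.stoi['the'])     # index of a word; unknown words map to <unk>
assert TEXT.vocab.itos[TEXT.vocab.stoi['the']] == 'the'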
# BPTTIterator slices the corpus into (bptt_len, batch_size) chunks;
# batch.target is batch.text shifted forward by one token.
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    datasets=(train, val, test),
    batch_size=BATCH_SIZE,
    device=device,
    bptt_len=50,
    repeat=False,
    shuffle=True)
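
# A minimal sketch of what one batch looks like (purely for inspection, not
# part of training): text and target both have shape (bptt_len, batch_size).
batch = next(iter(train_iter))
print(batch.text.shape, batch.target.shape)   # e.g. torch.Size([50, 32]) twice
print(' '.join(TEXT.vocab.itos[int(i)] for i in batch.text[:10, 0]))    # first 10 input words
print(' '.join(TEXT.vocab.itos[int(i)] for i in batch.target[:10, 0]))  # the same words shifted by one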
class RNNModel(torch.nn.Module):
    def __init__(self, rnn_type, vocab_size, embed_size, hidden_size, nlayers, dropout=0.5):
        '''The model consists of the following layers:
        - a word embedding layer
        - one recurrent layer (RNN, LSTM or GRU)
        - a linear layer mapping the hidden state to the output vocabulary
        - a dropout layer for regularization
        '''
        super(RNNModel, self).__init__()
        self.drop = torch.nn.Dropout(dropout)
        self.encoder = torch.nn.Embedding(vocab_size, embed_size)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(torch.nn, rnn_type)(embed_size, hidden_size, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError("""An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = torch.nn.RNN(embed_size, hidden_size, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = torch.nn.Linear(hidden_size, vocab_size)
        self.init_weights()
        self.rnn_type = rnn_type
        self.hidden_size = hidden_size
        self.nlayers = nlayers
    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
    def forward(self, input, hidden):
        '''Forward pass:
        - embed the input words
        - run the embeddings through the recurrent layer
        - map each hidden state to the output vocabulary via the linear layer
        '''
        emb = self.drop(self.encoder(input))    # (seq_len, batch, embed_size)
        output, hidden = self.rnn(emb, hidden)  # (seq_len, batch, hidden_size)
        decoded = self.decoder(self.drop(output.view(-1, output.size(2))))
        return decoded.view(output.size(0), output.size(1), -1), hidden
    def init_hidden(self, bsz, requires_grad=True):
        # new_zeros on an existing parameter yields tensors with the same dtype and device.
        weight = next(self.parameters())
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros((self.nlayers, bsz, self.hidden_size), requires_grad=requires_grad),
                    weight.new_zeros((self.nlayers, bsz, self.hidden_size), requires_grad=requires_grad))
        else:
            return weight.new_zeros((self.nlayers, bsz, self.hidden_size), requires_grad=requires_grad)
model = RNNModel(rnn_type='LSTM', vocab_size=VOCAB_SIZE, embed_size=EMBEDDING_SIZE, hidden_size=100, nlayers=1)
if USE_CUDA:
    model = model.to(device)
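
# A quick look at the model we just built (purely illustrative):
print(model)
print('parameters:', sum(p.numel() for p in model.parameters()))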
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)
def evaluate(model, data_iter):
    model.eval()
    total_loss = 0.
    total_count = 0
    with torch.no_grad():
        hidden = model.init_hidden(BATCH_SIZE, requires_grad=False)
        for i, batch in enumerate(data_iter):
            data, target = batch.text, batch.target
            if USE_CUDA:
                data, target = data.cuda(), target.cuda()
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)
            loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
            # Weight each batch by its token count so the result is a per-token average.
            total_count += np.multiply(*data.size())
            total_loss += loss.item() * np.multiply(*data.size())
    loss = total_loss / total_count
    model.train()
    return loss
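
# evaluate() returns average per-token cross-entropy; the standard language
# model metric is its exponential, perplexity. A small helper (an addition
# for illustration, not in the original script):
def perplexity(avg_loss):
    return np.exp(avg_loss)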
NUM_EPOCHS = 2
GRAD_CLIP = 5.
val_losses = []
loss_fn = torch.nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Each scheduler.step() multiplies the learning rate by 0.5.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5)
for epoch in range(NUM_EPOCHS):
    model.train()
    hidden = model.init_hidden(BATCH_SIZE)
    for i, batch in enumerate(train_iter):
        data, target = batch.text, batch.target
        # Detach the hidden state so gradients do not flow across BPTT chunks.
        hidden = repackage_hidden(hidden)
        output, hidden = model(data, hidden)
        loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
        optimizer.zero_grad()
        loss.backward()
        # Gradient clipping (outline item 5.1): rescale gradients whose norm
        # exceeds GRAD_CLIP to keep RNN training from exploding.
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()
        if i % 100 == 0:
            print('epoch', epoch, i, loss.item())
        if i % 1000 == 0:
            val_loss = evaluate(model, val_iter)
            if len(val_losses) == 0 or val_loss < min(val_losses):
                torch.save(model.state_dict(), SAVE_MODEL)
                print('best model saved to', SAVE_MODEL)
            else:
                # No improvement on validation: halve the learning rate.
                scheduler.step()
            val_losses.append(val_loss)
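
# Outline item 6, saving and loading a model: a minimal sketch of reloading
# the best checkpoint written above and scoring it on the test set (assumes
# training saved SAVE_MODEL at least once).
best_model = RNNModel(rnn_type='LSTM', vocab_size=VOCAB_SIZE, embed_size=EMBEDDING_SIZE, hidden_size=100, nlayers=1)
best_model.load_state_dict(torch.load(SAVE_MODEL, map_location=device))
best_model = best_model.to(device)
test_loss = evaluate(best_model, test_iter)
print('test loss', test_loss, 'test perplexity', perplexity(test_loss))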