This project reproduces Luong's attention model as closely as possible. The dataset is small, with only a bit over ten thousand training sentence pairs, so the resulting model is not particularly good. If you want to train a better model, the following resources are worth looking at.
Lecture slides
Paper
PyTorch code
More on Machine Translation
Beam Search, Pointer Network (copy mechanism for text summarization), Coverage Loss, ConvSeq2Seq, Transformer, Tensor2Tensor
The complete code and dataset for this project are available; it runs with one click, ready to use out of the box.
import os
import sys
import math
from collections import Counter
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import nltk
def load_data(in_file):
    # Each line of the file is "english sentence \t chinese sentence".
    cn = []
    en = []
    with open(in_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip().split("\t")
            # English is tokenized with nltk; Chinese is split into characters.
            en.append(["BOS"] + nltk.word_tokenize(line[0].lower()) + ["EOS"])
            cn.append(["BOS"] + [c for c in line[1]] + ["EOS"])
    return en, cn
train_file = "nmt/nmt/en-cn/train.txt"
dev_file = "nmt/nmt/en-cn/dev.txt"
train_en, train_cn = load_data(train_file)
dev_en, dev_cn = load_data(dev_file)
print(train_en[:10])
print(train_cn[:10])
UNK_IDX = 0
PAD_IDX = 1
def build_dict(sentences, max_words=50000):
    # Count token frequencies and keep the max_words most common ones.
    word_count = Counter()
    for sentence in sentences:
        for s in sentence:
            word_count[s] += 1
    ls = word_count.most_common(max_words)
    print(len(ls))
    total_words = len(ls) + 2
    # Indices 0 and 1 are reserved for UNK and PAD.
    word_dict = {w[0]: index + 2 for index, w in enumerate(ls)}
    word_dict["UNK"] = UNK_IDX
    word_dict["PAD"] = PAD_IDX
    return word_dict, total_words
en_dict, en_total_words = build_dict(train_en)
cn_dict, cn_total_words = build_dict(train_cn)
inv_en_dict = {v: k for k, v in en_dict.items()}
inv_cn_dict = {v: k for k, v in cn_dict.items()}
print(en_total_words)
print(list(en_dict.items())[:10])
print(list(en_dict.items())[-10:])
print("---" * 20)
print(cn_total_words)
print(list(cn_dict.items())[:10])
print(list(cn_dict.items())[-10:])
print("---" * 20)
print(list(inv_en_dict.items())[:10])
print(list(inv_cn_dict.items())[:10])
def encode(en_sentences, cn_sentences, en_dict, cn_dict, sort_by_len=True):
    # Map tokens to indices; unknown tokens fall back to UNK (index 0).
    out_en_sentences = [[en_dict.get(w, UNK_IDX) for w in sent] for sent in en_sentences]
    out_cn_sentences = [[cn_dict.get(w, UNK_IDX) for w in sent] for sent in cn_sentences]

    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))

    # Sort by English sentence length so sentences in a batch have similar lengths.
    if sort_by_len:
        sorted_index = len_argsort(out_en_sentences)
        out_en_sentences = [out_en_sentences[i] for i in sorted_index]
        out_cn_sentences = [out_cn_sentences[i] for i in sorted_index]
    return out_en_sentences, out_cn_sentences

train_en, train_cn = encode(train_en, train_cn, en_dict, cn_dict)
dev_en, dev_cn = encode(dev_en, dev_cn, en_dict, cn_dict)
seq = [5, 4, 6, 9, 10]
print(sorted(range(5), key=lambda x: seq[x]))
print(sorted(range(4), key=lambda x: seq[x]))
print(train_en[:10])
print(train_cn[:10])
print("---" * 20)
k = 10000
print([inv_cn_dict[i] for i in train_cn[k]])
print([inv_en_dict[i] for i in train_en[k]])
print(" ".join([inv_cn_dict[i] for i in train_cn[k]]))
print(" ".join([inv_en_dict[i] for i in train_en[k]]))
print(np.arange(0, 100, 15))
print(np.arange(0, 15))
def get_batches(n, batch_size, shuffle=True):
    # Split indices 0..n-1 into (optionally shuffled) batches of batch_size.
    idx_list = np.arange(0, n, batch_size)
    if shuffle:
        np.random.shuffle(idx_list)
    batches = []
    for idx in idx_list:
        batches.append(np.arange(idx, min(idx + batch_size, n)))
    return batches

get_batches(100, 15)
def sent_padding(seqs):
    # Pad a list of index sequences into a (n_samples, max_len) int array and
    # also return the original lengths; positions past a sequence's length stay 0
    # and are ignored later via the lengths / loss mask.
    lengths = [len(seq) for seq in seqs]
    n_samples = len(seqs)
    max_len = np.max(lengths)
    x = np.zeros((n_samples, max_len)).astype('int32')
    x_lengths = np.array(lengths).astype("int32")
    for idx, seq in enumerate(seqs):
        x[idx, :lengths[idx]] = seq
    return x, x_lengths
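A quick look at what sent_padding produces, using two made-up index sequences:
x, x_len = sent_padding([[2, 3, 4], [5, 6]])
print(x)        # (2, 3) array; the shorter sequence is padded with 0
print(x_len)    # [3 2]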
def gen_examples(en_sentences, cn_sentences, batch_size):
    batches = get_batches(len(en_sentences), batch_size)
    all_ex = []
    for batch in batches:
        mb_en_sentences = [en_sentences[t] for t in batch]
        mb_cn_sentences = [cn_sentences[t] for t in batch]
        mb_x, mb_x_len = sent_padding(mb_en_sentences)
        mb_y, mb_y_len = sent_padding(mb_cn_sentences)
        all_ex.append((mb_x, mb_x_len, mb_y, mb_y_len))
    return all_ex

batch_size = 64
train_data = gen_examples(train_en, train_cn, batch_size)
random.shuffle(train_data)
dev_data = gen_examples(dev_en, dev_cn, batch_size)
print(train_data[0][0].shape)
print(train_data[0][1].shape)
print(train_data[0][2].shape)
print(train_data[0][3].shape)
print(train_data[0])
class PlainEncoder(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout=0.2):
        super(PlainEncoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, lengths):
        # pack_padded_sequence requires sequences sorted by length (descending).
        sorted_len, sorted_idx = lengths.sort(0, descending=True)
        x_sorted = x[sorted_idx.long()]
        embedded = self.dropout(self.embed(x_sorted))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, sorted_len.long().cpu().data.numpy(), batch_first=True)
        packed_out, hid = self.rnn(packed_embedded)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        # Restore the original batch order.
        _, original_idx = sorted_idx.sort(0, descending=False)
        out = out[original_idx.long()].contiguous()
        hid = hid[:, original_idx.long()].contiguous()
        # hid[[-1]] keeps the last layer's hidden state with shape (1, batch, hidden).
        return out, hid[[-1]]
class PlainDecoder(nn.Module):
    # A plain GRU decoder conditioned only on the encoder's final hidden state.
    def __init__(self, vocab_size, hidden_size, dropout=0.2):
        super(PlainDecoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, y, y_lengths, hid):
        sorted_len, sorted_idx = y_lengths.sort(0, descending=True)
        y_sorted = y[sorted_idx.long()]
        hid = hid[:, sorted_idx.long()]
        y_sorted = self.dropout(self.embed(y_sorted))
        packed_seq = nn.utils.rnn.pack_padded_sequence(
            y_sorted, sorted_len.long().cpu().data.numpy(), batch_first=True)
        out, hid = self.rnn(packed_seq, hid)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        output_seq = unpacked[original_idx.long()].contiguous()
        hid = hid[:, original_idx.long()].contiguous()
        output = F.log_softmax(self.out(output_seq), -1)
        return output, hid
class PlainSeq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(PlainSeq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x, x_lengths, y, y_lengths):
        encoder_out, hid = self.encoder(x, x_lengths)
        output, hid = self.decoder(y, y_lengths, hid)
        return output, None

    def translate(self, x, x_lengths, y, max_length=10):
        # Greedy decoding: feed back the most likely word at each step.
        encoder_out, hid = self.encoder(x, x_lengths)
        preds = []
        batch_size = x.shape[0]
        attns = []
        for i in range(max_length):
            output, hid = self.decoder(y=y,
                                       y_lengths=torch.ones(batch_size).long().to(y.device),
                                       hid=hid)
            y = output.max(2)[1].view(batch_size, 1)
            preds.append(y)
        return torch.cat(preds, 1), None
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dropout = 0.2
hidden_size = 100
encoder = PlainEncoder(vocab_size=en_total_words,
                       hidden_size=hidden_size,
                       dropout=dropout)
decoder = PlainDecoder(vocab_size=cn_total_words,
                       hidden_size=hidden_size,
                       dropout=dropout)
model = PlainSeq2Seq(encoder, decoder)
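Before training, a quick shape check can catch wiring mistakes early. This is a minimal sketch that pushes the first prepared batch through the still-untrained CPU model; the expected output shape is (batch_size, max_y_len - 1, cn_total_words):
mb_x, mb_x_len, mb_y, mb_y_len = train_data[0]
with torch.no_grad():
    out, _ = model(torch.from_numpy(mb_x).long(),
                   torch.from_numpy(mb_x_len).long(),
                   torch.from_numpy(mb_y[:, :-1]).long(),
                   torch.from_numpy(mb_y_len - 1).long())
print(out.shape)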
class LanguageModelCriterion(nn.Module):
    def __init__(self):
        super(LanguageModelCriterion, self).__init__()

    def forward(self, input, target, mask):
        # input: log-probabilities (batch, seq_len, vocab); mask is 1 for real
        # tokens and 0 for padding, so padded positions do not contribute.
        input = input.contiguous().view(-1, input.size(2))
        target = target.contiguous().view(-1, 1)
        mask = mask.contiguous().view(-1, 1)
        output = -input.gather(1, target) * mask
        output = torch.sum(output) / torch.sum(mask)
        return output
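As a small sanity check (a sketch with random tensors, not part of the training script itself), the masked loss above should agree with F.nll_loss computed per token and masked manually:
logp = F.log_softmax(torch.randn(2, 5, 7), dim=-1)   # fake log-probs (batch, time, vocab)
tgt = torch.randint(0, 7, (2, 5))
msk = (torch.rand(2, 5) > 0.3).float()
ref = (F.nll_loss(logp.view(-1, 7), tgt.view(-1), reduction='none') * msk.view(-1)).sum() / msk.sum()
print(LanguageModelCriterion()(logp, tgt, msk).item(), ref.item())   # the two values should match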
model = model.to(device)
loss_fn = LanguageModelCriterion().to(device)
optimizer = torch.optim.Adam(model.parameters())
def evaluate(model, data):
    model.eval()
    total_num_words = total_loss = 0.
    with torch.no_grad():
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()
            # Decoder input is y[:-1]; the target is y[1:] (shifted by one step).
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()
            mb_y_len = torch.from_numpy(mb_y_len - 1).to(device).long()
            mb_y_len[mb_y_len <= 0] = 1
            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)
            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
            mb_out_mask = mb_out_mask.float()
            loss = loss_fn(mb_pred, mb_output, mb_out_mask)
            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words
    print("Evaluation loss", total_loss / total_num_words)
def train(model, data, num_epochs=2):
    for epoch in range(num_epochs):
        model.train()
        total_num_words = total_loss = 0.
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()
            mb_y_len = torch.from_numpy(mb_y_len - 1).to(device).long()
            mb_y_len[mb_y_len <= 0] = 1
            optimizer.zero_grad()
            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)
            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
            mb_out_mask = mb_out_mask.float()
            loss = loss_fn(mb_pred, mb_output, mb_out_mask)
            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words
            loss.backward()
            # Clip gradients to keep the RNN training stable.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.)
            optimizer.step()
            if it % 100 == 0:
                print("Epoch", epoch, "iteration", it, "loss", loss.item())
        print("Epoch", epoch, "Training loss", total_loss / total_num_words)
        if epoch % 5 == 0:
            evaluate(model, dev_data)

train(model, train_data, num_epochs=20)
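If you want to keep the trained weights, saving the state dict is enough; the filename below is just an example:
torch.save(model.state_dict(), "plain_seq2seq.pt")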
def translate_dev(i):
    # Print the i-th dev English sentence, its reference translation,
    # and the model's greedy translation (stopping at EOS).
    en_sent = " ".join([inv_en_dict[w] for w in dev_en[i]])
    print(en_sent)
    cn_sent = " ".join([inv_cn_dict[w] for w in dev_cn[i]])
    print("".join(cn_sent))
    mb_x = torch.from_numpy(np.array(dev_en[i]).reshape(1, -1)).long().to(device)
    mb_x_len = torch.from_numpy(np.array([len(dev_en[i])])).long().to(device)
    bos = torch.Tensor([[cn_dict["BOS"]]]).long().to(device)
    translation, attn = model.translate(mb_x, mb_x_len, bos)
    translation = [inv_cn_dict[i] for i in translation.data.cpu().numpy().reshape(-1)]
    trans = []
    for word in translation:
        if word != "EOS":
            trans.append(word)
        else:
            break
    print("".join(trans))

for i in range(500, 520):
    translate_dev(i)
    print()
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2):
        super(Encoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        # Bidirectional GRU: each direction contributes enc_hidden_size features.
        self.rnn = nn.GRU(embed_size, enc_hidden_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(enc_hidden_size * 2, dec_hidden_size)

    def forward(self, x, lengths):
        sorted_len, sorted_idx = lengths.sort(0, descending=True)
        x_sorted = x[sorted_idx.long()]
        embedded = self.dropout(self.embed(x_sorted))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, sorted_len.long().cpu().data.numpy(), batch_first=True)
        packed_out, hid = self.rnn(packed_embedded)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        out = out[original_idx.long()].contiguous()
        hid = hid[:, original_idx.long()].contiguous()
        # Concatenate the final forward and backward hidden states and project
        # them to the decoder's hidden size.
        hid = torch.cat([hid[-2], hid[-1]], dim=1)
        hid = torch.tanh(self.fc(hid)).unsqueeze(0)
        return out, hid
class Attention(nn.Module):
    def __init__(self, enc_hidden_size, dec_hidden_size):
        super(Attention, self).__init__()
        self.enc_hidden_size = enc_hidden_size
        self.dec_hidden_size = dec_hidden_size
        self.linear_in = nn.Linear(enc_hidden_size * 2, dec_hidden_size, bias=False)
        self.linear_out = nn.Linear(enc_hidden_size * 2 + dec_hidden_size, dec_hidden_size)

    def forward(self, output, context, mask):
        # output: decoder states (batch, output_len, dec_hidden_size)
        # context: encoder states (batch, input_len, enc_hidden_size * 2)
        batch_size = output.size(0)
        output_len = output.size(1)
        input_len = context.size(1)
        # Luong "general" score: project the encoder states, then dot with decoder states.
        context_in = self.linear_in(context.view(batch_size * input_len, -1)).view(
            batch_size, input_len, -1)
        attn = torch.bmm(output, context_in.transpose(1, 2))
        # Mask out padded positions in place before the softmax.
        attn.masked_fill_(mask, -1e6)
        attn = F.softmax(attn, dim=2)
        context = torch.bmm(attn, context)
        output = torch.cat((context, output), dim=2)
        output = output.view(batch_size * output_len, -1)
        output = torch.tanh(self.linear_out(output))
        output = output.view(batch_size, output_len, -1)
        return output, attn
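A quick shape check on the attention layer alone (a minimal sketch with random tensors; the sizes here are arbitrary):
att = Attention(enc_hidden_size=4, dec_hidden_size=6)
dec_out = torch.randn(2, 3, 6)                    # (batch, output_len, dec_hidden_size)
enc_out = torch.randn(2, 5, 8)                    # (batch, input_len, enc_hidden_size * 2)
no_mask = torch.zeros(2, 3, 5, dtype=torch.bool)  # mask nothing
ctx, a = att(dec_out, enc_out, no_mask)
print(ctx.shape, a.shape)                         # expect (2, 3, 6) and (2, 3, 5)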
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2):
        super(Decoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.attention = Attention(enc_hidden_size, dec_hidden_size)
        self.rnn = nn.GRU(embed_size, dec_hidden_size, batch_first=True)
        self.out = nn.Linear(dec_hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def create_mask(self, x_len, y_len):
        # Build a (batch, max_x_len, max_y_len) boolean mask that is True wherever
        # either position is padding, so attention ignores those pairs.
        device = x_len.device
        max_x_len = x_len.max()
        max_y_len = y_len.max()
        x_mask = torch.arange(max_x_len, device=device)[None, :] < x_len[:, None]
        y_mask = torch.arange(max_y_len, device=device)[None, :] < y_len[:, None]
        mask = ~(x_mask[:, :, None] & y_mask[:, None, :])
        return mask

    def forward(self, encoder_out, x_lengths, y, y_lengths, hid):
        sorted_len, sorted_idx = y_lengths.sort(0, descending=True)
        y_sorted = y[sorted_idx.long()]
        hid = hid[:, sorted_idx.long()]
        y_sorted = self.dropout(self.embed(y_sorted))
        packed_seq = nn.utils.rnn.pack_padded_sequence(
            y_sorted, sorted_len.long().cpu().data.numpy(), batch_first=True)
        out, hid = self.rnn(packed_seq, hid)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        output_seq = unpacked[original_idx.long()].contiguous()
        hid = hid[:, original_idx.long()].contiguous()
        # The mask is (batch, output_len, input_len): each decoder step attends
        # over the encoder positions.
        mask = self.create_mask(y_lengths, x_lengths)
        output, attn = self.attention(output_seq, encoder_out, mask)
        output = F.log_softmax(self.out(output), -1)
        return output, hid, attn
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x, x_lengths, y, y_lengths):
        encoder_out, hid = self.encoder(x, x_lengths)
        output, hid, attn = self.decoder(encoder_out=encoder_out,
                                         x_lengths=x_lengths,
                                         y=y,
                                         y_lengths=y_lengths,
                                         hid=hid)
        return output, attn

    def translate(self, x, x_lengths, y, max_length=100):
        # Greedy decoding, one token at a time; also collect the attention
        # weights of every step so they can be inspected later.
        encoder_out, hid = self.encoder(x, x_lengths)
        preds = []
        batch_size = x.shape[0]
        attns = []
        for i in range(max_length):
            output, hid, attn = self.decoder(encoder_out=encoder_out,
                                             x_lengths=x_lengths,
                                             y=y,
                                             y_lengths=torch.ones(batch_size).long().to(y.device),
                                             hid=hid)
            y = output.max(2)[1].view(batch_size, 1)
            preds.append(y)
            attns.append(attn)
        return torch.cat(preds, 1), torch.cat(attns, 1)
dropout = 0.2
embed_size = hidden_size = 100
encoder = Encoder(vocab_size=en_total_words,
                  embed_size=embed_size,
                  enc_hidden_size=hidden_size,
                  dec_hidden_size=hidden_size,
                  dropout=dropout)
decoder = Decoder(vocab_size=cn_total_words,
                  embed_size=embed_size,
                  enc_hidden_size=hidden_size,
                  dec_hidden_size=hidden_size,
                  dropout=dropout)
model = Seq2Seq(encoder, decoder)
model = model.to(device)
loss_fn = LanguageModelCriterion().to(device)
optimizer = torch.optim.Adam(model.parameters())
train(model, train_data, num_epochs=30)
for i in range(100, 120):
    translate_dev(i)
    print()
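Since the attention model's translate also returns the per-step attention weights, they can be visualized as a heatmap over the source words. This is a minimal sketch assuming matplotlib is available (Chinese tick labels may also need a CJK font configured):
import matplotlib.pyplot as plt

def plot_attention(i):
    # Translate dev sentence i and plot attention: rows are generated Chinese
    # tokens (up to EOS), columns are source English tokens.
    mb_x = torch.from_numpy(np.array(dev_en[i]).reshape(1, -1)).long().to(device)
    mb_x_len = torch.from_numpy(np.array([len(dev_en[i])])).long().to(device)
    bos = torch.Tensor([[cn_dict["BOS"]]]).long().to(device)
    translation, attn = model.translate(mb_x, mb_x_len, bos)
    src = [inv_en_dict[w] for w in dev_en[i]]
    out = [inv_cn_dict[w] for w in translation.data.cpu().numpy().reshape(-1)]
    if "EOS" in out:
        out = out[:out.index("EOS")]
    weights = attn.squeeze(0).data.cpu().numpy()[:len(out)]   # (output_len, input_len)
    plt.imshow(weights, cmap="gray")
    plt.xticks(range(len(src)), src, rotation=90)
    plt.yticks(range(len(out)), out)
    plt.show()

plot_attention(100)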