Getting Started with NLP (2): Machine Translation with a GRU seq2seq Model

Reference notebook: https://github.com/bentrevett/pytorch-seq2seq/blob/master/2%20-%20Learning%20Phrase%20Representations%20using%20RNN%20Encoder-Decoder%20for%20Statistical%20Machine%20Translation.ipynb

import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import TranslationDataset,Multi30k
from torchtext.data import Field,BucketIterator

import spacy

import random
import math
import os
import time

SEED=1
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic=True

# 'de'/'en' are spaCy shortcut names; newer spaCy releases use
# 'de_core_news_sm' and 'en_core_web_sm' instead.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

def tokenize_de(text):
    """
    Tokenizes German text from a string into a list of strings
    """
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>', lower=True)
TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)

train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(SRC, TRG))

print(vars(train_data.examples[0]))

SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE=128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device)
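
A quick optional check of the tensor layout the iterators produce: with the legacy torchtext API, each batch exposes .src and .trg tensors of shape [sent len, batch size], where sent len varies per batch.

# Optional: inspect the layout of one batch
batch = next(iter(train_iterator))
print(batch.src.shape)  # [src sent len, batch size]
print(batch.trg.shape)  # [trg sent len, batch size]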

'''
Part (1) of this series used a multi-layer LSTM; this part uses a single-layer
GRU (gated recurrent unit). nn.GRU has no cell state, only a hidden state.

Inputs to nn.GRU's forward: input and h_0
    input shape [src sent len, batch size, emb dim]
    h_0   shape [num layers * num directions, batch size, hid dim]
If h_0 is not provided, it defaults to all zeros (which is what the encoder
below relies on).

Outputs of nn.GRU: output, hidden
    output: the hidden state at every time step,
            shape [src sent len, batch size, num directions * hid dim]
            A multi-layer GRU returns only the top layer's hidden states
            (just like a multi-layer LSTM, which returns only the top layer's
            hidden and cell states).
    hidden: the hidden state at the final time step,
            shape [num layers * num directions, batch size, hid dim]
'''
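
A minimal sketch of these shapes (the sizes 7, 128, 256 and 512 are illustrative, not tied to the model below):

# Shape check for a single-layer, unidirectional nn.GRU
rnn_demo = nn.GRU(input_size=256, hidden_size=512)
x_demo = torch.randn(7, 128, 256)               # [src sent len, batch size, emb dim]
out_demo, h_demo = rnn_demo(x_demo)             # h_0 omitted -> defaults to zeros
print(out_demo.shape)                           # torch.Size([7, 128, 512])
print(h_demo.shape)                             # torch.Size([1, 128, 512])
assert torch.allclose(out_demo[-1], h_demo[0])  # last output step == final hidden state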

class Encoder(nn.Module):
    def __init__(self,input_dim,emb_dim,hid_dim,dropout):
        super().__init__()
        self.input_dim=input_dim
        self.emb_dim=emb_dim
        self.hid_dim=hid_dim

        self.embedding=nn.Embedding(input_dim,emb_dim)
        self.dropout=nn.Dropout(dropout)
        self.rnn=nn.GRU(emb_dim,hid_dim)

    def forward(self,src):
        '''
        :param src: shape [src sent len, batch size]
        Each value is the index of the corresponding word in the source
        (encoder) vocabulary.
        :return: the final hidden state, used as the context vector
        '''
        embedded=self.embedding(src)

        #embedded=[src sent length,batch size,emb dim]

        output,hidden=self.rnn(self.dropout(embedded))

        #output = [src sent length,batch size,hid dim*n directions]
        #hidden = [n layers * n directions,batch size,hid dim]

        return hidden
'''
Main changes in the decoder relative to Part (1):
1. In Part (1), each LSTM step's output depended only on the previous step's
   predicted token, or on the previous step's ground-truth token (teacher
   forcing). Here, the prediction at each GRU step depends on:
   (1) the previous step's predicted word y(t-1), or the previous step's
       ground-truth token (also written y(t-1)), and
   (2) the context vector produced by the encoder (hidden state only,
       since a GRU has no cell state).
   The GRU input feature dimension is therefore emb dim + hid dim.
2. Previously, the predicted word at each step (the input to the nn.Linear
   layer) depended only on the decoder's current hidden state (not the cell
   state). Now it depends on three things:
   (1) the hidden state output by the current GRU step,
   (2) the context vector from the encoder, and
   (3) the embedded input token at the current step.
   The linear layer's input feature dimension is therefore hid dim*2 + emb dim.
The Decoder class below implements both changes; the dimension arithmetic is
checked in the short sketch that follows.
'''
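
A two-line check of that dimension arithmetic (the sizes emb dim = 256, hid dim = 512, batch size = 128 are illustrative):

# Decoder dimension arithmetic: GRU input = emb + hid, Linear input = emb + 2*hid
emb_demo = torch.randn(1, 128, 256)   # embedded input token
ctx_demo = torch.randn(1, 128, 512)   # encoder context vector
hid_demo = torch.randn(1, 128, 512)   # current decoder hidden state
print(torch.cat((emb_demo, ctx_demo), dim=2).shape)            # [1, 128, 768]  -> GRU input
print(torch.cat((emb_demo, ctx_demo, hid_demo), dim=2).shape)  # [1, 128, 1280] -> Linear input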
class Decoder(nn.Module):
    def __init__(self,input_dim,emb_dim,hid_dim,dropout):
        super().__init__()
        self.input_dim=input_dim
        self.emb_dim=emb_dim
        self.hid_dim=hid_dim

        self.embedding=nn.Embedding(input_dim,emb_dim)
        self.dropout=nn.Dropout(dropout)
        self.rnn=nn.GRU(hid_dim+emb_dim,hid_dim)

        self.output=nn.Linear(hid_dim*2+emb_dim,input_dim)

    def forward(self, trg,hidden,context):
        '''
        :param trg: shape [batch size]
        :param hidden: hidden state from the previous time step; at the first
                       step, hidden = context.  shape [1, batch size, hid dim]
        :param context: the encoder's summary of the source sentence,
                        shape [1, batch size, hid dim]

        Both hidden and context are GRU hidden-state outputs, so both have
        shape [1, batch size, hid dim].
        :return: prediction over the target vocabulary and the new hidden state
        '''
        input=trg.unsqueeze(0)#input=[1,batch size]
        embedded=self.embedding(input)

        #embedded = [1,batch size,emb dim]

        emb_con=torch.cat((self.dropout(embedded),context),dim=2)

        #emb_con = [1,batch size,emb dim+hid dim]

        output,hidden=self.rnn(emb_con,hidden)

        #output = [1,batch size,hid dim]
        #hidden = [1,batch size,hid dim]

        output=torch.cat((emb_con,hidden),dim=2)

        #output = [1,batch size,emb dim+hid dim*2]

        pred=self.output(output.squeeze(0))

        #pred = [batch size,input dim]

        return pred,hidden  # return the current hidden state as input to the next time step

class Seq2Seq(nn.Module):
    def __init__(self,encoder,decoder,device):
        super().__init__()
        self.encoder=encoder
        self.decoder=decoder
        self.device=device

    def forward(self, src,trg,teacher_forcing_ratio=0.5):
        '''
        :param src: [src sent len, batch size]
        :param trg: [trg sent len, batch size]
        :param teacher_forcing_ratio: probability of using teacher forcing
        :return: predictions for every time step; each value scores the words
                 of the target (decoder) vocabulary, and the argmax gives the
                 index of the predicted word
        '''
        max_len=trg.shape[0]
        batch_size=src.shape[1]
        trg_voc_len=self.decoder.input_dim
        context=self.encoder(src)
        # context = [1,batch size,hid dim]   for n layers* n directions=1

        hidden=context

        output=torch.zeros((max_len,batch_size,trg_voc_len)).to(self.device)
        input=trg[0,:]  # the first decoder input is always the <sos> token

        for t in range(1,max_len):
            pred, hidden=self.decoder(input,hidden,context)

            #pred = [batch size,input dim]

            output[t]=pred

            teacher_force=random.random()<teacher_forcing_ratio
            if teacher_force:
                input=trg[t,:]  # feed the ground-truth token; t is always one step behind the prediction it conditions
            else:
                input=torch.max(pred,dim=1)[1]  # feed the model's own prediction (argmax over the vocabulary)

        return output
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, DEC_DROPOUT)


model = Seq2Seq(enc, dec, device).to(device)
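
Before training, a quick smoke test on random token indices confirms the encoder/decoder wiring; the sentence lengths 7 and 9 below are arbitrary.

# Smoke test: one forward pass on dummy data
src_dummy = torch.randint(0, INPUT_DIM, (7, BATCH_SIZE), device=device)
trg_dummy = torch.randint(0, OUTPUT_DIM, (9, BATCH_SIZE), device=device)
with torch.no_grad():
    out_dummy = model(src_dummy, trg_dummy)
print(out_dummy.shape)  # [9, BATCH_SIZE, OUTPUT_DIM]; row 0 stays all zeros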

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

optimizer = optim.Adam(model.parameters())
pad_idx = TRG.vocab.stoi['<pad>']

criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
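
Because ignore_index is set, positions whose target is the <pad> token contribute nothing to the loss, so the model is never penalized on padding. A toy illustration (the logits and targets below are made up):

# Toy illustration of ignore_index: the second target is <pad>, so only the
# first position contributes to the averaged loss.
toy_logits = torch.randn(2, OUTPUT_DIM)
toy_targets = torch.tensor([5, pad_idx])
print(nn.CrossEntropyLoss()(toy_logits, toy_targets).item())  # averages over both positions
print(criterion(toy_logits, toy_targets).item())              # averages over the non-pad position only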


def train(model, iterator, optimizer, criterion, clip):
    model.train()

    epoch_loss = 0

    for i, batch in enumerate(iterator):
        src = batch.src
        trg = batch.trg

        optimizer.zero_grad()

        output = model(src, trg)

        # trg = [trg sent len, batch size]
        # output = [trg sent len, batch size, output dim]

        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)

        # trg = [(trg sent len - 1) * batch size]
        # output = [(trg sent len - 1) * batch size, output dim]

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def evaluate(model, iterator, criterion):
    model.eval()

    epoch_loss = 0

    with torch.no_grad():
        for i, batch in enumerate(iterator):
            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0)  # turn off teacher forcing

            # trg = [trg sent len, batch size]
            # output = [trg sent len, batch size, output dim]

            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            # trg = [(trg sent len - 1) * batch size]
            # output = [(trg sent len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

N_EPOCHS=10
CLIP = 1
SAVE_DIR = 'models'
MODEL_SAVE_PATH = os.path.join(SAVE_DIR, 'tut2_model.pt')

best_valid_loss = float('inf')

if not os.path.isdir(f'{SAVE_DIR}'):
    os.makedirs(f'{SAVE_DIR}')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)

    print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

The test/validation loss should come out lower than in the first example (the multi-layer LSTM from Part 1).
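
As in the reference notebook, the best checkpoint (the one with the lowest validation loss) can then be reloaded and scored on the test set:

# Evaluate the best saved model on the test set
model.load_state_dict(torch.load(MODEL_SAVE_PATH))
test_loss = evaluate(model, test_iterator, criterion)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')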

 
