seq2seq翻译实战-Pytorch复现

code

from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1


def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s


def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")
    # 以行为单位读取文件
    lines = open('./data/%s-%s.txt ' % (lang1, lang2), encoding='utf-8'). \
        read().strip().split('\n')
    # 将每一行放入一个列表中
    # 一个列表中有两个元素,A语言文本与B语言文本
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    # 创建Lang实例,并确认是否反转语言顺序
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
    return input_lang, output_lang, pairs


MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    # 选取仅仅包含 eng_prefixes 开头的语料
    return [pair for pair in pairs if filterPair(pair)]


def prepareData(lang1, lang2, reverse=False):
    # 读取文件中的数据
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))

    # 按条件选取语料
    pairs = filterPairs(pairs[:])
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")

    # 将语料保存至相应的语言类
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])

    # 打印语言类的信息
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
print(random.choice(pairs))


class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)


class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attn = nn.Linear(self.hidden_size*2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size*2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(
            0), encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)


def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

# 将数字化的文本,转化为tensor数据


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

# 输入pair文本,输出预处理好的数据


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)


teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor,
          encoder, decoder,
          encoder_optimizer, decoder_optimizer,
          criterion, max_length=MAX_LENGTH):
    # 编码器初始化
    encoder_hidden = encoder.initHidden()

    # grad属性归零
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # 用于创建一个指定大小的全零张量(tensor),用作默认编码器输出
    encoder_outputs = torch.zeros(
        max_length, encoder.hidden_size, device=device)

    loss = 0

    # 将处理好的语料送入编码器
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    # 解码器默认输出
    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    # 将编码器处理好的输出送入解码器
    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)

            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing
    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)

            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

Reading lines…
Read 135842 sentence pairs
Trimmed to 10599 sentence pairs
Counting words…
Counted words:
fra 4345
eng 2803
[‘ils jalousent notre succes .’, ‘they are jealous of our success .’]

import time
import math
 
 
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)
 
def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))
 
def trainIters(encoder, decoder, n_iters, print_every=1000,
               plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
 
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
 
    # 在 pairs 中随机选取 n_iters 条数据用作训练集
    training_pairs = [tensorsFromPair(random.choice(pairs)) for i in range(n_iters)]
    criterion = nn.NLLLoss()
 
    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]
 
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss
 
        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))
 
        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0
 
    return plot_losses
 
#评估函数
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang,sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()
 
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
 
        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]
 
        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
 
        decoder_hidden = encoder_hidden
 
        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)
 
        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
 
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
 
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])
 
            decoder_input = topi.squeeze().detach()
 
        return decoded_words, decoder_attentions[:di + 1]
 
 
def evaluateRandomly(encoder, decoder, n=5):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')
 
 
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words,dropout_p=0.1).to(device)
plot_losses = trainIters(encoder1, attn_decoder1, 20000, print_every=5000)
evaluateRandomly(encoder1, attn_decoder1)

7m 51s (- 23m 34s) (5000 25%) 2.8315
15m 49s (- 15m 49s) (10000 50%) 2.2725
24m 18s (- 8m 6s) (15000 75%) 1.9845
33m 10s (- 0m 0s) (20000 100%) 1.6941

je ne suis pas du tout fatigue .
= i m not tired at all .
< i m not a busy here .

vous n etes pas encore mort .
= you re not dead yet .
< you re not dead yet .

nous sommes quittes .
= we re even .
< we re punctual .

vous etes tous finis .
= you re all done .
< you re all terrified .

nous sommes encore mineures .
= we re still underage .
< we re still drunk .

import matplotlib.pyplot as plt
# 隐藏警告
import warnings
 
warnings.filterwarnings("ignore")  # 忽略警告信息
# plt.rcParams['font.sans-serif']    = ['SimHei'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号
plt.rcParams['figure.dpi'] = 100  # 分辨率
 
epochs_range = range(len(plot_losses))
 
plt.figure(figsize=(8, 3))
 
plt.subplot(1, 1, 1)
plt.plot(epochs_range, plot_losses, label='Training Loss')
plt.legend(loc='upper right')
plt.title('Training Loss')
plt.show()
 

在这里插入图片描述

#可视化注意力
output_words, attentions = evaluate(encoder1, attn_decoder1, "je suis trop froid .")
plt.matshow(attentions.numpy())
 
import matplotlib.ticker as ticker
# 隐藏警告
import warnings
 
warnings.filterwarnings("ignore")  # 忽略警告信息
 
 
def showAttention(input_sentence, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)
 
    # Set up axes
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)
 
    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
 
    plt.show()
 
 
def evaluateAndShowAttention(input_sentence):
    output_words, attentions = evaluate(
        encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))
    showAttention(input_sentence, output_words, attentions)
 
 
evaluateAndShowAttention("elle a cinq ans de moins que moi .")
evaluateAndShowAttention("elle est trop petit .")
evaluateAndShowAttention("je ne crains pas de mourir .")
evaluateAndShowAttention("c est un jeune directeur plein de talent .")

在这里插入图片描述
在这里插入图片描述

input = elle a cinq ans de moins que moi .
output = she is two years years than me .

在这里插入图片描述

input = elle est trop petit .
output = she is too loud .
在这里插入图片描述

input = je ne crains pas de mourir .
output = i m not afraid to
在这里插入图片描述

input = c est un jeune directeur plein de talent .
output = he is a very nice of girl .
在这里插入图片描述

总结

Transformer模型通过创新的自注意力机制和位置编码,在seq2seq翻译任务中展现出卓越性能。它由编码器和解码器两部分组成,编码器将源语言文本转换为上下文表示,解码器则基于这些表示和已生成的目标语言文本部分来预测下一个词。自注意力机制允许模型在处理每个词时考虑整个序列的上下文,有效捕捉长距离依赖。尽管模型计算复杂且对计算资源要求较高,但其并行处理能力和在多个NLP任务上的优越表现,使其成为当前自然语言处理领域的重要基石。Transformer的引入不仅推动了机器翻译的进步,也深刻影响了其他序列生成任务的发展。

PyTorch是一种深度学习框架,可以用于实现序列到序列(seq2seq)的机器翻译任务。在seq2seq模型中,编码器将源序列编码为一个固定长度的向量,解码器则将该向量解码为目标序列。为了提高翻译质量,可以使用注意力机制来在解码器中引入上下文信息。 在PyTorch中实现seq2seq模型,可以使用nn.Module类来定义模型架构。首先,需要定义编码器和解码器的结构。编码器通常使用循环神经网络(RNN)或卷积神经网络(CNN)进行实现,而解码器则需要使用注意力机制。注意力机制可以使解码器关注输入序列中最相关的部分并根据其进行翻译。 实现注意力机制时,需要计算每个输入序列位置和当前解码器状态之间的相似度。这可以通过计算点积或使用神经网络来实现。然后,可以将相似度作为权重,对输入序列进行加权求和,以计算上下文向量。最后,将上下文向量与当前解码器状态组合在一起,以生成下一个目标序列符号的概率分布。 在训练过程中,可以使用交叉熵损失函数来计算模型输出与正确目标序列之间的差异,并使用反向传播算法更新模型参数。在推理过程中,可以使用贪婪搜索或束搜索来生成翻译结果。 总的来说,PyTorch提供了一种灵活且高效的方式来实现seq2seq模型和注意力机制,可以用于各种自然语言处理任务,包括机器翻译、问答系统和对话生成等。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值