NLP-Pytorch-项目流程-案例（一）：文本摘要【Seq2Seq（BiLSTM-LSTM）+ Attention】【预测算法：GreedySearch、BeamSearch】

本文链接：https://blog.csdn.net/u013250861/article/details/119961270

在这里插入图片描述

一、数据预处理

1、原始数据

/data/raw_data/服饰_50k.json（共50000条数据）

{
	"1": {
		"title": "巴拉巴 拉 旗下 梦 多多 童装 男童 毛衫 冬季 中大童 毛衫 黑色",
		"kb": {
			"适用季节": "冬季",
			"厚度": "适中",
			"领型": "高领",
			"适用年龄": "9-12岁",
			"材质成分": "锦纶",
			"图案": "其它",
			"上市时间": "2018冬季",
			"面料": "其它",
			"风格": "休闲风",
			"衣门襟": "套头",
			"适用性别": "男",
			"安全等级": "B类",
			"毛线粗细": "普通毛线"
		},
		"ocr": "中国蓝，深土黄，健康安全，A门襟，黑色，衣袖，面料展，产品信息，领口，可水洗，细节展示，不宜暴晒，不可漂白，短拉链设计，简洁实用，吊牌价:239.00，适合季节:秋冬季，半开领设计，舒适亲肤，]面料构成，田属性说明，不可源自，合格证，不可干流",
		"reference": "三合一混纺纱线制成，柔软亲肤，贴身穿也没有扎感。半开领的立领设计，在较凉的天气，保护脖颈，穿脱也更为方便。侧袖的拼接撞色设计，凸现个性，宝宝穿上更帅气。"
	},
	"2": {
		"title": "土拨鼠 男款 户外运动 休闲 抓绒 开衫 黑色",
		"kb": {
			"适用人群": "男士",
			"功能": "超轻",
			"尺码": "XS",
			"分类": "抓绒衣",
			"适用场景": "徒步",
			"品牌": "土拨鼠（Marmot）"
		},
		"ocr": "立领设计，拉链口袋，细节展示，产品展示，土拨鼠吊，前胸口袋，洗涤方式，能迅速吸收运动过程产生的汗水，快速干燥，舒适保暖，衣身两侧拉链口袋，立领设计商务休闲，舒适保暖，拉链采用YKK拉链制作，土拨鼠洗唛标展示，美观大，保护收纳物品，产品特，不易丢失，(以吊牌为准)，徒步、登山、旅行、露营、跑步、日常穿着、骑行，基本信息，收纳物品，前胸拉链口袋方便，大方，产地:越南，4759青鬼蓝，颜色:，展示，2975深海军蓝，深海军蓝，面料:聚苯，黑色",
		"reference": "时尚的小高领设计，可以有效锁住勃颈处的温度，让寒风也无法侵袭。前胸处的口袋搭配上时尚的品牌logo，美观还实用，让你的穿搭造型更显品味和档次。采用100wt抓绒，穿起来更暖和。"
	}
}

2、数据预处理（process.py）

将文件“服饰_50k.json”中的数据保存成以下格式数据：

短外套 女 春夏 新款 女装 复古 百搭 牛仔 外套 女 宽松 韩版 连帽 长袖 上衣 女 图色 图案 纯色 袖型 常规袖 风格 休闲风 衣门襟 拉链 适用年龄 25-29周岁 衣领材质 其它 类型 牛仔外套 流行元素 带帽 品牌 xzoo 材质 聚酯纤维 厚度 常规 版型 宽松型 衣长 常规款 袖长 长袖 组合形式 单件 领型 连帽 上市时间 2019年夏季 潮流 时尚 ， 3D 立体 裁剪 ， 肌理 舒适 面料 ， 无 牛仔 不 时尚 ， 打造 修身 S 曲线 ， 修身 显瘦 版型 ， 舒适 挺括 ， 精美 口袋 设计 ， L 精美 口袋 设计 ， L 精致 袖口 设计 ， 细节 展示 ， 时尚 翻领 设计 ， 细节 解析 ， 吸湿 透气 ， 分割线 裁剪 ， 时尚 大方 ， 肌理 时尚 ， 不 起球 ， 舒适 面料 ， 轻微 弹力 ， 修身 版型 设计 ， 舒适 自 在 ， 多条 ， 精细 剪裁 ， 抗皱 免烫 ， 直筒 样式 长袖 袖口 ， 时尚 大气 ， ， 时尚 百搭 ， 优雅 显瘦 ， 时尚 翻领 设计 ， 恰到好处 ， 时 ， 提升 了 外套 的 设计 感 和 立体感 的 亮点 ， 整体 的 曲线 ， 勾勒 纤细 线条 ， 显瘦 大方 ， 两侧 口袋 的 设计 ， 方便使用 的 同 ， 两侧 微 开叉 弧形 下摆 设计 ， 修饰<sep>气质 的 牛仔 外套 ， 修身 的 版型 设计 ， 勾勒 出 少女 娇美 曼妙 的 身姿 ， 经典 的 连帽 设计 ， 修饰 脸型 ， 清纯 的 蓝色 ， 显白 衬肤 ， 打造 知性 优雅 的 淑女风格 。

<sep>符号前面的内容是原始文件（是json文件中的title、kb、ocr字段合并后的文本）；
<sep>符号后面的内容是Summary参考答案（json文件中的reference字段内容）；

# -*- coding: utf-8 -*-

import sys
import os
import pathlib
import json
import jieba

# Directory that contains this script; used to resolve data paths and local imports.
abs_path = pathlib.Path(__file__).parent.absolute()
# Only str entries of sys.path are honoured by the import system, and
# list.append() returns None, so the directory is appended once, as a string.
sys.path.append(str(abs_path))


def write_samples(lines, file_path, opt='w'):
    """Write every string of ``lines`` to ``file_path``, one per line.

    Args:
        lines: Iterable of strings; a newline is appended after each one.
        file_path: Path of the file to open.
        opt: Mode passed to ``open`` (``'w'`` truncates, ``'a'`` appends).
    """
    with open(file_path, opt, encoding='utf8') as out:
        out.writelines(text + '\n' for text in lines)


if __name__ == '__main__':
    import random  # local import: only this script entry point needs it

    # A dict is used as an insertion-ordered set: it removes duplicate samples
    # without the run-to-run ordering changes of a set of strings (str hashing
    # is randomized per process), so the split below is reproducible.
    unique_samples = {}
    json_path = os.path.join(abs_path, './raw_data/服饰_50k.json')  # location of the raw data

    with open(json_path, 'r', encoding='utf8') as file:
        json_objs = json.load(file)

    for json_obj in json_objs.values():
        # ----------------------- source text (x) -----------------------
        title = json_obj['title'] + ' '  # product title
        # Flatten all "kb" attributes into "key value key value ..." text.
        kb_merged = ''.join(key + ' ' + val + ' ' for key, val in dict(json_obj['kb']).items())
        ocr = ' '.join(jieba.cut(json_obj['ocr']))  # word-segmented OCR text
        source_text = title + kb_merged + ocr
        # ----------------------- reference summary (y) -----------------------
        reference = ' '.join(jieba.cut(json_obj['reference']))
        # ----------------------- x and y are joined by the <sep> tag -----------------------
        unique_samples[source_text + '<sep>' + reference] = None

    # Shuffle with a fixed seed so the splits are mixed but identical on every run.
    samples = list(unique_samples)
    random.Random(42).shuffle(samples)
    print('len(samples) = ', len(samples))

    # The output directory may not exist on a fresh checkout.
    os.makedirs(os.path.join(abs_path, './processed_data'), exist_ok=True)

    # ----------------------- write all processed samples -----------------------
    write_samples(samples, os.path.join(abs_path, './processed_data/samples.txt'))

    # ----------------------- split into test / dev / train -----------------------
    test = samples[:1000]  # Test set size: first 1000 samples.
    dev = samples[1000:6000]  # Dev set size: next 5000 samples.
    train = samples[6000:]  # Everything else is training data.

    write_samples(train, os.path.join(abs_path, './processed_data/train.txt'))
    write_samples(dev, os.path.join(abs_path, './processed_data/dev.txt'))
    write_samples(test, os.path.join(abs_path, './processed_data/test.txt'))

测试集（test.txt）分配1000个样本；
验证集（dev.txt）分配5000个样本；
剩下的分配给训练集（train.txt）；

3、文本对提取（pairs_of_src_tgt.py）

从train.txt数据集(每行数据的格式为src<sep>tgt)中提取训练数据集的(src, tgt)文本数据对

# -*- coding: utf-8 -*-
'''
@Description: 用于提取(source、target)文本对
'''

import sys
import pathlib
from typing import Callable
from utils import simple_tokenizer

# Directory that contains this module; added to sys.path for local imports.
abs_path = pathlib.Path(__file__).parent.absolute()
# Only str entries of sys.path are honoured by the import system, and
# list.append() returns None, so the directory is appended once, as a string.
sys.path.append(str(abs_path))


class PairsOfSrcTgt(object):
    """
    Holds the (source tokens, reference tokens) pairs read from one dataset file.
    """

    def __init__(self, filename, tokenize: Callable = simple_tokenizer, max_src_len: int = None, max_tgt_len: int = None, truncate_src: bool = False, truncate_tgt: bool = False):
        """Read ``filename`` and collect its pairs in ``self.pairs``.

        Every line is stripped and split on the ``<sep>`` tag. Lines that do
        not yield exactly two parts are reported (with their zero-based line
        index) and skipped.

        Args:
            filename: Path of the UTF-8 text file to read.
            tokenize: Callable applied to the source text and to the reference text.
            max_src_len: Maximum number of source tokens; a falsy value disables the check.
            max_tgt_len: Maximum number of reference tokens; a falsy value disables the check.
            truncate_src: If True an over-long source is cut to ``max_src_len``,
                otherwise the whole pair is dropped.
            truncate_tgt: Same as ``truncate_src`` but for the reference.
        """
        print("Reading dataset %s..." % filename, end=' ', flush=True)
        self.filename = filename
        self.pairs = []

        with open(filename, 'rt', encoding='utf-8') as reader:
            for line_idx, raw_line in enumerate(reader):
                fields = raw_line.strip().split('<sep>')
                if len(fields) != 2:
                    print("Line %d of %s is malformed." % (line_idx, filename))
                    print(raw_line)
                    continue
                src_text, tgt_text = fields

                src = tokenize(src_text)
                if max_src_len and len(src) > max_src_len:
                    if not truncate_src:
                        continue  # over-long source and truncation disabled: drop the pair
                    src = src[:max_src_len]

                tgt = tokenize(tgt_text)
                if max_tgt_len and len(tgt) > max_tgt_len:
                    if not truncate_tgt:
                        continue  # over-long reference and truncation disabled: drop the pair
                    tgt = tgt[:max_tgt_len]

                self.pairs.append((src, tgt))
        print("%d pairs." % len(self.pairs))

4、构建字典（build_vocab.py）

# -*- coding: utf-8 -*-
'''
@Description: Build the vocabulary (vocab).
'''

from collections import Counter
from utils import count_words
import config
import numpy as np


class BuildVocab(object):
    """Builds a ``Vocab`` from tokenized (source, reference) pairs; the result is stored in ``self.vocab``."""

    def __init__(self, pairs: list = None, embed_file: str = None):
        """
        Build the vocabulary for the data set.

        Args:
            pairs (list, optional): [(src01, tgr01), (src02, tgr02), ...] where each element
                is a token list. Defaults to None, which is treated as an empty list.
            embed_file (str, optional): The file path of the pre-trained embedding word vector.
                Defaults to None (no embeddings are loaded).
        """
        # None instead of a shared mutable [] default.
        pairs = [] if pairs is None else pairs
        self.vocab = Vocab()
        word_counts = Counter()  # word frequency over source + reference tokens
        count_words(word_counts, [src + tgr for src, tgr in pairs])
        # Keep only the top k tokens by frequency, where k is the maximum
        # vocab size set in "config.py"; they are added in frequency order.
        for word, _ in word_counts.most_common(config.max_vocab_size):
            self.vocab.add_words([word])
        if embed_file is not None:
            loaded = self.vocab.load_embeddings(embed_file)
            print("%d pre-trained embeddings loaded." % loaded)


class Vocab(object):
    """Bidirectional token <-> id mapping with optional pre-trained embeddings."""

    # Reserved ids of the special tokens.
    PAD = 0
    SOS = 1
    EOS = 2
    UNK = 3

    def __init__(self):
        """Create a vocabulary that initially holds only the four special tokens."""
        self.word2index = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3}
        self.index2word = {val: key for key, val in self.word2index.items()}
        self.word_count = Counter()
        self.embeddings = None

    def add_words(self, words):
        """Add new tokens to the vocab and map each one to the next free index.

        Args:
            words (list): The list of tokens to be added. Tokens already known
                keep their index; every occurrence is counted in ``word_count``.
        """
        for word in words:
            if word not in self.word2index:
                index = len(self.word2index)
                self.word2index[word] = index
                self.index2word[index] = word
        self.word_count.update(words)

    def __getitem__(self, item):
        """Look up a token by id, or an id by token.

        Args:
            item: An integer id (Python int or numpy integer) or a token.

        Returns:
            The token for an id (None if the id is unknown), or the id for a
            token (``UNK`` if the token is not in the vocabulary).
        """
        # isinstance also accepts numpy integer ids (e.g. produced by argmax);
        # bool is excluded so it keeps being treated as a (missing) token.
        if isinstance(item, (int, np.integer)) and not isinstance(item, bool):
            return self.index2word.get(int(item))
        return self.word2index.get(item, self.UNK)

    def __len__(self):
        return len(self.index2word)

    def size(self):
        """Returns the total size of the vocabulary"""
        return len(self.index2word)

    def load_embeddings(self, embed_file_path: str, dtype=np.float32) -> int:
        """
        Load embedding word vector.

        The file is read as whitespace-separated lines of the form
        ``token v1 v2 ...``. Rows of tokens without a pre-trained vector keep
        a random normal initialisation; the PAD row is all zeros.

        Args:
            embed_file_path (str): The file path of word vector to load.
            dtype (numpy dtype, optional): Defaults to np.float32.
        Returns:
            int: Number of embedded tokens.
        """
        num_embeddings = 0
        vocab_size = len(self)
        with open(embed_file_path, 'rb') as f:
            for raw_line in f:
                parts = raw_line.split()
                if len(parts) < 2:
                    # Blank line or a token without any vector component.
                    continue
                word = parts[0].decode('utf-8')
                idx = self.word2index.get(word)
                if idx is None:
                    continue
                vec = np.array(parts[1:], dtype=dtype)
                if self.embeddings is None:
                    # The matrix is allocated lazily, once the vector size is known.
                    n_dims = len(vec)
                    self.embeddings = np.random.normal(np.zeros((vocab_size, n_dims))).astype(dtype)
                    self.embeddings[self.PAD] = np.zeros(n_dims)
                self.embeddings[idx] = vec
                num_embeddings += 1
        return num_embeddings

5、数据批次化加载（sample_dataset.py）

# -*- coding: utf-8 -*-
'''
@Description: Define the format of data used in the model.
'''

import sys
import pathlib
import torch
from torch.utils.data import Dataset
from utils import sort_batch_by_len, source2ids

# Directory that contains this module; added to sys.path for local imports.
abs_path = pathlib.Path(__file__).parent.absolute()
# Only str entries of sys.path are honoured by the import system, and
# list.append() returns None, so the directory is appended once, as a string.
sys.path.append(str(abs_path))


class SampleDataset(Dataset):
    """
    Dataset over tokenized (source, reference) pairs that yields id sequences.
    """

    def __init__(self, data_pairs, vocab):
        """Split ``data_pairs`` into source and reference token lists and keep ``vocab``."""
        sources, targets = [], []
        for pair in data_pairs:
            sources.append(pair[0])
            targets.append(pair[1])
        self.src_texts = sources
        self.tgt_texts = targets
        self.vocab = vocab
        self._len = len(data_pairs)  # number of samples

    def __getitem__(self, index):
        """Return the sample at ``index`` as a dict of id sequences and lengths.

        ``x`` and ``y`` are wrapped in SOS/EOS, while ``x_len`` and ``y_len``
        are the raw token counts (they do not include SOS/EOS).
        """
        source_tokens = self.src_texts[index]
        target_tokens = self.tgt_texts[index]
        # source2ids returns the source ids together with its out-of-vocabulary tokens.
        src_ids, oovs = source2ids(source_tokens, self.vocab)
        target_ids = [self.vocab[token] for token in target_tokens]
        sos, eos = self.vocab.SOS, self.vocab.EOS
        return {
            'x': [sos] + src_ids + [eos],
            'y': [sos] + target_ids + [eos],
            'x_len': len(source_tokens),
            'y_len': len(target_tokens),
            'oovs': oovs,
            'len_oovs': len(oovs),
        }

    def __len__(self):
        return self._len


def collate_fn(batch):
    """Sort a batch by length and pad its sequences to a common length.

    Args:
        batch: The samples of one batch, passed to ``sort_batch_by_len``.

    Returns:
        tuple: ``(x_padded, y_padded, x_len, y_len, oovs, len_oovs)``
            x_padded (Tensor): Source sequences padded with 0.
            y_padded (Tensor): Reference sequences padded with 0.
            x_len (Tensor): The "x_len" values of the batch.
            y_len (Tensor): The "y_len" values of the batch.
            oovs: The "oovs" entries of the batch, unchanged.
            len_oovs (Tensor): The "len_oovs" values of the batch.
    """

    def pad_to(sequences, width, pad_idx=0):
        # Right-pad every sequence with pad_idx up to `width`.
        rows = [seq + [pad_idx] * (width - len(seq)) for seq in sequences]
        return torch.tensor(rows)

    sorted_batch = sort_batch_by_len(batch)

    sources = sorted_batch["x"]
    longest_src = max(len(seq) for seq in sources)
    targets = sorted_batch["y"]
    longest_tgt = max(len(seq) for seq in targets)

    oovs = sorted_batch["oovs"]
    len_oovs = torch.tensor(sorted_batch["len_oovs"])

    x_padded = pad_to(sources, longest_src)
    y_padded = pad_to(targets, longest_tgt)

    x_len = torch.tensor(sorted_batch["x_len"])
    y_len = torch.tensor(sorted_batch["y_len"])

    return x_padded, y_padded, x_len, y_len, oovs, len_oovs

二、模型构建

# -*- coding: utf-8 -*-
'''
@Description: Define the model.
'''

import os
import sys
import pathlib
import torch
import torch.nn as nn
import torch.nn.functional as F
import config

# Directory that contains this module; added to sys.path for local imports.
abs_path = pathlib.Path(__file__).parent.absolute()
# Only str entries of sys.path are honoured by the import system, and
# list.append() returns None, so the directory is appended once, as a string.
sys.path.append(str(abs_path))


class Encoder(nn.Module):
    """Embedding layer followed by a single bidirectional LSTM."""

    def __init__(self, vocab_size, embed_size, hidden_size, rnn_drop: float = 0):
        super().__init__()
        self.hidden_size = hidden_size
        # Maps token ids (vocab_size possible values) to embed_size-dimensional vectors.
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            batch_first=True,
            bidirectional=True,
            dropout=rnn_drop,
        )

    def forward(self, x):
        """
        Encode a batch of id sequences.

        Args:
            x (Tensor): The input samples as shape (batch_size, seq_len).
        Returns:
            output (Tensor): The output of lstm with shape (batch_size, seq_len, 2 * hidden_size).
            hidden (tuple): The final lstm states (h_n, c_n), each with shape (2, batch_size, hidden_size).
        """
        token_vectors = self.embedding(x)
        lstm_output, final_states = self.lstm(token_vectors)
        return lstm_output, final_states


class Attention(nn.Module):
    """Additive attention from "Get To The Point: Summarization with Pointer-Generator Networks"."""

    def __init__(self, hidden_size):
        super().__init__()
        feature_dim = 2 * hidden_size
        # Feed-forward layers W_h, W_s (with bias) and v of the score function.
        self.Wh_Linear = nn.Linear(feature_dim, feature_dim, bias=False)
        self.Ws_Linear = nn.Linear(feature_dim, feature_dim)
        self.v_Linear = nn.Linear(feature_dim, 1, bias=False)

    def forward(self, decoder_hidden_states, encoder_output, x_padding_masks):
        """
        Compute the context vector and the attention weights for one decoding step.

        Args:
            decoder_hidden_states (tuple): The lstm states (h_n, c_n) of the decoder, each with shape (1, batch_size, hidden_size).
            encoder_output (Tensor): The encoder lstm output with shape (batch_size, seq_len, 2*hidden_size).
            x_padding_masks (Tensor): The padding masks for the input sequences with shape (batch_size, seq_len).
        Returns:
            context_vector (Tensor): Attention-weighted sum of the encoder outputs. The shape is (batch_size, 2*hidden_size).
            attention_weights (Tensor): The shape is (batch_size, seq_length).
        """
        # Decoder state s_t: h and c are concatenated so that the width matches
        # the 2*hidden_size of the bidirectional encoder output.
        h_dec, c_dec = decoder_hidden_states
        s_t = torch.cat((h_dec, c_dec), dim=2).transpose(0, 1)  # (batch_size, 1, 2*hidden_size)
        s_t = s_t.expand_as(encoder_output).contiguous()  # (batch_size, seq_length, 2*hidden_size)

        # Paper eq. (1): score = v * tanh(W_h * h_i + W_s * s_t)
        summed_features = self.Wh_Linear(encoder_output.contiguous()) + self.Ws_Linear(s_t)
        score = self.v_Linear(torch.tanh(summed_features))  # (batch_size, seq_length, 1)

        # Paper eq. (2): softmax over the sequence, then zero the padded
        # positions and renormalise what is left.
        masked_weights = F.softmax(score, dim=1).squeeze(2) * x_padding_masks
        attention_weights = masked_weights / masked_weights.sum(1, keepdim=True)

        # Paper eq. (3): context vector, (batch_size, 1, 2*hidden_size) -> (batch_size, 2*hidden_size)
        context_vector = torch.bmm(attention_weights.unsqueeze(1), encoder_output).squeeze(1)

        return context_vector, attention_weights


class Decoder(nn.Module):
    """Single-step decoder: embedding -> unidirectional LSTM -> two linear layers -> softmax."""

    def __init__(self, vocab_size, embed_size, hidden_size, enc_hidden_size=None, is_cuda=False):
        """
        Args:
            vocab_size (int): Size of the output vocabulary.
            embed_size (int): Dimension of the token embeddings.
            hidden_size (int): Hidden size of the decoder LSTM.
            enc_hidden_size: Accepted for interface compatibility; not used.
            is_cuda (bool): Selects the device stored in ``self.DEVICE``.
        """
        super(Decoder, self).__init__()
        self.DEVICE = torch.device('cuda') if is_cuda else torch.device('cpu')
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)

        # Input of W1 is [decoder output (hidden_size), context vector (2*hidden_size)].
        self.W1_Linear = nn.Linear(self.hidden_size * 3, self.hidden_size)
        self.W2_Linear = nn.Linear(self.hidden_size, vocab_size)

    def forward(self, decoder_input, decoder_hidden_states, encoder_output, context_vector):
        """Define forward propagation for the decoder.

        Args:
            decoder_input (Tensor): The input of the decoder x_t of shape (batch_size, 1).
            decoder_hidden_states (tuple): The hidden states (h_n, c_n) of the decoder from last time step. The shapes are (1, batch_size, hidden_size) for each.
            encoder_output (Tensor): The output from the encoder of shape (batch_size, seq_length, 2*hidden_size). Not used by this method.
            context_vector (Tensor): The context vector from the attention network of shape (batch_size, 2*hidden_size).

        Returns:
            p_vocab (Tensor): The vocabulary distribution of shape (batch_size, vocab_size).
            decoder_states (tuple):
                The lstm states in the decoder.
                The shapes are (1, batch_size, hidden_size) for each.
        """

        decoder_emb = self.embedding(decoder_input)
        decoder_output, decoder_hidden_states = self.lstm(decoder_emb, decoder_hidden_states)

        # (batch_size, 1, hidden_size) -> (batch_size, hidden_size). The
        # instance's own hidden size is used so that the module also works
        # when it was built with a size different from config.hidden_size.
        decoder_output = decoder_output.view(-1, self.hidden_size)

        # Paper eq. (4): P_vocab = softmax(V' * (V * [s_t, h^*] + b) + b')
        # concatenate decoder state and context vector: (batch_size, 3*hidden_size)
        concat_vector = torch.cat([decoder_output, context_vector], dim=-1)
        # (batch_size, hidden_size)
        FF1_out = self.W1_Linear(concat_vector)
        # (batch_size, vocab_size)
        FF2_out = self.W2_Linear(FF1_out)
        # (batch_size, vocab_size)
        P_vocab = F.softmax(FF2_out, dim=1)

        return P_vocab, decoder_hidden_states


class ReduceState(nn.Module):
    """
    Bridges the bidirectional encoder and the unidirectional decoder: the
    encoder states of both directions are summed so that they can be used as
    the initial states of the decoder.
    """

    def __init__(self):
        super().__init__()

    def forward(self, hidden):
        """
        Merge the two directions of the encoder states.

        Args:
            hidden (tuple): Hidden states (h, c) of the encoder, each with shape (2, batch_size, hidden_size).
        Returns:
            tuple: Reduced hidden states, each with shape (1, batch_size, hidden_size).
        """
        h_states, c_states = hidden
        # Summing over dim 0 with keepdim keeps the leading axis at size 1.
        return (
            h_states.sum(dim=0, keepdim=True),
            c_states.sum(dim=0, keepdim=True),
        )


class Seq2seq(nn.Module):
    """Seq2seq summarization model: BiLSTM encoder, attention, LSTM decoder (teacher forcing in ``forward``)."""

    def __init__(self, vocab):
        super(Seq2seq, self).__init__()
        self.DEVICE = torch.device("cuda" if config.is_cuda else "cpu")
        self.vocab = vocab  # vocabulary
        self.attention = Attention(config.hidden_size)  # attention network
        self.encoder = Encoder(len(vocab), config.embed_size, config.hidden_size)  # encoder
        self.decoder = Decoder(len(vocab), config.embed_size, config.hidden_size)  # decoder
        self.reduce_state = ReduceState()  # merges the two encoder directions for the decoder

    def load_model(self):
        """Replace the sub-modules with previously saved ones, if a saved encoder exists.

        Only the existence of ``config.encoder_saved_name`` is checked; the
        other three files are then loaded unconditionally.
        """
        if os.path.exists(config.encoder_saved_name):
            # 'cuda' is the valid device string ('gpu' is rejected by torch.load).
            map_location = 'cuda' if config.is_cuda else 'cpu'
            # NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
            self.encoder = torch.load(config.encoder_saved_name, map_location=map_location)
            self.decoder = torch.load(config.decoder_saved_name, map_location=map_location)
            self.attention = torch.load(config.attention_saved_name, map_location=map_location)
            self.reduce_state = torch.load(config.reduce_state_saved_name, map_location=map_location)

    def forward(self, x, x_len, y, y_len, len_oovs, epoch_idx, batch_idx, is_train):
        """
        Define the forward propagation for the seq2seq model.

        Args:
            x (Tensor): Input sequences as source with shape (batch_size, seq_len).
            x_len: Source lengths of the current batch (only printed).
            y (Tensor): Input sequences as reference with shape (batch_size, y_len), padded with 0.
            y_len: Reference lengths of the current batch (only printed).
            len_oovs: The number of out-of-vocabulary words per sample (not used here).
            epoch_idx (int): Index of the current epoch (used in log output only).
            batch_idx (int): Index of the current batch (used in log output only).
            is_train: Training flag (used in log output only).

        Returns:
            batch_loss (Tensor): The average loss of the current batch.
        """
        print('\n************************************* epoch_idx = {0}；batch_idx = {1}： 向model喂入的数据 *************************************\n'.format(epoch_idx, batch_idx))
        print('batch_idx = ', batch_idx)
        print('x_len = ', x_len)
        print('x = ', x)
        print('y_len = ', y_len)  # y_len is computed by the dataset
        print('y.shape = ', y.shape, '; y.shape[1] = ', y.shape[1])  # all y of one batch share the padded length of the longest one
        print('y = ', y)
        # ----------------------- map out-of-range ids to UNK and build the padding mask -----------------------
        print('\n************************************* 对输入的序列化样本进行处理，并生成本条样本的 padding_mask *************************************\n')
        oov_token = torch.full(x.shape, self.vocab.UNK).long().to(self.DEVICE)  # tensor shaped like x, filled with the UNK id
        x_copy = torch.where(x > len(self.vocab) - 1, oov_token, x)  # ids beyond the vocabulary are replaced by UNK
        x_padding_masks = torch.ne(x_copy, 0).byte().float()  # 1.0 where the id is not PAD (0), else 0.0
        print('\nx_padding_masks.shape = ', x_padding_masks.shape)
        print('x_padding_masks = \n', x_padding_masks)
        # ----------------------- encode the source -----------------------
        encoder_output, encoder_hidden_states = self.encoder(x_copy)
        print('encoder_output.shape = ', encoder_output.shape)

        # ----------------------- the reduced encoder states initialise the decoder (BiLSTM encoder, unidirectional decoder) -----------------------
        decoder_hidden_states = self.reduce_state(encoder_hidden_states)

        # ----------------------- per-step loss for every target token -----------------------
        step_losses = []  # one loss tensor of shape (batch_size,) per decoding step
        for i in range(y.shape[1] - 1):  # iterates over the padded length; padding is masked out below
            print('\n--------------------------------- is_train = {0}; epoch_idx = {1}; batch_idx = {2}, 当前预测时间步 i = {3} ---------------------------------\n'.format(is_train, epoch_idx, batch_idx, i))
            decoder_input_i = y[:, i]  # teacher forcing: token i of every sample is the decoder input
            decoder_target_i = y[:, i + 1]  # token i+1 of every sample is the target
            print('decoder_input_i.shape = ', decoder_input_i.shape, '; decoder_input_i = ', decoder_input_i, '; decoder_input_i.unsqueeze(1).shape = ', decoder_input_i.unsqueeze(1).shape, '\ndecoder_input_i.unsqueeze(1) = \n', decoder_input_i.unsqueeze(1))
            # The value printed after "decoder_target_i = " is the target tensor (it used to show the input tensor).
            print('decoder_target_i.shape = ', decoder_target_i.shape, '; decoder_target_i = ', decoder_target_i, '; decoder_target_i.unsqueeze(1).shape = ', decoder_target_i.unsqueeze(1).shape, '\ndecoder_target_i.unsqueeze(1) = \n', decoder_target_i.unsqueeze(1))
            # ----------------------- attention: context vector and attention weights -----------------------
            context_vector, attention_weights = self.attention(decoder_hidden_states, encoder_output, x_padding_masks)
            # ----------------------- paper eq. (4): vocabulary distribution for step i+1 -----------------------
            p_vocab, decoder_hidden_states = self.decoder(decoder_input_i.unsqueeze(1), decoder_hidden_states, encoder_output, context_vector)  # p_vocab: (batch_size, vocab_size); states: (1, batch_size, hidden_size) each
            print('\np_vocab.shape = ', p_vocab.shape, '----decoder_hidden_states[0].shape = ', decoder_hidden_states[0].shape, '----decoder_hidden_states[1].shape = ', decoder_hidden_states[1].shape)
            # ----------------------- paper eq. (5): probability assigned to the true target token -----------------------
            target_probs = torch.gather(p_vocab, 1, decoder_target_i.unsqueeze(1))  # (batch_size, 1): picks p_vocab[b, target[b]]
            print('target_probs.shape = ', target_probs.shape, '; \ntarget_probs = ', target_probs)
            target_probs = target_probs.squeeze(1)  # (batch_size,)
            print('squeeze之后：target_probs.shape = ', target_probs.shape, '; target_probs = ', target_probs)
            # ----------------------- paper eq. (6): loss_t = -log P(w*_t); config.eps avoids log(0) -----------------------
            loss = -torch.log(target_probs + config.eps)
            print('\nepoch({0})-batch_idx({1})：预测y序列文本中第({2})个时间步的token的损失值: loss.shape = {3}; loss = {4} '.format(epoch_idx, batch_idx, i, loss.shape, loss))
            # ----------------------- zero the loss where the target is padding -----------------------
            mask = torch.ne(decoder_target_i, 0).byte().float()  # 1.0 where the target is not PAD (0)
            print('\nmask.shape = ', mask.shape, '; mask = ', mask)
            loss = loss * mask  # element-wise
            print('经过mask后：loss.shape = ', loss.shape, '; loss = ', loss)
            step_losses.append(loss)

        print('\nlen(step_losses) =', len(step_losses), '; step_losses[0].shape = ', step_losses[0].shape)
        stack_losses = torch.stack(step_losses, 1)  # (batch_size, number of steps)
        print('stack_losses.shape = ', stack_losses.shape, '; \nstack_losses = \n', stack_losses)
        sum_losses = torch.sum(stack_losses, 1)
        print('sum_losses.shape = ', sum_losses.shape, '; sum_losses = ', sum_losses)  # (batch_size,): total loss of each sample
        # Number of non-padded positions of each y. This counts every non-PAD
        # column of y, including column 0, which is only used as decoder input.
        seq_len_mask = torch.ne(y, 0).byte().float()
        print('seq_len_mask.shape = ', seq_len_mask.shape)
        print('seq_len_mask = \n', seq_len_mask)
        batch_seq_len = torch.sum(seq_len_mask, dim=1)
        print('batch_seq_len.shape = ', batch_seq_len.shape, '; batch_seq_len = ', batch_seq_len)
        batch_losses = sum_losses / batch_seq_len  # per-sample loss normalised by its length
        batch_loss = torch.mean(batch_losses)  # mean over the samples of the batch
        print('batch_loss = ', batch_loss)

        return batch_loss

三、模型训练

# -*- coding: utf-8 -*-
'''
@Description: Train the model.
'''

import os
import sys
import pathlib
import numpy as np
import pickle
from torch.utils.data import DataLoader
from torch import optim
from torch.nn.utils import clip_grad_norm_
from dataset import PairsOfSrcTgt, BuildVocab, SampleDataset, collate_fn
import torch
import config
from tqdm import tqdm, trange
from model import Seq2seq
from tensorboardX import SummaryWriter

# Directory that contains this module; added to sys.path for local imports.
abs_path = pathlib.Path(__file__).parent.absolute()
# Only str entries of sys.path are honoured by the import system, and
# list.append() returns None, so the directory is appended once, as a string.
sys.path.append(str(abs_path))


def train(pairs_train, pairs_val, vocab, start_epoch=0):
    """Train the model, evaluate it after every epoch and checkpoint it.

    Each epoch runs one pass over the training data followed by one pass over
    the validation data (under ``torch.no_grad()`` and in eval mode).  The
    encoder, decoder, attention and reduce_state sub-modules are saved whenever
    the mean validation loss drops below the best value seen so far, and that
    best value is pickled to ``config.prev_val_loss_path`` after every epoch.

    Args:
        pairs_train: Training (src, tgt) text pairs, as produced by
            ``PairsOfSrcTgt(...).pairs``.
        pairs_val: Validation (src, tgt) text pairs, same format.
        vocab: The vocabulary built from the training pairs, as produced by
            ``BuildVocab(...).vocab``.
        start_epoch (int, optional): The starting epoch number. Defaults to 0.
    """
    print('\n\n\n ****************************** loading model ******************************\n')
    DEVICE = torch.device("cuda" if config.is_cuda else "cpu")

    # Wrap the raw pairs into datasets and batch them.
    dataset_train = SampleDataset(pairs_train, vocab)
    dataset_valid = SampleDataset(pairs_val, vocab)
    print('len(dataset_train) = {0}----dataset_train = {1}'.format(len(dataset_train), dataset_train))
    print('len(dataset_valid) = {0}----dataset_valid = {1}'.format(len(dataset_valid), dataset_valid))

    dataloader_train = DataLoader(dataset=dataset_train, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)
    dataloader_valid = DataLoader(dataset=dataset_valid, batch_size=config.batch_size, shuffle=True, pin_memory=True, drop_last=True, collate_fn=collate_fn)
    print('\nlen(dataloader_train) = {0}----dataloader_train = {1}'.format(len(dataloader_train), dataloader_train))
    print('len(dataloader_valid) = {0}----dataloader_valid = {1}'.format(len(dataloader_valid), dataloader_valid))

    # ----------------------- Initialisation -----------------------
    model = Seq2seq(vocab)
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)
    prev_val_loss = np.inf  # best validation loss seen so far

    # ----------------------- Resume from a previous run, if any -----------------------
    # NOTE(review): load_model() presumably restores previously saved weights
    # when they exist -- confirm against model.py.
    model.load_model()
    if os.path.exists(config.prev_val_loss_path):
        # NOTE: pickle is acceptable here only because this file is written by
        # this very script (see the dump at the end of each epoch); never point
        # prev_val_loss_path at untrusted data.
        with open(config.prev_val_loss_path, 'rb') as f:
            prev_val_loss = pickle.load(f)

    model.to(DEVICE)

    # TensorboardX log writers (view with: tensorboard --logdir ./runs/baseline).
    writer_batch = SummaryWriter(config.log_path_batch_train)  # running mean train loss, logged every 5 batches
    writer_epoch_train = SummaryWriter(config.log_path_epoch_train)  # mean train loss per epoch
    writer_epoch_valid = SummaryWriter(config.log_path_epoch_valid)  # mean validation loss per epoch

    # try/finally guarantees the writers are flushed and closed even when
    # training is interrupted by an exception.
    try:
        # ----------------------- Training loop -----------------------
        print('\n ***************************************** 开始训练 *****************************************\n')
        epoch_progress_bar = tqdm(range(start_epoch, config.epochs))
        epoch_progress_bar.set_description('Epoch Loss: ')
        for epoch_idx in epoch_progress_bar:
            print('\n ***************************************** epoch_idx = {0} *****************************************\n'.format(epoch_idx))
            batch_losses = []  # loss of every batch processed so far in this epoch
            batch_progress_bar = tqdm(dataloader_train)
            # The replacement field is kept on one line: a newline inside a
            # single-quoted f-string is a SyntaxError before Python 3.12.
            batch_progress_bar.set_description(f'Epoch_idx = {epoch_idx},')
            for batch_idx, data in enumerate(batch_progress_bar):
                # x / y: index-encoded source / target batches; oovs: the
                # out-of-vocabulary words of each sample in the batch.
                x, y, x_len, y_len, oovs, len_oovs = data
                assert not np.any(np.isnan(x.numpy()))
                if config.is_cuda:  # Training with GPUs.
                    x = x.to(DEVICE)
                    y = y.to(DEVICE)
                    x_len = x_len.to(DEVICE)
                    len_oovs = len_oovs.to(DEVICE)
                # Re-enter training mode on every batch; this also undoes the
                # eval() switch made for the previous epoch's validation pass.
                model.train()

                # Gradients accumulate by default, so clear them per batch.
                optimizer.zero_grad()

                # Forward pass: the model returns the loss of this batch.
                loss = model(x=x, x_len=x_len, y=y, y_len=y_len, len_oovs=len_oovs, epoch_idx=epoch_idx, batch_idx=batch_idx, is_train=True)
                batch_losses.append(loss.item())

                # Backward pass.
                loss.backward()

                # Clip gradients per sub-module to guard against exploding gradients.
                clip_grad_norm_(model.encoder.parameters(), config.max_grad_norm)
                clip_grad_norm_(model.decoder.parameters(), config.max_grad_norm)
                clip_grad_norm_(model.attention.parameters(), config.max_grad_norm)

                # Parameter update.
                optimizer.step()

                # Refresh the per-batch progress bar.
                batch_progress_bar.set_postfix(Batch_idx=batch_idx, Loss_Of_This_Batch=loss.item())

                if batch_idx % 5 == 0:
                    # Every 5th batch, log the running mean of this epoch's
                    # batch losses (tag = chart name, value = y, global_step = x).
                    writer_batch.add_scalar(f'Average loss for epoch {epoch_idx}', np.mean(batch_losses), global_step=batch_idx)

            # ----------------------- Mean training loss of this epoch -----------------------
            curr_epoch_train_loss = np.mean(batch_losses)
            epoch_progress_bar.set_postfix(Loss_Of_This_Epoch=curr_epoch_train_loss)

            # ----------------------- Validation pass -----------------------
            val_loss = []
            # Switch dropout-style layers to inference behaviour so the
            # validation loss is not perturbed by training-only noise.
            model.eval()
            with torch.no_grad():
                print('\n************************************************************ 开始进入验证 ************************************************************')
                for batch_idx, data in enumerate(tqdm(dataloader_valid)):
                    x, y, x_len, y_len, oovs, len_oovs = data
                    if config.is_cuda:
                        x = x.to(DEVICE)
                        y = y.to(DEVICE)
                        x_len = x_len.to(DEVICE)
                        len_oovs = len_oovs.to(DEVICE)
                    loss = model(x=x, x_len=x_len, y=y, y_len=y_len, len_oovs=len_oovs, epoch_idx=epoch_idx, batch_idx=batch_idx, is_train=False)
                    val_loss.append(loss.item())
            curr_epoch_valid_loss = np.mean(val_loss)

            writer_epoch_train.add_scalar('Loss Of Train for All epoch', curr_epoch_train_loss, global_step=epoch_idx)
            writer_epoch_valid.add_scalar('Loss Of Val for All epoch', curr_epoch_valid_loss, global_step=epoch_idx)

            print('\n\n☆☆☆☆☆☆☆☆  第{0}个epoch: curr_epoch_train_loss = {1}; curr_epoch_valid_loss = {2}'.format(epoch_idx, curr_epoch_train_loss, curr_epoch_valid_loss))

            # Checkpoint only when the validation loss improved on the best so far.
            if curr_epoch_valid_loss < prev_val_loss:
                torch.save(model.encoder, config.encoder_saved_name)
                torch.save(model.decoder, config.decoder_saved_name)
                torch.save(model.attention, config.attention_saved_name)
                torch.save(model.reduce_state, config.reduce_state_saved_name)
                prev_val_loss = curr_epoch_valid_loss
            # Persist the best loss every epoch so a later run can resume from it.
            with open(config.prev_val_loss_path, 'wb') as f:
                pickle.dump(prev_val_loss, f)
    finally:
        writer_batch.close()
        writer_epoch_train.close()
        writer_epoch_valid.close()


if __name__ == "__main__":
    # Select the training device.
    DEVICE = torch.device('cuda' if config.is_cuda else 'cpu')

    # Length / truncation settings shared by the training and validation readers.
    pair_options = dict(
        max_src_len=config.max_src_len,
        max_tgt_len=config.max_tgt_len,
        truncate_src=config.truncate_src,
        truncate_tgt=config.truncate_tgt,
    )

    # Read the training (src, tgt) text pairs; every line of the data file
    # has the form src + <sep> + tgt.
    pairs_train = PairsOfSrcTgt(config.data_train_path, **pair_options).pairs
    print('len(pairs_train) = ', len(pairs_train))

    # Build the vocabulary from the training pairs only.
    vocab = BuildVocab(pairs_train, embed_file=config.embed_file).vocab
    print('len(vocab) = ', len(vocab))
    print('vocab.vocab.__getitem__(3) = ', vocab.__getitem__(3))

    # Read the validation (src, tgt) text pairs with the same settings.
    pairs_val = PairsOfSrcTgt(config.data_val_path, **pair_options).pairs
    print('len(pairs_val) = ', len(pairs_val))

    train(pairs_train, pairs_val, vocab, start_epoch=0)

打印结果：

Reading dataset data/processed_data/train.txt... 43996 pairs.
len(pairs_train) =  43996
len(vocab) =  20004
vocab.vocab.__getitem__(3) =  <UNK>
Reading dataset ./data/processed_data/dev.txt... 5000 pairs.
len(pairs_val) =  5000
 ****************************** loading model ******************************
len(dataset_train) = 43996----dataset_train = <sample_dataset.SampleDataset object at 0x00000218344F8438>
len(dataset_valid) = 5000----dataset_valid = <sample_dataset.SampleDataset object at 0x00000218382238D0>
len(dataloader_train) = 5500----dataloader_train = <torch.utils.data.dataloader.DataLoader object at 0x000002186F2FF208>
len(dataloader_valid) = 625----dataloader_valid = <torch.utils.data.dataloader.DataLoader object at 0x000002187B137978>
 ***************************************** 开始训练 *****************************************
 ***************************************** epoch_idx = 0 *****************************************

************************************* epoch_idx = 0；batch_idx = 0： 向model喂入的数据 *************************************
batch_idx =  0
x_len =  tensor([197, 121,  85])
x =  tensor([[  1, 403, 404,  77, 174,  56, 405, 406,  81, 407,   9, 408, 174,  79,
         175, 409,  25,  45, 152,  25, 410, 176,  35, 411,  72, 412, 413, 414,
         162, 415,   6, 177,  30,  82, 416, 117, 163, 417,  20,  29,  26, 418,
          81,   4, 419, 178,   4,   9,   4,  73,   6,   4, 179, 420,  41,   4,
          14, 421,   4,  14, 153,   4,  17,   4,  40,   8,   4,  45,   4, 422,
           5, 180,   4,  44, 423, 424,   4, 425, 426, 427, 179,   4, 181,   4,
          70,  27,  28,   4,  80,   6,   4,  83, 428,   4,  67,   9,  42,  82,
         182,   4,  14,  35,  28, 181,  10,  45,  10, 429,  10, 430,  10, 431,
           4,  54,  27,  28,   4, 432, 433, 183,   4, 434, 435, 436,   4,  31,
          27,   4, 437,  27,  28,   4, 168, 178,  61,   4, 438,  42, 184,   5,
         439,   4, 142, 440,  89,   4, 441, 442,   4, 443, 183,   5, 444,   4,
          83,  73,   4,  45,   4, 185,  59,   4, 185,  22,   4, 445,   4, 446,
         447,   5,  82,   4, 448,   4, 101,   4, 449,   4,  65,   4, 450,   4,
         166,   4, 451,   4, 452,   4, 176,   4, 453,   4, 454, 180,   4, 455,
           2],
        [  1,  89, 193,  90,  91,  48,  92, 194, 195,  93,  94,  95,  49,  50,
          16,  29, 196,  17, 197, 198,  51,  52,   6,  18,  30, 199,  19,  18,
          96, 200,  97, 201,  53,  95,  20,  29,  54,  98,  31, 202,  99,  94,
          21,  93, 203,  48,   4, 204, 205,   9,   6,   4,  55, 206,   4, 207,
         208, 209,   4,  90,   4, 100,   4, 100,   4,  56,   4, 210,   4, 101,
           4, 102,   6, 103,  57, 211, 104, 212,   4,  58, 213,  21, 214,   5,
           6,   4,  32, 105,  10, 106, 215,  10, 216,  10, 217,  10, 218, 107,
         219,   4, 220, 221,  13,   5,   4, 222, 223, 224,   5, 108,   4, 225,
           4, 226,   4,  53,   4,  22,   4,  59,   2,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0],
        [  1, 323, 114, 144, 145, 146,  25, 147, 324, 148,  49, 149, 145, 150,
         151,  54,  98,  31, 325,  53, 144,  51, 326,  96, 327, 152,  25,  20,
          29,  99, 328, 329,  18,  97, 330,   6,  18,  72, 331,  19, 102,  30,
         332,  24,   8,   4,  40,   8,   4,   6, 153,   4, 333, 334,   4,  71,
         335,   4, 336,  36,   4, 337,  73,   4, 131, 338,   4,  51, 339,   4,
         340, 154,   4, 341, 342,   4,  74, 343,   4,  74, 344,   4,  74, 345,
           2,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0]])
y_len =  tensor([38, 48, 54])
y.shape =  torch.Size([3, 54]) ; y.shape[1] =  54
y =  tensor([[  1, 186,  13, 456, 457,  79, 175,  11,   4, 458,  61,   4, 459, 460,
         461, 462,   7,  68, 177,   6,   4, 172, 463,   4,  84, 464,   4, 465,
         466, 467,  83,   4, 173,   9,  41, 109,   7,   2,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  1, 227, 228, 109,   5, 229,  48, 230, 231,   4, 232, 233, 234,   5,
         108,   4,  60,  23, 110, 235, 236, 237, 238,   4, 239, 240,   5,  92,
         241, 242,   7,  13, 243,   5,  52,  11,   4,   9,  61, 111,   4, 244,
          33,   9,   5, 112,   7,   2,   0,   0,   0,   0,   0,   0],
        [  1, 346,  15,  41, 112,   5, 146, 148,   4, 149, 150,   5,  31,  11,
           4, 155, 347,   4, 348, 349,  49, 350,   5, 351, 352, 353,   7, 354,
         126,   5, 355,   4, 356, 357,   4,  67, 156, 358, 359, 157, 158, 360,
           7, 361, 151, 362, 363, 147,  21,   4, 159,  16,   7,   2]])
************************************* 对输入的序列化样本进行处理，并生成本条样本的 padding_mask *************************************
x_padding_masks.shape =  torch.Size([3, 197])
x_padding_masks = 
 tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

encoder_output.shape =  torch.Size([3, 197, 1024])

--------------------------------- is_train = True; epoch_idx = 0; batch_idx = 0, 当前预测时间步 i = 0 ---------------------------------

decoder_input_i.shape =  torch.Size([3]) ; decoder_input_i =  tensor([1, 1, 1]) ; decoder_input_i.unsqueeze(1).shape =  torch.Size([3, 1]) 
decoder_input_i.unsqueeze(1) = 
 tensor([[1],
        [1],
        [1]])
decoder_target_i.shape =  torch.Size([3]) ; decoder_target_i =  tensor([1, 1, 1]) ; decoder_target_i.unsqueeze(1).shape =  torch.Size([3, 1]) 
decoder_target_i.unsqueeze(1) = 
 tensor([[186],
        [227],
        [346]])
p_vocab.shape =  torch.Size([3, 1003]) ----decoder_states[0].shape =  torch.Size([1, 3, 512]) ----decoder_states[1].shape =  torch.Size([1, 3, 512])
target_probs.shape =  torch.Size([3, 1]) ; 
target_probs =  tensor([[5.2464e-05],
        [5.7170e-05],
        [2.6298e-05]], grad_fn=<GatherBackward>)
squeeze之后：target_probs.shape =  torch.Size([3]) ; target_probs =  tensor([5.2464e-05, 5.7170e-05, 2.6298e-05], grad_fn=<SqueezeBackward1>)
epoch(0)-batch_idx(0)：预测y序列文本中第(0)个时间步的token的损失值: loss.shape = torch.Size([3]); loss = tensor([ 9.8554,  9.7695, 10.5460], grad_fn=<NegBackward>) 
mask.shape =  torch.Size([3]) ; mask =  tensor([1., 1., 1.])
经过mask后：loss.shape =  torch.Size([3]) ; loss =  tensor([ 9.8554,  9.7695, 10.5460], grad_fn=<MulBackward0>)

--------------------------------- is_train = True; epoch_idx = 0; batch_idx = 0, 当前预测时间步 i = 1 ---------------------------------

decoder_input_i.shape =  torch.Size([3]) ; decoder_input_i =  tensor([186, 227, 346]) ; decoder_input_i.unsqueeze(1).shape =  torch.Size([3, 1]) 
decoder_input_i.unsqueeze(1) = 
 tensor([[186],
        [227],
        [346]])
decoder_target_i.shape =  torch.Size([3]) ; decoder_target_i =  tensor([186, 227, 346]) ; decoder_target_i.unsqueeze(1).shape =  torch.Size([3, 1]) 
decoder_target_i.unsqueeze(1) = 
 tensor([[ 13],
        [228],
        [ 15]])
p_vocab.shape =  torch.Size([3, 1003]) ----decoder_states[0].shape =  torch.Size([1, 3, 512]) ----decoder_states[1].shape =  torch.Size([1, 3, 512])
target_probs.shape =  torch.Size([3, 1]) ; 
target_probs =  tensor([[2.9507e-05],
        [2.9505e-05],
        [1.0280e-03]], grad_fn=<GatherBackward>)
squeeze之后：target_probs.shape =  torch.Size([3]) ; target_probs =  tensor([2.9507e-05, 2.9505e-05, 1.0280e-03], grad_fn=<SqueezeBackward1>)
epoch(0)-batch_idx(0)：预测y序列文本中第(1)个时间步的token的损失值: loss.shape = torch.Size([3]); loss = tensor([10.4309, 10.4309,  6.8801], grad_fn=<NegBackward>) 
mask.shape =  torch.Size([3]) ; mask =  tensor([1., 1., 1.])
经过mask后：loss.shape =  torch.Size([3]) ; loss =  tensor([10.4309, 10.4309,  6.8801], grad_fn=<MulBackward0>)

......

--------------------------------- is_train = True; epoch_idx = 0; batch_idx = 0, 当前预测时间步 i = 36 ---------------------------------
decoder_input_i.shape =  torch.Size([3]) ; decoder_input_i =  tensor([  7,   4, 156]) ; decoder_input_i.unsqueeze(1).shape =  torch.Size([3, 1]) 
decoder_input_i.unsqueeze(1) = 
 tensor([[  7],
        [  4],
        [156]])
decoder_target_i.shape =  torch.Size([3]) ; decoder_target_i =  tensor([  7,   4, 156]) ; decoder_target_i.unsqueeze(1).shape =  torch.Size([3, 1]) 
decoder_target_i.unsqueeze(1) = 
 tensor([[  2],
        [  9],
        [358]])
p_vocab.shape =  torch.Size([3, 1003]) ----decoder_states[0].shape =  torch.Size([1, 3, 512]) ----decoder_states[1].shape =  torch.Size([1, 3, 512])
target_probs.shape =  torch.Size([3, 1]) ; 
target_probs =  tensor([[3.0631e-03],
        [1.1222e-03],
        [1.7396e-05]], grad_fn=<GatherBackward>)
squeeze之后：target_probs.shape =  torch.Size([3]) ; target_probs =  tensor([3.0631e-03, 1.1222e-03, 1.7396e-05], grad_fn=<SqueezeBackward1>)
epoch(0)-batch_idx(0)：预测y序列文本中第(36)个时间步的token的损失值: loss.shape = torch.Size([3]); loss = tensor([ 5.7883,  6.7925, 10.9593], grad_fn=<NegBackward>) 
mask.shape =  torch.Size([3]) ; mas