Learning PyTorch's Official Transformer Example Code

Introduction

PyTorch provides a simple official example project that trains RNN-family models and a Transformer model for text generation. The code is organized into five parts:

  • data.py: processes the text data and builds the word-index mappings;
  • model.py: defines the RNN-family models and the Transformer model on top of PyTorch;
  • main.py: trains and evaluates a model on the dataset;
  • generate.py: generates text with a trained model;
  • data files: English text from Wikipedia (WikiText-2), split into train.txt, valid.txt and test.txt;

Below I annotate the code line by line as a learning reference.
Official GitHub link for the project: Transformer example code repository
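For orientation, typical invocations look like this (the flag values are only illustrative; every flag is defined in the argparse sections of main.py and generate.py annotated below):

python main.py --cuda --epochs 6 --model Transformer --lr 5   # train a Transformer language model on WikiText-2
python main.py --cuda --epochs 6 --tied                       # train a weight-tied LSTM instead
python generate.py --cuda --words 1000                        # sample 1000 words from the saved checkpoint model.pt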

data.py

This module reads the text files, builds the vocabulary (the word-to-index and index-to-word mappings), and then converts the text into a vector of word indices. Tokenization is simply splitting on whitespace. A small usage sketch follows the listing below.

import os
from io import open
import torch

"""
This file processes the text dataset
and builds the vocabulary.
"""


# A class that stores words and their indices
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}  # word -> index mapping
        self.idx2word = []  # index -> word mapping

    # Add a new word to word2idx and idx2word; return the word's index
    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    # Return the size of the dictionary, so that len(mydict) gives the number of stored words
    def __len__(self):
        return len(self.idx2word)


# A class that processes the text data
class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        # Call tokenize on the train/valid/test files under `path` to produce index vectors
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    # Tokenize a file and convert its words to indices
    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)  # make sure the path exists
        # Add words to the dictionary: every word in the file at `path` goes into self.dictionary
        with open(path, 'r', encoding="utf8") as f:  # open the text file at `path`
            for line in f:  # iterate over every line
                words = line.split() + ['<eos>']  # split on whitespace and append the end-of-sentence marker '<eos>'
                for word in words:
                    # add every word of the line to the dictionary
                    self.dictionary.add_word(word)

        # Tokenize file content: convert each sentence into a list of indices
        with open(path, 'r', encoding="utf8") as f:  # open the text file again
            idss = []  # collects the index tensors of all lines
            for line in f:  # iterate over every line
                words = line.split() + ['<eos>']  # split on whitespace and append '<eos>'
                ids = []  # indices of the current line
                for word in words:  # look up the index of every word and append it to ids
                    ids.append(self.dictionary.word2idx[word])
                idss.append(torch.tensor(ids).type(torch.int64))  # turn the line's indices into an int64 tensor and collect it
            ids = torch.cat(idss)  # concatenate all line tensors along dimension 0 into one long tensor
        return ids  # return the index vector for the whole file
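To see the pieces above in action, here is a minimal usage sketch (it assumes the WikiText-2 files sit under ./data/wikitext-2, the default path used by main.py below):

import data

corpus = data.Corpus('./data/wikitext-2')      # builds the vocabulary and tokenizes all three splits
print(len(corpus.dictionary))                  # vocabulary size, including the '<eos>' marker
print(corpus.train.shape, corpus.train.dtype)  # one long 1-D int64 tensor for the whole training split
# recover the first ten tokens of train.txt from their indices
print([corpus.dictionary.idx2word[i] for i in corpus.train[:10].tolist()])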

model.py

This module builds the RNN-family models and the Transformer model on top of PyTorch's nn module.

RNN Model

The RNNModel class builds a stacked recurrent network whose cell type is chosen by rnn_type ('LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU').
The forward pass looks roughly like this:
-Embedding-
-Dropout-
-RNNs-
-Dropout-
-Linear (decoder)-
-Log_softmax-
Dropout randomly deactivates network units during training; it mitigates overfitting and improves the model's ability to generalize.
Log_softmax fuses the softmax and the logarithm into a single operation. When the output feeds a negative log-likelihood loss, this avoids the slowness and numerical instability of computing softmax first and taking the log afterwards:

    log_softmax(x)_i = log( exp(x_i) / Σ_j exp(x_j) ) = x_i - log( Σ_j exp(x_j) )
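A quick numerical check of that claim (a throwaway sketch, not part of the project):

import torch
import torch.nn.functional as F

x = torch.tensor([[1000.0, 0.0, -1000.0]])  # extreme logits
print(torch.log(F.softmax(x, dim=1)))       # tensor([[0., -inf, -inf]]): the small probabilities underflow to 0, so the log becomes -inf
print(F.log_softmax(x, dim=1))              # tensor([[0., -1000., -2000.]]): the fused version keeps the exact values

With that in mind, here is the annotated model code.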

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        """
        rnn_type: type of recurrent network
        ntoken: vocabulary size
        ninp: dimension of the input embeddings
        nhid: dimension of the hidden state
        nlayers: number of stacked RNN layers
        dropout: dropout probability
        tie_weights: whether to tie the decoder weights to the encoder (embedding) weights
        """
        super(RNNModel, self).__init__()  # call the parent constructor; in Python 3 `super().__init__()` also works
        self.ntoken = ntoken
        self.drop = nn.Dropout(dropout)  # dropout layer
        self.encoder = nn.Embedding(ntoken, ninp)  # embedding layer
        # build the recurrent layer that matches the requested model type
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError as e:
                raise ValueError("""An invalid option for `--model` was supplied,
                                    options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""") from e
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)  # linear layer mapping hidden states back to vocabulary logits

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()  # initialize the parameters
        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    # initialize the model parameters
    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)  # encoder weights ~ U(-0.1, 0.1)
        nn.init.zeros_(self.decoder.bias)  # decoder bias set to zero
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)  # decoder weights ~ U(-0.1, 0.1)

    # forward pass
    def forward(self, input, hidden):
        """
        input: input tokens for the current time steps
        hidden: hidden state carried over from the previous step
        """
        emb = self.drop(self.encoder(input))  # look up the word embeddings, then apply dropout
        output, hidden = self.rnn(emb, hidden)  # run the RNN; output is h_t of the top layer at every time step
        output = self.drop(output)  # apply dropout to the RNN output
        decoded = self.decoder(output)  # project the output to vocabulary logits
        decoded = decoded.view(-1, self.ntoken)  # reshape to (seq_len * batch_size, ntoken)
        return F.log_softmax(decoded, dim=1), hidden  # return the log-probabilities and the new hidden state for the next step

    # create an initial (zero) hidden state
    def init_hidden(self, bsz):
        weight = next(self.parameters())  # grab any model parameter, just to copy its dtype and device
        if self.rnn_type == 'LSTM':
            # an LSTM needs two state tensors: the hidden state and the cell state (specific to LSTMs)
            # new_zeros() creates a zero tensor with the same dtype/device as `weight`
            return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                    weight.new_zeros(self.nlayers, bsz, self.nhid))
        else:
            return weight.new_zeros(self.nlayers, bsz, self.nhid)

Transformer Model

This part builds the Transformer model and the positional-encoding module on top of nn and nn.Transformer.

# This class implements the positional encoding used by the Transformer
class PositionalEncoding(nn.Module):
    r"""Inject some information about the relative or absolute position of the tokens in the sequence.
        The positional encodings have the same dimension as the embeddings, so that the two can be summed.
        Here, we use sine and cosine functions of different frequencies.
    .. math:
        \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model))
        \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model))
        \text{where pos is the word position and i is the embed idx)
    Args:
        d_model: the embed dim (required).
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).
    Examples:
        >>> pos_encoder = PositionalEncoding(d_model)
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        """
        d_model: embedding dimension
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)  # all-zero tensor of shape max_len (maximum sequence length) x d_model
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # positions 0 .. max_len-1 as a column vector
        # this implements the 1/10000^(2i/d_model) factor
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # even embedding dimensions get the sine encoding, odd dimensions the cosine encoding
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)  # reshape pe: max_len x d_model  --->  max_len x 1 x d_model
        # pe is fixed, so register it as a buffer: it is saved with the module and moved to the right device,
        # but it is not a trainable parameter
        self.register_buffer('pe', pe)

    def forward(self, x):
        r"""Inputs of forward function
        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        Examples:
            >>> output = pos_encoder(x)
        """
        x = x + self.pe[:x.size(0), :]  # add the positional encoding to the word embeddings element-wise
        return self.dropout(x)  # apply dropout


class TransformerModel(nn.Transformer):
    """Container module with an encoder, a recurrent or transformer module, and a decoder."""
    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        """
        ntoken: vocabulary size
        ninp: dimension of the input embeddings
        nhead: number of heads in the multi-head self-attention modules
        nhid: dimension of the feed-forward hidden layer
        nlayers: number of encoder layers
        dropout: dropout probability
        """
        # call the parent nn.Transformer constructor to build the underlying Transformer
        super(TransformerModel, self).__init__(d_model=ninp, nhead=nhead, dim_feedforward=nhid, num_encoder_layers=nlayers)
        self.model_type = 'Transformer'
        self.src_mask = None  # causal mask, created lazily in forward()
        self.pos_encoder = PositionalEncoding(ninp, dropout)  # positional-encoding layer (instance of the class above)
        self.input_emb = nn.Embedding(ntoken, ninp)  # embedding layer
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)  # linear layer projecting back to vocabulary logits
        self.init_weights()  # initialize the parameters

    def _generate_square_subsequent_mask(self, sz):
        """
        sz: size of dimension 0 of the input (the sequence length)
        return: an sz x sz tensor that is 0 on and below the diagonal and -inf above it
                (see the small demo after this listing)
        """
        # torch.tril() keeps the lower triangle; log(1) = 0 and log(0) = -inf give the additive causal mask
        return torch.log(torch.tril(torch.ones(sz, sz)))

    # initialize the embedding and decoder parameters
    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.input_emb.weight, -initrange, initrange)  # U(-0.1, 0.1)
        nn.init.zeros_(self.decoder.bias)  # all zeros
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)  # U(-0.1, 0.1)

    def forward(self, src, has_mask=True):
        """
        src: input token indices
        """
        if has_mask:  # causal masking requested
            device = src.device  # device that src lives on
            if self.src_mask is None or self.src_mask.size(0) != len(src):  # rebuild the mask if it is missing or the wrong size
                mask = self._generate_square_subsequent_mask(len(src)).to(device)  # build the causal mask
                self.src_mask = mask
        else:
            self.src_mask = None

        # embed the token indices; the sqrt(ninp) scaling follows the original Transformer paper and keeps the
        # embeddings on a comparable scale to the positional encodings added next
        src = self.input_emb(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)  # add the positional encodings
        output = self.encoder(src, mask=self.src_mask)  # run the Transformer encoder (inherited from nn.Transformer)
        output = self.decoder(output)  # project to vocabulary logits
        return F.log_softmax(output, dim=-1)  # log-probabilities along the last dimension
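To make the torch.log(torch.tril(...)) trick in _generate_square_subsequent_mask concrete, here is what the additive causal mask looks like for a sequence of length 3 (a standalone snippet, not part of the project):

import torch

mask = torch.log(torch.tril(torch.ones(3, 3)))
print(mask)
# tensor([[0., -inf, -inf],
#         [0., 0., -inf],
#         [0., 0., 0.]])
# Added to the attention scores before the softmax, the -inf entries remove all attention
# to future positions, so position t can only attend to positions 0..t.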

main.py

The main module handles mini-batch training, evaluation on the validation set, and saving of the model.
During training the input is a seq_len x batch_size block of token indices; the target is the same block shifted forward by one time step and flattened into a 1-D vector. A small worked example of the batching helpers appears after the get_batch function below.

import argparse
import time
import math
import os
import torch
import torch.nn as nn
import torch.onnx

import data
import model

# parse the command-line arguments with argparse
parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 RNN/LSTM/GRU/Transformer Language Model')
parser.add_argument('--data', type=str, default='./data/wikitext-2',
                    help='location of the data corpus')  # location of the text data
parser.add_argument('--model', type=str, default='LSTM',
                    help='type of network (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)')  # model type
parser.add_argument('--emsize', type=int, default=200,
                    help='size of word embeddings')  # word-embedding dimension
parser.add_argument('--nhid', type=int, default=200,
                    help='number of hidden units per layer')  # hidden-state size
parser.add_argument('--nlayers', type=int, default=2,
                    help='number of layers')  # number of layers
parser.add_argument('--lr', type=float, default=20,
                    help='initial learning rate')  # initial learning rate
parser.add_argument('--clip', type=float, default=0.25,
                    help='gradient clipping')  # threshold used for gradient clipping
parser.add_argument('--epochs', type=int, default=40,
                    help='upper epoch limit')  # number of epochs, i.e. full passes over the training set
parser.add_argument('--batch_size', type=int, default=20, metavar='N',
                    help='batch size')  # batch size
parser.add_argument('--bptt', type=int, default=35,
                    help='sequence length')  # backprop-through-time length, i.e. how many time steps per update
parser.add_argument('--dropout', type=float, default=0.2,
                    help='dropout applied to layers (0 = no dropout)')  # dropout probability
parser.add_argument('--tied', action='store_true',
                    help='tie the word embedding and softmax weights')  # tie the embedding and softmax weights
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')  # random seed
parser.add_argument('--cuda', action='store_true', default=False,
                    help='use CUDA')  # whether to use CUDA
parser.add_argument('--mps', action='store_true', default=False,
                        help='enables macOS GPU training')  # whether to use MPS (macOS GPU)
parser.add_argument('--log-interval', type=int, default=200, metavar='N',
                    help='report interval')  # logging interval; by default report every 200 batches
parser.add_argument('--save', type=str, default='model.pt',
                    help='path to save the final model')  # save the model parameters to model.pt
parser.add_argument('--onnx-export', type=str, default='',
                    help='path to export the final model in onnx format')  # path for exporting the model to ONNX
parser.add_argument('--nhead', type=int, default=2,
                    help='the number of heads in the encoder/decoder of the transformer model')  # number of attention heads
parser.add_argument('--dry-run', action='store_true',
                    help='verify the code and the model')  # only check that the code and the model run
args = parser.parse_args()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)  # set the random seed
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda.")
if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():  # check whether MPS is available
    if not args.mps:
        print("WARNING: You have mps device, to enable macOS GPU run with --mps.")

# choose the device used for training
use_mps = args.mps and torch.backends.mps.is_available()
if args.cuda:
    device = torch.device("cuda")
elif use_mps:
    device = torch.device("mps")
else:
    device = torch.device("cpu")

###############################################################################
# Load data
###############################################################################
# instantiate the Corpus class from data.py, which turns the text files under the data path into index vectors
corpus = data.Corpus(args.data)

# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘.
# These columns are treated as independent by the model, which means that the
# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient
# batch processing.


# arrange the data into batches
def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz  # number of full batches the data can be split into
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)  # drop the trailing tokens that do not fill a whole batch
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()  # reshape into 2-D and transpose to an nbatch x bsz tensor
    return data.to(device)  # move the data to the chosen device


eval_batch_size = 10  # batch size used for validation and testing
train_data = batchify(corpus.train, args.batch_size)  # training set
val_data = batchify(corpus.valid, eval_batch_size)  # validation set
test_data = batchify(corpus.test, eval_batch_size)  # test set

###############################################################################
# Build the model
###############################################################################
ntokens = len(corpus.dictionary)  # vocabulary size: all distinct words and punctuation tokens plus the '<eos>' marker
# instantiate the model and move it to the device
if args.model == 'Transformer':
    model = model.TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, args.nlayers, args.dropout).to(device)
else:
    model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied).to(device)

criterion = nn.NLLLoss()  # loss function: negative log-likelihood (paired with the model's log_softmax output)

###############################################################################
# Training code
###############################################################################


def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    # this stops gradients from flowing back across chunk boundaries
    if isinstance(h, torch.Tensor):  # h is a single tensor
        return h.detach()  # detach it from the computation graph so earlier chunks are not back-propagated through
    else:  # otherwise h is a tuple (e.g. LSTM hidden and cell state); recurse over its elements
        return tuple(repackage_hidden(v) for v in h)


# get_batch subdivides the source data into chunks of length args.bptt.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivison of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension in the LSTM.

def get_batch(source, i):
    """
    source: the batchified data
    i: starting row index
    """
    seq_len = min(args.bptt, len(source) - 1 - i)  # the last chunk may be shorter than bptt, so take the minimum
    data = source[i:i+seq_len]  # input chunk of shape seq_len x batch_size starting at row i
    target = source[i+1:i+1+seq_len].view(-1)  # the same chunk shifted down by one row, flattened, as the targets
    return data, target


def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()  # switch to evaluation mode; dropout is disabled
    total_loss = 0.  # accumulated loss
    ntokens = len(corpus.dictionary)  # vocabulary size
    if args.model != 'Transformer':
        hidden = model.init_hidden(eval_batch_size)  # RNN-family models need an initial hidden state
    with torch.no_grad():  # no gradient tracking: saves memory and speeds up evaluation
        for i in range(0, data_source.size(0) - 1, args.bptt):  # walk over the data bptt rows at a time
            data, targets = get_batch(data_source, i)  # fetch inputs and targets; data: seq_len(bptt) x batch_size
            if args.model == 'Transformer':
                output = model(data)  # forward pass (calls forward implicitly)
                output = output.view(-1, ntokens)  # reshape so that the second dimension is ntokens
            else:
                output, hidden = model(data, hidden)   # output: top-layer hidden states; hidden: final state of every layer
                hidden = repackage_hidden(hidden)  # detach the hidden state so gradients do not flow across chunks
            total_loss += len(data) * criterion(output, targets).item()  # accumulate the loss, weighted by chunk length
    return total_loss / (len(data_source) - 1)  # average loss per position; -1 because the final row has no target
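# --- A small worked example of batchify and get_batch (not part of the original script) ---
# It uses the alphabet from the comments above and hard-codes bsz/bptt instead of the script's args/device:
#   seq = torch.arange(26)                      # stand-in for the tokenized letters a..z
#   bsz, bptt = 4, 2
#   nbatch = seq.size(0) // bsz                 # 6 full rows; 'y' and 'z' are trimmed
#   batched = seq.narrow(0, 0, nbatch * bsz).view(bsz, -1).t().contiguous()   # shape: 6 x 4
#   batched[:, 0].tolist()                      # [0, 1, 2, 3, 4, 5], i.e. the column 'a b c d e f'
#   data = batched[0:bptt]                      # rows [a g m s] and [b h n t]
#   target = batched[1:1 + bptt].reshape(-1)    # [b h n t c i o u]: each column shifted by one step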


def train():
    # Turn on training mode which enables dropout.
    model.train()  # switch to training mode; dropout is active
    total_loss = 0.  # accumulated loss
    start_time = time.time()  # record the start time
    ntokens = len(corpus.dictionary)  # vocabulary size
    if args.model != 'Transformer':
        hidden = model.init_hidden(args.batch_size)  # RNN-family models need an initial hidden state
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):  # walk over the data bptt rows at a time
        # batch is the chunk counter; i is the chunk's starting row in train_data
        data, targets = get_batch(train_data, i)  # fetch inputs and targets; data: seq_len(bptt) x batch_size
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()  # reset the gradients for the new batch
        if args.model == 'Transformer':
            output = model(data)  # forward pass (calls forward implicitly)
            output = output.view(-1, ntokens)  # reshape so that the second dimension is ntokens
        else:
            hidden = repackage_hidden(hidden)  # detach the hidden state so gradients do not flow across chunks
            output, hidden = model(data, hidden)  # output: top-layer hidden states; hidden: final state of every layer
        loss = criterion(output, targets)  # compute the loss
        loss.backward()  # back-propagate to compute the gradients

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)  # clip the gradient norm to avoid exploding gradients
        for p in model.parameters():  # manual gradient-descent step over all parameters
            p.data.add_(p.grad, alpha=-lr)  # p.data += -lr * p.grad, i.e. a step down the gradient

        total_loss += loss.item()  # accumulate the loss
        # print a log line at a fixed interval
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval  # average loss per batch since the last report
            elapsed = time.time() - start_time  # elapsed time
            # print progress information
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        if args.dry_run:  # with --dry-run we only check that the code runs, so stop after one batch
            break
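# Note: the parameter loop above is plain SGD without momentum; keeping it manual makes the
# later `lr /= 4.0` annealing a one-line change. An equivalent formulation with torch.optim
# (a sketch of the equivalence, not how the original script is written) would be:
#   optimizer = torch.optim.SGD(model.parameters(), lr=lr)
#   # inside the batch loop:
#   #   model.zero_grad(); loss.backward()
#   #   torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
#   #   optimizer.step()
#   # and the annealing would then update optimizer.param_groups[0]['lr'] instead of the lr variable.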

# export the model to ONNX format; not used by default, so it can be skipped on a first read
def export_onnx(path, batch_size, seq_len):
    print('The model is also exported in ONNX format at {}.'.format(os.path.realpath(args.onnx_export)))
    model.eval()
    dummy_input = torch.LongTensor(seq_len * batch_size).zero_().view(-1, batch_size).to(device)
    hidden = model.init_hidden(batch_size)
    torch.onnx.export(model, (dummy_input, hidden), path)


# Loop over epochs.
lr = args.lr
best_val_loss = None

# At any point you can hit Ctrl + C to break out of training early.
try:
    # main training loop
    for epoch in range(1, args.epochs+1):
        epoch_start_time = time.time()  # record when this epoch starts
        train()  # train for one epoch
        val_loss = evaluate(val_data)  # evaluate on the validation set
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:  # save the best model seen so far
            with open(args.save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            # if the validation loss did not improve, reduce the learning rate
            lr /= 4.0
except KeyboardInterrupt:  # Ctrl + C interrupts training early
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
# load the best saved model from disk
with open(args.save, 'rb') as f:
    model = torch.load(f)
    # after load the rnn params are not a continuous chunk of memory
    # this makes them a continuous chunk, and will speed up forward pass
    # Currently, only rnn model supports flatten_parameters function.
    if args.model in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']:
        model.rnn.flatten_parameters()  # make the RNN parameters one contiguous chunk of memory, which speeds up the forward pass

# Run on test data.
test_loss = evaluate(test_data)  # evaluate the model on the test set
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

if len(args.onnx_export) > 0:
    # Export the model in ONNX format.
    export_onnx(args.onnx_export, batch_size=1, seq_len=args.bptt)

generate.py

The generate module loads a trained model, seeds it with a random word, and generates a fixed number of words. An example invocation is shown below, followed by the annotated source.
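The flag values here are only illustrative; every flag is defined in the argument parser annotated below:

python generate.py --checkpoint ./model.pt --outf generated.txt --words 1000 --temperature 0.8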

###############################################################################
# Language Modeling on Wikitext-2
#
# This file generates new sentences sampled from the language model.
#
###############################################################################
import argparse
import torch
import data

# parse the command-line arguments with argparse
parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 Language Model')
# Model parameters.
parser.add_argument('--data', type=str, default='./data/wikitext-2',
                    help='location of the data corpus')  # location of the text data
parser.add_argument('--checkpoint', type=str, default='./model.pt',
                    help='model checkpoint to use')  # saved model checkpoint
parser.add_argument('--outf', type=str, default='generated.txt',
                    help='output file for generated text')  # output text file
parser.add_argument('--words', type=int, default='1000',
                    help='number of words to generate')  # number of words to generate
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')  # random seed
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')  # CUDA flag, off by default
parser.add_argument('--mps', action='store_true', default=True,
                        help='enables macOS GPU training')  # macOS GPU flag (default True here, so MPS is used whenever it is available)
parser.add_argument('--temperature', type=float, default=1.0,
                    help='temperature - higher will increase diversity')  # higher temperature gives more random, more diverse output
parser.add_argument('--log-interval', type=int, default=100,
                    help='reporting interval')  # print a log line every this many generated words
args = parser.parse_args()

# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)  # set the random seed for reproducibility

# choose the device
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda.")
if torch.backends.mps.is_available():
    if not args.mps:
        print("WARNING: You have mps device, to enable macOS GPU run with --mps.")

use_mps = args.mps and torch.backends.mps.is_available()
if args.cuda:
    device = torch.device("cuda")
elif use_mps:
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# temperatures below 1e-3 are rejected
if args.temperature < 1e-3:
    parser.error("--temperature has to be greater or equal 1e-3.")

with open(args.checkpoint, 'rb') as f:  # open the checkpoint file in binary mode
    model = torch.load(f, map_location=device)  # load the model onto the chosen device
model.eval()  # switch to evaluation mode

corpus = data.Corpus(args.data)  # build the corpus with the Corpus class from data.py
ntokens = len(corpus.dictionary)  # vocabulary size

is_transformer_model = hasattr(model, 'model_type') and model.model_type == 'Transformer'  # check whether this is a Transformer model
if not is_transformer_model:
    hidden = model.init_hidden(1)  # RNN-family models need an initial hidden state; batch_size = 1
input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)  # pick a random word index as the first input

with open(args.outf, 'w') as outf:  # open the output file for writing
    with torch.no_grad():  # no tracking history: saves memory and speeds up generation
        for i in range(args.words):  # generate args.words words
            if is_transformer_model:  # Transformer model
                output = model(input, False)  # forward pass through the model, without the causal mask
                # take the output at the last position, squeeze the singleton dimensions, divide by the
                # temperature, exponentiate, and move the resulting weights (length ntokens) to the CPU
                word_weights = output[-1].squeeze().div(args.temperature).exp().cpu()
                word_idx = torch.multinomial(word_weights, 1)[0]  # sample the next word from the weights (not simply the argmax)
                word_tensor = torch.Tensor([[word_idx]]).long().to(device)  # wrap the sampled index as a tensor on the device
                input = torch.cat([input, word_tensor], 0)  # append it to the running sequence used as the next input
            else:
                output, hidden = model(input, hidden)  # feed input and hidden state to the RNN-family model
                word_weights = output.squeeze().div(args.temperature).exp().cpu()  # word weights (length ntokens)
                word_idx = torch.multinomial(word_weights, 1)[0]  # sample the next word index
                input.fill_(word_idx)  # this step's output becomes the next step's input

            word = corpus.dictionary.idx2word[word_idx]  # look up the word text for the sampled index

            outf.write(word + ('\n' if i % 20 == 19 else ' '))  # write the word; start a new line every 20 words

            if i % args.log_interval == 0:
                print('| Generated {}/{} words'.format(i, args.words))  # print a progress log