Pytorch 使用RNN模型构建人名分类器

幽影相随

已于 2022-07-26 15:35:25 修改

阅读量379

点赞数 1

分类专栏： # 自然语言处理文章标签： pytorch 人名分类器 RNN LSTM GRU

于 2022-07-26 13:49:15 首次发布

本文链接：https://blog.csdn.net/weixin_43820352/article/details/125989387

版权

自然语言处理专栏收录该内容

5 篇文章 0 订阅

订阅专栏

简介

人名分类问题: 以一个人名为输入, 使用模型帮助我们判断它最有可能是来自哪一个国家的人名, 这在某些国际化公司的业务中具有重要意义, 在用户注册过程中, 会根据用户填写的名字直接给他分配可能的国家或地区选项, 以及该国家或地区的国旗, 限制手机号码位数等等。

人名分类器的实现流程

导入必备的工具包及数据。
对 data 文件中的数据进行处理，满足训练要求。
构建RNN模型(包括传统 RNN, LSTM 以及 GRU)。
构建训练函数并进行训练。
构建评估函数并进行预测。

导入必备的工具包及数据

数据下载地址: https://download.pytorch.org/tutorial/data.zip
创建并解压至 data_names 文件夹，其中 5 个 py 文件是自己建立的，源文件中没有。
RNN_names_classification.py：主要的运行文件，包括运行流程等
utils.py：记录一些自定义工具函数
RNN.py：RNN 模型及相关函数
LSTM.py：LSTM 模型及相关函数
GRU.py：GRU 模型及相关函数
在这里插入图片描述
在 RNN_names_classification.py 导入如下包：

# 从io中导入文件打开方法
from io import open
# glob matches file paths using shell-style wildcards (not regular expressions)
import glob
import os
# 用于获得常见字母及字符规范化
import string
import unicodedata
# 导入随机工具random
import random
# 导入时间和数学工具包
import time
import math
# 导入torch工具
import torch
# 引入制图工具包
import matplotlib.pyplot as plt
# 引入自定义函数
import utils
# 导入模型
import GRU
import LSTM
import RNN

在 utils.py 导入如下包：

# 从io中导入文件打开方法
from io import open
# 用于获得常见字母及字符规范化
import string
import unicodedata
# 导入时间和数学工具包
import time
import math
# 导入torch工具
import torch

在 RNN.py、LSTM.py 和 GRU.py 导入如下包：

import torch
import torch.nn as nn

数据处理

utils.py
相关测试在 RNN_names_classification.py 中运行

# unicode转Ascii函数
# 去掉一些语言中的重音标记
# normalize() 第一个参数指定字符串标准化的方式。
# NFC表示字符应该是整体组成(比如可能的话就使用单一编码)，而NFD表示字符应该分解为多个组合字符表示。
def unicodeToAscii(s, all_letters):
    """Strip diacritics from ``s`` and keep only characters in ``all_letters``.

    The text is first decomposed with Unicode NFD normalisation, so an
    accented letter becomes a base letter followed by combining marks.
    Combining marks (Unicode category ``Mn``) are discarded, and so is any
    remaining character that does not occur in ``all_letters``.

    Args:
        s: The string to normalise.
        all_letters: Container of permitted characters (membership is
            tested with ``in``).

    Returns:
        The filtered string.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            # Combining accent left over from the decomposition: drop it.
            continue
        if ch in all_letters:
            kept.append(ch)
    return ''.join(kept)


def readLines(filename, all_letters):
    """Read a text file and return its lines as ASCII-normalised strings.

    The file is decoded as UTF-8, surrounding whitespace is stripped from
    the whole content, and the content is split on ``'\\n'``. Every line is
    then passed through ``unicodeToAscii``.

    Args:
        filename: Path of the file to read.
        all_letters: Permitted characters, forwarded to ``unicodeToAscii``.

    Returns:
        A list with one normalised string per line of the file.
    """
    # The context manager closes the handle even if reading fails; the
    # previous version left the file object open until garbage collection.
    with open(filename, encoding='utf-8') as handle:
        content = handle.read()
    return [unicodeToAscii(line, all_letters)
            for line in content.strip().split('\n')]


# 将字符串(单词粒度)转化为张量表示，如："ab" --->
# tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
#          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
#          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
#          0., 0., 0., 0., 0., 0.]],

#        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
#          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
#          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
#          0., 0., 0., 0., 0., 0.]]])
def lineToTensor(line, n_letters, all_letters):
    """Encode ``line`` as a one-hot tensor of shape ``(len(line), 1, n_letters)``.

    Row ``i`` has a single 1 at the index of ``line[i]`` within
    ``all_letters``. A character that does not occur in ``all_letters`` is
    left as an all-zero row.

    Args:
        line: The string (e.g. a name) to encode.
        n_letters: Size of the one-hot dimension.
        all_letters: String whose character positions define the one-hot
            indices.

    Returns:
        A float tensor of zeros and ones with shape
        ``(len(line), 1, n_letters)``.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for position, letter in enumerate(line):
        index = all_letters.find(letter)
        # str.find returns -1 for an unknown character; indexing with -1
        # would silently switch on the LAST column, i.e. encode the unknown
        # character as the last letter of the alphabet. Skip it instead.
        if index < 0:
            continue
        tensor[position, 0, index] = 1
    return tensor


def timeSince(since):
    """Format the wall-clock time elapsed since ``since`` as ``'<m>m <s>s'``.

    Args:
        since: Start timestamp, in the same units as ``time.time()``.

    Returns:
        A string with the whole minutes and the remaining seconds (both
        truncated to integers by the ``%d`` format).
    """
    elapsed = time.time() - since
    # Whole minutes first, then whatever is left over in seconds.
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

RNN 模型及相关函数

注：模型中使用了 data.add_() 函数和 squeeze() 函数，详情点击查看。
RNN.py

import torch
import torch.nn as nn


# 使用nn.RNN构建完成传统RNN使用类

class RNN(nn.Module):
    """Name classifier built from ``nn.RNN`` -> ``nn.Linear`` -> ``LogSoftmax``.

    Args:
        input_size: Size of the last dimension of the input.
        hidden_size: Size of the last dimension of the hidden state.
        output_size: Number of output classes.
        num_layers: Number of stacked recurrent layers (default 1).
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        # Keep the hyper-parameters on the instance; initHidden reads two
        # of them to build a matching initial state.
        self.input_size, self.hidden_size = input_size, hidden_size
        self.output_size, self.num_layers = output_size, num_layers

        # Recurrent core.
        self.rnn = nn.RNN(input_size, hidden_size, num_layers)
        # Projects the recurrent output onto the class dimension.
        self.linear = nn.Linear(hidden_size, output_size)
        # Log-probabilities over the last dimension (pairs with NLLLoss).
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, hidden):
        """Feed one step through the network.

        Args:
            input: Input tensor for a single step; a leading dimension is
                added here because ``nn.RNN`` expects a 3-D input.
            hidden: Hidden state, shaped like the result of ``initHidden``.

        Returns:
            ``(log_probs, next_hidden)``: the log-softmax class scores and
            the hidden state to pass to the next step.
        """
        sequence = input.unsqueeze(0)
        features, next_hidden = self.rnn(sequence, hidden)
        log_probs = self.softmax(self.linear(features))
        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape (num_layers, 1, hidden_size)."""
        shape = (self.num_layers, 1, self.hidden_size)
        return torch.zeros(*shape)


def trainRNN(rnn, category_tensor, line_tensor):
    """Run one plain-SGD training step on a single example.

    The characters of ``line_tensor`` are fed to ``rnn`` one at a time, the
    NLL loss of the final output against ``category_tensor`` is
    back-propagated, and every parameter is updated in place with
    ``p <- p - 0.005 * grad``.

    Args:
        rnn: Model exposing ``initHidden()`` and callable as
            ``rnn(step_input, hidden) -> (output, hidden)``.
        category_tensor: Target class index tensor for ``nn.NLLLoss``.
        line_tensor: Input sequence; it is iterated along dimension 0.

    Returns:
        ``(output, loss)``: the output of the last step (not squeezed) and
        the loss as a Python float.

    Raises:
        ValueError: If ``line_tensor`` has no steps, so there is no output
            to compute a loss from.
    """
    if line_tensor.size(0) == 0:
        raise ValueError("line_tensor must contain at least one character")

    # NLLLoss expects log-probabilities, which the model's LogSoftmax yields.
    criterion = nn.NLLLoss()
    learning_rate = 0.005

    hidden = rnn.initHidden()
    rnn.zero_grad()

    output = None
    for char_tensor in line_tensor:
        output, hidden = rnn(char_tensor, hidden)

    # Drop the leading dimension added in forward() so the prediction lines
    # up with category_tensor.
    loss = criterion(output.squeeze(0), category_tensor)
    loss.backward()

    # Manual SGD step. add_(other, alpha=...) replaces the deprecated
    # add_(Number, Tensor) overload, which emits a UserWarning.
    with torch.no_grad():
        for p in rnn.parameters():
            # Parameters that took no part in the graph have no gradient.
            if p.grad is not None:
                p.add_(p.grad, alpha=-learning_rate)

    return output, loss.item()


def evaluateRNN(rnn, line_tensor):
    """Run ``rnn`` over every step of ``line_tensor`` and return the last output.

    Mirrors the forward pass of the training function without any loss or
    parameter update.

    Args:
        rnn: Model exposing ``initHidden()`` and callable as
            ``rnn(step_input, hidden) -> (output, hidden)``.
        line_tensor: Input sequence, indexed along dimension 0.

    Returns:
        The output of the final step with its leading dimension squeezed.
    """
    state = rnn.initHidden()
    result = None
    n_steps = line_tensor.size(0)
    for position in range(n_steps):
        # Carry the hidden state forward from one character to the next.
        result, state = rnn(line_tensor[position], state)
    return result.squeeze(0)

LSTM 模型及相关函数

LSTM.py

import torch
import torch.nn as nn


# 使用nn.LSTM构建完成LSTM使用类

class LSTM(nn.Module):
    """Name classifier built from ``nn.LSTM`` -> ``nn.Linear`` -> ``LogSoftmax``.

    Args:
        input_size: Size of the last dimension of the input.
        hidden_size: Size of the last dimension of the hidden and cell state.
        output_size: Number of output classes.
        num_layers: Number of stacked LSTM layers (default 1).
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        # Needed by initHiddenAndC to build states of the matching shape.
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Recurrent core.
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        # Projects the LSTM output onto the class dimension.
        self.linear = nn.Linear(hidden_size, output_size)
        # Log-probabilities over the last dimension (pairs with NLLLoss).
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, hidden, c):
        """Feed one step through the network.

        Args:
            input: Input tensor for a single step; a leading dimension is
                added here before it is handed to ``nn.LSTM``.
            hidden: Hidden state.
            c: Cell state.

        Returns:
            ``(log_probs, next_hidden, next_c)``.
        """
        input = input.unsqueeze(0)
        rr, (hn, c) = self.lstm(input, (hidden, c))
        return self.softmax(self.linear(rr)), hn, c

    def initHiddenAndC(self):
        """Return zero-initialised ``(hidden, c)``, each (num_layers, 1, hidden_size).

        Two separate tensors are allocated. The previous chained assignment
        returned the same tensor object under both names, so an in-place
        change to one state would have changed the other as well.
        """
        shape = (self.num_layers, 1, self.hidden_size)
        hidden = torch.zeros(shape)
        c = torch.zeros(shape)
        return hidden, c


def trainLSTM(lstm, category_tensor, line_tensor):
    """Run one plain-SGD training step of an LSTM model on a single example.

    The characters of ``line_tensor`` are fed to ``lstm`` one at a time
    (threading both hidden and cell state), the NLL loss of the final
    output against ``category_tensor`` is back-propagated, and every
    parameter is updated in place with ``p <- p - 0.005 * grad``.

    Args:
        lstm: Model exposing ``initHiddenAndC()`` and callable as
            ``lstm(step_input, hidden, c) -> (output, hidden, c)``.
        category_tensor: Target class index tensor for ``nn.NLLLoss``.
        line_tensor: Input sequence; it is iterated along dimension 0.

    Returns:
        ``(output, loss)``: the output of the last step (not squeezed) and
        the loss as a Python float.

    Raises:
        ValueError: If ``line_tensor`` has no steps.
    """
    if line_tensor.size(0) == 0:
        raise ValueError("line_tensor must contain at least one character")

    # NLLLoss expects log-probabilities, which the model's LogSoftmax yields.
    criterion = nn.NLLLoss()
    learning_rate = 0.005

    hidden, c = lstm.initHiddenAndC()
    lstm.zero_grad()

    output = None
    for char_tensor in line_tensor:
        output, hidden, c = lstm(char_tensor, hidden, c)

    loss = criterion(output.squeeze(0), category_tensor)
    loss.backward()

    # Manual SGD step. add_(other, alpha=...) replaces the deprecated
    # add_(Number, Tensor) overload, which emits a UserWarning.
    with torch.no_grad():
        for p in lstm.parameters():
            # Parameters that took no part in the graph have no gradient.
            if p.grad is not None:
                p.add_(p.grad, alpha=-learning_rate)

    return output, loss.item()


def evaluateLSTM(lstm, line_tensor):
    """Run ``lstm`` over every step of ``line_tensor`` and return the last output.

    Args:
        lstm: Model exposing ``initHiddenAndC()`` and callable as
            ``lstm(step_input, hidden, c) -> (output, hidden, c)``.
        line_tensor: Input sequence, indexed along dimension 0.

    Returns:
        The output of the final step with its leading dimension squeezed.
    """
    state, cell = lstm.initHiddenAndC()
    result = None
    n_steps = line_tensor.size(0)
    for position in range(n_steps):
        # Both the hidden state and the cell state are carried forward.
        result, state, cell = lstm(line_tensor[position], state, cell)
    return result.squeeze(0)

GRU 模型及相关函数

GRU.py

import torch
import torch.nn as nn


# 使用nn.GRU构建完成传统RNN使用类
# GRU与传统RNN的外部形式相同, 都是只传递隐层张量, 因此只需要更改预定义层的名字


class GRU(nn.Module):
    """Name classifier built from ``nn.GRU`` -> ``nn.Linear`` -> ``LogSoftmax``.

    Like the plain RNN variant, only a hidden state is threaded between
    steps.

    Args:
        input_size: Size of the last dimension of the input.
        hidden_size: Size of the last dimension of the hidden state.
        output_size: Number of output classes.
        num_layers: Number of stacked GRU layers (default 1).
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        # Needed by initHidden to build a state of the matching shape.
        self.hidden_size, self.num_layers = hidden_size, num_layers

        # Recurrent core.
        self.gru = nn.GRU(input_size, hidden_size, num_layers)
        # Projects the GRU output onto the class dimension.
        self.linear = nn.Linear(hidden_size, output_size)
        # Log-probabilities over the last dimension (pairs with NLLLoss).
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, hidden):
        """Feed one step through the network.

        Returns:
            ``(log_probs, next_hidden)``.
        """
        # A leading dimension is added before the input is handed to nn.GRU.
        features, next_hidden = self.gru(input.unsqueeze(0), hidden)
        log_probs = self.softmax(self.linear(features))
        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape (num_layers, 1, hidden_size)."""
        shape = (self.num_layers, 1, self.hidden_size)
        return torch.zeros(*shape)


def trainGRU(gru, category_tensor, line_tensor):
    """Run one plain-SGD training step of a GRU model on a single example.

    The characters of ``line_tensor`` are fed to ``gru`` one at a time, the
    NLL loss of the final output against ``category_tensor`` is
    back-propagated, and every parameter is updated in place with
    ``p <- p - 0.005 * grad``.

    Args:
        gru: Model exposing ``initHidden()`` and callable as
            ``gru(step_input, hidden) -> (output, hidden)``.
        category_tensor: Target class index tensor for ``nn.NLLLoss``.
        line_tensor: Input sequence; it is iterated along dimension 0.

    Returns:
        ``(output, loss)``: the output of the last step (not squeezed) and
        the loss as a Python float.

    Raises:
        ValueError: If ``line_tensor`` has no steps.
    """
    if line_tensor.size(0) == 0:
        raise ValueError("line_tensor must contain at least one character")

    # NLLLoss expects log-probabilities, which the model's LogSoftmax yields.
    criterion = nn.NLLLoss()
    learning_rate = 0.005

    hidden = gru.initHidden()
    gru.zero_grad()

    output = None
    for char_tensor in line_tensor:
        output, hidden = gru(char_tensor, hidden)

    loss = criterion(output.squeeze(0), category_tensor)
    loss.backward()

    # Manual SGD step. add_(other, alpha=...) replaces the deprecated
    # add_(Number, Tensor) overload, which emits a UserWarning.
    with torch.no_grad():
        for p in gru.parameters():
            # Parameters that took no part in the graph have no gradient.
            if p.grad is not None:
                p.add_(p.grad, alpha=-learning_rate)

    return output, loss.item()


def evaluateGRU(gru, line_tensor):
    """Run ``gru`` over every step of ``line_tensor`` and return the last output.

    Args:
        gru: Model exposing ``initHidden()`` and callable as
            ``gru(step_input, hidden) -> (output, hidden)``.
        line_tensor: Input sequence, indexed along dimension 0.

    Returns:
        The output of the final step with its leading dimension squeezed.
    """
    state = gru.initHidden()
    result = None
    n_steps = line_tensor.size(0)
    for position in range(n_steps):
        # Carry the hidden state forward from one character to the next.
        result, state = gru(line_tensor[position], state)
    return result.squeeze(0)

相关测试及调用

以下内容均存于 RNN_names_classification.py 中

# 从io中导入文件打开方法
from io import open
# glob matches file paths using shell-style wildcards (not regular expressions)
import glob
import os
# 用于获得常见字母及字符规范化
import string
import unicodedata
# 导入随机工具random
import random
# 导入时间和数学工具包
import time
import math
# 导入torch工具
import torch
# 引入制图工具包
import matplotlib.pyplot as plt
# 引入自定义函数
import utils
# 导入模型
import GRU
import LSTM
import RNN

# Alphabet used for the one-hot encoding: ASCII letters plus a few
# punctuation characters that occur in names.
all_letters = string.ascii_letters + " .,;'"
# Size of the alphabet (57).
n_letters = len(all_letters)

# Quick check of the accent-stripping helper.
s = "Ślusàrski"
a = utils.unicodeToAscii(s, all_letters)
print(a)
# Output: Slusarski

# Directory holding one <Category>.txt file per language.
data_path = "./names/"

# Read a single file as a smoke test; Chinese.txt is used here.
filename = os.path.join(data_path, "Chinese.txt")
lines = utils.readLines(filename, all_letters)
print(lines)
# Output: ['Ang', 'AuYong', 'Bai'......

# category_lines maps a category to its list of names, e.g.
# {"English": ["Lily", "Susan", "Kobe"], "Chinese": ["Zhang San", "Xiao Ming"]}
category_lines = {}

# all_categories lists the category names, e.g. ["English", ..., "Chinese"];
# a category's position in this list is its class index.
all_categories = []

# glob expands shell-style wildcards (not regular expressions). Its result
# order is arbitrary, so the list is sorted to make the category -> index
# mapping reproducible across runs and platforms.
for filename in sorted(glob.glob(os.path.join(data_path, '*.txt'))):
    # The file name without its extension is the category name.
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    # Load this category's names and register them under the category.
    lines = utils.readLines(filename, all_letters)
    category_lines[category] = lines

# Number of categories found.
n_categories = len(all_categories)
print("n_categories:", n_categories)
# Output: n_categories: 18

# Peek at a few entries.
print(category_lines['Italian'][:5])
# Output: ['Abandonato', 'Abatangelo', 'Abatantuono', 'Abate', 'Abategiovanni']

# One-hot encoding check for a short name.
line = "Bai"
line_tensor = utils.lineToTensor(line, n_letters, all_letters)
print("line_tensot:", line_tensor)
''' 输出：
line_tensot: tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.]]])
          '''

'''模型测试'''

# Smoke test: push one character through each of the three models.
# The input is one-hot encoded, so its last dimension equals n_letters.
input_size = n_letters
# Size of the last dimension of the hidden state.
n_hidden = 128
# One output per language category.
output_size = n_categories
# num_layers is left at its default of 1.
# The letter 'B' serves as the first input. lineToTensor returns a 3-D
# tensor while the model classes take a 2-D step input (they unsqueeze it
# again internally), hence the squeeze(0).
# NOTE(review): the name `input` shadows the builtin at module scope.
input = utils.lineToTensor('B', n_letters, all_letters).squeeze(0)
# Zero initial hidden state, also used as the initial LSTM cell state.
# NOTE(review): the chained assignment binds hidden and c to the SAME tensor object.
hidden = c = torch.zeros(1, 1, n_hidden)
rnn = RNN.RNN(n_letters, n_hidden, n_categories)
lstm = LSTM.LSTM(n_letters, n_hidden, n_categories)
gru = GRU.GRU(n_letters, n_hidden, n_categories)

# One forward step per model; next_hidden is overwritten by each call.
rnn_output, next_hidden = rnn(input, hidden)
print("rnn:", rnn_output)
lstm_output, next_hidden, c = lstm(input, hidden, c)
print("lstm:", lstm_output)
gru_output, next_hidden = gru(input, hidden)
print("gru:", gru_output)
'''输出：
rnn: tensor([[[-2.8846, -2.8577, -2.8771, -2.8404, -2.9493, -2.9763, -2.8778,
          -2.9013, -2.9770, -2.9255, -2.9453, -2.8653, -2.8467, -2.8520,
          -2.7569, -2.9069, -2.9328, -2.8793]]], grad_fn=<LogSoftmaxBackward0>)
lstm: tensor([[[-2.8939, -2.9496, -2.8471, -2.8903, -2.8491, -2.9028, -2.8200,
          -2.9084, -2.8248, -2.9217, -2.9744, -2.8988, -2.8330, -2.8690,
          -2.8347, -2.9451, -2.9294, -2.9547]]], grad_fn=<LogSoftmaxBackward0>)
gru: tensor([[[-2.8414, -2.9224, -2.9315, -2.9462, -2.9018, -2.9251, -2.9311,
          -2.9819, -2.9232, -2.9034, -2.9271, -2.7789, -2.8898, -2.8167,
          -2.8042, -2.8437, -2.8935, -2.8901]]], grad_fn=<LogSoftmaxBackward0>)
'''

'''构建训练函数并进行训练'''


# 从输出结果中获得指定类别
def categoryFromOutput(output):
    """Map a model output to its most likely category.

    Args:
        output: Score tensor; the index of its largest entry along the last
            dimension is taken as the predicted class.

    Returns:
        ``(category_name, category_index)``, where the name is looked up in
        the module-level ``all_categories`` list.
    """
    # topk(1) yields (values, indices); only the index is needed here.
    _, best_index = output.topk(1)
    category_i = best_index[0].item()
    return all_categories[category_i], category_i


# Decode the GRU output from the smoke test into a category.
output = gru_output
print(output)
'''输出：
tensor([[[-2.8414, -2.9224, -2.9315, -2.9462, -2.9018, -2.9251, -2.9311,
          -2.9819, -2.9232, -2.9034, -2.9271, -2.7789, -2.8898, -2.8167,
          -2.8042, -2.8437, -2.8935, -2.8901]]], grad_fn=<LogSoftmaxBackward0>)
'''
category, category_i = categoryFromOutput(output)
print("category:", category)
print("category_i:", category_i)
# Output:
# category: Korean
# category_i: 11

# 随机生成训练数据
def randomTrainingExample():
    """Draw one random training example.

    A category is chosen uniformly from ``all_categories``, then a name is
    chosen uniformly from that category's list in ``category_lines``.

    Returns:
        ``(category, line, category_tensor, line_tensor)``: the category
        name, the raw name, a ``torch.long`` tensor holding the category's
        index in ``all_categories``, and the name encoded by
        ``utils.lineToTensor``.
    """
    category = random.choice(all_categories)
    line = random.choice(category_lines[category])
    # The label is the category's position in the category list.
    label_index = all_categories.index(category)
    return (
        category,
        line,
        torch.tensor([label_index], dtype=torch.long),
        utils.lineToTensor(line, n_letters, all_letters),
    )


# Draw ten random examples and print them for inspection.
for i in range(10):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    print('category =', category, '/ line =', line, '/ category_tensor =', category_tensor)
'''输出：
category = Japanese / line = Nagano / category_tensor = tensor([10])
category = Polish / line = Adamczyk / category_tensor = tensor([12])
category = Vietnamese / line = Thuy / category_tensor = tensor([17])
category = Japanese / line = Kuga / category_tensor = tensor([10])
category = Portuguese / line = Rosario / category_tensor = tensor([13])
category = Vietnamese / line = Ma / category_tensor = tensor([17])
category = Arabic / line = Saliba / category_tensor = tensor([0])
category = Dutch / line = Koumans / category_tensor = tensor([3])
category = Polish / line = Rog / category_tensor = tensor([12])
category = Polish / line = Bartosz / category_tensor = tensor([12])
'''

# Pretend training started 10 minutes ago to check the time formatter.
since = time.time() - 10 * 60
period = utils.timeSince(since)
print(period)
# Output: 10m 0s

构建训练函数并进行训练

# Number of training iterations.
n_iters = 10000
# A progress line is printed every this many iterations.
print_every = 500
# One averaged loss point is recorded for plotting every this many iterations.
plot_every = 10


def train(model, train_type_fn):
    """Train ``model`` for ``n_iters`` random examples, logging progress.

    Args:
        model: The model instance handed to ``train_type_fn``.
        train_type_fn: Per-example training function, e.g. ``trainRNN``;
            called as ``train_type_fn(model, category_tensor, line_tensor)``
            and expected to return ``(output, loss)``.

    Returns:
        ``(all_losses, seconds)``: the average loss of each ``plot_every``
        window, and the total training time in whole seconds.
    """
    plot_losses = []
    started_at = time.time()
    # Loss accumulated since the last plot point.
    running_loss = 0

    for step in range(1, n_iters + 1):
        category, line, category_tensor, line_tensor = randomTrainingExample()
        output, loss = train_type_fn(model, category_tensor, line_tensor)
        running_loss += loss

        if step % print_every == 0:
            # Compare the current prediction with the true category.
            guess, _ = categoryFromOutput(output)
            if guess == category:
                verdict = '✓'
            else:
                verdict = '✗ (%s)' % category
            progress = step / n_iters * 100
            elapsed = utils.timeSince(started_at)
            print('%d %d%% (%s) %.4f %s / %s %s' % (
                step, progress, elapsed, loss, line, guess, verdict))

        if step % plot_every == 0:
            # Store the window average and start a new window.
            plot_losses.append(running_loss / plot_every)
            running_loss = 0

    return plot_losses, int(time.time() - started_at)


"""开始训练传统RNN, LSTM, GRU模型并制作对比图"""
# Train the RNN, LSTM and GRU models in turn; each call returns the list of
# averaged losses and the training time in seconds, both used for plotting.
all_losses1, period1 = train(rnn, RNN.trainRNN)
all_losses2, period2 = train(lstm, LSTM.trainLSTM)
all_losses3, period3 = train(gru, GRU.trainGRU)

# Figure 0: loss curves of the three models.
plt.figure(0)
plt.plot(all_losses1, label="RNN")
plt.plot(all_losses2, color="red", label="LSTM")
plt.plot(all_losses3, color="orange", label="GRU")
plt.legend(loc='upper left')

# Figure 1: bar chart of the training times.
plt.figure(1)
x_data = ["RNN", "LSTM", "GRU"]
y_data = [period1, period2, period3]
plt.bar(range(len(x_data)), y_data, tick_label=x_data)
plt.show()

# Evaluate the three trained models on one name.
line = "Bai"
line_tensor = utils.lineToTensor(line, n_letters, all_letters)
rnn_output = RNN.evaluateRNN(rnn, line_tensor)
lstm_output = LSTM.evaluateLSTM(lstm, line_tensor)
gru_output = GRU.evaluateGRU(gru, line_tensor)
print("rnn_output:", rnn_output)
# Fixed label: this line prints the LSTM result but was labelled "gru_output:".
print("lstm_output:", lstm_output)
print("gru_output:", gru_output)

输出结果

控制台

G:\need\NLP\data_names\RNN.py:64: UserWarning: This overload of add_ is deprecated:
	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\torch\csrc\utils\python_arg_parser.cpp:1174.)
  p.data.add_(-learning_rate, p.grad.data)
500 5% (0m 1s) 2.7323 Venne / Dutch ✓
1000 10% (0m 3s) 2.9895 Hladky / Dutch ✗ (Czech)
1500 15% (0m 4s) 2.7946 Araujo / Polish ✗ (Portuguese)
2000 20% (0m 5s) 3.0709 Herodes / Dutch ✗ (Czech)
2500 25% (0m 6s) 3.0051 Mahoney / Spanish ✗ (Irish)
3000 30% (0m 8s) 2.5392 Nelmes / Dutch ✗ (English)
3500 35% (0m 9s) 1.9422 Turchi / Italian ✓
4000 40% (0m 11s) 2.3251 Eichel / German ✓
4500 45% (0m 12s) 2.6242 Michel / Czech ✗ (Polish)
5000 50% (0m 14s) 2.9061 Hruskova / Japanese ✗ (Czech)
5500 55% (0m 15s) 2.9102 Comtois / Greek ✗ (French)
6000 60% (0m 17s) 1.4745 Yanibekov / Greek ✗ (Russian)
6500 65% (0m 18s) 1.1071 Nie / Chinese ✓
7000 70% (0m 20s) 1.5131 Guan / Chinese ✓
7500 75% (0m 22s) 2.0835 Williamson / Russian ✗ (Scottish)
8000 80% (0m 23s) 2.6354 Crawford / English ✗ (Scottish)
8500 85% (0m 25s) 1.0363 Brzezicki / Polish ✓
9000 90% (0m 26s) 2.3351 Teunissen / Russian ✗ (Dutch)
9500 95% (0m 28s) 2.9281 Muir / Korean ✗ (Scottish)
10000 100% (0m 29s) 0.9931 Choe / Korean ✓
500 5% (0m 2s) 2.9663 Jung  / Vietnamese ✗ (Korean)
1000 10% (0m 4s) 2.8818 Flynn / Japanese ✗ (Irish)
1500 15% (0m 7s) 2.9304 Althuis / Czech ✗ (Dutch)
2000 20% (0m 9s) 2.8281 Accursio / Portuguese ✗ (Italian)
2500 25% (0m 11s) 2.9179 Fairbrace / Spanish ✗ (English)
3000 30% (0m 13s) 2.8677 Gajos / Spanish ✗ (Polish)
3500 35% (0m 16s) 2.8854 Roth / Spanish ✗ (German)
4000 40% (0m 18s) 2.8443 Gakusha / Vietnamese ✗ (Japanese)
4500 45% (0m 20s) 2.8108 Jimenez / Spanish ✓
5000 50% (0m 22s) 2.9355 Boutros / Spanish ✗ (Arabic)
5500 55% (0m 25s) 2.8427 Antonopoulos / Spanish ✗ (Greek)
6000 60% (0m 27s) 2.8785 Flower / Scottish ✗ (English)
6500 65% (0m 29s) 2.8869 Wilmot / French ✗ (English)
7000 70% (0m 32s) 2.9042 Chu / French ✗ (Korean)
7500 75% (0m 34s) 2.8958 Sugita / Scottish ✗ (Japanese)
8000 80% (0m 36s) 2.9280 Vasquez / Scottish ✗ (Spanish)
8500 85% (0m 39s) 2.9010 Wang / Dutch ✗ (Korean)
9000 90% (0m 41s) 2.9680 Gallo / Russian ✗ (Spanish)
9500 95% (0m 43s) 2.8586 Lindsay / Russian ✗ (Scottish)
10000 100% (0m 46s) 2.8405 O'Boyle / Russian ✗ (Irish)
500 5% (0m 2s) 2.9223 Palmeiro / Russian ✗ (Portuguese)
1000 10% (0m 4s) 2.7640 Naldi / Italian ✓
1500 15% (0m 6s) 2.9366 Lang / Japanese ✗ (Chinese)
2000 20% (0m 9s) 2.8530 Borowski / Polish ✗ (Czech)
2500 25% (0m 12s) 2.8466 Bosko / Korean ✗ (Polish)
3000 30% (0m 15s) 2.7370 Vasyatkin / Italian ✗ (Russian)
3500 35% (0m 18s) 2.8483 Gan / Russian ✗ (Chinese)
4000 40% (0m 21s) 2.7950 Alber / Russian ✗ (English)
4500 45% (0m 23s) 2.6043 Shirmanov / Russian ✓
5000 50% (0m 26s) 3.0219 Salomon / Russian ✗ (Polish)
5500 55% (0m 29s) 2.8216 Gu / Chinese ✗ (Korean)
6000 60% (0m 34s) 2.7879 Gorski / Russian ✗ (Polish)
6500 65% (0m 37s) 2.8207 Rijnder / German ✗ (Dutch)
7000 70% (0m 41s) 2.6129 Dittmar / German ✓
7500 75% (0m 45s) 2.8878 Gauk / Russian ✗ (Chinese)
8000 80% (0m 49s) 2.7147 Lowe / Dutch ✗ (German)
8500 85% (0m 53s) 2.7290 Costa / Portuguese ✓
9000 90% (0m 57s) 2.7239 Borgnino / Russian ✗ (Italian)
9500 95% (1m 0s) 2.7706 Fionn / Russian ✗ (Irish)
10000 100% (1m 4s) 2.8627 Kauphsman / Irish ✗ (Czech)
rnn_output: tensor([[-3.6224, -0.7660, -4.2510, -5.5092, -5.0317, -5.0419, -5.3001, -5.0867,
         -4.7854, -4.0797, -3.3401, -1.7571, -3.6563, -5.2527, -4.4994, -4.5131,
         -4.9881, -1.7559]], grad_fn=<SqueezeBackward1>)
gru_output: tensor([[-2.8168, -2.8891, -2.8932, -2.9768, -2.9284, -2.8722, -2.9062, -2.9614,
         -2.8455, -2.8548, -2.9272, -2.9084, -2.8545, -2.9557, -2.7883, -2.8477,
         -2.9334, -2.8895]], grad_fn=<SqueezeBackward1>)
gru_output: tensor([[-2.7879, -2.7342, -2.8792, -3.0811, -3.1559, -3.0754, -3.1348, -3.1096,
         -2.9784, -2.6722, -2.5414, -2.7711, -2.7288, -2.9103, -2.9286, -3.1165,
         -2.8678, -2.8332]], grad_fn=<SqueezeBackward1>)

损失对比曲线图

在这里插入图片描述

损失对比曲线分析:

模型训练的损失降低快慢代表模型收敛程度, 由图可知, 传统RNN的模型收敛情况最好, 然后是GRU, 最后是LSTM,
这是因为: 我们当前处理的文本数据是人名, 他们的长度有限, 且长距离字母间基本无特定关联, 因此无法发挥改进模型LSTM和GRU的长距离捕捉语义关联的优势.
所以在以后的模型选用时, 要通过对任务的分析以及实验对比, 选择最适合的模型.

训练耗时对比图

在这里插入图片描述

训练耗时对比图分析:

模型训练的耗时长短在一定程度上反映模型的计算复杂度. 由上面的控制台输出可知, 传统RNN耗时最短(约29秒), 与理论分析一致.
按理论分析, GRU的计算量应介于传统RNN与LSTM之间; 但在本次运行中LSTM耗时约46秒, GRU耗时约1分4秒, GRU反而最慢.
单次计时容易受运行环境波动的影响, 因此应多次运行取平均后再比较GRU与LSTM的耗时.

结论:
模型选用一般应通过实验对比, 并非越复杂或越先进的模型表现越好, 而是需要结合自己的特定任务, 从对数据的分析和实验结果中获得最佳答案.

构建评估函数并进行预测

"""预测函数"""


def predict(input_line, evaluate, n_predictions=3):
    """Print and return the most likely categories for ``input_line``.

    Args:
        input_line: The name to classify.
        evaluate: One of ``RNN.evaluateRNN``, ``LSTM.evaluateLSTM`` or
            ``GRU.evaluateGRU``; it selects which module-level model
            (``rnn``, ``lstm`` or ``gru``) is used.
        n_predictions: How many top categories to report. It is capped at
            the number of classes in the model output.

    Returns:
        A list of ``[value, category_name]`` pairs, best first. Earlier
        versions built this list but returned ``None``.

    Raises:
        ValueError: If ``evaluate`` is not one of the three known
            evaluation functions.
    """
    # Echo the input first.
    print('\n> %s' % input_line)

    # Pick the model that matches the evaluation function.
    if evaluate == RNN.evaluateRNN:
        model = rnn
    elif evaluate == LSTM.evaluateLSTM:
        model = lstm
    elif evaluate == GRU.evaluateGRU:
        model = gru
    else:
        # Previously this fell through with model = None and failed later
        # inside the evaluation function with an unrelated error.
        raise ValueError("unknown evaluate function: %r" % (evaluate,))
    print(model)

    # No gradients are needed for inference.
    with torch.no_grad():
        output = evaluate(model, utils.lineToTensor(input_line, n_letters, all_letters))
        # topk raises if asked for more entries than there are classes.
        k = min(n_predictions, output.size(1))
        topv, topi = output.topk(k, 1, True)

    predictions = []
    for value, category_index in zip(topv[0].tolist(), topi[0].tolist()):
        category_name = all_categories[category_index]
        # Print the score together with its category.
        print('(%.2f) %s' % (value, category_name))
        predictions.append([value, category_name])
    return predictions


# Run the same three sample names through each model's evaluation function,
# printing a separator line before each model.
for evaluate_fn in (RNN.evaluateRNN, LSTM.evaluateLSTM, GRU.evaluateGRU):
    print("-" * 18)
    for sample_name in ('Dovesky', 'Jackson', 'Satoshi'):
        predict(sample_name, evaluate_fn)

运行结果

------------------

> Dovesky
RNN(
  (rnn): RNN(57, 128)
  (linear): Linear(in_features=128, out_features=18, bias=True)
  (softmax): LogSoftmax(dim=-1)
)
(-1.28) Polish
(-1.82) Russian
(-2.54) Scottish

> Jackson
RNN(
  (rnn): RNN(57, 128)
  (linear): Linear(in_features=128, out_features=18, bias=True)
  (softmax): LogSoftmax(dim=-1)
)
(-1.56) Irish
(-1.58) Scottish
(-1.71) Russian

> Satoshi
RNN(
  (rnn): RNN(57, 128)
  (linear): Linear(in_features=128, out_features=18, bias=True)
  (softmax): LogSoftmax(dim=-1)
)
(-1.17) Polish
(-1.49) Italian
(-2.69) Japanese
------------------

> Dovesky
LSTM(
  (lstm): LSTM(57, 128)
  (linear): Linear(in_features=128, out_features=18, bias=True)
  (softmax): LogSoftmax(dim=-1)
)
(-2.75) Russian
(-2.82) Polish
(-2.82) Scottish

> Jackson
LSTM(
  (lstm): LSTM(57, 128)
  (linear): Linear(in_features=128, out_features=18, bias=True)
  (softmax): LogSoftmax(dim=-1)
)
(-2.75) Russian
(-2.81) Irish
(-2.81) Scottish

> Satoshi
LSTM(
  (lstm): LSTM(57, 128)
  (linear): Linear(in_features=128, out_features=18, bias=True)
  (softmax): LogSoftmax(dim=-1)
)
(-2.75) Russian
(-2.82) Arabic
(-2.82) Scottish
------------------

> Dovesky
GRU(
  (gru): GRU(57, 128)
  (linear): Linear(in_features=128, out_features=18, bias=True)
  (softmax): LogSoftmax(dim=-1)
)
(-2.64) Russian
(-2.76) Polish
(-2.77) Czech

> Jackson
GRU(
  (gru): GRU(57, 128)
  (linear): Linear(in_features=128, out_features=18, bias=True)
  (softmax): LogSoftmax(dim=-1)
)
(-2.75) Russian
(-2.79) Irish
(-2.82) Dutch

> Satoshi
GRU(
  (gru): GRU(57, 128)
  (linear): Linear(in_features=128, out_features=18, bias=True)
  (softmax): LogSoftmax(dim=-1)
)
(-2.61) Japanese
(-2.68) Italian
(-2.70) Polish