通过实际案例理解pytorch中lstm使用技巧

最新推荐文章于 2024-05-26 11:00:33 发布
AiBigData
最新推荐文章于 2024-05-26 11:00:33 发布
阅读量749
点赞数 1
分类专栏： NLP 文章标签： lstm pytorch
原文链接：https://github.com/Yonv1943/DL_RL_Zoo/blob/master/RNN
版权
NLP 专栏收录该内容
2 篇文章 0 订阅
订阅专栏
import numpy as np
import torch
from torch import nn

import matplotlib.pyplot as plt

"""
Github: Yonv1943 Zen4 Jia1 hao2
https://github.com/Yonv1943/DL_RL_Zoo/blob/master/RNN

The source of training data 
https://github.com/L1aoXingyu/
code-of-learn-deep-learning-with-pytorch/blob/master/
chapter5_RNN/time-series/lstm-time-series.ipynb
"""


def run_train_gru():
    inp_dim = 3
    out_dim = 1
    batch_size = 12 * 4

    '''load data'''
    data = load_data()
    data_x = data[:-1, :]
    data_y = data[+1:, 0]
    assert data_x.shape[1] == inp_dim

    train_size = int(len(data_x) * 0.75)

    train_x = data_x[:train_size]
    train_y = data_y[:train_size]
    train_x = train_x.reshape((train_size, inp_dim))
    train_y = train_y.reshape((train_size, out_dim))

    '''build model'''
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = RegGRU(inp_dim, out_dim, mod_dim=12, mid_layers=2).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=1e-2)

    '''train'''
    var_x = torch.tensor(train_x, dtype=torch.float32, device=device)
    var_y = torch.tensor(train_y, dtype=torch.float32, device=device)

    batch_var_x = list()
    batch_var_y = list()

    for i in range(batch_size):
        j = train_size - i
        batch_var_x.append(var_x[j:])
        batch_var_y.append(var_y[j:])

    from torch.nn.utils.rnn import pad_sequence
    batch_var_x = pad_sequence(batch_var_x)
    batch_var_y = pad_sequence(batch_var_y)

    with torch.no_grad():
        weights = np.tanh(np.arange(len(train_y)) * (np.e / len(train_y)))
        weights = torch.tensor(weights, dtype=torch.float32, device=device)

    for e in range(256):
        out = net(batch_var_x)

        # loss = criterion(out, batch_var_y)
        loss = (out - batch_var_y) ** 2 * weights
        loss = loss.mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if e % 100 == 0:
            print('Epoch: {}, Loss: {:.5f}'.format(e, loss.item()))

    '''eval'''
    net = net.eval()

    test_x = data_x.copy()
    test_x[train_size:, 0] = 0
    test_x = test_x[:, np.newaxis, :]
    test_x = torch.tensor(test_x, dtype=torch.float32, device=device)
    for i in range(train_size, len(data) - 2):
        test_y = net(test_x[:i])
        test_x[i + 1, 0, 0] = test_y[-1]
    pred_y = test_x[1:, 0, 0]
    pred_y = pred_y.cpu().data.numpy()

    diff_y = pred_y[train_size:] - data_y[train_size:-1]
    l1_loss = np.mean(np.abs(diff_y))
    l2_loss = np.mean(diff_y ** 2)
    print("L1: {:.3f}    L2: {:.3f}".format(l1_loss, l2_loss))
    plt.plot(pred_y, 'r', label='pred')
    plt.plot(data_y, 'b', label='real')
    plt.legend(loc='best')
    plt.pause(4)


def run_train_lstm():
    inp_dim = 3
    out_dim = 1
    mid_dim = 8
    mid_layers = 1
    batch_size = 12 * 4
    mod_dir = '.'

    '''load data'''
    data = load_data()
    # 舍弃了最后一行
    data_x = data[:-1, :]
    # 行+1，取第0列的数据，其实取的是第0列除了第一行的数据外的所有列，因为第一行的label也是输入，所以就不该成为预测值了
    data_y = data[+1:, 0]  # seq_number, seq_year, month
    # 如果data_x的列维度不等于inp_dim 则直接报错
    assert data_x.shape[1] == inp_dim

    train_size = int(len(data_x) * 0.75)

    # 取0.75的数据作为训练集的x,y
    train_x = data_x[:train_size]
    train_y = data_y[:train_size]
    train_x = train_x.reshape((train_size, inp_dim))
    train_y = train_y.reshape((train_size, out_dim))

    '''build model 如果有gpu则用gpu，否则使用cpu'''
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = RegLSTM(inp_dim, out_dim, mid_dim, mid_layers).to(device)
    # 使用的损失函数是MSE
    criterion = nn.MSELoss()
    parameters = net.parameters()
    optimizer = torch.optim.Adam(net.parameters(), lr=1e-2)

    ''' train 将ndarray转化成torch,可能转化成cpu识别的格式或者gpu识别的tensor，通过device来决定 '''
    var_x = torch.tensor(train_x, dtype=torch.float32, device=device)
    var_y = torch.tensor(train_y, dtype=torch.float32, device=device)

    batch_var_x = list()
    batch_var_y = list()

    # 一共108个训练集，只用48batch，是否有些浪费
    for i in range(batch_size):
        # 不同的起始裁剪位点
        j = train_size - i
        # 从后往前进行裁剪，最开始0个有效数据，然后一个，2个，3个，最后batch_size个有效数据，但是9个月的数据还是没有用全。。。。
        # 在训练中作为输入的客流量数据（年份、月份、本月的客流量）
        # test_x = var_x[j:]
        # test_y = var_y[j:]
        batch_var_x.append(var_x[j:])
        # 在训练中作为标签的预测数据（下个月的客流量）
        batch_var_y.append(var_y[j:])

    from torch.nn.utils.rnn import pad_sequence
    # 47*48*3  相当于一共47个样本
    batch_var_x = pad_sequence(batch_var_x)
    # 47  相当于47个y
    batch_var_y = pad_sequence(batch_var_y)

    with torch.no_grad():
        # 长度越长，weights越大
        weights = np.tanh(np.arange(len(train_y)) * (np.e / len(train_y)))
        weights = torch.tensor(weights, dtype=torch.float32, device=device)

    print("Training Start")
    for e in range(304):
        # 相当于384次batch, 训练的时候，将seq_number, seq_year, seq_month 都放入模型进行了训练，感觉就像几个有规律组合成的单词，只不过跟前后的单词都有关
        out = net(batch_var_x)
    
        # loss = criterion(out, batch_var_y)
        # 手动计算mse
        loss = (out - batch_var_y) ** 2 * weights
        # loss对样本的个数求平均值
        loss = loss.mean()

        # 设置所有梯度优化器tensor为0
        optimizer.zero_grad()
        # 反向传播计算梯度
        loss.backward()
        # 通过梯度下降法进行梯度更新
        optimizer.step()

        # 每64次batch打印一下当前的损失函数，state_dict() 返回模型所有的状态。
        if e % 64 == 0:
            print('Epoch: {:4}, Loss: {:.5f}'.format(e, loss.item()))
    torch.save(net.state_dict(), '{}/net.pth'.format(mod_dir))
    print("Save in:", '{}/net.pth'.format(mod_dir))

    '''eval 加载通过torch.save方式存储的模型， '''
    net.load_state_dict(torch.load('{}/net.pth'.format(mod_dir), map_location=lambda storage, loc: storage))
    # 设置模型成评估模式
    net = net.eval()

    # data_x 是9+3年的数据
    test_x = data_x.copy()
    # 我们将后四年的数据设置成0 =>143:3
    test_x[train_size:, 0] = 0
    # 其实就是扩维度，中间增加一个维度 143:3=>143:1:3
    test_x = test_x[:, np.newaxis, :]
    # 将test_x ndarray变成tensor
    test_x = torch.tensor(test_x, dtype=torch.float32, device=device)

    '''simple way but no elegant'''
    # for i in range(train_size, len(data) - 2):
    #     test_y = net(test_x[:i])
    #     test_x[i, 0, 0] = test_y[-1]

    '''elegant way but slightly complicated 优雅的方法但是轻微复杂 '''
    eval_size = 1
    # 填充一个 mid_layers*eval_size*mid_dim维度的矩阵 1*1*8
    zero_ten = torch.zeros((mid_layers, eval_size, mid_dim), dtype=torch.float32, device=device)
    test1 = test_x[:train_size] # [0:106] 117
    test_y, hc = net.output_y_hc(test_x[:train_size], (zero_ten, zero_ten)) #
    test1 = test_y[-1]
    test_x[train_size + 1, 0, 0] = test_y[-1]  # 107*1*1  #
    for i in range(train_size + 1, len(data) - 2):  # (108:142]
        test11 = test_x[i:i + 1]
        test_y, hc = net.output_y_hc(test_x[i:i + 1], hc)  # 通过前面的hc，和当前的输入，预测后面的hc,哈哈哈，理解完概念 test_y 就是预测的下一个label，是一个连续的值
        test_x[i + 1, 0, 0] = test_y[-1]  # 将刚刚预测的结果，会跟seq_year,seq_month组合成下一个输入进行模型
    pred_y = test_x[1:, 0, 0]  # 去除第一行，因为第一行为输入值，不是输出，因为整个模型的输入是 seq_number,seq_year,seq_month
    pred_y = pred_y.cpu().data.numpy()  # 将最终预测的值转化成numpy格式

    data_test = data_y[train_size:-1]
    print(train_size, data_test)
    # 最后预测的正常应该是3年的数据，所以我们对比的也应该是3年的数据，但是实际的打印是35条，少一条，因为输入的数据集去除了最后一行，实际的值seq_num输出去除了第一行
    diff_y = pred_y[train_size:] - data_y[train_size:-1]
    print(diff_y)
    print(pred_y[train_size:])
    print(pred_y[train_size:])

    l1_loss = np.mean(np.abs(diff_y))  # l1范数
    l2_loss = np.mean(diff_y ** 2)  #  l2范数
    print("L1: {:.3f}    L2: {:.3f}".format(l1_loss, l2_loss))

    plt.plot(pred_y, 'r', label='pred')
    plt.plot(data_y, 'b', label='real', alpha=0.3)
    # [-1,2]代表train,pred之间的分解竖线的y轴的范围。
    plt.plot([train_size, train_size], [-1, 2], color='k', label='train | pred')
    plt.legend(loc='best')
    plt.savefig('lstm_reg.png')
    plt.pause(4)


def run_origin():
    inp_dim = 2
    out_dim = 1
    mod_dir = '.'

    '''load data'''
    data = load_data()  # axis1: number, year, month
    data_x = np.concatenate((data[:-2, 0:1], data[+1:-1, 0:1]), axis=1)
    data_y = data[2:, 0]

    train_size = int(len(data_x) * 0.75)
    train_x = data_x[:train_size]
    train_y = data_y[:train_size]

    train_x = train_x.reshape((-1, 1, inp_dim))
    train_y = train_y.reshape((-1, 1, out_dim))

    '''build model'''
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = RegLSTM(inp_dim, out_dim, mid_dim=4, mid_layers=2).to(device)
    criterion = nn.SmoothL1Loss()
    optimizer = torch.optim.Adam(net.parameters(), lr=1e-2)

    '''train'''
    var_x = torch.tensor(train_x, dtype=torch.float32, device=device)
    var_y = torch.tensor(train_y, dtype=torch.float32, device=device)
    print('var_x.size():', var_x.size())
    print('var_y.size():', var_y.size())

    for e in range(512):
        out = net(var_x)
        loss = criterion(out, var_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (e + 1) % 100 == 0:  # 每 100 次输出结果
            print('Epoch: {}, Loss: {:.5f}'.format(e + 1, loss.item()))

    torch.save(net.state_dict(), '{}/net.pth'.format(mod_dir))

    '''eval'''
    # net.load_state_dict(torch.load('{}/net.pth'.format(mod_dir), map_location=lambda storage, loc: storage))
    net = net.eval()  # 转换成测试模式

    """
    inappropriate way of seq prediction: 
    use all real data to predict the number of next month
    """
    test_x = data_x.reshape((-1, 1, inp_dim))
    var_data = torch.tensor(test_x, dtype=torch.float32, device=device)
    eval_y = net(var_data)  # 测试集的预测结果
    pred_y = eval_y.view(-1).cpu().data.numpy()

    plt.plot(pred_y[1:], 'r', label='pred inappr', alpha=0.3)
    plt.plot(data_y, 'b', label='real', alpha=0.3)
    plt.plot([train_size, train_size], [-1, 2], label='train | pred')

    """
    appropriate way of seq prediction: 
    use real+pred data to predict the number of next 3 years.
    """
    test_x = data_x.reshape((-1, 1, inp_dim))
    test_x[train_size:] = 0  # delete the data of next 3 years.
    test_x = torch.tensor(test_x, dtype=torch.float32, device=device)
    for i in range(train_size, len(data) - 2):
        test_y = net(test_x[:i])
        test_x[i, 0, 0] = test_x[i - 1, 0, 1]
        test_x[i, 0, 1] = test_y[-1, 0]
    pred_y = test_x.cpu().data.numpy()
    pred_y = pred_y[:, 0, 0]
    plt.plot(pred_y[2:], 'g', label='pred appr')

    plt.legend(loc='best')
    plt.savefig('lstm_origin.png')
    plt.pause(4)


class RegLSTM(nn.Module):
    def __init__(self, inp_dim, out_dim, mid_dim, mid_layers):
        super(RegLSTM, self).__init__()
        # inp_dim 是LSTM输入张量的维度，我们已经根据我们的数据确定了这个值是3
        # mid_dim 是LSTM三个门 (gate) 的网络宽度，也是LSTM输出张量的维度
        # num_layers
        #         input_size: The number of expected features in the input `x`
        #         hidden_size: The number of features in the hidden state `h`
        #         num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
        #             would mean stacking two LSTMs together to form a `stacked LSTM`,
        #             with the second LSTM taking in outputs of the first LSTM and
        #             computing the final results. Default: 1
        #         bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
        #             Default: ``True``
        #         batch_first: If ``True``, then the input and output tensors are provided
        #             as (batch, seq, feature). Default: ``False``
        #         dropout: If non-zero, introduces a `Dropout` layer on the outputs of each
        #             LSTM layer except the last layer, with dropout probability equal to
        #             :attr:`dropout`. Default: 0
        #         bidirectional: If ``True``, becomes a bidirectional LSTM. Default: ``False``
        #         proj_size: If ``> 0``, will use LSTM with projections of corresponding size. Default: 0
        # inp_dim输入维度，mid_dim:输出维度，mid_layers:循环神经网络的层数，
        self.rnn = nn.LSTM(inp_dim, mid_dim, mid_layers)  # rnn
        # 为了进行时间序列预测，我们在LSTM后面接上两层全连接层（1层亦可），同时改变最终输出张量的维度，我们只需要预测客流量这一个值，因此out_dim 为1。在LSTM后方的全连接层也可以看做是一个回归操作 regression。
        #
        # 在LSTM后面接上两层全连接层，为何是两层： 理论上足够宽，并且至少存在一层具有任何一种“挤压”性质的激活函数的两层全连接层就能拟合任何连续函数。最先提出这个理论证明的是 Barron et al., 1993，使用了UAT (Universal Approximation Theorem)，指出了可以在compact domain拟合任意多项式函数。”
        # 实际上对于过于复杂的连续函数，这个「足够宽」不容易满足。并且拟合训练数据并让神经网络具备足够的泛化性的前提是：良好的训练方法（比如批次训练数据满足 独立同分布 (i.i.d.)，良好的损失函数，满足Lipschitz连续 etc.），具体内容可以看：（引号内容为摘抄 数学爱好者在知乎上的回答 ）“
        # lstm 后面还接上了两层全连接
        self.reg = nn.Sequential(
            nn.Linear(mid_dim, mid_dim),
            nn.Tanh(),
            nn.Linear(mid_dim, out_dim),
        )  # regression

    def forward(self, x):
        y = self.rnn(x)[0]  # y, (h, c) = self.rnn(x)

        seq_len, batch_size, hid_dim = y.shape
        y = y.view(-1, hid_dim)
        y = self.reg(y)
        y = y.view(seq_len, batch_size, -1)
        return y

    """
    PyCharm Crtl+click nn.LSTM() jump to code of PyTorch:
    Examples::
        >>> rnn = nn.LSTM(10, 20, 2)
        >>> input = torch.randn(5, 3, 10)
        >>> h0 = torch.randn(2, 3, 20)
        >>> c0 = torch.randn(2, 3, 20)
        >>> output, (hn, cn) = rnn(input, (h0, c0))
    """
    # hc c是memory,h是 output
    def output_y_hc(self, x, hc):
        # hc (1*1*8, 1*1*8)
        y, hc = self.rnn(x, hc)  # y, (h, c) = self.rnn(x)
        # 107*1*8
        seq_len, batch_size, hid_dim = y.size()
        y = y.view(-1, hid_dim)  # 训练集 107*8 测试集 1*8
        y = self.reg(y)  # 107*1
        y = y.view(seq_len, batch_size, -1)  # 107*1*1
        return y, hc


class RegGRU(nn.Module):
    def __init__(self, inp_dim, out_dim, mod_dim, mid_layers):
        super(RegGRU, self).__init__()

        self.rnn = nn.GRU(inp_dim, mod_dim, mid_layers)
        self.reg = nn.Linear(mod_dim, out_dim)

    def forward(self, x):
        x, h = self.rnn(x)  # (seq, batch, hidden)

        seq_len, batch_size, hid_dim = x.shape
        x = x.view(-1, hid_dim)
        x = self.reg(x)
        x = x.view(seq_len, batch_size, -1)
        return x

    def output_y_h(self, x, h):
        y, h = self.rnn(x, h)

        seq_len, batch_size, hid_dim = y.size()
        y = y.view(-1, hid_dim)
        y = self.reg(y)
        y = y.view(seq_len, batch_size, -1)
        return y, h


def load_data():
    # passengers number of international airline , 1949-01 ~ 1960-12 per month
    seq_number = np.array(
        [112., 118., 132., 129., 121., 135., 148., 148., 136., 119., 104.,
         118., 115., 126., 141., 135., 125., 149., 170., 170., 158., 133.,
         114., 140., 145., 150., 178., 163., 172., 178., 199., 199., 184.,
         162., 146., 166., 171., 180., 193., 181., 183., 218., 230., 242.,
         209., 191., 172., 194., 196., 196., 236., 235., 229., 243., 264.,
         272., 237., 211., 180., 201., 204., 188., 235., 227., 234., 264.,
         302., 293., 259., 229., 203., 229., 242., 233., 267., 269., 270.,
         315., 364., 347., 312., 274., 237., 278., 284., 277., 317., 313.,
         318., 374., 413., 405., 355., 306., 271., 306., 315., 301., 356.,
         348., 355., 422., 465., 467., 404., 347., 305., 336., 340., 318.,
         362., 348., 363., 435., 491., 505., 404., 359., 310., 337., 360.,
         342., 406., 396., 420., 472., 548., 559., 463., 407., 362., 405.,
         417., 391., 419., 461., 472., 535., 622., 606., 508., 461., 390.,
         432.], dtype=np.float32)
    # assert seq_number.shape == (144, )
    # plt.plot(seq_number)
    # plt.ion()
    # plt.pause(1)
    # 一维行向量变成二维矩阵，矩阵的第二个维度是1
    seq_number = seq_number[:, np.newaxis]

    # print(repr(seq))
    # 1949~1960, 12 years, 12*12==144 month
    seq_year = np.arange(12)
    seq_month = np.arange(12)
    # [ 0  0  0  0  0  0  0  0  0  0  0  0  1  1  1  1  1  1  1  1  1  1  1  1 2  2  2  2  2  2  2  2  2  2  2  2  3  3  3  3  3  3  3  3  3  3  3  3 4  4  4  4  4  4  4  4  4  4  4  4  5  5  5  5  5  5  5  5  5  5  5  5 6  6  6  6  6  6  6  6  6  6  6  6  7  7  7  7  7  7  7  7  7  7  7  7 8  8  8  8  8  8  8  8  8  8  8  8  9  9  9  9  9  9  9  9  9  9  9  9 10 10 10 10 10 10 10 10 10 10 10 10 11 11 11 11 11 11 11 11 11 11 11 11]
    test0 = np.repeat(seq_year, len(seq_month))
    print(test0)
    # [ 0  1  2  3  4  5  6  7  8  9 10 11  0  1  2  3  4  5  6  7  8  9 10 11 0  1  2  3  4  5  6  7  8  9 10 11  0  1  2  3  4  5  6  7  8  9 10 11 0  1  2  3  4  5  6  7  8  9 10 11  0  1  2  3  4  5  6  7  8  9 10 11 0  1  2  3  4  5  6  7  8  9 10 11  0  1  2  3  4  5  6  7  8  9 10 11 0  1  2  3  4  5  6  7  8  9 10 11  0  1  2  3  4  5  6  7  8  9 10 11 0  1  2  3  4  5  6  7  8  9 10 11  0  1  2  3  4  5  6  7  8  9 10 11]
    test1 = np.tile(seq_month, len(seq_year))
    print(test1)
    # np.repeat：复制的是多维数组的每一个元素；axis来控制复制的行和列
    # np.tile：复制的是多维数组本身；
    # (144, 2)
    seq_year_month = np.transpose(
        [np.repeat(seq_year, len(seq_month)),
         np.tile(seq_month, len(seq_year))],
    )  # Cartesian Product

    seq = np.concatenate((seq_number, seq_year_month), axis=1)

    # Standardization 这个应该成为标准化
    seq = (seq - seq.mean(axis=0)) / seq.std(axis=0)
    return seq


if __name__ == '__main__':
    run_train_lstm()
    # run_train_gru()
    # run_origin()