Understanding, Reimplementing, and Applying RNNs


Understanding RNNs

RNN parameters in PyTorch

nn.RNN(
    input_size=4,
    hidden_size=3,
    # number of stacked RNN layers
    # note: setting num_layers=2 stacks two RNNs, with the second RNN taking the first RNN's outputs as input and computing the final results
    num_layers=1,        # default
    nonlinearity='tanh', # default
    bias=True,           # default
    batch_first=False,   # default
    dropout=0,           # default
    bidirectional=False  # default
)
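As a quick illustration of num_layers (my own sketch, not from the original walkthrough; rnn1, rnn2 and x are throwaway names): stacking adds one final hidden state per layer while the output shape stays the same.

import torch
import torch.nn as nn

# compare a single-layer RNN with a two-layer stacked RNN
rnn1 = nn.RNN(input_size=4, hidden_size=3, num_layers=1, batch_first=True)
rnn2 = nn.RNN(input_size=4, hidden_size=3, num_layers=2, batch_first=True)

x = torch.randn(2, 7, 4)          # (batch_size, seq_len, input_size)
out1, hn1 = rnn1(x)
out2, hn2 = rnn2(x)

print(out1.shape, hn1.shape)      # torch.Size([2, 7, 3]) torch.Size([1, 2, 3])
print(out2.shape, hn2.shape)      # torch.Size([2, 7, 3]) torch.Size([2, 2, 3])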

RNN input, hidden state, and output

'''
Input layer: 5 features,
hidden layer: 6 units,
a single RNN layer,
batch dimension first
'''
rnn = nn.RNN(input_size=5, hidden_size=6, num_layers=1, batch_first=True)

'''
Taking a one-layer RNN as an example,
initialization creates four parameters:
	weight_ih_l0 => (hidden_size, input_size)
	weight_hh_l0 => (hidden_size, hidden_size)
	bias_ih_l0 => (hidden_size)
	bias_hh_l0 => (hidden_size)

Note: why are the dimensions reversed here (and later in the reimplementation)?
For example, you might expect weight_ih_l0 to have input_size first, yet hidden_size comes first.
I am not entirely sure; looking at the source code of earlier PyTorch versions, it seems to be defined this way.
Judging from the reimplementation below, my understanding is that it saves a transpose during the matrix multiplication.
'''
'''
Input:
	when batch_first = True:
		(batch_size, seq_len, input_size)
	when batch_first = False:
		(seq_len, batch_size, input_size)
'''
input = torch.randn(3, 1, 5)  # (batch_size=3, seq_len=1, input_size=5)
'''
Hidden state:
	if h0 is not provided, it defaults to all zeros
	(num_layers * num_directions, batch_size, hidden_size)
	num_directions: unidirectional or bidirectional
	    unidirectional: num_directions = 1
	    bidirectional: num_directions = 2
'''
h0 = torch.randn(1, 3, 6)  # (num_layers * num_directions=1, batch_size=3, hidden_size=6)
'''
Output (with batch_first=True): (batch_size, seq_len, num_directions * hidden_size)
Final hidden state hn: (num_layers * num_directions, batch_size, hidden_size)
'''
output, hn = rnn(input, h0)
# show all states
print(rnn.state_dict())
OrderedDict([('weight_ih_l0', tensor(
	   [[-0.0042, -0.2252, -0.1453, -0.1967,  0.0716],
        [ 0.1114,  0.3056, -0.2761, -0.0785,  0.0996],
        [-0.3213,  0.3594,  0.0489,  0.4038, -0.3169],
        [ 0.1048,  0.3901, -0.3059, -0.1866, -0.2989],
        [-0.0664,  0.2578,  0.2714,  0.1200, -0.2181],
        [ 0.2600, -0.1105, -0.2324,  0.1316, -0.3495]])), 
        ('weight_hh_l0', tensor(
       [[ 0.3784, -0.2398,  0.0806,  0.2033, -0.4014, -0.1181],
        [-0.3243, -0.3409, -0.2328, -0.1596,  0.0307, -0.3497],
        [-0.0058,  0.1994,  0.3338, -0.1157, -0.3158,  0.3193],
        [ 0.1708,  0.3534, -0.2362, -0.1357,  0.3582, -0.3219],
        [ 0.2879,  0.1790, -0.3412,  0.1042, -0.2953, -0.0971],
        [ 0.4013, -0.2294, -0.2610, -0.0615, -0.2490, -0.1505]])), 
        ('bias_ih_l0', tensor(
        [ 0.0124, -0.2176,  0.0594,  0.1182,  0.0878, -0.2428])), 
        ('bias_hh_l0', tensor(
        [ 0.1436,  0.1387, -0.2747,  0.3427,  0.3240,  0.1577]))])

# all trainable parameters; these are the same tensors as the state dict above
params = list(rnn.parameters())
print(params)
print(len(params))
print(params[0].shape)
# trace the computation graph
print(output.grad_fn)
print(output.grad_fn.next_functions[0][0])
print(output.grad_fn.next_functions[0][0].next_functions[0][0])
print(output.grad_fn.next_functions[0][0].next_functions[0][0].next_functions[0][0])
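To make the shapes above concrete, a quick check (my addition, reusing the rnn, input and h0 defined above):

# with batch_first=True:
# output ==> (batch_size, seq_len, hidden_size)
# hn     ==> (num_layers * num_directions, batch_size, hidden_size)
print(output.shape)  # torch.Size([3, 1, 6])
print(hn.shape)      # torch.Size([1, 3, 6])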

Reimplementing the RNN

$h_t = \tanh(x_t W_{ih}^{T} + b_{ih} + h_{t-1} W_{hh}^{T} + b_{hh})$

The RNN computation revolves around this formula, which is also why the four parameters mentioned earlier exist.
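Before the full reimplementation, here is a minimal single-step sanity check of the formula (my own sketch; rnn, x and h0 are throwaway names): compute h_1 by hand from the weights of an nn.RNN and compare it with the module's output.

import torch
import torch.nn as nn

rnn = nn.RNN(input_size=5, hidden_size=6, num_layers=1, batch_first=True)
x = torch.randn(2, 1, 5)   # a single time step
h0 = torch.zeros(1, 2, 6)

out, hn = rnn(x, h0)

# h_1 = tanh(x_1 W_ih^T + b_ih + h_0 W_hh^T + b_hh)
W_ih, W_hh = rnn.weight_ih_l0, rnn.weight_hh_l0
b_ih, b_hh = rnn.bias_ih_l0, rnn.bias_hh_l0
h1 = torch.tanh(x[:, 0, :] @ W_ih.T + b_ih + h0[0] @ W_hh.T + b_hh)

print(torch.allclose(out[:, 0, :], h1, atol=1e-6))  # True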
import torch
import torch.nn as nn

class Rnn(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, bias=True, dropout=0, bidirectional=False, nonlinearity='tanh'):
        super(Rnn, self).__init__()

        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            nonlinearity=nonlinearity,
            bias=bias,
            batch_first=True,
            bidirectional=bidirectional
        )
        if dropout > 0: 
            self.dropout = nn.Dropout(p=dropout)
        self.reset_params()

    def reset_params(self):
        for i in range(self.rnn.num_layers):
            nn.init.kaiming_normal_(getattr(self.rnn, f'weight_ih_l{i}')) # initialize the input-to-hidden weights
            nn.init.kaiming_normal_(getattr(self.rnn, f'weight_hh_l{i}')) # initialize the hidden-to-hidden weights
            # the biases are 1-D, so unsqueeze them before applying the same init
            nn.init.kaiming_normal_(getattr(self.rnn, f'bias_ih_l{i}').unsqueeze(1))
            nn.init.kaiming_normal_(getattr(self.rnn, f'bias_hh_l{i}').unsqueeze(1))

    def forward(self, input, h_0=None):

        batch_size, seq_len, input_size = input.shape
        hidden_size = self.rnn.weight_ih_l0.shape[0]
        if hasattr(self, 'dropout'):
            input = self.dropout(input)
        output = torch.zeros(batch_size, seq_len, hidden_size)
        if h_0 is None:
            h_0 = torch.zeros(1, batch_size, hidden_size) # default h_0: all zeros
        # h_0 ==> (num_layers * num_directions, batch_size, hidden_size)
        h_0 = h_0.squeeze(0) # work with the 2-D view (batch_size, hidden_size)

        for t in range(seq_len):
            # input==>(batch_size, seq_len, input_size)
            # x==>(batch_size, input_size)
            x = input[:, t, :]
            # x==>(batch_size, input_size, 1)
            x = x.unsqueeze(2)

            # weight_ih_l0==>(hidden_size, input_size)
            # w_ih==>(1, hidden_size, input_size)
            w_ih = self.rnn.weight_ih_l0.unsqueeze(0)
            # w_ih==>(batch_size, hidden_size, input_size)
            w_ih = w_ih.tile(batch_size, 1, 1) # repeat along the batch dimension; the other dimensions are unchanged

            # weight_hh_l0 ==> (hidden_size, hidden_size)
            # w_hh ==> (1, hidden_size, hidden_size)
            w_hh = self.rnn.weight_hh_l0.unsqueeze(0)
            # w_hh ==> (batch_size, hidden_size, hidden_size)
            w_hh = w_hh.tile(batch_size, 1, 1)

            # (batch_size, hidden_size, input_size)@(batch_size, input_size, 1)
            # inputW_ih==>(batch_size, hidden_size, 1)
            inputW_ih = w_ih@x
            # inputW_ih==>(batch_size, hidden_size)
            inputW_ih = inputW_ih.squeeze(-1)

            # (batch_size, hidden_size, hidden_size)@(batch_size, hidden_size, 1)
            # hiddenW_hh==>(batch_size, hidden_size, 1)
            hiddenW_hh = w_hh@h_0.unsqueeze(2)
            # hiddenW_hh==>(batch_size, hidden_size)
            hiddenW_hh = hiddenW_hh.squeeze(-1)

            # h_0 = (batch_size, hidden_size)
            h_0 = torch.tanh(inputW_ih + self.rnn.bias_ih_l0 + hiddenW_hh + self.rnn.bias_hh_l0) # from here on, h_0 is no longer the initial state but the hidden state of the current time step

            output[:, t, :] = h_0

        # output==>(batch_size, seq_len, hidden_size)
        # h_0==>(num_layers * num_directions, batch_size, hidden_size)
        return output, h_0.unsqueeze(0) # the returned h_0 is the hidden state at the last time step
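A quick self-consistency check (my addition, with throwaway names model and x): since the class above reads its weights from the internal nn.RNN, the manual loop should reproduce a direct call to that module.

model = Rnn(input_size=5, hidden_size=6, num_layers=1)
x = torch.randn(3, 4, 5)               # (batch_size, seq_len, input_size)

out_manual, h_manual = model(x)        # the hand-written loop
out_builtin, h_builtin = model.rnn(x)  # the internal nn.RNN (batch_first=True)

print(torch.allclose(out_manual, out_builtin, atol=1e-6))  # True
print(torch.allclose(h_manual, h_builtin, atol=1e-6))      # True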

Applying the RNN

  • At this point I think we understand the RNN well enough, so let's apply it and predict the sin function.
  1. Define the data preprocessing
import random

import numpy as np
import torch

class Data:
    def __init__(self, args):
        self.args = args
        self.x, self.y, self.time_steps = self.process()

    def process(self):
        start = random.randint(0, 3)
        time_steps = np.linspace(start=start, stop=start + 10, num=self.args['num_time_steps'])
        data = np.sin(time_steps)
        data = data.reshape(self.args['num_time_steps'], 1)

        x = torch.tensor(data[:-1], dtype=torch.float32)  # everything except the last element
        x = x.view(1, self.args['num_time_steps'] - 1, 1)
        y = torch.tensor(data[1:], dtype=torch.float32)  # everything except the first element
        y = y.view(1, self.args['num_time_steps'] - 1, 1)
        return x, y, time_steps
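A small usage check (my addition; the args dict here is just illustrative): x and y are the same sine curve shifted by one step against each other.

args = {'num_time_steps': 50}
data = Data(args)
print(data.x.shape)           # torch.Size([1, 49, 1])
print(data.y.shape)           # torch.Size([1, 49, 1])
print(data.time_steps.shape)  # (50,)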
  2. Define the network structure
import torch.nn as nn
from model.nn import Rnn

class Net(nn.Module):
    def __init__(self, input_size, hidden_size, output_size=1, num_layers=1):
        super(Net, self).__init__()

        self.rnn = Rnn( # use the reimplemented model from above
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
        )
        self.reset_params()
        self.linear = nn.Linear(in_features=hidden_size, out_features=output_size) # if you are interested, try reimplementing Linear as well; I may cover it in a later post

    def reset_params(self):
        for p in self.rnn.parameters():
            nn.init.normal_(p, mean=0.0, std=0.001)

    def forward(self, x, h_0):
        # x==>(batch_size, seq_len, input_size)
        # h_0==>(num_layers, batch_size, hidden_size)
        hidden_size = h_0.shape[2]

        # out==>(batch_size, seq_len, hidden_size)
        # h_prev==>(num_layers, batch_size, hidden_size)
        out, h_prev = self.rnn(x, h_0)

        # out==>(batch_size * seq_len, hidden_size)
        out = out.view(-1, hidden_size)
        # out ==> (batch_size * seq_len, output_size), where output_size is the Linear layer's out_features
        out = self.linear(out)

        # out==>(1, batch_size * seq_len, output_size)
        # h_prev==>(num_layers, batch_size, hidden_size)
        return out.unsqueeze(0), h_prev
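A quick shape check of the forward pass (my addition; net, x and h_0 are throwaway names):

import torch

net = Net(input_size=1, hidden_size=16)
x = torch.randn(1, 49, 1)    # (batch_size, seq_len, input_size)
h_0 = torch.zeros(1, 1, 16)  # (num_layers, batch_size, hidden_size)

out, h_prev = net(x, h_0)
print(out.shape)     # torch.Size([1, 49, 1])
print(h_prev.shape)  # torch.Size([1, 1, 16])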
  3. Define the training function
import torch
import torch.nn as nn
import torch.optim as optim
from matplotlib import pyplot as plt
import numpy as np
from model.model import Net
from model.data import Data

def train(args):
    model = Net(
        input_size=args['input_size'],
        hidden_size=args['hidden_size']
    )
    criterion = nn.MSELoss()

    optimizer = optim.Adam(model.parameters(), args['lr'])

    # h_0==>(num_layers, batch_size, hidden_size)
    h_0 = torch.zeros(1, 1, args['hidden_size'])

    for iter in range(args['epoch']):
        # generate the ground-truth data
        data = Data(args)
        x, y, time_steps = data.x, data.y, data.time_steps

        # forward pass through the model
        output, h_prev = model(x, h_0)
        h_prev = h_prev.detach()  # detach h_prev so no gradients flow through it

        loss = criterion(output, y)
        model.zero_grad()
        loss.backward()
        optimizer.step()

        if iter % 100 == 0:
            print('iteration: {} loss {}'.format(iter, loss.item()))

    return model, output, h_prev
  4. Define the test function
def test(model, args, h_prev):
    # predict with the trained model
    # generate a batch of ground-truth values
    data = Data(args)
    x, y, time_steps = data.x, data.y, data.time_steps

    predictions = []
    input = x[:, 0, :]
    for _ in range(x.shape[1]):
        input = input.view(1, 1, 1)
        pred, h_prev = model(input, h_prev)
        input = pred
        predictions.append(np.max(pred.detach().numpy().ravel()))  # pred has shape (1, 1, 1), so this just extracts its single scalar value
    return x, y, predictions, time_steps
  5. Define the main function
def main():
    args = { # hyperparameters
        'lr': 0.01,
        'num_time_steps': 50,  # number of generated sample points
        'input_size': 1,
        'hidden_size': 16,
        'epoch': 6000
    }
    model, output, h_prev = train(args) # train
    x, y, predictions, time_steps = test(model, args, h_prev) # test

    x = x.data.numpy().ravel()  # flatten to a 1-D array

    plt.scatter(x=time_steps[:-1], y=x)
    plt.plot(time_steps[:-1], x)  # line through the ground truth
    plt.scatter(x=time_steps[1:], y=predictions)  # predicted values
    plt.show()


if __name__ == '__main__':
    main()

Predicting the sin function

In the figure, blue marks the ground-truth values and yellow marks the predictions.

Summary

  • Overall code structure
|-
|--model folder
|---nn.py # not required; this is the reimplemented model
|---model.py # the network structure
|---data.py # data preprocessing
|--main.py # the training function, test function, and main function
  • Steps for designing the network
1. Define the data preprocessing
2. Define the network structure
3. Define the training function
4. Define the test function
5. Define the main function