Pytorch实现RNN原理

最新推荐文章于 2024-09-15 12:46:45 发布

J k l

最新推荐文章于 2024-09-15 12:46:45 发布

阅读量685

点赞数 1

分类专栏：机器学习神经网络文章标签： rnn

本文链接：https://blog.csdn.net/qq_43056256/article/details/114272542

版权

机器学习同时被 2 个专栏收录

4 篇文章 0 订阅

订阅专栏

神经网络

3 篇文章 0 订阅

订阅专栏

Pytorch实现RNN原理

rnn公式如下。
${{\rm{h}}_t} = \tanh\left({W_{hh}}{{\rm{h}}_{t - 1}} + {W_{ih}}{X_t}\right)$
其中 ${X_t}$ 表示t时刻的输入序列。Pytorch中RNN的输入 ${X}$ 大小为[seq, batch_size, embedding]。
所以 ${X_t}$ 的大小为[batch_size, embedding]。其中embedding维度是要参与运算的维度，batch_size是要保留的信息。所以一般将 ${X_t}$ 的大小写成转置的形式[embedding, batch_size]。

${W_{ih}}{X_t}$ 的结果大小为[hidden_size, batch_size]。从矩阵的角度来理解：结果中的batch_size个列向量（每个维度为hidden_size）分别只由各自对应的序列计算得到，只依赖于自身序列，而与batch中其他的句子无关。

上次写的还是有很多小错误，进行了改正

import numpy as np
import torch
from torch import nn
from torch.nn.parameter import Parameter

class Rnn(nn.Module):
    """Multi-layer tanh RNN (no bias) written out with explicit matmuls.

    For every time step ``t`` and layer ``l`` the update is::

        h[l] = tanh(W_ih[l] @ inp + W_hh[l] @ h[l])

    where ``inp`` is the transposed input slice ``x[t].T`` for layer 0 and
    the freshly updated hidden state of layer ``l - 1`` otherwise. Hidden
    states are kept column-wise as ``[hidden_size, batch]`` while computing
    and are transposed to ``[batch, hidden_size]`` before being returned.

    All weights are ``float64`` parameters drawn uniformly from ``[0, 1)``
    once, at construction time, so they are visible to an optimizer and
    stay the same between forward passes.
    """

    def __init__(self, input_size, hidden_size, num_layers, bidirectional=False):
        """Create the weight matrices.

        :param input_size: size of the last (feature) dimension of the input
        :param hidden_size: size of every layer's hidden state
        :param num_layers: number of stacked recurrent layers (>= 1)
        :param bidirectional: stored on the instance only; ``forward`` does
            not read it, so the network is always unidirectional
        :raises ValueError: if ``num_layers`` is smaller than 1
        """
        super(Rnn, self).__init__()
        if num_layers < 1:
            raise ValueError('num_layers must be at least 1')
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional

        # Input-to-hidden weights of layers 1..num_layers-1, stacked along
        # dim 0 (empty when there is a single layer).
        self.Wih = Parameter(torch.from_numpy(
            np.random.random((num_layers - 1, hidden_size, hidden_size))))
        # Hidden-to-hidden weights of every layer, stacked along dim 0.
        self.Whh = Parameter(torch.from_numpy(
            np.random.random((num_layers, hidden_size, hidden_size))))
        # Layer 0 maps input features to the hidden space, hence the
        # [hidden_size, input_size] shape.
        self.Wih0 = Parameter(torch.from_numpy(
            np.random.random((hidden_size, input_size))))

    def forward(self, x):
        """Run the RNN over a whole sequence, starting from a zero state.

        :param x: tensor of shape [seq, batch_size, input_size]
        :return: ``(out, hidden)`` where ``out`` has shape
            [seq, batch_size, hidden_size] (last layer's state at every step)
            and ``hidden`` has shape [num_layers, batch_size, hidden_size]
            (every layer's state after the final step)
        :raises TypeError: if ``x`` is not a ``torch.Tensor``
        :raises ValueError: if ``x`` is not 3-D or its feature size differs
            from ``input_size``
        """
        if not isinstance(x, torch.Tensor):
            raise TypeError('x is not tensor')
        if x.dim() != 3 or x.shape[2] != self.input_size:
            raise ValueError(
                'x must have shape [seq, batch_size, %d], got %s'
                % (self.input_size, tuple(x.shape)))

        # Match the parameters so that e.g. float32 inputs can be multiplied
        # with the float64 weights.
        x = x.to(dtype=self.Wih0.dtype, device=self.Wih0.device)
        seq_len, batch_size = x.shape[0], x.shape[1]

        # One [hidden_size, batch] state per layer. A Python list (instead of
        # in-place writes into one tensor) keeps the autograd graph intact,
        # so gradients flow through time and across layers.
        hidden = [x.new_zeros(self.hidden_size, batch_size)
                  for _ in range(self.num_layers)]
        out = []

        for step in range(seq_len):
            # [batch, feature] -> [feature, batch]
            layer_input = x[step].transpose(0, 1)
            for layer in range(self.num_layers):
                w_in = self.Wih0 if layer == 0 else self.Wih[layer - 1]
                hidden[layer] = torch.tanh(
                    torch.matmul(w_in, layer_input) +
                    torch.matmul(self.Whh[layer], hidden[layer]))
                layer_input = hidden[layer]
            out.append(hidden[-1])

        # torch.tensor() cannot convert a list of tensors; stack them instead.
        if out:
            stacked_out = torch.stack(out)
        else:
            stacked_out = x.new_zeros(0, self.hidden_size, batch_size)
        return (stacked_out.permute(0, 2, 1).contiguous(),
                torch.stack(hidden).permute(0, 2, 1).contiguous())


if __name__ == '__main__':
    # Small smoke test: train the hand-written Rnn for a few steps and
    # compare tensor shapes with torch's built-in nn.RNN.
    a = torch.tensor([1, 2, 3])
    print(torch.cuda.is_available(), type(a))
    rnn = Rnn(1, 5, 4)
    rnn_office = nn.RNN(1, 5, 4)
    optimizer = torch.optim.Adam(params=rnn.parameters(), lr=0.1)

    # [seq=6, batch=3, feature=1], float64 because it comes from numpy.
    inputs = torch.tensor(np.random.random((6, 3, 1)))

    # Loop-invariant pieces of the toy objective: one class label per batch
    # element and the loss module.
    label = torch.tensor([1, 2, 3])
    criticism = nn.CrossEntropyLoss()

    for _ in range(10):
        out, h = rnn(inputs)
        # Average the returned hidden states over dim 0 and use the result
        # as the logits handed to the loss.
        pred = h.mean(dim=0)
        loss = criticism(pred, label)
        print(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # nn.RNN uses float32 weights, so the input has to be cast for it.
    print(rnn_office(inputs.to(torch.float32))[1].shape)
    print(f'seq is {inputs.shape[0]}, batch_size is {inputs.shape[1]} ', 'out.shape ', out.shape, ' h.shape ', h.shape)

分割线

我又将代码稍微调整，使得其可以进行梯度下降计算。

import numpy as np
import torch
from torch import nn

class Rnn(nn.Module):
    """Multi-layer tanh RNN (no bias) with trainable weights.

    For every time step ``t`` and layer ``l`` the update is::

        h[l] = tanh(W_ih[l] @ inp + W_hh[l] @ h[l])

    where ``inp`` is the transposed input slice ``x[t].T`` for layer 0 and
    the freshly updated hidden state of layer ``l - 1`` otherwise. States are
    kept (and returned) column-wise, i.e. as ``[hidden_size, batch]``.

    The weights are ``float64`` ``nn.Parameter`` objects drawn uniformly from
    ``[0, 1)`` once at construction, so ``parameters()`` exposes them to an
    optimizer and repeated forward passes use the same values.
    """

    def __init__(self, input_size, hidden_size, num_layers, bidirectional=False):
        """Create the weight matrices.

        :param input_size: size of the last (feature) dimension of the input
        :param hidden_size: size of every layer's hidden state
        :param num_layers: number of stacked recurrent layers (>= 1)
        :param bidirectional: stored on the instance only; ``forward`` does
            not read it
        :raises ValueError: if ``num_layers`` is smaller than 1
        """
        super(Rnn, self).__init__()
        if num_layers < 1:
            raise ValueError('num_layers must be at least 1')
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional

        # Layer 0: input features -> hidden, shape [hidden_size, input_size].
        self.Wih0 = nn.Parameter(torch.from_numpy(
            np.random.random((hidden_size, input_size))))
        # Layers 1..num_layers-1: hidden -> hidden, stacked along dim 0.
        self.Wih = nn.Parameter(torch.from_numpy(
            np.random.random((num_layers - 1, hidden_size, hidden_size))))
        # Recurrent weights of every layer, stacked along dim 0.
        self.Whh = nn.Parameter(torch.from_numpy(
            np.random.random((num_layers, hidden_size, hidden_size))))

    def forward(self, x):
        """Run the RNN over a whole sequence, starting from a zero state.

        :param x: array-like or tensor of shape [seq, batch_size, input_size]
        :return: ``(out, hidden)`` where ``out`` has shape
            [seq, hidden_size, batch_size] (last layer's state at every step)
            and ``hidden`` has shape [num_layers, hidden_size, batch_size]
            (every layer's state after the final step)
        :raises ValueError: if ``x`` is not 3-D or its feature size differs
            from ``input_size``
        """
        if not isinstance(x, torch.Tensor):
            # Convert first, then read the shape, so plain lists work too.
            x = torch.from_numpy(np.asarray(x))
        if x.dim() != 3 or x.shape[2] != self.input_size:
            raise ValueError(
                'x must have shape [seq, batch_size, %d], got %s'
                % (self.input_size, tuple(x.shape)))
        x = x.to(dtype=self.Wih0.dtype, device=self.Wih0.device)
        seq_len, batch_size = x.shape[0], x.shape[1]

        # Keep the per-layer states in a Python list: writing them in place
        # into one tensor would modify values autograd saved for backward().
        hidden = [x.new_zeros(self.hidden_size, batch_size)
                  for _ in range(self.num_layers)]
        out = []

        for step in range(seq_len):
            # [batch, feature] -> [feature, batch]
            layer_input = x[step].transpose(0, 1)
            for layer in range(self.num_layers):
                w_in = self.Wih0 if layer == 0 else self.Wih[layer - 1]
                hidden[layer] = torch.tanh(
                    torch.matmul(w_in, layer_input) +
                    torch.matmul(self.Whh[layer], hidden[layer]))
                layer_input = hidden[layer]
            out.append(hidden[-1])

        # torch.tensor() cannot convert a list of tensors; stack them instead.
        if out:
            stacked_out = torch.stack(out)
        else:
            stacked_out = x.new_zeros(0, self.hidden_size, batch_size)
        return stacked_out, torch.stack(hidden)


def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` element-wise.

    Only ``exp(-|x|)`` is ever evaluated, which lies in ``(0, 1]``. That
    avoids the overflow of ``exp(x)`` for large positive ``x`` and the
    division by zero of ``1 / exp(x)`` for very negative ``x``.

    :param x: scalar or array-like of real numbers
    :return: NumPy scalar for scalar input, otherwise an ndarray shaped
        like ``x``
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        # Integer/bool input: switch to float so negation cannot wrap.
        x = x.astype(np.float64)
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same value.
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d array into a scalar and leaves n-d
    # arrays unchanged.
    return result[()]


if __name__ == '__main__':
    # Smoke test: run one forward pass and print the resulting shapes.
    a = torch.tensor([1, 2, 3])
    print(torch.cuda.is_available(), type(a))
    rnn = Rnn(1, 5, 4)
    # [seq=6, batch=2, feature=1]; named `inputs` so the builtin input()
    # is not shadowed.
    inputs = np.random.random((6, 2, 1))
    out, h = rnn(inputs)
    print(f'seq is {inputs.shape[0]}, batch_size is {inputs.shape[1]} ', 'out.shape ', out.shape, ' h.shape ', h.shape)

分割线

首先说明代码只是帮助理解，并未写出梯度下降部分，默认参数已经被固定，不影响理解。代码主要实现RNN原理，只使用numpy库，不可用于GPU加速。

import numpy as np


class Rnn():
    """NumPy-only multi-layer tanh RNN (no bias, forward pass only).

    For every time step ``t`` and layer ``l`` the update is::

        h[l] = tanh(W_ih[l] @ inp + W_hh[l] @ h[l])

    where ``inp`` is the transposed input slice ``x[t].T`` for layer 0 and
    the freshly updated hidden state of layer ``l - 1`` otherwise. States are
    kept (and returned) column-wise, i.e. as ``[hidden_size, batch]``.

    The weights are drawn uniformly from ``[0, 1)`` once, at construction
    time, and then stay fixed, so repeated ``feed`` calls on the same input
    return the same result.
    """

    def __init__(self, input_size, hidden_size, num_layers, bidirectional=False):
        """Create the fixed weight matrices.

        :param input_size: size of the last (feature) dimension of the input
        :param hidden_size: size of every layer's hidden state
        :param num_layers: number of stacked recurrent layers (>= 1)
        :param bidirectional: stored on the instance only; ``feed`` does not
            read it
        :raises ValueError: if ``num_layers`` is smaller than 1
        """
        if num_layers < 1:
            raise ValueError('num_layers must be at least 1')
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional

        # Wih[0] is [hidden_size, input_size]; every later layer consumes the
        # previous layer's hidden state, so it is [hidden_size, hidden_size].
        self.Wih = [np.random.random((hidden_size, input_size))]
        self.Wih.extend(np.random.random((hidden_size, hidden_size))
                        for _ in range(num_layers - 1))
        self.Whh = [np.random.random((hidden_size, hidden_size))
                    for _ in range(num_layers)]

    def feed(self, x):
        """Run the RNN over a whole sequence, starting from a zero state.

        :param x: array-like of shape [seq, batch_size, input_size]
        :return: ``(out, hidden)`` where ``out`` has shape
            [seq, hidden_size, batch_size] (last layer's state at every step)
            and ``hidden`` has shape [num_layers, hidden_size, batch_size]
            (every layer's state after the final step)
        :raises ValueError: if ``x`` is not 3-D or its feature size differs
            from ``input_size``
        """
        # Convert first, then read the shape, so plain lists work too.
        x = np.asarray(x)
        if x.ndim != 3 or x.shape[2] != self.input_size:
            raise ValueError(
                'x must have shape [seq, batch_size, %d], got %s'
                % (self.input_size, x.shape))
        seq_len, batch_size = x.shape[0], x.shape[1]

        hidden = [np.zeros((self.hidden_size, batch_size))
                  for _ in range(self.num_layers)]
        out = []

        for step in range(seq_len):
            # [batch, feature] -> [feature, batch]
            layer_input = x[step].T
            for layer in range(self.num_layers):
                hidden[layer] = np.tanh(
                    np.dot(self.Wih[layer], layer_input) +
                    np.dot(self.Whh[layer], hidden[layer]))
                layer_input = hidden[layer]
            out.append(hidden[-1])

        # The reshape is a no-op for non-empty sequences and gives an empty
        # sequence the proper [0, hidden_size, batch_size] shape.
        stacked_out = np.array(out).reshape(-1, self.hidden_size, batch_size)
        return stacked_out, np.array(hidden)


def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` element-wise.

    Only ``exp(-|x|)`` is ever evaluated, which lies in ``(0, 1]``. That
    avoids the overflow of ``exp(x)`` for large positive ``x`` and the
    division by zero of ``1 / exp(x)`` for very negative ``x``.

    :param x: scalar or array-like of real numbers
    :return: NumPy scalar for scalar input, otherwise an ndarray shaped
        like ``x``
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        # Integer/bool input: switch to float so negation cannot wrap.
        x = x.astype(np.float64)
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same value.
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d array into a scalar and leaves n-d
    # arrays unchanged.
    return result[()]


if __name__ == '__main__':
    # Smoke test: run one forward pass and print the resulting shapes.
    rnn = Rnn(1, 5, 4)
    # [seq=6, batch=2, feature=1]; named `inputs` so the builtin input()
    # is not shadowed.
    inputs = np.random.random((6, 2, 1))
    out, h = rnn.feed(inputs)
    print(f'seq is {inputs.shape[0]}, batch_size is {inputs.shape[1]} ', 'out.shape ', out.shape, ' h.shape ', h.shape)