LSTM2

最新推荐文章于 2024-10-04 20:30:00 发布
向上Claire
最新推荐文章于 2024-10-04 20:30:00 发布
阅读量97
点赞数
分类专栏：研一　文章标签：python、深度学习、开发语言
本文链接：https://blog.csdn.net/weixin_44522477/article/details/128747703
版权
研一专栏收录该内容
24 篇文章 1 订阅
订阅专栏
我真的太棒了吧
"""
---
title: Long Short-Term Memory (LSTM)
summary: A simple PyTorch implementation/tutorial of Long Short-Term Memory (LSTM) modules.
---
# Long Short-Term Memory (LSTM)
This is a [PyTorch](https://pytorch.org) implementation of Long Short-Term Memory.
"""

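# Optional[X] is a type hint meaning "either X or None"; on its own it does not give a parameter a default value.
# Tuple[A, B] is a type hint for a fixed-length tuple whose items have types A and B; it performs no conversion at runtime.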
from typing import Optional, Tuple

import torch
from torch import nn

from labml_helpers.module import Module


class LSTMCell(Module):
    r"""
    ## Long Short-Term Memory Cell

    LSTM Cell computes $c$, and $h$. $c$ is like the long-term memory,
    and $h$ is like the short term memory.
    We use the input $x$ and $h$ to update the long term memory.
    In the update, some features of $c$ are cleared with a forget gate $f$,
    and some features $i$ are added through a gate $g$.

    The new short term memory is the $\tanh$ of the long-term memory
    multiplied by the output gate $o$.

    Note that the cell doesn't look at long term memory $c$ when doing the update. It only modifies it.
    Also $c$ never goes through a linear transformation.
    This is what solves vanishing and exploding gradients.

    Here's the update rule.

    \begin{align}
    c_t &= \sigma(f_t) \odot c_{t-1} + \sigma(i_t) \odot \tanh(g_t) \\
    h_t &= \sigma(o_t) \odot \tanh(c_t)
    \end{align}

    $\odot$ stands for element-wise multiplication.

    Intermediate values and gates are computed as linear transformations of the hidden
    state and input.

    \begin{align}
    i_t &= lin_x^i(x_t) + lin_h^i(h_{t-1}) \\
    f_t &= lin_x^f(x_t) + lin_h^f(h_{t-1}) \\
    g_t &= lin_x^g(x_t) + lin_h^g(h_{t-1}) \\
    o_t &= lin_x^o(x_t) + lin_h^o(h_{t-1})
    \end{align}

    The docstring is a raw string so that the LaTeX backslashes (`\tanh`, `\begin`,
    `\sigma`, ...) are kept literally instead of being read as escape sequences.
    """

    def __init__(self, input_size: int, hidden_size: int, layer_norm: bool = False):
        """
        * `input_size` is the number of features in the input $x$
        * `hidden_size` is the number of features in $h$ and $c$
        * `layer_norm` selects whether the gate pre-activations and $c_t$ are layer-normalized
        """
        super().__init__()

        # These are the linear layers that transform the `input` and `hidden` vectors.
        # Only one of them needs a bias since the two transformations are added together.

        # This combines $lin_x^i$, $lin_x^f$, $lin_x^g$, and $lin_x^o$ transformations:
        # a single layer maps `input_size` features to `4 * hidden_size` outputs.
        self.input_lin = nn.Linear(input_size, 4 * hidden_size, bias=False)
        # This combines $lin_h^i$, $lin_h^f$, $lin_h^g$, and $lin_h^o$ transformations.
        self.hidden_lin = nn.Linear(hidden_size, 4 * hidden_size)

        # Whether to apply layer normalizations.
        #
        # $i$, $f$, $g$ and $o$ embeddings are normalized and $c_t$ is normalized in
        # $h_t = o_t \odot \tanh(\mathop{LN}(c_t))$
        if layer_norm:
            # One `LayerNorm` over the `hidden_size` features for each of the four gates,
            # kept in a `ModuleList` so that their parameters are registered.
            self.layer_norm = nn.ModuleList([nn.LayerNorm(hidden_size) for _ in range(4)])
            # Normalization applied to the new cell state before the output `tanh`.
            self.layer_norm_c = nn.LayerNorm(hidden_size)
        else:
            # `nn.Identity` returns its input unchanged, so `forward` can use the same
            # code path whether or not layer normalization is enabled.
            self.layer_norm = nn.ModuleList([nn.Identity() for _ in range(4)])
            self.layer_norm_c = nn.Identity()

    def forward(self, x: torch.Tensor, h: torch.Tensor, c: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Compute one step and return the pair `(h_next, c_next)`.

        `x` is the input whose last dimension is `input_size`;
        `h` and `c` are the previous states whose last dimension is `hidden_size`.
        """
        # We compute the linear transformations for $i_t$, $f_t$, $g_t$ and $o_t$
        # using the same linear layers; the last dimension is `4 * hidden_size`.
        ifgo = self.hidden_lin(h) + self.input_lin(x)
        # Split the last dimension into four equal chunks, one per gate.
        ifgo = ifgo.chunk(4, dim=-1)

        # Apply the per-gate normalization (identity when `layer_norm` was `False`).
        # $$i_t, f_t, g_t, o_t$$
        i, f, g, o = (norm(gate) for norm, gate in zip(self.layer_norm, ifgo))

        # $$c_t = \sigma(f_t) \odot c_{t-1} + \sigma(i_t) \odot \tanh(g_t) $$
        c_next = torch.sigmoid(f) * c + torch.sigmoid(i) * torch.tanh(g)

        # $$h_t = \sigma(o_t) \odot \tanh(c_t)$$
        # Optionally, apply layer norm to $c_t$
        h_next = torch.sigmoid(o) * torch.tanh(self.layer_norm_c(c_next))

        return h_next, c_next


class LSTM(Module):
    """
    ## Multilayer LSTM

    A stack of `LSTMCell`s applied step by step along the first dimension of the input.
    """

    def __init__(self, input_size: int, hidden_size: int, n_layers: int):
        """
        Create a network of `n_layers` of LSTM.

        * `input_size` is the number of features of the input to the first layer
        * `hidden_size` is the number of features of $h$ and $c$ in every layer
        * `n_layers` is the number of stacked cells; it must be at least 1

        Raises `ValueError` if `n_layers` is less than 1.
        """
        # With fewer than one layer there is no final layer whose output could be
        # collected in `forward`, so reject it up front with a clear message.
        if n_layers < 1:
            raise ValueError(f"n_layers must be at least 1, got {n_layers}")

        super().__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        # Create cells for each layer. Note that only the first layer gets the input directly.
        # Rest of the layers get the input from the layer below
        self.cells = nn.ModuleList([LSTMCell(input_size, hidden_size)] +
                                   [LSTMCell(hidden_size, hidden_size) for _ in range(n_layers - 1)])

    def forward(self, x: torch.Tensor, state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None):
        """
        `x` has shape `[n_steps, batch_size, input_size]` and
        `state` is a tuple of $h$ and $c$, each with a shape of
        `[n_layers, batch_size, hidden_size]` (one slice per layer along the first dimension).

        Returns `(out, (h, c))` where `out` stacks the final layer's $h$ at every step,
        giving `[n_steps, batch_size, hidden_size]`, and `h`, `c` stack the last states of
        all layers, giving `[n_layers, batch_size, hidden_size]`.
        """
        n_steps, batch_size = x.shape[:2]

        # Initialize the state if `None`
        if state is None:
            # Zero states created with `x.new_zeros`, so they share the dtype and device of `x`.
            h = [x.new_zeros(batch_size, self.hidden_size) for _ in range(self.n_layers)]
            c = [x.new_zeros(batch_size, self.hidden_size) for _ in range(self.n_layers)]
        else:
            (h, c) = state
            # Reverse stack the tensors to get the states of each layer
            #
            # You can just work with the tensor itself but this is easier to debug
            h, c = list(torch.unbind(h)), list(torch.unbind(c))

        # Array to collect the outputs of the final layer at each time step.
        out = []
        for t in range(n_steps):
            # Input to the first layer is the input itself
            inp = x[t]
            # Loop through the layers from bottom to top
            for layer, cell in enumerate(self.cells):
                # Update the state of this layer
                h[layer], c[layer] = cell(inp, h[layer], c[layer])
                # Input to the next layer is the state of this layer
                inp = h[layer]
            # Collect the output $h$ of the final layer
            out.append(h[-1])

        # Stack the outputs and states.
        # `torch.stack` rejects an empty list, so a zero-length sequence gets an
        # explicitly empty output tensor instead of raising.
        if out:
            out = torch.stack(out)
        else:
            out = x.new_zeros(0, batch_size, self.hidden_size)
        h = torch.stack(h)
        c = torch.stack(c)

        return out, (h, c)

# Demo: a 2-layer LSTM with input_size = 5 and hidden_size = 6.
# In the first cell `input_lin` maps 5 -> 24 features and `hidden_lin` maps 6 -> 24,
# where 24 = 4 * hidden_size (one chunk of 6 per gate).
LS = LSTM(5,6,2)
# A random input of shape [n_steps = 1, batch_size = 3, input_size = 5], for example
# tensor([[[ 0.3184, -0.7862,  0.5178, -0.2246, -0.2944],
#          [ 2.4391,  1.0972,  0.0428,  0.8763,  1.2474],
#          [ 0.4803,  1.6502, -0.2327, -0.1479, -1.4880]]])
# Named `inputs` rather than `input` so the builtin `input` is not shadowed.
inputs = torch.randn(1,3,5)
output , (hn,cn) = LS(inputs)
# `output` has shape [1, 3, 6]; with a single step it holds the same values as the
# last layer's slice of `hn`.
print(output)
print("output:",output.shape)
# `hn` has shape [2, 3, 6]: one [3, 6] hidden state per layer.
print(hn)
print("hn:",hn.shape)
# `cn` has shape [2, 3, 6]: one [3, 6] cell state per layer.
print(cn)
print("cn:",cn.shape)