LSTM

最新推荐文章于 2024-06-18 19:07:26 发布

酸柠檬水

最新推荐文章于 2024-06-18 19:07:26 发布

阅读量680

点赞数 1

本文链接：https://blog.csdn.net/zr7116/article/details/91558891

版权

https://www.zhihu.com/question/268862438

RNN内部通过σ(Wx+b)将输入x转成输出h。这里的参数W每一步都是一样的，即参数共享，其维度为[embed_dim, hidden_size]。

lstm 及变体 https://www.cnblogs.com/wangduo/p/6773601.html?utm_source=itdadao&utm_medium=referral

视频：https://www.youtube.com/watch?v=EC3SvfW0Z_A

论文公式代码 lstm GRU 实现：https://blog.csdn.net/meanme/article/details/48845793

GRU LSTM图像 LSTM公式：https://blog.csdn.net/HHTNAN/article/details/79014353

http://www.ishenping.com/ArtInfo/452977.html lstm 代码

每块结构的代码：https://zhuanlan.zhihu.com/p/44424550

#http://www.ishenping.com/ArtInfo/452977.html lstm 代码

#https://zhuanlan.zhihu.com/p/44424550每块结构的代码
import warnings
# Suppress UserWarning messages attributed to the gensim module; the filter is
# installed before gensim is imported so it also applies during the import.
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim.models import word2vec
import jieba
import tensorflow as tf
import numpy as np
import time
# randint() returns a random integer within the given range.
from random import randint
# shuffle() randomly reorders the elements of a sequence in place.
from random import shuffle
#----------------------------------
def makeStopWord():
    """Build the stop-word list from the file ``停用词.txt``.

    Each line of the file is segmented with ``jieba.lcut`` in precise mode
    (``cut_all=False``) and every resulting token is collected.

    Returns:
        list: all tokens in file order. Duplicates are kept, and because the
        lines are not stripped, whatever token jieba emits for the trailing
        newline is included as well.

    Raises:
        OSError: if ``停用词.txt`` cannot be opened in the working directory.
    """
    with open('停用词.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()
    stopWord = []
    for line in lines:
        stopWord.extend(jieba.lcut(line, cut_all=False))
    return stopWord

def words2Array(lineList):
    """Convert tokenised sentences into fixed-height word-vector matrices.

    Relies on three module-level names: ``MAX_SIZE`` (rows per sentence),
    ``model`` (a loaded word2vec model) and ``dimsh`` (vector length).

    For every sentence only the first ``MAX_SIZE`` tokens are considered.
    Tokens for which ``model.wv.word_vec`` raises ``KeyError`` are skipped.
    The vectors that were found come first, followed by all-zero vectors so
    that each sentence always contributes exactly ``MAX_SIZE`` rows.

    Args:
        lineList: sequence of token lists, one per sentence.

    Returns:
        tuple: ``(linesArray, steps)`` where ``linesArray`` is the
        ``np.array`` of all sentence matrices and ``steps`` is an
        ``np.array`` holding, per sentence, how many real (non-padding)
        vectors it contains.
    """
    sentence_matrices = []
    valid_counts = []
    for tokens in lineList:
        found = []
        for token in tokens[:MAX_SIZE]:
            try:
                found.append(model.wv.word_vec(token))
            except KeyError:
                # Out-of-vocabulary token: its slot is filled by padding below.
                pass
        missing = MAX_SIZE - len(found)
        padding = [np.array([0.0] * dimsh) for _ in range(missing)]
        sentence_matrices.append(found + padding)
        valid_counts.append(len(found))
    return np.array(sentence_matrices), np.array(valid_counts)

def convert2Data(posArray, negArray, posStep, negStep):
    """Merge positive and negative samples into one shuffled data set.

    Each positive sample is tagged with the one-hot label ``[1, 0]`` and each
    negative sample with ``[0, 1]``; the combined list is then shuffled in
    place with ``random.shuffle`` so matrix, step count and label stay
    aligned.

    Args:
        posArray: per-sentence matrices of the positive samples.
        negArray: per-sentence matrices of the negative samples.
        posStep: valid-word counts, indexed like ``posArray``.
        negStep: valid-word counts, indexed like ``negArray``.

    Returns:
        tuple: ``(data, steps, labels)`` where ``data`` and ``steps`` are
        ``np.array`` objects and ``labels`` is a plain list of two-element
        lists, all in the same shuffled order.
    """
    samples = [[matrix, posStep[idx], [1, 0]]
               for idx, matrix in enumerate(posArray)]
    samples += [[matrix, negStep[idx], [0, 1]]
                for idx, matrix in enumerate(negArray)]
    shuffle(samples)

    data = np.array([sample[0] for sample in samples])
    steps = np.array([sample[1] for sample in samples])
    labels = [sample[2] for sample in samples]
    return data, steps, labels

def getWords(file):
    """Read a UTF-8 text file and tokenise every line, dropping stop words.

    Uses the module-level ``stopWord`` collection for filtering.

    Args:
        file: path of the text file; each line is treated as one sample.

    Returns:
        list: one list of tokens per line, in file order, i.e.
        ``[[word1, word2, ...], [word1, word2, ...], ...]``.

    Raises:
        OSError: if ``file`` cannot be opened.
    """
    with open(file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    # Build the lookup set once so each membership test is O(1) instead of a
    # linear scan over the stop-word list.
    stop_set = set(stopWord)
    lineList = []
    for line in lines:
        # The newline is replaced by '、' rather than removed.
        # NOTE(review): presumably '、' is in the stop-word list and is
        # filtered out below; verify against 停用词.txt.
        tokens = jieba.lcut(line.replace('\n', '、'), cut_all=False)
        lineList.append([word for word in tokens if word not in stop_set])
    return lineList

def makeData(posPath,negPath):
    """Build a shuffled, labelled data set from two review files.

    Args:
        posPath: path of the file with positive samples, one per line.
        negPath: path of the file with negative samples, one per line.

    Returns:
        tuple: ``(Data, Steps, Labels)`` exactly as returned by
        ``convert2Data``.
    """
    # Tokenise both files: [[word1, word2, ...], [word1, word2, ...], ...]
    pos = getWords(posPath)
    print("The positive data's length is :",len(pos))
    neg = getWords(negPath)
    print("The negative data's length is :",len(neg))
    # Turn the token lists into arrays of word vectors plus valid-word counts.
    posArray, posSteps = words2Array(pos)
    negArray, negSteps = words2Array(neg)
    # Mix positive and negative samples together and shuffle them.
    Data, Steps, Labels = convert2Data(posArray, negArray, posSteps, negSteps)
    return Data, Steps, Labels

#----------------------------------------------
# Available embeddings according to the author:
#   Word60.model     60-dimensional vectors
#   word2vec.model   200-dimensional vectors

timeA=time.time()
word2vec_path = 'word2vec/word2vec.model'
model=gensim.models.Word2Vec.load(word2vec_path)
dimsh=model.vector_size
# Number of word slots per sample; shorter samples are zero-padded and
# longer ones truncated by words2Array.
MAX_SIZE=25
stopWord = makeStopWord()

print("In train data:")
# trainSteps is 1-D with one entry per sample: the number of valid words.
# trainData is (samples, MAX_SIZE, dimsh), e.g. (19130, 25, 200) in the
# author's run; trainLabels is (samples, 2) once converted below.
trainData, trainSteps, trainLabels = makeData('data/B/Pos-train.txt',
                                              'data/B/Neg-train.txt')
print("In test data:")
testData, testSteps, testLabels = makeData('data/B/Pos-test.txt',
                                           'data/B/Neg-test.txt')
# Only the training labels become an array; testLabels stays a list of lists
# because the evaluation code calls list.index() on its rows.
trainLabels = np.array(trainLabels)

# The word2vec model is no longer needed once the data has been vectorised.
del model

print("-"*30)
print("The trainData's shape is:",trainData.shape)
print("The testData's shape is:",testData.shape)
print("The trainSteps's shape is:",trainSteps.shape)
print("The testSteps's shape is:",testSteps.shape)
print("The trainLabels's shape is:",trainLabels.shape)
print("The testLabels's shape is:",np.array(testLabels).shape)


num_nodes = 128    # number of LSTM units
batch_size = 16    # samples per training step
output_size = 2    # number of classes (positive / negative)

graph = tf.Graph()
with graph.as_default():
    # One training batch; the trailing dimensions must match trainData.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, MAX_SIZE, dimsh))
    # Number of valid (non-padding) word vectors of each sample in the batch.
    tf_train_steps = tf.placeholder(tf.int32, shape=(batch_size,))
    # One-hot labels of the batch.
    tf_train_labels = tf.placeholder(tf.float32,
                                     shape=(batch_size, output_size))

    # The whole test set is baked into the graph as constants.
    tf_test_dataset = tf.constant(testData, tf.float32)
    tf_test_steps = tf.constant(testSteps, tf.int32)

    # A single basic LSTM cell with num_nodes units. With state_is_tuple=True
    # the state is the pair (c, h) instead of one concatenated tensor.
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=num_nodes,
                                             state_is_tuple=True)

    # Weights and biases of the two dense layers applied to the final hidden
    # state; all are initialised from a truncated normal with stddev 0.1.
    w1 = tf.Variable(tf.truncated_normal([num_nodes, num_nodes // 2],
                                         stddev=0.1))
    b1 = tf.Variable(tf.truncated_normal([num_nodes // 2], stddev=0.1))
    # The output layer is sized by output_size so that it always agrees with
    # the label placeholder above.
    w2 = tf.Variable(tf.truncated_normal([num_nodes // 2, output_size],
                                         stddev=0.1))
    b2 = tf.Variable(tf.truncated_normal([output_size], stddev=0.1))

    def model(dataset, steps):
        """Map a batch of padded sequences to class logits.

        Args:
            dataset: float tensor of word vectors, one padded sequence per
                sample.
            steps: int tensor with the valid length of each sample, passed to
                dynamic_rnn as sequence_length.

        Returns:
            Tensor of logits with one row per sample and output_size columns.
        """
        # dynamic_rnn returns the per-step outputs and the final state.
        outputs, last_states = tf.nn.dynamic_rnn(cell=lstm_cell,
                                                 dtype=tf.float32,
                                                 sequence_length=steps,
                                                 inputs=dataset)
        # last_states is the (c, h) tuple; index 1 is the hidden state h,
        # index 0 would be the cell state c. Only h is used for
        # classification.
        hidden = last_states[1]
        # NOTE(review): there is no activation between the two dense layers,
        # so together they form a single affine map.
        hidden = tf.matmul(hidden, w1) + b1
        logits = tf.matmul(hidden, w2) + b2
        return logits

    train_logits = model(tf_train_dataset, tf_train_steps)
    # softmax_cross_entropy_with_logits normalises the logits with softmax and
    # computes the cross entropy against the labels for every sample;
    # reduce_mean without an axis then averages over the whole batch.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels,
                                                logits=train_logits))

    # Plain gradient descent with a learning rate of 0.01.
    optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
    # Class probabilities for the whole test set, one row per sample, e.g.
    #   [[0.6, 0.4],
    #    [0.8, 0.2],
    #    [0.4, 0.6]]
    test_prediction = tf.nn.softmax(model(tf_test_dataset, tf_test_steps))



# Report the running loss and the test accuracy every this many steps.
summary_frequency = 500


with tf.Session(graph = graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    mean_loss = 0
    # One optimisation step per iteration; the number of steps equals the
    # number of training samples plus one.
    for data_index in range(len(trainData)+1):
        # Deterministic sliding window over the training data: the start
        # advances by batch_size each step and wraps around via the modulo.
        # NOTE(review): the modulus len(trainLabels)-batch_size means the
        # largest start is one short of the last full window, and it is zero
        # when the training set holds exactly batch_size samples.
        offset = (data_index * batch_size) % (len(trainLabels)-batch_size)
        # feed_dict supplies the values for the placeholder tensors.
        feed_dict={tf_train_dataset:trainData[offset:offset + batch_size],
                   tf_train_labels:trainLabels[offset:offset + batch_size],
                   tf_train_steps:trainSteps[offset:offset + batch_size]}
        _, l = session.run([optimizer,loss],
                           feed_dict = feed_dict)
        mean_loss += l
        if data_index >0 and data_index % summary_frequency == 0:
            # NOTE(review): the first window accumulates
            # summary_frequency + 1 losses (steps 0..summary_frequency) but
            # is still divided by summary_frequency.
            mean_loss = mean_loss / summary_frequency
            print("The data_index is: %d"%(data_index))
            print("In train data,the loss is:%.4f"%(mean_loss))
            mean_loss = 0
            acrc = 0
            prediction = session.run(test_prediction)
            # testLabels rows are one-hot lists, so row.index(1) is the index
            # of the true class; prediction has one column per class.
            # A sample counts as correct when the probability assigned to its
            # true class exceeds 0.5. With more than two classes the true
            # class can win with a smaller probability (the original author
            # suggests a threshold such as 0.34 for three classes); comparing
            # against the arg-max would be the robust choice there.
            for probs, label in zip(prediction, testLabels):
                if probs[label.index(1)] > 0.5:
                    acrc = acrc + 1
            print("In test data,the accuracy is:%.2f%%"%((acrc/len(testLabels))*100))
#####################################
timeB=time.time()
# Print the total running time of the script in whole seconds.
print("time cost:",int(timeB-timeA))

训练过程如下：

In train data:
The positive data's length is : 9701
The negative data's length is : 9429
In test data:
The positive data's length is : 995
The negative data's length is : 999
------------------------------
The trainData's shape is: (19130, 25, 200)
The testData's shape is: (1994, 25, 200)
The trainSteps's shape is: (19130,)
The testSteps's shape is: (1994,)
The trainLabels's shape is: (19130, 2)
The testLabels's shape is: (1994, 2)
Initialized
The data_index is: 500
In train data,the loss is:0.6226
In test data,the accuracy is:69.56%
The data_index is: 1000
In train data,the loss is:0.5280
In test data,the accuracy is:74.17%
The data_index is: 1500
In train data,the loss is:0.4674
In test data,the accuracy is:77.68%
The data_index is: 2000
In train data,the loss is:0.4277
In test data,the accuracy is:80.59%
The data_index is: 2500
In train data,the loss is:0.3888
In test data,the accuracy is:82.15%
The data_index is: 3000
In train data,the loss is:0.3724
In test data,the accuracy is:83.05%
The data_index is: 3500
In train data,the loss is:0.3435
In test data,the accuracy is:83.75%
The data_index is: 4000
In train data,the loss is:0.3362
In test data,the accuracy is:86.06%
The data_index is: 4500
In train data,the loss is:0.3024
In test data,the accuracy is:87.36%
The data_index is: 5000
In train data,the loss is:0.2957
In test data,the accuracy is:88.11%
The data_index is: 5500
In train data,the loss is:0.2755
In test data,the accuracy is:89.27%
The data_index is: 6000
In train data,the loss is:0.2520
In test data,the accuracy is:89.82%
The data_index is: 6500
In train data,the loss is:0.2400
In test data,the accuracy is:88.67%
The data_index is: 7000
In train data,the loss is:0.2213
In test data,the accuracy is:91.02%
The data_index is: 7500
In train data,the loss is:0.2174
In test data,the accuracy is:92.43%
The data_index is: 8000
In train data,the loss is:0.1927
In test data,the accuracy is:92.68%
The data_index is: 8500
In train data,the loss is:0.1801
In test data,the accuracy is:93.28%
The data_index is: 9000
In train data,the loss is:0.1639
In test data,the accuracy is:92.73%
The data_index is: 9500
In train data,the loss is:0.1471
In test data,the accuracy is:94.68%
The data_index is: 10000
In train data,the loss is:0.1404
In test data,the accuracy is:93.83%
The data_index is: 10500
In train data,the loss is:0.1251
In test data,the accuracy is:94.13%
The data_index is: 11000
In train data,the loss is:0.1196
In test data,the accuracy is:94.38%
The data_index is: 11500
In train data,the loss is:0.0992
In test data,the accuracy is:94.73%
The data_index is: 12000
In train data,the loss is:0.0894
In test data,the accuracy is:94.73%
The data_index is: 12500
In train data,the loss is:0.0773
In test data,the accuracy is:94.68%
The data_index is: 13000
In train data,the loss is:0.0678
In test data,the accuracy is:95.49%
The data_index is: 13500
In train data,the loss is:0.0688
In test data,the accuracy is:96.24%
The data_index is: 14000
In train data,the loss is:0.0527
In test data,the accuracy is:94.78%
The data_index is: 14500
In train data,the loss is:0.0508
In test data,the accuracy is:95.79%
The data_index is: 15000
In train data,the loss is:0.0432
In test data,the accuracy is:95.54%
The data_index is: 15500
In train data,the loss is:0.0585
In test data,the accuracy is:94.73%
The data_index is: 16000
In train data,the loss is:0.0418
In test data,the accuracy is:95.79%
The data_index is: 16500
In train data,the loss is:0.0318
In test data,the accuracy is:95.44%
The data_index is: 17000
In train data,the loss is:0.0397
In test data,the accuracy is:96.04%
The data_index is: 17500
In train data,the loss is:0.0287
In test data,the accuracy is:95.74%
The data_index is: 18000
In train data,the loss is:0.0215
In test data,the accuracy is:95.14%
The data_index is: 18500
In train data,the loss is:0.0464
In test data,the accuracy is:95.09%
The data_index is: 19000
In train data,the loss is:0.0259
In test data,the accuracy is:95.59%
time cost: 662

RNN, LSTM, GRU 公式总结：https://blog.csdn.net/zhangxb35/article/details/70060295

Vanilla RNN
参考 RNN wiki 的描述，根据隐层 $h_t$ 接受的是上时刻的隐层（hidden layer）$h_{t-1}$ 还是上时刻的输出（output layer）$y_{t-1}$，分成了两种 RNN，定义如下：

Elman network 接受上时刻的隐层 $h_{t-1}$
Jordan network 接受上时刻的输出 $y_{t-1}$
但是看了很多的教程，感觉应用最多的还是 Elman network 的做法。比如 WILDML: RECURRENT NEURAL NETWORKS TUTORIAL 画出来的示意图：

还有 Andrej Karpathy 的博客 The Unreasonable Effectiveness of Recurrent Neural Networks 的实现，也是接收上一时刻隐层的结果，图就不贴了。

Bidirectional RNNs

双向的 RNN 是同时考虑“过去”和“未来”的信息，考虑上图，正常情况下，输入（黑色点）沿着黑色的实线箭头传输到隐层（黄色点），再沿着红色实线传到输出（红色点）。黑色实线做完前向传播后，在 Bidirectional RNNs 却先不急着后向传播，而是从末尾的时刻沿着虚线的方向再回传回来。最后把两个方向得到的激活值拼在一起（concatenate），当做最后的激活值。那么后向传播也是类似，要转一圈回来。

Stacked Bidirectional RNNs

堆多层的 recurrent layer，如上图所示，可以增加模型的参数，提高模型的学习能力。每层的 hidden state 不仅要输给下一时刻，还要当做此时刻下一层的输入。上图展示了双向的三层 RNNs，那么 hidden state 的维度是 hidden_dim * 6，输出的维度为 hidden_dim * 2，因为是两个方向最后一层 hidden state 拼接的结果。
原始的 RNN 很难训练，主要是因为存在梯度消失（gradient vanishing problem）和梯度爆炸问题（gradient explosion problem）。梯度消失导致无法抓住长时刻依赖，因此效果不好，后面的 LSTM 和 GRU 的新结构，就是为了对付这个问题。而梯度爆炸问题虽然不是每次都出现，但是一旦出现就很致命。一般会选择用截断的梯度（clipped gradient）来更新参数，或者直接把梯度 rescale 到一个固定模大小的范围。

LSTM
由于 Vanilla RNN 具有梯度消失问题，对长关系的依赖（Long-Term Dependencies）的建模能力不够强大。这句话是什么意思呢？就是说，原来的 RNN，由于结构上的限制，很长的时刻以前的输入，对现在的网络影响非常小，后向传播时那些梯度，也很难影响很早以前的输入，即会出现梯度消失的问题。而 LSTM 通过构建一些门（Gate），让网络能记住那些非常重要的信息，而这个核心的结构，就是 cell state。比如遗忘门，来选择性清空过去的记忆和更新较新的信息。

上面讲的比较迷糊，如果我有新的理解会更新这个博客。另外可以参考大神的博客 Understanding LSTM Networks，把 LSTM 讲的深入浅出，并且提到了很多的变种和展望。

有两种常见的 LSTM 结构，如 LSTM wiki 总结的，第一种是带遗忘门的 Traditional LSTM，公式如下：

前三行是三个门，分别是遗忘门 $f_t$，输入门 $i_t$，输出门 $o_t$，输入都是 $[x_t, h_{t-1}]$，只是参数不同，然后要经过一个激活函数，把值放缩到 $[0,1]$ 附近。第四行 $c_t$ 是 cell state，由上一时刻的 $c_{t-1}$ 和输入得到。如果遗忘门 $f_t$ 取 0 的话，那么上一时刻的状态就会全部被清空（清空 or 遗忘？），然后只关注此时刻的输入。输入门 $i_t$ 决定是否接收此时刻的输入。最后输出门 $o_t$ 决定是否输出 cell state。

注意这里的输出 $h_t$ 只是对应上面 RNN 的隐层，而非输出。这里的输出 $h_t$ 又会被当做是下一时刻的输入。

有时候第四个公式里的 $\sigma(W_c x_t + U_c h_{t-1} + b_c)$ 可以单独抽出来，写作 $\tilde{c}_t$，叫做 new memory content，那么第四个公式就可以写作是 $c_t = f_t \circ c_{t-1} + i_t \circ \tilde{c}_t$，这样一来 cell state 的更新来源就很明显了，一部分是上时刻的自己，一部分是新的 new memory content，而且两个来源是相互独立地由两个门控制的。遗忘门控制是否记住以前的那些特征，输入门决定是否接收当前的输入。后面可以看到 GRU 其实把这两个门合二为一了。

第二种是带遗忘门的 Peephole LSTM，公式如下，

和上面的公式做比较，发现只是把 $h_{t-1}$ 都换成了 $c_{t-1}$，即三个门的输入都改成了 $[x_t, c_{t-1}]$。因为是从 cell state 里取得信息，所以叫窥视孔（peephole）。

还有把两种结构结合起来的，可以用下图描述，

图里的连着门的那些虚线就是窥视孔。三个输入分别是 $[x_t, h_{t-1}, c_{t-1}]$。上图引自 Alex Graves 的论文 Supervised Sequence Labelling with Recurrent Neural Networks 中对 LSTM 的描述。注意该论文里的输出门和其他两个门稍稍不同，接受的是 $c_t$，而非 $c_{t-1}$，我没有找到这样做的解释。

GRU
GRU 这个结构是 2014 年才出现的，效果堪比 LSTM，但是用到的参数更少。见论文 Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling 和 An Empirical Exploration of Recurrent Network Architectures 对 LSTM 和 GRU 这两种结构的比较。

GRU 的结构和 LSTM 类似，但是精简一些，见下图

公式如下：

$$
\begin{aligned}
z_t &= \sigma(W_z x_t + U_z h_{t-1}) \\
r_t &= \sigma(W_r x_t + U_r h_{t-1}) \\
\tilde{h}_t &= \tanh\bigl(W x_t + U (r_t \circ h_{t-1})\bigr) \\
h_t &= (1 - z_t) \circ h_{t-1} + z_t \circ \tilde{h}_t
\end{aligned}
$$
这四行公式解释如下：

$z_t$ 是 update gate，更新 activation 时的逻辑门
$r_t$ 是 reset gate，决定 candidate activation 时，是否要放弃以前的 activation $h_{t-1}$
$\tilde{h}_t$ 是 candidate activation，接收 $[x_t, h_{t-1}]$
$h_t$ 是 activation，是 GRU 的隐层，接收 $[h_{t-1}, \tilde{h}_t]$
论文 [8] 和 [9] 详细对比了 LSTM 和 GRU 以及传统的 RNN 的异和同，探讨了这些结构的好处。从 LSTM 和 GRU 的公式里可以看出，都会有门操作，决定是否保留上时刻的状态，和是否接收此时刻的外部输入，LSTM 是用遗忘门（forget gate $f_t$）和输入门（input gate $i_t$）来做到的，GRU 则是只用了一个更新门（update gate $z_t$）。

这种设计有两个解释，一个解释是说，网络是能很容易地记住长依赖问题。即前面很久之前出现过一个重要的特征，如果遗忘门或者更新门选择不重写（overwritten）内部的 memory，那么网络就会一直记住之前的重要特征，那么会对当前或者未来继续产生影响。另一点是，这种设计可以为不同状态之间提供一条捷径（shortcut），那么梯度回传的时候就不会消失的太快，因此减缓了梯度消失带来的难训练问题。

LSTM 和 GRU 也有一些重要的不同点。首先就是 LSTM 有一个输出门来控制 memory content 的曝光程度（exposure），而 GRU 则是直接输出。另一点是要更新的 new memory content 的来源也不同。$\tilde{h}_t$ 会通过重置门（reset gate）控制从 $h_{t-1}$ 中得到信息的力度，而 $\tilde{c}_t$ 则没有，而是直接输入 $h_{t-1}$。

论文还用实验证明了相同个数参数的情况下，GRU 会比 LSTM 稍好一些。但是两种因为能抓住 Long-Term Dependencies，所以都比 Vanilla RNN 要好很多。

Reference:
1. LSTM wikipedia
2. WILDML RNN Tutorial
3. Pytorch Recurrent Layers
4. The Unreasonable Effectiveness of Recurrent Neural Networks
5. Understanding LSTM Networks
6. Supervised Sequence Labelling with Recurrent Neural Networks
7. A Critical Review of Recurrent Neural Networks for Sequence Learning
8. Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling
9. An Empirical Exploration of Recurrent Network Architectures

酸柠檬水

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
LSTM

https://www.zhihu.com/question/268862438RNN内部通过σ(Wx+b)将输入x转成输出h。这里的参数W每一步都是一样的，即参数共享，其维度为[embed_dim, hidden_size]。lstm 及变体https://www.cnblogs.com/wangduo/p/6773601.html?utm_source=itdadao&...
复制链接

扫一扫