# lstm_units: number of hidden units in the LSTM cell.
lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units=lstm_units)
# time_major=True: lstm_inputs is laid out as [max_time, batch_size, depth].
outputs, final_state = tf.nn.dynamic_rnn(
    cell=lstm_cell,
    inputs=lstm_inputs,
    dtype=tf.float32,
    time_major=True,
)

为了描述输出的形状,先介绍几个变量:batch_size是输入的这批数据的数量;max_time是这批数据中序列的最长长度,如果输入的是三个句子,那么max_time对应的就是最长句子的单词数量;cell.output_size其实就是rnn cell中神经元的个数。
outputs. outputs是一个tensor
如果*time_major==True,outputs形状为 [max_time行数, batch_size批次, cell.output_size神经元数量 ]*
inputs 输入同理,当time_major==True时,输入维度变为[max_time/timestep,batchsize,dim/n_inputs] (我的数据[2, 64, 1000])

# Batch-major view of X: [batch_size, max_time, n_inputs].
inputs = tf.reshape(X, [-1, max_time, n_inputs])
# Swap the first two axes to get the time-major layout [timestep, batch_size, dim].
lstm_inputs = tf.transpose(inputs, perm=[1, 0, 2])
# lstm_units: number of hidden units in the LSTM cell.
lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units=lstm_units)
outputs, final_state = tf.nn.dynamic_rnn(
    cell=lstm_cell,
    inputs=lstm_inputs,
    dtype=tf.float32,
    time_major=True,
)

如果*time_major==False(默认),outputs形状为 [ batch_size, max_time, cell.output_size ]*
input输入维度变为[batchsize,max_time/timestep ,dim] (我的数据[64, 2, 1000])

def RNN(X, weights, biases):
    """Run X through one LSTM layer and project the final hidden state.

    X is reshaped to [batch, max_time, n_inputs], fed to a BasicLSTMCell of
    size ``lstm_size`` via ``tf.nn.dynamic_rnn``, and the final hidden state
    is multiplied by ``weights`` and offset by ``biases``.

    Returns the unnormalized scores; no softmax is applied here.
    """
    # Batch-major layout: [batch, number of rows (max_time), values per row].
    batch_major = tf.reshape(X, [-1, max_time, n_inputs])
    # Basic LSTM cell definition.
    cell = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    # time_major is left at its default (False), matching the layout above.
    _, state = tf.nn.dynamic_rnn(cell, batch_major, dtype=tf.float32)
    # state[0] is the cell state, state[1] is the hidden state.
    hidden_state = state[1]
    # The softmax step was commented out in the original, so raw scores
    # (not probabilities) are returned.
    return tf.matmul(hidden_state, weights) + biases

state. state是一个tensor,state是最终的状态,也就是序列中最后一个cell输出的状态。一般情况下state的形状为 [batch_size, cell.output_size/lstm_size ],但当输入的cell为BasicLSTMCell时,state的形状为 **[2, batch_size, cell.output_size]**,其中2对应着LSTM中的cell state和hidden state。而outputs的形状为 [batch_size, max_time, cell.output_size](time_major==False时),主要是对输出数据特征的表述。
example中 batch_size=4,cell.output_size =lstm_size(5),state的形状为【2, 4,5】,


inputs的格式为[-1,max_time,n_inputs ]时,max_time的大小即为step的个数,图中网络的max_time就是2.



import tensorflow as tf
batch_size = 4
# Time-major random input: [steps=3, batch_size, depth=6].
input = tf.random_normal(shape=[3, batch_size, 6], dtype=tf.float32)
# n_hidden = 10 units in the LSTM cell.
cell = tf.nn.rnn_cell.BasicLSTMCell(10, forget_bias=1.0, state_is_tuple=True)
init_state = cell.zero_state(batch_size, dtype=tf.float32)
output, final_state = tf.nn.dynamic_rnn(cell, input, initial_state=init_state, time_major=True)
# With time_major=False the second input dimension is the step axis:
# [batch_size, steps, depth].
# With time_major=True the output is [steps, batch_size, depth]; otherwise it
# is [batch_size, max_time, depth], i.e. the same layout as the input.
# final_state is the final state of the whole LSTM and holds both c and h
# (conceptually [2, batch_size, n_hidden]); c and h are each
# [batch_size, n_hidden], with final_state[0] = cell state and
# final_state[1] = hidden state.
with tf.Session() as sess:
    # The LSTM cell creates kernel/bias variables that must be initialized
    # before the graph can be evaluated.
    sess.run(tf.global_variables_initializer())
    # Evaluate both tensors in a single run so they come from the same
    # random input sample.
    print(sess.run([output, final_state]))


[array([[[ 0.11201711,  0.05266698,  0.12750182,  0.03627545,
          0.02706259, -0.11562401,  0.08477378,  0.06157489,
          0.07341921,  0.15011263],
        [-0.09552816, -0.17314027, -0.0895469 , -0.26399866,
         -0.36308575,  0.10537394, -0.09443868, -0.31130335,
          0.0132737 , -0.12810872],
        [-0.00719012,  0.04438379, -0.03804718, -0.06637346,
         -0.02082551,  0.132549  , -0.05982352,  0.11778613,
         -0.09206182,  0.02547247],
        [ 0.14723007,  0.05410767,  0.06571447,  0.06775881,
         -0.03286515,  0.31600857,  0.03567648,  0.10357846,
         -0.0679171 , -0.00785992]],

       [[ 0.06683166, -0.05883167,  0.10910213,  0.05030679,
          0.17738451,  0.00631482, -0.00457612, -0.03694798,
          0.17743434,  0.06658468],
        [-0.03385706, -0.20001511, -0.05247132, -0.14611273,
         -0.17433529,  0.14970839, -0.07725038, -0.32652032,
          0.09670977, -0.17828827],
        [ 0.03988864, -0.03118243, -0.09246919,  0.1831698 ,
         -0.01006366,  0.01672944,  0.01009638,  0.10943947,
         -0.00420897, -0.0054652 ],
        [ 0.16778645,  0.08699884,  0.12571299,  0.12276714,
          0.04879797,  0.10517071,  0.10341848,  0.15480027,
         -0.04619027,  0.11167715]],

       [[ 0.14293307, -0.10649989,  0.09144076, -0.03020415,
          0.18182378,  0.22111537, -0.02275194, -0.14586878,
          0.19310513, -0.02283864],
        [-0.0553881 , -0.16710383, -0.09584018, -0.06020959,
         -0.11862611,  0.05812657, -0.05461238, -0.21729217,
          0.08961426, -0.1420837 ],
        [ 0.03053934,  0.02213254, -0.11577073,  0.08933022,
         -0.08349261,  0.044699  ,  0.01332499,  0.14753158,
         -0.12446564,  0.00095996],
        [ 0.21244884,  0.11677884,  0.15352076,  0.04703464,
          0.07084017,  0.04610508,  0.09713535,  0.12495688,
          0.00218641,  0.17711937]]], dtype=float32), 
         LSTMStateTuple(c=array([[ 0.264239  , -0.16139928,  0.25842854, -0.05938458,  0.38918033,
         0.37621742, -0.06394874, -0.263255  ,  0.32704324, -0.04286532],
       [-0.11041687, -0.3316248 , -0.21551779, -0.12425021, -0.2452825 ,
         0.12507899, -0.11451716, -0.40844095,  0.20570038, -0.28551656],
       [ 0.0634905 ,  0.05425977, -0.19805768,  0.15730162, -0.14432296,
         0.09046975,  0.02406704,  0.34546444, -0.22364679,  0.00243504],
       [ 0.40725306,  0.25660557,  0.3873769 ,  0.11941462,  0.16212168,
         0.10613891,  0.1803763 ,  0.26139545,  0.00540481,  0.31761324]], dtype=float32),
      h=array([[ 0.14293307, -0.10649989,  0.09144076, -0.03020415,  0.18182378,
         0.22111537, -0.02275194, -0.14586878,  0.19310513, -0.02283864],
       [-0.0553881 , -0.16710383, -0.09584018, -0.06020959, -0.11862611,
         0.05812657, -0.05461238, -0.21729217,  0.08961426, -0.1420837 ],
       [ 0.03053934,  0.02213254, -0.11577073,  0.08933022, -0.08349261,
         0.044699  ,  0.01332499,  0.14753158, -0.12446564,  0.00095996],
       [ 0.21244884,  0.11677884,  0.15352076,  0.04703464,  0.07084017,
         0.04610508,  0.09713535,  0.12495688,  0.00218641,  0.17711937]], dtype=float32))]

state在多层rnn中,输出包含了所有层的前向和后向C和H,两者代表的都是每一层的最后一时刻的输出;前后向的H,对应的是output每一层的最后一个时刻值(output 只保存最后一层的输出,state 保存所有层的 H和C);
由输出可以看出来,这个output 包含了隐含层所有时刻的输出,如果加层的话,那么这个output 的每个时刻,就作为下一层每个时刻的输入;
LSTM 中 state的输出包含了C和H,两者代表的都是当前层的最后一时刻的输出,H和output的最后一个时刻值一样;
另一篇不错的参考文献outputs[:, -1, :],简洁参考文献3用tensorflow搭建RNN(LSTM)进行MNIST 手写数字辨识

The code you provided defines a named tuple `Hypothesis` with two fields, `value` and `score`. This is a convenient way to store and manipulate hypotheses in the context of sequence-to-sequence models. The `NMT` class is a PyTorch module that implements a simple neural machine translation model. It consists of a bidirectional LSTM encoder, a unidirectional LSTM decoder, and a global attention mechanism based on Luong et al. (2015). Here's a breakdown of the code:

```python
from collections import namedtuple
import torch
import torch.nn as nn
import torch.nn.functional as F

Hypothesis = namedtuple('Hypothesis', ['value', 'score'])

class NMT(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, emb_size, hidden_size):
        super(NMT, self).__init__()
        self.src_embed = nn.Embedding(src_vocab_size, emb_size)
        self.tgt_embed = nn.Embedding(tgt_vocab_size, emb_size)
        self.encoder = nn.LSTM(emb_size, hidden_size, bidirectional=True)
        self.decoder = nn.LSTMCell(emb_size + hidden_size, hidden_size)
        self.attention = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, tgt_vocab_size)
        self.hidden_size = hidden_size

    def forward(self, src, tgt):
        batch_size = src.size(0)
        src_len = src.size(1)
        tgt_len = tgt.size(1)

        # Encode the source sentence
        src_embedded = self.src_embed(src)
        encoder_outputs, (last_hidden, last_cell) = self.encoder(src_embedded)

        # Initialize the decoder states
        decoder_hidden = last_hidden.view(batch_size, self.hidden_size)
        decoder_cell = last_cell.view(batch_size, self.hidden_size)

        # Initialize the attention context vector
        context = torch.zeros(batch_size, self.hidden_size, device=src.device)

        # Initialize the output scores
        outputs = torch.zeros(batch_size, tgt_len, self.hidden_size, device=src.device)

        # Decode the target sentence
        for t in range(tgt_len):
            tgt_embedded = self.tgt_embed(tgt[:, t])
            decoder_input = torch.cat([tgt_embedded, context], dim=1)
            decoder_hidden, decoder_cell = self.decoder(decoder_input, (decoder_hidden, decoder_cell))
            attention_scores = self.attention(encoder_outputs)
            attention_weights = F.softmax(torch.bmm(attention_scores, decoder_hidden.unsqueeze(2)).squeeze(2), dim=1)
            context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs).squeeze(1)
            output = self.out(decoder_hidden)
            outputs[:, t] = output

        return outputs
```

The `__init__` method initializes the model parameters and layers. It takes four arguments:

- `src_vocab_size`: the size of the source vocabulary
- `tgt_vocab_size`: the size of the target vocabulary
- `emb_size`: the size of the word embeddings
- `hidden_size`: the size of the encoder and decoder hidden states

The model has four main components:

- `src_embed`: an embedding layer for the source sentence
- `tgt_embed`: an embedding layer for the target sentence
- `encoder`: a bidirectional LSTM encoder that encodes the source sentence
- `decoder`: a unidirectional LSTM decoder that generates the target sentence

The attention mechanism is implemented in the `forward` method. It takes two arguments:

- `src`: the source sentence tensor of shape `(batch_size, src_len)`
- `tgt`: the target sentence tensor of shape `(batch_size, tgt_len)`

The method first encodes the source sentence using the bidirectional LSTM encoder. The encoder outputs and final hidden and cell states are stored in `encoder_outputs`, `last_hidden`, and `last_cell`, respectively. The decoder is initialized with the final hidden and cell states of the encoder. At each time step, the decoder takes as input the embedded target word and the context vector, which is a weighted sum of the encoder outputs based on the attention scores. The decoder hidden and cell states are updated using the LSTMCell module. The attention scores are calculated by applying a linear transform to the encoder outputs and taking their dot product with the decoder hidden state, followed by a softmax activation. The attention weights are used to compute the context vector as a weighted sum of the encoder outputs.
Finally, the decoder hidden state is passed through a linear layer to produce the output scores for each target word in the sequence. The output scores are stored in the `outputs` tensor and returned by the method.
