序列分类--基于LSTM

序列分类

参考《21个项目玩转深度学习》

from __future__ import print_function

import tensorflow as tf
import random
import numpy as np

# ====================
#  TOY DATA GENERATOR
# ====================
#这个类用于产生序列样本
class ToySequenceData(object):
    """Generate zero-padded toy sequences of varying length.

    Two kinds of samples are produced, each with 50% probability:

    - Class 0, label ``[1., 0.]``: linear sequences (e.g. [0, 1, 2, 3, ...]).
    - Class 1, label ``[0., 1.]``: random sequences (e.g. [1, 3, 10, 7, ...]).

    Every value is divided by ``max_value`` and wrapped in a one-element
    list, so a sample is a list of ``max_seq_len`` single-feature steps.

    NOTICE:
    Sequences shorter than ``max_seq_len`` are padded with ``[0.]`` so that
    all samples share the same dimensions (an array with inconsistent
    dimensions cannot be fed to TensorFlow). The real length of every sample
    is recorded in ``seqlen`` so the RNN can be told how many steps of each
    sequence to process.

    Attributes:
        data: list of samples, each a list of ``max_seq_len`` ``[float]``.
        labels: list of one-hot labels, ``[1., 0.]`` or ``[0., 1.]``.
        seqlen: list of the unpadded length of each sample.
        batch_id: index of the first sample of the next batch.
    """

    def __init__(self, n_samples=1000, max_seq_len=20, min_seq_len=3,
                 max_value=1000):
        """Build ``n_samples`` random samples.

        Args:
            n_samples: number of sequences to generate.
            max_seq_len: largest sequence length; also the padded length.
            min_seq_len: smallest sequence length.
            max_value: upper bound of the raw integer values; also the
                divisor used to scale them.
        """
        self.data = []
        self.labels = []
        self.seqlen = []
        for _ in range(n_samples):
            # Random true length, remembered for the dynamic RNN computation.
            seq_len = random.randint(min_seq_len, max_seq_len)
            self.seqlen.append(seq_len)
            if random.random() < .5:
                # Linear sequence: consecutive integers from a random start.
                rand_start = random.randint(0, max_value - seq_len)
                seq = [[float(v) / max_value]
                       for v in range(rand_start, rand_start + seq_len)]
                label = [1., 0.]
            else:
                # Random sequence: independent integers in [0, max_value].
                seq = [[float(random.randint(0, max_value)) / max_value]
                       for _ in range(seq_len)]
                label = [0., 1.]
            # Pad up to max_seq_len; each pad step is its own list object.
            seq += [[0.] for _ in range(max_seq_len - seq_len)]
            self.data.append(seq)
            self.labels.append(label)
        self.batch_id = 0

    def next(self, batch_size):
        """Return the next batch as ``(data, labels, seqlen)``.

        The final batch of a pass may hold fewer than ``batch_size`` samples.
        Once the end of the dataset has been reached, the following call
        starts over from the first sample.
        """
        total = len(self.data)
        if self.batch_id == total:
            self.batch_id = 0
        start = self.batch_id
        end = min(start + batch_size, total)
        self.batch_id = end
        return (self.data[start:end],
                self.labels[start:end],
                self.seqlen[start:end])

# Try out the ToySequenceData class defined above.
tmp = ToySequenceData()

# Draw one batch of 32 samples.
batch_data, batch_labels, batch_seqlen = tmp.next(32)

# batch_data holds the sequences as a nested list of shape
# (batch_size, max_seq_len, 1).
print(np.shape(batch_data))  # (32, 20, 1)

# tmp.next(32) was called, so there are 32 sequences in total;
# show the first one.
print(batch_data[0])

# batch_labels holds the labels as a nested list of shape (batch_size, 2);
# the 2 stands for the two classes.
print(np.shape(batch_labels))  # (32, 2)

# Label of the first sequence.
print(batch_labels[0])

# batch_seqlen is a list of length batch_size with the real length of
# every sequence.
print(np.shape(batch_seqlen))  # (32,)

# Real length of the first sequence.
print(batch_seqlen[0])


results:
(32, 20, 1)
[[0.841], [0.096], [0.913], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]]
(32, 2)
[0.0, 1.0]
(32,)
3

----------------我是分割线-----------------------
在Tensorflow里定义RNN分类模型

# ==========
#   MODEL
# ==========

# Training hyper-parameters
learning_rate = 0.01
training_steps = 10000
batch_size = 128
display_step = 200

# Network hyper-parameters
seq_max_len = 20  # maximum (padded) sequence length
n_hidden = 64  # size of the LSTM hidden layer
n_classes = 2  # number of classes: linear sequence or not

# Toy datasets used for training and for the final evaluation
trainset = ToySequenceData(n_samples=1000, max_seq_len=seq_max_len)
testset = ToySequenceData(n_samples=500, max_seq_len=seq_max_len)

# Graph inputs: x is the padded input batch, y the one-hot target
x = tf.placeholder(tf.float32, [None, seq_max_len, 1])
y = tf.placeholder(tf.float32, [None, n_classes])
# Real (unpadded) length of every sequence fed through x
seqlen = tf.placeholder(tf.int32, [None])

# Parameters of the output layer applied to the selected LSTM output
weights = {
    'out': tf.Variable(tf.random_normal([n_hidden, n_classes]))
}
biases = {
    'out': tf.Variable(tf.random_normal([n_classes]))
}

#定义RNN分类模型
def dynamicRNN(x, seqlen, weights, biases):
    """Build the LSTM classifier and return its logits.

    Args:
        x: float input tensor of shape (batch_size, max_seq_len, n_input),
            zero-padded along the time axis.
        seqlen: int32 tensor of shape (batch_size,) holding the real length
            of every sequence in ``x``.
        weights: dict whose ``'out'`` entry is the output-layer matrix that
            the selected LSTM output is multiplied with.
        biases: dict whose ``'out'`` entry is added after the multiplication.

    Returns:
        Logits (not probabilities): the LSTM output at the last real step of
        every sequence, passed through the linear output layer.
    """
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden)

    # tf.nn.dynamic_rnn unrolls the time dimension; sequence_length tells
    # TensorFlow how many steps to run for each individual sequence.
    # outputs has shape (batch_size, max_seq_len, n_hidden).
    outputs, _ = tf.nn.dynamic_rnn(lstm_cell, x, dtype=tf.float32,
                                   sequence_length=seqlen)

    # For every sample we need the output at its last real step, e.g. the
    # 10th output for a sequence of length 10. Flatten the batch and time
    # axes and gather row i * n_steps + (seqlen[i] - 1) for sample i.
    # The time dimension is read from the tensor itself so the function does
    # not depend on the module-level padding length.
    out_shape = tf.shape(outputs)
    n_batch = out_shape[0]
    n_steps = out_shape[1]
    last_index = tf.range(0, n_batch) * n_steps + (seqlen - 1)
    last_outputs = tf.gather(tf.reshape(outputs, [-1, n_hidden]), last_index)

    # Linear output layer on top of the selected LSTM outputs.
    return tf.matmul(last_outputs, weights['out']) + biases['out']


# Define the loss and train.
# Once the logits are available, the loss can be defined directly from them
# and the labels y.

# Note: pred holds logits, not probabilities.
pred = dynamicRNN(x, seqlen, weights, biases)

# Loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)

# Classification accuracy
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Op that assigns every variable its initial value
init = tf.global_variables_initializer()

# Training
with tf.Session() as sess:
    sess.run(init)

    # Run exactly training_steps parameter updates. The previous bound
    # (step * batch_size < training_steps) stopped after 78 updates, which is
    # below display_step, so the progress report was never printed.
    for step in range(1, training_steps + 1):
        batch_x, batch_y, batch_seqlen = trainset.next(batch_size)
        feed = {x: batch_x, y: batch_y, seqlen: batch_seqlen}
        # Each run of the optimizer op performs one parameter update.
        sess.run(optimizer, feed_dict=feed)
        if step % display_step == 0:
            # Accuracy and loss on the current mini-batch, fetched in a
            # single run so both come from the same forward pass.
            acc, loss = sess.run([accuracy, cost], feed_dict=feed)
            print("Step " + str(step * batch_size) + ", Minibatch Loss= " + \
                  "{:.6f}".format(loss) + ", Training Accuracy= " + \
                  "{:.5f}".format(acc))
    print("Optimization Finished!")

    # Finally, compute the accuracy once on the whole test set.
    test_data = testset.data
    test_label = testset.labels
    test_seqlen = testset.seqlen
    print("Testing Accuracy:", \
          sess.run(accuracy, feed_dict={x: test_data, y: test_label,
                                        seqlen: test_seqlen}))

以上模型实现了一个最简单的序列分类:
1.输入序列的每一步只是一个数
2.输出只有两类,一类的标签为[1,0],另一类的标签为[0,1],即独热编码。

模型推广:
1.当输出不止两类时,可以修改n_classes,输出时使用的变量weights和biases的形状也会随之改变,最后的输出对应类别数。例如,当类别是3类时,标签的独热编码分别为[1,0,0],[0,1,0],[0,0,1]。

2.如果输入序列的每一步是一个向量,只需要将上面batch_data以及x的形状由(batch_size,max_seq_len,1)改为(batch_size,max_seq_len,input_size)就可以了,其中每个序列每一步的值是一个长度为input_size的向量。

  • 2
    点赞
  • 31
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值