Sequence Classification
Reference: 《21个项目玩转深度学习》 (21 Projects Playing with Deep Learning)
from __future__ import print_function
import tensorflow as tf
import random
import numpy as np
# ====================
# TOY DATA GENERATOR
# ====================
# This class generates sequence samples
class ToySequenceData(object):
    """ Generate sequences of data with dynamic length.
    This class generates samples for training:
    - Class 0: linear sequences (i.e. [0, 1, 2, 3, ...])
    - Class 1: completely random sequences (i.e. [1, 3, 10, 7, ...])
    NOTICE:
    We have to pad each sequence to reach 'max_seq_len' for TensorFlow
    consistency (we cannot feed a numpy array with inconsistent
    dimensions). The dynamic calculation will then be performed thanks to
    the 'seqlen' attribute that records every actual sequence length.
    max_seq_len is the maximum sequence length. Sequences shorter than this
    are zero-padded; when fed to the RNN, the sequence_length argument lets
    the computation run only over each sequence's true length.
    """
    def __init__(self, n_samples=1000, max_seq_len=20, min_seq_len=3,
                 max_value=1000):
        self.data = []
        self.labels = []
        self.seqlen = []
        for i in range(n_samples):
            # Random sequence length (named 'length' to avoid shadowing the
            # builtin len, which next() below relies on)
            length = random.randint(min_seq_len, max_seq_len)
            # Record the sequence length for TensorFlow's dynamic calculation
            self.seqlen.append(length)
            # Add a random or linear int sequence (50% prob)
            if random.random() < .5:
                # Generate a linear sequence
                rand_start = random.randint(0, max_value - length)
                s = [[float(i) / max_value] for i in
                     range(rand_start, rand_start + length)]
                # Pad sequence for dimension consistency
                s += [[0.] for i in range(max_seq_len - length)]
                self.data.append(s)
                self.labels.append([1., 0.])
            else:
                # Generate a random sequence
                s = [[float(random.randint(0, max_value)) / max_value]
                     for i in range(length)]
                # Pad sequence for dimension consistency
                s += [[0.] for i in range(max_seq_len - length)]
                self.data.append(s)
                self.labels.append([0., 1.])
        self.batch_id = 0
    def next(self, batch_size):
        """ Return a batch of data. When dataset end is reached, start over.
        """
        if self.batch_id == len(self.data):
            self.batch_id = 0
        batch_data = (self.data[self.batch_id:min(self.batch_id +
                      batch_size, len(self.data))])
        batch_labels = (self.labels[self.batch_id:min(self.batch_id +
                        batch_size, len(self.data))])
        batch_seqlen = (self.seqlen[self.batch_id:min(self.batch_id +
                        batch_size, len(self.data))])
        self.batch_id = min(self.batch_id + batch_size, len(self.data))
        return batch_data, batch_labels, batch_seqlen
# Test how to use the ToySequenceData class defined above
tmp = ToySequenceData()
# Generate samples
batch_data, batch_labels, batch_seqlen = tmp.next(32)
# batch_data holds the sequence data; it is a nested list of shape
# (batch_size, max_seq_len, 1)
print(np.array(batch_data).shape)  # (32, 20, 1)
# We called tmp.next(32) above, so there are 32 sequences in total.
# Print the first sequence:
print(batch_data[0])
# batch_labels holds the labels; it is a nested list of shape
# (batch_size, 2), where 2 is the number of classes
print(np.array(batch_labels).shape)  # (32, 2)
# Print the label of the first sequence
print(batch_labels[0])
# batch_seqlen is a list of length batch_size giving each sequence's true length
print(np.array(batch_seqlen).shape)  # (32,)
# Print the length of the first sequence
print(batch_seqlen[0])
Results:
(32, 20, 1)
[[0.841], [0.096], [0.913], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]]
(32, 2)
[0.0, 1.0]
(32,)
3
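As a quick sanity check (an illustrative addition, not in the book's code), every entry of a sequence past its true length should be the 0.0 padding value:

first_len = batch_seqlen[0]
padded_tail = np.array(batch_data[0])[first_len:]
print(np.all(padded_tail == 0.0))  # True: everything after seqlen is padding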
--------------------------------------------------
Defining the RNN classification model in TensorFlow
# ==========
# MODEL
# ==========
# Parameters
learning_rate = 0.01
training_steps = 10000
batch_size = 128
display_step = 200
# Network Parameters
seq_max_len = 20 # Sequence max length
n_hidden = 64 # hidden layer num of features (size of the hidden layer)
n_classes = 2 # linear sequence or not (number of classes)
trainset = ToySequenceData(n_samples=1000, max_seq_len=seq_max_len)
testset = ToySequenceData(n_samples=500, max_seq_len=seq_max_len)
# tf Graph input
# x is the input, y is the output (the labels)
x = tf.placeholder("float", [None, seq_max_len, 1])
y = tf.placeholder("float", [None, n_classes])
# A placeholder for indicating each sequence length
# It stores the actual length of every sequence in the input x
seqlen = tf.placeholder(tf.int32, [None])
# Define weights
weights = {
'out': tf.Variable(tf.random_normal([n_hidden, n_classes]))
}
biases = {
'out': tf.Variable(tf.random_normal([n_classes]))
}
# Define the RNN classification model
def dynamicRNN(x, seqlen, weights, biases):
    # Current data input shape: (batch_size, max_seq_len, 1).
    # With the older static API we would first have to unstack x into a list
    # of seq_max_len tensors of shape (batch_size, 1) and call static_rnn:
    #   x = tf.unstack(x, seq_max_len, 1)
    #   lstm_cell = tf.contrib.rnn.BasicLSTMCell(n_hidden)
    #   outputs, states = tf.contrib.rnn.static_rnn(lstm_cell, x, dtype=tf.float32,
    #                                               sequence_length=seqlen)
    # tf.nn.dynamic_rnn accepts the batched tensor directly.

    # Define an LSTM cell with TensorFlow
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden)

    # tf.nn.dynamic_rnn unrolls the cell along the time dimension.
    # Passing sequence_length=seqlen matters: it tells TensorFlow how many
    # steps to actually run for each sequence.
    outputs, states = tf.nn.dynamic_rnn(lstm_cell, x, dtype=tf.float32,
                                        sequence_length=seqlen)
    # 'outputs' has shape (batch_size, max_seq_len, n_hidden).

    # When performing dynamic calculation, we must retrieve the last
    # *relevant* output: if a sequence has length 10, we need its 10th
    # output, not one of the padded steps after it. TensorFlow does not
    # support this kind of advanced indexing directly, so we flatten
    # 'outputs' and gather the right rows by hand.
    batch_size = tf.shape(outputs)[0]
    # Flat index of the last valid step of each sample: sample i starts at
    # row i * seq_max_len, and its last valid step is seqlen[i] - 1.
    index = tf.range(0, batch_size) * seq_max_len + (seqlen - 1)
    # Indexing
    outputs = tf.gather(tf.reshape(outputs, [-1, n_hidden]), index)

    # Linear projection of the last relevant output onto the class logits
    return tf.matmul(outputs, weights['out']) + biases['out']
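The flatten-and-gather trick above predates more convenient indexing ops. As a hedged alternative sketch (not from the book), the same "last relevant output" can be selected with tf.gather_nd on TensorFlow 1.x:

def last_relevant_output(outputs, seqlen):
    # outputs: (batch_size, max_seq_len, n_hidden); seqlen: (batch_size,) int32
    batch_size = tf.shape(outputs)[0]
    # Pair each batch index with the index of its last valid time step
    indices = tf.stack([tf.range(batch_size), seqlen - 1], axis=1)
    # Gathers outputs[i, seqlen[i] - 1, :] for each sample i
    return tf.gather_nd(outputs, indices)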
# Define the loss and train.
# Once we have the logits, we can combine them with the labels y to define
# the loss directly and train on it.
# Note: pred here is the logits, not probabilities.
pred = dynamicRNN(x, seqlen, weights, biases)
# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)
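# Illustrative alternative (not in the book): Adam typically converges faster
# on this toy task than plain gradient descent.
# optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)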
# Evaluate model: classification accuracy
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()
# Start training
with tf.Session() as sess:
    # Run the initializer
    sess.run(init)
    for step in range(1, training_steps + 1):
        batch_x, batch_y, batch_seqlen = trainset.next(batch_size)
        # Run one optimization op (backprop); each run updates the parameters once
        sess.run(optimizer, feed_dict={x: batch_x, y: batch_y,
                                       seqlen: batch_seqlen})
        if step % display_step == 0 or step == 1:
            # Calculate accuracy & loss on this batch
            acc, loss = sess.run([accuracy, cost],
                                 feed_dict={x: batch_x, y: batch_y,
                                            seqlen: batch_seqlen})
            print("Step " + str(step) + ", Minibatch Loss= " + \
                  "{:.6f}".format(loss) + ", Training Accuracy= " + \
                  "{:.5f}".format(acc))
    print("Optimization Finished!")

    # Finally, calculate accuracy once on the test set
    test_data = testset.data
    test_label = testset.labels
    test_seqlen = testset.seqlen
    print("Testing Accuracy:", \
          sess.run(accuracy, feed_dict={x: test_data, y: test_label,
                                        seqlen: test_seqlen}))
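    # (Illustrative addition, not in the original code.) Inspect the raw
    # predictions for a few test sequences: argmax over the logits gives
    # the predicted class, which we can compare with the one-hot labels.
    logits = sess.run(pred, feed_dict={x: test_data[:3],
                                       seqlen: test_seqlen[:3]})
    print("Predicted classes:", np.argmax(logits, axis=1))
    print("True classes:", np.argmax(test_label[:3], axis=1))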
The model above implements the simplest form of sequence classification:
1. Each step of the input sequence is a single number.
2. The output has only two classes, one labeled [1, 0] and the other [0, 1], i.e. one-hot encoding.
Generalizing the model:
1. When there are more than two output classes, change n_classes; the shapes of the output variables weights and biases change accordingly, and the final output matches the number of classes. For example, with 3 classes the one-hot labels are [1, 0, 0], [0, 1, 0], [0, 0, 1].
2. If each step of the input sequence is a vector: above, batch_data and x have shape (batch_size, max_seq_len, 1); simply change the shape to (batch_size, max_seq_len, input_size), where each step of every sequence is a vector of length input_size. A sketch of both generalizations follows.
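A minimal sketch combining both generalizations (input_size = 8 and n_classes = 3 are illustrative values, not from the book; the data generator would have to be adapted to match):

# Hypothetical generalization: 3 classes, vector-valued steps of size input_size
input_size = 8   # illustrative; each time step is now a vector of this length
n_classes = 3    # one-hot labels become [1,0,0], [0,1,0], [0,0,1]

x = tf.placeholder("float", [None, seq_max_len, input_size])
y = tf.placeholder("float", [None, n_classes])
seqlen = tf.placeholder(tf.int32, [None])

weights = {'out': tf.Variable(tf.random_normal([n_hidden, n_classes]))}
biases = {'out': tf.Variable(tf.random_normal([n_classes]))}

# dynamicRNN itself needs no change: BasicLSTMCell infers the input size
# from x, and the output projection already uses n_hidden and n_classes.
pred = dynamicRNN(x, seqlen, weights, biases)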