RNN介绍
setup code
# 不显示python使用过程中的警告
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import os
def reset_graph(seed=42):
    """Replace the default TensorFlow graph and re-seed TensorFlow and NumPy.

    Args:
        seed: Value handed to both ``tf.set_random_seed`` and
            ``np.random.seed``.
    """
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)
# Smoke test: evaluate a constant to confirm that a session can be created.
# GPU-options variant, kept for reference:
# with tf.Session( config=tf.ConfigProto(gpu_options=gpu_options) ) as sess:
with tf.Session() as sess:
    one = tf.constant(1)
    print(sess.run(one))
1
简介
- RNN(recurrent neural networks)是用于预测未来的一些状态,可以用于股票预测、语义分析、句子生成、音乐生成等场景中。
- RNN中经常会遇到梯度爆炸与梯度弥散问题,在RNN中,提出的解决方法有:LSTM、GRU单元等。
递归神经元(Recurrent Neurons)
- 前向的网络是从输入到输出,而RNN除了有从输入到输出的通道之外,输出的值也会到达输入(反向通道),在下一个step时被使用
- 创建一层RN的方法:在每一个时间步 $t$,每个神经元接受输入向量 $x_{(t)}$ 以及上一个step的输出向量 $y_{(t-1)}$。这里的输入与输出都是向量,如果只有一个神经元节点,则输出是标量,这与CNN不同,结构如下:
- 每个RN对于 $x_{(t)}$ 与 $y_{(t-1)}$ 分别有一个权值向量,为 $w_x$ 与 $w_y$,一个RN的输出为
$$y_{(t)} = \phi\left(x_{(t)}^T w_x + y_{(t-1)}^T w_y + b\right)$$
也可以把一层中所有的RN写成矩阵的形式(每一行是一个RN)。
- 在这里需要注意的是:$y_{(t)}$ 与 $y_{(t-1)}$ 有关,以此类推,它与之前所有step的输出都有关
记忆单元(memory cells)
- 因为step为t时刻的输出与之前所有的输出都有关系,因此可以认为RN是有记忆的
- 前面的描述中,$t-1$ 时刻的输出直接作用于 $t$ 时刻,但是我们也可以将其利用函数进行处理之后再作用于 $t$ 时刻
输入与输出序列
- RNN可以对输入序列进行处理,得到输出序列
- 输入为序列,输出为序列:RNN可以用于预测股票等时间序列的问题
- 输入为序列,输出为一个向量:RNN可以用于情感分析,如视频的情感取向等
- 输入为向量,输出为序列:比如bottom-left network等
- 把sequence-to-vector network称为encoder,vector-to-sequence network称为decoder。如果在encoder后面加入decoder,则可以用于语言翻译等,encoder可以将句子序列转化为vector,decoder可以将vector转化为另一种语言表示的句子序列
# Hand-built RNN layer unrolled over two time steps (t = 0 and t = 1).
n_inputs = 3
n_neurons = 5

# One placeholder per time step, each holding a batch of input vectors.
X0 = tf.placeholder(tf.float32, [None, n_inputs])
X1 = tf.placeholder(tf.float32, [None, n_inputs])

# Wx weights the inputs and Wy weights the previous outputs; both steps
# share the same variables.
Wx = tf.Variable(
    tf.random_normal(shape=[n_inputs, n_neurons]), dtype=tf.float32
)
Wy = tf.Variable(
    tf.random_normal(shape=[n_neurons, n_neurons], dtype=tf.float32)
)
b = tf.Variable(tf.zeros([1, n_neurons], dtype=tf.float32))

# There is no previous output at t = 0, so only the input term is used.
Y0 = tf.tanh(tf.matmul(X0, Wx) + b)
Y1 = tf.tanh(tf.matmul(Y0, Wy) + tf.matmul(X1, Wx) + b)

init = tf.global_variables_initializer()

X0_batch = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 0, 1]])  # t = 0
X1_batch = np.array([[9, 8, 7], [0, 0, 0], [6, 5, 4], [3, 2, 1]])  # t = 1

with tf.Session() as sess:
    init.run()
    Y0_val, Y1_val = sess.run(
        [Y0, Y1], feed_dict={X0: X0_batch, X1: X1_batch}
    )

print(Y0_val)
print(Y1_val)
[[ 0.43158025 -0.9130973 0.2609397 -0.9974923 -0.9943459 ]
[ 0.9968195 -0.99999946 0.99997216 -0.99999976 -1. ]
[ 0.99998724 -1. 1. -1. -1. ]
[ 0.999971 -1. 1. 0.64810324 -0.99999994]]
[[ 0.9999944 -1. 1. -1. -1. ]
[-0.85875237 0.29986963 -0.99652356 0.96680504 -0.3667912 ]
[ 0.9989384 -1. 0.99999887 -0.9993215 -1. ]
[ 0.94780797 -0.9997977 0.99790573 0.8633509 -0.99999946]]
TF中RNN的使用
Static Unrolling Through Time
- static_rnn可以创建一个由链式单元组成的按时间轴展开的RNN网络,但是static_rnn对于t很大的网络的处理有很大限制,同时可能会有超过内存限制的问题
# The same two-step network, built with static_rnn instead of by hand.
reset_graph()

X0 = tf.placeholder(tf.float32, [None, n_inputs])
X1 = tf.placeholder(tf.float32, [None, n_inputs])

basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
# static_rnn takes one input tensor per time step and returns the per-step
# outputs plus the final state.
output_seqs, states = tf.contrib.rnn.static_rnn(
    basic_cell, [X0, X1], dtype=tf.float32
)
Y0, Y1 = output_seqs

init = tf.global_variables_initializer()

X0_batch = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 0, 1]])
X1_batch = np.array([[9, 8, 7], [0, 0, 0], [6, 5, 4], [3, 2, 1]])

with tf.Session() as sess:
    init.run()
    Y0_val, Y1_val = sess.run(
        [Y0, Y1], feed_dict={X0: X0_batch, X1: X1_batch}
    )
    print(Y0_val)
    print(Y1_val)

    # Write the graph to disk so it can be inspected in TensorBoard.
    # NOTE(review): `basic_rnn` is bound twice through chained assignment
    # and `merged` is never read here; kept as-is to preserve the names.
    merged = basic_rnn = tf.summary.merge_all()
    writer = basic_rnn = tf.summary.FileWriter(
        "./tf_logs/basic_rnn/", sess.graph
    )
    writer.close()
[[ 0.30741337 -0.32884312 -0.6542847 -0.9385059 0.52089024]
[ 0.9912275 -0.95425415 -0.7518078 -0.9995208 0.98202336]
[ 0.99992675 -0.99783254 -0.82473516 -0.99999636 0.99947786]
[ 0.9967709 -0.6875061 0.8419969 0.93039113 0.81206834]]
[[ 0.99998885 -0.9997606 -0.06679297 -0.9999803 0.99982214]
[-0.65249425 -0.5152086 -0.37968948 -0.5922594 -0.08968376]
[ 0.998624 -0.99715203 -0.03308632 -0.9991566 0.9932902 ]
[ 0.99681675 -0.9598194 0.39660627 -0.8307605 0.7967197 ]]
# static_rnn fed from a single [batch, step, input] placeholder.
n_steps = 2
n_inputs = 3
n_neurons = 5

reset_graph()

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
# Put the time axis first, then split it into one tensor per step.
X_seqs = tf.unstack(tf.transpose(X, perm=[1, 0, 2]))

basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
output_seqs, states = tf.contrib.rnn.static_rnn(
    basic_cell, X_seqs, dtype=tf.float32
)
# Stack the per-step outputs again and move the batch axis back in front.
outputs = tf.transpose(tf.stack(output_seqs), perm=[1, 0, 2])

init = tf.global_variables_initializer()

X_batch = np.array([
    # t = 0      t = 1
    [[0, 1, 2], [9, 8, 7]],  # instance 1
    [[3, 4, 5], [0, 0, 0]],  # instance 2
    [[6, 7, 8], [6, 5, 4]],  # instance 3
    [[9, 0, 1], [3, 2, 1]],  # instance 4
])

with tf.Session() as sess:
    init.run()
    outputs_val = outputs.eval(feed_dict={X: X_batch})

print(outputs_val)
[[[-0.4565232 -0.6806412 0.40938237 0.631045 -0.45732823]
[-0.94288003 -0.9998869 0.9405581 0.99999845 -0.99999976]]
[[-0.80015343 -0.99218273 0.78177965 0.9971032 -0.9964609 ]
[-0.637116 0.11300934 0.5798437 0.43105593 -0.6371699 ]]
[[-0.93605185 -0.99983793 0.9308867 0.9999814 -0.99998313]
[-0.9165386 -0.99456036 0.89605415 0.9998719 -0.9999751 ]]
[[ 0.99273676 -0.9981933 -0.5554365 0.99890316 -0.9953323 ]
[-0.02746333 -0.7319198 0.7827872 0.9525682 -0.9781772 ]]]
Dynamic Unrolling Through Time
- dynamic_rnn可以解决上面遇到的问题
- 在BP时,经常出现OOM的问题,我们可以设置 `swap_memory=True`,交换CPU与GPU内存,避免OOM的问题
# dynamic_rnn consumes the [batch, step, input] tensor directly, so no
# transpose / unstack / stack is needed.
n_steps = 2
n_inputs = 3
n_neurons = 5

reset_graph()

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32)

init = tf.global_variables_initializer()

X_batch = np.array([
    # t = 0      t = 1
    [[0, 1, 2], [9, 8, 7]],  # instance 1
    [[3, 4, 5], [0, 0, 0]],  # instance 2
    [[6, 7, 8], [6, 5, 4]],  # instance 3
    [[9, 0, 1], [3, 2, 1]],  # instance 4
])

with tf.Session() as sess:
    init.run()
    output_val = outputs.eval(feed_dict={X: X_batch})

print(output_val)
[[[ 0.8087223 -0.5231244 -0.6716494 -0.6976225 -0.54384494]
[ 0.9995454 0.9933981 -0.9999836 0.99919224 -0.9837949 ]]
[[ 0.9954711 -0.02155101 -0.9948289 0.17964771 -0.8317369 ]
[-0.06013342 0.4030144 0.02884478 -0.2943758 -0.8568158 ]]
[[ 0.9999026 0.4911105 -0.9999316 0.84138334 -0.944468 ]
[ 0.9940618 0.9581599 -0.99768937 0.98646176 -0.91752493]]
[[-0.8063292 0.93928134 -0.9730989 0.99996096 0.9743306 ]
[ 0.95047355 -0.51205146 -0.27763975 0.83108056 0.81631833]]]
变长输入序列的处理
# dynamic_rnn with per-instance sequence lengths.
n_steps = 2
n_inputs = 3
n_neurons = 5

reset_graph()

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
# One length per instance in the batch.
seq_length = tf.placeholder(tf.int32, [None])
outputs, states = tf.nn.dynamic_rnn(
    basic_cell, X, dtype=tf.float32, sequence_length=seq_length
)

init = tf.global_variables_initializer()

X_batch = np.array([
    # step 0     step 1
    [[0, 1, 2], [9, 8, 7]],  # instance 1
    [[3, 4, 5], [0, 0, 0]],  # instance 2 (padded with zero vectors)
    [[6, 7, 8], [6, 5, 4]],  # instance 3
    [[9, 0, 1], [3, 2, 1]],  # instance 4
])
# The second entry is 1: instance 2 only covers t = 0 (sequence length 1),
# while every other instance covers both t = 0 and t = 1.
seq_length_batch = np.array([2, 1, 2, 2])

with tf.Session() as sess:
    init.run()
    outputs_val, states_val = sess.run(
        [outputs, states],
        feed_dict={X: X_batch, seq_length: seq_length_batch},
    )

# Outputs for every instance and step; steps beyond an instance's own
# length are filled with zeros.
print(outputs_val)
# Final state per instance: the t = 0 output for instance 2, the t = 1
# output for all the others.
print(states_val)
[[[ 0.731557 0.3483572 0.50582004 -0.22882834 -0.4402272 ]
[-0.61832327 0.99999994 0.865848 0.97933763 0.99022454]]
[[ 0.84387034 0.9969754 0.84789455 0.43032196 0.09275493]
[ 0. 0. 0. 0. 0. ]]
[[ 0.9115923 0.9999905 0.9595445 0.8189222 0.5774024 ]
[-0.7754546 0.99987745 0.9784728 0.7317201 0.8887761 ]]
[[-0.99952275 0.9997785 0.5974465 0.99309695 0.9984741 ]
[-0.62119997 0.5400083 0.9693844 0.19893228 0.19448037]]]
[[-0.61832327 0.99999994 0.865848 0.97933763 0.99022454]
[ 0.84387034 0.9969754 0.84789455 0.43032196 0.09275493]
[-0.7754546 0.99987745 0.9784728 0.7317201 0.8887761 ]
[-0.62119997 0.5400083 0.9693844 0.19893228 0.19448037]]
可变长输出序列的处理
- 我们可以预先知道输入序列的长度,但是对于无法预知输出序列的长度,比如在翻译中,我们无法得知输出序列的长度,在这里定义输出为end-of-sequence token(EOS token),任何超过这个标记的输出都会被忽略
训练RNN
- 训练RNN最常用的方法就是将网络按照时间轴展开,然后使用BP算法进行训练(backpropagation through time, BPTT)。
用RNN做分类
# MNIST classification: each 28x28 image is read as 28 steps of 28 inputs.
reset_graph()

n_steps = 28
n_inputs = 28
n_neurons = 150
n_outputs = 10
learning_rate = 0.001

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.int32, [None])

basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32)

# Classify from the final state, i.e. the output after the last step.
logits = tf.layers.dense(states, n_outputs)
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=y, logits=logits
)
loss = tf.reduce_mean(xentropy)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()

from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets("./dataset/mnist")
X_test = mnist.test.images.reshape((-1, n_steps, n_inputs))
y_test = mnist.test.labels

n_epochs = 20
batch_size = 500

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(mnist.train.num_examples // batch_size):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            X_batch = X_batch.reshape((-1, n_steps, n_inputs))
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        # Train accuracy is measured on the last mini-batch of the epoch.
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
        print(epoch, "Train accuracy:", acc_train, "Test accuracy:", acc_test)
Extracting ./dataset/mnist/train-images-idx3-ubyte.gz
Extracting ./dataset/mnist/train-labels-idx1-ubyte.gz
Extracting ./dataset/mnist/t10k-images-idx3-ubyte.gz
Extracting ./dataset/mnist/t10k-labels-idx1-ubyte.gz
0 Train accuracy: 0.902 Test accuracy: 0.8982
1 Train accuracy: 0.95 Test accuracy: 0.9365
2 Train accuracy: 0.946 Test accuracy: 0.9445
3 Train accuracy: 0.976 Test accuracy: 0.9573
4 Train accuracy: 0.972 Test accuracy: 0.9611
5 Train accuracy: 0.958 Test accuracy: 0.9547
6 Train accuracy: 0.97 Test accuracy: 0.9656
7 Train accuracy: 0.966 Test accuracy: 0.9657
8 Train accuracy: 0.976 Test accuracy: 0.9712
9 Train accuracy: 0.976 Test accuracy: 0.9721
10 Train accuracy: 0.972 Test accuracy: 0.9733
11 Train accuracy: 0.984 Test accuracy: 0.9695
12 Train accuracy: 0.98 Test accuracy: 0.9739
13 Train accuracy: 0.978 Test accuracy: 0.9724
14 Train accuracy: 0.982 Test accuracy: 0.9725
15 Train accuracy: 0.988 Test accuracy: 0.9735
16 Train accuracy: 0.982 Test accuracy: 0.9733
17 Train accuracy: 0.986 Test accuracy: 0.9755
18 Train accuracy: 0.99 Test accuracy: 0.9754
19 Train accuracy: 0.986 Test accuracy: 0.9759
- 构建多层的RNN的效果可能会更好
# MNIST classification with a stack of three RNN layers.
reset_graph()

n_steps = 28
n_inputs = 28
n_neurons = 150
n_outputs = 10
learning_rate = 0.001
n_layers = 3  # depth of the stacked RNN

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.int32, [None])

layers = [
    tf.contrib.rnn.BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu)
    for layer in range(n_layers)
]
multi_layer_cell = tf.contrib.rnn.MultiRNNCell(layers)
outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)

# `states` is a tuple; join its entries along axis 1 so a single dense
# layer can read all of them.
states_concat = tf.concat(axis=1, values=states)
logits = tf.layers.dense(states_concat, n_outputs)
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=y, logits=logits
)
loss = tf.reduce_mean(xentropy)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()

from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets("./dataset/mnist")
X_test = mnist.test.images.reshape((-1, n_steps, n_inputs))
y_test = mnist.test.labels

n_epochs = 20
batch_size = 500

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(mnist.train.num_examples // batch_size):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            X_batch = X_batch.reshape((-1, n_steps, n_inputs))
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        # Train accuracy is measured on the last mini-batch of the epoch.
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
        print(epoch, "Train accuracy:", acc_train, "Test accuracy:", acc_test)
Extracting ./dataset/mnist/train-images-idx3-ubyte.gz
Extracting ./dataset/mnist/train-labels-idx1-ubyte.gz
Extracting ./dataset/mnist/t10k-images-idx3-ubyte.gz
Extracting ./dataset/mnist/t10k-labels-idx1-ubyte.gz
0 Train accuracy: 0.91 Test accuracy: 0.9134
1 Train accuracy: 0.964 Test accuracy: 0.9615
2 Train accuracy: 0.972 Test accuracy: 0.9619
3 Train accuracy: 0.99 Test accuracy: 0.9689
4 Train accuracy: 0.98 Test accuracy: 0.9744
5 Train accuracy: 0.978 Test accuracy: 0.9726
6 Train accuracy: 0.99 Test accuracy: 0.9765
7 Train accuracy: 0.982 Test accuracy: 0.9834
8 Train accuracy: 0.99 Test accuracy: 0.9812
9 Train accuracy: 0.988 Test accuracy: 0.9783
10 Train accuracy: 0.984 Test accuracy: 0.9827
11 Train accuracy: 0.988 Test accuracy: 0.9822
12 Train accuracy: 0.994 Test accuracy: 0.9828
13 Train accuracy: 0.992 Test accuracy: 0.9856
14 Train accuracy: 0.994 Test accuracy: 0.9786
15 Train accuracy: 0.998 Test accuracy: 0.9866
16 Train accuracy: 0.998 Test accuracy: 0.9825
17 Train accuracy: 0.992 Test accuracy: 0.9871
18 Train accuracy: 0.994 Test accuracy: 0.9857
19 Train accuracy: 0.996 Test accuracy: 0.9788
# For comparison: join the per-layer states along axis 0 instead of axis 1
# and display the resulting tensor.
states_concat = tf.concat(values=states, axis=0)
states_concat
<tf.Tensor 'concat_3:0' shape=(?, 150) dtype=float32>
预测时间序列
- RNN中,如果需要预测时间序列,可以选择一个时间周期,将某一时刻 $t$ 起一个周期内的数据视为一个输入序列,然后将下一时刻起一个周期内的数据设置为输出序列,从而构建了训练数据的输入输出序列
- 如果只设置输出为下一个时刻的值,则可以用于对下一时刻的状态进行预测。
- 如果需要预测时间序列问题,则输出为一个向量,如果此时也需要对某一时刻的值进行预测,则我们可以使用 `OutputProjectionWrapper` 对神经元进行包装,它不影响RN的状态,只是在输出上加入一个线性的全连接层,所有的全连接层实现权值共享。
# Data preparation for the synthetic time-series task.
t_min, t_max = 0, 30  # time range the training windows are drawn from
resolution = 0.1  # spacing between consecutive samples


def time_series(t):
    """Return the value of the synthetic signal at time(s) ``t``.

    Args:
        t: Scalar or NumPy array of time values.

    Returns:
        ``t * sin(t) / 3 + 2 * sin(5 * t)``, with the same shape as ``t``.
    """
    return t * np.sin(t) / 3 + 2 * np.sin(t * 5)


def next_batch(batch_size, n_steps):
    """Sample random windows of the signal as (input, target) sequences.

    Each window starts at a random time in
    ``[t_min, t_max - n_steps * resolution)`` and holds ``n_steps + 1``
    samples spaced ``resolution`` apart. The target sequence is the input
    sequence shifted one step into the future.

    Args:
        batch_size: Number of windows to draw.
        n_steps: Length of each returned sequence.

    Returns:
        A pair ``(X, y)`` of arrays, both shaped
        ``(batch_size, n_steps, 1)``.
    """
    # Offset by t_min so the windows really lie inside [t_min, t_max];
    # without it a non-zero t_min would be ignored.
    window_room = t_max - t_min - n_steps * resolution
    t0 = t_min + np.random.rand(batch_size, 1) * window_room
    Ts = t0 + np.arange(0., n_steps + 1) * resolution
    ys = time_series(Ts)
    return ys[:, :-1].reshape(-1, n_steps, 1), ys[:, 1:].reshape(-1, n_steps, 1)
# Sequence-to-sequence regression using OutputProjectionWrapper.
reset_graph()

n_steps = 20
n_inputs = 1
n_neurons = 100
n_outputs = 1

# Input and target are both sequences with one value per time step; the
# target is the input shifted one step ahead.
X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_steps, n_outputs])

# The wrapper projects the cell's n_neurons outputs down to n_outputs.
cell = tf.contrib.rnn.OutputProjectionWrapper(
    tf.contrib.rnn.BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu),
    output_size=n_outputs,
)
outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

learning_rate = 0.001

loss = tf.reduce_mean(tf.square(outputs - y))  # mean squared error
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

n_iter = 1500
batch_size = 500

with tf.Session() as sess:
    init.run()
    for iteration in range(n_iter):
        X_batch, y_batch = next_batch(batch_size, n_steps)
        sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        if iteration % 100 == 0:
            mse = loss.eval(feed_dict={X: X_batch, y: y_batch})
            print(iteration, ", MSE: ", mse)
    # Save the trained weights for the prediction cells.
    saver.save(sess, "./models/rnn/my_time_series_model")
0 , MSE: 22.603624
100 , MSE: 0.7702466
200 , MSE: 0.28123116
300 , MSE: 0.113177
400 , MSE: 0.0779924
500 , MSE: 0.056141626
600 , MSE: 0.053441137
700 , MSE: 0.04956764
800 , MSE: 0.05015872
900 , MSE: 0.053384278
1000 , MSE: 0.050169982
1100 , MSE: 0.047106195
1200 , MSE: 0.046533123
1300 , MSE: 0.04629165
1400 , MSE: 0.04463054
# Load the trained model and predict on one hand-picked window.
# `t` is the time axis reused by later cells for plotting and for the
# number of generated points.
t = np.linspace(t_min, t_max, int((t_max - t_min) / resolution))
n_steps = 20
# n_steps + 1 points spaced exactly `resolution` apart, matching the spacing
# used by next_batch during training. (The endpoint used to be
# 12.2 + resolution * (n_steps + 1), which stretched the spacing to 0.105.)
t_instance = np.linspace(12.2, 12.2 + resolution * n_steps, n_steps + 1)

with tf.Session() as sess:
    saver.restore(sess, "./models/rnn/my_time_series_model")
    # Drop the last point: the model input is the first n_steps samples and
    # its predictions are compared against the window shifted by one step.
    X_new = time_series(
        np.array(t_instance[:-1].reshape(-1, n_steps, n_inputs))
    )
    y_pred = sess.run(outputs, feed_dict={X: X_new})

plt.plot(t_instance[:-1], time_series(t_instance[:-1]), 'ro-', label="real lines")
plt.plot(t_instance[1:], y_pred.flatten(), 'b*--', label="predcited lines")
plt.legend()
plt.show()
INFO:tensorflow:Restoring parameters from ./models/rnn/my_time_series_model
- 上面使用了 `OutputProjectionWrapper`,用来将RNN的输出序列的维度降低为1(在每个step),我们可以采用另外一种更加高效的方法:之前的RNN输出的shape都是 $[batch\_size, n\_steps, n\_neurons]$,我们可以将其reshape为 $[batch\_size * n\_steps, n\_neurons]$,然后用全连接层处理得到输出,其shape为 $[batch\_size * n\_steps, n\_outputs]$,然后再将其reshape为 $[batch\_size, n\_steps, n\_outputs]$
# Same regression task, with the output projection done by a single dense
# layer applied to the flattened RNN outputs.
reset_graph()

n_steps = 20
n_inputs = 1
n_neurons = 100
n_outputs = 1
learning_rate = 0.001

# Input and target are both sequences with one value per time step; the
# target is the input shifted one step ahead.
X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_steps, n_outputs])

cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu)
rnn_outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

# Flatten batch and time into one axis, apply the dense layer once, then
# restore the [batch, step, output] layout.
stacked_rnn_outputs = tf.reshape(rnn_outputs, [-1, n_neurons])
stacked_outputs = tf.layers.dense(stacked_rnn_outputs, n_outputs)
outputs = tf.reshape(stacked_outputs, [-1, n_steps, n_outputs])

loss = tf.reduce_mean(tf.square(outputs - y))  # mean squared error
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

n_iter = 1500
batch_size = 500

with tf.Session() as sess:
    init.run()
    for iteration in range(n_iter):
        X_batch, y_batch = next_batch(batch_size, n_steps)
        sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        if iteration % 100 == 0:
            mse = loss.eval(feed_dict={X: X_batch, y: y_batch})
            print(iteration, ", MSE: ", mse)
    # Overwrites the checkpoint written by the previous training cell.
    saver.save(sess, "./models/rnn/my_time_series_model")
0 , MSE: 11.881364
100 , MSE: 0.36172634
200 , MSE: 0.107290134
300 , MSE: 0.06558608
400 , MSE: 0.05752879
500 , MSE: 0.054240808
600 , MSE: 0.05000111
700 , MSE: 0.04845157
800 , MSE: 0.0483876
900 , MSE: 0.049780883
1000 , MSE: 0.0478195
1100 , MSE: 0.04629565
1200 , MSE: 0.04530079
1300 , MSE: 0.045294493
1400 , MSE: 0.044130925
- 采用之前的方法,我们可以根据一个序列的输出,对输出序列进行预测,以此类推,得到未来任意时间的输出
- 关于预测序列的初始化,我们可以赋初始值0,也可以从之前的训练数据的最后一段中截取一些出来作为预测的序列
# Generate new sequences by repeatedly feeding the model its own last
# n_steps values and appending the newest prediction.
with tf.Session() as sess:  # not shown in the book
    saver.restore(sess, "./models/rnn/my_time_series_model")  # not shown

    # Seed 1: all zeros.
    sequence1 = [0.] * n_steps
    for iteration in range(len(t) - n_steps):
        X_batch = np.array(sequence1[-n_steps:]).reshape(1, n_steps, 1)
        y_pred = sess.run(outputs, feed_dict={X: X_batch})
        sequence1.append(y_pred[0, -1, 0])

    # Seed 2: values taken from time_series itself.
    # NOTE(review): `t_max-t_min/3` divides only t_min by 3; confirm whether
    # `(t_max - t_min) / 3` was intended.
    sequence2 = [
        time_series(i * resolution + t_min + (t_max-t_min/3))
        for i in range(n_steps)
    ]
    for iteration in range(len(t) - n_steps):
        X_batch = np.array(sequence2[-n_steps:]).reshape(1, n_steps, 1)
        y_pred = sess.run(outputs, feed_dict={X: X_batch})
        sequence2.append(y_pred[0, -1, 0])

# The thick segment at the start of each curve marks the seed values.
plt.figure(figsize=(11, 4))

plt.subplot(121)
plt.plot(t, sequence1, "b-")
plt.plot(t[:n_steps], sequence1[:n_steps], "b-", linewidth=3)
plt.xlabel("Time")
plt.ylabel("Value")

plt.subplot(122)
plt.plot(t, sequence2, "b-")
plt.plot(t[:n_steps], sequence2[:n_steps], "b-", linewidth=3)
plt.xlabel("Time")

plt.show()
INFO:tensorflow:Restoring parameters from ./models/rnn/my_time_series_model
Deep RNN
- 之前提到的多层RNN十分常见,如果RNN的层数很多,则可以组成Deep RNN
# Three stacked BasicRNNCells run through dynamic_rnn.
reset_graph()

n_inputs = 2
n_steps = 5
X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])

n_neurons = 100
n_layers = 3

layers = [
    tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
    for layer in range(n_layers)
]
multi_layer_cell = tf.contrib.rnn.MultiRNNCell(layers)
outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)

init = tf.global_variables_initializer()

X_batch = np.random.rand(2, n_steps, n_inputs)

with tf.Session() as sess:
    init.run()
    outputs_val, states_val = sess.run(
        [outputs, states], feed_dict={X: X_batch}
    )

print(outputs_val.shape)  # [batch_size, n_steps, n_neurons]
print(len(states_val))  # the state is a tuple with one entry per layer
(2, 5, 100)
3
在多个GPU上运行RNN
- BasicRNNCell是一个factory,在创建时,不会创建cell,因此指定device也不会起到作用
- 如果不加额外的处理,在创建多层RNN时,所有的RNN都只能在一个device中被创建
# The device requested here has no effect on where the layer ends up
# (see the surrounding notes: nothing is built when the cell is created).
with tf.device("/gpu:0"):
    layer1 = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
- 创建一个 `DeviceCellWrapper` 的类进行处理,可以实现在多个device中创建multilayer RNN中的不同layer
- 也可以直接使用 `tf.nn.rnn_cell.DeviceWrapper` 类
class DeviceCellWrapper(tf.contrib.rnn.RNNCell):
    """RNN cell wrapper that calls the wrapped cell inside a ``tf.device`` scope.

    Args:
        device: Device string passed to ``tf.device`` on every call.
        cell: The RNN cell to wrap; its sizes are exposed unchanged.
    """

    def __init__(self, device, cell):
        self._cell = cell
        self._device = device

    @property
    def state_size(self):
        """State size reported by the wrapped cell."""
        wrapped = self._cell
        return wrapped.state_size

    @property
    def output_size(self):
        """Output size reported by the wrapped cell."""
        wrapped = self._cell
        return wrapped.output_size

    def __call__(self, inputs, state, scope=None):
        """Run the wrapped cell for one step under the configured device."""
        wrapped = self._cell
        with tf.device(self._device):
            return wrapped(inputs, state, scope)
# Multi-layer RNN with each layer wrapped for a different device.
reset_graph()

n_inputs = 5
n_steps = 20
n_neurons = 100

X = tf.placeholder(tf.float32, shape=[None, n_steps, n_inputs])

# One layer per listed device.
devices = ["/cpu:0", "/gpu:0", "/gpu:1"]
cells = [
    DeviceCellWrapper(dev, tf.contrib.rnn.BasicRNNCell(num_units=n_neurons))
    for dev in devices
]
multi_layer_cell = tf.contrib.rnn.MultiRNNCell(cells)
outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    init.run()
    random_inputs = np.random.rand(2, n_steps, n_inputs)
    print(sess.run(outputs, feed_dict={X: random_inputs}))
[[[ 0.06828328 -0.11375453 0.06424566 ... -0.24244206 -0.04821675
-0.12077259]
[ 0.07453808 -0.22510499 0.20471567 ... -0.14811225 -0.09225387
-0.04429062]
[ 0.13780874 -0.14680627 -0.00956541 ... -0.08136036 0.07381526
-0.03125764]
...
[-0.2540025 -0.3207857 0.3992359 ... -0.26669195 0.33505762
-0.03757678]
[ 0.22596699 -0.09880796 -0.274223 ... -0.13386028 -0.25443038
-0.36498213]
[ 0.16559371 -0.3343584 0.34313312 ... -0.36904442 0.06908777
0.4657412 ]]
[[ 0.00489879 -0.03151968 0.02628037 ... -0.19341365 -0.07303753
0.00451886]
[ 0.03073939 -0.05795513 0.1778592 ... -0.20945792 0.0520001
-0.07436947]
[ 0.00192375 -0.25690767 0.12488239 ... 0.02644877 -0.2504646
-0.12239385]
...
[-0.13501374 -0.06209685 0.15950367 ... -0.20012137 -0.3338359
-0.09281676]
[-0.44347283 0.2032329 0.12526968 ... -0.0796242 0.27046493
0.31883997]
[ 0.23965771 -0.22903351 0.07749572 ... -0.02653921 0.08402385
0.02313656]]]
dropout
- 为了防止RNN训练中发生过拟合,可以使用 `DropoutWrapper` 包装cell,用dropout防止过拟合
# Three-layer RNN with dropout applied to the inputs of every layer.
reset_graph()

n_inputs = 1
n_neurons = 100
n_layers = 3
n_steps = 20
n_outputs = 1

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.float32, [None, n_steps, n_outputs])

# Defaults to 1.0 (no dropout); the training loop feeds a smaller value,
# while prediction simply leaves it unset.
keep_prob = tf.placeholder_with_default(1.0, shape=())

cells = [
    tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
    for layer in range(n_layers)
]
cells_drop = [
    tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=keep_prob)
    for cell in cells
]
multi_layer_cell = tf.contrib.rnn.MultiRNNCell(cells_drop)
rnn_outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)

learning_rate = 0.01

# Flatten batch and time, project with one dense layer, then restore the
# [batch, step, output] layout.
stacked_rnn_outputs = tf.reshape(rnn_outputs, [-1, n_neurons])
stacked_outputs = tf.layers.dense(stacked_rnn_outputs, n_outputs)
outputs = tf.reshape(stacked_outputs, [-1, n_steps, n_outputs])

loss = tf.reduce_mean(tf.square(outputs - y))  # mean squared error
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

n_iterations = 1500
batch_size = 500
train_keep_prob = 0.5

with tf.Session() as sess:
    init.run()
    for iteration in range(n_iterations):
        X_batch, y_batch = next_batch(batch_size, n_steps)
        train_feed = {X: X_batch, y: y_batch, keep_prob: train_keep_prob}
        _, mse = sess.run([training_op, loss], feed_dict=train_feed)
        if iteration % 100 == 0:  # not shown in the book
            print(iteration, "Training MSE:", mse)  # not shown
    saver.save(sess, "./models/rnn/my_ts_dropout_model")
0 Training MSE: 17.523228
100 Training MSE: 3.6864297
200 Training MSE: 2.5945766
300 Training MSE: 2.608631
400 Training MSE: 2.354856
500 Training MSE: 2.166124
600 Training MSE: 2.0215302
700 Training MSE: 1.842349
800 Training MSE: 2.4512978
900 Training MSE: 1.9394426
1000 Training MSE: 2.3554778
1100 Training MSE: 2.031637
1200 Training MSE: 1.9235188
1300 Training MSE: 1.8453777
1400 Training MSE: 1.9560652
# Restore the dropout model from disk and predict on the test window.
# keep_prob is not fed, so it takes its default of 1.0 (no dropout).
with tf.Session() as sess:
    saver.restore(sess, "./models/rnn/my_ts_dropout_model")
    X_new = time_series(
        np.array(t_instance[:-1].reshape(-1, n_steps, n_inputs))
    )
    y_pred = sess.run(outputs, feed_dict={X: X_new})

input_times = t_instance[:-1]
target_times = t_instance[1:]

plt.title("Testing the model", fontsize=14)
plt.plot(input_times, time_series(input_times), "bo", markersize=10, label="instance")
plt.plot(target_times, time_series(target_times), "y*", markersize=10, label="target")
plt.plot(target_times, y_pred[0, :, 0], "r.", markersize=10, label="prediction")
plt.legend(loc="upper left")
plt.xlabel("Time")
plt.show()
INFO:tensorflow:Restoring parameters from ./models/rnn/my_ts_dropout_model
训练很多个step时容易出现的一些问题
- 如果需要对很大的time step进行训练,这与DNN类似,都会产生梯度弥散或者梯度爆炸的问题,之前在DNN中使用的方法,如RELU、BN、梯度裁剪、Dropout等方法都可以解决这个问题,但是如果time step很长,训练速度也会十分缓慢。
- 如果训练速度很慢,可以减少训练过程中的time step个数,称为 `truncated backpropagation through time`,但是这会使得RNN无法学习到long-term的特征
- 针对上述方案中RNN无法学习长时特征的缺点,可以采取一个折衷方案:我们可以选取过去很长时间内的粗粒度时间,再选取最近的细粒度时间,这样模型可以学习到长时间范围内的粗粒度信息与短时间范围内的细粒度信息,但是无法获得长时间范围内的细粒度信息
- 采用很大的step进行训练,除了训练时间很长,另外一个问题是较远step的信息会逐渐丢失,即RNN无法解决长时依赖问题(http://blog.csdn.net/heyongluoyao8/article/details/48636251),我们对于这种问题提出了一些解决方案,如LSTM与GRU。
LSTM(Long Short-Term Memory)单元
- LSTM很早就被提出,可以将LSTM cell视为一个基本的单元,它相对于之前的RNN,性能会更好,能够解决模型对于长时依赖的问题
- 对于一个最基本的LSTM cell,它含有4个神经网络层,其结构如下所示
更多解释的参考链接:https://yunaitong.cn/understanding-lstm-networks.html
- LSTM的核心概念为cell state,cell state会穿过所有的cell,同时与每一个cell有较小的交互,cell中的信息可以通过判断gate的状态确定是否需要被传送到cell state中
- LSTM可以通过gate(门)对cell state添加或者删除信息,gate是由一个sigmoid网络层与一个点乘操作组成,sigmoid输出为0~1,0表示不允许信息通过,1表示允许信息通过,1个LSTM cell有三个这样的gate,用于控制cell state。
LSTM cell组成详解
- 1个LSTM cell包含2个状态 $h_{(t)}$ 与 $c_{(t)}$($c$ 代表cell),可以将 $h_{(t)}$ 视为短时状态,$c_{(t)}$ 视为长时状态。$c_{(t-1)}$ 穿过1个LSTM cell,它首先会经过一个forget gate,丢失部分信息,然后通过加法操作,添加一些由输入门(input gate)选择后的信息,处理之后,$c_{(t)}$ 不经过任何额外的处理,便直接输出,去往下一个step;同时 $c_{(t)}$ 也会经过tanh函数处理,然后与输出门(output gate)进行点乘操作之后,得到 $h_{(t)}$,这也就是这个time step中这个LSTM cell的输出 $y_{(t)}$。
- 当前时刻的输入 $x_{(t)}$ 与上一时刻的短时状态 $h_{(t-1)}$ 作用于4个不同的全连接层,主要有以下几个目的
- 最主要的一层是输出 $g_{(t)}$ 的层,它的激活函数为tanh。在一般的cell中,它会直接输出到 $y_{(t)}$ 与 $h_{(t)}$;在LSTM cell中,它会将部分信息储存在长时状态(long-term state)中。
- 另外三层都是门控单元(gate controllers),因为它们需要确定信息是否能够通过,因此激活函数都是sigmoid函数,如果网络的输出为0,则关闭这个gate,为1则打开这个gate
- forget gate 控制长时状态的哪些部分被删除(由 $f_{(t)}$ 控制)
- input gate 控制 $g_{(t)}$ 中的哪一部分被添加到长时状态中(由 $i_{(t)}$ 控制)
- output gate 控制长时状态中的哪个部分被读取,同时在这一时刻输出至 $h_{(t)}$ 与 $y_{(t)}$,它由 $o_{(t)}$ 控制
- 综合以上特征,LSTM可以学习到重要的输入,将其储存在长时状态 $c_{(t)}$ 中,同时只要forget gate选择不丢弃这个信息,它可以一直被存储。因此LSTM可以解决长时记忆的问题。
- LSTM cell中涉及到的一些计算
$$
\begin{aligned}
i_t &= \sigma\left(W_{xi}^T x_t + W_{hi}^T h_{t-1} + b_i\right) \\
f_t &= \sigma\left(W_{xf}^T x_t + W_{hf}^T h_{t-1} + b_f\right) \\
o_t &= \sigma\left(W_{xo}^T x_t + W_{ho}^T h_{t-1} + b_o\right) \\
g_t &= \tanh\left(W_{xg}^T x_t + W_{hg}^T h_{t-1} + b_g\right) \\
c_t &= f_t \otimes c_{t-1} + i_t \otimes g_t \\
y_t &= h_t = o_t \otimes \tanh(c_t)
\end{aligned}
$$
一个变形:窥视孔连接(Peephole Connections)
- 在一个基本的LSTM cell中,gate controllers只能观察到 $x_t$ 与 $h_{t-1}$ 的信息,更好的方法是让这些gate controllers也可以观察到长时状态 $c_{t-1}$ 的信息,这种LSTM的变种被称为Peephole Connections。在TF中,只需设置参数 `use_peepholes=True` 即可。
另一种变形:GRU cell(Gated Recurrent Unit)
GRU cell的结构如下
GRU cell是对LSTM cell的简化,同时也能取得很好的效果,GRU cell主要在以下几个方面进行了简化
- 两个状态向量 $h_t$ 与 $c_t$ 被合并为一个向量 $h_t$。
- 用1个gate controller去控制forget gate与input gate,如果gate controller输出为1,则input gate打开,forget gate关闭;如果gate controller输出为0,则input gate关闭,forget gate打开。即,当信息需要被存储时,它所在的存储位置的信息会被删除
- 没有output gate,同时引入了一个新的gate controller,用于决定之前状态的哪些部分会被用于main layer(之前LSTM cell中提到的,里面包含需要输出的信息)中
- GRU cell中的一些计算
$$
\begin{aligned}
z_t &= \sigma\left(W_{xz}^T x_t + W_{hz}^T h_{t-1}\right) \\
r_t &= \sigma\left(W_{xr}^T x_t + W_{hr}^T h_{t-1}\right) \\
g_t &= \tanh\left(W_{xg}^T x_t + W_{hg}^T (r_t \otimes h_{t-1})\right) \\
h_t &= (1 - z_t) \otimes h_{t-1} + z_t \otimes g_t
\end{aligned}
$$
- TF中直接使用 `GRUCell` 就可以创建GRU cell
自然语言处理(NLP)
- 现在绝大部分的NLP问题,如机器翻译、总结、情感分析等,都是基于RNN
Word Embeddings(词嵌入)
- 首先我们需要选择一个词汇表征的方法(word representation),一个比较容易想到的方法就是将每个word都作为一个one-hot vector(这个word对应的列为1,其他全为0)。但是这种方法可能会非常低效,而且会耗费大量的内存,矩阵中全为0和1。
- 最常用的方法就是用一个很小的dense vector表示每个单词,这个vector被称为 `embedding`,该矩阵中的元素为实数。随着BP的训练过程,含义相近的两个word的embedding会越来越靠近(embedding的差的范数会很小)。
- 在处理之前,我们需要对sentence进行预处理,去除其中一些几乎没有意义的词汇,对一些很简单的词汇进行转化。
- 得到比较好的embedding模型之后,也可以将该模型用到其他的NLP应用中。
Encoder-Decoder网络用于机器翻译的流程
- 输入的sentence被转化为单词words,这些words都有其唯一的id,
- 使用encoder计算words的embedding(可以首先训练出这样一个embedding的查找表,然后查找得到)
- 将embedding作为decoder的输入,对于每一个词汇,都给出其所有可能的对应输出词汇的分数,然后使用softmax将分数转化为概率,找出最大概率对应的word,就是翻译结果
- 可能会遇到的一些问题以及解决方案
- 假设所有的输入序列的长度是相同且固定的,但是实际情况并非如此。有几种方法:可以在定义rnn时,将 `sequence_length` 作为一个变量;也可以将所有的sentence用pad填充成一个固定的长度,再输入rnn中进行训练
- 如果输出的词汇表规模很大,则为词汇表中的每个可能输出的单词都计算其概率是一个十分耗时的过程。如果词汇表有50000个word,则对于每一个单词,都会输出50000个概率,这在计算softmax时十分耗时。一种解决方案是:让decoder输出一个尺寸更小的概率向量,然后采用抽样softmax技术,不对词汇表上的所有word计算其loss。TF中可以使用 `sampled_softmax_loss()` 函数实现
- `tf.contrib.legacy_seq2seq` 模块可以十分方便地构建Encoder-Decoder模型
from six.moves import urllib
import errno
import os
import zipfile
WORDS_PATH = "./dataset/words"
WORDS_URL = 'http://mattmahoney.net/dc/text8.zip'


def mkdir_p(path):
    """Create ``path`` and any missing parents; an existing directory is fine.

    Kept for Python 2 support. On Python >= 3.2 the equivalent is
    ``os.makedirs(path, exist_ok=True)``.

    Raises:
        OSError: If creation fails for any reason other than ``path``
            already being a directory.
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        already_a_dir = exc.errno == errno.EEXIST and os.path.isdir(path)
        if not already_a_dir:
            raise


def fetch_words_data(words_url=WORDS_URL, words_path=WORDS_PATH):
    """Download the word corpus if needed and return it as a list of words.

    The archive is cached as ``words.zip`` inside ``words_path``; when that
    file already exists no download takes place.

    Args:
        words_url: URL of the zip archive holding the corpus.
        words_path: Directory used to cache the downloaded archive.

    Returns:
        The first archive member decoded as ASCII and split on whitespace.
    """
    os.makedirs(words_path, exist_ok=True)
    zip_path = os.path.join(words_path, "words.zip")
    if not os.path.exists(zip_path):
        # Download under a temporary name and rename only on success, so an
        # interrupted download never leaves a truncated words.zip that later
        # calls would mistake for a complete one.
        partial_path = zip_path + ".part"
        try:
            urllib.request.urlretrieve(words_url, partial_path)
            os.replace(partial_path, zip_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)
    # If opening the archive fails, the cached file is probably corrupt:
    # delete it (or download it manually into words_path) and retry.
    with zipfile.ZipFile(zip_path) as f:
        data = f.read(f.namelist()[0])
    return data.decode("ascii").split()
# Load the corpus (downloading it on first use) and report its size.
words = fetch_words_data()
word_count = len(words)
print(word_count)
17005207
# Convert every word into an integer id.
from collections import Counter

vocabulary_size = 50000

# Id 0 is reserved for "UNK"; the remaining ids follow the words in
# descending order of frequency.
vocabulary = [("UNK", None)] + Counter(words).most_common(vocabulary_size - 1)
vocabulary = np.array([word for word, _ in vocabulary])
dictionary = {word: code for code, word in enumerate(vocabulary)}
# Words missing from the dictionary fall back to id 0.
data = np.array([dictionary.get(word, 0) for word in words])

print(words[:5])
print(data[:5])
['anarchism', 'originated', 'as', 'a', 'term']
[5234 3081 12 6 195]
import random
from collections import deque
def generate_batch(batch_size, num_skips, skip_window):
    """Build one skip-gram training batch from the global ``data`` sequence.

    A window of ``2 * skip_window + 1`` consecutive ids slides over ``data``.
    For each window position, the centre id is emitted ``num_skips`` times,
    each time paired with a different, randomly chosen other id of the window.

    Args:
        batch_size: Number of (centre, context) pairs to produce; must be a
            multiple of ``num_skips``.
        num_skips: Number of context ids drawn per centre id; at most
            ``2 * skip_window``.
        skip_window: Number of ids considered on each side of the centre.

    Returns:
        A tuple ``(batch, labels)``: ``batch`` is an int32 array of shape
        ``(batch_size,)`` holding centre ids, ``labels`` an int32 array of
        shape ``(batch_size, 1)`` holding the paired context ids.

    Side effects:
        Advances the global ``data_index`` cursor, wrapping at ``len(data)``.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    window_len = 2 * skip_window + 1  # [ skip_window target skip_window ]
    centers = np.ndarray(shape=(batch_size), dtype=np.int32)
    contexts = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    window = deque(maxlen=window_len)
    # Prime the window with the first `window_len` ids.
    while len(window) < window_len:
        window.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    row = 0
    for _ in range(batch_size // num_skips):
        pick = skip_window  # start on the centre so the first draw is forced
        used = [skip_window]
        for _ in range(num_skips):
            # Rejection-sample a window position that was not used yet.
            while pick in used:
                pick = random.randint(0, window_len - 1)
            used.append(pick)
            centers[row] = window[skip_window]
            contexts[row, 0] = window[pick]
            row += 1
        # Slide the window one id forward (the deque drops the oldest id).
        window.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return centers, contexts
# Start reading the corpus from its first word.
data_index=0
# Demo batch: 8 pairs, 2 context words per centre word, window of 1 on each side.
batch, labels = generate_batch(8, 2, 1)
print( batch )
print( labels.flatten() )
[3081 3081 12 12 6 6 195 195]
[ 12 5234 6 3081 12 195 6 2]
# Show the batch ids next to the words they stand for.
batch, [vocabulary[word] for word in batch]
(array([3081, 3081, 12, 12, 6, 6, 195, 195], dtype=int32),
['originated', 'originated', 'as', 'as', 'a', 'a', 'term', 'term'])
# Skip-gram / NCE training hyperparameters.
batch_size = 128
# Dimension of the embedding vector. Kept equal to the value the graph
# construction cell assigns, so the two declarations do not contradict
# each other.
embedding_size = 150
skip_window = 1  # How many words to consider left and right.
num_skips = 2  # How many times to reuse an input to generate a label.
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64  # Number of negative examples to sample.
learning_rate = 0.01
# Build the skip-gram graph: embedding lookup, NCE loss, Adam training op and
# a cosine-similarity op used to inspect nearest neighbours of validation words.
reset_graph()
# Input data.
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
# NOTE(review): both sizes are (re)assigned here, so these are the values the
# graph is actually built with, whatever was assigned earlier in the file.
vocabulary_size = 50000
embedding_size = 150
# Look up embeddings for inputs.
init_embeds = tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)
embeddings = tf.Variable(init_embeds)
train_inputs = tf.placeholder(tf.int32, shape=[None])
embed = tf.nn.embedding_lookup(embeddings, train_inputs)
# Construct the variables for the NCE loss
nce_weights = tf.Variable(
tf.truncated_normal([vocabulary_size, embedding_size],
stddev=1.0 / np.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
# Compute the average NCE loss for the batch.
# tf.nce_loss automatically draws a new sample of the negative labels each
# time we evaluate the loss.
loss = tf.reduce_mean(
tf.nn.nce_loss(nce_weights, nce_biases, train_labels, embed,
num_sampled, vocabulary_size))
# Construct the Adam optimizer
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
# Compute the cosine similarity between minibatch examples and all embeddings.
# Each embedding row is divided by its L2 norm, so the matmul below yields
# cosine similarities between the validation words and every vocabulary word.
# NOTE(review): `keep_dims` is the older spelling of this argument; newer
# TensorFlow releases spell it `keepdims` -- confirm against the installed version.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), axis=1, keep_dims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
# Add variable initializer.
init = tf.global_variables_initializer()
# Train the model.
num_steps = 10001
with tf.Session() as session:
    init.run()
    average_loss = 0
    for step in range(num_steps):
        print("\rIteration: {}".format(step), end="\t")
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        # Evaluating the training op together with the loss performs one
        # update step and returns the loss value for this batch.
        _, loss_val = session.run([training_op, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 != 0:
            continue
        sim = similarity.eval()
        top_k = 8  # number of nearest neighbors
        for row, example_id in enumerate(valid_examples[:valid_size]):
            # Position 0 of the descending ranking is skipped (expected to be
            # the validation word itself).
            ranking = (-sim[row, :]).argsort()[1:top_k + 1]
            neighbours = "".join(
                " %s," % vocabulary[ranking[k]] for k in range(top_k))
            print("Nearest to %s:" % vocabulary[example_id] + neighbours)
    # Still inside the session: fetch the trained, L2-normalised embeddings.
    final_embeddings = normalized_embeddings.eval()
Iteration: 0 Average loss at step 0 : 285.43389892578125
Nearest to only: desperation, harmed, tanaka, drown, alkenes, candu, laughter, illustrating,
Nearest to or: copyleft, haag, weaned, skilled, gv, gdansk, carcassonne, stokes,
Nearest to united: iii, citizen, crows, decimals, eutelsat, dcc, auckland, ennis,
Nearest to first: liquidity, steinbeck, profoundly, integration, inhabiting, ticino, incrimination, acclaimed,
Nearest to he: transitioned, winchell, resh, goldsmiths, standardised, markings, pursued, satirized,
Nearest to not: censor, fucking, venetian, lu, quarto, contractor, headway, stylus,
Nearest to many: spreadsheets, redeemer, nominees, absurd, alerts, xxvi, transylvanian, autonegotiation,
Nearest to been: powerpc, maccabean, precarious, hounds, hazael, gol, linear, schuster,
Nearest to an: gutierrez, dyess, privations, archaeological, bijection, kon, joh, insemination,
Nearest to six: melds, signer, hurtful, paws, ev, melodies, perennially, adf,
Nearest to however: frank, referring, maximization, beltway, liechtenstein, oxus, erik, vicki,
Nearest to has: camelopardalis, lessen, learning, ji, duddy, brontosaurus, unabomber, semipalatinsk,
Nearest to see: elsinore, gaines, esque, battleship, whip, hl, postings, tint,
Nearest to had: catalytic, frankenstein, tam, lefty, grenville, lineker, shtml, sartre,
Nearest to one: imagines, tijuana, hindrance, motorcyclist, steadfastly, lords, letting, hutchinson,
Nearest to d: schuster, asgard, intriguing, catus, jewellery, leptons, goodwill, prosthetic,
Iteration: 2000 Average loss at step 2000 : 130.98741731071473
Iteration: 4000 Average loss at step 4000 : 62.76376576328278
Iteration: 6000 Average loss at step 6000 : 42.172603695631025
Iteration: 8000 Average loss at step 8000 : 31.732391747474672
Iteration: 10000 Average loss at step 10000 : 25.78748117876053
Nearest to only: one, slovenians, delay, alerts, not, citrate, essendon, wolsey,
Nearest to or: of, uppercase, aruba, arendt, and, spassky, judo, ataxia,
Nearest to united: states, taxi, pus, inconsistencies, worldwide, statistic, phoenician, transmit,
Nearest to first: in, intercession, by, hep, fourier, phalanx, of, sly,
Nearest to he: it, observations, holmes, asparagales, had, that, illyrians, nurse,
Nearest to not: to, they, always, bogus, natchez, otherwise, assist, attend,
Nearest to many: the, astrological, chomsky, accredited, people, transporting, uppercase, boosting,
Nearest to been: has, parte, by, donated, cameron, vajrayana, it, illyrians,
Nearest to an: and, the, microtubules, ustinov, tetrapods, parsley, complement, crouching,
Nearest to six: five, nine, one, eight, seven, three, zero, two,
Nearest to however: ampere, eclipsed, scrimmage, coulomb, mj, precipitation, mathfrak, sidebands,
Nearest to has: is, been, willem, chadic, ataxia, columbus, contended, automorphism,
Nearest to see: heretics, bartholomew, pavements, destructive, topalov, doctorate, groundwater, helm,
Nearest to had: he, but, departures, assassinate, legend, surprisingly, columbus, deuterostomes,
Nearest to one: nine, two, six, seven, three, five, eight, four,
Nearest to d: nine, interlocking, ads, khorasan, perspectives, one, american, circus,
# Save the current embeddings.
# np.save does not create missing parent directories, so make sure the target
# directory exists first.
mkdir_p("./models/words")
np.save("./models/words/my_final_embeddings.npy", final_embeddings)
def plot_with_labels(low_dim_embs, labels):
    """Scatter-plot 2-D points and annotate each one with its label.

    Args:
        low_dim_embs: Array whose rows are (x, y) coordinates; it must have at
            least as many rows as there are labels.
        labels: Texts to attach, in order, to the first ``len(labels)`` rows.

    Draws on a new 12x12 inch matplotlib figure; does not call ``plt.show()``.
    """
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(12, 12))  # in inches
    # Every annotation is placed at the same small offset from its point.
    annotation_style = dict(
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom',
    )
    for point, text in zip(low_dim_embs, labels):
        x, y = point
        plt.scatter(x, y)
        plt.annotate(text, xy=(x, y), **annotation_style)
from sklearn.manifold import TSNE

# Project only the first `plot_only` embeddings (the lowest word ids) to 2-D
# and plot them with their words as labels.
plot_only = 100
tsne = TSNE(
    perplexity=30,
    n_components=2,
    init='pca',
    n_iter=5000,
)
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
labels = [vocabulary[i] for i in range(plot_only)]
plot_with_labels(low_dim_embs, labels)
plt.show()