本篇blog主要以code+markdown的形式介绍tf这本实战书。(建议使用jupyter来学习)
第四章 深层神经网络(DNN)
-
4.1 深度学习与深度神经网络(线性模型、激活函数)
-
4.2 损失函数定义(经典loss函数、自定义loss函数)
-
4.3 神经网络优化算法
-
4.4 神经网络进一步优化
4.1 深度学习与深度神经网络(线性模型、激活函数)
4.1.1 线性模型的局限性
根据上一章简单的神经网络中的模型有
由前向传播算法有
整理以上公式得整个模型输出为
根据矩阵乘法结合律有
而可以表示为一个新的参数
这样输入输出关系可表示为
但是对于许多问题,线性模型并不适用,也就是很难用线性的方式去分类。这时我们可以使用激活函数(如ReLU)实现去线性化。
4.1.2 激活函数实现去线性化
激活函数三种类型
ReLU
sigmoid函数
tanh函数
# 4.1 ReLU
# Two-layer fully connected network with ReLU activations:
# input (1, 2) -> hidden (3 units) -> output (1 unit).
w1 = tf.Variable(tf.random_normal(shape=[2, 3], seed=1, stddev=1))
w2 = tf.Variable(tf.random_normal(shape=[3, 1], seed=1, stddev=1))
x = tf.placeholder(tf.float32, shape=(1, 2), name="x-input")
biases1 = tf.constant([[-0.5, 0.1, -0.1]])
biases2 = tf.constant([[0.1]])
a = tf.nn.relu(tf.matmul(x, w1) + biases1)
y = tf.nn.relu(tf.matmul(a, w2) + biases2)
# Context manager guarantees the session is released
# (the original created a Session and never closed it).
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(a, feed_dict={x: [[0.7, 0.9]]}))
    print(sess.run(y, feed_dict={x: [[0.7, 0.9]]}))
4.1.3 多层网络解决异或运算
加入隐藏层处理异或运算。(感知机perceptron)
4.2 损失函数定义
4.2.1 经典损失函数
- 交叉熵 cross_entropy
用p,q表示两个概率分布,交叉熵为:
对于该类问题使用softmax回归
# 4.2 cross-entropy
# Single linear unit y = x * w1 with a clipped cross-entropy loss.
# NOTE(review): y is a raw linear output, not a probability (no softmax
# applied before tf.log) — the book introduces softmax separately; confirm.
batch_size = 8
x = tf.placeholder(tf.float32, shape=(None, 2), name="x-input")
y_ = tf.placeholder(tf.float32, shape=(None, 1), name='y-input')
w1 = tf.Variable(tf.random_normal([2, 1], stddev=1, seed=1))
y = tf.matmul(x, w1)
# clip_by_value keeps the argument of log() away from 0 (log(0) = -inf)
cross_entropy = -tf.reduce_mean(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))
# Context manager releases the session (the original leaked it).
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(y, feed_dict={x: [[0.7, 0.9]]}))
    print(sess.run(cross_entropy, feed_dict={x: [[0.7, 0.9]], y_: [[1.0]]}))
- 使用tf.clip_by_value将张量中的数值限制在一定范围
# tf.clip_by_value clamps every element of the tensor into [2.5, 4.5].
v = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
# InteractiveSession installs itself as the default session so .eval() works;
# the following snippets in this post rely on it staying open.
sess = tf.InteractiveSession()
print(tf.clip_by_value(v, 2.5, 4.5).eval())
- 元素相乘(点乘*)vs 矩阵乘法(tf.matmul)
# '*' multiplies element-by-element; tf.matmul is true matrix multiplication.
v1 = tf.constant([[1.0, 2.0], [3.0, 4.0]])
v2 = tf.constant([[1.0, 2.0], [3.0, 4.0]])
sess = tf.InteractiveSession()
print((v1 * v2).eval())          # element-wise product
print(tf.matmul(v1, v2).eval())  # matrix product
输出结果
- 使用tf.reduce_mean(): 根据给出的axis在input_tensor上求平均值。除非keep_dims为真,axis中的每个的张量秩会减少1。如果keep_dims为真,求平均值的维度的长度都会保持为1.如果不设置axis,所有维度上的元素都会被求平均值,并且只会返回一个只有一个元素的张量。
# tf.reduce_mean without an axis averages over all elements:
# (1+2+3+4+5+6)/6 = 3.5. Made self-contained — the original depended on an
# InteractiveSession opened by an earlier snippet still being the default.
v = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
with tf.Session():
    print(tf.reduce_mean(v).eval())
# prints 3.5
交叉熵代码(y表示原始神经网络的输出结果,y_表示标准答案)
# Combined softmax + cross-entropy in a single op (y: raw network logits,
# y_: ground-truth labels). Returns a per-example loss vector — usually
# wrapped in tf.reduce_mean for a scalar loss.
# NOTE(review): deprecated in later TF 1.x in favor of
# softmax_cross_entropy_with_logits_v2 — confirm against installed version.
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels = y_, logits = y)
- 对于回归问题的均方误差MSE
# MSE (mean squared error) loss for regression.
y_ = tf.placeholder(tf.float32, shape=(None, 1), name='y-input')
w1 = tf.Variable(tf.random_normal([2, 1], stddev=1, seed=1))
# shape=(None, 2) generalizes the original fixed (1, 2) to any batch size
# and matches the other placeholders in this post.
x = tf.placeholder(tf.float32, shape=(None, 2))
y = tf.matmul(x, w1)
mse = tf.reduce_mean(tf.square(y_ - y))
with tf.Session() as sess:  # auto-closed (the original leaked the session)
    sess.run(tf.global_variables_initializer())
    # Print the loss (the original ran the op but discarded the result).
    print(sess.run(mse, feed_dict={x: [[0.7, 0.9]], y_: [[1.0]]}))
4.2.2 自定义损失函数
例如下面分段损失函数
代码实例
- 定义神经网络的相关参数和变量
# Network parameters: a single linear layer y = x * w1 (2 inputs -> 1 output).
batch_size = 8
x = tf.placeholder(tf.float32, shape=(None, 2), name="x-input")
y_ = tf.placeholder(tf.float32, shape=(None, 1), name='y-input')
w1= tf.Variable(tf.random_normal([2, 1], stddev=1, seed=1))
y = tf.matmul(x, w1)
- 设置自定义的损失函数
# Asymmetric loss: under-prediction (y < y_) costs 10x more than
# over-prediction, so training is biased toward predicting high.
loss_less = 10
loss_more = 1
loss = tf.reduce_sum(tf.where(tf.greater(y, y_), (y - y_) * loss_more, (y_ - y) * loss_less))
train_step = tf.train.AdamOptimizer(0.001).minimize(loss)
对于tf.where及tf.greater比较
v1 = tf.constant([1.0, 2.0, 3.0, 4.0])
v2 = tf.constant([4.0, 3.0, 2.0, 1.0])
# Context manager releases the session (the original leaked it).
with tf.Session() as sess:
    # tf.greater: element-wise comparison, returns a bool tensor
    print(sess.run(tf.greater(v1, v2)))
    # tf.where: takes from the 2nd arg where True, from the 3rd where False
    print(sess.run(tf.where(tf.greater(v1, v2), v1, v2)))
- 生成模拟数据集
# Simulated regression data: target = x1 + x2 plus uniform noise in [-0.05, 0.05).
rdm = RandomState(1)
X = rdm.rand(128, 2)
Y = [[row[0] + row[1] + (rdm.rand() / 10.0 - 0.05)] for row in X]
- 训练模型
# Mini-batch training loop over the 128-sample dataset.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    STEPS = 5000
    for step in range(STEPS):
        # cycle through the dataset in slices of batch_size
        start = (step * batch_size) % 128
        end = start + batch_size
        sess.run(train_step, feed_dict={x: X[start:end], y_: Y[start:end]})
        if step % 1000 == 0:
            print("After %d training step(s), w1 is: " % (step))
            print(sess.run(w1), "\n")
    print("Final w1 is: \n", sess.run(w1))
完整代码
# Define the network parameters: one linear unit y = x * w1.
batch_size = 8
x = tf.placeholder(tf.float32, shape=(None, 2), name="x-input")
y_ = tf.placeholder(tf.float32, shape=(None, 1), name='y-input')
w1 = tf.Variable(tf.random_normal([2, 1], stddev=1, seed=1))
y = tf.matmul(x, w1)
# Custom asymmetric loss: under-predicting costs 10, over-predicting costs 1,
# so the fitted model should lean toward predicting high.
loss_less = 10
loss_more = 1
loss = tf.reduce_sum(tf.where(tf.greater(y, y_), (y - y_) * loss_more, (y_ - y) * loss_less))
train_step = tf.train.AdamOptimizer(0.001).minimize(loss)
# Synthetic data: target = x1 + x2 + uniform noise in [-0.05, 0.05).
rdm = RandomState(1)
X = rdm.rand(128, 2)
Y = [[row[0] + row[1] + (rdm.rand() / 10.0 - 0.05)] for row in X]
# Mini-batch training loop.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    STEPS = 5000
    for step in range(STEPS):
        start = (step * batch_size) % 128
        end = start + batch_size
        sess.run(train_step, feed_dict={x: X[start:end], y_: Y[start:end]})
        if step % 1000 == 0:
            print("After %d training step(s), w1 is: " % (step))
            print(sess.run(w1), "\n")
    print("Final w1 is: \n", sess.run(w1))
- 重新定义损失函数,使得预测多了的损失大,于是模型应该偏向少的方向预测
# Inverted penalties: over-predicting now costs 10x more, so the model
# should lean toward predicting low.
loss_less = 1
loss_more = 10
loss = tf.reduce_sum(tf.where(tf.greater(y, y_), (y - y_) * loss_more, (y_ - y) * loss_less))
train_step = tf.train.AdamOptimizer(0.001).minimize(loss)
with tf.Session() as sess:
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    STEPS = 5000
    for i in range(STEPS):
        start = (i*batch_size) % 128
        end = (i*batch_size) % 128 + batch_size
        sess.run(train_step, feed_dict={x: X[start:end], y_: Y[start:end]})
        if i % 1000 == 0:
            print("After %d training step(s), w1 is: " % (i))
            # Fixed: the original used Python 2 print statements here
            # (a SyntaxError under Python 3; the rest of the post uses print()).
            print(sess.run(w1), "\n")
    print("Final w1 is: \n", sess.run(w1))
- 定义损失函数为MSE
# MSE objective. Fixed argument order: the signature is
# tf.losses.mean_squared_error(labels, predictions); the original passed
# (y, y_) — numerically identical for a squared error (it is symmetric),
# but reversed relative to the documented contract.
loss = tf.losses.mean_squared_error(y_, y)
train_step = tf.train.AdamOptimizer(0.001).minimize(loss)
with tf.Session() as sess:
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    STEPS = 5000
    for i in range(STEPS):
        start = (i*batch_size) % 128
        end = (i*batch_size) % 128 + batch_size
        sess.run(train_step, feed_dict={x: X[start:end], y_: Y[start:end]})
        if i % 1000 == 0:
            print("After %d training step(s), w1 is: " % (i))
            print(sess.run(w1), "\n")
    print("Final w1 is: \n", sess.run(w1))
计算结果可以自行验证
4.3 神经网络优化算法
Gradient Descent(梯度下降)
缺点:
- 梯度下降得到的不一定是全局最优(只有当损失函数是凸函数时)
- 速度较慢(用SGD加速,超参数batch_size)
# 4.3 Neural network optimization
# Mini-batch (stochastic) gradient descent; batch_size is the hyper-parameter.
batch_size = 8
x = tf.placeholder(tf.float32, shape=(None, 2), name="x-input")
y_ = tf.placeholder(tf.float32, shape=(None, 1), name="y-input")
w1 = tf.Variable(tf.random_normal([2, 1], stddev=1, seed=1))
y = tf.matmul(x, w1)
# Fixed argument order: tf.losses.mean_squared_error(labels, predictions) —
# the original passed (y, y_). Same value for MSE, but matches the API.
loss = tf.losses.mean_squared_error(y_, y)
train_step = tf.train.AdamOptimizer(0.001).minimize(loss)
# Synthetic data: target = x1 + x2 + uniform noise in [-0.05, 0.05).
rdm = RandomState(1)
X = rdm.rand(128, 2)
Y = [[x1 + x2 + (rdm.rand() / 10.0 - 0.05)] for (x1, x2) in X]
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    STEPS = 5000
    for i in range(STEPS):
        start = (i * batch_size) % 128
        end = (i * batch_size) % 128 + batch_size
        sess.run(train_step, feed_dict={x: X[start:end], y_: Y[start:end]})
        if i % 1000 == 0:
            print("After %d training steps, w1 is: " % (i))
            print(sess.run(w1), "\n")
    print("Final w1 is : \n", sess.run(w1))
Back Propagation(反向传播)
4.4 神经网络进一步优化
- 指数衰减设置梯度下降的学习率 learning rate
- 拟合问题 overfitting
- 滑动平均模型 moving average
4.4.1 学习率的设置
- 学习率为1时
# 4.4.1 Learning rate
# decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
# With a learning rate of 1, gradient descent on y = x^2 overshoots and x
# oscillates between 5 and -5 instead of converging.
# Renamed from `train_step`: every other snippet in this post uses that name
# for the optimizer *op*, so reusing it for an iteration count was confusing.
TRAINING_STEPS = 10
learning_rate = 1
x = tf.Variable(tf.constant(5, dtype=tf.float32), name="x")
y = tf.square(x)
train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(y)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(TRAINING_STEPS):
        sess.run(train_op)
        x_value = sess.run(x)
        print("After %s iteration(s): x%s is %f."% (i+1, i+1, x_value))
- 学习率调整为0.001,可以试验下降速度
# With a learning rate of 0.001 the descent converges, but far too slowly.
TRAINING_STEPS = 1000
LEARNING_RATE = 0.001
x = tf.Variable(5.0, name="x")
y = tf.square(x)
train_op = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(y)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(TRAINING_STEPS):
        sess.run(train_op)
        if step % 100 == 0:
            print("After %s iteration(s): x%s is %f."% (step+1, step+1, sess.run(x)))
- 使用指数衰减的学习率,在迭代初期得到较高的下降速度,可以在较小的训练轮数下取得不错的收敛程度
# Exponentially decaying learning rate: high at first for fast initial
# progress, multiplied by 0.96 every step (staircase mode).
TRAINING_STEPS = 100
# trainable=False keeps the step counter out of the trainable-variable
# collection — it is bookkeeping, not a parameter to optimize.
global_step = tf.Variable(0, trainable=False)
LEARNING_RATE = tf.train.exponential_decay(0.1, global_step, 1, 0.96, staircase=True)
x = tf.Variable(tf.constant(5, dtype=tf.float32), name="x")
y = tf.square(x)
# Passing global_step makes minimize() increment it on every update.
train_op = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(y, global_step=global_step)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(TRAINING_STEPS):
        sess.run(train_op)
        if i % 10 == 0:
            LEARNING_RATE_value = sess.run(LEARNING_RATE)
            x_value = sess.run(x)
            print("After %s iteration(s): x%s is %f, learning rate is %f."% (i+1, i+1, x_value, LEARNING_RATE_value))
4.4.2 过拟合问题
为了避免过拟合问题,使用正则化(regularization)
- L1正则
- L2正则
- L1&L2正则
# 4.4.2 Overfitting: L1 / L2 regularization on a small weight matrix.
weights = tf.constant([[1.0, -2.0], [-3.0, 4.0]])
l2 = tf.contrib.layers.l2_regularizer(0.5)
l1 = tf.contrib.layers.l1_regularizer(0.5)
with tf.Session() as sess:
    print(sess.run(l2(weights)))  # 0.5 * (1+4+9+16)/2 = 7.5
    print(sess.run(l1(weights)))  # 0.5 * (1+2+3+4)   = 5.0
- 生成模拟数据集
# Generate a simulated dataset.
import matplotlib.pyplot as plt
import numpy as np
data = []
label = []
np.random.seed(0)
# Points inside the unit circle centered at the origin get label 0, points
# outside get label 1; Gaussian noise (sigma 0.1) is added to every point.
for i in range(150):
    x1 = np.random.uniform(-1, 1)
    x2 = np.random.uniform(0, 2)
    if x1**2 + x2**2 <= 1:
        data.append([np.random.normal(x1, 0.1), np.random.normal(x2, 0.1)])
        label.append(0)
    else:
        data.append([np.random.normal(x1, 0.1), np.random.normal(x2, 0.1)])
        label.append(1)
data = np.hstack(data).reshape(-1, 2)
label = np.hstack(label).reshape(-1, 1)
plt.scatter(data[:,0], data[:,1], c=np.squeeze(label), cmap="RdBu", vmin=-.2, vmax=1.2, edgecolor="white")
# Fixed: the original also called plt.contour(xx, yy, probs, ...) here, but
# xx/yy/probs are only defined later (after training) — it raised NameError.
plt.show()
- 定义一个获取权重,并自动加入正则项到损失的函数
def get_weight(shape, lambda1):
    """Create a weight Variable and register its L2 penalty in the 'losses' collection."""
    regularizer = tf.contrib.layers.l2_regularizer(lambda1)
    weight = tf.Variable(tf.random_normal(shape), dtype=tf.float32)
    tf.add_to_collection('losses', regularizer(weight))
    return weight
- 定义神经网络
x = tf.placeholder(tf.float32, shape=(None, 2))
y_ = tf.placeholder(tf.float32, shape=(None, 1))
sample_size = len(data)
# Width of each layer: 2 inputs -> 10 -> 5 -> 3 -> 1 output.
layer_dimension = [2, 10, 5, 3, 1]
# Total number of layers.
n_layers = len(layer_dimension)
# Deepest node built so far; starts at the input.
cur_layer = x
# Width of the layer currently feeding forward.
in_dimension = layer_dimension[0]
# Stack the fully connected ReLU layers one at a time.
for layer in range(1, n_layers):
    out_dimension = layer_dimension[layer]
    # Weight for this layer; get_weight also registers its L2 penalty.
    weight = get_weight([in_dimension, out_dimension], 0.003)
    bias = tf.Variable(tf.constant(0.1, shape=[out_dimension]))
    # ReLU activation.
    cur_layer = tf.nn.relu(tf.matmul(cur_layer, weight) + bias)
    # This layer's width becomes the next layer's input width.
    in_dimension = out_dimension
y = cur_layer
# Squared-error data term (a sum over squared errors).
mse_loss = tf.reduce_sum(tf.square(y_ - y))
# Total loss = data term + all registered L2 penalties.
tf.add_to_collection('losses', mse_loss)
loss = tf.add_n(tf.get_collection('losses'))
- 训练不带正则项的损失函数mse_loss
# Train on the un-regularized objective mse_loss.
train_op = tf.train.AdamOptimizer(0.001).minimize(mse_loss)
TRAINING_STEPS = 40000
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    for step in range(TRAINING_STEPS):
        sess.run(train_op, feed_dict={x: data, y_: label})
        if step % 2000 == 0:
            print("After %d steps, mse_loss: %f" % (step, sess.run(mse_loss, feed_dict={x: data, y_: label})))
    # Evaluate the learned decision surface over a dense grid.
    xx, yy = np.mgrid[-1.2:1.2:.01, -0.2:2.2:.01]
    grid = np.c_[xx.ravel(), yy.ravel()]
    probs = sess.run(y, feed_dict={x: grid})
    probs = probs.reshape(xx.shape)
plt.scatter(data[:,0], data[:,1], c=np.squeeze(label), cmap="RdBu", vmin=-.2, vmax=1.2, edgecolor="white")
plt.contour(xx, yy, probs, levels=[.5], cmap="Greys", vmin=0, vmax=.1)
plt.show()
- 训练带正则项的损失函数loss
# Train on the regularized objective loss (mse term + L2 penalties).
train_op = tf.train.AdamOptimizer(0.001).minimize(loss)
TRAINING_STEPS = 40000
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    for i in range(TRAINING_STEPS):
        sess.run(train_op, feed_dict={x: data, y_: label})
        if i % 2000 == 0:
            print("After %d steps, loss: %f" % (i, sess.run(loss, feed_dict={x: data, y_: label})))
    # Fixed for consistency: the un-regularized sibling plots over
    # [-1.2, 1.2] x [-0.2, 2.2]; the original here used [-1, 1] x [0, 2],
    # cropping this plot relative to the one it is compared against.
    xx, yy = np.mgrid[-1.2:1.2:.01, -0.2:2.2:.01]
    grid = np.c_[xx.ravel(), yy.ravel()]
    probs = sess.run(y, feed_dict={x: grid})
    probs = probs.reshape(xx.shape)
plt.scatter(data[:,0], data[:,1], c=np.squeeze(label), cmap="RdBu", vmin=-.2, vmax=1.2, edgecolor="white")
plt.contour(xx, yy, probs, levels=[.5], cmap="Greys", vmin=0, vmax=.1)
plt.show()
不带正则完整代码
# Complete end-to-end example (training ignores the regularization term).
# --- Simulated dataset ---
import matplotlib.pyplot as plt
import numpy as np
data = []
label = []
np.random.seed(0)
# The unit circle centered at the origin splits the points: inside -> 0,
# outside -> 1; Gaussian noise (sigma 0.1) is added to every coordinate.
# (Same RNG call order as the original: uniform, uniform, normal, normal.)
for _ in range(150):
    x1 = np.random.uniform(-1, 1)
    x2 = np.random.uniform(0, 2)
    data.append([np.random.normal(x1, 0.1), np.random.normal(x2, 0.1)])
    label.append(0 if x1 ** 2 + x2 ** 2 <= 1 else 1)
data = np.hstack(data).reshape(-1, 2)
label = np.hstack(label).reshape(-1, 1)

def get_weight(shape, lambda1):
    """Return a fresh weight Variable; its L2 penalty joins the 'losses' collection."""
    var = tf.Variable(tf.random_normal(shape), dtype=tf.float32)
    tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(lambda1)(var))
    return var

# --- Network definition ---
x = tf.placeholder(tf.float32, shape=(None, 2))
y_ = tf.placeholder(tf.float32, shape=(None, 1))
sample_size = len(data)
# Layer widths: 2 inputs -> 10 -> 5 -> 3 -> 1 output.
layer_dimension = [2, 10, 5, 3, 1]
n_layers = len(layer_dimension)
cur_layer = x
in_dimension = layer_dimension[0]
# Stack fully connected ReLU layers.
for idx in range(1, n_layers):
    out_dimension = layer_dimension[idx]
    weight = get_weight([in_dimension, out_dimension], 0.003)
    bias = tf.Variable(tf.constant(0.1, shape=[out_dimension]))
    cur_layer = tf.nn.relu(tf.matmul(cur_layer, weight) + bias)
    in_dimension = out_dimension
y = cur_layer
# Squared-error data term, added to the loss collection alongside L2 penalties.
mse_loss = tf.reduce_sum(tf.square(y_ - y))
tf.add_to_collection('losses', mse_loss)
loss = tf.add_n(tf.get_collection('losses'))
# --- Training (objective is mse_loss only; the penalties are left unused) ---
train_op = tf.train.AdamOptimizer(0.001).minimize(mse_loss)
TRAINING_STEPS = 40000
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    for step in range(TRAINING_STEPS):
        sess.run(train_op, feed_dict={x: data, y_: label})
        if step % 2000 == 0:
            print("After %d steps, mse_loss: %f" % (step, sess.run(mse_loss, feed_dict={x: data, y_: label})))
    # Decision boundary over a dense grid.
    xx, yy = np.mgrid[-1.2:1.2:.01, -0.2:2.2:.01]
    grid = np.c_[xx.ravel(), yy.ravel()]
    probs = sess.run(y, feed_dict={x: grid})
    probs = probs.reshape(xx.shape)
plt.scatter(data[:, 0], data[:, 1], c=np.squeeze(label), cmap="RdBu", vmin=-.2, vmax=1.2, edgecolor="white")
plt.contour(xx, yy, probs, levels=[.5], cmap="Greys", vmin=0, vmax=.1)
plt.show()
4.4.3 滑动平均模型
- 定义变量及滑动平均类
# 4.4.3 Moving averages
# The float32 variable whose moving average will be tracked.
v1 = tf.Variable(0, dtype=tf.float32)
# Step counter; trainable=False keeps it out of the trainable collection.
step = tf.Variable(0, trainable=False)
# Moving-average helper with decay 0.99; supplying `step` (num_updates)
# lowers the effective decay early on: min(0.99, (1 + step) / (10 + step)).
ema = tf.train.ExponentialMovingAverage(0.99, step)
# Op that refreshes the shadow (averaged) value of every listed variable.
maintain_averages_op = ema.apply([v1])
- 查看不同迭代中变量取值的变化
with tf.Session() as sess:
    # Initialize: v1 = 0, step = 0, shadow value starts at 0.
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    print(sess.run([v1, ema.average(v1)]))
    # Set v1 = 5. With step = 0 the effective decay is
    # min(0.99, (1+0)/(10+0)) = 0.1, so the average moves to
    # 0.1 * 0 + 0.9 * 5 = 4.5.
    sess.run(tf.assign(v1, 5))
    sess.run(maintain_averages_op)
    print(sess.run([v1, ema.average(v1)]) )
    # Raise step to 10000 and v1 to 10; decay becomes
    # min(0.99, 10001/10010) = 0.99, so the average creeps slowly toward 10.
    sess.run(tf.assign(step, 10000))
    sess.run(tf.assign(v1, 10))
    sess.run(maintain_averages_op)
    print(sess.run([v1, ema.average(v1)]) )
    # One more update nudges the average further toward v1 = 10.
    sess.run(maintain_averages_op)
    print(sess.run([v1, ema.average(v1)]))
(附录:公式latex#csdn老是会改格式导致以前很多blog公式都乱了#)
# latex
W^{'}=W^{(1)}W^{(2)}=\begin{bmatrix}
W^{(1)}_{1,1} & W^{(1)}_{1,2} & W^{(1)}_{1,3} \\
W^{(1)}_{2,1} & W^{(1)}_{2,2} & W^{(1)}_{2,3}
\end{bmatrix}
\begin{bmatrix}
W^{(2)}_{1,1} \\ W^{(2)}_{2,1} \\ W^{(2)}_{3,1}
\end{bmatrix}
=
\begin{bmatrix}
W^{(1)}_{1,1} W^{(2)}_{1,1}+W^{(1)}_{1,2} W^{(2)}_{2,1} +W^{(1)}_{1,3} W^{(2)}_{3,1} \\ W^{(1)}_{2,1} W^{(2)}_{1,1}+W^{(1)}_{2,2} W^{(2)}_{2,1} +W^{(1)}_{2,3} W^{(2)}_{3,1}
\end{bmatrix}
=
\begin{bmatrix}
W^{'}_{1}\\W^{'}_{2}
\end{bmatrix}