一、设计最简单的数字识别问题
二、设计最简单的神经网络,仅含一隐藏层,4X3X2,激活函数:sigmoid+softmax;损失函数:(均值)交叉熵cross-entropy
三、利用梯度下降法和BP算法编程求解,观察整个求解过程。
BP算法见:
参考http://neuralnetworksanddeeplearning.com/chap3.html
如采用softmax+cross-entropy,BP1为:
推导参考:https://blog.csdn.net/haolexiao/article/details/72757796
训练数据检查如下:
# coding=utf-8
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = '2'  # suppress TF INFO output; show only warnings and errors
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# Destination for TensorBoard event files.
logs_path = r'c:/temp/log_mnist_softmax'
learning_rate = 5  # gradient-descent step size
training_epochs = 100

# Toy training set: six 4-element vectors, each a flattened 2x2 binary image.
trainData_in = np.array([[1.0, 1.0, 0.0, 0.0],
                         [1.0, 0.0, 1.0, 0.0],
                         [1.0, 0.0, 0.0, 1.0],
                         [1.0, 0.0, 0.0, 0.0],
                         [0.0, 1.0, 0.0, 0.0],
                         [0.0, 0.0, 0.0, 1.0]])
# One-hot targets: [0,1] for the first three samples (two pixels on),
# [1,0] for the last three (a single pixel on).
trainData_out = np.array([[0.0, 1.0],
                          [0.0, 1.0],
                          [0.0, 1.0],
                          [1.0, 0.0],
                          [1.0, 0.0],
                          [1.0, 0.0]])
testData_in = np.array([[0.0, 0.0, 0.0, 1.0]])
testData_out = np.array([[1.0, 0.0]])
print(np.shape(trainData_in))
print(np.shape(trainData_out))
# Visual sanity check of the training data: print each one-hot label and its
# class index, and render each 4-pixel sample as a 2x2 grayscale image.
# NOTE(review): indentation was lost in this listing — the lines below
# L34 appear to be the body of the for-loop; plt.show() presumably belongs
# after the loop so all subplots appear in one figure. Confirm against the
# original source.
for i in range(len(trainData_in)):
print(trainData_out[i])
I=trainData_in[i]
J=trainData_out[i]
print (list(J).index(max(J))) #J is an ndarray; convert to list to use .index() — yields the class index of the one-hot label
I.resize(2,2)
plt.subplot(6,6,i*6+1)
plt.imshow(I,cmap='Greys_r')
plt.show()
前向传播用tensorflow计算图完成,代码如下:
# --- Forward pass: a 4 -> 3 (sigmoid) -> 2 (softmax) network, built as a
# TensorFlow 1.x computation graph. ---
x_input = tf.placeholder(tf.float32, [None, 4], name='x_input')
y_desired = tf.placeholder(tf.float32, [None, 2], name='y_desired')

# Hidden layer (4 -> 3). Weights start as small random values; the
# commented-out all-zero init makes learning fail, as discussed in the text.
# w1 = tf.Variable(tf.zeros([4, 3]), name='w1')
w1 = tf.Variable(tf.truncated_normal([4, 3], stddev=0.1), name='w1')
b1 = tf.Variable(tf.zeros([3]), name='b1')
z1 = tf.matmul(x_input, w1) + b1
y1 = tf.nn.sigmoid(z1)

# Output layer (3 -> 2) with softmax activation.
# w = tf.Variable(tf.zeros([3, 2]), name='w')
w = tf.Variable(tf.truncated_normal([3, 2], stddev=0.1), name='w')
b = tf.Variable(tf.zeros([2]), name='b')
z = tf.matmul(y1, w) + b
y_output = tf.nn.softmax(z, name='y_output')

# Cross-entropy loss, averaged over every element of the output tensor.
lossFun_crossEntropy = -tf.reduce_mean(y_desired * tf.log(y_output))

feed_dict_trainData = {x_input: trainData_in, y_desired: trainData_out}
feed_dict_testData = {x_input: testData_in, y_desired: testData_out}

###
# train_step=tf.train.GradientDescentOptimizer(learning_rate).minimize(lossFun_crossEntropy)
###
tf.summary.scalar('cost', lossFun_crossEntropy)
summary_op = tf.summary.merge_all()
# Run the graph: initialize variables, attach a TensorBoard writer, then train.
# NOTE(review): indentation was lost in this listing — the lines following
# the `with` and `for` statements are their (un-indented) bodies, which
# continue in the next code section.
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
logs_writer=tf.summary.FileWriter(logs_path,graph=tf.get_default_graph())
for epoch in range(training_epochs):
# The built-in optimizer step that the manual BP below replaces:
# _,summary=sess.run([train_step,summary_op],feed_dict=feed_dict_trainData)
误差反向传播 BP计算代码如下:
#####
# Manual backpropagation for softmax + cross-entropy, computed in NumPy from
# tensors fetched out of the TF graph, then written back with tf.assign.
# NOTE(review): tf.nn.sigmoid(...) (L83) and tf.assign(...) (L94-97) are
# invoked inside the epoch loop, so new graph nodes are created every
# iteration and the graph grows without bound. These ops should be built
# once, outside the loop — confirm and refactor.
w1_temp,b1_temp,z1_temp,y1_temp,w_temp,b_temp,z_temp,y_output_temp,cost_temp=\
sess.run([w1,b1,z1,y1,w,b,z,y_output,lossFun_crossEntropy],\
feed_dict=feed_dict_trainData)
# BP1 for softmax + cross-entropy: output-layer error = prediction - target.
delta=y_output_temp-trainData_out #BP1
print('delta=',delta)
nabla_b=delta.sum(axis=0)# sum delta over the batch (column-wise) #BP3
print('nabla_b=',nabla_b)
print('y1=',y1_temp)
nabla_w=np.dot(y1_temp.transpose(),delta) #BP4: gradient w.r.t. output-layer weights
print('nabla_w=',nabla_w)
print('z1=',z1_temp)
# sigmoid'(z1) = sigmoid(z1) * (1 - sigmoid(z1)), evaluated via the graph
dSigmod_z1=sess.run(tf.nn.sigmoid(z1_temp)*(1-tf.nn.sigmoid(z1_temp)))
print('dSigmod_z1=',dSigmod_z1)
delta=np.dot(delta,w_temp.transpose())*dSigmod_z1 #BP2: propagate the error back to the hidden layer
print('w=',w_temp)
print('delta=',delta)
nabla_b1=delta.sum(axis=0)# sum delta over the batch (column-wise) #BP3
print('nabla_b1=',nabla_b1)
nabla_w1=np.dot(trainData_in.transpose(),delta) #BP4: gradient w.r.t. hidden-layer weights
print('x_input=',trainData_in)
print('nabla_w=',nabla_w1)
# m samples, n output classes; the gradients are divided by m*n because the
# loss is the mean over all m*n output elements (tf.reduce_mean above).
m,n=np.shape(trainData_out)
update_w1=tf.assign(w1,w1-learning_rate/m/n*nabla_w1)
update_b1=tf.assign(b1,b1-learning_rate/m/n*nabla_b1)
update_w=tf.assign(w,w-learning_rate/m/n*nabla_w)
update_b=tf.assign(b,b-learning_rate/m/n*nabla_b)
print('w1\'=',sess.run(update_w1))
print('b1\'=',sess.run(update_b1))
print('w\'=',sess.run(update_w))
print('b\'=',sess.run(update_b))
#####
summary=sess.run(summary_op,feed_dict=feed_dict_trainData)
logs_writer.add_summary(summary,epoch)
当w1,b1,w,b初始值都设为0时,6个样例的delta和正好=0,由BP3,输出层b对lossFun的敏感度nabla_b=0.
因为w=0,由BP2,delta无法传递到隐藏层,由BP3,隐藏层b1对lossFun的敏感度nabla_b1=0,由BP4,隐藏层w1对lossFun的敏感度nabla_w1=0。
因为输出层的delta正负正好相等,隐藏层的激活值y1全是0.5,由BP4,输出层的w对lossFun的敏感度nabla_w=0
所以,网络学习失败!!!
改变w1,w的初始值,见程序中。学习率5,学习100次,结果见下图,和tensorflow的tf.train.GradientDescentOptimizer(learning_rate).minimize(lossFun_crossEntropy)结果基本一致!
随机生成w1,w,进行训练3次,平均交叉熵cost如下图,在训练初始,cost均上升,在50步到70步之间出现波动。
异常波动的原因是学习率过大引起。将学习率/10,第49步到50步,Cost如下图:
从59步到60步,微调学习率,代码如下:
# coding=utf-8
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = '2'  # suppress TF INFO output; show only warnings and errors
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

logs_path = r'c:/temp/log_mnist_softmax'
learning_rate = 5  # note: the error becomes large once this exceeds 0.05
training_epochs = 1

# Same toy data set as the first listing: six flattened 2x2 binary images.
trainData_in = np.array([[1.0, 1.0, 0.0, 0.0],
                         [1.0, 0.0, 1.0, 0.0],
                         [1.0, 0.0, 0.0, 1.0],
                         [1.0, 0.0, 0.0, 0.0],
                         [0.0, 1.0, 0.0, 0.0],
                         [0.0, 0.0, 0.0, 1.0]])
# One-hot targets: [0,1] for the first three samples, [1,0] for the rest.
trainData_out = np.array([[0.0, 1.0],
                          [0.0, 1.0],
                          [0.0, 1.0],
                          [1.0, 0.0],
                          [1.0, 0.0],
                          [1.0, 0.0]])
testData_in = np.array([[0.0, 0.0, 0.0, 1.0]])
testData_out = np.array([[1.0, 0.0]])
print(np.shape(trainData_in))
print(np.shape(trainData_out))
# --- Rebuild the same 4 -> 3 (sigmoid) -> 2 (softmax) graph. ---
x_input = tf.placeholder(tf.float32, [None, 4], name='x_input')
y_desired = tf.placeholder(tf.float32, [None, 2], name='y_desired')

# Hidden layer: small random weight init (an all-zero init prevents learning).
# w1 = tf.Variable(tf.zeros([4, 3]), name='w1')
w1 = tf.Variable(tf.truncated_normal([4, 3], stddev=0.1), name='w1')
b1 = tf.Variable(tf.zeros([3]), name='b1')
z1 = tf.matmul(x_input, w1) + b1
y1 = tf.nn.sigmoid(z1)

# Output layer with softmax activation.
# w = tf.Variable(tf.zeros([3, 2]), name='w')
w = tf.Variable(tf.truncated_normal([3, 2], stddev=0.1), name='w')
b = tf.Variable(tf.zeros([2]), name='b')
z = tf.matmul(y1, w) + b
y_output = tf.nn.softmax(z, name='y_output')

# Cross-entropy loss, averaged over every element of the output tensor.
lossFun_crossEntropy = -tf.reduce_mean(y_desired * tf.log(y_output))

feed_dict_trainData = {x_input: trainData_in, y_desired: trainData_out}
feed_dict_testData = {x_input: testData_in, y_desired: testData_out}
# Probe the loss surface along the straight line between two saved parameter
# snapshots (apparently taken at two consecutive training steps — see the
# surrounding text), by loading interpolated parameters into the graph and
# evaluating the cross-entropy at 11 points.
# NOTE(review): indentation was lost in this listing — the un-indented lines
# below are the bodies of the `with`, the epoch loop, the inner `for i`
# loop, and the `if/else`. Confirm nesting against the original source.
# NOTE(review): tf.assign(...) is called inside the inner loop, adding new
# graph nodes on every iteration; the assign ops should be built once with a
# placeholder input.
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for epoch in range(training_epochs):
#####
# First snapshot of all four parameter tensors.
w1_=np.array([[0.46067277,-1.9804744,1.5933108],
[0.17863363,-0.82081705,0.86146206],
[0.4359305,-1.3856853,1.3689806],
[0.1085812,-0.83159065,0.861873]])
b1_=np.array([-0.71817315,1.857689,-1.7725247])
w_=np.array([[-0.49652696,0.26705313],
[2.5914109,-2.7570033],
[-1.9809961,1.9674655]])
b_=np.array([0.6382933,-0.63829315])
# Second snapshot (one training step later).
w1_50=np.array([[0.53725445,-2.3688202,1.9239993],
[0.21249956,-1.0122195,1.0180486],
[0.45423114,-1.4623545,1.4372292],
[0.14236526,-1.0221516,1.0185666]])
b1_50=np.array([-0.64476234,1.4880188,-1.4563937])
w_50=np.array([[-0.68910146,0.45962757],
[2.5146284,-2.6802208],
[-2.266828,2.2532973]])
b_50=np.array([0.25415757,-0.25415745])
# Recover the (scaled) gradients by dividing the parameter change by the
# learning rate that produced it (5).
nabla_w1=(w1_50-w1_)/5
nabla_b1=(b1_50-b1_)/5
nabla_w=(w_50-w_)/5
nabla_b=(b_50-b_)/5
cost_fi=np.zeros(11)
fi=np.zeros(11)
# Sweep an effective learning rate of 0.5*i for i = 0..10 (i.e. 0 to 5)
# along the recovered gradient direction.
for i in range(11):
w1_50=w1_+nabla_w1*0.5*i
b1_50=b1_+nabla_b1*0.5*i
w_50=w_+nabla_w*0.5*i
b_50=b_+nabla_b*0.5*i
# Dead branch kept for experimentation: load the first snapshot unchanged.
if False:
update_w1=tf.assign(w1,w1_)
update_b1=tf.assign(b1,b1_)
update_w=tf.assign(w,w_)
update_b=tf.assign(b,b_)
else:
update_w1=tf.assign(w1,w1_50)
update_b1=tf.assign(b1,b1_50)
update_w=tf.assign(w,w_50)
update_b=tf.assign(b,b_50)
print('w1\'=',sess.run(update_w1))
print('b1\'=',sess.run(update_b1))
print('w\'=',sess.run(update_w))
print('b\'=',sess.run(update_b))
fi[i]=i
# Evaluate the loss at this point on the line and record it for the plot.
cost_fi[i]=sess.run(lossFun_crossEntropy,feed_dict=feed_dict_trainData)
plt.plot(fi,cost_fi)
plt.show()
#####