L2 regularization: C=C0+lambda/n/2*sum(w^2)
Nesterov's accelerated gradient descent
https://blog.csdn.net/tsyccnh/article/details/76673073
看上面一张图仔细想一下就可以明白,Nesterov动量法和经典动量法的差别就在B点和C点梯度的不同。
记 v_t 为第 t 次迭代时梯度的累积:
v_0 = 0
v_1 = η∇_θ J(θ)
v_2 = γ v_1 + η∇_θ J(θ − γ v_1)
……
v_t = γ v_{t-1} + η∇_θ J(θ − γ v_{t-1})
参数更新公式
θ_new = θ − v_t
公式里的 −γ v_{t-1} 就是图中B到C的那一段向量, θ − γ v_{t-1} 就是C点的坐标(参数)
γ 代表衰减率,η 代表学习率。
NAG积累梯度快速下山,w取大值,但是采用L2正则化时,L2范数的梯度可能对其阻止,并使大的w变小。如图中2,采用NAG加速后,除训练开始时刻外,速度反而变慢,当减小lambda/n减小L2范数的影响后,训练加快,如图中3。
看懂下面的程序,就基本了解BP、SGD、momentum gradient descent、Nesterov's accelerated gradient descent、regularization
# coding=utf-8
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"]='2' # suppress TensorFlow info logs: show only warnings and errors
###data (50000,784),(1000,784),(1000,784):
import pickle
import gzip
import numpy as np
def load_data():
    """Load the pickled, gzipped MNIST dataset from ../data/mnist.pkl.gz.

    Returns:
        (training_data, validation_data, test_data) — each element is an
        (images, labels) pair as stored in the pickle; images are rows of
        784 floats (28x28 flattened).
    """
    # 'with' guarantees the file is closed even if unpickling raises,
    # unlike the previous open()/close() pair.
    with gzip.open('../data/mnist.pkl.gz', 'rb') as f:
        training_data, validation_data, test_data = pickle.load(f, encoding='bytes')
    return (training_data, validation_data, test_data)
def vectorized_result(j, num_classes=10):
    """Return a one-hot float vector with a 1.0 at index ``j``.

    Used to turn an MNIST digit label (0-9) into the 10-dimensional target
    vector expected by the softmax output layer.

    Args:
        j: index of the hot entry (the class label).
        num_classes: length of the returned vector (default 10, keeping
            the original hard-coded behavior).
    Returns:
        numpy array of shape (num_classes,), zeros except e[j] == 1.0.
    """
    e = np.zeros(num_classes)
    e[j] = 1.0
    return e
# Build the demo datasets: small slices of MNIST with one-hot target vectors.
training_data, validation_data, test_data = load_data()
trainData_in = training_data[0][:500]
trainData_out = [vectorized_result(label) for label in training_data[1][:500]]
validData_in = validation_data[0]
validData_out = [vectorized_result(label) for label in validation_data[1]]
testData_in = test_data[0][:100]
testData_out = [vectorized_result(label) for label in test_data[1][:100]]
###net 784X30X10:
import tensorflow as tf
import random
#import matplotlib.pyplot as plt
logs_path=r'c:/temp/log_mnist_softmax_1' # TensorBoard summary output directory
learning_rate=5.0 # original author's note: error becomes very large when > 0.05 — TODO confirm, value here is 5.0
training_epochs=10
batch_size=10
lmbda_n=0.0001 # L2 regularization coefficient lambda/n: C = C0 + lambda/n/2 * sum(w^2)
###Nesterov's accelerated gradient descent state:
gama=0.9 # momentum decay rate (gamma)
# Velocity accumulators for NAG, one per trainable parameter.
# NOTE(review): the name kwarg is passed to tf.zeros (the initializer), not to
# tf.Variable, so the Variables themselves carry default names in the graph.
v_w1=tf.Variable(tf.zeros([784,30], name='v_w1')) # accumulated NAG velocity for w1
v_b1=tf.Variable(tf.zeros([30],name='v_b1'))
v_w=tf.Variable(tf.zeros([30,10],name='v_w'))
v_b=tf.Variable(tf.zeros([10],name='v_b'))
###Network graph: 784 -> 30 (sigmoid) -> 10 (softmax), forward pass evaluated
###at the NAG look-ahead point (theta - gamma*v) so gradients are taken there.
x_input=tf.placeholder(tf.float32, [None,784], name='x_input')
y_desired=tf.placeholder(tf.float32,[None,10],name='y_desired')
w1=tf.Variable(tf.truncated_normal([784,30],stddev=0.1),name='w1')
w1_v=tf.add(w1,-gama*v_w1,name='w1_v') # NAG look-ahead: step w1 toward -v_w1 and take gradients THERE
b1=tf.Variable(tf.zeros([30]),name='b1')
# BUGFIX: this tensor was misnamed 'w1_v' (duplicate of the line above); now 'b1_v'.
b1_v=tf.add(b1,-gama*v_b1,name='b1_v') # NAG look-ahead for b1
z1=tf.matmul(x_input,w1_v)+b1_v
y1=tf.nn.sigmoid(z1)
w=tf.Variable(tf.truncated_normal([30,10],stddev=0.1),name='w')
w_v=tf.add(w,-gama*v_w,name='w_v') # NAG look-ahead for w
b=tf.Variable(tf.zeros([10]),name='b')
b_v=tf.add(b,-gama*v_b,name='b_v') # NAG look-ahead for b
z=tf.matmul(y1,w_v)+b_v
y_output=tf.nn.softmax(z,name='y_output')
lossFun_crossEntropy=-tf.reduce_mean(y_desired*tf.log(y_output)) # mean cross-entropy (averaged over batch*classes)
###L2-regularized cost (for monitoring): C = C0 + lambda/n/2 * sum(w^2)
logsFun_crossEntropy_L2=lossFun_crossEntropy+\
lmbda_n/2.0*tf.add(tf.reduce_sum(tf.square(w1)),tf.reduce_sum(tf.square(w)))
###Manual backpropagation (equations BP1-BP4):
delta=tf.add(y_output,-y_desired) # BP1: output-layer error for softmax + cross-entropy
nabla_b=tf.reduce_sum(delta,axis=0,name='nabla_b') # BP3: sum delta over the batch (column-wise)
nabla_w=tf.matmul(y1,delta,transpose_a=True,name='nabla_w') # BP4
dSigmod_z1=tf.nn.sigmoid(z1)*(1-tf.nn.sigmoid(z1)) # sigmoid'(z1)
delta=tf.matmul(delta,w_v,transpose_b=True)*dSigmod_z1 # BP2; NAG: backprop through the look-ahead w_v
nabla_b1=tf.reduce_sum(delta,axis=0,name='nabla_b1') # BP3: sum delta over the batch
nabla_w1=tf.matmul(x_input,delta,transpose_a=True,name='nabla_w1') # BP4
# Feed dicts for evaluating metrics on the full train/test slices.
feed_dict_trainData={x_input:trainData_in,y_desired:trainData_out}
feed_dict_testData={x_input:testData_in,y_desired:testData_out}
correct_prediction=tf.equal(tf.argmax(y_output,1),\
tf.argmax(y_desired,1)) # argmax along axis 1: one class index per row
accuracy=tf.reduce_mean(tf.cast(correct_prediction,\
tf.float32)) # cast booleans to floats, then average -> fraction correct
###
# Reference: the built-in optimizer that the manual NAG update below replaces.
#train_step=tf.train.GradientDescentOptimizer(learning_rate).minimize(lossFun_crossEntropy)
###
# TensorBoard scalar summaries.
tf.summary.scalar('cost',lossFun_crossEntropy)
tf.summary.scalar('cost_L2',logsFun_crossEntropy_L2)
tf.summary.scalar('accuracy',accuracy)
summary_op=tf.summary.merge_all()
###Training: manual mini-batch SGD with Nesterov momentum and L2 weight decay.
# Build the parameter/velocity update ops ONCE, before the session.  The
# original code called tf.assign(...) inside the inner mini-batch loop, which
# adds new ops to the graph on every batch and leaks memory as the graph grows.
ph_v_w1=tf.placeholder(tf.float32,[784,30])
ph_v_b1=tf.placeholder(tf.float32,[30])
ph_v_w=tf.placeholder(tf.float32,[30,10])
ph_v_b=tf.placeholder(tf.float32,[10])
ph_w1=tf.placeholder(tf.float32,[784,30])
ph_b1=tf.placeholder(tf.float32,[30])
ph_w=tf.placeholder(tf.float32,[30,10])
ph_b=tf.placeholder(tf.float32,[10])
update_v_w1=tf.assign(v_w1,ph_v_w1)
update_v_b1=tf.assign(v_b1,ph_v_b1)
update_v_w=tf.assign(v_w,ph_v_w)
update_v_b=tf.assign(v_b,ph_v_b)
update_w1=tf.assign(w1,ph_w1)
update_b1=tf.assign(b1,ph_b1)
update_w=tf.assign(w,ph_w)
update_b=tf.assign(b,ph_b)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    logs_writer=tf.summary.FileWriter(logs_path,graph=tf.get_default_graph())
    for epoch in range(training_epochs):
        #SGD: reshuffle the training set each epoch, then walk it in mini-batches.
        trainData=list(zip(trainData_in,trainData_out))
        random.shuffle(trainData)
        trainData_in,trainData_out=zip(*trainData)
        batch_count=int(len(trainData_in)/batch_size)
        for i in range(batch_count):
            batch_x=trainData_in[batch_size*i:batch_size*(i+1)]
            batch_y=trainData_out[batch_size*i:batch_size*(i+1)]
            feed_dict_batch={x_input:batch_x,y_desired:batch_y}
            # Fetch current parameters, gradients (taken at the NAG
            # look-ahead point) and accumulated velocities for this batch.
            w1_temp,b1_temp,w_temp,b_temp,\
            nabla_w1_temp,nabla_b1_temp,nabla_w_temp,nabla_b_temp,\
            v_w1_temp,v_b1_temp,v_w_temp,v_b_temp=\
                sess.run([w1,b1,w,b,nabla_w1,\
                          nabla_b1,nabla_w,nabla_b,\
                          v_w1,v_b1,v_w,v_b],\
                         feed_dict=feed_dict_batch)
            ###NAG velocity update: v = gama*v + lr*grad (grad at look-ahead point).
            m,n=np.shape(batch_y)
            v_w1_new=gama*v_w1_temp+learning_rate/m/n*nabla_w1_temp+\
                lmbda_n*learning_rate*w1_temp # + gradient of the L2 term
            v_b1_new=gama*v_b1_temp+learning_rate/m/n*nabla_b1_temp
            v_w_new=gama*v_w_temp+learning_rate/m/n*nabla_w_temp+\
                lmbda_n*learning_rate*w_temp # + gradient of the L2 term
            v_b_new=gama*v_b_temp+learning_rate/m/n*nabla_b_temp
            # Apply all updates through the pre-built assign ops.
            # BUGFIX: b was previously assigned v_b_new instead of b - v_b_new,
            # discarding the bias's old value on every step.
            sess.run([update_v_w1,update_v_b1,update_v_w,update_v_b,\
                      update_w1,update_b1,update_w,update_b],\
                     feed_dict={ph_v_w1:v_w1_new,ph_v_b1:v_b1_new,\
                                ph_v_w:v_w_new,ph_v_b:v_b_new,\
                                ph_w1:w1_temp-v_w1_new,ph_b1:b1_temp-v_b1_new,\
                                ph_w:w_temp-v_w_new,ph_b:b_temp-v_b_new})
        # Log summaries on the (shuffled-at-start) full training slice.
        summary=sess.run(summary_op,feed_dict=feed_dict_trainData)
        #summary=sess.run(summary_op,feed_dict=feed_dict_testData)
        logs_writer.add_summary(summary,epoch)
        print('Epoch',epoch)
        print('Accuracy_trainData:',accuracy.eval\
            (feed_dict=feed_dict_trainData))
        print('Accuracy_testData:',accuracy.eval\
            (feed_dict=feed_dict_testData))
    print('Done')
    # Sanity check on one test example: desired one-hot vs. network output.
    try_input=testData_in[0]
    try_desired=testData_out[0]
    print(try_desired)
    print(y_output.eval(feed_dict={x_input:[try_input]}))