Gradient Checking
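Gradient checking verifies the analytic gradients from backpropagation against a numerical estimate. All parameters W, b are flattened into one column vector \(\theta\); each component \(\theta_i\) is nudged by \(\pm\varepsilon\), the cost \(J\) is re-evaluated with a forward pass, and the centered difference is compared against the backprop gradient through a relative error:

\[
\text{gradapprox}_i = \frac{J(\theta_1,\dots,\theta_i+\varepsilon,\dots) - J(\theta_1,\dots,\theta_i-\varepsilon,\dots)}{2\varepsilon},
\qquad
\text{difference} = \frac{\lVert \text{grad} - \text{gradapprox} \rVert_2}{\lVert \text{grad} \rVert_2 + \lVert \text{gradapprox} \rVert_2}
\]

The implementation below uses \(\varepsilon = 10^{-7}\) and reports a failure when the difference exceeds \(10^{-5}\).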
code:
import numpy as np
import h5py
import matplotlib.pyplot as plt
from dnn_utils_v2 import *
from load_data import load_2D_dataset
class dnn:
    def __init__(self, layer_dims) -> None:
        self.WL = {}
        self.bL = {}
        self.layer_dims = layer_dims
        self.L = len(layer_dims) - 1
        np.random.seed(1)
        # initialize the parameters
        for i in range(1, self.L + 1):
            self.WL['W' + str(i)] = np.random.randn(layer_dims[i], layer_dims[i - 1]) / np.sqrt(layer_dims[i - 1])
            self.bL['b' + str(i)] = np.random.randn(layer_dims[i], 1)
        self.XL = {}
        self.AL = {}
        self.ZL = {}
        self.dZ = {}
        self.dW = {}
        self.db = {}
        self.dA = {}
        # bookkeeping for gradient checking:
        # all parameters W, b are flattened into a single column vector of shape (x, 1)
        self.theta = np.array([])
        self.keys = []
        self.gradient = []
        self.check_WL = {}
        self.check_bL = {}

    def input_data(self, X, Y, learning_rate):
        self.m = X.shape[1]
        self.AL["A0"] = X
        self.Y = Y
        self.learning_rate = learning_rate

    def set_data(self, X):
        self.m = X.shape[1]
        self.AL["A0"] = X

    # forward-propagation module; Z and A are cached during the forward pass
    def linear_activation_forward(self, i, activation):
        '''Forward propagation for a single layer.'''
        # self.ZL['Zi'] and self.AL['Ai'] cache the values of layer i;
        # W and b are already stored on the object
        self.ZL['Z' + str(i)] = np.dot(self.WL['W' + str(i)], self.AL['A' + str(i - 1)]) + self.bL['b' + str(i)]
        if activation == "sigmoid":
            self.AL['A' + str(i)] = 1 / (1 + np.exp(-self.ZL['Z' + str(i)]))
        elif activation == "relu":
            self.AL['A' + str(i)] = np.maximum(0, self.ZL['Z' + str(i)])

    # full forward pass
    def L_model_forward(self):
        # the first L-1 layers use ReLU; the last layer uses sigmoid
        for i in range(1, self.L):
            self.linear_activation_forward(i, "relu")
        self.linear_activation_forward(self.L, "sigmoid")
        # make sure the final output has the shape required for binary classification
        assert self.AL['A' + str(self.L)].shape == (1, self.m)

    # forward propagation used by the gradient check
    def linear_activation_forward_check(self, i, activation):
        '''Forward propagation for a single layer with the perturbed parameters.'''
        # identical to linear_activation_forward, except that the parameters are
        # read from check_WL/check_bL (rebuilt from the perturbed theta vector)
        self.ZL['Z' + str(i)] = np.dot(self.check_WL['W' + str(i)], self.AL['A' + str(i - 1)]) + self.check_bL['b' + str(i)]
        if activation == "sigmoid":
            self.AL['A' + str(i)] = 1 / (1 + np.exp(-self.ZL['Z' + str(i)]))
        elif activation == "relu":
            self.AL['A' + str(i)] = np.maximum(0, self.ZL['Z' + str(i)])

    # L-layer forward pass for the gradient check
    def L_model_forward_check(self):
        # the first L-1 layers use ReLU; the last layer uses sigmoid
        for i in range(1, self.L):
            self.linear_activation_forward_check(i, "relu")
        self.linear_activation_forward_check(self.L, "sigmoid")
        # make sure the final output has the shape required for binary classification
        assert self.AL['A' + str(self.L)].shape == (1, self.m)

    # cross-entropy cost
    def compute_cost(self):
        return np.squeeze(-1 / self.m * np.sum(self.Y * np.log(self.AL['A' + str(self.L)]) + (1 - self.Y) * np.log(1 - self.AL['A' + str(self.L)])))

    # backward-propagation module
    def linear_backward(self, i):
        '''Compute dW[i], db[i] and dA[i-1] from dZ[i].'''
        self.dW['dW' + str(i)] = 1 / self.m * np.dot(self.dZ['dZ' + str(i)], self.AL["A" + str(i - 1)].T)
        self.db['db' + str(i)] = 1 / self.m * np.sum(self.dZ['dZ' + str(i)], axis=1, keepdims=True)
        self.dA['dA' + str(i - 1)] = np.dot(self.WL['W' + str(i)].T, self.dZ['dZ' + str(i)])

    def L_model_backward(self):
        # last layer first; for sigmoid + cross-entropy the two commented-out
        # steps below simplify to dZ[L] = A[L] - Y:
        # self.dA['dA'+str(self.L)] = -(np.divide(self.Y, self.AL['A'+str(self.L)]) - np.divide(1-self.Y, 1-self.AL['A'+str(self.L)]))
        # s = 1/(1+np.exp(-self.ZL['Z'+str(self.L)]))
        # self.dZ['dZ'+str(self.L)] = self.dA['dA'+str(self.L)]*s*(1-s)
        self.dZ['dZ' + str(self.L)] = self.AL['A' + str(self.L)] - self.Y
        # dW, db and dA[L-1] of the last layer
        self.linear_backward(self.L)
        for i in reversed(range(1, self.L)):
            self.dZ['dZ' + str(i)] = relu_backward(self.dA['dA' + str(i)], self.ZL['Z' + str(i)])
            # dW, db and dA[i-1] of layer i
            self.linear_backward(i)

    # gradient-descent parameter update
    def update_wb(self):
        for i in range(1, self.L + 1):
            self.WL['W' + str(i)] = self.WL['W' + str(i)] - self.learning_rate * self.dW['dW' + str(i)]
            self.bL['b' + str(i)] = self.bL['b' + str(i)] - self.learning_rate * self.db['db' + str(i)]

    def train(self, iterations, per_num_grad_check=-1, epsilon=1e-7, flag=0):
        '''per_num_grad_check: run a gradient check every per_num_grad_check iterations
        flag: 0 -> no gradient checking
              1 -> gradient checking enabled'''
        costs = []
        for i in range(iterations):
            # forward propagation
            self.L_model_forward()
            # compute the cost
            cost = self.compute_cost()
            # backward propagation
            self.L_model_backward()
            if flag == 1 and i % per_num_grad_check == 0:
                self.gradient_check(epsilon)
            # update the parameters
            self.update_wb()
            if i % 1000 == 0:
                costs.append(cost)
                print("iteration " + str(i) + ": cost = " + str(cost))
        return costs

    def predict(self, X):
        self.set_data(X)
        self.L_model_forward()
        A_last = self.AL['A' + str(self.L)]
        # threshold the sigmoid output at 0.5
        ans = np.zeros(A_last.shape)
        for i in range(A_last.shape[1]):
            if A_last[0, i] > 0.5:
                ans[0, i] = 1
        return ans

    def dictionary_to_vector(self):
        # flatten the current network parameters into the column vector self.theta
        for i in range(1, self.L + 1):
            keyw = 'W' + str(i)
            keyb = 'b' + str(i)
            new_vectorw = np.reshape(self.WL[keyw], (-1, 1))
            new_vectorb = np.reshape(self.bL[keyb], (-1, 1))
            new_vectorw_b = np.concatenate((new_vectorw, new_vectorb), axis=0)
            if i == 1:
                self.theta = new_vectorw
                self.theta = np.concatenate((self.theta, new_vectorb), axis=0)
            else:
                self.theta = np.concatenate((self.theta, new_vectorw_b), axis=0)

    def gradients_to_vector(self):
        # flatten the backprop gradients into a single (x, 1) vector self.gradient
        for i in range(1, self.L + 1):
            keyw = 'dW' + str(i)
            keyb = 'db' + str(i)
            new_vectordw = np.reshape(self.dW[keyw], (-1, 1))
            new_vectordb = np.reshape(self.db[keyb], (-1, 1))
            new_vectordw_b = np.concatenate((new_vectordw, new_vectordb), axis=0)
            if i == 1:
                self.gradient = new_vectordw
                self.gradient = np.concatenate((self.gradient, new_vectordb), axis=0)
            else:
                self.gradient = np.concatenate((self.gradient, new_vectordw_b), axis=0)

    def vector_to_dictionary(self, theta):
        # rebuild the check_WL/check_bL matrices from a flat parameter vector
        offset = 0
        for i in range(1, self.L + 1):
            offset += self.layer_dims[i] * self.layer_dims[i - 1]
            self.check_WL['W' + str(i)] = theta[offset - self.layer_dims[i] * self.layer_dims[i - 1]:offset].reshape((self.layer_dims[i], self.layer_dims[i - 1]))
            offset += self.layer_dims[i]
            self.check_bL['b' + str(i)] = theta[offset - self.layer_dims[i]:offset].reshape((self.layer_dims[i], 1))

    def gradient_check(self, epsilon):
        # flatten the current parameters into self.theta
        self.dictionary_to_vector()
        # flatten the current backprop gradients into self.gradient
        self.gradients_to_vector()
        nums = self.theta.shape[0]
        J_plus = np.zeros((nums, 1))
        J_minus = np.zeros((nums, 1))
        gradapprox = np.zeros((nums, 1))
        for i in range(nums):
            thetaplus = np.copy(self.theta)
            thetaplus[i][0] = thetaplus[i][0] + epsilon
            # convert the perturbed vector back into parameter matrices
            self.vector_to_dictionary(thetaplus)
            # forward pass using the perturbed parameters thetaplus
            self.L_model_forward_check()
            J_plus[i] = self.compute_cost()
            thetaminus = np.copy(self.theta)
            thetaminus[i][0] = thetaminus[i][0] - epsilon
            self.vector_to_dictionary(thetaminus)
            self.L_model_forward_check()
            J_minus[i] = self.compute_cost()
            # centered-difference estimate of the gradient
            gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)
        numerator = np.linalg.norm(self.gradient - gradapprox)
        denominator = np.linalg.norm(self.gradient) + np.linalg.norm(gradapprox)
        difference = numerator / denominator
        if difference > 1e-5:
            print("gradient check FAILED: difference = " + str(difference))
        else:
            print("gradient check passed: difference = " + str(difference))
        return difference

# load the data
train_x, train_y, test_x, test_y = load_2D_dataset()
plt.subplot(1, 3, 1)
plt.title("dataset")
plt.scatter(train_x[0, :], train_x[1, :], c=train_y.ravel(), s=40, cmap=plt.cm.Spectral)
# a 4-layer network counting the input layer (L = 3 weight layers)
my_dnn = dnn([train_x.shape[0], 20, 3, 1])
my_dnn.input_data(train_x, train_y, 0.0075)
print("training started")
costs = my_dnn.train(20000, 5000, 1e-7, 1)
print("training finished")
# plot the cost curve
plt.subplot(1, 3, 2)
plt.title("costs")
plt.plot(costs)
# plot the decision boundary
plt.subplot(1, 3, 3)
plt.title("answer")
axes = plt.gca()
axes.set_xlim([-0.75, 0.40])
axes.set_ylim([-0.75, 0.65])
plot_decision_boundary(lambda x: my_dnn.predict(x.T), train_x, train_y)
y_predict_train = my_dnn.predict(train_x)
y_predict_test = my_dnn.predict(test_x)
# accuracy
print("training-set accuracy:")
print(str((1 - np.sum(np.abs(y_predict_train - train_y)) / train_y.shape[1]) * 100) + "%")
print("test-set accuracy:")
print(str((1 - np.sum(np.abs(y_predict_test - test_y)) / test_y.shape[1]) * 100) + "%")
plt.show()
The gradient is checked once every 5000 iterations (the per_num_grad_check=5000 argument in the train call above).
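For intuition, here is a minimal standalone sketch of the same centered-difference idea on a scalar function; the function J and its derivative dJ are made-up illustrations, not part of the network above.

code:
def grad_check_1d(J, dJ, theta, epsilon=1e-7):
    # centered-difference estimate of dJ/dtheta at theta
    gradapprox = (J(theta + epsilon) - J(theta - epsilon)) / (2 * epsilon)
    grad = dJ(theta)
    # same relative-difference criterion as gradient_check above
    return abs(grad - gradapprox) / (abs(grad) + abs(gradapprox))

# hypothetical example: J(theta) = theta**3 has dJ/dtheta = 3*theta**2
print(grad_check_1d(lambda t: t ** 3, lambda t: 3 * t ** 2, 2.0))  # tiny value, well below the 1e-5 threshold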