# Backpropagation for a model is hard to implement, and sometimes contains bugs.
# Prove that your backpropagation actually works! To make sure of this, you will apply "gradient checking".
import numpy as np
from testCases_L2W1 import *
from gc_utils import sigmoid, relu, dictionary_to_vector, vector_to_dictionary, gradients_to_vector
# 1. 1-dimensional gradient checking
# Implement "forward propagation" and "backward propagation" for this simple function, i.e. compute J (forward propagation) and its derivative with respect to theta (backward propagation) in two separate functions.
def forward_propagation(x,theta):
"""
Implement the linear forward propagation (compute J) presented in Figure 1 (J(theta) = theta * x)
Arguments:
x -- a real-valued input
theta -- our parameter, a real number as well
Returns:
J -- the value of function J, computed using the formula J(theta) = theta * x
"""
J=theta*x
return J
x, theta = 2, 4
J=forward_propagation(x,theta)
print ("J = " + str(J))
# Now, carry out the backward propagation step of Figure 1 (the derivative computation), i.e. compute the derivative of J with respect to theta. To spare you the calculus: you should get dtheta = x.
def backward_propagation(x,theta):
"""
Computes the derivative of J with respect to theta (see Figure 1).
Arguments:
x -- a real-valued input
theta -- our parameter, a real number as well
Returns:
dtheta -- the gradient of the cost with respect to theta
"""
dtheta=x
return dtheta
x, theta = 2, 4
dtheta = backward_propagation(x, theta)
print ("dtheta = " + str(dtheta))
# Let's implement gradient checking.
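# Gradient checking compares the analytic gradient from backprop against a numerical estimate.
# Formula (1), the two-sided difference:  gradapprox = (J(theta + epsilon) - J(theta - epsilon)) / (2 * epsilon)
# Formula (2), the relative difference:   difference = ||grad - gradapprox||_2 / (||grad||_2 + ||gradapprox||_2)
# If the difference is small (below roughly 1e-7), the backprop gradient is very likely correct.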
def gradient_check(x,theta,epsilon=1e-7):
"""
    Implement gradient checking presented in Figure 1.
Arguments:
x -- a real-valued input
theta -- our parameter, a real number as well
epsilon -- tiny shift to the input to compute approximated gradient with formula(1)
Returns:
difference -- difference (2) between the approximated gradient and the backward propagation gradient
"""
    theta_plus = theta + epsilon                        # theta shifted up by epsilon
    theta_minus = theta - epsilon                       # theta shifted down by epsilon
    J_plus = forward_propagation(x, theta_plus)         # J(theta + epsilon)
    J_minus = forward_propagation(x, theta_minus)       # J(theta - epsilon)
    grad = backward_propagation(x, theta)               # analytic gradient from backprop
    gradapprox = (J_plus - J_minus) / (2 * epsilon)     # two-sided numerical approximation, formula (1)
difference=np.linalg.norm(gradapprox-grad)/(np.linalg.norm(grad)+np.linalg.norm(gradapprox))
if difference<1e-7:
print("The gradient is correct!")
else:
print("The gradient is wrong!")
return difference
x, theta = 2, 4
difference = gradient_check(x, theta)
print("difference = " + str(difference))
# 2. N-dimensional gradient checking
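# In N dimensions, the parameters are first flattened into one column vector theta; then, for each
# component i, J is evaluated with theta[i] shifted by +epsilon and by -epsilon, giving
#   gradapprox[i] = (J(theta + epsilon * e_i) - J(theta - epsilon * e_i)) / (2 * epsilon),
# and the full gradapprox vector is compared to the backprop gradient vector with formula (2).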
def forward_propagation_n(X, Y, parameters):
    """
    Implements the forward propagation (and computes the cost) for the 3-layer model
    LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.
    Arguments:
    X -- training set of shape (input size, number of examples)
    Y -- labels of shape (1, number of examples)
    parameters -- dictionary containing W1, b1, W2, b2, W3, b3
    Returns:
    cost -- the cross-entropy cost
    cache -- tuple of intermediate values needed by backward_propagation_n
    """
    # A generalized L-layer draft of the same computation (note np.dot rather than *,
    # and "W" + str(L) rather than "WL" for the last layer's keys):
    # m = X.shape[1]
    # L = len(parameters) // 2
    # A = X
    # for l in range(1, L):
    #     A_prev = A
    #     W = parameters["W" + str(l)]
    #     b = parameters["b" + str(l)]
    #     Z = np.dot(W, A_prev) + b
    #     A = relu(Z)
    # WL = parameters["W" + str(L)]
    # bL = parameters["b" + str(L)]
    # ZL = np.dot(WL, A) + bL
    # Y_hat = sigmoid(ZL)
    # cost = -1 / m * np.sum(np.multiply(Y, np.log(Y_hat)) + np.multiply(1 - Y, np.log(1 - Y_hat)))
m=X.shape[1]
W1 = parameters["W1"]
b1 = parameters["b1"]
W2 = parameters["W2"]
b2 = parameters["b2"]
W3 = parameters["W3"]
b3 = parameters["b3"]
Z1=np.dot(W1,X)+b1
A1=relu(Z1)
Z2=np.dot(W2,A1)+b2
A2=relu(Z2)
Z3=np.dot(W3,A2)+b3
A3=sigmoid(Z3)
# cost = -1 / m * np.sum(np.multiply(Y, np.log(A3)) + np.multiply(1 - Y, np.log(1 - A3)))
logprobs = np.multiply(-np.log(A3), Y) + np.multiply(-np.log(1 - A3), 1 - Y)
cost = 1. / m * np.sum(logprobs)
cache = (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)
return cost,cache
def backward_propagation_n(X, Y, cache):
    """
    Implements the backward propagation for the 3-layer model above.
    Arguments:
    X -- input data of shape (input size, number of examples)
    Y -- true labels
    cache -- cache output from forward_propagation_n()
    Returns:
    gradients -- dictionary with the gradients of the cost with respect to each parameter, activation and pre-activation variable
    """
m=X.shape[1]
Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3=cache
dZ3=A3-Y
dW3=1/m*np.dot(dZ3,A2.T)
db3=1/m*np.sum(dZ3,axis=1,keepdims=True)
dA2=np.dot(W3.T,dZ3)
dZ2 = np.multiply(dA2, np.int64(A2 > 0))
dW2=1/m*np.dot(dZ2,A1.T)
db2=1/m*np.sum(dZ2,axis=1,keepdims=True)
dA1=np.dot(W2.T,dZ2)
dZ1=np.multiply(dA1, np.int64(A1 > 0))
dW1=1/m*np.dot(dZ1,X.T)
db1=1/m*np.sum(dZ1,axis=1,keepdims=True)
gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3,
"dA2": dA2, "dZ2": dZ2, "dW2": dW2, "db2": db2,
"dA1": dA1, "dZ1": dZ1, "dW1": dW1, "db1": db1}
return gradients
# A function "dictionary_to_vector()" is provided for you. It converts the "parameters" dictionary into a vector called "values",
# obtained by reshaping all the parameters (W1, b1, W2, b2, W3, b3) into column vectors and concatenating them.
# The inverse function is "vector_to_dictionary", which outputs the "parameters" dictionary back.
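# A minimal sketch of what dictionary_to_vector roughly does (illustrative only; the actual
# implementation lives in gc_utils and may differ in details such as the returned key list):
def dictionary_to_vector_sketch(parameters):
    keys = []
    vectors = []
    for key in ["W1", "b1", "W2", "b2", "W3", "b3"]:
        new_vector = np.reshape(parameters[key], (-1, 1))  # flatten each parameter into a column
        keys = keys + [key] * new_vector.shape[0]
        vectors.append(new_vector)
    theta = np.concatenate(vectors, axis=0)                # stack the columns into one long vector
    return theta, keys
# vector_to_dictionary performs the inverse: it slices this long vector and reshapes each slice back
# to the original parameter shapes; gradients_to_vector applies the same flattening to the gradients
# dictionary (dW1, db1, ..., db3).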
def gradient_check_n(parameters, gradients, X, Y, epsilon=1e-7):
    """
    Checks whether backward_propagation_n computes correctly the gradient of the cost output by forward_propagation_n.
    Arguments:
    parameters -- dictionary containing W1, b1, W2, b2, W3, b3
    gradients -- output of backward_propagation_n, gradients of the cost with respect to the parameters
    X -- input data
    Y -- true labels
    epsilon -- tiny shift to the input to compute the approximated gradient with formula (1)
    Returns:
    difference -- relative difference (2) between the approximated gradient and the backward propagation gradient
    """
theta,_=dictionary_to_vector(parameters)
grad=gradients_to_vector(gradients)
num_parameters=theta.shape[0]
J_plus=np.zeros((num_parameters,1))
J_minus=np.zeros((num_parameters,1))
gradapprox=np.zeros((num_parameters,1))
    # print(num_parameters)  # theta has shape (47, 1)
for i in range(num_parameters):
        # Compute J_plus[i]. Inputs: "theta, epsilon". Output = "J_plus[i]".
        thetaplus = np.copy(theta)  # a plain copy: modifying thetaplus leaves theta unchanged
        # print(thetaplus[i])
        # thetaplus[i][0] = thetaplus[i][0] + epsilon  # official version; the [0] is optional here because the array has a single column, i.e. one number per row
        thetaplus[i] = thetaplus[i] + epsilon
J_plus[i],_=forward_propagation_n(X,Y,vector_to_dictionary(thetaplus))
        thetaminus = np.copy(theta)
        thetaminus[i][0] = thetaminus[i][0] - epsilon  # shift theta[i] down by epsilon
        J_minus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaminus))
        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)
    # print(grad.shape); print(gradapprox.shape)  # debug check: both should be (47, 1)
difference=np.linalg.norm(gradapprox-grad)/(np.linalg.norm(grad)+np.linalg.norm(gradapprox))
if difference > 1e-7:
print(
"\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
else:
print(
"\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")
return difference
X, Y, parameters = gradient_check_n_test_case()
cost, cache = forward_propagation_n(X, Y, parameters)
gradients = backward_propagation_n(X, Y, cache)
difference = gradient_check_n(parameters, gradients, X, Y)