import numpy as np

# Forward propagation: the linear part of a single layer
def linear_forward(A, W, b):
    """
    Arguments:
    A -- activations from the previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of current layer, 1)

    Returns:
    Z -- the input of the activation function, also called the pre-activation parameter
    cache -- a python tuple containing "A", "W" and "b"; stored for computing the backward pass efficiently
    """
    ### START CODE HERE ### (≈ 1 line of code)
    Z = np.dot(W, A) + b
    ### END CODE HERE ###
    assert(Z.shape == (W.shape[0], A.shape[1]))
    cache = (A, W, b)
    return Z, cache

A, W, b = linear_forward_test_case()
Z, linear_cache = linear_forward(A, W, b)
print("Z = " + str(Z))
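# A minimal, hand-made shape check for linear_forward; the sizes below are hypothetical and
# chosen only for illustration: with 3 units in the previous layer, 2 units in the current
# layer and 4 examples, Z should come out as (2, 4).
np.random.seed(1)
A_demo = np.random.randn(3, 4)    # activations from the previous layer
W_demo = np.random.randn(2, 3)    # weights of the current layer
b_demo = np.zeros((2, 1))         # bias of the current layer
Z_demo, cache_demo = linear_forward(A_demo, W_demo, b_demo)
print("Z_demo.shape = " + str(Z_demo.shape))   # expected: (2, 4)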
# Forward propagation: LINEAR -> ACTIVATION for a single layer
def linear_activation_forward(A_prev, W, b, activation):
    """
    Arguments:
    A_prev -- activations from the previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    A -- the output of the activation function, also called the post-activation value
    cache -- a python tuple containing "linear_cache" and "activation_cache";
             stored for computing the backward pass efficiently
    """
    if activation == "sigmoid":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        ### START CODE HERE ### (≈ 2 lines of code)
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = sigmoid(Z)
        ### END CODE HERE ###
    elif activation == "relu":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        ### START CODE HERE ### (≈ 2 lines of code)
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = relu(Z)
        ### END CODE HERE ###
    assert(A.shape == (W.shape[0], A_prev.shape[1]))
    cache = (linear_cache, activation_cache)
    return A, cache

A_prev, W, b = linear_activation_forward_test_case()
A, linear_activation_cache = linear_activation_forward(A_prev, W, b, activation="sigmoid")
print("With sigmoid: A = " + str(A))
A, linear_activation_cache = linear_activation_forward(A_prev, W, b, activation="relu")
print("With ReLU: A = " + str(A))
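# linear_activation_forward relies on helper functions sigmoid(Z) and relu(Z) that return
# (A, activation_cache); in the course they are provided by dnn_utils and are not defined in
# this cell. A minimal sketch of the assumed interface (activation_cache simply holds Z):
def sigmoid(Z):
    A = 1 / (1 + np.exp(-Z))
    cache = Z                     # keep Z for the backward pass
    return A, cache

def relu(Z):
    A = np.maximum(0, Z)
    cache = Z                     # keep Z for the backward pass
    return A, cache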
# Forward propagation for the full L-layer (deep) model
def L_model_forward(X, parameters):
    """
    Implement forward propagation for the [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID computation

    Arguments:
    X -- data, numpy array of shape (input size, number of examples)
    parameters -- output of initialize_parameters_deep()

    Returns:
    AL -- last post-activation value
    caches -- list of caches containing:
              every cache of linear_activation_forward() with "relu" (there are L-1 of them, indexed from 0 to L-2)
              the cache of linear_activation_forward() with "sigmoid" (there is one, indexed L-1)
    """
    caches = []
    A = X
    L = len(parameters) // 2      # number of layers in the network (its depth)

    # Implement [LINEAR -> RELU]*(L-1). Add "cache" to the "caches" list.
    for l in range(1, L):
        A_prev = A
        ### START CODE HERE ### (≈ 2 lines of code)
        A, cache = linear_activation_forward(A_prev, parameters['W' + str(l)], parameters['b' + str(l)], activation="relu")
        caches.append(cache)
        ### END CODE HERE ###

    # Implement LINEAR -> SIGMOID. Add "cache" to the "caches" list.
    ### START CODE HERE ### (≈ 2 lines of code)
    AL, cache = linear_activation_forward(A, parameters['W' + str(L)], parameters['b' + str(L)], activation="sigmoid")
    caches.append(cache)
    ### END CODE HERE ###
    assert(AL.shape == (1, X.shape[1]))
    return AL, caches

X, parameters = L_model_forward_test_case()
AL, caches = L_model_forward(X, parameters)
print("AL = " + str(AL))
print("Length of caches list = " + str(len(caches)))
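# A small, hedged sanity check for L_model_forward: build a random 3-layer parameter set by
# hand (the real assignment uses initialize_parameters_deep) and verify the output shape and
# the number of caches. All sizes here are made up for illustration.
np.random.seed(2)
layer_dims_demo = [5, 4, 3, 1]                     # hypothetical layer sizes
params_demo = {}
for l in range(1, len(layer_dims_demo)):
    params_demo['W' + str(l)] = np.random.randn(layer_dims_demo[l], layer_dims_demo[l - 1]) * 0.01
    params_demo['b' + str(l)] = np.zeros((layer_dims_demo[l], 1))

X_demo = np.random.randn(5, 7)                     # 7 random examples
AL_demo, caches_demo = L_model_forward(X_demo, params_demo)
print(AL_demo.shape)                               # expected: (1, 7)
print(len(caches_demo))                            # expected: 3 (one cache per layer)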
# Cost function
def compute_cost(AL, Y):
    """
    Implement the cost function defined by equation (7).

    Arguments:
    AL -- probability vector corresponding to your label predictions, shape (1, number of examples)
    Y -- true "label" vector (for example: containing 0 if non-cat, 1 if cat), shape (1, number of examples)

    Returns:
    cost -- cross-entropy cost
    """
    m = Y.shape[1]

    # Compute loss from AL and Y.
    ### START CODE HERE ### (≈ 1 lines of code)
    cost = -np.sum(np.multiply(Y, np.log(AL)) + np.multiply(1 - Y, np.log(1 - AL)), axis=1, keepdims=True) / m
    ### END CODE HERE ###
    cost = np.squeeze(cost)       # To make sure your cost's shape is what we expect (e.g. this turns [[17]] into 17).
    assert(cost.shape == ())
    return cost

Y, AL = compute_cost_test_case()
print("cost = " + str(compute_cost(AL, Y)))
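# A tiny hand-checkable example for compute_cost (the values are made up for illustration):
#   cost = -(1/3) * (ln(0.8) + ln(0.9) + ln(1 - 0.4)) ≈ 0.2798
AL_demo = np.array([[0.8, 0.9, 0.4]])
Y_demo = np.array([[1, 1, 0]])
print(compute_cost(AL_demo, Y_demo))   # expected: approximately 0.2798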
# Backward propagation: the linear part of a single layer
def linear_backward(dZ, cache):
    """
    Implement the linear portion of backward propagation for a single layer (layer l).

    Arguments:
    dZ -- Gradient of the cost with respect to the linear output (of current layer l)
    cache -- tuple of values (A_prev, W, b) coming from the forward propagation in the current layer

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    A_prev, W, b = cache
    m = A_prev.shape[1]

    ### START CODE HERE ### (≈ 3 lines of code)
    dA_prev = np.dot(W.T, dZ)                      # dA_prev = W^T . dZ (no 1/m factor here)
    dW = np.dot(dZ, A_prev.T) / m                  # dW = (1/m) * dZ . A_prev^T
    db = np.sum(dZ, axis=1, keepdims=True) / m     # db = (1/m) * sum of dZ over the examples
    ### END CODE HERE ###
    assert(dA_prev.shape == A_prev.shape)
    assert(dW.shape == W.shape)
    assert(db.shape == b.shape)
    return dA_prev, dW, db

# Set up some test inputs
dZ, linear_cache = linear_backward_test_case()
dA_prev, dW, db = linear_backward(dZ, linear_cache)
print("dA_prev = " + str(dA_prev))
print("dW = " + str(dW))
print("db = " + str(db))
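# For reference, the formulas implemented above:
#   dW      = (1/m) * dZ . A_prev^T
#   db      = (1/m) * sum of dZ over the examples (axis=1)
#   dA_prev = W^T . dZ
# A quick shape check with made-up sizes (3 units in the previous layer, 2 in the current
# layer, 4 examples); each gradient must match the shape of the quantity it differentiates.
np.random.seed(3)
cache_demo = (np.random.randn(3, 4), np.random.randn(2, 3), np.random.randn(2, 1))   # (A_prev, W, b)
dZ_demo = np.random.randn(2, 4)
dA_prev_demo, dW_demo, db_demo = linear_backward(dZ_demo, cache_demo)
print(dA_prev_demo.shape, dW_demo.shape, db_demo.shape)   # expected: (3, 4) (2, 3) (2, 1)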
def linear_activation_backward(dA, cache, activation):
    """
    Implement the backward propagation for the LINEAR->ACTIVATION layer.

    Arguments:
    dA -- post-activation gradient for current layer l
    cache -- tuple of values (linear_cache, activation_cache) we store for computing backward propagation efficiently
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    linear_cache, activation_cache = cache

    if activation == "relu":
        ### START CODE HERE ### (≈ 2 lines of code)
        dZ = relu_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
        ### END CODE HERE ###
    elif activation == "sigmoid":
        ### START CODE HERE ### (≈ 2 lines of code)
        dZ = sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
        ### END CODE HERE ###
    return dA_prev, dW, db
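# linear_activation_backward relies on helpers relu_backward(dA, activation_cache) and
# sigmoid_backward(dA, activation_cache) from dnn_utils, which convert dA into dZ and are not
# defined in this cell. A minimal sketch of the assumed behaviour (activation_cache holds Z):
def relu_backward(dA, cache):
    Z = cache
    dZ = np.array(dA, copy=True)   # the gradient passes through where Z > 0
    dZ[Z <= 0] = 0                 # and is zero where the ReLU was inactive
    return dZ

def sigmoid_backward(dA, cache):
    Z = cache
    s = 1 / (1 + np.exp(-Z))
    dZ = dA * s * (1 - s)          # chain rule: d(sigmoid)/dZ = s * (1 - s)
    return dZ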
# GRADED FUNCTION: L_model_backward
def L_model_backward(AL, Y, caches):
    """
    Implement the backward propagation for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID group

    Arguments:
    AL -- probability vector, output of the forward propagation (L_model_forward())
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat)
    caches -- list of caches containing:
              every cache of linear_activation_forward() with "relu" (it's caches[l], for l in range(L-1) i.e. l = 0...L-2)
              the cache of linear_activation_forward() with "sigmoid" (it's caches[L-1])

    Returns:
    grads -- A dictionary with the gradients
             grads["dA" + str(l)] = ...
             grads["dW" + str(l)] = ...
             grads["db" + str(l)] = ...
    """
    grads = {}
    L = len(caches)                 # the number of layers
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)         # after this line, Y is the same shape as AL

    # Initializing the backpropagation
    ### START CODE HERE ### (1 line of code)
    dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))   # derivative of the cost with respect to AL
    ### END CODE HERE ###

    # Lth layer (SIGMOID -> LINEAR) gradients.
    # Inputs: "AL, Y, caches". Outputs: grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)].
    ### START CODE HERE ### (approx. 2 lines)
    current_cache = caches[L - 1]
    grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, activation="sigmoid")
    # Note: in this naming scheme the instructor renames dA^[L-1] to "dA" + str(L), i.e.
    # grads["dA" + str(L)] stores the gradient with respect to A^[L-1].
    ### END CODE HERE ###

    for l in reversed(range(L - 1)):
        # lth layer: (RELU -> LINEAR) gradients; l actually runs from L-2 down to 0
        ### START CODE HERE ### (approx. 5 lines)
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l + 2)], current_cache, activation="relu")
        grads["dA" + str(l + 1)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp
        ### END CODE HERE ###

    return grads
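# Indexing note (illustration for a hypothetical L = 3 network), following the naming scheme above:
#   grads["dW3"], grads["db3"] -- gradients of layer 3 (the sigmoid output layer)
#   grads["dA3"]               -- gradient with respect to A2, the activation feeding layer 3
#   grads["dW2"], grads["db2"], grads["dA2"] -- layer 2, where grads["dA2"] is d(cost)/d(A1)
#   grads["dW1"], grads["db1"], grads["dA1"] -- layer 1, where grads["dA1"] is d(cost)/d(A0) = d(cost)/d(X)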
def update_parameters(parameters, grads, learning_rate):
    """
    Update parameters using gradient descent

    Arguments:
    parameters -- python dictionary containing your parameters
    grads -- python dictionary containing your gradients, output of L_model_backward
    learning_rate -- the learning rate, a scalar

    Returns:
    parameters -- python dictionary containing your updated parameters
                  parameters["W" + str(l)] = ...
                  parameters["b" + str(l)] = ...
    """
    L = len(parameters) // 2        # number of layers in the neural network

    # Update rule for each parameter. Use a for loop.
    ### START CODE HERE ### (≈ 3 lines of code)
    for l in range(L):
        parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * grads["dW" + str(l + 1)]
        parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * grads["db" + str(l + 1)]
    ### END CODE HERE ###
    return parameters