Building your Deep Neural Network: Step by Step (from Andrew Ng's Deep Learning course)
import numpy as np

# GRADED FUNCTION: initialize_parameters

def initialize_parameters(n_x, n_h, n_y):
    """
    Argument:
    n_x -- size of the input layer
    n_h -- size of the hidden layer
    n_y -- size of the output layer

    Returns:
    parameters -- python dictionary containing your parameters:
                    W1 -- weight matrix of shape (n_h, n_x)
                    b1 -- bias vector of shape (n_h, 1)
                    W2 -- weight matrix of shape (n_y, n_h)
                    b2 -- bias vector of shape (n_y, 1)
    """
    np.random.seed(1)

    # Small random weights break symmetry; biases can start at zero.
    W1 = np.random.randn(n_h, n_x) * 0.01
    b1 = np.zeros((n_h, 1))
    W2 = np.random.randn(n_y, n_h) * 0.01
    b2 = np.zeros((n_y, 1))

    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2}

    return parameters
# GRADED FUNCTION: initialize_parameters_deep

def initialize_parameters_deep(layer_dims):
    """
    Arguments:
    layer_dims -- python array (list) containing the dimensions of each layer in our network

    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
                    Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
                    bl -- bias vector of shape (layer_dims[l], 1)
    """
    np.random.seed(3)
    parameters = {}
    L = len(layer_dims)  # number of layers in the network

    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * 0.01
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))

        assert parameters['W' + str(l)].shape == (layer_dims[l], layer_dims[l-1])
        assert parameters['b' + str(l)].shape == (layer_dims[l], 1)

    return parameters
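The deep initializer works the same way for an arbitrary list of layer sizes, for example (illustrative dimensions only):

parameters = initialize_parameters_deep([5, 4, 3])
print(parameters["W1"].shape, parameters["b1"].shape)  # (4, 5) (4, 1)
print(parameters["W2"].shape, parameters["b2"].shape)  # (3, 4) (3, 1)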
# GRADED FUNCTION: linear_forward

def linear_forward(A, W, b):
    """
    Implement the linear part of a layer's forward propagation.

    Arguments:
    A -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)

    Returns:
    Z -- the input of the activation function, also called pre-activation parameter
    cache -- a python tuple containing "A", "W" and "b"; stored for computing the backward pass efficiently
    """
    # Z = W.A + b, with b broadcast across the examples
    Z = np.dot(W, A) + b

    cache = (A, W, b)

    return Z, cache
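The next function relies on sigmoid and relu, which the course supplies in its dnn_utils module rather than in this notebook. A minimal sketch of what they are assumed to do (each returns the activation together with Z as the activation cache):

def sigmoid(Z):
    """Sigmoid activation; returns A and the cache Z used later by sigmoid_backward."""
    A = 1 / (1 + np.exp(-Z))
    return A, Z

def relu(Z):
    """ReLU activation; returns A and the cache Z used later by relu_backward."""
    A = np.maximum(0, Z)
    return A, Z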
# GRADED FUNCTION: linear_activation_forward

def linear_activation_forward(A_prev, W, b, activation):
    """
    Implement the forward propagation for the LINEAR->ACTIVATION layer

    Arguments:
    A_prev -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    A -- the output of the activation function, also called the post-activation value
    cache -- a python tuple containing "linear_cache" and "activation_cache";
             stored for computing the backward pass efficiently
    """
    if activation == "sigmoid":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = sigmoid(Z)

    elif activation == "relu":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = relu(Z)

    cache = (linear_cache, activation_cache)

    return A, cache
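A single LINEAR->ACTIVATION step can be sanity-checked on random data (illustrative shapes, assuming the sigmoid/relu sketches above):

A_prev = np.random.randn(3, 2)   # 3 units in the previous layer, 2 examples
W = np.random.randn(1, 3)
b = np.random.randn(1, 1)
A, cache = linear_activation_forward(A_prev, W, b, activation="sigmoid")
print(A.shape)  # (1, 2)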
# GRADED FUNCTION: L_model_forward

def L_model_forward(X, parameters):
    """
    Implement forward propagation for the [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID computation

    Arguments:
    X -- data, numpy array of shape (input size, number of examples)
    parameters -- output of initialize_parameters_deep()

    Returns:
    AL -- activation value from the output (last) layer
    caches -- list of caches containing:
              every cache of linear_activation_forward() (there are L of them, indexed from 0 to L-1)
    """
    caches = []
    A = X
    L = len(parameters) // 2  # number of layers in the neural network

    # Implement [LINEAR -> RELU]*(L-1). Add "cache" to the "caches" list.
    # The for loop starts at 1 because layer 0 is the input.
    for l in range(1, L):
        A_prev = A
        A, cache = linear_activation_forward(A_prev, parameters['W' + str(l)], parameters['b' + str(l)], activation="relu")
        caches.append(cache)

    # Implement LINEAR -> SIGMOID. Add "cache" to the "caches" list.
    AL, cache = linear_activation_forward(A, parameters['W' + str(L)], parameters['b' + str(L)], activation="sigmoid")
    caches.append(cache)

    return AL, caches
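An end-to-end forward pass on random inputs (an illustrative two-hidden-layer network, assuming the helper sketches above):

X = np.random.randn(5, 4)                           # 5 features, 4 examples
parameters = initialize_parameters_deep([5, 4, 3, 1])
AL, caches = L_model_forward(X, parameters)
print(AL.shape)     # (1, 4): one probability per example
print(len(caches))  # 3: one cache per layer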
# GRADED FUNCTION: compute_cost

def compute_cost(AL, Y):
    """
    Implement the cost function defined by equation (7).

    Arguments:
    AL -- probability vector corresponding to your label predictions, shape (1, number of examples)
    Y -- true "label" vector (for example: containing 0 if non-cat, 1 if cat), shape (1, number of examples)

    Returns:
    cost -- cross-entropy cost
    """
    m = Y.shape[1]

    # Compute the cross-entropy loss from AL and Y (vectorized over the m examples).
    cost = -1 / m * (np.dot(Y, np.log(AL).T) + np.dot(1 - Y, np.log(1 - AL).T))

    cost = np.squeeze(cost)  # To make sure your cost's shape is what we expect (e.g. this turns [[17]] into 17).

    return cost
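For reference, equation (7) is the cross-entropy cost

$$J = -\frac{1}{m}\sum_{i=1}^{m}\Big(y^{(i)}\log a^{[L](i)} + (1-y^{(i)})\log\big(1-a^{[L](i)}\big)\Big),$$

which the dot products above compute in vectorized form. A tiny worked check (the numbers are illustrative):

Y = np.array([[1, 0]])
AL = np.array([[0.9, 0.2]])
print(compute_cost(AL, Y))  # ≈ 0.164, i.e. -(log(0.9) + log(0.8)) / 2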
# GRADED FUNCTION: linear_backward

def linear_backward(dZ, cache):
    """
    Implement the linear portion of backward propagation for a single layer (layer l)

    Arguments:
    dZ -- Gradient of the cost with respect to the linear output (of current layer l)
    cache -- tuple of values (A_prev, W, b) coming from the forward propagation in the current layer

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    A_prev, W, b = cache
    m = A_prev.shape[1]

    dW = 1 / m * np.dot(dZ, A_prev.T)
    db = 1 / m * np.sum(dZ, axis=1, keepdims=True)  # sum dZ over the examples, keeping dimensions
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db
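The three lines above implement the standard gradients of the linear step $Z^{[l]} = W^{[l]} A^{[l-1]} + b^{[l]}$:

$$dW^{[l]} = \frac{1}{m}\, dZ^{[l]} A^{[l-1]T}, \qquad db^{[l]} = \frac{1}{m}\sum_{i=1}^{m} dZ^{[l](i)}, \qquad dA^{[l-1]} = W^{[l]T} dZ^{[l]}.$$

The next function also needs relu_backward and sigmoid_backward, which the course ships in dnn_utils. A minimal sketch of their assumed behaviour (each converts dA into dZ using the Z stored in the activation cache):

def relu_backward(dA, activation_cache):
    """dZ equals dA where Z > 0 and 0 elsewhere."""
    Z = activation_cache
    dZ = np.array(dA, copy=True)
    dZ[Z <= 0] = 0
    return dZ

def sigmoid_backward(dA, activation_cache):
    """dZ = dA * s * (1 - s), with s = sigmoid(Z)."""
    Z = activation_cache
    s = 1 / (1 + np.exp(-Z))
    return dA * s * (1 - s)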
# GRADED FUNCTION: linear_activation_backward

def linear_activation_backward(dA, cache, activation):
    """
    Implement the backward propagation for the LINEAR->ACTIVATION layer.

    Arguments:
    dA -- post-activation gradient for current layer l
    cache -- tuple of values (linear_cache, activation_cache) we store for computing backward propagation efficiently
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    linear_cache, activation_cache = cache

    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)

    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)

    return dA_prev, dW, db
# GRADED FUNCTION: L_model_backward

def L_model_backward(AL, Y, caches):
    """
    Implement the backward propagation for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID group

    Arguments:
    AL -- probability vector, output of the forward propagation (L_model_forward())
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat)
    caches -- list of caches containing:
              every cache of linear_activation_forward() with "relu" (it's caches[l], for l in range(L-1) i.e. l = 0...L-2)
              the cache of linear_activation_forward() with "sigmoid" (it's caches[L-1])

    Returns:
    grads -- A dictionary with the gradients
             grads["dA" + str(l)] = ...
             grads["dW" + str(l)] = ...
             grads["db" + str(l)] = ...
    """
    grads = {}
    L = len(caches)  # the number of layers
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)  # after this line, Y is the same shape as AL

    # Initializing the backpropagation: derivative of the cross-entropy cost with respect to AL
    dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

    # Lth layer (SIGMOID -> LINEAR) gradients.
    # Inputs: "dAL, current_cache". Outputs: "grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)]".
    current_cache = caches[L - 1]
    dA_prev_temp, dW_temp, db_temp = linear_activation_backward(dAL, current_cache, activation="sigmoid")
    grads["dA" + str(L - 1)] = dA_prev_temp
    grads["dW" + str(L)] = dW_temp
    grads["db" + str(L)] = db_temp

    # Loop from l = L-2 down to l = 0
    for l in reversed(range(L - 1)):
        # lth layer: (RELU -> LINEAR) gradients.
        # Inputs: "grads["dA" + str(l + 1)], current_cache".
        # Outputs: "grads["dA" + str(l)], grads["dW" + str(l + 1)], grads["db" + str(l + 1)]".
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l + 1)], current_cache, activation="relu")
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp

    return grads
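Continuing the forward-pass example from above, the backward pass returns one dW/db per layer plus the dA of each earlier layer, with keys following the convention in the docstring (Y here is an illustrative label vector matching AL's shape):

Y = np.array([[1, 0, 1, 0]])
grads = L_model_backward(AL, Y, caches)
print(sorted(grads.keys()))
# ['dA0', 'dA1', 'dA2', 'dW1', 'dW2', 'dW3', 'db1', 'db2', 'db3']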
# GRADED FUNCTION: update_parameters

def update_parameters(params, grads, learning_rate):
    """
    Update parameters using gradient descent

    Arguments:
    params -- python dictionary containing your parameters
    grads -- python dictionary containing your gradients, output of L_model_backward
    learning_rate -- the learning rate, a scalar

    Returns:
    parameters -- python dictionary containing your updated parameters
                  parameters["W" + str(l)] = ...
                  parameters["b" + str(l)] = ...
    """
    parameters = params.copy()
    L = len(parameters) // 2  # number of layers in the neural network

    # Update rule for each parameter: theta = theta - learning_rate * dtheta
    for l in range(L):
        parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * grads["dW" + str(l + 1)]
        parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * grads["db" + str(l + 1)]

    return parameters