Deep Learning (DeepLearning.ai) Course Notes 10: Initialization, Regularization and Gradient Checking in Practice

Images were lost and the code formatting was garbled when this post was reposted.

For a better reading experience, please visit the original version:

http://www.missshi.cn/api/view/blog/59bbcacae519f50d04000202

PS: The first visit may take a while (about 8 s) because a large JS file has to load.

 

In this post, we will study initialization, regularization and gradient checking for neural networks through hands-on code.

 

Initialization

The first step of training a model is to give the model parameters initial values.

A good initialization helps the whole training process: it speeds up the convergence of gradient descent, and it makes it more likely that the error decreases to a lower value.

Let's learn this through practice:

First, import the required libraries:


  
  
  1. import numpy as np
  2. import matplotlib.pyplot as plt
  3. import sklearn
  4. import sklearn.datasets
  5. from init_utils import sigmoid, relu, compute_loss, forward_propagation, backward_propagation
  6. from init_utils import update_parameters, predict, load_dataset, plot_decision_boundary, predict_dec
  7.  
  8. %matplotlib inline
  9. plt.rcParams['figure.figsize'] = (7.0, 4.0) # set default size of plots
  10. plt.rcParams['image.interpolation'] = 'nearest'
  11. plt.rcParams['image.cmap'] = 'gray'
  12.  
  13. # load image dataset: blue/red dots in circles
  14. train_X, train_Y, test_X, test_Y = load_dataset()

Some of the helper functions (from init_utils) are implemented as follows:


  
  
  1. def sigmoid(x):
  2.     """
  3.     Compute the sigmoid of x
  4.  
  5.     Arguments:
  6.     x -- A scalar or numpy array of any size.
  7.  
  8.     Return:
  9.     s -- sigmoid(x)
  10.     """
  11.     s = 1/(1+np.exp(-x))
  12.     return s
  13.  
  14. def relu(x):
  15.     """
  16.     Compute the relu of x
  17.  
  18.     Arguments:
  19.     x -- A scalar or numpy array of any size.
  20.  
  21.     Return:
  22.     s -- relu(x)
  23.     """
  24.     s = np.maximum(0,x)
  25.     
  26.     return s
  27.     
  28. def compute_loss(a3, Y):
  29.     
  30.     """
  31.     Implement the loss function
  32.     
  33.     Arguments:
  34.     a3 -- post-activation, output of forward propagation
  35.     Y -- "true" labels vector, same shape as a3
  36.     
  37.     Returns:
  38.     loss - value of the loss function
  39.     """
  40.     
  41.     m = Y.shape[1]
  42.     logprobs = np.multiply(-np.log(a3),Y) + np.multiply(-np.log(1 - a3), 1 - Y)
  43.     loss = 1./m * np.nansum(logprobs)
  44.     
  45.     return loss
  46.     
  47. def forward_propagation(X, parameters):
  48.     """
  49.     Implements the forward propagation (and computes the loss) presented in Figure 2.
  50.     
  51.     Arguments:
  52.     X -- input dataset, of shape (input size, number of examples)
  53.     Y -- true "label" vector (containing 0 if cat, 1 if non-cat)
  54.     parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3":
  55.                     W1 -- weight matrix of shape ()
  56.                     b1 -- bias vector of shape ()
  57.                     W2 -- weight matrix of shape ()
  58.                     b2 -- bias vector of shape ()
  59.                     W3 -- weight matrix of shape ()
  60.                     b3 -- bias vector of shape ()
  61.     
  62.     Returns:
  63.     loss -- the loss function (vanilla logistic loss)
  64.     """
  65.         
  66.     # retrieve parameters
  67.     W1 = parameters["W1"]
  68.     b1 = parameters["b1"]
  69.     W2 = parameters["W2"]
  70.     b2 = parameters["b2"]
  71.     W3 = parameters["W3"]
  72.     b3 = parameters["b3"]
  73.     
  74.     # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
  75.     z1 = np.dot(W1, X) + b1
  76.     a1 = relu(z1)
  77.     z2 = np.dot(W2, a1) + b2
  78.     a2 = relu(z2)
  79.     z3 = np.dot(W3, a2) + b3
  80.     a3 = sigmoid(z3)
  81.     
  82.     cache = (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3)
  83.     
  84.     return a3, cache
  85.  
  86. def backward_propagation(X, Y, cache):
  87.     """
  88.     Implement the backward propagation presented in figure 2.
  89.     
  90.     Arguments:
  91.     X -- input dataset, of shape (input size, number of examples)
  92.     Y -- true "label" vector (containing 0 if cat, 1 if non-cat)
  93.     cache -- cache output from forward_propagation()
  94.     
  95.     Returns:
  96.     gradients -- A dictionary with the gradients with respect to each parameter, activation and pre-activation variables
  97.     """
  98.     m = X.shape[1]
  99.     (z1, a1, W1, b1, z2, a2, W2, b2, z3, a3, W3, b3) = cache
  100.     
  101.     dz3 = 1./m * (a3 - Y)
  102.     dW3 = np.dot(dz3, a2.T)
  103.     db3 = np.sum(dz3, axis=1, keepdims = True)
  104.     
  105.     da2 = np.dot(W3.T, dz3)
  106.     dz2 = np.multiply(da2, np.int64(a2 > 0))
  107.     dW2 = np.dot(dz2, a1.T)
  108.     db2 = np.sum(dz2, axis=1, keepdims = True)
  109.     
  110.     da1 = np.dot(W2.T, dz2)
  111.     dz1 = np.multiply(da1, np.int64(a1 > 0))
  112.     dW1 = np.dot(dz1, X.T)
  113.     db1 = np.sum(dz1, axis=1, keepdims = True)
  114.     
  115.     gradients = {"dz3": dz3, "dW3": dW3, "db3": db3,
  116.                  "da2": da2, "dz2": dz2, "dW2": dW2, "db2": db2,
  117.                  "da1": da1, "dz1": dz1, "dW1": dW1, "db1": db1}
  118.     
  119.     return gradients
  120.  
  121. def update_parameters(parameters, grads, learning_rate):
  122.     """
  123.     Update parameters using gradient descent
  124.     
  125.     Arguments:
  126.     parameters -- python dictionary containing your parameters 
  127.     grads -- python dictionary containing your gradients, output of n_model_backward
  128.     
  129.     Returns:
  130.     parameters -- python dictionary containing your updated parameters 
  131.                   parameters['W' + str(i)] = ... 
  132.                   parameters['b' + str(i)] = ...
  133.     """
  134.     
  135.     L = len(parameters) // 2 # number of layers in the neural networks
  136.  
  137.     # Update rule for each parameter
  138.     for k in range(L):
  139.         parameters["W" + str(k+1)] = parameters["W" + str(k+1)] - learning_rate * grads["dW" + str(k+1)]
  140.         parameters["b" + str(k+1)] = parameters["b" + str(k+1)] - learning_rate * grads["db" + str(k+1)]
  141.         
  142.     return parameters
  143.     
  144. def predict(X, y, parameters):
  145.     """
  146.     This function is used to predict the results of a  n-layer neural network.
  147.     
  148.     Arguments:
  149.     X -- data set of examples you would like to label
  150.     parameters -- parameters of the trained model
  151.     
  152.     Returns:
  153.     p -- predictions for the given dataset X
  154.     """
  155.     
  156.     m = X.shape[1]
  157.     p = np.zeros((1,m), dtype = int)
  158.     
  159.     # Forward propagation
  160.     a3, caches = forward_propagation(X, parameters)
  161.     
  162.     # convert probas to 0/1 predictions
  163.     for i in range(0, a3.shape[1]):
  164.         if a3[0,i] > 0.5:
  165.             p[0,i] = 1
  166.         else:
  167.             p[0,i] = 0
  168.  
  169.     # print results
  170.     print("Accuracy: "  + str(np.mean((p[0,:] == y[0,:]))))
  171.     
  172.     return p
  173.     
  174. def load_dataset():
  175.     np.random.seed(1)
  176.     train_X, train_Y = sklearn.datasets.make_circles(n_samples=300, noise=.05)
  177.     np.random.seed(2)
  178.     test_X, test_Y = sklearn.datasets.make_circles(n_samples=100, noise=.05)
  179.     # Visualize the data
  180.     plt.scatter(train_X[:, 0], train_X[:, 1], c=train_Y, s=40, cmap=plt.cm.Spectral);
  181.     train_X = train_X.T
  182.     train_Y = train_Y.reshape((1, train_Y.shape[0]))
  183.     test_X = test_X.T
  184.     test_Y = test_Y.reshape((1, test_Y.shape[0]))
  185.     return train_X, train_Y, test_X, test_Y
  186.  
  187. def plot_decision_boundary(model, X, y):
  188.     # Set min and max values and give it some padding
  189.     x_min, x_max = X[0, :].min() - 1, X[0, :].max() + 1
  190.     y_min, y_max = X[1, :].min() - 1, X[1, :].max() + 1
  191.     h = 0.01
  192.     # Generate a grid of points with distance h between them
  193.     xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
  194.     # Predict the function value for the whole grid
  195.     Z = model(np.c_[xx.ravel(), yy.ravel()])
  196.     Z = Z.reshape(xx.shape)
  197.     # Plot the contour and training examples
  198.     plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
  199.     plt.ylabel('x2')
  200.     plt.xlabel('x1')
  201.     plt.scatter(X[0, :], X[1, :], c=y, cmap=plt.cm.Spectral)
  202.     plt.show()
  203.  
  204. def predict_dec(parameters, X):
  205.     """
  206.     Used for plotting decision boundary.
  207.     
  208.     Arguments:
  209.     parameters -- python dictionary containing your parameters 
  210.     X -- input data of size (m, K)
  211.     
  212.     Returns
  213.     predictions -- vector of predictions of our model (red: 0 / blue: 1)
  214.     """
  215.     
  216.     # Predict using forward propagation and a classification threshold of 0.5
  217.     a3, cache = forward_propagation(X, parameters)
  218.     predictions = (a3>0.5)
  219.     return predictions

 

Our task is to separate the blue points from the red points in the figure above.

To do this, we will use a three-layer neural network to classify the dataset.

The model is as follows:


  
  
  1. def model(X, Y, learning_rate = 0.01, num_iterations = 15000, print_cost = True, initialization = "he"):
  2.     """
  3.     Implements a three-layer neural network: LINEAR->RELU->LINEAR->RELU->LINEAR->SIGMOID.
  4.     
  5.     Arguments:
  6.     X -- input data, of shape (2, number of examples)
  7.     Y -- true "label" vector (containing 0 for red dots; 1 for blue dots), of shape (1, number of examples)
  8.     learning_rate -- learning rate for gradient descent 
  9.     num_iterations -- number of iterations to run gradient descent
  10.     print_cost -- if True, print the cost every 1000 iterations
  11.     initialization -- flag to choose which initialization to use ("zeros","random" or "he")
  12.     
  13.     Returns:
  14.     parameters -- parameters learnt by the model
  15.     """
  16.         
  17.     grads = {}
  18.     costs = [] # to keep track of the loss
  19.     m = X.shape[1] # number of examples
  20.     layers_dims = [X.shape[0], 10, 5, 1]
  21.     
  22.     # Initialize parameters dictionary.
  23.     if initialization == "zeros":
  24.         parameters = initialize_parameters_zeros(layers_dims)
  25.     elif initialization == "random":
  26.         parameters = initialize_parameters_random(layers_dims)
  27.     elif initialization == "he":
  28.         parameters = initialize_parameters_he(layers_dims)
  29.  
  30.     # Loop (gradient descent)
  31.  
  32.     for i in range(0, num_iterations):
  33.  
  34.         # Forward propagation: LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.
  35.         a3, cache = forward_propagation(X, parameters)
  36.         
  37.         # Loss
  38.         cost = compute_loss(a3, Y)
  39.  
  40.         # Backward propagation.
  41.         grads = backward_propagation(X, Y, cache)
  42.         
  43.         # Update parameters.
  44.         parameters = update_parameters(parameters, grads, learning_rate)
  45.         
  46.         # Print the loss every 1000 iterations
  47.         if print_cost and i % 1000 == 0:
  48.             print("Cost after iteration {}: {}".format(i, cost))
  49.             costs.append(cost)
  50.             
  51.     # plot the loss
  52.     plt.plot(costs)
  53.     plt.ylabel('cost')
  54.     plt.xlabel('iterations (per thousands)')
  55.     plt.title("Learning rate =" + str(learning_rate))
  56.     plt.show()
  57.     
  58.     return parameters

This model supports three different initialization methods:

1. Zero initialization

2. Random initialization

3. He initialization

The three initialization methods are implemented as follows:

Zero initialization:


  
  
  1. def initialize_parameters_zeros(layers_dims):
  2.     """
  3.     Arguments:
  4.     layer_dims -- python array (list) containing the size of each layer.
  5.     
  6.     Returns:
  7.     parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
  8.                     W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
  9.                     b1 -- bias vector of shape (layers_dims[1], 1)
  10.                     ...
  11.                     WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
  12.                     bL -- bias vector of shape (layers_dims[L], 1)
  13.     """
  14.     
  15.     parameters = {}
  16.     L = len(layers_dims)            # number of layers in the network
  17.     
  18.     for l in range(1, L):
  19.         ### START CODE HERE ### (≈ 2 lines of code)
  20.         parameters['W' + str(l)] = np.zeros((layers_dims[l], layers_dims[l-1]))
  21.         parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
  22.         ### END CODE HERE ###
  23.     return parameters

Let's train it:


  
  
  1. parameters = model(train_X, train_Y, initialization = "zeros")
  2. print ("On the train set:")
  3. predictions_train = predict(train_X, train_Y, parameters)
  4. print ("On the test set:")
  5. predictions_test = predict(test_X, test_Y, parameters)

Looking at the results, it is easy to see that with zero initialization the training is completely ineffective: the cost never decreases.

The predictions on the training and test sets are as follows:


  
  
  1. print ("predictions_train = " + str(predictions_train))
  2. print ("predictions_test = " + str(predictions_test))
  3. plt.title("Model with Zeros initialization")
  4. axes = plt.gca()
  5. axes.set_xlim([-1.5,1.5])
  6. axes.set_ylim([-1.5,1.5])
  7. plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

From the results and the plot, we can see that every sample is predicted as 0.

In general, zero initialization fails to break symmetry: all units in a layer compute the same output and receive the same gradient, so no matter how many layers the network has, it ends up no more expressive than logistic regression.
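
Below is a minimal sketch (plain NumPy, separate from the assignment code above) of why all-zero parameters cannot learn: with zero weights and ReLU units, every hidden unit produces the same activation and every gradient is exactly zero, so gradient descent never moves the parameters.

import numpy as np

# A tiny 3 -> 4 -> 1 network with all parameters initialized to zero
np.random.seed(0)
X = np.random.randn(3, 5)                        # 5 training examples
Y = (np.random.rand(1, 5) > 0.5).astype(float)

W1, b1 = np.zeros((4, 3)), np.zeros((4, 1))
W2, b2 = np.zeros((1, 4)), np.zeros((1, 1))

Z1 = W1 @ X + b1                                 # all zeros
A1 = np.maximum(0, Z1)                           # every hidden unit outputs the same value (0)
A2 = 1 / (1 + np.exp(-(W2 @ A1 + b2)))           # output is 0.5 for every example

dZ2 = A2 - Y
dW2 = dZ2 @ A1.T / X.shape[1]                    # zero, because A1 is zero
dZ1 = (W2.T @ dZ2) * (Z1 > 0)                    # zero, because W2 is zero (and every ReLU is off)
dW1 = dZ1 @ X.T / X.shape[1]

print(np.allclose(dW1, 0), np.allclose(dW2, 0))  # True True -> the weights never move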

 

Random initialization:

From the analysis above, we have already seen that zero initialization cannot give a good result. Next, let's try random initialization and see how it does:


  
  
  1. def initialize_parameters_random(layers_dims):
  2.     """
  3.     Arguments:
  4.     layer_dims -- python array (list) containing the size of each layer.
  5.     
  6.     Returns:
  7.     parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
  8.                     W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
  9.                     b1 -- bias vector of shape (layers_dims[1], 1)
  10.                     ...
  11.                     WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
  12.                     bL -- bias vector of shape (layers_dims[L], 1)
  13.     """
  14.     
  15.     np.random.seed(3)               # This seed makes sure your "random" numbers will be the same as ours
  16.     parameters = {}
  17.     L = len(layers_dims)            # integer representing the number of layers
  18.     
  19.     for l in range(1, L):
  20.         ### START CODE HERE ### (≈ 2 lines of code)
  21.         parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l-1]) * 10
  22.         parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
  23.         ### END CODE HERE ###
  24.  
  25.     return parameters

Let's train it:


  
  
  1. parameters = model(train_X, train_Y, initialization = "random")
  2. print ("On the train set:")
  3. predictions_train = predict(train_X, train_Y, parameters)
  4. print ("On the test set:")
  5. predictions_test = predict(test_X, test_Y, parameters)

From the results above we can see that training now has an effect: the predictions are no longer all zero!


  
  
  1. print (predictions_train)
  2. print (predictions_test)
  3. plt.title("Model with large random initialization")
  4. axes = plt.gca()
  5. axes.set_xlim([-1.5,1.5])
  6. axes.set_ylim([-1.5,1.5])
  7. plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

From this experiment we can observe the following (a small sketch follows the list):

1. The initial cost is very high. The large initial weights push the sigmoid output very close to 0 or 1, and whenever such a confident prediction is wrong, the log-loss term becomes huge.

2. A poorly chosen initialization can cause vanishing or exploding gradients, which slows down the optimization.

3. With more iterations the separation still improves somewhat, but even larger initial values would degrade performance further.
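
A minimal sketch of this effect, under the assumption of a small stack of width-10 ReLU layers with random input (the helper name forward_scale is just for this illustration): with the *10 scaling used above, the average activation magnitude grows by orders of magnitude per layer, while He scaling keeps it roughly stable.

import numpy as np

np.random.seed(0)
layer_sizes = [10] * 6          # input plus 5 ReLU layers, all of width 10

def forward_scale(scale_fn):
    """Propagate a random input and report the mean |activation| after each layer."""
    a = np.random.randn(layer_sizes[0], 100)
    norms = []
    for l in range(1, len(layer_sizes)):
        W = np.random.randn(layer_sizes[l], layer_sizes[l-1]) * scale_fn(layer_sizes[l-1])
        a = np.maximum(0, W @ a)
        norms.append(round(float(np.mean(np.abs(a))), 3))
    return norms

print("scale *10 :", forward_scale(lambda n_prev: 10))                    # magnitudes explode layer by layer
print("He scaling:", forward_scale(lambda n_prev: np.sqrt(2. / n_prev)))  # magnitudes stay on the same order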

 

He initialization:

He initialization is very similar to Xavier initialization.

In Xavier initialization, the weights of layer l are scaled by sqrt(1 / layers_dims[l-1]).

In He initialization, the scaling factor is sqrt(2 / layers_dims[l-1]), which works well with ReLU activations.


  
  
  1. def initialize_parameters_he(layers_dims):
  2.     """
  3.     Arguments:
  4.     layer_dims -- python array (list) containing the size of each layer.
  5.     
  6.     Returns:
  7.     parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
  8.                     W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
  9.                     b1 -- bias vector of shape (layers_dims[1], 1)
  10.                     ...
  11.                     WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
  12.                     bL -- bias vector of shape (layers_dims[L], 1)
  13.     """
  14.     
  15.     np.random.seed(3)
  16.     parameters = {}
  17.     L = len(layers_dims) - 1 # integer representing the number of layers
  18.      
  19.     for l in range(1, L + 1):
  20.         ### START CODE HERE ### (≈ 2 lines of code)
  21.         parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l-1]) * np.sqrt(2./layers_dims[l-1])
  22.         parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
  23.         ### END CODE HERE ###
  24.         
  25.     return parameters

Let's train it:


  
  
  1. parameters = model(train_X, train_Y, initialization = "he")
  2. print ("On the train set:")
  3. predictions_train = predict(train_X, train_Y, parameters)
  4. print ("On the test set:")
  5. predictions_test = predict(test_X, test_Y, parameters)

 


  
  
  1. plt.title("Model with He initialization")
  2. axes = plt.gca()
  3. axes.set_xlim([-1.5,1.5])
  4. axes.set_ylim([-1.5,1.5])
  5. plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

As the plot shows, He initialization gives a fairly good classification result.

 

In the exercises above we tried three different initialization methods with the same hyperparameters and number of iterations. He initialization clearly works best: zero initialization never learns at all, large random initialization learns but poorly, and He initialization separates the two classes well.

To summarize:

1. Different initialization methods can lead to very different final performance.

2. Random initialization helps break symmetry, so that units in the same hidden layer can learn different functions.

3. The initial values should not be too large.

4. He initialization together with ReLU activations usually works well.

 

Regularization

In deep learning, if the dataset is not large enough, the model may overfit.

Overfitting means very high accuracy on the training set but a severe drop in accuracy on new samples.

The technique we introduce next to avoid overfitting is regularization.

First, import the required libraries:


  
  
  1. import numpy as np
  2. import matplotlib.pyplot as plt
  3. from reg_utils import sigmoid, relu, plot_decision_boundary, initialize_parameters, load_2D_dataset, predict_dec
  4. from reg_utils import compute_cost, predict, forward_propagation, backward_propagation, update_parameters
  5. import sklearn
  6. import sklearn.datasets
  7. import scipy.io
  8. from testCases import *
  9.  
  10. %matplotlib inline
  11. plt.rcParams['figure.figsize'] = (7.0, 4.0) # set default size of plots
  12. plt.rcParams['image.interpolation'] = 'nearest'
  13. plt.rcParams['image.cmap'] = 'gray'

Some of the functions from these utilities (reg_utils) are implemented as follows:


  
  
  1. def initialize_parameters(layer_dims):
  2.     """
  3.     Arguments:
  4.     layer_dims -- python array (list) containing the dimensions of each layer in our network
  5.     
  6.     Returns:
  7.     parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
  8.                     W1 -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
  9.                     b1 -- bias vector of shape (layer_dims[l], 1)
  10.                     Wl -- weight matrix of shape (layer_dims[l-1], layer_dims[l])
  11.                     bl -- bias vector of shape (1, layer_dims[l])
  12.                     
  13.     Tips:
  14.     - For example: the layer_dims for the "Planar Data classification model" would have been [2,2,1]. 
  15.     This means W1's shape was (2,2), b1 was (1,2), W2 was (2,1) and b2 was (1,1). Now you have to generalize it!
  16.     - In the for loop, use parameters['W' + str(l)] to access Wl, where l is the iterative integer.
  17.     """
  18.     
  19.     np.random.seed(3)
  20.     parameters = {}
  21.     L = len(layer_dims) # number of layers in the network
  22.  
  23.     for l in range(1, L):
  24.         parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) / np.sqrt(layer_dims[l-1])
  25.         parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
  26.         
  27.         assert(parameters['W' + str(l)].shape == (layer_dims[l], layer_dims[l-1]))
  28.         assert(parameters['b' + str(l)].shape == (layer_dims[l], 1))
  29.  
  30.         
  31.     return parameters
  32.  
  33. def compute_cost(a3, Y):
  34.     """
  35.     Implement the cost function
  36.     
  37.     Arguments:
  38.     a3 -- post-activation, output of forward propagation
  39.     Y -- "true" labels vector, same shape as a3
  40.     
  41.     Returns:
  42.     cost - value of the cost function
  43.     """
  44.     m = Y.shape[1]
  45.     
  46.     logprobs = np.multiply(-np.log(a3),Y) + np.multiply(-np.log(1 - a3), 1 - Y)
  47.     cost = 1./m * np.nansum(logprobs)
  48.     
  49.     return cost
  50.     
  51. def load_2D_dataset():
  52.     data = scipy.io.loadmat('datasets/data.mat')
  53.     train_X = data['X'].T
  54.     train_Y = data['y'].T
  55.     test_X = data['Xval'].T
  56.     test_Y = data['yval'].T
  57.  
  58.     plt.scatter(train_X[0, :], train_X[1, :], c=train_Y, s=40, cmap=plt.cm.Spectral);
  59.     
  60.     return train_X, train_Y, test_X, test_Y

PS: The data.mat file used by load_2D_dataset() can be downloaded from http://www.missshi.cn/#/books by searching for data.mat (the first visit may take about 10 s while the JS loads).


Problem statement:

Suppose you are an AI expert and need to design a model that recommends the positions to which the goalkeeper should kick the ball so that players from your own team are most likely to win it.


  
  
  1. train_X, train_Y, test_X, test_Y = load_2D_dataset()

In the figure above, each point corresponds to a position where the ball landed.

A blue point means a player from our team won the ball; a red point means a player from the other team won it.

Our goal is to build a model that finds the positions where our players are likely to win the ball.

The model is as follows:


  
  
def model(X, Y, learning_rate = 0.3, num_iterations = 30000, print_cost = True, lambd = 0, keep_prob = 1):
    """
    Implements a three-layer neural network: LINEAR->RELU->LINEAR->RELU->LINEAR->SIGMOID.

    Arguments:
    X -- input data, of shape (input size, number of examples)
    Y -- true "label" vector (1 for blue dot / 0 for red dot), of shape (output size, number of examples)
    learning_rate -- learning rate of the optimization
    num_iterations -- number of iterations of the optimization loop
    print_cost -- If True, print the cost every 10000 iterations
    lambd -- regularization hyperparameter, scalar
    keep_prob - probability of keeping a neuron active during drop-out, scalar.

    Returns:
    parameters -- parameters learned by the model. They can then be used to predict.
    """

    grads = {}
    costs = []                            # to keep track of the cost
    m = X.shape[1]                        # number of examples
    layers_dims = [X.shape[0], 20, 3, 1]

    # Initialize parameters dictionary.
    parameters = initialize_parameters(layers_dims)

    # Loop (gradient descent)
    for i in range(0, num_iterations):

        # Forward propagation: LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID.
        if keep_prob == 1:
            a3, cache = forward_propagation(X, parameters)
        elif keep_prob < 1:
            a3, cache = forward_propagation_with_dropout(X, parameters, keep_prob)

        # Cost function
        if lambd == 0:
            cost = compute_cost(a3, Y)
        else:
            cost = compute_cost_with_regularization(a3, Y, parameters, lambd)

        # Backward propagation.
        assert(lambd == 0 or keep_prob == 1)    # it is possible to use both L2 regularization and dropout,
                                                # but this assignment will only explore one at a time
        if lambd == 0 and keep_prob == 1:
            grads = backward_propagation(X, Y, cache)
        elif lambd != 0:
            grads = backward_propagation_with_regularization(X, Y, cache, lambd)
        elif keep_prob < 1:
            grads = backward_propagation_with_dropout(X, Y, cache, keep_prob)

        # Update parameters.
        parameters = update_parameters(parameters, grads, learning_rate)

        # Print the loss every 10000 iterations
        if print_cost and i % 10000 == 0:
            print("Cost after iteration {}: {}".format(i, cost))
        if print_cost and i % 1000 == 0:
            costs.append(cost)

    # plot the cost
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('iterations (x1,000)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    return parameters

First, let's train the model without any regularization:


  
  
  1. parameters = model(train_X, train_Y)
  2. print ("On the training set:")
  3. predictions_train = predict(train_X, train_Y, parameters)
  4. print ("On the test set:")
  5. predictions_test = predict(test_X, test_Y, parameters)

 

We can see that the accuracy is 94% on the training set and 91.5% on the test set.

Next, let's plot the decision boundary:


  
  
  1. plt.title("Model without regularization")
  2. axes = plt.gca()
  3. axes.set_xlim([-0.75,0.40])
  4. axes.set_ylim([-0.75,0.65])
  5. plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

The plot shows that, without regularization, the decision boundary clearly overfits the training data: it even wraps around the noisy points.

Next, we add L2 regularization.
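
For reference, the regularized cost computed below (formula (2) of the original assignment) adds an L2 penalty on the weights to the cross-entropy cost:

J_{regularized} = \underbrace{-\frac{1}{m}\sum_{i=1}^{m}\Big( y^{(i)}\log a^{[3](i)} + (1-y^{(i)})\log\big(1-a^{[3](i)}\big)\Big)}_{\text{cross-entropy cost}} + \underbrace{\frac{\lambda}{2m}\sum_{l}\sum_{k}\sum_{j}\big(W^{[l]}_{k,j}\big)^{2}}_{\text{L2 regularization cost}}

Its gradient simply adds (lambda/m) * W^[l] to each dW^[l], which is exactly the extra term that appears in backward_propagation_with_regularization. The relevant functions are implemented as follows: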


  
  
def compute_cost_with_regularization(A3, Y, parameters, lambd):
    """
    Implement the cost function with L2 regularization. See formula (2) above.

    Arguments:
    A3 -- post-activation, output of forward propagation, of shape (output size, number of examples)
    Y -- "true" labels vector, of shape (output size, number of examples)
    parameters -- python dictionary containing parameters of the model

    Returns:
    cost - value of the regularized loss function (formula (2))
    """
    m = Y.shape[1]
    W1 = parameters["W1"]
    W2 = parameters["W2"]
    W3 = parameters["W3"]

    cross_entropy_cost = compute_cost(A3, Y) # This gives you the cross-entropy part of the cost

    ### START CODE HERE ### (approx. 1 line)
    L2_regularization_cost = 1.0 / m * lambd / 2 * (np.sum(np.square(W1)) + np.sum(np.square(W2)) + np.sum(np.square(W3)))
    ### END CODE HERE ###

    cost = cross_entropy_cost + L2_regularization_cost

    return cost

def backward_propagation_with_regularization(X, Y, cache, lambd):
    """
    Implements the backward propagation of our baseline model to which we added an L2 regularization.

    Arguments:
    X -- input dataset, of shape (input size, number of examples)
    Y -- "true" labels vector, of shape (output size, number of examples)
    cache -- cache output from forward_propagation()
    lambd -- regularization hyperparameter, scalar

    Returns:
    gradients -- A dictionary with the gradients with respect to each parameter, activation and pre-activation variables
    """
    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

    dZ3 = A3 - Y
    ### START CODE HERE ### (approx. 1 line)
    dW3 = 1./m * np.dot(dZ3, A2.T) + lambd/m*W3
    ### END CODE HERE ###
    db3 = 1./m * np.sum(dZ3, axis=1, keepdims = True)

    dA2 = np.dot(W3.T, dZ3)
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    ### START CODE HERE ### (approx. 1 line)
    dW2 = 1./m * np.dot(dZ2, A1.T) + lambd/m*W2
    ### END CODE HERE ###
    db2 = 1./m * np.sum(dZ2, axis=1, keepdims = True)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    ### START CODE HERE ### (approx. 1 line)
    dW1 = 1./m * np.dot(dZ1, X.T) + lambd/m*W1
    ### END CODE HERE ###
    db1 = 1./m * np.sum(dZ1, axis=1, keepdims = True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
                 "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                 "dZ1": dZ1, "dW1": dW1, "db1": db1}

    return gradients

Next, let's train the model with L2 regularization:


  
  
  1. parameters = model(train_X, train_Y, lambd = 0.7)
  2. print ("On the train set:")
  3. predictions_train = predict(train_X, train_Y, parameters)
  4. print ("On the test set:")
  5. predictions_test = predict(test_X, test_Y, parameters)

The decision boundary now looks like this:


  
  
  1. plt.title("Model with L2-regularization")
  2. axes = plt.gca()
  3. axes.set_xlim([-0.75,0.40])
  4. axes.set_ylim([-0.75,0.65])
  5. plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

Finally, we use dropout for regularization. The idea of dropout is that, in every training iteration, each neuron is kept only with probability keep_prob and is otherwise shut down (set to zero), so the network cannot rely too heavily on any single unit.
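
A minimal standalone sketch of the inverted-dropout trick on a small activation matrix (plain NumPy, not part of the assignment code): a random mask shuts some units down, and dividing by keep_prob keeps the expected value of the activations unchanged.

import numpy as np

np.random.seed(1)
A = np.random.rand(3, 5)                  # activations of 3 units on 5 examples
keep_prob = 0.8

D = np.random.rand(*A.shape) < keep_prob  # random 0/1 mask, P(keep) = keep_prob
A_drop = (A * D) / keep_prob              # shut units down, then rescale the survivors

print(A.mean(), A_drop.mean())            # close on average: inverted dropout preserves the expected activation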

The dropout forward and backward passes are implemented as follows:


  
  
def forward_propagation_with_dropout(X, parameters, keep_prob = 0.5):
    """
    Implements the forward propagation: LINEAR -> RELU + DROPOUT -> LINEAR -> RELU + DROPOUT -> LINEAR -> SIGMOID.

    Arguments:
    X -- input dataset, of shape (2, number of examples)
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3":
                    W1 -- weight matrix of shape (20, 2)
                    b1 -- bias vector of shape (20, 1)
                    W2 -- weight matrix of shape (3, 20)
                    b2 -- bias vector of shape (3, 1)
                    W3 -- weight matrix of shape (1, 3)
                    b3 -- bias vector of shape (1, 1)
    keep_prob - probability of keeping a neuron active during drop-out, scalar

    Returns:
    A3 -- last activation value, output of the forward propagation, of shape (1,1)
    cache -- tuple, information stored for computing the backward propagation
    """
    np.random.seed(1)

    # retrieve parameters
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)
    ### START CODE HERE ### (approx. 4 lines)      # Steps 1-4 below correspond to the Steps 1-4 described above.
    D1 = np.random.rand(A1.shape[0], A1.shape[1])  # Step 1: initialize matrix D1 = np.random.rand(..., ...)
    D1 = (D1 < keep_prob)                          # Step 2: convert entries of D1 to 0 or 1 (using keep_prob as the threshold)
    A1 = A1 * D1                                   # Step 3: shut down some neurons of A1
    A1 = A1 / keep_prob                            # Step 4: scale the value of neurons that haven't been shut down
    ### END CODE HERE ###
    Z2 = np.dot(W2, A1) + b2
    A2 = relu(Z2)
    ### START CODE HERE ### (approx. 4 lines)
    D2 = np.random.rand(A2.shape[0], A2.shape[1])  # Step 1: initialize matrix D2 = np.random.rand(..., ...)
    D2 = (D2 < keep_prob)                          # Step 2: convert entries of D2 to 0 or 1 (using keep_prob as the threshold)
    A2 = A2 * D2                                   # Step 3: shut down some neurons of A2
    A2 = A2 / keep_prob                            # Step 4: scale the value of neurons that haven't been shut down
    ### END CODE HERE ###
    Z3 = np.dot(W3, A2) + b3
    A3 = sigmoid(Z3)

    cache = (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3)

    return A3, cache

def backward_propagation_with_dropout(X, Y, cache, keep_prob):
    """
    Implements the backward propagation of our baseline model to which we added dropout.

    Arguments:
    X -- input dataset, of shape (2, number of examples)
    Y -- "true" labels vector, of shape (output size, number of examples)
    cache -- cache output from forward_propagation_with_dropout()
    keep_prob - probability of keeping a neuron active during drop-out, scalar

    Returns:
    gradients -- A dictionary with the gradients with respect to each parameter, activation and pre-activation variables
    """
    m = X.shape[1]
    (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3) = cache

    dZ3 = A3 - Y
    dW3 = 1./m * np.dot(dZ3, A2.T)
    db3 = 1./m * np.sum(dZ3, axis=1, keepdims = True)
    dA2 = np.dot(W3.T, dZ3)
    ### START CODE HERE ### (≈ 2 lines of code)
    dA2 = dA2 * D2              # Step 1: Apply mask D2 to shut down the same neurons as during the forward propagation
    dA2 = dA2 / keep_prob       # Step 2: Scale the value of neurons that haven't been shut down
    ### END CODE HERE ###
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    dW2 = 1./m * np.dot(dZ2, A1.T)
    db2 = 1./m * np.sum(dZ2, axis=1, keepdims = True)
    dA1 = np.dot(W2.T, dZ2)
    ### START CODE HERE ### (≈ 2 lines of code)
    dA1 = dA1 * D1              # Step 1: Apply mask D1 to shut down the same neurons as during the forward propagation
    dA1 = dA1 / keep_prob       # Step 2: Scale the value of neurons that haven't been shut down
    ### END CODE HERE ###
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = 1./m * np.dot(dZ1, X.T)
    db1 = 1./m * np.sum(dZ1, axis=1, keepdims = True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
                 "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                 "dZ1": dZ1, "dW1": dW1, "db1": db1}

    return gradients

Next, let's train the model with dropout:


  
  
  1. parameters = model(train_X, train_Y, keep_prob = 0.86, learning_rate = 0.3)
  2.  
  3. print ("On the train set:")
  4. predictions_train = predict(train_X, train_Y, parameters)
  5. print ("On the test set:")
  6. predictions_test = predict(test_X, test_Y, parameters)

Looking at the results, we can see that adding dropout raises the test-set accuracy to 95%.

Let's look at the decision boundary:


  
  
  1. plt.title("Model with dropout")
  2. axes = plt.gca()
  3. axes.set_xlim([-0.75,0.40])
  4. axes.set_ylim([-0.75,0.65])
  5. plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)

Comparing the three approaches (no regularization, L2 regularization, dropout): both regularization methods give a smoother decision boundary and better test accuracy than the unregularized model, with dropout reaching the best test accuracy here.


Gradient checking

In this last part, we will learn how to perform gradient checking.

Suppose you work on a team building worldwide mobile payments and you need a deep learning model that detects whether an account has been taken over by a fraudster when a payment is made.

Backward propagation is fairly involved to implement. To make sure our implementation is correct, we will write some code that verifies the backward propagation function numerically.


First, import the required libraries:


  
  
  1. import numpy as np
  2. from testCases import *
  3. from gc_utils import sigmoid, relu, dictionary_to_vector, vector_to_dictionary, gradients_to_vector

The helper functions involved (from gc_utils) are as follows:


  
  
  1. def dictionary_to_vector(parameters):
  2.     """
  3.     Roll all our parameters dictionary into a single vector satisfying our specific required shape.
  4.     """
  5.     keys = []
  6.     count = 0
  7.     for key in ["W1", "b1", "W2", "b2", "W3", "b3"]:
  8.         
  9.         # flatten parameter
  10.         new_vector = np.reshape(parameters[key], (-1,1))
  11.         keys = keys + [key]*new_vector.shape[0]
  12.         
  13.         if count == 0:
  14.             theta = new_vector
  15.         else:
  16.             theta = np.concatenate((theta, new_vector), axis=0)
  17.         count = count + 1
  18.  
  19.     return theta, keys
  20.  
  21. def vector_to_dictionary(theta):
  22.     """
  23.     Unroll all our parameters dictionary from a single vector satisfying our specific required shape.
  24.     """
  25.     parameters = {}
  26.     parameters["W1"] = theta[:20].reshape((5,4))
  27.     parameters["b1"] = theta[20:25].reshape((5,1))
  28.     parameters["W2"] = theta[25:40].reshape((3,5))
  29.     parameters["b2"] = theta[40:43].reshape((3,1))
  30.     parameters["W3"] = theta[43:46].reshape((1,3))
  31.     parameters["b3"] = theta[46:47].reshape((1,1))
  32.  
  33.     return parameters
  34.  
  35. def gradients_to_vector(gradients):
  36.     """
  37.     Roll all our gradients dictionary into a single vector satisfying our specific required shape.
  38.     """
  39.     
  40.     count = 0
  41.     for key in ["dW1", "db1", "dW2", "db2", "dW3", "db3"]:
  42.         # flatten parameter
  43.         new_vector = np.reshape(gradients[key], (-1,1))
  44.         
  45.         if count == 0:
  46.             theta = new_vector
  47.         else:
  48.             theta = np.concatenate((theta, new_vector), axis=0)
  49.         count = count + 1
  50.  
  51.     return theta
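
As a quick sanity check of these helpers (assuming the hard-coded shapes in vector_to_dictionary above: W1 is 5x4, b1 is 5x1, W2 is 3x5, b2 is 3x1, W3 is 1x3, b3 is 1x1), rolling a parameter dictionary into a vector and unrolling it again should give back the same values:

import numpy as np

np.random.seed(0)
params = {"W1": np.random.randn(5, 4), "b1": np.random.randn(5, 1),
          "W2": np.random.randn(3, 5), "b2": np.random.randn(3, 1),
          "W3": np.random.randn(1, 3), "b3": np.random.randn(1, 1)}

theta, keys = dictionary_to_vector(params)
print(theta.shape)   # (47, 1): 20 + 5 + 15 + 3 + 3 + 1 parameters
restored = vector_to_dictionary(theta)
print(all(np.allclose(params[k], restored[k]) for k in params))  # True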

Let us first look at gradient checking for a one-dimensional linear model, J(theta) = theta * x.

Forward propagation:


  
  
def forward_propagation(x, theta):
    """
    Implement the linear forward propagation (compute J) presented in Figure 1 (J(theta) = theta * x)

    Arguments:
    x -- a real-valued input
    theta -- our parameter, a real number as well

    Returns:
    J -- the value of function J, computed using the formula J(theta) = theta * x
    """
    ### START CODE HERE ### (approx. 1 line)
    J = theta * x
    ### END CODE HERE ###
    return J

Backward propagation:


  
  
def backward_propagation(x, theta):
    """
    Computes the derivative of J with respect to theta (see Figure 1).

    Arguments:
    x -- a real-valued input
    theta -- our parameter, a real number as well

    Returns:
    dtheta -- the gradient of the cost with respect to theta
    """
    ### START CODE HERE ### (approx. 1 line)
    dtheta = x
    ### END CODE HERE ###
    return dtheta

Gradient checking then proceeds as follows.

First, approximate the derivative with a centered difference:

gradapprox = (J(theta + epsilon) - J(theta - epsilon)) / (2 * epsilon)

Then compute the gradient with backward propagation, grad = backward_propagation(x, theta), and finally compute the relative difference:

difference = ||grad - gradapprox||_2 / (||grad||_2 + ||gradapprox||_2)

When the difference is smaller than 10^-7, the computed gradient is usually considered correct.


  
  
def gradient_check(x, theta, epsilon = 1e-7):
    """
    Implement the gradient check presented in Figure 1.

    Arguments:
    x -- a real-valued input
    theta -- our parameter, a real number as well
    epsilon -- tiny shift to the input to compute approximated gradient with formula(1)

    Returns:
    difference -- difference (2) between the approximated gradient and the backward propagation gradient
    """
    # Compute gradapprox using formula (1). epsilon is small enough, you don't need to worry about the limit.
    ### START CODE HERE ### (approx. 5 lines)
    thetaplus = theta + epsilon                      # Step 1
    thetaminus = theta - epsilon                     # Step 2
    J_plus = forward_propagation(x, thetaplus)       # Step 3
    J_minus = forward_propagation(x, thetaminus)     # Step 4
    gradapprox = (J_plus - J_minus) / (2 * epsilon)  # Step 5
    ### END CODE HERE ###

    # Check if gradapprox is close enough to the output of backward_propagation()
    ### START CODE HERE ### (approx. 1 line)
    grad = backward_propagation(x, theta)
    ### END CODE HERE ###

    ### START CODE HERE ### (approx. 1 line)
    numerator = np.linalg.norm(grad - gradapprox)                    # Step 1'
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)  # Step 2'
    difference = numerator / denominator                             # Step 3'
    ### END CODE HERE ###

    if difference < 1e-7:
        print("The gradient is correct!")
    else:
        print("The gradient is wrong!")

    return difference

Let's test it:


  
  
  1. x, theta = 2, 4
  2. difference = gradient_check(x, theta)
  3. print("difference = " + str(difference))


Next, let's look at gradient checking in the N-dimensional case:

The forward propagation is as follows:


  
  
def forward_propagation_n(X, Y, parameters):
    """
    Implements the forward propagation (and computes the cost) presented in Figure 3.

    Arguments:
    X -- training set for m examples
    Y -- labels for m examples
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3":
                    W1 -- weight matrix of shape (5, 4)
                    b1 -- bias vector of shape (5, 1)
                    W2 -- weight matrix of shape (3, 5)
                    b2 -- bias vector of shape (3, 1)
                    W3 -- weight matrix of shape (1, 3)
                    b3 -- bias vector of shape (1, 1)

    Returns:
    cost -- the cost function (logistic cost for one example)
    """
    # retrieve parameters
    m = X.shape[1]
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    # LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SIGMOID
    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)
    Z2 = np.dot(W2, A1) + b2
    A2 = relu(Z2)
    Z3 = np.dot(W3, A2) + b3
    A3 = sigmoid(Z3)

    # Cost
    logprobs = np.multiply(-np.log(A3), Y) + np.multiply(-np.log(1 - A3), 1 - Y)
    cost = 1./m * np.sum(logprobs)

    cache = (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)

    return cost, cache

The backward propagation is as follows:


  
  
def backward_propagation_n(X, Y, cache):
    """
    Implement the backward propagation presented in figure 2.

    Arguments:
    X -- input datapoint, of shape (input size, 1)
    Y -- true "label"
    cache -- cache output from forward_propagation_n()

    Returns:
    gradients -- A dictionary with the gradients of the cost with respect to each parameter, activation and pre-activation variables.
    """
    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

    dZ3 = A3 - Y
    dW3 = 1./m * np.dot(dZ3, A2.T)
    db3 = 1./m * np.sum(dZ3, axis=1, keepdims = True)
    dA2 = np.dot(W3.T, dZ3)

    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    dW2 = 1./m * np.dot(dZ2, A1.T)
    db2 = 1./m * np.sum(dZ2, axis=1, keepdims = True)
    dA1 = np.dot(W2.T, dZ2)

    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = 1./m * np.dot(dZ1, X.T)
    db1 = 1./m * np.sum(dZ1, axis=1, keepdims = True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3,
                 "dA2": dA2, "dZ2": dZ2, "dW2": dW2, "db2": db2,
                 "dA1": dA1, "dZ1": dZ1, "dW1": dW1, "db1": db1}

    return gradients

In the N-dimensional case, the derivative is still approximated with the same centered-difference formula,

gradapprox[i] = (J(theta + epsilon * e_i) - J(theta - epsilon * e_i)) / (2 * epsilon),

but theta is no longer a scalar: it is a vector, so we perturb one parameter at a time (e_i denotes the i-th unit vector) while keeping the others fixed.


Concretely, the N-dimensional procedure is:

1. Compute gradapprox: for every parameter theta_i, evaluate the cost at theta_i + epsilon and theta_i - epsilon and take the centered difference.

2. Compute the gradients grad with backward_propagation_n.

3. Compute the error:

difference = ||grad - gradapprox||_2 / (||grad||_2 + ||gradapprox||_2)

The implementation is as follows:


  
  
def gradient_check_n(parameters, gradients, X, Y, epsilon = 1e-7):
    """
    Checks if backward_propagation_n computes correctly the gradient of the cost output by forward_propagation_n

    Arguments:
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2", "W3", "b3"
    gradients -- output of backward_propagation_n, contains gradients of the cost with respect to the parameters
    X -- input datapoint, of shape (input size, 1)
    Y -- true "label"
    epsilon -- tiny shift to the input to compute approximated gradient with formula(1)

    Returns:
    difference -- difference (2) between the approximated gradient and the backward propagation gradient
    """
    # Set-up variables
    parameters_values, _ = dictionary_to_vector(parameters)
    grad = gradients_to_vector(gradients)
    num_parameters = parameters_values.shape[0]
    J_plus = np.zeros((num_parameters, 1))
    J_minus = np.zeros((num_parameters, 1))
    gradapprox = np.zeros((num_parameters, 1))

    # Compute gradapprox
    for i in range(num_parameters):
        # Compute J_plus[i]. Inputs: "parameters_values, epsilon". Output = "J_plus[i]".
        # "_" is used because forward_propagation_n outputs two values but we only care about the first one
        ### START CODE HERE ### (approx. 3 lines)
        thetaplus = np.copy(parameters_values)                                       # Step 1
        thetaplus[i][0] = thetaplus[i][0] + epsilon                                  # Step 2
        J_plus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaplus))  # Step 3
        ### END CODE HERE ###

        # Compute J_minus[i]. Inputs: "parameters_values, epsilon". Output = "J_minus[i]".
        ### START CODE HERE ### (approx. 3 lines)
        thetaminus = np.copy(parameters_values)                                        # Step 1
        thetaminus[i][0] = thetaminus[i][0] - epsilon                                  # Step 2
        J_minus[i], _ = forward_propagation_n(X, Y, vector_to_dictionary(thetaminus))  # Step 3
        ### END CODE HERE ###

        # Compute gradapprox[i]
        ### START CODE HERE ### (approx. 1 line)
        gradapprox[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)
        ### END CODE HERE ###

    # Compare gradapprox to backward propagation gradients by computing difference.
    ### START CODE HERE ### (approx. 1 line)
    numerator = np.linalg.norm(grad - gradapprox)                    # Step 1'
    denominator = np.linalg.norm(grad) + np.linalg.norm(gradapprox)  # Step 2'
    difference = numerator / denominator                             # Step 3'
    ### END CODE HERE ###

    if difference > 1e-7:
        print("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")

    return difference
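
To exercise gradient_check_n end to end, a small driver along the lines of the original assignment can be used. Here gradient_check_n_test_case is assumed to be provided by testCases (imported above) and to return an (X, Y, parameters) triple with the shapes expected by forward_propagation_n:

# Hypothetical driver, assuming testCases provides gradient_check_n_test_case()
X, Y, parameters = gradient_check_n_test_case()

cost, cache = forward_propagation_n(X, Y, parameters)
gradients = backward_propagation_n(X, Y, cache)
difference = gradient_check_n(parameters, gradients, X, Y)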

That's all for this post. Now it's your turn to practice!


 

