This is the programming assignment on regularization, covering three implementations: no regularization, L2 regularization, and dropout. For the theory and formulas behind the code, see the previous post.
Problem description: the original task is to predict whether a football player will head the ball. The background story is omitted here; it is simply a binary classification problem. The data looks as follows, with blue dots forming one class and red dots the other.
Import the required packages. reg_utils.py and the dataset can be downloaded here.
import numpy as np
import matplotlib.pyplot as plt
from reg_utils import sigmoid, relu, plot_decision_boundary, initialize_parameters, load_2D_dataset, predict_dec
from reg_utils import compute_cost, predict, forward_propagation, backward_propagation, update_parameters
import sklearn
import sklearn.datasets
import scipy.io
from testCases import *
%matplotlib inline
plt.rcParams['figure.figsize'] = (7.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
train_X, train_Y, test_X, test_Y = load_2D_dataset() # load the data
1. Model Without Regularization
def model(X, Y, learning_rate = 0.3, num_iterations = 30000, print_cost = True, lambd = 0, keep_prob = 1):
    """
    Arguments:
    X -- input data
    Y -- labels; 1 for blue dots, 0 for red dots
    learning_rate -- learning rate
    num_iterations -- number of iterations
    print_cost -- if True, print the cost every 10000 iterations
    lambd -- regularization parameter
    keep_prob -- dropout parameter (probability of keeping a neuron active)

    Returns:
    parameters -- parameters learned by the model
    """
    grads = {}
    costs = []                            # to keep track of the cost
    m = X.shape[1]                        # number of examples
    layers_dims = [X.shape[0], 20, 3, 1]  # network architecture

    # Initialize parameters
    parameters = initialize_parameters(layers_dims)

    # Loop (gradient descent)
    for i in range(0, num_iterations):

        # Forward propagation
        if keep_prob == 1:
            a3, cache = forward_propagation(X, parameters)                           # forward propagation without dropout
        elif keep_prob < 1:
            a3, cache = forward_propagation_with_dropout(X, parameters, keep_prob)   # forward propagation with dropout

        # Cost function
        if lambd == 0:
            cost = compute_cost(a3, Y)                                               # cost without regularization
        else:
            cost = compute_cost_with_regularization(a3, Y, parameters, lambd)        # cost with L2 regularization

        # Backward propagation
        assert(lambd == 0 or keep_prob == 1)   # it is possible to use both L2 regularization and dropout,
                                               # but this assignment will only explore one at a time
        if lambd == 0 and keep_prob == 1:
            grads = backward_propagation(X, Y, cache)
        elif lambd != 0:
            grads = backward_propagation_with_regularization(X, Y, cache, lambd)
        elif keep_prob < 1:
            grads = backward_propagation_with_dropout(X, Y, cache, keep_prob)

        # Update parameters
        parameters = update_parameters(parameters, grads, learning_rate)

        # Print the cost every 10000 iterations
        if print_cost and i % 10000 == 0:
            print("Cost after iteration {}: {}".format(i, cost))
        if print_cost and i % 1000 == 0:
            costs.append(cost)

    # plot the cost
    plt.plot(costs)
    plt.ylabel('cost')
    plt.xlabel('iterations (x1,000)')
    plt.title("Learning rate =" + str(learning_rate))
    plt.show()

    return parameters
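The lambd and keep_prob arguments select which variant of the model runs. As a quick sketch of the three configurations explored below (the values 0.7 and 0.86 are illustrative assumptions, not tuned results):

# Sketch: the three training configurations covered in this post.
# lambd = 0.7 and keep_prob = 0.86 are assumed example values.
parameters = model(train_X, train_Y)                    # no regularization
parameters = model(train_X, train_Y, lambd = 0.7)       # L2 regularization
parameters = model(train_X, train_Y, keep_prob = 0.86)  # dropout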
First, train the model without any regularization:
parameters = model(train_X, train_Y)
print ("On the training set:")
predictions_train = predict(train_X, train_Y, parameters)
print ("On the test set:")
predictions_test = predict(test_X, test_Y, parameters)
plt.title("Model without regularization") axes = plt.gca() axes.set_xlim([-0.75,0.40]) axes.set_ylim([-0.75,0.65]) plot_decision_boundary(lambda x: predict_dec(parameters, x.T), train_X, train_Y)
The cost curve is shown below:
The accuracy is 0.947867298578 on the training set and 0.915 on the test set.
The decision boundary is plotted below. Without regularization, the model overfits the training set.
2. L2 Regularization
def compute_cost_with_regularization(A3, Y, parameters, lambd):
    """
    Arguments:
    A3 -- output of forward propagation
    Y -- true labels
    parameters -- model parameters
    lambd -- regularization parameter

    Returns:
    cost -- value of the regularized loss function
    """
    m = Y.shape[1]
    W1 = parameters["W1"]
    W2 = parameters["W2"]
    W3 = parameters["W3"]

    # Cost without the regularization term
    cross_entropy_cost = compute_cost(A3, Y)

    # Regularization term
    L2_regularization_cost = (np.sum(np.square(W1)) + np.sum(np.square(W2)) + np.sum(np.square(W3))) * lambd / (2 * m)

    # Cost with the regularization term
    cost = cross_entropy_cost + L2_regularization_cost

    return cost
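For reference, the quantity computed above is the standard cross-entropy cost plus an L2 penalty on the weights (see the previous post for the derivation):

$$J_{regularized} = -\frac{1}{m}\sum_{i=1}^{m}\left( y^{(i)}\log a^{[3](i)} + (1-y^{(i)})\log\left(1-a^{[3](i)}\right) \right) + \frac{\lambda}{2m}\sum_{l=1}^{3} \|W^{[l]}\|_F^2$$

where $\|W^{[l]}\|_F^2$ is the sum of the squares of all entries of $W^{[l]}$, which is exactly what np.sum(np.square(Wl)) computes. The backward pass must account for this extra term: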
def backward_propagation_with_regularization(X, Y, cache, lambd):
    """
    Arguments:
    X -- input data
    Y -- true labels
    cache -- cache output from forward_propagation()
    lambd -- regularization parameter

    Returns:
    gradients -- derivatives with respect to the weights and biases
    """
    m = X.shape[1]
    (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3) = cache

    dZ3 = A3 - Y
    dW3 = 1./m * np.dot(dZ3, A2.T) + W3 * lambd / m
    db3 = 1./m * np.sum(dZ3, axis=1, keepdims = True)

    dA2 = np.dot(W3.T, dZ3)
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    dW2 = 1./m * np.dot(dZ2, A1.T) + W2 * lambd / m
    db2 = 1./m * np.sum(dZ2, axis=1, keepdims = True)

    dA1 = np.dot(W2.T, dZ2)
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = 1./m * np.dot(dZ1, X.T) + W1 * lambd / m
    db1 = 1./m * np.sum(dZ1, axis=1, keepdims = True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
                 "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                 "dZ1": dZ1, "dW1": dW1, "db1": db1}

    return gradients
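Relative to plain backpropagation, the only change above is the extra gradient of the penalty term with respect to each weight matrix:

$$\frac{\partial}{\partial W^{[l]}} \left( \frac{\lambda}{2m}\|W^{[l]}\|_F^2 \right) = \frac{\lambda}{m} W^{[l]}$$

which is why every dWl gains a lambd/m * Wl term, while the bias gradients are unchanged (the penalty does not involve b).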
Training this model with L2 regularization produces the following cost curve:
The accuracy is 0.938388625592 on the training set and 0.93 on the test set.
The decision boundary is plotted below:
3. Dropout
def forward_propagation_with_dropout(X, parameters, keep_prob = 0.5):
    """
    Arguments:
    X -- input data
    parameters -- weights and biases
    keep_prob -- probability of keeping a neuron active

    Returns:
    A3 -- output of the network
    cache -- cache for computing backward propagation
    """
    np.random.seed(1)

    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]

    Z1 = np.dot(W1, X) + b1
    A1 = relu(Z1)
    # dropout
    D1 = np.random.rand(A1.shape[0], A1.shape[1])  # Step 1: initialize matrix D1
    D1 = (D1 < keep_prob)                          # Step 2: convert entries of D1 to 0 or 1 (using keep_prob as the threshold)
    A1 = np.multiply(A1, D1)                       # Step 3: shut down some neurons of A1
    A1 = A1 / keep_prob                            # Step 4: scale the value of neurons that haven't been shut down

    Z2 = np.dot(W2, A1) + b2
    A2 = relu(Z2)
    # dropout
    D2 = np.random.rand(A2.shape[0], A2.shape[1])  # Step 1: initialize matrix D2
    D2 = (D2 < keep_prob)                          # Step 2: convert entries of D2 to 0 or 1 (using keep_prob as the threshold)
    A2 = np.multiply(A2, D2)                       # Step 3: shut down some neurons of A2
    A2 = A2 / keep_prob                            # Step 4: scale the value of neurons that haven't been shut down

    Z3 = np.dot(W3, A2) + b3
    A3 = sigmoid(Z3)

    cache = (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3)

    return A3, cache
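To see the mask mechanics in isolation, here is a minimal self-contained sketch (toy values, independent of the assignment's helper functions) of the four inverted-dropout steps applied to a single activation matrix:

import numpy as np

np.random.seed(1)
keep_prob = 0.8
A = np.array([[0.5, 1.2, 0.0],
              [2.0, 0.3, 0.7]])            # toy activations from some hidden layer

D = np.random.rand(*A.shape) < keep_prob   # Steps 1-2: boolean mask, True with probability keep_prob
A = np.multiply(A, D)                      # Step 3: shut down the masked neurons
A = A / keep_prob                          # Step 4: rescale the surviving neurons

print(D)   # which neurons were kept
print(A)   # surviving activations, scaled up by 1/keep_prob

The division by keep_prob ("inverted dropout") keeps the expected value of each activation the same as without dropout, which is why nothing needs to change at test time.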
def backward_propagation_with_dropout(X, Y, cache, keep_prob):
    """
    Arguments:
    X -- input data
    Y -- true labels
    cache -- cache output from forward_propagation_with_dropout()
    keep_prob -- probability of keeping a neuron active

    Returns:
    gradients -- derivatives with respect to the weights and biases
    """
    m = X.shape[1]
    (Z1, D1, A1, W1, b1, Z2, D2, A2, W2, b2, Z3, A3, W3, b3) = cache

    dZ3 = A3 - Y
    dW3 = 1./m * np.dot(dZ3, A2.T)
    db3 = 1./m * np.sum(dZ3, axis=1, keepdims = True)

    dA2 = np.dot(W3.T, dZ3)
    dA2 = np.multiply(dA2, D2)  # Step 1: apply mask D2 to shut down the same neurons as during the forward propagation
    dA2 = dA2 / keep_prob       # Step 2: scale the value of neurons that haven't been shut down
    dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    dW2 = 1./m * np.dot(dZ2, A1.T)
    db2 = 1./m * np.sum(dZ2, axis=1, keepdims = True)

    dA1 = np.dot(W2.T, dZ2)
    dA1 = np.multiply(dA1, D1)  # Step 1: apply mask D1 to shut down the same neurons as during the forward propagation
    dA1 = dA1 / keep_prob       # Step 2: scale the value of neurons that haven't been shut down
    dZ1 = np.multiply(dA1, np.int64(A1 > 0))
    dW1 = 1./m * np.dot(dZ1, X.T)
    db1 = 1./m * np.sum(dZ1, axis=1, keepdims = True)

    gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3, "dA2": dA2,
                 "dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
                 "dZ1": dZ1, "dW1": dW1, "db1": db1}

    return gradients
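The backward pass reuses the masks D1 and D2 because a neuron that was shut down in the forward pass contributed nothing to the cost, so its gradient must be zeroed as well, with the same 1/keep_prob scaling. For each hidden layer l:

$$dA^{[l]} \leftarrow \frac{dA^{[l]} \odot D^{[l]}}{keep\_prob}$$

Note also that dropout is a training-time technique only: predict() in reg_utils calls the plain forward_propagation(), so at test time all neurons are active and no scaling is needed.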
Training this model with dropout produces the following cost curve:
The accuracy is 0.928909952607 on the training set and 0.95 on the test set.
The decision boundary is plotted below:
As the results above show, regularization lowers the training accuracy, because it limits the network's ability to fit the training data, but it improves the accuracy on the test set.