Implementing Gradient Checking and Inverted Dropout in a Deep Neural Network
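
Gradient checking verifies the analytic gradients from backpropagation against a two-sided numerical estimate, dJ/dθ ≈ (J(θ+ε) − J(θ−ε)) / (2ε), and reports the relative difference ‖grad − grad_approx‖ / (‖grad‖ + ‖grad_approx‖). Because dropout zeroes activations at random, the mask has to be made deterministic while checking, which is what the check flag in the code below does. As a warm-up, here is a minimal, self-contained sketch of the same idea on a toy scalar function (toy_cost, toy_grad and check_toy_gradient are illustrative names, not part of the network code):

def toy_cost(theta):
    # toy cost J(theta) = theta ** 3
    return theta ** 3

def toy_grad(theta):
    # analytic derivative dJ/dtheta = 3 * theta ** 2
    return 3 * theta ** 2

def check_toy_gradient(theta=2.0, epsilon=1e-7):
    # two-sided numerical estimate of the derivative
    grad_approx = (toy_cost(theta + epsilon) - toy_cost(theta - epsilon)) / (2 * epsilon)
    grad = toy_grad(theta)
    # same relative-difference formula as gradient_check() below
    return abs(grad - grad_approx) / (abs(grad) + abs(grad_approx))

print(check_toy_gradient())  # prints a very small relative difference (well below 1e-5)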

import numpy as np
import matplotlib.pyplot as plt
import scipy.io as sio
def load_dataset():
    data = sio.loadmat("/home/yan/下载/datasets/data.mat")
    return data["X"].T, data["y"].T, data["Xval"].T, data["yval"].T
def init_prams(num_of_units, L):
    prams = {}
    for i in range(1, L + 1):
        np.random.seed(3)
        prams["W" + str(i)] = np.random.randn(num_of_units[i], num_of_units[i - 1]) * np.sqrt(2 / num_of_units[i - 1])
        prams["B" + str(i)] = np.zeros((num_of_units[i], 1))
    
    return prams
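# Note: the scale factor np.sqrt(2 / num_of_units[i - 1]) above is the He initialization
# commonly paired with ReLU; the fixed seed only makes the runs reproducible.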
def sigmoid(Z):
    return 1 / (1 + np.exp(-Z))
def relu(Z):
    return np.maximum(0, Z)
def derivative_of_sigmoid(A):
    return A * (1 - A)
def derivative_of_relu(A):
    return np.float64((A > 0))
def forward_prop(x, y, prams, M, L, activation):
    A = {}
    A["0"] = x
    for i in range(1, L + 1):
        Z = prams["W" + str(i)] @ A[str(i - 1)] + prams["B" + str(i)]
        A[str(i)] = sigmoid(Z) if i == L else relu(Z)
    
    cost = np.squeeze(y @ np.log(A[str(L)].T) + (1 - y) @ np.log(1 - A[str(L)].T)) * -1 / M
    
    return cost, A
def forward_prop_with_dropout(x, y, prams, M, L, activation, keep_prop, check=False):
    A = {}
    mask = {}
    A["0"] = x
    
    for i in range(1, L + 1):
        if check:  # with check=True (gradient checking) the RNG is seeded so the dropout mask is deterministic and the gradients can be verified
            np.random.seed(0)
        mask[str(i - 1)] = np.random.rand(A[str(i - 1)].shape[0], A[str(i - 1)].shape[1]) < keep_prop[i - 1]
        A[str(i - 1)] = A[str(i - 1)] * mask[str(i - 1)] / keep_prop[i - 1]
        Z = prams["W" + str(i)] @ A[str(i - 1)] + prams["B" + str(i)]
        A[str(i)] = sigmoid(Z) if i == L else relu(Z)
        
    if check:
        cost = np.squeeze(y @ np.log(A[str(L)].T) + (1 - y) @ np.log(1 - A[str(L)].T)) * -1 / M
        return mask, A, cost
    else:
        return mask, A
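# Inverted dropout: kept activations are divided by keep_prop so the expected value of each
# layer's output matches the no-dropout case; prediction can then use the plain forward_prop
# above without any extra scaling.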
def backward_prop(y, A, prams, M, L, activation):
    grads = {}
    dA = -1 * y / A[str(L)] + (1 - y) / (1 - A[str(L)])
    for i in range(L, 0, -1):
        dZ = dA * (derivative_of_sigmoid(A[str(i)]) if i == L else derivative_of_relu(A[str(i)]))
        grads["dW" + str(i)] = dZ @ A[str(i - 1)].T / M
        grads["dB" + str(i)] = dZ.sum(axis=1, keepdims=True) / M
        dA = prams["W" + str(i)].T @ dZ
    
    return grads
def backward_prop_with_dropout(y, A, prams, M, L, activation, mask, keep_prop):
    grads = {}

    dA = -1 * y / A[str(L)] + (1 - y) / (1 - A[str(L)])
    for i in range(L, 0, -1):
        dZ = dA * (derivative_of_sigmoid(A[str(i)]) if i == L else derivative_of_relu(A[str(i)]))
        grads["dW" + str(i)] = dZ @ A[str(i - 1)].T / M
        grads["dB" + str(i)] = dZ.sum(axis=1, keepdims=True) / M
        dA = prams["W" + str(i)].T @ dZ
        dA = dA * mask[str(i - 1)] / keep_prop[i - 1]
    
    return grads
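# Backprop mirrors the forward pass: the same mask and the same 1 / keep_prop scaling are
# applied to dA, so gradients flow only through the units that were kept.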
def gradient_upgrade(prams, grads, L, alpha):
    for i in range(1, L + 1):
        prams["W" + str(i)] -= alpha * grads["dW" + str(i)]
        prams["B" + str(i)] -= alpha * grads["dB" + str(i)]
def gradient_check(x, y, prams, M, L, activation, num_of_units, epsilon, keep_prop):
    grads = []
    grad_approxs = []
    
    for l in range(1, L + 1):
        
        for i in range(num_of_units[l]):
            
            for j in range(num_of_units[l - 1]):
                prams["W" + str(l)][i, j] += epsilon
                _, _, J_plus = forward_prop_with_dropout(x, y, prams, M, L, activation, keep_prop, check=True)
                prams["W" + str(l)][i, j] -= epsilon * 2
                _, _, J_minus = forward_prop_with_dropout(x, y, prams, M, L, activation, keep_prop, check=True)
                grad_approxs.append((J_plus - J_minus) / 2 / epsilon)
                
                prams["W" + str(l)][i, j] += epsilon
                mask, A,_ = forward_prop_with_dropout(x, y, prams, M, L, activation, keep_prop, check=True)
                g = backward_prop_with_dropout(y, A, prams, M, L, activation, mask, keep_prop)
                grads.append(g["dW" + str(l)][i, j])     
            
            prams["B" + str(l)][i] += epsilon
            _, _, J_plus = forward_prop_with_dropout(x, y, prams, M, L, activation, keep_prop, check=True)
            prams["B" + str(l)][i] -= epsilon * 2
            _, _, J_minus = forward_prop_with_dropout(x, y, prams, M, L, activation, keep_prop, check=True)
            grad_approxs.append((J_plus - J_minus) / 2 / epsilon)
                
            prams["B" + str(l)][i] += epsilon
            mask, A,_ = forward_prop_with_dropout(x, y, prams, M, L, activation, keep_prop, check=True)
            g = backward_prop_with_dropout(y, A, prams, M, L, activation, mask, keep_prop)
            grads.append(g["dB" + str(l)][i])     
        
    return np.linalg.norm(np.array(grads) - np.array(grad_approxs)) / (np.linalg.norm(np.array(grads)) + np.linalg.norm(np.array(grad_approxs)))         
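# The value returned above is the relative difference ||grads - grad_approxs|| / (||grads|| + ||grad_approxs||).
# As a rough rule of thumb with epsilon = 1e-7: a result below about 1e-7 suggests the backprop
# gradients are correct, around 1e-5 deserves a closer look, and above about 1e-3 usually means a bug.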
def predict(x, y, prams, M, L, activation):
    cost, A = forward_prop(x, y, prams, M, L, activation)
    p = np.int64(A[str(L)] > 0.5)
    return 1 - np.sum(np.abs(p - y)) / M
def plot_division_boundary(train_x, train_y, test_x, test_y, prams, M, L, activation):
    fig = plt.figure(figsize = (14, 6), dpi=400)
    
    ax_1 = fig.add_subplot(121)
    ax_1.set_title("Training_set")
    
    x_min, x_max = train_x[0].min() - .1, train_x[0].max() + .1
    y_min, y_max = train_x[1].min() - .1, train_x[1].max() + .1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))
    X = np.c_[xx.ravel(), yy.ravel()].T
    cost, A = forward_prop(X, np.zeros((1, X.shape[1])), prams, X.shape[1], L, activation)
    p = np.int64((A[str(L)] > 0.5).reshape(xx.shape))
    
    ax_1.contourf(xx, yy, p, cmap = plt.cm.Spectral)
    
    ax_1.scatter(train_x[0], train_x[1], c = train_y.squeeze(), cmap = plt.cm.Spectral)
    
    ax_2 = fig.add_subplot(122)
    ax_2.set_title("Testing set")
    
    x_min, x_max = test_x[0].min() - .1, test_x[0].max() + .1
    y_min, y_max = test_x[1].min() - .1, test_x[1].max() + .1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))
    X = np.c_[xx.ravel(), yy.ravel()].T
    cost, A = forward_prop(X, np.zeros((1, X.shape[1])), prams, X.shape[1], L, activation)
    p = np.int64((A[str(L)] > 0.5).reshape(xx.shape))
    
    ax_2.contourf(xx, yy, p, cmap = plt.cm.Spectral)
    
    ax_2.scatter(test_x[0], test_x[1], c = test_y.squeeze(), cmap = plt.cm.Spectral)
    
    plt.show()
def plot_learning_curve( train_accuracies, test_accuracies, iteration_times, learning_rates, activation):
    fig = plt.figure(dpi=400)
    ax = fig.add_subplot(111)
    ax.set_title("Learning rates: " + str(learning_rates) + 
                 " | Activation function: " + activation)
    ax.axis([-100, iteration_times, 0, 1])
    ax.set_xlabel("iterations")
    ax.set_ylabel("cost/accuracy")
    ax.plot(np.arange(0, iteration_times, 10), train_accuracies, label="training_set_accuracy")
    ax.plot(np.arange(0, iteration_times, 10), test_accuracies, label="testing_set_accuracy")
    plt.legend(bbox_to_anchor=(1, 0), loc=3)
def model(train_x, train_y, test_x, test_y, hyperprams):
    
    print("DNN structure: " + str(hyperprams["num_of_units"]) + 
          " | Learning rates: " + str(hyperprams["learning_rates"]) + 
          " | Activation function: " + hyperprams["activation_function"] + 
          "\nKeep prop: " + str(hyperprams["keep_prop"]))
    
    # Initialization
    L = len(hyperprams["num_of_units"]) - 1
    M_train, M_test = train_x.shape[1], test_x.shape[1]
    prams = init_prams(hyperprams["num_of_units"], L)
    train_accuracies = []
    test_accuracies = []
    
    # Gradient check
    diff = gradient_check(train_x, train_y, prams, M_train, L, hyperprams["activation_function"], 
                          hyperprams["num_of_units"], hyperprams["epsilon"], hyperprams["keep_prop"])
    print("Gradient Check(epsilon=" + str(hyperprams["epsilon"]) + "): " + 
          ("OK." if diff < hyperprams["epsilon"] else "Error") + "\nDiff of grads: " + str(diff))
    
    # Gradient descent
    for iteration in range(hyperprams["iteration_times"]): 
        
        mask, A = forward_prop_with_dropout(train_x, train_y, prams, M_train, L, 
                                            hyperprams["activation_function"], hyperprams["keep_prop"])
        
        grads = backward_prop_with_dropout(train_y, A, prams, M_train, L, hyperprams["activation_function"], 
                              mask, hyperprams["keep_prop"])
        
        gradient_upgrade(prams, grads, L, hyperprams["learning_rates"])
        
        if iteration % 10 == 0:
            train_accuracies.append(predict(train_x, train_y, prams, M_train, L, hyperprams["activation_function"]))
            test_accuracies.append(predict(test_x, test_y, prams, M_test, L, hyperprams["activation_function"]))
                    
    # Prediction
    print("Training set accuracy: " + str(np.round(train_accuracies[-1], 4)) + 
          " | Testing set accuracy: " + str(np.round(test_accuracies[-1], 4)))
    
    # Plot the learning curve
    plot_learning_curve(train_accuracies, test_accuracies, 
                        hyperprams["iteration_times"], hyperprams["learning_rates"], hyperprams["activation_function"])
                        
    # Plot the decision boundary
    plot_division_boundary(train_x, train_y, test_x, test_y, prams, M_test, L, hyperprams["activation_function"])
train_x, train_y, test_x, test_y = load_dataset()
hyperprams = {"num_of_units":[2, 32, 32, 16, 16, 4, 1], 
             "iteration_times":10000, "learning_rates":0.016, "activation_function":"relu", 
              "epsilon":1e-7, "keep_prop":[1, 0.9, 0.8, 1, 1, 1, 1]}
model(train_x, train_y, test_x, test_y, hyperprams)
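
The three cases below vary only the dropout strength. A simple way to reproduce them (sketched here for convenience; the run above uses keep_prop = [1, 0.9, 0.8, 1, 1, 1, 1]) is to re-run the model with each keep_prop list:

# Sketch: re-run the model with the three dropout settings discussed below.
for keep_prop in ([1, 1, 1, 1, 1, 1, 1],             # Case 1: no dropout
                  [1, 0.9, 1, 1, 1, 1, 1],            # Case 2: light dropout
                  [1, 0.6, 0.6, 0.7, 0.7, 0.8, 1]):   # Case 3: heavy dropout
    hyperprams["keep_prop"] = keep_prop
    model(train_x, train_y, test_x, test_y, hyperprams)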

Case 1: No dropout (keep_prop: [1, 1, 1, 1, 1, 1, 1])

The model shows a tendency to overfit.

Case 2: Light dropout (keep_prop: [1, 0.9, 1, 1, 1, 1, 1])

The overfitting is alleviated.

Case 3: Heavy dropout (keep_prop: [1, 0.6, 0.6, 0.7, 0.7, 0.8, 1])

No sign of overfitting, but a slight tendency to underfit.
