I. Gradient Derivation
In this example the activation function is the sigmoid, g(x) = sigmoid(x), and the cost function is the logistic regression cost.
To keep the formulas simple, the partial derivatives are derived for a single training sample, and the network is assumed to have L layers. "⊙" denotes the element-wise product (numpy.multiply in Python).
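A quick illustration of the distinction between the element-wise product and the matrix (inner) product in numpy:

import numpy as np

a = np.array([1.0, 2.0])
b = np.array([3.0, 4.0])
print(np.multiply(a, b))  # element-wise: [3. 8.]
print(np.dot(a, b))       # inner product: 11.0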
[Figure: rough diagram of the network architecture]
The gradients are computed with the chain rule:
1. Gradient of the weights from the hidden layer to the output layer
2. Gradient of the weights from hidden layer to hidden layer (layer l-1)
3. Gradients of the remaining layers' weights (the sketch after this list summarizes the resulting formulas)
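In column-vector form for a single sample, the formulas these three steps yield, written to match the implementation in Section III (g is the sigmoid, ⊙ the element-wise product, and the bias column of Θ^(l) is excluded when propagating the error backwards):

\begin{aligned}
\delta^{(L)} &= a^{(L)} - y \\
\delta^{(l)} &= \left(\Theta^{(l)}\right)^{T} \delta^{(l+1)} \odot g'\left(z^{(l)}\right), \qquad g'(z) = g(z)\,\big(1 - g(z)\big) \\
\frac{\partial J}{\partial \Theta^{(l)}} &= \delta^{(l+1)} \left(a^{(l)}\right)^{T}
\end{aligned}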
II. Neural Network Training Steps
1. Randomly initialize the weights ------ Part 6 ------
2. Forward-propagate to obtain predictions for the training samples ------ Part 3 ------
3. Compute the cost function ------ Part 3 ------
4. Compute the backpropagation gradients ------ Part 7 ------
5. Use gradient checking to compare the numerical gradient with the backpropagation gradient ------ Part 7 ------
6. Use gradient descent or another optimization method to minimize the cost and obtain the parameters (see the sketch after this list) ------ Part 8 ------
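A minimal sketch of how these six steps map onto the functions implemented in Section III (all names come from the code below; lamda stands for the regularization strength, and X is assumed to already carry its bias column, as in Part 3):

t1 = rand_initialize_weights(input_layer_size + 1, hidden_layer_size)  # step 1
t2 = rand_initialize_weights(hidden_layer_size + 1, num_labels)
params = np.concatenate((t1.ravel(), t2.ravel()))
J = regularized_cost(params, X, y, lamda)       # steps 2-3: forward pass + cost
grad = regularized_grad(params, X, y, lamda)    # step 4: backpropagation
check_gradients(lamda)                          # step 5: gradient checking
res = opt.minimize(fun=regularized_cost, x0=params, args=(X, y, lamda),
                   method='TNC', jac=regularized_grad)  # step 6: optimize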
III. Code Implementation
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import random
from scipy.io import loadmat
import scipy.optimize as opt
from sklearn.metrics import classification_report  # per-class evaluation report

def sigmoid(z):
    # logistic activation g(z) = 1 / (1 + e^(-z)); used by every function below
    return 1.0 / (1.0 + np.exp(-z))
input_layer_size = 400
hidden_layer_size = 25
num_labels = 10
data = loadmat('ex4data1.mat')
X = data['X']
y = data['y']
X.shape,y.shape
((5000, 400), (5000, 1))
# =========== Part 1: Loading and Visualizing Data =============
m = len(X)
rand_indices = random.sample(range(m), 100)  # pick 100 random examples to display
sel = X[rand_indices, :]
def displayData(X):
    # Tile the (100, 400) input into a grid: each row becomes a 20x20 pixel
    # patch (one digit), so 100 rows give 100 digits
    example_width = int(round(np.sqrt(X.shape[1])))
    [m, n] = X.shape
    example_height = int(n / example_width)
    display_rows = int(np.floor(np.sqrt(m)))
    display_cols = int(np.ceil(m / display_rows))
    pad = 1  # one-pixel border between patches
    display_array = pd.DataFrame(-np.ones([pad + display_rows * (example_height + pad),
                                           pad + display_cols * (example_width + pad)]))
    curr_ex = 0
    for j in range(display_rows):
        for i in range(display_cols):
            max_val = max(abs(X[curr_ex, :]))  # normalize each patch by its largest absolute value
            # DataFrame.iloc with two index arrays selects the full row x column block
            display_array.iloc[pad + j * (example_height + pad) + np.arange(example_height),
                               pad + i * (example_width + pad) + np.arange(example_width)] = \
                X[curr_ex, :].reshape(example_height, example_width).T / max_val  # numpy reshape is row-major, unlike MATLAB's, hence the transpose
            curr_ex += 1
    plt.imshow(display_array)
    plt.show()
displayData(sel)
# ================ Part 2: Loading Parameters ================
# Verify the cost computed from the pre-trained weights against the value given in the assignment, to confirm the cost function is written correctly
weight = loadmat('ex4weights.mat')
Theta1 = weight['Theta1']  # (25, 401): 25 hidden units, 401 = 400 inputs + 1 bias
Theta2 = weight['Theta2']  # (10, 26): 10 output units, 26 = 25 hidden units + 1 bias
nn_params = np.concatenate((Theta1.ravel(), Theta2.ravel()))  # unroll both matrices into a single 1-D parameter vector
# =============== Part 3: Compute Cost (Feedforward) ===============
The cost function:
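Written out to match the implementation below (the bias columns of each Θ are excluded from the regularization term):

J(\Theta) = -\frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K}\Big[ y_k^{(i)}\,\log\big(h_\Theta(x^{(i)})\big)_k + \big(1-y_k^{(i)}\big)\,\log\Big(1-\big(h_\Theta(x^{(i)})\big)_k\Big) \Big] + \frac{\lambda}{2m}\sum_{l=1}^{L-1}\sum_{i,j}\big(\Theta_{ji}^{(l)}\big)^2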
## To compute the cost, the labels must first be one-hot encoded (ry below)
def forward_propagation(X, y, Theta1, Theta2, num_labels):
    # X already carries the bias column, so a1 = X with shape (5000, 401)
    z2 = np.dot(X, Theta1.T)           # (5000, 25)
    a2 = sigmoid(z2)
    a2 = np.insert(a2, 0, 1, axis=1)   # add the bias unit -> (5000, 26)
    z3 = np.dot(a2, Theta2.T)          # (5000, 10)
    a3 = sigmoid(z3)                   # the hypothesis h(x)
    E = np.eye(num_labels)
    ry = E[y.ravel() - 1]              # one-hot labels, (5000, 10); label k maps to row index k-1
    return z2, a2, z3, a3, ry
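A quick illustration of the identity-matrix one-hot trick used for ry (toy labels, 3 classes):

E = np.eye(3)
labels = np.array([2, 3, 1])
print(E[labels - 1])
# [[0. 1. 0.]
#  [0. 0. 1.]
#  [1. 0. 0.]]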
def regularized_cost(nn_params, X, y, lamda, input_layer_size=input_layer_size,
                     hidden_layer_size=hidden_layer_size, num_labels=num_labels, m=m):
    # recover the two weight matrices from the unrolled parameter vector
    Theta1 = nn_params[:(input_layer_size + 1) * hidden_layer_size].reshape(hidden_layer_size, input_layer_size + 1)
    Theta2 = nn_params[(input_layer_size + 1) * hidden_layer_size:].reshape(num_labels, hidden_layer_size + 1)
    [z2, a2, z3, a3, ry] = forward_propagation(X, y, Theta1, Theta2, num_labels)
    cost = -(ry * np.log(a3) + (1 - ry) * np.log(1 - a3))
    # the bias columns (first column of each Theta) are not regularized
    J = cost.sum() / m + lamda / (2 * m) * (np.square(Theta1[:, 1:]).sum() + np.square(Theta2[:, 1:]).sum())
    return J
X = np.insert(X, 0, 1, axis=1)  # the cost function expects X with the bias column prepended
J = regularized_cost(nn_params, X, y, 0)
print('Cost at parameters (loaded from ex4weights): %s\n(this value should be about 0.287629)\n' % J)
# =============== Part 4: Implement Regularization ===============
J = regularized_cost(nn_params, X, y, 1)
print('Cost at parameters (loaded from ex4weights): %s\n(this value should be about 0.383770)\n' % J)
# ================ Part 5: Sigmoid Gradient ================
def sigmoid_gradient(z):
    return sigmoid(z) * (1 - sigmoid(z))
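A quick sanity check (the sigmoid gradient is largest at z = 0, where it equals exactly 0.25):

print(sigmoid_gradient(np.array([-1, -0.5, 0, 0.5, 1])))
# the middle value should be exactly 0.25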
With these basic helper functions in place, the parameters can be initialized.
# ================ Part 6: Initializing Parameters ================
def rand_initialize_weights(L_in, L_out):
    # break symmetry: sample uniformly from [-epsilon_init, epsilon_init]
    epsilon_init = 0.12
    weights = np.random.rand(L_out, L_in) * 2 * epsilon_init - epsilon_init
    return weights
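One common heuristic from the course notes scales the range with the layer sizes; for this architecture it comes out to roughly the 0.12 used above:

\epsilon_{init} = \frac{\sqrt{6}}{\sqrt{L_{in} + L_{out}}} \approx \frac{\sqrt{6}}{\sqrt{400 + 25}} \approx 0.12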
initial_theta1 = rand_initialize_weights(input_layer_size + 1, hidden_layer_size)
initial_theta2 = rand_initialize_weights(hidden_layer_size + 1, num_labels)
initial_nn_params = np.concatenate((initial_theta1.ravel(), initial_theta2.ravel()))  # keep it 1-D: scipy.optimize expects x0 of shape (n,)
Next comes the key part: implementing backpropagation.
# =============== Part 7: Implement Backpropagation ===============
def regularized_grad(nn_params, X, y, lamda, input_layer_size=input_layer_size,
                     hidden_layer_size=hidden_layer_size, num_labels=num_labels, m=m):
    Theta1 = nn_params[:(input_layer_size + 1) * hidden_layer_size].reshape(hidden_layer_size, input_layer_size + 1)
    Theta2 = nn_params[(input_layer_size + 1) * hidden_layer_size:].reshape(num_labels, hidden_layer_size + 1)
    [z2, a2, z3, a3, ry] = forward_propagation(X, y, Theta1, Theta2, num_labels)
    delta3 = a3 - ry                                          # (5000, 10)
    delta2 = delta3 @ Theta2[:, 1:] * sigmoid_gradient(z2)    # (5000, 25); bias column of Theta2 skipped
    Delta2 = delta3.T @ a2                                    # (10, 26)
    Delta1 = delta2.T @ X                                     # (25, 401)
    # regularize everything except the bias column (re-inserted as zeros)
    Theta2_grad = Delta2 / m + lamda / m * np.insert(Theta2[:, 1:], 0, 0, axis=1)
    Theta1_grad = Delta1 / m + lamda / m * np.insert(Theta1[:, 1:], 0, 0, axis=1)
    grad = np.concatenate((Theta1_grad.ravel(), Theta2_grad.ravel()))  # unroll, 1-D for scipy.optimize
    return grad
After implementing backpropagation, run gradient checking to verify the implementation:
for every parameter, the numerical gradient must be very close to the value produced by your gradient function.
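The numerical gradient uses the two-sided finite difference (e_i is the i-th unit vector; the code below uses ε = 10⁻⁴):

\frac{\partial J}{\partial \theta_i} \approx \frac{J(\theta + \varepsilon\, e_i) - J(\theta - \varepsilon\, e_i)}{2\varepsilon}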
def check_gradients(lamda):
    def debug_initial_weights(L_in, L_out):
        W = np.zeros((L_out, L_in))
        # deterministic "random" values; reshape(...).T mimics MATLAB's column-major reshape
        W = np.sin(range(1, W.size + 1)).reshape(W.shape[1], W.shape[0]).T / 10
        return W
    # build a small network to check against
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5
    initial_theta1 = debug_initial_weights(input_layer_size + 1, hidden_layer_size)
    initial_theta2 = debug_initial_weights(hidden_layer_size + 1, num_labels)
    X = debug_initial_weights(input_layer_size, m)
    y = np.array([x % num_labels + 1 for x in range(1, m + 1)]).reshape(-1, 1)
    nn_params = np.concatenate((initial_theta1.ravel(), initial_theta2.ravel()))
    # backpropagation gradient
    grad = regularized_grad(nn_params, np.insert(X, 0, 1, axis=1), y, lamda,
                            input_layer_size, hidden_layer_size, num_labels, m)
    # numerical gradient via two-sided differences
    e = 1e-4
    numeric_grad = np.zeros(len(nn_params))
    for i in range(len(nn_params)):
        p = np.zeros(len(nn_params))
        p[i] = e
        plus_loss = regularized_cost(nn_params + p, np.insert(X, 0, 1, axis=1), y, lamda,
                                     input_layer_size, hidden_layer_size, num_labels, m)
        minus_loss = regularized_cost(nn_params - p, np.insert(X, 0, 1, axis=1), y, lamda,
                                      input_layer_size, hidden_layer_size, num_labels, m)
        numeric_grad[i] = (plus_loss - minus_loss) / (2 * e)
    diff_grad = np.linalg.norm(grad - numeric_grad) / np.linalg.norm(grad + numeric_grad)
    print('If your backpropagation implementation is correct, then \n'
          'the relative difference will be small (less than 1e-9). \n'
          '\nRelative Difference: %s\n' % diff_grad)
check_gradients(0)
# =============== Part 8: Implement Regularization ===============
check_gradients(10)
# =================== Part 8: Training NN ===================
res = opt.minimize(fun=regularized_cost,
                   x0=initial_nn_params,
                   args=(X, y, 5),  # lamda = 5
                   method='TNC',
                   jac=regularized_grad,
                   options={'maxiter': 400})
last_theta1 = res.x[:initial_theta1.size].reshape(initial_theta1.shape)
last_theta2 = res.x[initial_theta1.size:].reshape(initial_theta2.shape)
print(last_theta1.shape, last_theta2.shape)
res
# ================= Part 9: Visualize Weights =================
displayData(last_theta1[:,1:])
# ================= Part 10: Implement Predict =================
def predict(X, y, theta1, theta2):
    # X already carries the bias column
    z2 = np.dot(X, theta1.T)
    a2 = sigmoid(z2)
    a2 = np.insert(a2, 0, 1, axis=1)
    z3 = np.dot(a2, theta2.T)
    a3 = sigmoid(z3)
    pos = np.argmax(a3, axis=1).reshape(-1, 1) + 1  # +1 because the labels run from 1 to 10
    accuracy = (y == pos).mean() * 100
    print(classification_report(y, pos))
    return accuracy, a3
accuracy, a3 = predict(X, y, last_theta1, last_theta2)
accuracy