I. Gradient Derivation
In this example the activation function is the sigmoid, g(x) = sigmoid(x), and the cost function is the logistic regression cost.
To keep the formulas simple, the partial derivatives are derived for a single training sample, and the network is assumed to have L layers. "⊙" denotes the element-wise product (numpy.multiply in Python).
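A quick illustration of the distinction between the element-wise product and the matrix (inner) product in numpy:

import numpy as np

a = np.array([1.0, 2.0])
b = np.array([3.0, 4.0])
print(np.multiply(a, b))  # element-wise: [3. 8.]
print(np.dot(a, b))       # inner product: 11.0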
[Figure: rough diagram of the network architecture]
The gradients are computed with the chain rule:
1. Gradient of the weights from the hidden layer to the output layer
2. Gradient of the weights from hidden layer to hidden layer (layer l-1)
3. Gradients of the remaining layers' weights (the sketch after this list summarizes the resulting formulas)
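In column-vector form for a single sample, the formulas these three steps yield, written to match the implementation in Section III (g is the sigmoid, ⊙ the element-wise product, and the bias column of Θ^(l) is excluded when propagating the error backwards):

\begin{aligned}
\delta^{(L)} &= a^{(L)} - y \\
\delta^{(l)} &= \left(\Theta^{(l)}\right)^{T} \delta^{(l+1)} \odot g'\left(z^{(l)}\right), \qquad g'(z) = g(z)\,\big(1 - g(z)\big) \\
\frac{\partial J}{\partial \Theta^{(l)}} &= \delta^{(l+1)} \left(a^{(l)}\right)^{T}
\end{aligned}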
II. Neural Network Training Steps
1. Randomly initialize the weights ------ Part 6 ------
2. Forward-propagate to obtain predictions for the training samples ------ Part 3 ------
3. Compute the cost function ------ Part 3 ------
4. Compute the backpropagation gradients ------ Part 7 ------
5. Use gradient checking to compare the numerical gradient with the backpropagation gradient ------ Part 7 ------
6. Use gradient descent or another optimization method to minimize the cost and obtain the parameters (see the sketch after this list) ------ Part 8 ------
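A minimal sketch of how these six steps map onto the functions implemented in Section III (all names come from the code below; lamda stands for the regularization strength, and X is assumed to already carry its bias column, as in Part 3):

t1 = rand_initialize_weights(input_layer_size + 1, hidden_layer_size)  # step 1
t2 = rand_initialize_weights(hidden_layer_size + 1, num_labels)
params = np.concatenate((t1.ravel(), t2.ravel()))
J = regularized_cost(params, X, y, lamda)       # steps 2-3: forward pass + cost
grad = regularized_grad(params, X, y, lamda)    # step 4: backpropagation
check_gradients(lamda)                          # step 5: gradient checking
res = opt.minimize(fun=regularized_cost, x0=params, args=(X, y, lamda),
                   method='TNC', jac=regularized_grad)  # step 6: optimize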
III. Code Implementation
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
import random
from scipy.io import loadmat
import scipy.optimize as opt
from sklearn.metrics import classification_report  # per-class evaluation report

def sigmoid(z):
    # logistic activation g(z) = 1 / (1 + e^(-z)); used by every function below
    return 1.0 / (1.0 + np.exp(-z))
input_layer_size = 400
hidden_layer_size = 25
num_labels = 10
data = loadmat('ex4data1.mat')
X = data['X']
y = data['y']
X.shape,y.shape
((5000, 400), (5000, 1))
# =========== Part 1: Loading and Visualizing Data =============
m = len(X)
rand_indices = random.sample(range(m), 100)  # pick 100 random examples to display
sel = X[rand_indices, :]
def displayData(X):
    # Tile the (100, 400) input into a grid: each row becomes a 20x20 pixel
    # patch (one digit), so 100 rows give 100 digits
    example_width = int(round(np.sqrt(X.shape[1])))
    [m, n] = X.shape
    example_height = int(n / example_width)
    display_rows = int(np.floor(np.sqrt(m)))
    display_cols = int(np.ceil(m / display_rows))
    pad = 1  # one-pixel border between patches
    display_array = pd.DataFrame(-np.ones([pad + display_rows * (example_height + pad),
                                           pad + display_cols * (example_width + pad)]))
    curr_ex = 0
    for j in range(display_rows):
        for i in range(display_cols):
            max_val = max(abs(X[curr_ex, :]))  # normalize each patch by its largest absolute value
            # DataFrame.iloc with two index arrays selects the full row x column block
            display_array.iloc[pad + j * (example_height + pad) + np.arange(example_height),
                               pad + i * (example_width + pad) + np.arange(example_width)] = \
                X[curr_ex, :].reshape(example_height, example_width).T / max_val  # numpy reshape is row-major, unlike MATLAB's, hence the transpose
            curr_ex += 1
    plt.imshow(display_array)
    plt.show()
displayData(sel)
# ================ Part 2: Loading Parameters ================
# Verify the cost computed from the pre-trained weights against the value given in the assignment, to confirm the cost function is written correctly
weight = loadmat('ex4weights.mat')
Theta1 = weight['Theta1']  # (25, 401): 25 hidden units, 401 = 400 inputs + 1 bias
Theta2 = weight['Theta2']  # (10, 26): 10 output units, 26 = 25 hidden units + 1 bias
nn_params = np.concatenate((Theta1.ravel(), Theta2.ravel()))  # unroll both matrices into a single 1-D parameter vector
# =============== Part 3: Compute Cost (Feedforward) ===============
The cost function:
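Written out to match the implementation below (the bias columns of each Θ are excluded from the regularization term):

J(\Theta) = -\frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K}\Big[ y_k^{(i)}\,\log\big(h_\Theta(x^{(i)})\big)_k + \big(1-y_k^{(i)}\big)\,\log\Big(1-\big(h_\Theta(x^{(i)})\big)_k\Big) \Big] + \frac{\lambda}{2m}\sum_{l=1}^{L-1}\sum_{i,j}\big(\Theta_{ji}^{(l)}\big)^2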
## To compute the cost, the labels must first be one-hot encoded (ry below)
def forward_propagation(X, y, Theta1, Theta2, num_labels):
    # X already carries the bias column, so a1 = X with shape (5000, 401)
    z2 = np.dot(X, Theta1.T)           # (5000, 25)
    a2 = sigmoid(z2)
    a2 = np.insert(a2, 0, 1, axis=1)   # add the bias unit -> (5000, 26)
    z3 = np.dot(a2, Theta2.T)          # (5000, 10)
    a3 = sigmoid(z3)                   # the hypothesis h(x)
    E = np.eye(num_labels)
    ry = E[y.ravel() - 1]              # one-hot labels, (5000, 10); label k maps to row index k-1
    return z2, a2, z3, a3, ry
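A quick illustration of the identity-matrix one-hot trick used for ry (toy labels, 3 classes):

E = np.eye(3)
labels = np.array([2, 3, 1])
print(E[labels - 1])
# [[0. 1. 0.]
#  [0. 0. 1.]
#  [1. 0. 0.]]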
def regularized_cost(nn_params, X, y, lamda, input_layer_size=input_layer_size,
                     hidden_layer_size=hidden_layer_size, num_labels=num_labels, m=m):
    # recover the two weight matrices from the unrolled parameter vector
    Theta1 = nn_params[:(input_layer_size + 1) * hidden_layer_size].reshape(hidden_layer_size, input_layer_size + 1)
    Theta2 = nn_params[(input_layer_size + 1) * hidden_layer_size:].reshape(num_labels, hidden_layer_size + 1)
    [z2, a2, z3, a3, ry] = forward_propagation(X, y, Theta1, Theta2, num_labels)
    cost = -(ry * np.log(a3) + (1 - ry) * np.log(1 - a3))
    # the bias columns (first column of each Theta) are not regularized
    J = cost.sum() / m + lamda / (2 * m) * (np.square(Theta1[:, 1:]).sum() + np.square(Theta2[:, 1:]).sum())
    return J
X = np.insert(X, 0, 1, axis=1)  # the cost function expects X with the bias column prepended
J = regularized_cost(nn_params, X, y, 0)
print('Cost at parameters (loaded from ex4weights): %s\n(this value should be about 0.287629)\n' % J)
# =============== Part 4: Implement Regularization ===============
J = regularized_cost(nn_params, X, y, 1)
print('Cost at parameters (loaded from ex4weights): %s\n(this value should be about 0.383770)\n' % J)
# ================ Part 5: Sigmoid Gradient ================
def sigmoid_gradient(z):
    return sigmoid(z) * (1 - sigmoid(z))
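A quick sanity check (the sigmoid gradient is largest at z = 0, where it equals exactly 0.25):

print(sigmoid_gradient(np.array([-1, -0.5, 0, 0.5, 1])))
# the middle value should be exactly 0.25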
With these basic helper functions in place, the parameters can be initialized.
# ================ Part 6: Initializing Parameters ================
def rand_initialize_weights(L_in, L_out):
    # break symmetry: sample uniformly from [-epsilon_init, epsilon_init]
    epsilon_init = 0.12
    weights = np.random.rand(L_out, L_in) * 2 * epsilon_init - epsilon_init
    return weights
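One common heuristic from the course notes scales the range with the layer sizes; for this architecture it comes out to roughly the 0.12 used above:

\epsilon_{init} = \frac{\sqrt{6}}{\sqrt{L_{in} + L_{out}}} \approx \frac{\sqrt{6}}{\sqrt{400 + 25}} \approx 0.12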
initial_theta1 = rand_initialize_weights(input_layer_size + 1, hidden_layer_size)
initial_theta2 = rand_initialize_weights(hidden_layer_size + 1, num_labels)
initial_nn_params = np.concatenate((initial_theta1.ravel(), initial_theta2.ravel()))  # keep it 1-D: scipy.optimize expects x0 of shape (n,)
Next comes the key part: implementing backpropagation.
# =============== Part 7: Implement Backpropagation ===============
def regularized_grad(nn_params, X, y, lamda, input_layer_size=input_layer_size,
                     hidden_layer_size=hidden_layer_size, num_labels=num_labels, m=m):
    Theta1 = nn_params[:(input_layer_size + 1) * hidden_layer_size].reshape(hidden_layer_size, input_layer_size + 1)
    Theta2 = nn_params[(input_layer_size + 1) * hidden_layer_size:].reshape(num_labels, hidden_layer_size + 1)
    [z2, a2, z3, a3, ry] = forward_propagation(X, y, Theta1, Theta2, num_labels)
    delta3 = a3 - ry                                          # (5000, 10)
    delta2 = delta3 @ Theta2[:, 1:] * sigmoid_gradient(z2)    # (5000, 25); bias column of Theta2 skipped
    Delta2 = delta3.T @ a2                                    # (10, 26)
    Delta1 = delta2.T @ X                                     # (25, 401)
    # regularize everything except the bias column (re-inserted as zeros)
    Theta2_grad = Delta2 / m + lamda / m * np.insert(Theta2[:, 1:], 0, 0, axis=1)
    Theta1_grad = Delta1 / m + lamda / m * np.insert(Theta1[:, 1:], 0, 0, axis=1)
    grad = np.concatenate((Theta1_grad.ravel(), Theta2_grad.ravel()))  # unroll, 1-D for scipy.optimize
    return grad
After implementing backpropagation, run gradient checking to verify the implementation:
for every parameter, the numerical gradient must be very close to the value produced by your gradient function.
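The numerical gradient uses the two-sided finite difference (e_i is the i-th unit vector; the code below uses ε = 10⁻⁴):

\frac{\partial J}{\partial \theta_i} \approx \frac{J(\theta + \varepsilon\, e_i) - J(\theta - \varepsilon\, e_i)}{2\varepsilon}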
def check_gradients(lamda):
    def debug_initial_weights(L_in, L_out):
        W = np.zeros((L_out, L_in))
        # deterministic "random" values; reshape(...).T mimics MATLAB's column-major reshape
        W = np.sin(range(1, W.size + 1)).reshape(W.shape[1], W.shape[0]).T / 10
        return W
    # build a small network to check against
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5
    initial_theta1 = debug_initial_weights(input_layer_size + 1, hidden_layer_size)
    initial_theta2 = debug_initial_weights(hidden_layer_size + 1, num_labels)
    X = debug_initial_weights(input_layer_size, m)
    y = np.array([x % num_labels + 1 for x in range(1, m + 1)]).reshape(-1, 1)
    nn_params = np.concatenate((initial_theta1.ravel(), initial_theta2.ravel()))
    # backpropagation gradient
    grad = regularized_grad(nn_params, np.insert(X, 0, 1, axis=1), y, lamda,
                            input_layer_size, hidden_layer_size, num_labels, m)
    # numerical gradient via two-sided differences
    e = 1e-4
    numeric_grad = np.zeros(len(nn_params))
    for i in range(len(nn_params)):
        p = np.zeros(len(nn_params))
        p[i] = e
        plus_loss = regularized_cost(nn_params + p, np.insert(X, 0, 1, axis=1), y, lamda,
                                     input_layer_size, hidden_layer_size, num_labels, m)
        minus_loss = regularized_cost(nn_params - p, np.insert(X, 0, 1, axis=1), y, lamda,
                                      input_layer_size, hidden_layer_size, num_labels, m)
        numeric_grad[i] = (plus_loss - minus_loss) / (2 * e)
    diff_grad = np.linalg.norm(grad - numeric_grad) / np.linalg.norm(grad + numeric_grad)
    print('If your backpropagation implementation is correct, then \n'
          'the relative difference will be small (less than 1e-9). \n'
          '\nRelative Difference: %s\n' % diff_grad)
check_gradients(0)
# =============== Part 8: Implement Regularization ===============
check_gradients(10)
# =================== Part 8: Training NN ===================
res = opt.minimize(fun=regularized_cost,
                   x0=initial_nn_params,
                   args=(X, y, 5),  # lamda = 5
                   method='TNC',
                   jac=regularized_grad,
                   options={'maxiter': 400})
last_theta1 = res.x[:initial_theta1.size].reshape(initial_theta1.shape)
last_theta2 = res.x[initial_theta1.size:].reshape(initial_theta2.shape)
print(last_theta1.shape, last_theta2.shape)
res
# ================= Part 9: Visualize Weights =================
displayData(last_theta1[:,1:])
# ================= Part 10: Implement Predict =================
def predict(X, y, theta1, theta2):
    # X already carries the bias column
    z2 = np.dot(X, theta1.T)
    a2 = sigmoid(z2)
    a2 = np.insert(a2, 0, 1, axis=1)
    z3 = np.dot(a2, theta2.T)
    a3 = sigmoid(z3)
    pos = np.argmax(a3, axis=1).reshape(-1, 1) + 1  # +1 because the labels run from 1 to 10
    accuracy = (y == pos).mean() * 100
    print(classification_report(y, pos))
    return accuracy, a3
accuracy, a3 = predict(X, y, last_theta1, last_theta2)
accuracy