Feedforward neural network:
Dataset: ex4data1.mat, 5000 handwritten-digit images, each a flattened 20×20 grayscale image (MATLAB .mat format).
Source code:
import scipy.io as sio
import numpy as np
from sklearn.metrics import classification_report
import scipy.optimize as opt
Loading the data:
Because the data comes from a MATLAB file, each image in x is stored column-major and needs to be transposed.
def load_data(file):
    data = sio.loadmat(file)
    x = data.get('X')
    y = data.get('y')
    y = y.reshape(y.shape[0])  # (5000,)
    # transpose each image: (400,) --> (20, 20) --> (20, 20).T --> (400,)
    sqr = int(np.sqrt(x.shape[1]))  # 20
    x = np.array([im.reshape((sqr, sqr)).T for im in x])
    x = np.array([im.reshape(sqr * sqr) for im in x])
    # prepend a column of ones (bias term): shape = (5000, 401)
    x = np.insert(x, 0, np.ones(x.shape[0]), axis=1)
    return x, y

x, y = load_data('ex4data1.mat')  # (5000, 401) (5000,)
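To check that the transpose step is correct, one can render a sample image; a minimal sketch assuming matplotlib is installed (it is not used elsewhere in this post):

import matplotlib.pyplot as plt

# drop the bias column, restore one sample to 20x20, and display it
sample = x[0, 1:].reshape(20, 20)
plt.imshow(sample, cmap='gray')
plt.title('label: %d' % y[0])
plt.show()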
Convert the labels y to one-hot form. Note that in this dataset the label 10 stands for the digit 0, so label i maps to index i - 1.
def y_expand(y):
    # expand y into one-hot vectors: label i --> index i - 1
    res = []
    for i in y:
        y_array = np.zeros(10)
        y_array[i - 1] = 1
        res.append(y_array)
    return np.array(res)

y_onehot = y_expand(y)  # (5000, 10)
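As a sketch, an equivalent vectorized encoding can replace the Python loop with broadcasting (y_expand_vectorized is illustrative, not part of the original code):

def y_expand_vectorized(y):
    # compare each label against 1..10; label i lights up column i - 1
    return (y.reshape(-1, 1) == np.arange(1, 11)).astype(float)

assert np.array_equal(y_expand_vectorized(y), y_onehot)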
Define a function that flattens the weight matrices into a single vector, and its inverse that restores the matrices from the vector (most scipy optimizers expect a flat parameter vector).
def serialize(a, b):
    return np.concatenate((np.ravel(a), np.ravel(b)))

def deserialize(seq):
    """Split the flat vector back into ndarrays of shape (25, 401) and (10, 26)."""
    return seq[:25 * 401].reshape(25, 401), seq[25 * 401:].reshape(10, 26)
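A quick round-trip check confirms the two functions are exact inverses for these fixed shapes:

a = np.random.rand(25, 401)
b = np.random.rand(10, 26)
a2, b2 = deserialize(serialize(a, b))
assert np.array_equal(a, a2) and np.array_equal(b, b2)  # lossless round trip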
Define the sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
Define the forward-propagation network
Propagation formulas:
$z^{(l)} = W^{(l)} \cdot a^{(l-1)} + b^{(l)}$
$a^{(l)} = f_l(z^{(l)})$
Treating x as a0, compute the first layer's net input z1 and output a1, then the second layer's net input z2 and output h; h is the output of the whole network. In the code, the bias b of each layer is folded into the first column of the weight matrix via the inserted column of ones.
w1.shape = (25, 401)
w2.shape = (10, 26)
def forward(ws, x):
    w1, w2 = deserialize(ws)  # (25, 401), (10, 26)
    m = x.shape[0]
    a0 = x  # (5000, 401)
    # layer 1: net input z1, output a1 (with a bias column prepended)
    z1 = a0 @ w1.T  # (5000, 25)
    a1 = np.insert(sigmoid(z1), 0, np.ones(m), axis=1)  # (5000, 26)
    # layer 2: net input z2, output h
    z2 = a1 @ w2.T  # (5000, 10)
    # h is the network's output
    h = sigmoid(z2)  # (5000, 10)
    return z1, z2, a0, a1, h
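A shape trace with throwaway random weights (ws_demo is illustrative) confirms each step:

ws_demo = np.random.uniform(-0.12, 0.12, 25 * 401 + 10 * 26)
z1, z2, a0, a1, h = forward(ws_demo, x)
print(z1.shape, a1.shape, z2.shape, h.shape)
# (5000, 25) (5000, 26) (5000, 10) (5000, 10)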
Computing the loss function
Note: do not confuse matrix multiplication ('@', np.matmul) with element-wise multiplication ('*', np.multiply). In this cost function, writing a * b produced cost = nan, while np.multiply(a, b) did not.
On the different meanings of '*' and np.multiply for arrays versus matrices:
for ndarrays, '*' and np.multiply are both element-wise multiplication;
for np.matrix objects, '*' is the mathematical matrix product, while np.multiply is still element-wise.
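A minimal demo of how '*' changes meaning between the two types (np.matrix is deprecated in modern NumPy, one more reason to be explicit with '@' and np.multiply):

a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6], [7, 8]])
print(a * b)                # element-wise: [[ 5 12] [21 32]]
print(a @ b)                # matrix product: [[19 22] [43 50]]
am, bm = np.matrix(a), np.matrix(b)
print(am * bm)              # for np.matrix, '*' is the matrix product
print(np.multiply(am, bm))  # np.multiply stays element-wise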
def cost(ws, x, y):
    # y here is the one-hot matrix y_onehot
    _, _, _, _, h = forward(ws, x)
    m = x.shape[0]
    # cross-entropy loss: cost = -label*log(pred) - (1-label)*log(1-pred)
    cost = -np.multiply(y, np.log(h)) - np.multiply((1 - y), np.log(1 - h))  # (5000, 10)
    cost = np.sum(cost) / m  # scalar
    return cost
Computing the regularized loss function
$regularized\_cost = cost + \frac{\lambda}{2n} \sum w_{1,ij}^2 + \frac{\lambda}{2n} \sum w_{2,ij}^2$
where λ is the regularization coefficient and n is the number of training examples.
Note: the first column of w1 and w2 holds the bias terms, which are not regularized.
def regularized_cost(ws, x, y, l=1):
    # regularization skips the first (bias) column of w1 and w2;
    # the factor lambda/(2n) matches the formula above and makes the
    # regularized gradient below its exact derivative
    w1, w2 = deserialize(ws)  # (25, 401), (10, 26)
    reg_t1 = (l / (2 * x.shape[0])) * np.power(w1[:, 1:], 2).sum()
    reg_t2 = (l / (2 * x.shape[0])) * np.power(w2[:, 1:], 2).sum()
    return cost(ws, x, y) + reg_t1 + reg_t2
Computing the gradient of the sigmoid function: $f'(z) = f(z)(1 - f(z))$
def sigmoid_gradient(z):
    return np.multiply(sigmoid(z), 1 - sigmoid(z))
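A quick sanity check: the derivative peaks at z = 0 with value 0.25 and vanishes for large |z|:

print(sigmoid_gradient(0))                          # 0.25, the maximum
print(sigmoid_gradient(np.array([-10., 0., 10.])))  # [~0, 0.25, ~0]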
Computing the gradient: the backpropagation algorithm
First compute the error term δ of each layer:
for the output layer, $\delta^{(L)} = h - y$
propagating backwards, $\delta^{(l)} = f'(z^{(l)}) \odot \left( (w^{(l+1)})^T \cdot \delta^{(l+1)} \right)$
where ⊙ denotes element-wise multiplication.
Then compute the gradient of each layer's weights:
$grad^{(l)} = \delta^{(l)} (a^{(l-1)})^T$
def gradient(ws, x, y):
    z1, z2, a0, a1, h = forward(ws, x)
    # prepend a bias column so z1 lines up with the 26 columns of a1 and w2
    z1 = np.insert(z1, 0, np.ones(1), axis=1)
    w1, w2 = deserialize(ws)  # (25, 401), (10, 26)
    grad1 = np.zeros(w1.shape)
    grad2 = np.zeros(w2.shape)
    for i in range(x.shape[0]):
        z1i = z1[i, :]                 # (26,)
        a1i = a1[i, :].reshape(1, -1)  # (1, 26)
        a0i = a0[i, :].reshape(1, -1)  # (1, 401)
        # error term of each layer, computed backwards
        d2i = h[i] - y[i]                           # (10,)
        d1i = sigmoid_gradient(z1i) * (w2.T @ d2i)  # (26,)
        d2i = d2i.reshape(1, -1)  # (1, 10)
        d1i = d1i.reshape(1, -1)  # (1, 26)
        # gradient of each layer's weights, same shape as the weights;
        # the bias component of d1i is dropped
        grad2 += d2i.T @ a1i         # (10, 26)
        grad1 += d1i[:, 1:].T @ a0i  # (25, 401)
    grad1 = grad1 / x.shape[0]
    grad2 = grad2 / x.shape[0]
    return serialize(grad1, grad2)
Computing the regularized gradient
$regularized\_grad = grad + \frac{\lambda}{n} w$ (with the bias column of each w excluded)
def regularized_gradient(ws, x, y, l=1):
    w1, w2 = deserialize(ws)  # (25, 401), (10, 26)
    grads = gradient(ws, x, y)
    grad1, grad2 = deserialize(grads)
    # copy before zeroing the bias columns: deserialize returns views of ws,
    # and the optimizer's parameter vector must not be modified in place
    w1, w2 = w1.copy(), w2.copy()
    w1[:, 0] = 0
    w2[:, 0] = 0
    grad1 = grad1 + (l / x.shape[0]) * w1
    grad2 = grad2 + (l / x.shape[0]) * w2
    # flatten and concatenate back into shape (10285,)
    return serialize(grad1, grad2)
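To verify the backpropagation code, the analytic gradient can be compared against a central finite-difference approximation on a few randomly chosen components. gradient_check below is a hypothetical helper, not part of the original exercise; each numeric probe costs two forward passes over the data, so running it on a small slice such as x[:100], y_onehot[:100] keeps it fast:

def gradient_check(ws, x, y, epsilon=1e-4, num_checks=5):
    # compare the analytic gradient with (J(w+e) - J(w-e)) / (2e)
    analytic = regularized_gradient(ws, x, y)
    for i in np.random.choice(len(ws), num_checks, replace=False):
        ws_plus, ws_minus = ws.copy(), ws.copy()
        ws_plus[i] += epsilon
        ws_minus[i] -= epsilon
        numeric = (regularized_cost(ws_plus, x, y)
                   - regularized_cost(ws_minus, x, y)) / (2 * epsilon)
        print(i, analytic[i], numeric)  # the two values should agree closely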
Define the dimensions of each layer
input_dim = x.shape[1] # 401
h_dim = 25
output_dim = 10
Define the random initialization function
def random_init(size):
    return np.random.uniform(-0.12, 0.12, size)
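Initializing every weight to zero would make all hidden units compute the same function, so small random values are used to break the symmetry. The bound 0.12 is close to a common heuristic, ε = √6 / √(L_in + L_out); a sketch (init_epsilon is illustrative, not part of the original code):

def init_epsilon(l_in, l_out):
    # heuristic bound for uniform random weight initialization
    return np.sqrt(6) / np.sqrt(l_in + l_out)

print(init_epsilon(400, 25))  # ~0.118, close to the 0.12 used above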
Training the model: optimization with scipy.optimize.minimize
def train(x, y):
    # parameter initialization
    # w1.shape = (25, 401)
    # w2.shape = (10, 26)
    size = input_dim * h_dim + (h_dim + 1) * output_dim
    ws = random_init(size)  # (10285,)
    res = opt.minimize(fun=regularized_cost,
                       x0=ws,
                       args=(x, y, 1),
                       method='TNC',
                       jac=regularized_gradient,
                       options={'maxiter': 250,
                                'disp': True})
    return res
Model evaluation: sklearn.metrics.classification_report()
It reports:
precision: $P = a / (a + c)$, measuring how useful the predictions are
recall: $R = a / (a + d)$, measuring how complete they are
f1-score: the harmonic mean of precision and recall, $F_1 = 2PR / (P + R)$; the best value is 1
support: the number of true instances of each class
where, in terms of the confusion matrix (a worked example follows the list):
true positives a (predicted positive, actually positive), e.g. 20
true negatives b (predicted negative, actually negative)
false positives c (predicted positive, actually negative), e.g. 10
false negatives d (predicted negative, actually positive), e.g. 40
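With the example counts above (a = 20, c = 10, d = 40): $P = 20/30 ≈ 0.67$, $R = 20/60 ≈ 0.33$, and $F_1 = 2 \cdot 0.67 \cdot 0.33 / (0.67 + 0.33) ≈ 0.44$.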
def evaluate(ws, x, y):
    _, _, _, _, h = forward(ws, x)
    # argmax returns the index of the largest probability; column i of h
    # corresponds to label i + 1 (in this dataset, label 10 stands for digit 0)
    pred = np.argmax(h, axis=1) + 1  # (5000,)
    m = x.shape[0]
    correct = [1 if a == b else 0 for (a, b) in zip(pred, y)]
    acc = float(np.sum(correct) / m)
    print('accuracy', acc)
    print(classification_report(y, pred))
res = train(x, y_onehot)
final_ws = res.x
evaluate(final_ws, x, y)
accuracy 0.9506
              precision    recall  f1-score   support

           1       0.99      0.98      0.98       500
           2       0.98      0.96      0.97       500
           3       0.95      0.96      0.96       500
           4       0.87      1.00      0.93       500
           5       1.00      0.79      0.88       500
           6       0.94      0.99      0.97       500
           7       0.97      0.98      0.97       500
           8       0.96      0.96      0.96       500
           9       0.99      0.89      0.94       500
          10       0.89      1.00      0.94       500

    accuracy                           0.95      5000
   macro avg       0.95      0.95      0.95      5000
weighted avg       0.95      0.95      0.95      5000