神经网络基本原理
前向传播
其中,σ(·)是激活函数。在简单的密集神经网络中,隐层一般使用 ReLU 线性整流函数或者 Sigmoid 函数;在输出层,如果分类任务是二分类,一般使用 Sigmoid 函数,如果是多分类,一般使用 Softmax 函数。这几种函数为:
损失函数
针对二分类任务,我们一般选择二元交叉熵损失函数
针对多分类任务,我们选择多元交叉熵损失函数
反向传播
先让损失函数对输出层未激活单元z求导得出最后一层的误差值:
随后通过链式求导得出每一层的误差:
然后就可以得出损失函数对权重矩阵及偏差向量的偏导数,然后就可以通过这个偏导数做梯度下降:
更多的数学推导可以参考这里
Python实现
以模仿tensorflow的风格用numpy实现了简单的推理框架,由于时间关系,在这份代码只实现了批量梯度下降,激活函数只有relu,sigmoid,softmax三种,正则化方式只有L2正则化。小批量梯度下降,其他激活函数,dropout等日后有时间再写写,虽然这样有点重复造轮子了,不过感觉自己写一下可以加深理解
import numpy as np
import scipy.io as sio
class Network():
    """
    A simple fully-connected neural network trained with full-batch
    gradient descent (a tensorflow-flavoured interface built on numpy).

    args:
        struct -- network structure, one [units, activation] pair per layer,
                  e.g. a two-layer sigmoid network: [[64, 'sigmoid'], [10, 'sigmoid']]
                  Supported activations: 'ReLu', 'sigmoid', 'softmax'
                  (softmax is only meaningful on the output layer).
        lamda -- learning rate, default 0.01
        alpha -- L2 regularization strength, default 0
    """
    def __init__(self, struct=None, lamda=0.01, alpha=0):
        self.X = None        # training samples, shape (samples, features)
        self.y = None        # training labels, shape (samples, classes)
        self.struct = struct
        self.lamda = lamda   # learning rate
        self.alpha = alpha   # L2 regularization strength
        self.Weights = None  # {'Theta': [...], 'b': [...]} after Weights_init

    def Weights_init(self):
        """
        Initialize all weights uniformly in (-1, 1).

        self.X must be set first: its feature count gives the input size.

        return:
            (none) -- weights are stored on self.Weights as
                      {'Theta': [per-layer matrices], 'b': [per-layer vectors]}
        """
        self.Weights = {'Theta': [], 'b': []}
        input_size = np.size(self.X, 1)
        for i in range(len(self.struct)):
            fan_in = input_size if i == 0 else self.struct[i - 1][0]
            fan_out = self.struct[i][0]
            # Difference of two independent U(0, 1) draws lies in (-1, 1).
            self.Weights['Theta'].append(
                np.random.rand(fan_in, fan_out) - np.random.rand(fan_in, fan_out))
            self.Weights['b'].append(
                np.random.rand(fan_out) - np.random.rand(fan_out))

    def ReLu(self, Z):
        """
        ReLU activation: max(Z, 0), element-wise.

        args:
            Z -- pre-activation values
        return:
            A -- activated values
        """
        return Z * (Z > 0)

    def sigmoid(self, Z):
        """
        Sigmoid activation: 1 / (1 + exp(-Z)), element-wise.

        args:
            Z -- pre-activation values
        return:
            A -- activated values
        """
        return 1 / (1 + np.exp(-Z))

    def softmax(self, Z):
        """
        Row-wise softmax activation.

        Subtracts the per-row maximum before exponentiating so that large
        logits do not overflow np.exp; the result is mathematically identical
        since softmax is shift-invariant.

        args:
            Z -- pre-activation values, shape (samples, units)
        return:
            A -- activated values, rows summing to 1
        """
        shifted = Z - np.max(Z, 1).reshape(-1, 1)
        temp = np.exp(shifted)
        return temp / np.sum(temp, 1).reshape(-1, 1)

    def forward(self, X):
        """
        Run one forward pass through the whole network.

        args:
            X -- samples, shape (samples, features)
        return:
            Layers -- list of per-layer activations (output layer included)
        """
        Layers = []
        A = X
        for i in range(len(self.struct)):
            Z = A @ self.Weights['Theta'][i] + self.Weights['b'][i]
            activation = self.struct[i][1]
            if activation == 'ReLu':
                A = self.ReLu(Z)
            elif activation == 'sigmoid':
                A = self.sigmoid(Z)
            elif activation == 'softmax':
                A = self.softmax(Z)
            Layers.append(A)
        return Layers

    def backward(self, Layers):
        """
        One backpropagation step: compute gradients and update the weights
        in place with a full-batch gradient-descent step.

        Assumes the output layer is sigmoid + binary cross-entropy or
        softmax + categorical cross-entropy, for which the output-layer
        error simplifies to (activation - label). Hidden layers must use
        'sigmoid' or 'ReLu'.

        args:
            Layers -- per-layer activations from forward()
        """
        m = np.size(self.X, 0)
        depth = len(self.struct)
        DELTA = Layers.copy()
        d_Theta = self.Weights['Theta'].copy()
        d_b = self.Weights['b'].copy()
        # Error terms, from the output layer back to the first hidden layer.
        for i in range(depth - 1, -1, -1):
            if i == depth - 1:
                delta = Layers[i] - self.y
            else:
                if self.struct[i][1] == 'sigmoid':
                    diff = Layers[i] * (1 - Layers[i])  # sigmoid'(z) = a * (1 - a)
                elif self.struct[i][1] == 'ReLu':
                    diff = Layers[i] > 0                # ReLU'(z) as a 0/1 mask
                delta = delta @ self.Weights['Theta'][i + 1].T * diff
            DELTA[i] = delta
        # Gradients, with L2 regularization on the weight matrices only
        # (biases are conventionally not regularized).
        for i in range(depth):
            prev_A = self.X if i == 0 else Layers[i - 1]
            d_Theta[i] = (prev_A.T @ DELTA[i] + self.alpha * self.Weights['Theta'][i]) / m
            d_b[i] = np.sum(DELTA[i], 0) / m
        # Gradient-descent update.
        for i in range(depth):
            self.Weights['Theta'][i] -= self.lamda * d_Theta[i]
            self.Weights['b'][i] -= self.lamda * d_b[i]

    def computeCost(self, y, out):
        """
        Cross-entropy cost with L2 regularization: binary cross-entropy when
        the output layer is sigmoid, categorical cross-entropy when softmax.

        args:
            y -- ground-truth labels
            out -- output-layer activations
        return:
            J -- scalar cost
        """
        m = np.size(out, 0)
        if self.struct[-1][1] == 'sigmoid':
            J = -np.sum(y * np.log(out) + (1 - y) * np.log(1 - out)) / m
        elif self.struct[-1][1] == 'softmax':
            J = np.sum(-y * np.log(out)) / m
        # L2 penalty: sum of SQUARED weights. (Bug fix: the original summed
        # the raw weights, which is not an L2 term, can be negative, and does
        # not match the alpha * Theta / m gradient used in backward().)
        s = 0
        for Theta in self.Weights['Theta']:
            s += np.sum(Theta ** 2)
        J = J + self.alpha * s / (2 * m)
        return J

    def predict(self, X):
        """
        Predict labels for the given samples.

        args:
            X -- samples, shape (samples, features)
        return:
            y -- uint8 predictions: thresholded at 0.5 for a single output
                 unit, one-hot argmax rows otherwise
        """
        out = self.forward(X)[-1]
        if np.size(out, 1) == 1:
            y = out > 0.5
        else:
            # Mark the per-row maximum as the predicted class.
            y = (out - np.max(out, 1).reshape(-1, 1)) == 0
        return y.astype('uint8')

    def evaluate(self, X, y):
        """
        Compute accuracy and loss on a labelled dataset.

        args:
            X -- dataset samples
            y -- dataset labels
        returns:
            acc -- fraction of rows whose prediction matches the label exactly
            loss -- cost on the dataset
        """
        m = np.size(X, 0)
        out = self.forward(X)[-1]
        loss = self.computeCost(y, out)
        predict = self.predict(X)
        acc = np.sum(np.sum(abs(predict - y), 1) == 0) / m
        return acc, loss

    def fit(self, X, y, val_data, epochs, history_per_epochs, eps=0):
        """
        Train the network with full-batch gradient descent.

        args:
            X -- samples, shape (samples, features)
            y -- labels, shape (samples, classes)
            val_data -- (val_X, val_y) tuple, or None for no validation set
            epochs -- number of training epochs
            history_per_epochs -- record/print metrics every this many epochs
            eps -- default 0 (disabled); if positive, stop as soon as the cost
                   changes by less than eps between two consecutive epochs
        return:
            history -- {'acc': [...], 'val_acc': [...], 'loss': [...], 'val_loss': [...]}
        """
        self.X = X
        self.y = y
        m_train = np.size(self.X, 0)
        self.Weights_init()
        history = {'acc': [], 'val_acc': [], 'loss': [], 'val_loss': []}
        tempJ = float('inf')
        epoch = 1
        while epoch <= epochs:
            Layers = self.forward(self.X)
            J = self.computeCost(self.y, Layers[-1])
            if abs(J - tempJ) < eps:
                print('epoch' + str(epoch) + '\t Stop Training')
                break
            tempJ = J
            self.backward(Layers)
            if epoch % history_per_epochs == 0:
                train_predict = self.predict(self.X)
                acc = np.sum(np.sum(abs(train_predict - self.y), 1) == 0) / m_train
                history['acc'].append(acc)
                history['loss'].append(J)
                if val_data is not None:
                    val_X, val_y = val_data
                    val_acc, val_loss = self.evaluate(val_X, val_y)
                    history['val_acc'].append(val_acc)
                    history['val_loss'].append(val_loss)
                    print('epoch ' + str(epoch) + '/' + str(epochs) + '\t' +
                          'acc:' + str(acc) + ' - ' + 'val_acc:' + str(val_acc) + ' - ' + 'loss:' + str(J) + ' - ' + 'val_loss:' + str(val_loss))
                else:
                    print('epoch ' + str(epoch) + '/' + str(epochs) + '\t' +
                          'acc:' + str(acc) + ' - ' + 'loss:' + str(J))
            epoch += 1
        return history

    def save_model(self, filename):
        """
        Save the model (structure + weights) to a .mat file.

        args:
            filename -- target file name; '.mat' is appended when missing.
                        (Bug fix: the original tested filename[-4:] == 'mat'
                        without the dot, so a name already ending in '.mat'
                        was written as 'name.mat.mat'.)
        """
        if not filename.endswith('.mat'):
            filename = filename + '.mat'
        layer_size = []
        layer_activation = []
        for layer in self.struct:
            layer_size.append(layer[0])
            layer_activation.append(layer[1])
        sio.savemat(filename, {'layer_size': layer_size, 'layer_activation': layer_activation,
                               'Theta': self.Weights['Theta'], 'b': self.Weights['b']})

    def load_model(self, filename):
        """
        Load a model previously written by save_model.

        args:
            filename -- .mat file name; '.mat' is appended when missing
        """
        if not filename.endswith('.mat'):
            filename = filename + '.mat'
        model = sio.loadmat(filename)
        layer_size = model['layer_size'].tolist()[0]
        layer_activation = model['layer_activation'].tolist()
        Theta = model['Theta']
        b = model['b']
        self.struct = []
        self.Weights = {'Theta': [], 'b': []}
        for i in range(len(layer_size)):
            # loadmat pads string arrays to a uniform length with spaces;
            # strip so activation names compare equal in forward().
            self.struct.append([layer_size[i], str(layer_activation[i]).strip()])
            self.Weights['Theta'].append(Theta[0][i])
            self.Weights['b'].append(b[0][i])
# No script behavior: this module is intended to be imported as a library.
if __name__ == "__main__":
    pass
框架使用
测试数据集以MNIST为例,所用MNIST数据集在Lecun的网站下载,数据集的获取方法见我的另一篇文章。
我们选择其中5000个样本,其中3200个作为训练集,800个作为验证集,1000个作为测试集
我们先导入相应的包以及对数据做处理及划分
import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
from NetWork import Network

# Raw string for the Windows path: in the original plain string, '\w' and
# '\M' only survive because they are invalid escapes; a path containing
# '\t' or '\n' would silently corrupt, and invalid escapes are deprecated.
mnist_path = r'D:\while(1)\MNIST\MNIST_data\MNIST.mat'
data = sio.loadmat(mnist_path)

# Take the first 5000 samples and one-hot encode the digit labels.
X = data['img_train'][0:5000]
label = data['label_train'][0:5000]
y = np.zeros((len(label), 10))
y[np.arange(len(label)), label.ravel()] = 1  # vectorized one-hot encoding

X = X / 255  # scale pixel values to [0, 1]

# Split: 3200 train / 800 validation / 1000 test.
seg_1 = 3200
seg_2 = 4000
train_X = X[0:seg_1, :]
train_y = y[0:seg_1]
val_X = X[seg_1:seg_2, :]
val_y = y[seg_1:seg_2, :]
test_X = X[seg_2:, :]
test_y = y[seg_2:]
然后我们初始化模型:
# Build a 3-layer network: two sigmoid hidden layers (256 and 128 units)
# and a 10-way softmax output; learning rate 0.01, no L2 regularization.
model = Network(
    [[256, 'sigmoid'], [128, 'sigmoid'], [10, 'softmax']],
    lamda=0.01, alpha=0
)
对模型进行训练,迭代10000次,不设置判断收敛的eps,每50次迭代监视一次模型训练情况
# Train for 10000 epochs, recording/printing metrics every 50 epochs;
# eps is left at its default, so no convergence-based early stop.
history = model.fit(train_X, train_y, (val_X, val_y), 10000, 50)
保存模型
# File name encodes architecture and hyper-parameters; save_model adds '.mat'.
model.save_model('mnist_model_s256_s128_lr1e-2_alpha0_e1e+4')
对测试集进行评估
# evaluate() returns (accuracy, loss); keep only the accuracy.
test_acc = model.evaluate(test_X, test_y)[0]
print('test set acc is: ' + str(test_acc))
获取模型训练时的监视数据并可视化:
# Pull the metrics recorded during training (one entry per recorded interval).
acc = history['acc']
val_acc = history['val_acc']
loss = history['loss']
val_loss = history['val_loss']
# x-axis: interval index, i.e. every history_per_epochs epochs.
epochs = range(1, len(acc)+1)
# Accuracy curves: blue dots for training, solid blue line for validation.
plt.plot(epochs, acc, 'bo', label='Training_acc')
plt.plot(epochs, val_acc, 'b', label='Val_acc')
plt.title('Training and Validation accuracy')
plt.legend()
plt.figure()
# Loss curves in a second figure, same style convention.
plt.plot(epochs, loss, 'bo', label='Training_loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
如果要直接导入模型,则模型的初始化方式为
# To reuse a saved model: create an empty Network, then load its weights.
model = Network()
model.load_model(filename)
# follow-up operations
测试结果
cmd的显示:
测试集精度为0.875
训练集和验证集精度变化曲线:
损失曲线: