用代码简单理解一下神经网络。
- 前馈神经网络是一种最简单的神经网络(MLP, 多层感知器),各个神经元分层排列,每个神经元只与前一层的神经元相连,接收前一层的输出,并输出给下一层,各层间没有反馈。它适于处理复杂的非线性分类情况,相比线性回归,logistic回归,提高灵活性的同时,又不太会有过拟合。
简单介绍一下神经网络的计算方式
- 假设有一个三层的神经网络结构:x = input(4) -> a2 = hidden(3) ->a3 = output(2)
假设有:
x = [
x11,
x21,
x31,
x41
]
假设有矩阵和向量,w1, b1, w2, b2, 而且每个层使用sigmoid激活函数激活,则有
z1 = w1.dot(x) + b1
a2 = sigmoid(z1)
z2 = w2.dot(a2) + b2
a3 = sigmoid(z2)
更多的层数,堆叠即可
损失函数使用交叉熵公式,令py = a3
loss = np.sum(-y * log(py) - (1-y)*log(1-py))
- 链式求导,BP过程,这个过程后续可以单开一个文章手推一遍,供大家参考理解。如果有后续,会在这里加链接。
loss’(py) = (py-y)/(py*(1-py)) = (a3-y)/(a3*(1-a3)) #这个求导过程可自行推导
a3’(z2) = a3*(1-a3)
loss’(z2) = loss’(a3) * a3’(z2) = (a3-y)/(a3*(1-a3)) * a3*(1-a3) = a3 - y
z2’(b2) = [1,
1]
loss’(b2) = loss’(z2) * z2’(b2) = loss’(z2) = a3 - y
loss’(a2) = w2.T.dot(loss’(z2))
loss’(w2) = loss’(z2).dot(a2.T)
a2’(z1) = a2 * (1-a2)
loss’(z1) = loss’(a2) * a2’(z1)  (逐元素相乘)
loss’(b1) = loss’(z1)
loss’(w1) = loss’(z1).dot(x.T)
- 完整代码如下:
import numpy as np
# 手写实现固定三层的神经网络结构:input(4)->hidden(3)->output(2)
# Logistic sigmoid activation, shared by every layer of the network.
def sigmoid(z):
    """Return 1 / (1 + e^(-z)), applied element-wise."""
    denom = 1 + np.exp(-z)
    return 1 / denom
# Forward pass through the fixed three-layer net: input -> hidden -> output.
def predict(x, w1, b1, w2, b2):
    """Run a forward pass; return (a3, z2, a2, z1) so the caller can back-prop."""
    z1 = np.dot(w1, x) + b1   # hidden-layer pre-activation
    a2 = sigmoid(z1)          # hidden-layer activation
    z2 = np.dot(w2, a2) + b2  # output-layer pre-activation
    a3 = sigmoid(z2)          # output-layer activation
    return a3, z2, a2, z1
# Loss of a single sample.
def loss(x, w1, b1, w2, b2, y):
    """Summed binary cross-entropy between the network output and one-hot y."""
    py, _z2, _a2, _z1 = predict(x, w1, b1, w2, b2)
    ce = -(y * np.log(py) + (1 - y) * np.log(1 - py))
    return np.sum(ce)
# Back-propagation: gradients of the loss w.r.t. w1, b1, w2, b2 for one sample.
def gradient(x, w1, b1, w2, b2, y):
    """Back-propagate one sample; return (dL/dw1, dL/db1, dL/dw2, dL/db2).

    Relies on the closed-form simplification dL/dz2 = a3 - y, which holds
    for a sigmoid output layer combined with cross-entropy loss.

    FIX: removed the unused local loss_a3 = (a3-y)/(a3*(1-a3)); it was dead
    code and could raise divide-by-zero warnings when a3 saturates at 0 or 1.
    """
    a3, z2, a2, z1 = predict(x, w1, b1, w2, b2)
    # Output layer: sigmoid + cross-entropy collapses to (a3 - y).
    loss_z2 = a3 - y
    loss_b2 = loss_z2                 # z2 = w2.a2 + b2, so dz2/db2 = 1
    loss_w2 = loss_z2.dot(a2.T)
    # Hidden layer: chain through w2, then the sigmoid derivative a2*(1-a2).
    loss_a2 = w2.T.dot(loss_z2)
    loss_z1 = loss_a2 * a2 * (1 - a2)
    loss_b1 = loss_z1
    loss_w1 = loss_z1.dot(x.T)
    return loss_w1, loss_b1, loss_w2, loss_b2
def train(x, w1, b1, w2, b2, y, alpha=0.1, to1=10, times=10000):
    """Train the network with per-sample (stochastic) gradient descent.

    x: training samples, one per row; y: matching one-hot labels, one per row.
    alpha: learning rate; times: number of full passes over the data.
    to1: unused, kept for backward compatibility (presumably a 'tol' typo).
    Prints the mean sample loss once per epoch and returns (w1, b1, w2, b2).

    BUGFIX: the original iterated the module-level globals xTrain/yTrain and
    silently ignored the x and y arguments; it now uses the parameters.
    """
    for _ in range(times):
        total_loss = 0
        for x1, y1 in zip(x, y):
            x1 = x1.reshape(-1, 1)  # column vector, as predict() expects
            y1 = y1.reshape(-1, 1)
            # Loss is measured with the pre-update weights (as in the original).
            total_loss += loss(x1, w1, b1, w2, b2, y1)
            g_w1, g_b1, g_w2, g_b2 = gradient(x1, w1, b1, w2, b2, y1)
            w1 = w1 - alpha * g_w1
            b1 = b1 - alpha * g_b1
            w2 = w2 - alpha * g_w2
            b2 = b2 - alpha * g_b2
        print(total_loss / len(x))  # mean loss for this epoch
    return w1, b1, w2, b2
# Define toy data to validate the computation; we expect the trained net's a3 to match y.
xTrain = np.array([[1, 2, 4, 3],
[2, 3, 9, 1],
[100, 200, 300, 150],
[300, 400, 250, 350]])
# y must be one-hot encoded; this is the one-hot form for a binary problem.
y = np.array([0, 0, 1, 1])
yTrain = np.array([[1, 0], # 100% class 0
[1, 0],
[0, 1], # 100% class 1
[0, 1]])
# Shapes: x(4x1) -> w1*x+b1 (3x1) -> a3 (2x1)
# w1 (3 x 4), w2 (2 x 3)
# w1 must be a 3-row, 4-column matrix
w1 = np.array([[0.1, 0.1, 0.1, 0.1],
[0.1, 0.1, 0.1, 0.1],
[0.1, 0.1, 0.1, 0.1]])
# b1 must be a 3x1 column vector
b1 = np.array([[0.2],
[0.2],
[0.2]])
# w2 must be a 2-row, 3-column matrix
w2 = np.array([[0.5, 0.5, 0.5],
[0.5, 0.5, 0.5]])
b2 = np.array([[0.1],
[0.1]])
# Train the model (inputs scaled down, presumably so the sigmoid does not saturate)
xTrain = xTrain/1000
w1, b1, w2, b2 = train(xTrain, w1, b1, w2, b2, yTrain)
# Predict on a new sample (same /1000 scaling as training)
xTest = np.array([[1, 2, 5, 9]])
xTest = xTest/1000
result, z2, a2, z1 = predict(xTest[0].reshape(-1, 1), w1, b1, w2, b2)
print(result)
print(np.argmax(result.flatten()))
最后的结果,可以看出有99%的概率属于第一类,有0.1%的概率属于第二类,所以判断预测类属于第一类。
DNN实现MNIST手写文字识别
- 数据集:
- 代码:
import numpy as np
# 手写实现固定三层的神经网络结构:input(4)->hidden(3)->output(2)
# Activation function: squash each element of z into the open interval (0, 1).
def sigmoid(z):
    """Element-wise logistic function of z."""
    return 1 / (np.exp(-z) + 1)
# Forward propagation for the fixed three-layer network.
def predict(x, w1, b1, w2, b2):
    """Compute the forward pass; return (a3, z2, a2, z1) for back-prop reuse."""
    hidden_pre = w1.dot(x) + b1       # z1
    hidden_act = sigmoid(hidden_pre)  # a2
    out_pre = w2.dot(hidden_act) + b2 # z2
    out_act = sigmoid(out_pre)        # a3
    return out_act, out_pre, hidden_act, hidden_pre
# Cross-entropy loss for one sample.
def loss(x, w1, b1, w2, b2, y):
    """Summed binary cross-entropy of the prediction against one-hot y."""
    outputs = predict(x, w1, b1, w2, b2)
    py = outputs[0]
    pos_term = -y * np.log(py)
    neg_term = -(1 - y) * np.log(1 - py)
    return np.sum(pos_term + neg_term)
# Back-propagation: compute the gradients of the loss for a single sample.
def gradient(x, w1, b1, w2, b2, y):
    """Return (dL/dw1, dL/db1, dL/dw2, dL/db2) for one training sample.

    Sigmoid output + cross-entropy loss gives dL/dz2 = a3 - y directly.

    FIX: dropped the unused local loss_a3 = (a3-y)/(a3*(1-a3)) — dead code
    that could emit divide-by-zero warnings when a3 saturates at 0 or 1.
    """
    a3, z2, a2, z1 = predict(x, w1, b1, w2, b2)
    loss_z2 = a3 - y                  # output-layer error signal
    loss_b2 = loss_z2                 # dz2/db2 = 1
    loss_w2 = loss_z2.dot(a2.T)
    loss_a2 = w2.T.dot(loss_z2)       # propagate back through w2
    loss_z1 = loss_a2 * a2 * (1 - a2) # apply sigmoid derivative element-wise
    loss_b1 = loss_z1
    loss_w1 = loss_z1.dot(x.T)
    return loss_w1, loss_b1, loss_w2, loss_b2
def train(x, w1, b1, w2, b2, y, alpha=0.1, to1=10, times=100):
    """Train by per-sample (stochastic) gradient descent.

    x: sample rows; y: one-hot label rows. alpha: learning rate;
    times: number of epochs. to1 is unused and kept only for backward
    compatibility (presumably a 'tol' typo). Prints mean loss per epoch
    and returns the updated (w1, b1, w2, b2).

    BUGFIX: the original looped over the module globals xTrain/yTrain,
    ignoring the x and y arguments; it now uses the parameters.
    """
    for _ in range(times):
        total_loss = 0
        for x1, y1 in zip(x, y):
            x1 = x1.reshape(-1, 1)  # column vector expected by predict()
            y1 = y1.reshape(-1, 1)
            # Measure loss with the pre-update weights, as the original did.
            total_loss += loss(x1, w1, b1, w2, b2, y1)
            g_w1, g_b1, g_w2, g_b2 = gradient(x1, w1, b1, w2, b2, y1)
            w1 = w1 - alpha * g_w1
            b1 = b1 - alpha * g_b1
            w2 = w2 - alpha * g_w2
            b2 = b2 - alpha * g_b2
        print(total_loss / len(x))  # mean sample loss this epoch
    return w1, b1, w2, b2
from PIL import Image
import matplotlib.pyplot as plt
import pandas as pd
# Load training data: column 0 is the digit label, the remaining 784 columns
# are the 28x28 pixel values.
df = pd.read_csv('data/digits_training.csv')
trainData = df.to_numpy()
xTrain = trainData[:, 1:]     # pixel columns
trainLabel = trainData[:, 0]  # digit labels (renamed from the misspelled 'lable')
# Uncomment to visually inspect a few samples:
# for x in xTrain:
#     plt.imshow(x.reshape(28, 28), plt.cm.gray)
#     plt.show()
# Random weight init in [-0.5, 0.5): layer sizes 784 -> 128 -> 10.
w1 = np.random.uniform(-0.5, 0.5, size=(128, 784))  # 28 * 28 = 784
b1 = np.random.uniform(-0.5, 0.5, size=(128, 1))
w2 = np.random.uniform(-0.5, 0.5, size=(10, 128))
b2 = np.random.uniform(-0.5, 0.5, size=(10, 1))
# Normalize pixels to [0, 1] and one-hot encode the training labels.
xTrain = xTrain / 255
# BUGFIX: size the one-hot matrix from the data instead of hard-coding 2000 rows.
yTrain = np.zeros(shape=(len(xTrain), 10))
for i, row in zip(trainLabel, yTrain):
    row[int(i)] = 1
w1, b1, w2, b2 = train(xTrain, w1, b1, w2, b2, yTrain)
# Evaluate accuracy on the held-out test set.
df = pd.read_csv('data/digits_testing.csv')
testData = df.to_numpy()
xTest = testData[:, 1:]
label = testData[:, 0]
xTest = xTest / 255
# BUGFIX: one-hot encode the TEST labels — the original reused the training
# labels via the misspelled 'lable' — and size the matrix from the test set.
yTest = np.zeros(shape=(len(xTest), 10))
for i, row in zip(label, yTest):
    row[int(i)] = 1
result = []
for x in xTest:
    x = x.reshape(-1, 1)
    a3, z2, a2, z1 = predict(x, w1, b1, w2, b2)
    result.append(np.argmax(a3.flatten()))
result = np.array(result)
# Accuracy = fraction of samples where predicted class equals the label.
acc = 1 - np.count_nonzero(label - result) / len(label)
print("准确率acc: %.2f%%" % (acc * 100))
- 结果: