Building a Simple Neural Network with NumPy
Model Code
Common Base Class
All of the classes below inherit from this base class and implement its methods.
from abc import ABC, abstractmethod
import numpy as np

class Module(ABC):
    """Base class for all layers: subclasses implement forward (and backward)."""
    def __init__(self) -> None:
        super(Module, self).__init__()

    @abstractmethod
    def forward(self, *args, **kwargs):
        pass

    def __call__(self, *args, **kwargs):
        # Calling a layer instance like a function dispatches to forward.
        return self.forward(*args, **kwargs)
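Since `__call__` simply dispatches to `forward`, every layer can be invoked like a function, which is what the training code later relies on. A minimal sketch (the `Double` layer here is a made-up example, not part of the model):

class Double(Module):
    def forward(self, X):
        return 2 * X

layer = Double()
print(layer(np.array([1.0, 2.0])))  # prints [2. 4.] -- __call__ dispatches to forward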
Fully Connected (Linear) Layer
Mathematical derivation:

- Forward pass:
  $Y_{n\times q}=X_{n\times p}\cdot W_{p\times q}+b_{1\times q}$

- Backward pass, writing $\delta Y=\frac{\partial L}{\partial Y}$ for the gradient flowing back from the next layer:
  $\frac{\partial L}{\partial X_{n\times p}}=\delta Y\cdot W^{T}$
  $\frac{\partial L}{\partial W_{p\times q}}=X^{T}\cdot \delta Y$
  $\frac{\partial L}{\partial b_{1\times q}}=\mathbf{1}_{1\times n}\cdot \delta Y$ (i.e. the column sums of $\delta Y$)

Code implementation:
class Linear(Module):
    def __init__(self, in_features, out_features, bias=False) -> None:
        super(Linear, self).__init__()
        self.W = np.random.normal(size=(in_features, out_features))
        self.X = None
        self.bias = None
        if bias:
            self.bias = np.random.normal(size=(out_features,))

    def forward(self, X):
        """
        X: (batch_size, in_features)
        """
        self.X = X  # cache the input for the backward pass
        Y = np.dot(X, self.W)
        if self.bias is not None:
            Y = Y + self.bias
        return Y

    def backward(self, delta_Y, lr):
        """
        delta_Y: (batch_size, out_features)
        Returns dL/dX and applies a plain SGD step to W (and bias).
        """
        delta_X = np.dot(delta_Y, self.W.transpose())               # dL/dX = delta_Y . W^T
        self.W = self.W - np.dot(self.X.transpose(), delta_Y) * lr  # dL/dW = X^T . delta_Y
        if self.bias is not None:
            # dL/db is the column sum of delta_Y; scale by lr like the W update
            self.bias = self.bias - np.sum(delta_Y, axis=0) * lr
        return delta_X
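A useful way to sanity-check the backward pass is a finite-difference test. The sketch below (my addition, not part of the original model) compares the analytic input gradient against numerical differences of the surrogate loss L = sum(Y * G); passing lr=0 keeps W and the bias unchanged during the check:

np.random.seed(0)
lin = Linear(4, 3, bias=True)
X = np.random.normal(size=(2, 4))
G = np.random.normal(size=(2, 3))     # arbitrary upstream gradient
lin(X)
analytic = lin.backward(G, lr=0.0)    # lr=0 so W and bias stay fixed

eps = 1e-6
numeric = np.zeros_like(X)
for i in range(X.shape[0]):
    for j in range(X.shape[1]):
        Xp = X.copy(); Xp[i, j] += eps
        Xm = X.copy(); Xm[i, j] -= eps
        # central difference of L = sum(forward(X) * G)
        numeric[i, j] = (np.sum(lin(Xp) * G) - np.sum(lin(Xm) * G)) / (2 * eps)
print(np.allclose(analytic, numeric, atol=1e-5))  # expect True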
Tanh Activation Layer
Mathematical derivation:

- Forward pass (elementwise):
  $Y_{n\times m}=\tanh(X_{n\times m})=\frac{e^{X_{n\times m}}-e^{-X_{n\times m}}}{e^{X_{n\times m}}+e^{-X_{n\times m}}}$

- Backward pass:
  $\frac{\partial Y}{\partial X_{n\times m}}=1-\tanh(X_{n\times m})^{2}=1-Y^{2}$

Code implementation:
class Tanh(Module):
    def __init__(self) -> None:
        super(Tanh, self).__init__()
        self.Y = None

    def forward(self, X):
        self.Y = np.tanh(X)  # cache the output: the derivative is 1 - Y^2
        return self.Y

    def backward(self, delta_Y):
        return np.multiply(1 - self.Y ** 2, delta_Y)
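As a quick illustrative check, the derivative 1 - tanh(x)^2 returned by backward can be compared against a central finite difference:

x = np.linspace(-2.0, 2.0, 5)
tanh = Tanh()
y = tanh(x)  # caches Y so backward can use 1 - Y^2
eps = 1e-6
numeric = (np.tanh(x + eps) - np.tanh(x - eps)) / (2 * eps)
print(np.allclose(tanh.backward(np.ones_like(x)), numeric))  # expect True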
Softmax Layer
Mathematical derivation:

- Forward pass (each row is normalized into a probability distribution):
  $Y_{i,j}=\frac{e^{X_{i,j}}}{\sum_{k=1}^{m} e^{X_{i,k}}}$

- Backward pass. The Jacobian of one row is
  $\frac{\partial Y_{i,k}}{\partial X_{i,j}}=\begin{cases}Y_{i,j}(1-Y_{i,j}) & k=j\\ -Y_{i,k}\,Y_{i,j} & k\neq j\end{cases}$
  so by the chain rule, with upstream gradient $\delta Y$:
  $\frac{\partial L}{\partial X_{i,j}}=\delta Y_{i,j}\,Y_{i,j}(1-Y_{i,j})+\sum_{k\neq j}\delta Y_{i,k}\,(-Y_{i,k}\,Y_{i,j})$

Code implementation:
class Softmax(Module):
    def __init__(self) -> None:
        super(Softmax, self).__init__()
        self.exps = None
        self.exps_sum = None

    def forward(self, X):
        """
        X: (batch_size, features)
        """
        C = np.max(X, axis=1, keepdims=True)  # per-row max for numerical stability
        self.exps = np.exp(X - C)
        self.exps_sum = np.sum(self.exps, axis=1).reshape((-1, 1))
        return np.divide(self.exps, self.exps_sum)

    def backward(self, delta_Y):
        """
        delta_Y: (batch_size, features)
        """
        exps_sum_square = self.exps_sum ** 2
        # +Y_j contribution (note exps * exps_sum / exps_sum^2 = Y)
        ii_matrix = np.multiply(self.exps, self.exps_sum) / exps_sum_square  # (batch_size, features)
        # -Y_k * Y_j for every pair (k, j), including k = j; together with
        # ii_matrix this yields Y_j(1 - Y_j) on the diagonal
        ij_matrix = -np.matmul(self.exps[:, :, np.newaxis], self.exps[:, np.newaxis, :]) / exps_sum_square[:, :, np.newaxis]  # (batch_size, features, features)
        ij_Y = np.multiply(delta_Y[:, :, np.newaxis], ij_matrix).sum(axis=1)  # (batch_size, features)
        return ij_Y + np.multiply(delta_Y, ii_matrix)  # (batch_size, features)
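Two quick sanity checks (illustrative additions, not part of the original): each output row should sum to 1, and backward should agree with finite differences of the surrogate loss sum(softmax(X) * G):

np.random.seed(1)
sm = Softmax()
X = np.random.normal(size=(2, 5))
G = np.random.normal(size=(2, 5))
Y = sm(X)
print(np.allclose(Y.sum(axis=1), 1.0))  # rows are probability distributions

analytic = sm.backward(G)               # uses the state cached by forward(X)
eps = 1e-6
numeric = np.zeros_like(X)
for i in range(X.shape[0]):
    for j in range(X.shape[1]):
        Xp = X.copy(); Xp[i, j] += eps
        Xm = X.copy(); Xm[i, j] -= eps
        numeric[i, j] = (np.sum(sm(Xp) * G) - np.sum(sm(Xm) * G)) / (2 * eps)
print(np.allclose(analytic, numeric, atol=1e-5))  # expect True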
Log Layer
Mathematical derivation:

- Forward pass (elementwise):
  $Y_{i,j}=\ln(X_{i,j})$

- Backward pass:
  $\frac{\partial Y}{\partial X_{i,j}}=\frac{1}{X_{i,j}}$

Code implementation:
class Log(Module):
    def __init__(self) -> None:
        super(Log, self).__init__()
        self.eps = 1e-10  # small constant to avoid log(0) and division by zero
        self.X = None

    def forward(self, X):
        self.X = X
        return np.log(X + self.eps)

    def backward(self, delta_Y):
        return np.multiply(1 / (self.X + self.eps), delta_Y)
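The small eps guards against log(0) (which is -inf) and against division by zero in backward when the softmax assigns a class exactly zero probability. For example:

log_layer = Log()
probs = np.array([[0.0, 1.0]])  # a degenerate probability row
print(log_layer(probs))         # finite thanks to eps; plain np.log(0) would give -inf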
Loss Layer (Negative Log-Likelihood)
Mathematical derivation:

- Forward pass:
  $Y=\frac{1}{n}\sum_{i=1}^{n}\sum_{j=1}^{m}-X_{i,j}\,T_{i,j}$
  where:
  - $X_{i,j}$: the predicted log-probability that sample $i$ has label $j$ (the output of the Log layer);
  - $T_{i,j}\in\{0,1\}$: the one-hot target, 1 if and only if sample $i$'s true label is $j$.

- Backward pass:
  $\frac{\partial Y}{\partial X_{i,j}}=-\frac{T_{i,j}}{n}$

Code implementation:
class NLLloss(Module):
    def __init__(self) -> None:
        super(NLLloss, self).__init__()
        self.target = None
        self.n = None

    def forward(self, Y, target):
        """
        Y: (batch_size, features) log-probabilities
        target: (batch_size,) integer class labels
        """
        self.n = len(target)
        self.target = np.zeros(shape=Y.shape)  # build the one-hot matrix T
        for i, j in enumerate(target):
            self.target[i, j] = 1
        return -np.sum(np.multiply(Y, self.target)) / self.n

    def backward(self):
        # dY/dX = -T / n, matching the derivation above
        return -self.target / self.n
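A small usage sketch chaining the three output layers together, with made-up logits and targets:

np.random.seed(2)
logits = np.random.normal(size=(2, 3))
target = np.array([0, 2])
log_probs = Log()(Softmax()(logits))  # softmax, then elementwise log
criterion = NLLloss()
print(criterion(log_probs, target))   # mean negative log-likelihood of the true labels
print(criterion.backward())           # -T / n: zero except at the target entries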
Network Architecture
Model structure
The model is a two-layer MLP: a 784 -> 1024 linear layer with Tanh activation, followed by a 1024 -> 10 linear layer, then Softmax, Log, and the NLL loss.
Model code:
# Model
linear1 = Linear(in_features=28 * 28, out_features=1024, bias=True)
tanh1 = Tanh()
linear2 = Linear(in_features=1024, out_features=10, bias=True)
softmax = Softmax()
log = Log()
nllloss = NLLloss()
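To see how shapes flow through the network, here is an illustrative forward pass on a fake batch of random data standing in for MNIST images:

X = np.random.normal(size=(4, 28 * 28))  # fake batch of 4 flattened images
h = tanh1(linear1(X))                    # (4, 1024)
probs = softmax(linear2(h))              # (4, 10), each row sums to 1
loss = nllloss(log(probs), np.array([3, 1, 4, 1]))
print(h.shape, probs.shape, loss)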
Experiments
Training code
from tqdm import tqdm

# Training hyperparameters
epochs = 30
batch_size_train = 64
batch_size_test = 1000
lr = 0.0003

# Model
linear1 = Linear(in_features=28 * 28, out_features=1024, bias=True)
tanh1 = Tanh()
linear2 = Linear(in_features=1024, out_features=10, bias=True)
softmax = Softmax()
log = Log()
nllloss = NLLloss()

for epoch in range(epochs):
    """ train """
    process_bar = tqdm(range(len(X_train) // batch_size_train), ncols=150)
    for itor in process_bar:
        X = X_train[itor * batch_size_train: (itor + 1) * batch_size_train]
        y = y_train[itor * batch_size_train: (itor + 1) * batch_size_train]

        """ Forward pass """
        tmp = X.reshape((batch_size_train, -1))
        tmp = linear1(tmp)
        tmp = tanh1(tmp)
        tmp = linear2(tmp)
        tmp = softmax(tmp)
        pre = np.argmax(tmp, axis=1)  # predicted labels for this batch
        train_acc = np.sum(y == pre)  # number of correct predictions
        train_total = len(y)
        tmp = log(tmp)
        loss = nllloss(tmp, y)

        """ Backward pass: propagate gradients through the layers in reverse order """
        Y = nllloss.backward()
        Y = log.backward(Y)
        Y = softmax.backward(Y)
        Y = linear2.backward(Y, lr)
        Y = tanh1.backward(Y)
        Y = linear1.backward(Y, lr)

        process_bar.set_description('Train epoch:{} '.format(epoch + 1))
        process_bar.set_postfix_str('loss: {:.4f} Acc:{:.2f}%'.format(
            loss, 100. * train_acc / train_total))

    """ test """
    test_total = 0
    test_acc = 0
    test_process_bar = tqdm(range(len(X_test) // batch_size_test), ncols=150)
    for itor in test_process_bar:
        X = X_test[itor * batch_size_test: (itor + 1) * batch_size_test]
        y = y_test[itor * batch_size_test: (itor + 1) * batch_size_test]

        tmp = X.reshape((batch_size_test, -1))
        tmp = linear1(tmp)
        tmp = tanh1(tmp)
        tmp = linear2(tmp)
        Y = softmax(tmp)
        Y = np.argmax(Y, axis=1)
        test_total += len(y)
        test_acc += np.sum(y == Y)
        test_process_bar.set_description('Test epoch:{} '.format(epoch + 1))
        test_process_bar.set_postfix_str('Acc [{}/{} ({:.2f}%)]'.format(
            test_acc, test_total, 100. * test_acc / test_total))
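The loop above assumes X_train, y_train, X_test, and y_test already hold the MNIST images and integer labels. One possible way to obtain them, assuming scikit-learn is available (any MNIST loader would do):

from sklearn.datasets import fetch_openml
import numpy as np

# Download MNIST as plain arrays: 70,000 flattened 28x28 images
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X_all = mnist.data.astype(np.float32) / 255.0  # scale pixels to [0, 1]
y_all = mnist.target.astype(np.int64)
X_train, y_train = X_all[:60000], y_all[:60000]  # standard 60k/10k split
X_test, y_test = X_all[60000:], y_all[60000:]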