import numpy as np
import torchvision
from torch.utils.data import DataLoader
class BP():
    def __init__(self, nodes_num):
        self.nodes_num = nodes_num          # number of nodes in each layer
        self.layout_nums = len(nodes_num)   # number of layers
        # weights[i] has shape (n_out, n_in); bias[i] has shape (1, n_out)
        self.weights = [np.random.randn(n, m) for (m, n) in zip(self.nodes_num[:-1], self.nodes_num[1:])]
        self.bias = [np.random.randn(1, n) for n in self.nodes_num[1:]]
        self.before_activation = []         # cached pre-activations z, one entry per layer
        self.after_activation = []          # cached activations a = sigmoid(z), one entry per layer
        self.loss_func = "MSE"
        self.eta = 3                        # learning rate

    def sigmoid(self, x):
        return 1.0 / (1.0 + np.exp(-x))

    def sigmoid_grad(self, x):
        # derivative of the sigmoid: sigmoid(x) * (1 - sigmoid(x))
        return self.sigmoid(x) * (1.0 - self.sigmoid(x))
    def forward(self, x):
        # Forward pass: cache the pre-activations (z) and activations (a)
        # for backprop. The input x itself is stored as the layer-0 "activation".
        value = x
        self.before_activation = [x]
        self.after_activation = [x]
        for (weight, bias) in zip(self.weights, self.bias):
            z = np.matmul(weight, value.T).T + bias
            value = self.sigmoid(z)
            self.before_activation.append(z)
            self.after_activation.append(value)
        return value
    def backwards(self, y):
        # Backward pass for one sample; y is the ground-truth one-hot label.
        # forward() must have been called first so the caches are populated.
        delta_new_weight = [np.zeros(w.shape) for w in self.weights]
        delta_new_bias = [np.zeros(b.shape) for b in self.bias]
        if self.loss_func == "MSE":
            # Output-layer error: dL/dz = -(y - a) * sigmoid'(z)
            delta = -(y - self.after_activation[-1]) * self.sigmoid_grad(self.before_activation[-1])
            # delta = -(y - self.after_activation[-1])  --> if the last layer were left unactivated
        # Gradient for the last layer's weights
        delta_new_weight[-1] = np.matmul(delta.T, self.after_activation[-2])
        delta_new_bias[-1] = delta
        for i in range(2, self.layout_nums):
            # Careful with the indices here (printing the shapes helps):
            # the activation caches have one more element than self.weights.
            delta = np.matmul(self.weights[-i + 1].T, delta.T).T * self.sigmoid_grad(self.before_activation[-i])
            delta_new_weight[-i] = np.matmul(delta.T, self.after_activation[-i - 1])
            delta_new_bias[-i] = delta
        return delta_new_weight, delta_new_bias
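As a quick sanity check (a sketch, not part of the original script), the analytic gradients from backwards() can be compared against a central finite difference of the MSE loss on a tiny random network; mse_loss below is a hypothetical helper added just for this check:

# Sketch: numerical gradient check for backwards(), assuming the BP class above.
np.random.seed(0)
net = BP([4, 3, 2])                   # tiny network so the check is cheap
x = np.random.randn(1, 4)
y = np.array([[1.0, 0.0]])

def mse_loss(net, x, y):              # hypothetical helper: 0.5 * ||y - output||^2
    out = net.forward(x)
    return 0.5 * np.sum((y - out) ** 2)

net.forward(x)                        # populate the cached activations
grad_w, grad_b = net.backwards(y)     # analytic gradients

eps = 1e-6
i, j = 1, 2                           # probe one arbitrary weight entry
net.weights[0][i, j] += eps
loss_plus = mse_loss(net, x, y)
net.weights[0][i, j] -= 2 * eps
loss_minus = mse_loss(net, x, y)
net.weights[0][i, j] += eps           # restore the weight
numeric = (loss_plus - loss_minus) / (2 * eps)
print(numeric, grad_w[0][i, j])       # the two numbers should agree closely

If the two printed values match to several decimal places, the index bookkeeping in backwards() is consistent with the forward pass.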
Load the data with torchvision:
myBP = BP([784,30,10])
dataset_train = torchvision.datasets.MNIST(root="./MNIST",train=True,transform=torchvision.transforms.ToTensor(),download=True)
dataset_test = torchvision.datasets.MNIST(root="./MNIST",train=False,transform=torchvision.transforms.ToTensor(),download=True)
def one_hot_label(label):
    # Convert a class index into a 1x10 one-hot row vector
    mlabel = np.array([[0.0] * 10])
    mlabel[0][label] = 1.0
    return mlabel
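For example (illustration only), label 3 becomes a row vector with a single 1.0:

print(one_hot_label(3))   # [[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]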
train_loader = DataLoader(dataset=dataset_train,batch_size=10,shuffle=True,num_workers=0,drop_last=False)
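A quick way to see what the loader yields (a sketch, not in the original script): with batch_size=10 and ToTensor, each minibatch is a pair of a 10x1x28x28 float image tensor and a length-10 label tensor.

imgs, labels = next(iter(train_loader))   # peek at one minibatch
print(imgs.shape, labels.shape)           # torch.Size([10, 1, 28, 28]) torch.Size([10])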
train_epochs = 30
for epoch in range(train_epochs):
    for minibatch in train_loader:
        batch_delta_new_weight = [np.zeros(w.shape) for w in myBP.weights]
        batch_delta_new_bias = [np.zeros(b.shape) for b in myBP.bias]
        imgs, labels = minibatch
        for i in range(len(imgs)):
            label = one_hot_label(labels[i])
            img = np.array(imgs[i]).reshape(1, 784)   # flatten 1x28x28 to 1x784
            predict_label = myBP.forward(img)          # forward pass
            delta_new_weight, delta_new_bias = myBP.backwards(label)  # backward pass
            # Accumulate the per-sample gradients over the minibatch
            batch_delta_new_weight = [total + item for total, item in zip(batch_delta_new_weight, delta_new_weight)]
            batch_delta_new_bias = [total + item for total, item in zip(batch_delta_new_bias, delta_new_bias)]
        # Minibatch gradient update: step by the batch-averaged gradient
        myBP.weights = [w - (myBP.eta / len(imgs)) * batch_new_weight for w, batch_new_weight in zip(myBP.weights, batch_delta_new_weight)]
        myBP.bias = [b - (myBP.eta / len(imgs)) * batch_new_bias for b, batch_new_bias in zip(myBP.bias, batch_delta_new_bias)]
    # Evaluate on the test set after each epoch
    correct = 0
    for i in range(len(dataset_test)):
        img, label = dataset_test[i]
        img = np.array(img).reshape(1, 784)
        predict_label = np.argmax(myBP.forward(img)[0])
        if predict_label == label:
            correct += 1
    print("Epoch {}: accuracy {}%".format(epoch, correct / len(dataset_test) * 100))
This exercise introduces the minibatch concept: the weights are updated once per batch of batch_size samples, and the applied gradient is the average gradient over that batch of training data.
With batch_size = 1 (pure stochastic gradient descent), each update is cheap and training moves quickly, but the noisy per-sample gradients may keep the network from converging to the optimum.
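In symbols, for a minibatch of $m$ samples the update applied in the loop above is

$$w \leftarrow w - \frac{\eta}{m}\sum_{k=1}^{m}\nabla_w L^{(k)},\qquad b \leftarrow b - \frac{\eta}{m}\sum_{k=1}^{m}\nabla_b L^{(k)}$$

where $\eta$ is myBP.eta and $\nabla L^{(k)}$ is the gradient returned by the $k$-th sample's backward pass.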
The loss function here is not cross-entropy. Instead, the ten output nodes of the final fully connected layer are passed through the sigmoid, so each output lands in (0, 1); the true label is converted to a one-hot encoding (a vector of 10 elements), and the ten outputs are fitted elementwise against it.
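Concretely, with one-hot target $y$, pre-activation $z$, and output $a = \sigma(z)$, the per-sample loss and the output-layer error used in backwards() are

$$L = \frac{1}{2}\lVert y - a\rVert^2,\qquad \delta = \frac{\partial L}{\partial z} = -(y - a)\,\sigma'(z),\qquad \sigma'(z) = \sigma(z)\bigl(1-\sigma(z)\bigr)$$

which is exactly the delta computed at the top of the backward pass.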
Results:
Performance of the model on the test set:
Using only fully connected layers, handwritten digit recognition accuracy tops out at around 94%.