[Daily] A hand-written convolutional neural network (conv + relu + maxpooling + linear + relu + linear + softmax + cross-entropy loss + regularization)

Second assignment of the course: hand-write a CNN image classifier (trained here on CIFAR-10). Word is we will also have to hand-write an RNN next (what a treat).

The trap in this assignment is that the architecture and default hyperparameters given in the handout perform extremely poorly, and no amount of tuning fixes it. At first I assumed I had a bug somewhere, but after checking for a long time I couldn't find one. I then reproduced the same architecture in Keras and it really is astonishingly bad (no better than random guessing). Almost any small change helps: adding another convolutional layer or a dense layer, or even just switching the optimizer (plain SGD does far worse here than Adagrad or Adam) already gives reasonable results. After finishing this assignment I am convinced that neural networks are equal parts science and black magic.

The code is organized as follows:

../
	train_small.py														 # main training script
	test_grad.py														 # checks that the forward/backward passes in layers.py are correct
	dataset.py															 # data loading
	check_gradient.py													 # helper functions used by test_grad.py
../nn/
		cnn.py															 # the CNN architecture
		layers.py														 # layer classes
		loss.py															 # loss function
		optimizer.py													 # optimizer
		utils.py														 # miscellaneous helpers
../cifar-10-batches-py
		# data files, downloaded as follows
		# wget http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
		# tar -xzvf cifar-10-python.tar.gz
		# rm cifar-10-python.tar.gz 

The code is below; help yourself:

train_small.py

#-*- coding:UTF-8 -*-
import os
import sys
sys.path.append(os.path.join(os.getcwd(),"nn"))							 # put nn/ on the path so the modules inside it can import each other
import numpy as np
from nn.cnn import CNN
from nn.optimizer import SGD
from nn.utils import accuracy
from dataset import get_cifar10_data
from matplotlib import pyplot as plt

def train(model,X_train,y_train,X_val,y_val,batch_size,n_epochs,
	lr=1e-2,
	lr_decay=0.8,
	momentum=0.0,
	wd=0.0,
	verbose=True,
	print_level=1,
):																		 # model training
	print("Start training...")
	n_train = X_train.shape[0]											 # number of training samples
	iterations_per_epoch = max(n_train//batch_size,1)					 # iterations per epoch
	n_iterations = n_epochs*iterations_per_epoch						 # total number of iterations
	loss_hist = []														 # stores the loss of every iteration
	opt_params = {"lr":lr,"weight_decay":wd,"momentum":momentum}		 # optimizer hyperparameters (initial learning rate etc.)
	print("Training samples: {}".format(n_train))
	print("Iterations per epoch: {}".format(iterations_per_epoch))
	print("Epochs: {}".format(n_epochs))
	print("Total iterations: {}".format(n_iterations))
	count = 0
	for epoch in range(n_epochs):										 # loop over epochs
		for t in range(iterations_per_epoch):							 # loop over the iterations within this epoch
			count += 1
			batch_mask = np.random.choice(n_train,batch_size)			 # sample batch_size indices from the training set
			X_batch = X_train[batch_mask]								 # sampled inputs
			y_batch = y_train[batch_mask]								 # sampled labels
			loss,score = model.oracle(X_batch,y_batch)					 # evaluate loss and gradients
			loss_hist.append(loss)										 # record the loss
			sgd = SGD(model.param_groups,**opt_params)					 # build the optimizer (re-created so the decayed lr takes effect)
			sgd.step()													 # take one SGD step
			if verbose and t%print_level==0:							 # report the training loss
				train_acc = accuracy(score,y_batch)						 # batch accuracy
				print("(Iteration {}/{},epoch {})loss:{},accu:{}".format(
					count,n_iterations,epoch,loss_hist[-1],train_acc))
			if t==iterations_per_epoch-1: opt_params["lr"] *= lr_decay

	""" plot the training curve """
	plt.close()
	plt.figure()
	plt.plot(loss_hist,label="training loss")
	plt.legend(loc="best")
	plt.show()

if __name__ == "__main__":
	model = CNN()														 # initialise the convolutional neural network
	data = get_cifar10_data()											 # load the data
	num_train = 100														 # number of training samples to use
	data = {															 # repack data as a dict
		"X_train": data["X_train"][:num_train],
		"y_train": data["y_train"][:num_train],
		"X_val": data["X_val"],
		"y_val": data["y_val"],
	}
	X_train,y_train,X_val,y_val = data["X_train"],data["y_train"],data["X_val"],data["y_val"]
	train(model,X_train,y_train,X_val,y_val,50,50)
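
A small note on the loop above: a fresh SGD object is built on every iteration. That works because the momentum buffer ("pregrad") lives inside model.param_groups rather than inside the optimizer, but it is enough to rebuild it once per epoch, when the learning rate actually changes. A minimal sketch of the same inner loop with that rearrangement (my own refactor, using the same CNN/SGD interfaces as above):

# Same inner loop, with the optimizer rebuilt once per epoch instead of per iteration.
for epoch in range(n_epochs):
	sgd = SGD(model.param_groups, **opt_params)					# built with this epoch's learning rate
	for t in range(iterations_per_epoch):
		batch_mask = np.random.choice(n_train, batch_size)
		loss, score = model.oracle(X_train[batch_mask], y_train[batch_mask])
		loss_hist.append(loss)
		sgd.step()													# parameters are updated in place
	opt_params["lr"] *= lr_decay									# decay once per epoch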

test_grad.py

#-*- coding:UTF-8 -*-
import os
import sys
sys.path.append(os.path.join(os.getcwd(),"nn"))							 # put nn/ on the path so the modules inside it can import each other
import numpy as np
from nn.cnn import CNN
from check_gradient import *
from nn.layers import Conv,MaxPool,Linear,Relu

def rel_error(x,y):														 # 计算相对误差
	return np.nanmax(np.abs(x-y)/(np.maximum(1e-8,np.abs(x)+np.abs(y))))

""" relu激活层就不管了, 反正错了也不是我的锅 """
check_conv_forward = True
#check_conv_forward = False
check_conv_backward = True
#check_conv_backward = False
check_linear_forward = True
#check_linear_forward = False
check_linear_backward = True
#check_linear_backward = False
check_pool_forward = True
#check_pool_forward = False
check_pool_backward = True
#check_pool_backward = False

if check_conv_forward:													 # 检查卷积层前向传播是否正确
	x_shape = (2,3,4,4)													 # 2个样本, 3个轨道, 4×4像素
	w_shape = (3,3,4,4)													 # 3个输入轨道, 3个输出轨道, 4×4卷积核
	x = np.linspace(-0.1,0.5,num=np.prod(x_shape)).reshape(x_shape)		 # 2×3×4×4的样本
	w = np.linspace(-0.2,0.3,num=np.prod(w_shape)).reshape(w_shape)		 # 3×3×4×4的卷积过滤器
	b = np.linspace(-0.1,0.2,num=3)										 # 3个偏差
	conv = Conv(4,4,3,3,2,1)											 # 卷积层: 4×4卷积核, 3个输入轨道, 3个输出轨道, 步长2, padding值为1
	conv.params["w"]["param"] = w										 # 设置卷积过滤器
	conv.params["b"]["param"] = b										 # 设置偏差值
	out = conv(x)														 # Layer类是可以被调用的, 返回前向传播的结果
	correct_out = np.array([											 # 事先计算好的正确输出维度为: 2×3×2×2
		[
			[[-0.08759809,-0.10987781],[-0.18387192,-0.2109216]],
			[[0.21027089,0.21661097],[0.22847626,0.23004637]],
			[[0.50813986,0.54309974],[0.64082444,0.67101435]]
		],
		[
			[[-0.98053589,-1.03143541],[-1.19128892,-1.24695841]],
			[[0.69108355,0.66880383],[0.59480972,0.56776003]],
			[[2.36270298,2.36904306],[2.38090835,2.38247847]]
		]
	])
	print("Testing convolutional forward...")
	print("difference: {}".format(rel_error(out,correct_out)))

if check_conv_backward:													 # 检查卷积层反向传播是否正确
	np.random.seed(231)													 # 初始化随机种子
	x = np.random.randn(2,3,16,16)										 # 2个样本, 3个轨道, 16×16像素
	w = np.random.randn(3,3,3,3)										 # 3个输入轨道, 3个输出轨道, 3×3的卷积核
	b = np.random.randn(3,)												 # 3个偏差
	dout = np.random.randn(2,3,14,14)									 # 随机给定一个输出dout: 这个应该是被假设为下一层反向传回当层的输入梯度值
	conv = Conv(3,3,3,3,1,0)											 # 初始化一个3个输入轨道, 3个输出轨道, 3×3的卷积核, 步长为1且不padding
	conv.params["w"]["param"] = w										 # 设置卷积过滤器
	conv.params["b"]["param"] = b										 # 设置偏差
	out = conv(x)														 # 计算卷积层在输入x后的输出结果
	dx = conv.backward(dout,x)											 # 计算对应dout输入的反向传播输出
	dx_num = eval_numerical_gradient_array(conv,x,dout)					 # dx_num维度与x,dx完全相同
	params = conv.params												 # 获取卷积层的参数

	def fw(v):															 # 计算用v过滤器后的输出结果
		tmp = params["w"]["param"]								
		params["w"]["param"] = v							
		f_w = conv(x)										
		params["w"]["param"] = tmp							
		return f_w								

	def fb(v):															 # 计算用v偏差后的输出结果
		tmp = params["b"]["param"]
		params["b"]["param"] = v
		f_b = conv(x)
		params["b"]["param"] = tmp
		return f_b

	dw = params["w"]["grad"]											 # 卷积过滤器的梯度
	dw_num = eval_numerical_gradient_array(fw,w,dout)					 # dw_num维度与w,dw完全相同
	db = params["b"]["grad"]											 # db_num维度与b,db完全相同
	db_num = eval_numerical_gradient_array(fb,b,dout)
	
	print("Testing convolutional backward")
	print("dx error: {}".format(rel_error(dx_num,dx)))
	print("dw error: {}".format(rel_error(dw_num,dw)))
	print("db error: {}".format(rel_error(db_num,db)))

if check_linear_forward:												 # 检查线性层前向传播是否正确
	x_shape = (2,3,4,4)													 # 2个样本, 3个轨道, 4×4像素
	w_shape = (3*4*4,64)												 # 从48维映射到64维
	b_shape = (1,64)
	x = np.linspace(-0.1,0.5,num=np.prod(x_shape)).reshape(x_shape)
	w = np.linspace(-0.2,0.3,num=np.prod(w_shape)).reshape(w_shape)	
	b = np.linspace(-0.1,0.2,num=64).reshape(b_shape)					 # 64个偏差
	linear = Linear(3*4*4,64)
	linear.params["w"]["param"] = w										 # 设置卷积过滤器
	linear.params["b"]["param"] = b										 # 设置偏差值
	out = linear(x)														 # Layer类是可以被调用的, 返回前向传播的结果

	correct_out = np.dot(x.reshape(2,48),w)+b							 # 单纯的全连接层
	print("Testing linear forward...")
	print("difference: {}".format(rel_error(out,correct_out)))		

if check_linear_backward:												 # check the linear layer backward pass
	np.random.seed(231)													 # fix the random seed
	x = np.random.randn(2,3,4,4)										 # 2 samples, 3 channels, 4×4 pixels
	w = np.random.randn(3*4*4,64)										 # weight matrix mapping 48 features to 64
	b = np.random.randn(1,64)											 # 64 biases
	dout = np.random.randn(2,64)										 # upstream gradient assumed to flow back from the next layer
	linear = Linear(3*4*4,64)											 # linear layer: 48 -> 64
	linear.params["w"]["param"] = w										 # set the weight matrix
	linear.params["b"]["param"] = b										 # set the biases
	out = linear(x)														 # forward pass on x
	dx = linear.backward(dout,x)										 # backward pass for the given dout
	dx_num = eval_numerical_gradient_array(linear,x,dout)				 # dx_num has exactly the same shape as x
	dx_num = dx_num.reshape(dx_num.shape[0],-1)							 # flatten dx_num to a 2-D array to match dx
	params = linear.params												 # the linear layer's parameters

	def fw(v):															 # 计算用v过滤器后的输出结果
		tmp = params["w"]["param"]								
		params["w"]["param"] = v							
		f_w = linear(x)										
		params["w"]["param"] = tmp							
		return f_w								

	def fb(v):															 # 计算用v偏差后的输出结果
		tmp = params["b"]["param"]
		params["b"]["param"] = v
		f_b = linear(x)
		params["b"]["param"] = tmp
		return f_b

	dw = params["w"]["grad"]											 # analytic gradient of the weight matrix
	dw_num = eval_numerical_gradient_array(fw,w,dout)					 # dw_num has the same shape as w and dw
	db = params["b"]["grad"]											 # analytic gradient of the bias
	db_num = eval_numerical_gradient_array(fb,b,dout)					 # db_num has the same shape as b and db
	
	print("Testing linear backward")
	print("dx error: {}".format(rel_error(dx_num,dx)))
	print("dw error: {}".format(rel_error(dw_num,dw)))
	print("db error: {}".format(rel_error(db_num,db)))

if check_pool_forward:													 # 检查池化层前向传播是否正确
	x_shape = (2,3,4,4)													 # 2个样本, 3个轨道, 4×4像素
	x = np.linspace(-0.1,0.5,num=np.prod(x_shape)).reshape(x_shape)		 # 2×3×4×4的样本
	pool = MaxPool(kernel_size=2,stride=2,padding=0)					 # 2×2的池化层, 步长2且不padding
	out = pool(x)
	out_shape = (2,3,2,2)
	correct_out = np.zeros(out_shape)
	for i in range(out_shape[0]):
		for j in range(out_shape[1]):
			for k in range(out_shape[2]):
				for l in range(out_shape[3]):
					correct_out[i,j,k,l] = x[i,j,2*k+1,2*l+1]			 # 因为是按顺序排列的, 每个窗口的右下角恰好为最大值
	print("Testing pooling forward...")
	print("difference: {}".format(rel_error(out,correct_out)))	

if check_pool_backward:													 # 检查池化层反向传播是否正确
	np.random.seed(231)													 # 初始化随机种子
	x = np.random.randn(3,2,8,8)										 # 随机给定一个输入x
	dout = np.random.randn(3,2,4,4)										 # 随机给定一个输出dout: 这个应该是被假设为下一层反向传回当层的输入梯度值
	pool = MaxPool(kernel_size=2,stride=2,padding=0)					 # 初始化一个2×2的池化核, 不padding且步长为2, 其输出恰好为8×8-->4×4
	out = pool(x)														 # 得出一个对应x的前向输出
	dx = pool.backward(dout,x)											 # 得出一个对应dout的反向输出的梯度
	dx_num = eval_numerical_gradient_array(pool,x,dout)					 # 调用手动计算梯度的函数: dx_num的维度与x的维度完全相同(3,2,8,8)
	print("Testing pooling backward:")
	print("dx error: ",rel_error(dx,dx_num))							 # 你的误差应该在1e-12左右


check_gradient.py

#-*- coding:UTF-8 -*-
import sys
import numpy as np
from random import randrange

if sys.version_info>=(3,0):												 # 判断python版本, 对于python3.x需要重写xrange生成器
	def xrange(*args,**kwargs):
		return iter(range(*args,**kwargs))

def eval_numerical_gradient(f,x,
	verbose=True,
	h=1e-5
):																		 # 求函数f在点x处的梯度的一个非常幼稚的实施方案: f为只有一个参数的函数, x为需要求梯度的点或者是数组
	fx = f(x)															 # 求函数在给定点的函数值
	grad = np.zeros_like(x)												 # 预设梯度为与x形状相同的玩意儿
	it = np.nditer(x,flags=["multi_index"],op_flags=["readwrite"])		 # 把x做成numpy生成器		 
	while not it.finished:												 # 遍历x中的所有元素, 一个个求偏导
		ix = it.multi_index												 # 获取生成器index
		oldval = x[ix]													 # 获取对应index的数值
		x[ix] = oldval+h 												 # 右移一段
		fxph = f(x)														 # 计算右函数值
		x[ix] = oldval-h												 # 左移一段
		fxmh = f(x)														 # 计算左函数值
		x[ix] = oldval													 # 还原该index上原先的值
		grad[ix] = (fxph-fxmh)/(2*h)									 # 计算偏导数 
		if verbose: print(ix,grad[ix])									 # 输出梯度
		it.iternext()													 # 步入下一次
	return grad

def eval_numerical_gradient_array(f,x,df,
	h=1e-5
):																		 # 对于一个接收了一个数组并返回数组的函数来计算数值的梯度
	grad = np.zeros_like(x)												 # 这里的输入x的维度为N_samples×n_Channels×Height×Width
	it = np.nditer(x,flags=["multi_index"],op_flags=["readwrite"])		 # 把x做成一个迭代生成器
	while not it.finished:
		ix = it.multi_index												 # 获取生成器index
		oldval = x[ix]													 # 获取对应index的数值
		x[ix] = oldval+h												 # 右移一段
		pos = f(x).copy()												 # 计算右函数值
		x[ix] = oldval-h												 # 左移一段
		neg = f(x).copy()												 # 计算左函数值
		x[ix] = oldval													 # 还原该index上原先的值
		grad[ix] = np.sum((pos-neg)*df)/(2*h)							 # chain rule: dot the finite difference of the output with the upstream gradient df
		it.iternext()
	return grad

def eval_numerical_gradient_blobs(f,inputs,output,
	h=1e-5
):																		 # 对一个操作输入与输出斑点的函数计算数值梯度: f为函数(f接收几个输入斑点作为参数,然后跟随一个斑点用于写入输出==>y=f(x,w,out),x与w为输入斑点,f的结果将被写入out),inputs为输入的斑点,output为输出斑点,h为步长
	numeric_diffs = []
	for input_blob in inputs:
		diff = np.zeros_like(input_blob.diffs)
		it = np.nditer(
			input_blob.vals,
			flags=["multi_index"],
			op_flags=["readwrite"]
		)
		while not it.finished:
			idx = it.multi_index
			orig = input_blob.vals[idx]
			input_blob.vals[idx] = orig+h
			f(*(inputs + (output,)))
			pos = np.copy(output.vals)
			input_blob.vals[idx] = orig-h
			f(*(inputs + (output,)))
			neg = np.copy(output.vals)
			input_blob.vals[idx] = orig
			diff[idx] = np.sum((pos-neg)*output.diffs)/(2.0*h)
			it.iternext()
		numeric_diffs.append(diff)
	return numeric_diffs

def eval_numerical_gradient_net(net,inputs,output,
	h=1e-5
):
	result = eval_numerical_gradient_blobs(
		lambda *args: net.forward(),inputs,output,h=h
	)
	return result

def grad_check_sparse(f,x,analytic_grad,
	num_checks=10,
	h=1e-5
):																		 # spot-check the analytic gradient at a few randomly sampled positions
	for i in range(num_checks):											 # check num_checks points
		ix = tuple([randrange(m) for m in x.shape])						 # pick a random position
		oldval = x[ix]													 # value at that index
		x[ix] = oldval+h 												 # shift right
		fxph = f(x)														 # evaluate f at x+h
		x[ix] = oldval-h												 # shift left
		fxmh = f(x)														 # evaluate f at x-h
		x[ix] = oldval													 # restore the original value
		grad_numerical = (fxph-fxmh)/(2*h)								 # numerical partial derivative
		grad_analytic = analytic_grad[ix]
		error = abs(grad_numerical-grad_analytic)
		total = abs(grad_numerical)+abs(grad_analytic)
		rel_error = error/total
		print("numerical: %f analytic: %f,relative error: %e" % (
			grad_numerical,
			grad_analytic,
			rel_error
		))
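
The np.sum((pos-neg)*df) line in eval_numerical_gradient_array is just the chain rule: with df = dL/dout, dotting the finite difference of the output with df approximates dL/dx at that index. A tiny self-contained check of this on a toy function (my own example, not part of the assignment code):

# Toy check of eval_numerical_gradient_array: for f(x) = x**2 (elementwise)
# and upstream gradient df, the chain rule gives dL/dx = 2*x*df exactly.
import numpy as np
from check_gradient import eval_numerical_gradient_array

np.random.seed(0)
x = np.random.randn(3,4)
df = np.random.randn(3,4)												 # pretend upstream gradient dL/dout
dx_num = eval_numerical_gradient_array(lambda v: v**2, x, df)
print(np.max(np.abs(dx_num - 2*x*df)))									 # should be around 1e-10 or smaller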

dataset.py

#-*- coding:UTF-8 -*-
import os
import pickle
import numpy as np

def load_cifar_batch(filename):											 # 导入cifar-10数据集的一个batch
	with open(filename,"rb") as f:
		datadict = pickle.load(f,encoding="latin1")	
		X = datadict["data"]											 # 获取输入字段数据
		Y = datadict["labels"]											 # 获取输出标签数据
		X = X.reshape(10000,3,32,32).transpose(0,2,3,1).astype("float")	 # 每个batch里面都是10000个样本, 改变维度顺序把RGB轨道调到最后
		Y = np.array(Y)
		return X,Y

def load_cifar10():														 # 导入全部cifar-10数据集: 5个训练集batch与1个测试集test_batch
	xs = []
	ys = []
	for b in range(1,6):
		f = os.path.join("cifar-10-batches-py","data_batch_%d" % (b,))
		X,Y = load_cifar_batch(f)
		xs.append(X)
		ys.append(Y)
	Xtr = np.concatenate(xs)											 # 把5个分文件的输入字段拼接
	Ytr = np.concatenate(ys)											 # 把5个分文件的输出标签拼接
	del X,Y
	Xte,Yte = load_cifar_batch(
		os.path.join("cifar-10-batches-py","test_batch")
	)
	"""
		Xtr: 训练集(50000,32,32,3)
		Ytr: 训练集(50000,)
		Xte: 训练集(10000,32,32,3)
		Yte: 训练集(10000,)
	"""
	return Xtr,Ytr,Xte,Yte

def get_cifar10_data(
	n_train=49000,
	n_val=1000,
	n_test=10000,
	subtract_mean=True
):																		 # 从磁盘导入cifar-10数据集并进行预处理, 这与SVM的预处理操作相同, 我们将它打包为一个函数
	X_train,y_train,X_test,y_test = load_cifar10()						 # 导入全部数据
	mask = list(range(n_train,n_train + n_val))							 # 用于划分验证集
	X_val = X_train[mask]
	y_val = y_train[mask]
	mask = list(range(n_train))											 # 用于划分训练集
	X_train = X_train[mask]
	y_train = y_train[mask]
	mask = list(range(n_test))											 # 用于划分测试集
	X_test = X_test[mask]
	y_test = y_test[mask]
	if subtract_mean:													 # 数据标准化, 减去平均图片
		mean_image = np.mean(X_train,axis=0)
		X_train -= mean_image
		X_val -= mean_image
		X_test -= mean_image
	""" 调整输入特征维度的顺序从而使得channel处于最前面的位置 """
	X_train = X_train.transpose(0,3,1,2).copy()
	X_val = X_val.transpose(0,3,1,2).copy()
	X_test = X_test.transpose(0,3,1,2).copy()
	return {															 # 将数据打包为一个字典
	  "X_train": X_train,
	  "y_train": y_train,
	  "X_val": X_val,
	  "y_val": y_val,
	  "X_test": X_test,
	  "y_test": y_test,
	}

if __name__ == "__main__":
	Xtr,ytr,Xte,yte = load_cifar10()
	print(Xtr.shape)
	print(ytr.shape)
	print(Xte.shape)
	print(yte.shape)
	print(set(ytr.tolist()))
	data = get_cifar10_data()
	for key,value in data.items():
		print(key,value.shape)

nn/cnn.py

#-*- coding:UTF-8 -*-
import math
import time
import numpy as np
from loss import SoftmaxCE,softmax
from layers import Conv,Relu,MaxPool,Linear

class CNN(object):														 # CNN architecture: conv + relu + maxpool + linear + relu + linear + softmax
	def __init__(self,
		image_size=(3,32,32),
		channels=3,
		conv_kernel=7,
		pool_kernel=2,
		hidden_units=100,
		n_classes=10,
	):																	 # 构造函数: 初始化神经网络,定义网络层
		""" 类构造参数 """												
		self.image_size = image_size									 # 图片形状3×H×W
		self.channels = channels										 # 卷积层的轨道数
		self.conv_kernel = conv_kernel									 # 卷积层核维度
		self.pool_kernel = pool_kernel									 # 池化层核维度
		self.hidden_units = hidden_units								 # 线性传播中隐层单元数量
		self.n_classes = n_classes										 # number of output classes
		""" 类常用参数 """
		channel,height,width = self.image_size							 # 这三个变量将记录卷积部分的输入轨道与维度
		self.conv_stride = 1											 # 卷积核移动步长
		self.conv_padding = 0											 # 卷积层对输入padding数量
		self.pool_stride = 2											 # 池化层窗口移动步长
		self.pool_padding = 0											 # 池化层对输入padding数量
		self.conv = Conv(												 # 卷积层: 3×32×32-->3×(32-7+1)×(32-7+1)-->3×26×26
			height=self.conv_kernel,
			width=self.conv_kernel,
			in_channels=self.image_size[0],
			out_channels=self.channels,
			stride=self.conv_stride,
			padding=self.conv_padding,
			init_scale=1e-2,
		)
		""" 经过卷积层后轨道与维度的变化 """
		channel = self.channels
		height += (2*self.conv_padding-self.conv_kernel)
		height /= self.conv_stride
		height = int(height)+1
		width += (2*self.conv_padding-self.conv_kernel)
		width /= self.conv_stride
		width = int(width)+1										
		self.relu1 = Relu()												 # 激活层 A: 3×26×26-->3×26×26
		self.pool = MaxPool(											 # 池化层: 3×26×26-->3×13×13
			kernel_size=self.pool_kernel,
			stride=self.pool_stride,
			padding=self.pool_padding,
		)
		""" 经过池化层后轨道与维度的变化 """
		channel = channel
		height += (2*self.pool_padding-self.pool_kernel)
		height /= self.pool_stride
		height = int(height)+1
		width += (2*self.pool_padding-self.pool_kernel)
		width /= self.pool_stride
		width = int(width)+1
		self.linear1 = Linear(											 # 线性层 A: 3×13×13-->507-->100
			in_features=channel*height*width,
			out_features=self.hidden_units,
			init_scale=1e-2,
		)											
		self.relu2 = Relu()												 # 激活层 B: 100-->100 							
		self.linear2 = Linear(											 # linear layer B: 100-->10
			in_features=self.hidden_units,
			out_features=self.n_classes,								 # the last layer outputs one score per class
			init_scale=1e-2,
		)
		""" 类初始化 """
		self.softmaxce = SoftmaxCE()
		self.param_groups = [											 # 卷积层与线性层有参数
			{
				"w": {
					"param": self.conv.params["w"]["param"],
					"grad": self.conv.params["w"]["grad"],
					"pregrad": np.zeros_like(self.conv.params["w"]["grad"])
				},
				"b": {
					"param": self.conv.params["b"]["param"],
					"grad": self.conv.params["b"]["grad"],
					"pregrad": np.zeros_like(self.conv.params["b"]["grad"])
				},
			},
			{
				"w": {
					"param": self.linear1.params["w"]["param"],
					"grad": self.linear1.params["w"]["grad"],
					"pregrad": np.zeros_like(self.linear1.params["w"]["grad"])
				},
				"b": {
					"param": self.linear1.params["b"]["param"],
					"grad": self.linear1.params["b"]["grad"],
					"pregrad": np.zeros_like(self.linear1.params["b"]["grad"])
				},
			},
			{
				"w": {
					"param": self.linear2.params["w"]["param"],
					"grad": self.linear2.params["w"]["grad"],
					"pregrad": np.zeros_like(self.linear2.params["w"]["grad"])
				},
				"b": {
					"param": self.linear2.params["b"]["param"],
					"grad": self.linear2.params["b"]["grad"],
					"pregrad": np.zeros_like(self.linear2.params["b"]["grad"])
				},
			},
		]
		
	def oracle(self,x,y):												 # 计算损失函数值,输出得分,损失函数梯度: x为一个N_samples×N_channels×Height×Width的张量,y为类别标签
		""" 前向传播 """
		""" 卷积层 """
		conv_out = self.conv.forward(x)

		""" 激活层 """
		relu1_out = self.relu1.forward(conv_out)

		""" 池化层 """
		pool_out = self.pool.forward(relu1_out)

		""" 线性层 """
		linear1_out = self.linear1.forward(pool_out)

		""" 激活层 """
		relu2_out = self.relu2.forward(linear1_out)

		""" 线性层 """
		linear2_out = self.linear2.forward(relu2_out)

		""" 软大交叉熵 """
		fx,g,s = self.softmaxce(linear2_out,y)							 # 损失函数值&梯度(是算在最后一层上面的梯度)&得分
		""" 反向传播 """
		linear2_back = self.linear2.backward(g,relu2_out)
		self.update_param()
		relu2_back = self.relu2.backward(linear2_back,linear1_out)
		self.update_param()
		linear1_back = self.linear1.backward(relu2_back,pool_out)
		self.update_param()
		pool_back = self.pool.backward(linear1_back,relu1_out)
		self.update_param()
		relu1_back = self.relu1.backward(pool_back,conv_out)
		self.update_param()
		conv_back = self.conv.backward(relu1_back,x)
		self.update_param()
		return fx,s

	def oracle_time(self,x,y):											 # 计算损失函数值,输出得分,损失函数梯度: x为一个N_samples×N_channels×Height×Width的张量,y为类别标签
		""" 前向传播 """
		""" 卷积层 """
		t = time.time()
		conv_out = self.conv.forward(x)
		conv_out_time = time.time()-t

		""" 激活层 """
		t = time.time()
		relu1_out = self.relu1.forward(conv_out)
		relu1_out_time = time.time()-t

		""" 池化层 """
		t = time.time()
		pool_out = self.pool.forward(relu1_out)
		pool_out_time = time.time()-t

		""" 线性层 """
		t = time.time()
		linear1_out = self.linear1.forward(pool_out)
		linear1_out_time = time.time()-t

		""" 激活层 """
		t = time.time()
		relu2_out = self.relu2.forward(linear1_out)
		relu2_out_time = time.time()-t

		""" 线性层 """
		t = time.time()
		linear2_out = self.linear2.forward(relu2_out)
		linear2_out_time = time.time()-t

		""" 软大交叉熵 """
		t = time.time()
		fx,g,s = self.softmaxce(linear2_out,y)							 # 损失函数值&梯度(是算在最后一层上面的梯度)&得分
		sf_out_time = time.time()-t
		""" 反向传播 """
		t = time.time()
		linear2_back = self.linear2.backward(g,relu2_out)
		self.update_param()
		linear2_back_time = time.time()-t
		t = time.time()
		relu2_back = self.relu2.backward(linear2_back,linear1_out)
		self.update_param()
		relu2_back_time = time.time()-t
		t = time.time()
		linear1_back = self.linear1.backward(relu2_back,pool_out)
		self.update_param()
		linear1_back_time = time.time()-t
		t = time.time()
		pool_back = self.pool.backward(linear1_back,relu1_out)
		self.update_param()
		pool_back_time = time.time()-t
		t = time.time()
		relu1_back = self.relu1.backward(pool_back,conv_out)
		self.update_param()
		relu1_back_time = time.time()-t
		t = time.time()
		conv_back = self.conv.backward(relu1_back,x)
		self.update_param()
		conv_back_time = time.time()-t

		timedict = dict(
			conv_out_time=conv_out_time,
			relu1_out_time=relu1_out_time,
			pool_out_time=pool_out_time,
			linear1_out_time=linear1_out_time,
			relu2_out_time=relu2_out_time,
			linear2_out_time=linear2_out_time,
			conv_back_time=conv_back_time,
			relu1_back_time=relu1_back_time,
			pool_back_time=pool_back_time,
			linear1_back_time=linear1_back_time,
			relu2_back_time=relu2_back_time,
			linear2_back_time=linear2_back_time,
		)
		return fx,s,timedict

	def score(self,x):													 # 预测的得分,除了oracle函数外还需要一个另外的得分函数,这在检查精度时是有用的: x为输入特征
		conv_out = self.conv(x)
		relu1_out = self.relu1(conv_out)
		pool_out = self.pool(relu1_out)
		linear1_out = self.linear1(pool_out)
		relu2_out = self.relu2(linear1_out)
		linear2_out = self.linear2(relu2_out)
		s = softmax(linear2_out)
		return s

	def update_param(self,):											 # 更新参数及梯度
		self.param_groups[0]["w"]["param"] = self.conv.params["w"]["param"]
		self.param_groups[0]["w"]["grad"] = self.conv.params["w"]["grad"]
		self.param_groups[0]["b"]["param"] = self.conv.params["b"]["param"]
		self.param_groups[0]["b"]["grad"] = self.conv.params["b"]["grad"]
		self.param_groups[1]["w"]["param"] = self.linear1.params["w"]["param"]
		self.param_groups[1]["w"]["grad"] = self.linear1.params["w"]["grad"]
		self.param_groups[1]["b"]["param"] = self.linear1.params["b"]["param"]
		self.param_groups[1]["b"]["grad"] = self.linear1.params["b"]["grad"]
		self.param_groups[2]["w"]["param"] = self.linear2.params["w"]["param"]
		self.param_groups[2]["w"]["grad"] = self.linear2.params["w"]["grad"]
		self.param_groups[2]["b"]["param"] = self.linear2.params["b"]["param"]
		self.param_groups[2]["b"]["grad"] = self.linear2.params["b"]["grad"]

if __name__ == "__main__":
	cnn = CNN()
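
For reference, the spatial sizes in the comments above (3×32×32 --> 3×26×26 --> 3×13×13 --> 507) follow the usual formula out = (in + 2*padding - kernel)//stride + 1. A quick sketch of that arithmetic for the default CNN() configuration:

# Output-size bookkeeping for the default CNN() (3x32x32 input, 7x7 conv, 2x2 pool).
def out_size(n, kernel, stride, padding):
	return (n + 2*padding - kernel)//stride + 1

h = out_size(32, 7, 1, 0)		# conv, stride 1, no padding: 32 -> 26
h = out_size(h, 2, 2, 0)		# max pool, stride 2: 26 -> 13
print(3*h*h)					# channels stay at 3, so linear1 sees 3*13*13 = 507 features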

nn/layers.py

#-*- coding:UTF-8 -*-
import abc
import numpy as np
from numba import jit

class Layer(object):													 # 神经网络层的基类,该类包含抽象方法,因此不可以被实例化
	def __init__(self,):												 # 抽象类构造函数
		self.params = dict()											 # parameter dict: keys are parameter-name strings, values are dicts with "param" and "grad" fields holding the parameter value and its gradient

	@abc.abstractmethod
	def forward(self,x):												 # 评估输入特征与返回输出: x为输入特征, 返回值f(x)为输出特征
		pass

	@abc.abstractmethod
	def backward(self,grad_in,x):										 # 计算梯度并将梯度反向传播, 为了未来可以参考, 更新过的梯度应该被储存在self.params的对应区域中: grad_in为从反向传播得到的梯度, x为输入特征, 返回值grad_x为反向传播到下一层的梯度(分别为w.r.t x)
		pass

	def __call__(self,*args,**kwargs):									 # 使得Layer类型的变量可调用
		return self.forward(*args,**kwargs)


class Conv(Layer):														 # 卷积层类, 参数w为卷积过滤器, 参数b为偏差
	def __init__(self,height,width,
		in_channels=3,
		out_channels=3,
		stride=1,
		padding=0,
		init_scale=1e-2,
	):																	 # 构造函数
		super(Conv,self).__init__()
		""" 类构造参数 """
		self.height = height											 # 卷积核的高度
		self.width = width												 # 卷积核的宽度
		self.in_channels = in_channels									 # 输入轨道数
		self.out_channels = out_channels								 # 输出轨道数
		self.stride = stride											 # 卷积核移动步长
		self.padding = padding											 # 是否需要在周围补充0
		self.init_scale = init_scale									 # 初始规模
		""" 父类参数: 我不在类初始化时设定grad参数, 因为如果重复调用同一对象的backward方法可能会导致梯度重复更新而错误 """
		self.params["w"] = {											 # 我理解的情况是这样的: 每个卷积过滤器的轨道数应与输入特征的轨道数相同, 每个卷积过滤器可以生成一个输出层轨道, 输出轨道数应当与卷积过滤器的数量相一致
			"param": self.init_scale*np.random.random((self.out_channels,
							self.in_channels,self.height,self.width)),
			"grad": None,
		}
		self.params["b"] = {											 # 偏差值
			"param": self.init_scale*np.random.random((self.out_channels,)),
			"grad": None,
		}

	@jit(nopython=False,parallel=True)
	def forward(self,x):												 # 前向传播: x为一只四维张量, N_samples×n_Channels×Height×Width, 返回值out为卷积核的输出
		nSamples,nChannels,height,width = x.shape						 # 获取输入张量x的四个维度值
		assert nChannels==self.in_channels								 # 断言
		outshape = (													 # 计算输出的形状
			nSamples,self.out_channels,									 # 样本数, 输出轨道数
			int((2*self.padding+height-self.height)/self.stride)+1,		 # 输出高度
			int((2*self.padding+width-self.width)/self.stride)+1		 # 输出宽度
		)
		out = np.zeros(outshape)										 # 初始化输出
		if self.padding:												 # 如果需要padding
			x_ = np.zeros((
				nSamples,nChannels,
				height+2*self.padding,
				width+2*self.padding
			))
			x_[:,:,self.padding:-self.padding,
				self.padding:-self.padding] = x
		else: x_ = x.copy()
		for i in range(outshape[0]):									 # 遍历样本
			for j in range(outshape[1]):								 # 遍历输出轨道
				for k in range(outshape[2]):							 # 遍历像素点
					for l in range(outshape[3]):						 # 虽然很蠢, 但是用迭代生成器也不是很方便操作感觉
						x1,y1 = k*self.stride,l*self.stride
						x2,y2 = x1+self.height,y1+self.width
						total = 0
						for m in range(nChannels):						 # 遍历输入轨道计算哈氏积并累和: 这里天坑在于check_gradient.py的正确结果是默认w参数第一个维度是输出轨道, 第二个维度是输入轨道得到的, 而参数表里是先输入轨道后输出轨道的
							t1 = x_[i,m,x1:x2,y1:y2]					 # 输入对应区域
							t2 = self.params["w"]["param"][j,m,:,:]		 # 卷积过滤器对应区域
							total += np.nansum(t1*t2)
						out[i,j,k,l] = total+self.params["b"]["param"][j]# 最后不要忘了加上偏差值
		return out
	
	@jit(nopython=False,parallel=True)
	def backward(self,grad_in,x):										 # 卷积层的反向传播: grad_in的维度与卷积层forward中输出的维度相同
		self.params["w"]["grad"] = np.zeros((
			self.out_channels,self.in_channels,self.height,self.width))	 # 选择在反向传播时再重定义grad参数
		self.params["b"]["grad"] = np.zeros((self.out_channels,))		 # 选择在反向传播时再重定义grad参数
		nSamples,nChannels,height,width = x.shape	
		outshape = (													 # 计算输出的形状: 也是grad_in的形状
			nSamples,self.out_channels,									 # 样本数, 输出轨道数
			int((2*self.padding+height-self.height)/self.stride)+1,		 # 输出高度
			int((2*self.padding+width-self.width)/self.stride)+1		 # 输出宽度
		)
		assert outshape==grad_in.shape									 # 断言
		x_ = np.zeros(( 												 # 复现padding后的输入
			nSamples,nChannels,
			height+2*self.padding,
			width+2*self.padding
		))
		if self.padding:												 # 如果需要padding
			x_[:,:,self.padding:-self.padding,
				self.padding:-self.padding] = x
		else: x_ = x.copy()
		grad_x = np.zeros_like(x_)										 # 先设法对padding后的x求梯度, 然后只需取grad_x中间部分即可
		"""
			https://www.cnblogs.com/pinard/p/6494810.html
			上面链接中给出了步长为1且不padding情况下卷积层反向传播极其简洁的表达式;
			很可惜我只能用最愚蠢的方法来一个个填写梯度了;
		"""
		for i in range(outshape[0]):									 # 遍历样本
			for j in range(outshape[1]):								 # 遍历输出轨道
				self.params["b"]["grad"][j] += np.nansum(grad_in[i,j,:,:])
				for k in range(outshape[2]):							 # 遍历像素点
					for l in range(outshape[3]):						 # grad_in的维度必然为outshape, 通过遍历前向传播中outshape中每个位置上元素表达式来反向求导
						x1,y1 = k*self.stride,l*self.stride
						x2,y2 = x1+self.height,y1+self.width
						for m in range(nChannels):
							grad_x[i,m,x1:x2,y1:y2] += grad_in[i,j,
								k,l]*self.params["w"]["param"][j,m,:,:]
							self.params["w"]["grad"][j,m,:,:] += grad_in[
								i,j,k,l]*x_[i,m,x1:x2,y1:y2]		
		grad_x = grad_x[:,:,self.padding:-self.padding,
			self.padding:-self.padding] if self.padding else grad_x		 # 取出grad_x中不是padding的部分作为将要传递下去的梯度
		return grad_x

class Linear(Layer):													 # 线性层类, 用于对特征应用线性变换: w为n_in×n_out的矩阵, b为1×n_out的向量
	def __init__(self,in_features,out_features,
		init_scale=1e-2
	):																	 # 构造函数
		super(Linear,self).__init__()
		""" 类构造参数 """
		self.in_features = in_features									 # 输入特征数量
		self.out_features = out_features								 # 输出特征数量
		self.init_scale = init_scale									 # initialisation scale
		""" 父类参数 """
		self.params["w"] = {											 # 线性变换矩阵
			"param": self.init_scale*np.random.random((
						self.in_features,self.out_features)),
			"grad": None,												 # 不多说, 选择放在backward方法中初始化
		}
		self.params["b"] = {											 # 常数项偏差
			"param": self.init_scale*np.random.random((1,self.out_features)),
			"grad": None,
		}
		
	def forward(self,x):												 # 前向传播: x为维度为[n,d1,d2,...,dm]的输入特征, 返回值out为输出特征
		w = self.params["w"]["param"]						
		b = self.params["b"]["param"]
		x_ = x.reshape(x.shape[0],-1)									 # 全连接层我们需要以把每个输入样本压成一条向量处理
		out = np.dot(x_,w)+b											 # 这里两个维度分别为n_Sample×n_out与n_out×1, 但是它们还是可以相加的, 结果为每个sample加上b
		return out

	def backward(self,grad_in,x):										 # backward pass of the linear layer: grad_in has the same shape as out in forward
		"""
			out = np.dot(x,w) + b;
			x.shape = (n_Sample,in_features);
			w.shape = (in_features,out_features);
			out.shape = grad_in.shape = (n_Sample,out_features);
			b.shape = (1,out_features);
			the gradient of b is grad_in summed over the samples, shape (out_features,)
		"""
		x_ = x.reshape(x.shape[0],-1)
		self.params["w"]["grad"] = np.dot(x_.T,grad_in)					 # dL/dw = x^T * grad_in
		self.params["b"]["grad"] = np.nansum(grad_in,axis=0)			 # b contributes to every sample, so sum over the batch
		grad_x = np.dot(grad_in,self.params["w"]["param"].T)			 # dL/dx = grad_in * w^T
		return grad_x
		
class Relu(Layer):														 # 激活层类
	def __init__(self):													 # 构造函数
		super(Relu,self).__init__()

	def forward(self,x):												 # 前向传播
		return np.maximum(x,0)											 # 正数返回, 负数变零

	def backward(self,grad_in,x):										 # 反向传播
		return grad_in*(x>0)											 # 返回输入的梯度乘上relu的梯度

class MaxPool(Layer):													 # 池化层类
	def __init__(self,kernel_size,
		stride=2,
		padding=0
	):																	 # 构造函数
		super(MaxPool,self).__init__()
		""" 类构造参数 """
		self.kernel_size = kernel_size
		self.stride = stride
		self.padding = padding
		""" 父类参数 """
		self.params = dict()											 # 池化层应该是没有参数

	def forward(self,x):												 # 前向传播: x为一只四维张量, N_samples×n_Channels×Height×Width, 返回值out为池化层的输出
		nSamples,nChannels,height,width = x.shape						 # 获取输入张量x的四个维度值
		outshape = (													 # 计算输出的形状
			nSamples,nChannels,											 # 样本数, 输出轨道数
			int((2*self.padding+height-self.kernel_size)/self.stride)+1, # 输出高度
			int((2*self.padding+width-self.kernel_size)/self.stride)+1	 # 输出宽度
		)
		out = np.zeros(outshape)										 # 初始化输出
		if self.padding:												 # 如果需要padding
			x_ = np.zeros((
				nSamples,nChannels,
				height+2*self.padding,
				width+2*self.padding
			))
			x_[:,:,self.padding:-self.padding,
				self.padding:-self.padding] = x
		else: x_ = x.copy()
		for i in range(outshape[0]):									 # 遍历样本
			for j in range(outshape[1]):								 # 遍历输出轨道
				for k in range(outshape[2]):							 # 遍历像素点
					for l in range(outshape[3]):						 # 虽然很蠢, 但是用迭代生成器也不是很方便操作感觉
						x1,y1 = k*self.stride,l*self.stride
						x2,y2 = x1+self.kernel_size,y1+self.kernel_size
						out[i,j,k,l] = np.nanmax(x_[i,j,x1:x2,y1:y2])	 # 取窗口最大值
		return out

	def backward(self,grad_in,x):										 # backward pass: grad_in has exactly the shape of the MaxPool output, so we must find the region of x each element of grad_in came from
		"""
			Neither max pooling nor average pooling has learnable parameters;
			we only need to route the error term back to the previous layer, no parameter gradients to compute;
			max pooling: the error of each output cell is passed unchanged to the neuron that attained the maximum in the corresponding window, all other neurons get 0;
			average pooling: the error of each output cell is split evenly over all neurons in the corresponding window;
		"""
		nSamples,nChannels,height,width = x.shape						 # 获取输入的形状
		outshape = (													 # 计算输出的形状: 也是grad_in的形状
			nSamples,nChannels,											 # 样本数, 输出轨道数
			int((2*self.padding+height-self.kernel_size)/self.stride)+1, # 输出高度
			int((2*self.padding+width-self.kernel_size)/self.stride)+1	 # 输出宽度
		)
		grad_in_reshape = grad_in.reshape(outshape)
		x_ = np.zeros(( 												 # 复现padding后的输入
			nSamples,nChannels,
			height+2*self.padding,
			width+2*self.padding
		))
		if self.padding:												 # 如果需要padding
			x_[:,:,self.padding:-self.padding,
				self.padding:-self.padding] = x
		else: x_ = x.copy()
		grad_x = np.zeros_like(x_)										 # 先设法对padding后的x求梯度, 然后只需取grad_x中间部分即可

		for i in range(outshape[0]):									 # 遍历样本
			for j in range(outshape[1]):								 # 遍历输出轨道
				for k in range(outshape[2]):							 # 遍历像素点
					for l in range(outshape[3]):						 # grad_in的维度必然为outshape, 通过遍历前向传播中outshape中每个位置上元素表达式来反向求导
						x1,y1 = k*self.stride,l*self.stride
						x2,y2 = x1+self.kernel_size,y1+self.kernel_size
						maxgrid = np.nanmax(x_[i,j,x1:x2,y1:y2])		 # 找出对应该grad_in格子的原区域中最大值
						grad_x[i,j,x1:x2,y1:y2] += grad_in_reshape[i,j,
							k,l]*(x_[i,j,x1:x2,y1:y2]==maxgrid)

		grad_x = grad_x[:,:,self.padding:-self.padding,
			self.padding:-self.padding] if self.padding else grad_x		 # 取出grad_x中不是padding的部分作为将要传递下去的梯度
		return grad_x
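
The quadruple loops in Conv.forward above are easy to follow but very slow. A common trick is to gather every receptive field into one matrix ("im2col") and replace the inner loops with a single matrix multiplication. Below is a minimal sketch of that idea (my own illustration, not part of the assignment code); given the same w, b, stride and padding it should reproduce Conv.forward's output up to floating-point error:

# im2col-style convolution forward pass (sketch).
import numpy as np

def conv_forward_im2col(x, w, b, stride=1, padding=0):
	# x: (N, C, H, W); w: (F, C, kh, kw) as in Conv.params["w"]; b: (F,)
	N, C, H, W = x.shape
	F, _, kh, kw = w.shape
	x_p = np.pad(x, ((0, 0), (0, 0), (padding, padding), (padding, padding)))
	out_h = (H + 2*padding - kh)//stride + 1
	out_w = (W + 2*padding - kw)//stride + 1
	cols = np.empty((N, out_h*out_w, C*kh*kw))
	idx = 0
	for i in range(out_h):												 # only two loops remain, over output positions
		for j in range(out_w):
			patch = x_p[:, :, i*stride:i*stride+kh, j*stride:j*stride+kw]
			cols[:, idx, :] = patch.reshape(N, -1)
			idx += 1
	out = cols @ w.reshape(F, -1).T + b									 # (N, out_h*out_w, F)
	return out.transpose(0, 2, 1).reshape(N, F, out_h, out_w)

# e.g. conv = Conv(3, 3, 3, 3, 1, 1); x = np.random.randn(2, 3, 8, 8)
# conv_forward_im2col(x, conv.params["w"]["param"], conv.params["b"]["param"], 1, 1)
# should match conv(x).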

nn/optimizer.py

#-*- coding:UTF-8 -*-
import abc
import numpy as np

class Optimizer(object):												 # abstract optimizer class for first-order methods (SGD is first-order, Newton would be second-order): param_groups is the list of all model parameters
	def __init__(self,param_groups):									 # constructor
		self.param_groups = param_groups								 # param_groups is a list; each group usually holds the dicts "w" and "b", which in turn hold "param" and "grad" fields

	@abc.abstractmethod
	def step(self):														 # take one optimisation step
		pass

class SGD(Optimizer):													 # stochastic gradient descent
	def __init__(self,param_groups,
		lr=1e-2,
		weight_decay=0.0,
		momentum=0.0
	):																	 # constructor; see HW2 for the meaning of the hyperparameters
		super(SGD,self).__init__(param_groups)
		self.configs = dict(											 # hyperparameters: learning rate, weight decay, momentum
			lr=lr,
			weight_decay=weight_decay,
			momentum=momentum,
		)

	def step(self):														 # take one optimisation step
		lr = self.configs["lr"]											 # learning rate
		weight_decay = self.configs["weight_decay"]						 # weight decay coefficient
		momentum = self.configs["momentum"]								 # momentum coefficient
		""" classical momentum; Nesterov momentum felt like extra bookkeeping for little gain here """
		count = 0
		for group in self.param_groups:									 # loop over parameter groups
			count += 1
			for k,p in group.items():									 # loop over the parameters in the group
				grad = p["grad"]										 # current gradient
				pregrad = p["pregrad"]									 # previous update direction
				if k=="w":												 # weight matrices get weight decay
					v = momentum*pregrad - grad - weight_decay*p["param"]
					p["param"] += lr*v
				else:
					v = momentum*pregrad - grad
					p["param"] += lr*v									 # biases are not weight-decayed
				p["pregrad"] = v										 # remember this update direction

nn/loss.py

#-*- coding:UTF-8 -*-
import numpy as np

def softmax(x):															 # given a matrix, subtract the row-wise maximum and compute the softmax of every row
	x_bar = x-np.nanmax(x,axis=1,keepdims=True)							 # subtract the row maximum to avoid numerical overflow
	z = np.nansum(np.exp(x_bar),axis=1,keepdims=True)					 # row-wise sum of exponentials
	return np.exp(x_bar)/z												 # softmax output

class SoftmaxCE(object):												 # softmax followed by the cross-entropy loss
	def __init__(self):													 # constructor
		pass

	@staticmethod
	def __call__(x,y):													 # x is an n_samples×n_classes score matrix, y holds the integer labels 0~9
		sf = softmax(x)													 # softmax of x
		n = x.shape[0]													 # number of samples
		sf_log = -np.log(sf[range(n),y])								 # negative log-probability assigned to the true class
		loss = np.mean(sf_log)											 # average cross-entropy
		g = sf.copy()
		g[range(n),y] -= 1												 # the softmax cross-entropy gradient has this famously simple form
		g /= n
		return loss,g,sf												 # return loss, gradient w.r.t. x, and the softmax scores
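
As a sanity check, the gradient returned above is (softmax(x) - one-hot(y)) / n and can be compared against eval_numerical_gradient from check_gradient.py. A minimal sketch of my own, in the spirit of test_grad.py (assuming the nn package is importable as in train_small.py):

# Numeric check of the SoftmaxCE gradient on a tiny random problem.
import numpy as np
from nn.loss import SoftmaxCE
from check_gradient import eval_numerical_gradient

np.random.seed(231)
x = np.random.randn(5,10)												 # 5 samples, 10 class scores each
y = np.random.randint(10,size=5)										 # integer labels
sce = SoftmaxCE()
loss,g,_ = sce(x,y)														 # analytic gradient g
g_num = eval_numerical_gradient(lambda v: sce(v,y)[0], x, verbose=False)
print(np.max(np.abs(g-g_num)))											 # should be tiny, around 1e-8 or less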

nn/utils.py

#-*- coding:UTF-8 -*-
import numpy as np

def accuracy(score,y):													 # classification accuracy, a scalar in [0,1]: score is an n×n_classes score matrix, y is the length-n label vector
	acc = np.mean(score.argmax(axis=1)==y)
	return acc
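
A tiny usage example (made-up numbers) of what accuracy expects:

# score: one row of class scores per sample; y: integer labels.
score = np.array([[0.1, 0.7, 0.2],
				  [0.5, 0.3, 0.2]])
y = np.array([1, 2])
print(accuracy(score, y))				# row argmaxes are [1, 0], so accuracy is 0.5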

Also attached below is a Keras reproduction of the architecture above, together with the test code for tuning variants of it:

#-*- coding:UTF-8 -*-
import time
import numpy as np
import tensorflow as tf
from dataset import get_cifar10_data
from matplotlib import pyplot as plt

from keras.optimizers import SGD
from keras.utils import to_categorical
from keras.models import Sequential,Model
from keras.layers import Dense,Dropout,Activation,Flatten,Conv2D,MaxPooling2D,Embedding,LSTM,Input

def read_data(num_train=100):											 # load the data
	data = get_cifar10_data()					
	data = {															 # repack data as a dict
		"X_train": data["X_train"][:num_train].transpose(0,2,3,1),		 # move the channel axis to the last dimension
		"y_train": to_categorical(data["y_train"][:num_train],10),		 # one-hot encode the labels
		"X_val": data["X_val"].transpose(0,2,3,1),						 # move the channel axis to the last dimension
		"y_val": to_categorical(data["y_val"],10),						 # one-hot encode the labels
	}
	return data

def model_1():															 # conv+relu+maxpool+linear+relu+linear+softmax									
	model = Sequential()
	model.add(Conv2D(3,(7,7),activation="relu",input_shape=(32,32,3)))
	model.add(MaxPooling2D(pool_size=(2,2)))
	model.add(Flatten())
	model.add(Dense(100,activation="relu"))
	model.add(Dense(10,activation="softmax"))
	sgd = SGD(lr=0.01,decay=1e-6,momentum=0.9,nesterov=False)
	model.compile(loss="categorical_crossentropy",optimizer=sgd,metrics=["accuracy"])
	model.summary()
	return model

def model_2():
	model = Sequential()
	model.add(Conv2D(3,(7,7),activation="relu",input_shape=(32,32,3)))
	model.add(MaxPooling2D(pool_size=(2,2)))
	model.add(Conv2D(3,(3,3),activation="relu",input_shape=(13,13,3)))
	model.add(Flatten())
	model.add(Dense(100,activation="sigmoid"))
	model.add(Dense(10,activation="softmax"))
	sgd = SGD(lr=0.01,decay=1e-6,momentum=0.9,nesterov=False)
	model.compile(loss="categorical_crossentropy",optimizer=sgd,metrics=["accuracy"])
	model.summary()
	return model

def model_3():
	model = Sequential()
	model.add(Conv2D(3,(7,7),activation="relu",input_shape=(32,32,3)))
	model.add(MaxPooling2D(pool_size=(2,2)))
	model.add(Flatten())
	model.add(Dense(256,activation="sigmoid"))
	model.add(Dense(128,activation="sigmoid"))
	model.add(Dense(10,activation="softmax"))
	sgd = SGD(lr=0.01,decay=1e-6,momentum=0.9,nesterov=False)
	model.compile(loss="categorical_crossentropy",optimizer=sgd,metrics=["accuracy"])
	model.summary()
	return model

def model_4():
	model = Sequential()
	model.add(Conv2D(3,(7,7),activation="relu",input_shape=(32,32,3)))
	model.add(MaxPooling2D(pool_size=(2,2)))
	model.add(Conv2D(3,(3,3),activation="relu",input_shape=(13,13,3)))
	model.add(Flatten())
	model.add(Dense(256,activation="sigmoid"))
	model.add(Dense(128,activation="sigmoid"))
	model.add(Dense(10,activation="softmax"))
	sgd = SGD(lr=0.01,decay=1e-6,momentum=0.9,nesterov=False)
	model.compile(loss="categorical_crossentropy",optimizer=sgd,metrics=["accuracy"])
	model.summary()
	return model

def model_adagrad():
	model = Sequential()
	model.add(Conv2D(3,(7,7),activation="relu",input_shape=(32,32,3)))
	model.add(MaxPooling2D(pool_size=(2,2)))
	model.add(Flatten())
	model.add(Dense(256,activation="sigmoid"))
	model.add(Dense(128,activation="sigmoid"))
	model.add(Dense(10,activation="softmax"))
	model.compile(loss="categorical_crossentropy",optimizer="adagrad",metrics=["accuracy"])
	model.summary()
	return model

def model_adam():
	model = Sequential()
	model.add(Conv2D(3,(7,7),activation="relu",input_shape=(32,32,3)))
	model.add(MaxPooling2D(pool_size=(2,2)))
	model.add(Flatten())
	model.add(Dense(256,activation="sigmoid"))
	model.add(Dense(128,activation="sigmoid"))
	model.add(Dense(10,activation="softmax"))
	model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"])
	model.summary()
	return model

if __name__ == "__main__":
	""" P3Q1 """
	model = model_1()
	#model = model_2()
	#model = model_3()
	#model = model_4()
	data = read_data(49000)
	t = time.time()
	model.fit(data["X_train"],data["y_train"],batch_size=50,epochs=10)
	print("训练耗时:{}".format(time.time()-t))
	score = model.evaluate(data["X_val"],data["y_val"],batch_size=32)
	print(score)

	""" P3Q2 """
	#model = model_adagrad()
	model = model_adam()
	data = read_data(49000)
	t = time.time()
	model.fit(data["X_train"],data["y_train"],batch_size=50,epochs=10)
	print("训练耗时:{}".format(time.time()-t))
	score = model.evaluate(data["X_val"],data["y_val"],batch_size=32)
	print(score)

Learn by sharing, improve together!

 

 
