# -*- coding: utf-8 -*-
import sys, os, time
import lasagne
import numpy as np
import theano
import theano.tensor as T
import cPickle
import xnor_net
import cnn_utils
from external import bnn_utils
import gzip
from collections import OrderedDict
def _conv_bn_block(net, num_filters, xnor, alpha, eps, pool=False):
    """Append a 3x3 conv layer (optionally followed by 2x2 max-pool) + batch-norm.

    net         -- layer to build on
    num_filters -- number of conv filters
    xnor        -- True for a binary (XNOR) conv layer; False keeps full precision
    alpha       -- batch-norm moving-average coefficient
    eps         -- batch-norm epsilon
    pool        -- append a 2x2 MaxPool2DLayer before the batch-norm when True
    """
    net = xnor_net.Conv2DLayer(
        net,
        xnor=xnor,                 # if False, inputs are binarized internally
        num_filters=num_filters,
        filter_size=(3, 3),
        pad=1,                     # 'same'-style padding (lasagne Conv2DLayer arg)
        nonlinearity=lasagne.nonlinearities.identity)
    if pool:
        net = lasagne.layers.MaxPool2DLayer(net, pool_size=(2, 2))
    return lasagne.layers.BatchNormLayer(net, epsilon=eps, alpha=alpha)


def _dense_bn_block(net, num_units, alpha, eps):
    """Append a binary (XNOR) fully-connected layer + batch-norm."""
    net = xnor_net.DenseLayer(
        net,
        xnor=True,
        nonlinearity=lasagne.nonlinearities.identity,
        num_units=num_units)
    return lasagne.layers.BatchNormLayer(net, epsilon=eps, alpha=alpha)


def construct_cifar10_net(input_var, alpha, eps):
    """Build the XNOR-Net CIFAR-10 architecture.

    Structure:
        input (3x32x32)
        -> conv1(128) + BN               (full precision)
        -> conv2(128) + maxpool + BN     (binary)
        -> conv3(256) + BN               (binary)
        -> conv4(256) + maxpool + BN     (binary)
        -> conv5(512) + BN               (binary)
        -> conv6(512) + maxpool + BN     (binary)
        -> fc1(1024) + BN                (binary)
        -> fc2(1024) + BN                (binary)
        -> fc3(10, softmax)              (full precision)

    input_var -- theano 4-D tensor variable fed to the input layer
    alpha     -- batch-norm moving-average coefficient
    eps       -- batch-norm epsilon
    Returns the output lasagne layer of the network.
    """
    cnn = lasagne.layers.InputLayer(shape=(None, 3, 32, 32), input_var=input_var)
    # First conv layer is not binary: as the XNOR-Net paper states, the
    # computational savings are very small when the conv layer has few input
    # channels (RGB input has only 3).
    cnn = _conv_bn_block(cnn, 128, xnor=False, alpha=alpha, eps=eps)
    cnn = _conv_bn_block(cnn, 128, xnor=True, alpha=alpha, eps=eps, pool=True)
    cnn = _conv_bn_block(cnn, 256, xnor=True, alpha=alpha, eps=eps)
    cnn = _conv_bn_block(cnn, 256, xnor=True, alpha=alpha, eps=eps, pool=True)
    cnn = _conv_bn_block(cnn, 512, xnor=True, alpha=alpha, eps=eps)
    cnn = _conv_bn_block(cnn, 512, xnor=True, alpha=alpha, eps=eps, pool=True)
    cnn = _dense_bn_block(cnn, 1024, alpha=alpha, eps=eps)
    cnn = _dense_bn_block(cnn, 1024, alpha=alpha, eps=eps)
    # Final classifier layer: full precision, softmax over the 10 classes.
    cnn = xnor_net.DenseLayer(
        cnn,
        xnor=False,
        nonlinearity=lasagne.nonlinearities.softmax,
        num_units=10)
    return cnn
if __name__ == '__main__':
    # Train an XNOR-Net on CIFAR-10. First and last layers stay full precision
    # (as in the XNOR-Net paper: binarizing them costs accuracy for very little
    # computational saving).
    xnor = True
    # File the trained model parameters are saved to.
    model_file = 'xnor_net_cifar10_nonxnor_first_lyr.npz'
    # Hyper-parameters.
    batch_size = 50   # mini-batch size
    alpha = 0.1       # batch-norm moving-average coefficient
    eps = 1e-4        # batch-norm epsilon
    no_epochs = 200   # number of training epochs
    # Exponentially decayed learning rate, same schedule as in BinaryNet.
    LR_start = 0.001
    LR_end = 0.0000003
    LR_decay = (LR_end/LR_start)**(1./no_epochs)  # per-epoch decay factor
    print('LR_start = {:f}\tLR_end = {:f}\tLR_decay = {:f}'.format(LR_start, LR_end, LR_decay))
    # Symbolic variables: image batch, one-hot float targets, scalar learning rate.
    input_vars = T.tensor4('input')
    targets = T.fmatrix('target')
    LR = T.scalar('LR', dtype=theano.config.floatX)
    # Build the network graph.
    print('Constructing the network...')
    net = construct_cifar10_net(input_vars, alpha, eps)
    # Load the CIFAR-10 data splits.
    print('Loading the data...')
    train_x, val_x, test_x, train_y, val_y, test_y = cnn_utils.load_data('cifar10')
    # Training-mode (non-deterministic) forward pass for the training loss.
    train_pred = lasagne.layers.get_output(net, deterministic=False)
    print('Constructed symbolic output')
    # As per the paper: negative log-likelihood on the softmax output,
    # averaged over all images in the mini-batch.
    loss = lasagne.objectives.categorical_crossentropy(train_pred, targets)
    loss = T.mean(loss)
    print('Constructed symbolic training loss')
    # Define the parameter updates. No weight clipping needed as in BinaryNet.
    print('Defining the update process...')
    if xnor:
        # Binary weights: Adam on gradients computed w.r.t. the binarized
        # weights, then clipping/scaling normalization after the update.
        W = lasagne.layers.get_all_params(net, xnor=True)
        W_grads = bnn_utils.compute_grads(loss, net)
        updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
        updates = bnn_utils.clipping_scaling(updates, net)
        # Remaining (real-valued) trainable parameters get plain Adam updates.
        # list() wrapping keeps the concatenation working on both Python 2
        # (where .items() returns a list) and Python 3 (a view).
        params = lasagne.layers.get_all_params(net, trainable=True, xnor=False)
        updates = OrderedDict(
            list(updates.items()) +
            list(lasagne.updates.adam(loss_or_grads=loss,
                                      params=params,
                                      learning_rate=LR).items()))
    else:
        # Plain Adam over all trainable parameters.
        params = lasagne.layers.get_all_params(net, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR)
    # Deterministic forward pass for validation/test loss and error rate.
    print('Creating test prediction, loss and error expressions...')
    test_pred = lasagne.layers.get_output(net, deterministic=True)
    test_loss = T.mean(lasagne.objectives.categorical_crossentropy(test_pred, targets))
    # Mis-classification rate: T.neq is element-wise "a != b"; compare the
    # predicted class index against the one-hot target's argmax.
    test_err = T.mean(T.neq(T.argmax(test_pred, axis=1), T.argmax(targets, axis=1)),
                      dtype=theano.config.floatX)
    # Compile theano functions: one training step (applies `updates`) and one
    # evaluation pass returning loss and error rate.
    train_fn = theano.function([input_vars, targets, LR], loss, updates=updates)
    test_fn = theano.function([input_vars, targets], [test_loss, test_err])
    print('Created theano functions for training and validation...')
    print('Training...')
    print('Trainset shape = ', train_x.shape, train_y.shape)
    print('Valset shape = ', val_x.shape, val_y.shape)
    print('Testset shape = ', test_x.shape, test_y.shape)
    bnn_utils.train(
        train_fn, test_fn,       # training and evaluation theano functions
        net,                     # the network (for parameter saving)
        batch_size,
        LR_start, LR_decay,      # learning-rate schedule
        no_epochs,
        train_x, train_y,
        val_x, val_y,
        test_x, test_y,
        save_path=model_file,
        shuffle_parts=1)
    # This should produce at most a 13.89% test error rate.