实验环境:matlab R2017a; win10
test_example_CNN.m
% Demo script: describe a small CNN, then initialise, train and test it.
% NOTE(review): train_x, train_y, test_x and test_y are not defined in this
% excerpt - they must already exist in the workspace before this runs.
cnn.layers = {
% cnn.layers is the "layers" cell array of the cnn struct; it holds 5 structs (somewhat like key/value pairs in a dictionary)
struct('type', 'i') % input layer: a single input map, because the CNN takes images as input
struct('type', 'c', 'outputmaps', 6, 'kernelsize', 5) % convolution layer
struct('type', 's', 'scale', 2) % sub sampling layer
struct('type', 'c', 'outputmaps', 12, 'kernelsize', 5) % convolution layer
struct('type', 's', 'scale', 2) % subsampling layer
};
% Training options: alpha scales the gradient step in cnnapplygrads,
% batchsize and numepochs drive the loops in cnntrain.
opts.alpha = 1;
opts.batchsize = 50;
opts.numepochs = 1;
% Initialise parameters, train on the training set, then evaluate on the test set.
cnn = cnnsetup(cnn, train_x, train_y);
cnn = cnntrain(cnn, train_x, train_y, opts);
[er, bad] = cnntest(cnn, test_x, test_y);
cnnsetup.m
% cnnsetup body: walk the layer list once, track the feature-map geometry and
% create the initial kernels, biases and output-layer weights.
%   Reads : cnn.layers, train_x (indexed as train_x(:, :, k)), train_y
%   Writes: cnn.layers{l}.k, cnn.layers{l}.b, cnn.ffb, cnn.ffW

% Octave version guard, left disabled on purpose (comment it out if it raises an error):
%assert(~isOctave() || compare_versions(OCTAVE_VERSION, '3.8.0', '>='), ['Octave 3.8.0 or greater is required for CNNs as there is a bug in convolution in previous versions. See http://savannah.gnu.org/bugs/?39314. Your version is ' myOctaveVersion]);

inputmaps = 1;                                 % number of maps feeding the current layer
mapsize = size(squeeze(train_x(:, :, 1)));     % size of one input image

numLayers = numel(cnn.layers);
for layerIdx = 1 : numLayers
    layerType = cnn.layers{layerIdx}.type;

    if strcmp(layerType, 's')
        % Sub-sampling layer: every map shrinks by 'scale' and gets one bias.
        mapsize = mapsize / cnn.layers{layerIdx}.scale;
        assert(all(floor(mapsize)==mapsize), ['Layer ' num2str(layerIdx) ' size must be integer. Actual: ' num2str(mapsize)]);
        for mapIdx = 1 : inputmaps
            cnn.layers{layerIdx}.b{mapIdx} = 0;
        end
    elseif strcmp(layerType, 'c')
        % Convolution layer: a 'valid' convolution shrinks each map by kernelsize - 1.
        ksize = cnn.layers{layerIdx}.kernelsize;
        nOut = cnn.layers{layerIdx}.outputmaps;
        mapsize = mapsize - ksize + 1;

        % Kernels are drawn uniformly from [-r, r] with r = sqrt(6 / (fan_in + fan_out)),
        % i.e. zero-mean values whose spread shrinks as the number of connections grows.
        fan_out = nOut * ksize ^ 2;
        fan_in = inputmaps * ksize ^ 2;
        initRange = sqrt(6 / (fan_in + fan_out));

        % One kernel per (input map, output map) pair and one bias per output map.
        % Author's note: because we do not know in advance which image features
        % (edges etc.) are useful, the kernels start out random and each one
        % extracts something different. convn slides the same kernel over the
        % whole image, which is what gives the weight sharing.
        for outIdx = 1 : nOut
            for inIdx = 1 : inputmaps
                cnn.layers{layerIdx}.k{inIdx}{outIdx} = (rand(ksize) - 0.5) * 2 * initRange;
            end
            cnn.layers{layerIdx}.b{outIdx} = 0;
        end
        inputmaps = nOut;
    end
end

% 'onum' is the number of labels, that's why it is calculated using size(train_y, 1).
% 'fvnum' is the number of output neurons of the last layer, the one just before the output layer.
% 'ffb' holds the biases of the output neurons.
% 'ffW' holds the weights between the last layer and the output neurons; the last
% layer is fully connected to the output layer, hence its size (onum x fvnum).
fvnum = prod(mapsize) * inputmaps;
onum = size(train_y, 1);
cnn.ffb = zeros(onum, 1);
cnn.ffW = (rand(onum, fvnum) - 0.5) * 2 * sqrt(6 / (onum + fvnum));
依次设置各层的参数,如果是卷积层,会对每个连接初始化权重,总共是inputmaps * outputmaps个权值矩阵,同时设置卷积层的偏倚,个数是该层的outputmaps;如果是下采样层,设置下采样层的偏倚,个数为采样对象的个数,也就是下采样层当前层的输入inputmaps。
最后,设定输出层神经元个数为标签个数 size(train_y, 1),同时初始化输出层的偏倚 ffb 和权重 ffW;最后一层的全部特征图被展开成长度为 fvnum = prod(mapsize) * inputmaps 的特征向量(每幅特征图的像素个数乘以特征图个数),并与输出层全连接,因此权重矩阵 ffW 的大小为 onum × fvnum。
cnntrain.m
% cnntrain body: mini-batch gradient descent over the training set.
%   Reads : cnn, train_x (samples along dim 3), train_y (samples along dim 2),
%           opts.batchsize, opts.numepochs
%   Writes: cnn (updated parameters) and cnn.rL, a smoothed history of the loss cnn.L
m = size(train_x, 3);

% Only whole batches are processed. Make the truncation explicit and tell the
% user when samples are left out, instead of dropping them silently.
numbatches = floor(m / opts.batchsize);
if numbatches * opts.batchsize ~= m
    warning('cnntrain:partialBatch', ...
        'Number of samples (%d) is not a multiple of the batch size (%d); %d sample(s) are skipped in every epoch.', ...
        m, opts.batchsize, m - numbatches * opts.batchsize);
end

cnn.rL = [];
for i = 1 : opts.numepochs
    kk = randperm(m);    % new random sample order for every epoch
    for l = 1 : numbatches
        % indices of the samples that form the l-th batch of this epoch
        batchIdx = kk((l - 1) * opts.batchsize + 1 : l * opts.batchsize);
        batch_x = train_x(:, :, batchIdx);
        batch_y = train_y(:, batchIdx);

        cnn = cnnff(cnn, batch_x);        % forward pass
        cnn = cnnbp(cnn, batch_y);        % backward pass: loss and gradients
        cnn = cnnapplygrads(cnn, opts);   % parameter update

        % Exponentially smoothed loss curve, seeded with the first batch loss.
        if isempty(cnn.rL)
            cnn.rL(1) = cnn.L;
        end
        cnn.rL(end + 1) = 0.99 * cnn.rL(end) + 0.01 * cnn.L;
    end
end
cnnff.m
卷积神经网络的前向传播
% cnnff body: forward pass of the CNN for one mini-batch.
%   Reads : cnn.layers (kernels k, biases b), cnn.ffW, cnn.ffb, batch_x
%   Writes: cnn.layers{l}.a (activations of every layer), cnn.fv, cnn.o
n = numel(cnn.layers);            % number of layers
cnn.layers{1}.a{1} = batch_x;     % the input layer just hands the batch on
inputmaps = 1;

% Every output map reacts to every input map, so a convolution layer has
% outputmaps * inputmaps connections; each connection is one convolution and
% the results are summed per output map.
for layerIdx = 2 : n
    layerType = cnn.layers{layerIdx}.type;

    if strcmp(layerType, 'c')
        ksize = cnn.layers{layerIdx}.kernelsize;
        prevMaps = cnn.layers{layerIdx - 1}.a;
        % !!below can probably be handled by "insane matrix" operations
        for outIdx = 1 : cnn.layers{layerIdx}.outputmaps
            % Accumulator with the size of a 'valid' convolution result
            % (e.g. 24*24*50 for the first convolution layer in the author's demo run).
            acc = zeros(size(prevMaps{1}) - [ksize - 1, ksize - 1, 0]);
            for inIdx = 1 : inputmaps
                % connection (inIdx -> outIdx): convolve the input map with its
                % kernel and add the result to the accumulator
                acc = acc + convn(prevMaps{inIdx}, cnn.layers{layerIdx}.k{inIdx}{outIdx}, 'valid');
            end
            % The bias is added linearly, then the sigmoid squashes the map into
            % (0, 1); seen as an image this acts like a contrast enhancement.
            cnn.layers{layerIdx}.a{outIdx} = sigm(acc + cnn.layers{layerIdx}.b{outIdx});
        end
        % the maps produced here are the inputs of the next layer
        inputmaps = cnn.layers{layerIdx}.outputmaps;

    elseif strcmp(layerType, 's')
        % Down-sampling: mean filter (a smoothing), then keep every scale-th pixel.
        stride = cnn.layers{layerIdx}.scale;
        meanKernel = ones(stride) / (stride ^ 2);
        prevMaps = cnn.layers{layerIdx - 1}.a;
        for mapIdx = 1 : inputmaps
            smoothed = convn(prevMaps{mapIdx}, meanKernel, 'valid'); % !! replace with variable
            cnn.layers{layerIdx}.a{mapIdx} = smoothed(1 : stride : end, 1 : stride : end, :);
        end
    end
end

% Feature extraction is done; what follows is the fully connected part.
% Stack all feature maps of the last layer into one feature matrix,
% one column per sample.
cnn.fv = [];
for mapIdx = 1 : numel(cnn.layers{n}.a)
    fmap = cnn.layers{n}.a{mapIdx};
    dims = size(fmap);
    cnn.fv = [cnn.fv; reshape(fmap, dims(1) * dims(2), dims(3))];
end

% Feed the features into the output perceptrons: weighted sum with the
% current ffW, plus bias, through the sigmoid.
batchCols = size(cnn.fv, 2);
cnn.o = sigm(cnn.ffW * cnn.fv + repmat(cnn.ffb, 1, batchCols));
cnnbp.m
CNN误差的反向传播
% cnnbp body: back-propagate the output error of one mini-batch.
%   Reads : cnn.o, cnn.fv, cnn.ffW, cnn.layers{l}.a / .k, batch_y
%   Writes: cnn.e, cnn.L, cnn.od, cnn.fvd, cnn.layers{l}.d (deltas),
%           cnn.layers{l}.dk / .db (kernel and bias gradients), cnn.dffW, cnn.dffb
n = numel(cnn.layers);
% error
cnn.e = cnn.o - batch_y;
% loss function: half the summed squared error, averaged over the samples (columns of e)
cnn.L = 1/2* sum(cnn.e(:) .^ 2) / size(cnn.e, 2);
%% backprop deltas
cnn.od = cnn.e .* (cnn.o .* (1 - cnn.o)); % output delta: the error times o .* (1 - o), the derivative of the sigmoid that produced o
% (the gradient dffW is obtained further down by multiplying od with fv)
cnn.fvd = (cnn.ffW' * cnn.od); % feature vector delta: derivative of the loss with respect to fv, i.e. the gradient of fv
if strcmp(cnn.layers{n}.type, 'c') % only conv layers has sigm function: if the last layer is a conv layer, fv is a sigmoid output, so its delta is also multiplied by fv .* (1 - fv)
cnn.fvd = cnn.fvd .* (cnn.fv .* (1 - cnn.fv));
end
% reshape feature vector deltas into output map style
sa = size(cnn.layers{n}.a{1});% size of one output map of the last layer
fvnum = sa(1) * sa(2); % number of features (pixels) per map
for j = 1 : numel(cnn.layers{n}.a)% for every map of the last layer
cnn.layers{n}.d{j} = reshape(cnn.fvd(((j - 1) * fvnum + 1) : j * fvnum, :), sa(1), sa(2), sa(3));
% deltas of the last layer: each block of fvnum rows of fvd (16 rows -> 4*4 in the author's demo network) is reshaped back into map form, so the error has travelled from the fully connected layer back to the last layer
end
for l = (n - 1) : -1 : 1 % propagate the error backwards from the last-but-one layer (4-3-2-1 in the demo network)
if strcmp(cnn.layers{l}.type, 'c')
for j = 1 : numel(cnn.layers{l}.a)
% Delta of a conv layer: sigmoid derivative a .* (1 - a) of this layer's output times
% the delta of the following layer, expanded back to this layer's map size and divided by scale ^ 2.
% NOTE(review): this reads layers{l + 1}.scale, i.e. it assumes the next layer is a sub-sampling layer; expand is a helper not shown here - presumably it replicates every element scale x scale times, confirm.
cnn.layers{l}.d{j} = cnn.layers{l}.a{j} .* (1 - cnn.layers{l}.a{j}) .* (expand(cnn.layers{l + 1}.d{j}, [cnn.layers{l + 1}.scale cnn.layers{l + 1}.scale 1]) / cnn.layers{l + 1}.scale ^ 2);
end
elseif strcmp(cnn.layers{l}.type, 's') % sub-sampling layer
for i = 1 : numel(cnn.layers{l}.a)
z = zeros(size(cnn.layers{l}.a{1})); % accumulator with the size of this layer's maps (12*12 per sample in the demo network)
for j = 1 : numel(cnn.layers{l + 1}.a) % for every map of the following conv layer
% 'full' convolution of the next layer's delta with that layer's kernel flipped in both spatial dimensions, which amounts to a correlation (see the linked derivations)
z = z + convn(cnn.layers{l + 1}.d{j}, flip(flip(cnn.layers{l + 1}.k{i}{j}, 1), 2), 'full');
% the results for all maps of the next layer (12 in the demo network) are summed into the delta of one map of this layer (one of 6 in the demo network)
end
cnn.layers{l}.d{i} = z;
end
end
end
% end of the delta computation
%% calc gradients
for l = 2 : n
if strcmp(cnn.layers{l}.type, 'c') % gradients are computed for conv layers only
for j = 1 : numel(cnn.layers{l}.a)
for i = 1 : numel(cnn.layers{l - 1}.a)
% flipall is a helper not shown here; per the author's note it flips the image stack along every dimension (up-down, left-right, and the sample order, so sample 1 becomes sample 50).
% Convolving the flipped input maps with this layer's delta therefore acts as a correlation; the result is divided by the number of samples (dim 3 of d).
cnn.layers{l}.dk{i}{j} = convn(flipall(cnn.layers{l - 1}.a{i}), cnn.layers{l}.d{j}, 'valid') / size(cnn.layers{l}.d{j}, 3);
% dk{i}{j} is the gradient of kernel k{i}{j}
end
% bias gradient: sum of all deltas of map j divided by the number of samples
cnn.layers{l}.db{j} = sum(cnn.layers{l}.d{j}(:)) / size(cnn.layers{l}.d{j}, 3);
end
end
end
% gradients of the output layer, averaged over the samples (columns of od)
cnn.dffW = cnn.od * (cnn.fv')/size(cnn.od, 2);
cnn.dffb = mean(cnn.od, 2);
function X = rot180(X)
    % ROT180 Rotate X by 180 degrees in its first two dimensions.
    %   X = rot180(X) reverses the row order and the column order of X.
    %   For N-D input every 2-D slice X(:, :, k, ...) is rotated on its own;
    %   the order along the higher dimensions is left unchanged.
    %
    %   Uses flip rather than the deprecated flipdim, matching the
    %   flip(flip(..., 1), 2) call already used in the back-propagation code.
    X = flip(flip(X, 1), 2);
end
cnnapplygrads.m
利用梯度更新权值(卷积核)和偏倚
% cnnapplygrads body: one plain gradient-descent step on every trainable parameter.
%   Reads : opts.alpha (step size), the gradients net.layers{l}.dk / .db, net.dffW, net.dffb
%   Writes: net.layers{l}.k / .b for every convolution layer, net.ffW, net.ffb
stepSize = opts.alpha;

for layerIdx = 2 : numel(net.layers)
    % only convolution layers are updated in this loop
    if ~strcmp(net.layers{layerIdx}.type, 'c')
        continue;
    end
    for outIdx = 1 : numel(net.layers{layerIdx}.a)
        % kernels: one per (input map, output map) pair
        for inIdx = 1 : numel(net.layers{layerIdx - 1}.a)
            kernelGrad = net.layers{layerIdx}.dk{inIdx}{outIdx};
            net.layers{layerIdx}.k{inIdx}{outIdx} = net.layers{layerIdx}.k{inIdx}{outIdx} - stepSize * kernelGrad;
        end
        % bias: one per output map
        biasGrad = net.layers{layerIdx}.db{outIdx};
        net.layers{layerIdx}.b{outIdx} = net.layers{layerIdx}.b{outIdx} - stepSize * biasGrad;
    end
end

% fully connected output layer
net.ffW = net.ffW - stepSize * net.dffW;
net.ffb = net.ffb - stepSize * net.dffb;
主要的代码注释至此结束,有问题可以相互讨论。
参考链接:
http://blog.csdn.net/lansatiankongxxc/article/details/49666073
http://blog.csdn.net/celerychen2009/article/details/8964753
http://www.cnblogs.com/tornadomeet/p/3468450.html
第三个链接中我个人觉得有个问题,个人觉得前向传播时就是卷积,bp时是相关,文章中弄反了