function [res] = vl_myforbackward(net, x, dzdy, res, epoch, count1, varargin)
% vl_myforbackward evaluates a simple SPDNet
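%
% A minimal usage sketch (an assumed call pattern; the surrounding training
% loop is not part of this file):
%   res = vl_myforbackward(net, x, [], [], epoch, count1);        % forward only
%   res = vl_myforbackward(net, x, single(1), [], epoch, count1); % forward + backward
% The third argument is the derivative seed for the loss output; when it is
% empty or omitted, only the forward pass is run.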
opts.res = [];
opts.conserveMemory = false;
opts.sync = false;
opts.disableDropout = false;
opts.freezeDropout = false;
opts.accumulate = false;
opts.cudnn = true;
opts.skipForward = false;
opts.backPropDepth = +inf;
opts.epsilon = 1e-5; % this parameter is used in the ReEig layer
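% Sketch of the eigen-based SPD operations referenced below (assumed to follow
% the standard SPDNet formulation; the actual implementations live in vl_myrec,
% vl_mylog and vl_myexp): with X = U*diag(s)*U' an eigendecomposition of an SPD
% input,
%   ReEig:  U * diag(max(s, opts.epsilon)) * U'   % rectify small eigenvalues
%   LogEig: U * diag(log(s)) * U'                 % matrix logarithm
%   ExpEig: U * diag(exp(s)) * U'                 % matrix exponential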
% dev_ml = cell(1,30);
% dev_re = cell(1,30);
% for p =1:30
% dev_ml{p}= zeros(50,50);
% dev_re{p}= zeros(400,400);
% end
n = numel(net.layers); % count the number of layers
if (nargin <= 2) || isempty(dzdy)
doder = false;
else
doder = true; % this variable controls whether the derivatives are computed
end
if opts.cudnn
cudnn = {'CuDNN'};
else
cudnn = {'NoCuDNN'};
end
gpuMode = isa(x, 'gpuArray');
if nargin <= 3 || isempty(res)
res = struct(...
'x', cell(1,n+1), ...
'dzdx', cell(1,n+1), ... % this gradient is necessary for computing the gradients in the layers below and updating their parameters
'dzdw', cell(1,n+1), ... % this gradient is required for updating W
'aux', cell(1,n+1), ...
'time', num2cell(zeros(1,n+1)), ...
'backwardTime', num2cell(zeros(1,n+1)));
end
if ~opts.skipForward
res(1).x = x ;
end
% -------------------------------------------------------------------------
% Forward pass
% -------------------------------------------------------------------------
% res.SS speeds up training: it is analogous to PyTorch's ctx context, i.e.
% data shared between the forward and backward passes.
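% Caching-pattern sketch (an assumption based on how SS is used below): the
% forward call of an eigen-based layer returns the decomposition it computed,
% roughly
%   [U, S, V] = svd(X{k});   % done inside vl_myrec / vl_mylog / vl_myexp
%   SS{k} = {U, S, V};       % cached in res(i+1).SS
% so that the backward call can reuse these factors instead of re-decomposing X.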
for i = 1:n
if opts.skipForward
break;
end
l = net.layers{i}; % each net layer stores two components: (1) the layer type and (2) its weights
res(i).time = tic; % measure the time spent in each layer
switch l.type
case 'bfc'
res(i+1).x = vl_mybfc(res(i).x, l.weight, i, res); % the output data of each layer is stored in the x field
case 'fc'
res(i+1).x = vl_myfc(res(i).x, l.weight);
case 'rec'
if doder
alt = doder;
else
alt = doder + 1;
end
[res(i+1).x, SS] = vl_myrec(res(i).x, opts.epsilon, alt, []);
res(i+1).SS = SS;
%res(i+1).recov = recov;
case 'add'
% The earlier x is used because it comes from the symmetric (paired) branch:
% the log layers appear in pairs, and this layer takes the weighted average of
% the two log-mapped outputs.
sc = res(i-1).x;
res(i+1).x = vl_myadd(res(i).x, sc);
case 'rec_relu'
res(i+1).x = vl_myrec_relu(res(i).x, opts.epsilon);
case 'marginloss'
res(i+1).obj = 0.0;
res(i+1).x = res(i).x;
case 'reconstructionloss'
res(i+1).obj = vl_myreconstructionloss(res(i).x, res(1).x, epoch);
res(i+1).x = res(7).x;
case 'log'
if doder
alt = doder;
else
alt = doder + 1;
end
% For layers 10 and 16, the log is taken of the corresponding SPD matrix
% produced earlier in the network rather than of the previous layer's output.
if i == 10
sc = res(i-5).x;
elseif i == 16
sc = res(i-13).x;
else
sc = res(i).x;
end
% SS is the SVD decomposition of sc
[res(i+1).x, SS] = vl_mylog(sc, alt, []);
res(i+1).SS = SS;
case 'exp' % alt controls whether the call runs the forward or the backward computation
if doder
alt = doder;
else
alt = doder + 1;
end
[res(i+1).x, SS] = vl_myexp(res(i).x, alt, []);
res(i+1).SS = SS; % shared state between this layer's forward and backward passes,
% analogous to PyTorch's ctx
case 'softmaxloss'
res(i+1).x = vl_mysoftmaxloss(res(i).x, l.class);
case 'custom'
res(i+1) = l.forward(l, res(i), res(i+1));
otherwise
error('Unknown layer type %s', l.type);
end
% optionally forget intermediate results
forget = opts.conserveMemory;
forget = forget & (~doder || strcmp(l.type, 'relu'));
forget = forget & ~(strcmp(l.type, 'loss') || strcmp(l.type, 'softmaxloss'));
forget = forget & (~isfield(l, 'rememberOutput') || ~l.rememberOutput);
if forget
res(i).x = [];
end
if gpuMode & opts.sync
% This should make things slower, but on MATLAB 2014a it is necessary
% for any decent performance.
wait(gpuDevice);
end
res(i).time = toc(res(i).time);
end
% -------------------------------------------------------------------------
% Backward pass
% -------------------------------------------------------------------------
if doder
res(n+1).dzdx = dzdy; % the first right-hand factor of eq. 6 in SPDNet; here its value is 1
for i = n:-1:max(1, n-opts.backPropDepth+1) % compute the derivatives in reverse order
l = net.layers{i};
res(i).backwardTime = tic ;
switch l.type
case 'bfc'
% dzdw is returned because the bfc weights must be updated; every layer with
% weights (e.g. fc) returns dzdw
[res(i).dzdx, res(i).dzdw] = ... % all the data in a given batch share the same weight
    vl_mybfc(res(i).x, l.weight, i, res, res(i+1).dzdx);
case 'fc'
[res(i).dzdx, res(i).dzdw] = ...
    vl_myfc(res(i).x, l.weight, res(i+1).dzdx);
case 'rec'
temp = res(i).x;
% The rec layer at depth 4 feeds the log layer at depth 10, so it receives the
% dzdx of layer 10; likewise the rec layer at depth 2 receives the dzdx of layer 16.
if i == 4
dev_sc = res(i+6).dzdx;
elseif i == 2
dev_sc = res(i+14).dzdx;
else
ZM = zeros(size(temp{1},1), size(temp{1},2));
for num = 1:length(temp)
dev_sc{num} = ZM;
end
end
alt = doder;
alt = alt - 1;
[res(i).dzdx, SS] = vl_myrec(res(i).x, opts.epsilon, alt, res(i+1).SS, res(i+1).dzdx, dev_sc);
%[res(i).dzdx, recov] = vl_myrec(res(i).x, opts.epsilon, res(i+1).dzdx);
case 'add'
% sc is the output of the previous log layer and res(i).x is the output of this
% log layer; the forward pass adds the two log results as a weighted average.
% (I verified that sc and res(i).x have the same size, so this reading holds.)
% Passing these two inputs has no real effect on the backward computation; this
% call is where the backward step happens, and the add backward formula is
% explained inside vl_myadd.
sc = res(i-1).x;
res(i).dzdx = vl_myadd(res(i).x, sc, res(i+1).dzdx);
case 'rec_relu'
res(i).dzdx = vl_myrec_relu(res(i).x, opts.epsilon, res(i+1).dzdx);
case 'marginloss'
dev_ml_trans = cell(length(res(i).x), 1);
dzdx_recon = res(i+1).dzdx;
dzdx_log = res(i+15).dzdx;
for ii = 1:length(res(i).x)
dev_ml_trans{ii} = dzdx_recon{ii} + dzdx_log{ii};
end
res(i).dzdx = dev_ml_trans;
case 'reconstructionloss'
% Although res(i+1).dzdx carries the gradient propagated back from the later
% log/fc/softmaxloss layers, it is not actually used: reconstructionloss is a
% terminal loss and should not consume the upstream dzdx. Inside the function
% the passed-in dzdy is re-initialised to zero before the computed derivative
% is added.
res(i).dzdx = vl_myreconstructionloss(res(i).x, res(1).x, epoch, res(i+1).dzdx); % dev_re
case 'exp'
alt = doder;
alt = alt - 1;
[res(i).dzdx, SS] = vl_myexp(res(i).x, alt, res(i+1).SS, res(i+1).dzdx);
case 'log'
alt = doder;
alt = alt - 1;
% Layers 9 and 15 are the first and third of the five log layers, i.e. the
% first member of each of the first two log pairs. The difference between i+1
% and i+2 is that i+1 is the derivative from the next layer while i+2 is the
% derivative from the layer after that, the add layer; since add simply sums
% the two log-mapped outputs, taking the add layer's derivative directly is
% also correct for layers 9 and 15 (the data flows backwards here).
if i == 9 || i == 15
dev_sc = res(i+2).dzdx;
else
dev_sc = res(i+1).dzdx;
end
if i == 10
sc = res(i-5).x;
elseif i == 16
% res(3).x is the output of the third bfc layer before the log map; as in
% SPDNet, the pre-log x is used here, for the same reason the paper uses the
% original x.
sc = res(i-13).x;
else
sc = res(i).x;
end
% The following also covers the final log layer (the log in log-fc-softmaxloss):
% sc is this layer's input, res(i+1).SS is the shared data cached by this
% layer's forward pass, and dev_sc is the dzdx from the next layer. The line
% itself is ordinary, but the resulting dzdx is ultimately consumed by the
% marginloss backward pass, because in the forward pass vl_mylog follows the
% marginloss layer.
[res(i).dzdx, SS] = vl_mylog(sc, alt, res(i+1).SS, dev_sc);
case 'softmaxloss'
res(i).dzdx = vl_mysoftmaxloss(res(i).x, l.class, res(i+1).dzdx);
case 'custom'
res(i) = l.backward(l, res(i), res(i+1));
end
if opts.conserveMemory
res(i+1).dzdx =[];
end
if gpuMode & opts.sync
wait(gpuDevice);
end
res(i).backwardTime = toc(res(i).backwardTime);
end
end
function Y = vl_myreconstructionloss(X, X_ori, epoch, dzdy)
% this function implements the decoder term (the reconstruction loss)
% Date:
% Author:
% Copyright:
% Note: to make the code run correctly, I adjusted lines 96-107 of steifelfactory.m
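%
% Call pattern (as used in the forward/backward passes above):
%   obj  = vl_myreconstructionloss(X, X_ori, epoch)        % forward: scalar objective
%   dzdx = vl_myreconstructionloss(X, X_ori, epoch, dzdy)  % backward: cell array of derivative matrices, one per sample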
for m = 1:length(X)
dzdy{m} = single(zeros(size(X{1},1), size(X{1},1))); % 400 x 400
end
dzdy_l3 = single(1); % dzdy_l3 is 1
gamma = 0.01; % needs to be adjusted to 1e-4 for EWD
count = epoch;
%gamma = 0.8^floor(epoch / 20) * gamma;
% 1 x 30
dist_sum = zeros(1, length(X)); % save each pair's distance
% Y is a 30 x 1 cell. For the forward pass the objective is a single scalar
% that overwrites this initialisation below, so there this line has no effect;
% it exists for the backward pass.
Y = cell(length(X), 1); % save obj or dev
% 1 x 30
dev_term = cell(1, length(X)); % save each pair's derivative
for i = 1 : length(X)
temp = X{i} - X_ori{i}; % the last layer's matrix minus the first layer's matrix (51 x 51)
dev_term{i} = 2 * temp; % d((x1-x2)^2)/dx1 = 2*(x1-x2)
dist_sum(i) = norm(temp,'fro') * norm(temp,'fro'); % sum of the squares of all elements (squared Frobenius norm)
end
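% In formula form (read directly off the code above): with N = length(X) pairs,
% the forward objective is
%   obj = gamma * (1/N) * sum_i ||X{i} - X_ori{i}||_F^2
% and the backward pass returns, per sample,
%   Y{i} = gamma * 2*(X{i} - X_ori{i}) + dzdy{i}
% where dzdy{i} has just been reset to zero above.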
if nargin < 4
% the 30 squared distances above are summed, averaged, and scaled by gamma
Y = gamma * (sum(dist_sum) / length(X)); % the obj of this loss function
else
for j = 1:length(X)
% bsxfun broadcasts its arguments: the inner call bsxfun(@times, ones(size(X{1},1)), dzdy_l3)
% multiplies a 51 x 51 all-ones matrix by the scalar dzdy_l3 = 1 and returns an
% all-ones matrix; the outer call then multiplies dev_term{j} (51 x 51)
% element-wise by that all-ones matrix, which leaves it unchanged. The result is
% the derivative matrix, equivalent to dev_term{j} * dzdy_l3 for scalar dzdy_l3.
dev_l3 = bsxfun(@times, dev_term{j}, bsxfun(@times, ones(size(X{1},1)), dzdy_l3));
Y{j} = gamma * dev_l3 + dzdy{j}; % the sum of the reconstruction term and the softmax term
end
end
end