我是链接
这节主要讲了forward propagation和back propagation algorithm。
- 隐藏层最后一层到输出层是用的是softmax函数,之前层的活化函数是sigmoid函数;
- 练习中cost function用的也是和softmax regression基本相同的cross entropy;最后加上了weight decay项(注意没有把b包括进来,PRML 1.1节说了一句并附了篇论文,没看0.0),目的是控制weight的大小,防止过拟合。对这一点不是特别理解,为什么weight越大就对应着模型越复杂呢?有种解释是weight越小"曲面"越平滑,weight越大对应的"曲面"起伏越大,对输入变化就越敏感。这里有位前辈自己的理解,行文很欢乐^_^
代码如下,参考这里
function [ cost, grad, pred_prob] = supervised_dnn_cost( theta, ei, data, labels, pred_only)
% SUPERVISED_DNN_COST Cost / gradient for a fully-connected network with
% sigmoid hidden layers and a softmax output layer.
%
% Inputs:
%   theta     - all weights and biases flattened into one vector
%   ei        - network description; this function reads ei.layer_sizes
%               and ei.lambda (weight-decay coefficient)
%   data      - d x m matrix, one training example per column
%   labels    - m x 1 vector of 1-based class indices
%   pred_only - optional flag; if true, only pred_prob is computed
%
% Outputs:
%   cost      - cross-entropy plus L2 weight-decay penalty (biases are
%               not penalized)
%   grad      - gradient of cost w.r.t. theta, flattened via stack2params
%   pred_prob - c x m matrix of softmax class probabilities

%% default values
po = false;
if exist('pred_only','var')
  po = pred_only;
end;

%% reshape the flat parameter vector into per-layer weights/biases
stack = params2stack(theta, ei);
numHidden = numel(ei.layer_sizes) - 1;
hAct = cell(numHidden+1, 1);      % activations of each layer
gradStack = cell(numHidden+1, 1); % per-layer gradients

%% forward propagation
% Hidden layers use the logistic sigmoid, computed inline instead of
% sigmf(z,[1,0]) so the Fuzzy Logic Toolbox is not required.
for layer = 1:numHidden
  if layer == 1
    in = data;                    % first layer reads the raw input
  else
    in = hAct{layer-1};
  end
  z = bsxfun(@plus, stack{layer}.W * in, stack{layer}.b);
  hAct{layer} = 1 ./ (1 + exp(-z));
end

% Output layer: softmax. Subtracting the per-column max before exp()
% prevents overflow and leaves the resulting probabilities unchanged.
h = bsxfun(@plus, stack{numHidden+1}.W * hAct{numHidden}, stack{numHidden+1}.b);
h = bsxfun(@minus, h, max(h, [], 1));
e = exp(h);
pred_prob = bsxfun(@rdivide, e, sum(e, 1));
hAct{numHidden+1} = pred_prob;

%% return here if only predictions desired.
if po
  cost = -1; ceCost = -1; wCost = -1;
  grad = [];
  return;
end;

%% cross-entropy cost: -sum over examples of log p(true class)
% I holds the linear index of the true-class probability in each column.
I = sub2ind(size(pred_prob), labels', 1:size(pred_prob,2));
ceCost = -sum(log(pred_prob(I)));

%% backpropagation
% delta is the error signal of the current layer; for softmax with
% cross-entropy the output-layer error is (pred_prob - one_hot(labels)).
flag = zeros(size(pred_prob));
flag(I) = 1;
delta = pred_prob - flag;
for layer = numHidden+1:-1:1
  gradStack{layer}.b = sum(delta, 2);
  if layer == 1
    gradStack{1}.W = delta * data';
    break;
  else
    gradStack{layer}.W = delta * hAct{layer-1}';
  end
  % propagate through the sigmoid: s'(z) = s(z) .* (1 - s(z))
  delta = (stack{layer}.W)' * delta .* hAct{layer-1} .* (1 - hAct{layer-1});
end

%% weight-decay penalty and gradient (biases are excluded on purpose)
wCost = 0;
for layer = 1:numHidden+1
  wCost = wCost + 0.5 * ei.lambda * sum(stack{layer}.W(:).^2);
end
cost = ceCost + wCost;
for layer = 1:numHidden+1
  gradStack{layer}.W = gradStack{layer}.W + ei.lambda * stack{layer}.W;
end

%% reshape gradients into a single vector
[grad] = stack2params(gradStack);
end