nnCostFunction.m
% Regularized cost J and gradients (Theta1_grad, Theta2_grad) of a network
% with one hidden layer and sigmoid activations, computed over all m
% examples at once. Reads m, X, y, Theta1, Theta2, num_labels and lambda
% from the enclosing scope.
% Size notes in the comments (5000 examples, 401 = 400 + bias inputs,
% 25 hidden units, 10 labels) are the ones recorded by the original author.

% ---- Forward propagation ----
a1 = [ones(m, 1) X];      % 5000 x 401, bias column of ones prepended
z2 = a1 * Theta1';        % 5000 x 25   (Theta1 is 25 x 401)
a2 = sigmoid(z2);         % 5000 x 25
a2 = [ones(m, 1) a2];     % 5000 x 26, bias column of ones prepended
z3 = a2 * Theta2';        % 5000 x 10   (Theta2 is 10 x 26)
a3 = sigmoid(z3);         % 5000 x 10
h = a3;                   % 5000 x 10, network output for every example

% ---- One-hot encode the labels ----
u = eye(num_labels);      % row k of the identity is the encoding of label k
% Indexing u by the label vector picks, for each example, the identity row
% that matches its label. y is overwritten: from here on it is 5000 x 10.
y = u(y,:);

% ---- Cost ----
% Element-wise products (.*) are required here. The inner sum collapses each
% column (over the examples); the outer sum adds the per-column totals.
J = 1/m*(sum(sum(-y .* log(h) - (1 - y) .* log(1 - h))));
% The first column of each Theta (bias weights) is excluded from the penalty.
regularization = lambda/(2 * m) * (sum(sum(Theta1(:,2:end) .^ 2)) + sum(sum(Theta2(:,2:end) .^ 2)));
% Written out in full: the `+=` operator is accepted by Octave only and is a
% syntax error in MATLAB.
J = J + regularization;

% ---- Backpropagation ----
delta3 = a3 - y;                              % 5000 x 10, output-layer error
delta2 = delta3 * Theta2;                     % 5000 x 26
delta2 = delta2(:,2:end);                     % 5000 x 25, drop the bias column
delta2 = delta2 .* sigmoidGradient(z2);       % 5000 x 25
% Delta accumulates the error: the entry for (i, j) is the error attributed
% to the j-th parameter feeding the i-th activation unit of the next layer.
% The matrix product already sums over all examples, so no pre-zeroed
% accumulator is needed.
Delta1 = delta2' * a1;                        % 25 x 401 = (5000 x 25)' * (5000 x 401)
Delta2 = delta3' * a2;                        % 10 x 26  = (5000 x 10)' * (5000 x 26)

% ---- Gradients ----
% Regularize every column first, then restore the unregularized value in the
% first (bias) column.
Theta1_grad = 1 / m * Delta1 + lambda / m * Theta1;
Theta2_grad = 1 / m * Delta2 + lambda / m * Theta2;
Theta1_grad(:,1) = 1 / m * Delta1(:,1);
Theta2_grad(:,1) = 1 / m * Delta2(:,1);
sigmoidGradient.m
% Element-wise value s .* (1 - s), where s = sigmoid(z).
% NOTE(review): this is the derivative of the logistic function provided
% sigmoid (defined elsewhere) computes 1 ./ (1 + exp(-z)) - confirm.
sig_z = sigmoid(z);
g = sig_z .* (1 - sig_z);
randInitializeWeights.m
% Fill W with uniform random values in [-epsilon_init, epsilon_init].
% W has L_out rows and L_in + 1 columns (one extra column beyond L_in).
epsilon_init = 0.12;
weight_rows = L_out;
weight_cols = 1 + L_in;
unit_draws = rand(weight_rows, weight_cols);   % uniform samples in (0, 1)
% Stretch to width 2 * epsilon_init, then shift so the range is centred on 0.
W = unit_draws * (2 * epsilon_init) - epsilon_init;