Solving the King-Pawn Endgame Problem with an Artificial Neural Network
Main script
clear all;
fid = fopen('krkopt.DATA');
c = fread(fid, 3);
vec = zeros(6,1);
xapp = [];
yapp = [];
flag = 0; %record counter; must be initialized before "flag = flag+1" below
while ~feof(fid)
string = [];
c = fread(fid,1);
flag = flag+1;
while c~=13
string = [string, c];
c=fread(fid,1);
end;
fread(fid,1);
if length(string)>10
vec(1) = string(1) - 96;
vec(2) = string(3) - 48;
vec(3) = string(5) - 96;
vec(4) = string(7) - 48;
vec(5) = string(9) - 96;
vec(6) = string(11) - 48;
xapp = [xapp,vec];
if string(13) == 100
yapp = [yapp,[1,0]'];
else
yapp = [yapp,[0,1]'];
end;
end;
end;
fclose(fid);
[N,M] = size(xapp);
p = randperm(M); %shuffle the sample order
ratioTraining = 0.15;
ratioValidation = 0.05;
ratioTesting = 0.8;
xTraining = [];
yTraining = [];
for i=1:floor(ratioTraining*M)
xTraining = [xTraining,xapp(:,p(i))];
yTraining = [yTraining,yapp(:,p(i))];
end;
xTraining = xTraining';
yTraining = yTraining';
[U,V] = size(xTraining);
avgX = mean(xTraining);
sigma = std(xTraining);
xTraining = (xTraining - repmat(avgX,U,1))./repmat(sigma,U,1);
xValidation = [];
yValidation = [];
for i=floor(ratioTraining*M)+1:floor((ratioTraining+ratioValidation)*M)
xValidation = [xValidation,xapp(:,p(i))];
yValidation = [yValidation,yapp(:,p(i))];
end;
xValidation= xValidation';
yValidation = yValidation';
[U,V] = size(xValidation);
xValidation = (xValidation - repmat(avgX,U,1))./repmat(sigma,U,1);
xTesting = [];
yTesting = [];
for i=floor((ratioTraining+ratioValidation)*M)+1:M
xTesting = [xTesting,xapp(:,p(i))];
yTesting = [yTesting,yapp(:,p(i))];
end;
xTesting = xTesting';
yTesting = yTesting';
[U,V] = size(xTesting);
xTesting = (xTesting - repmat(avgX,U,1))./repmat(sigma,U,1);
%create a neural net
clear nn;
nn = nn_create([6,10,10,10,10,10,10,10,10,10,10,2],'active function','relu','learning rate',0.005, 'batch normalization',1,'optimization method','Adam', 'objective function', 'Cross Entropy');
%train
option.batch_size = 100;
option.iteration = 1;
iteration = 0;
maxAccuracy = 0;
totalAccuracy = [];
maxIteration = 10000;
while(iteration<=maxIteration)
iteration = iteration +1;
nn = nn_train(nn,option,xTraining,yTraining);
totalCost(iteration) = sum(nn.cost)/length(nn.cost);
[wrongs,accuracy] = nn_test(nn,xValidation,yValidation);
totalAccuracy = [totalAccuracy,accuracy];
if accuracy>maxAccuracy
maxAccuracy = accuracy;
storedNN = nn;
end;
cost = totalCost(iteration);
accuracy
cost
end;
[wrongs,accuracy] = nn_test(storedNN,xTesting,yTesting);
Reading in the data
clear all;
fid = fopen('krkopt.DATA');
c = fread(fid, 3);
vec = zeros(6,1);
xapp = [];
yapp = [];
flag = 0; %record counter; must be initialized before "flag = flag+1" below
while ~feof(fid)
string = [];
c = fread(fid,1);
flag = flag+1;
while c~=13
string = [string, c];
c=fread(fid,1);
end;
fread(fid,1);
if length(string)>10
vec(1) = string(1) - 96;
vec(2) = string(3) - 48;
vec(3) = string(5) - 96;
vec(4) = string(7) - 48;
vec(5) = string(9) - 96;
vec(6) = string(11) - 48;
xapp = [xapp,vec];
if string(13) == 100
yapp = [yapp,[1,0]'];
else
yapp = [yapp,[0,1]'];
end;
end;
end;
fclose(fid);
Note
if string(13) == 100
yapp = [yapp,[1,0]'];
else
yapp = [yapp,[0,1]'];
end;
As discussed earlier, for an N-class problem the label Y is stored as a one-hot vector; here 'draw' maps to [1,0]' and every other outcome maps to [0,1]'.
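As a minimal sketch (with hypothetical variable names), the same idea for a general N-class problem: each integer label in 1..N becomes a column holding a single 1 in the row of its class.
labels = [1 2 2 1 3]; %hypothetical integer class labels in 1..N
N = 3; %number of classes
Y = zeros(N, length(labels)); %one column per sample
for i = 1:length(labels)
Y(labels(i), i) = 1; %one-hot: a single 1 in the row of the true class
end;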
Splitting the dataset
[N,M] = size(xapp);
p = randperm(M); %shuffle the sample order
ratioTraining = 0.15;
ratioValidation = 0.05;
ratioTesting = 0.8;
xTraining = [];
yTraining = [];
for i=1:floor(ratioTraining*M)
xTraining = [xTraining,xapp(:,p(i))];
yTraining = [yTraining,yapp(:,p(i))];
end;
xTraining = xTraining';
yTraining = yTraining';
ratioTraining = 0.15;
ratioValidation = 0.05;
ratioTesting = 0.8;
We split the whole dataset into a training set, a validation set, and a test set.
After each round of parameter updates we evaluate on the validation set to judge whether the update helped and to decide when to stop.
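For reference, a hedged sketch of the same split done with vectorized indexing instead of the loops shown above (reusing p, xapp, yapp, M, and the ratios already defined):
nTrain = floor(ratioTraining*M); %number of training samples
nVal = floor((ratioTraining+ratioValidation)*M) - nTrain; %number of validation samples
xTraining = xapp(:, p(1:nTrain))'; %rows = samples, as in the loop version
yTraining = yapp(:, p(1:nTrain))';
xValidation = xapp(:, p(nTrain+1:nTrain+nVal))';
yValidation = yapp(:, p(nTrain+1:nTrain+nVal))';
xTesting = xapp(:, p(nTrain+nVal+1:M))';
yTesting = yapp(:, p(nTrain+nVal+1:M))';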
Normalizing the training, validation, and test data
[U,V] = size(xTraining);
avgX = mean(xTraining);
sigma = std(xTraining);
xTraining = (xTraining - repmat(avgX,U,1))./repmat(sigma,U,1);
xValidation = [];
yValidation = [];
for i=floor(ratioTraining*M)+1:floor((ratioTraining+ratioValidation)*M)
xValidation = [xValidation,xapp(:,p(i))];
yValidation = [yValidation,yapp(:,p(i))];
end;
xValidation= xValidation';
yValidation = yValidation';
[U,V] = size(xValidation);
xValidation = (xValidation - repmat(avgX,U,1))./repmat(sigma,U,1);
xTesting = [];
yTesting = [];
for i=floor((ratioTraining+ratioValidation)*M)+1:M
xTesting = [xTesting,xapp(:,p(i))];
yTesting = [yTesting,yapp(:,p(i))];
end;
xTesting = xTesting';
yTesting = yTesting';
[U,V] = size(xTesting);
xTesting = (xTesting - repmat(avgX,U,1))./repmat(sigma,U,1);
Creating the neural network
clear nn;
nn = nn_create([6,10,10,10,10,10,10,10,10,10,10,2],'active function','relu','learning rate',0.005, 'batch normalization',1,'optimization method','Adam', 'objective function', 'Cross Entropy');
The nn_create function builds the neural network.
Its first argument gives the number of neurons in each layer: the King-Pawn problem has a 6-dimensional input, so the vector starts with 6; the output has 2 dimensions, so it ends with 2; in between are 10 hidden layers with 10 neurons each.
The remaining name-value pairs set the activation function (ReLU here) and the learning rate (0.005).
Training
option.batch_size = 100;
option.iteration = 1;
iteration = 0;
maxAccuracy = 0;
totalAccuracy = [];
maxIteration = 10000;
while(iteration<=maxIteration)
iteration = iteration +1;
nn = nn_train(nn,option,xTraining,yTraining);
totalCost(iteration) = sum(nn.cost)/length(nn.cost);
[wrongs,accuracy] = nn_test(nn,xValidation,yValidation);
totalAccuracy = [totalAccuracy,accuracy];
if accuracy>maxAccuracy
maxAccuracy = accuracy;
storedNN = nn;
end;
cost = totalCost(iteration);
accuracy
cost
end;
[wrongs,accuracy] = nn_test(storedNN,xTesting,yTesting);
'objective function' selects the loss function; here we use cross entropy.
option.batch_size = 100;
option.iteration = 1;
This means every batch contains 100 training samples.
maxIteration = 10000;
The maximum number of training rounds is set to 10000.
totalCost(iteration) = sum(nn.cost)/length(nn.cost);
The average value of the loss over this round's batches.
[wrongs,accuracy] = nn_test(nn,xValidation,yValidation);
Evaluate the recognition accuracy on the validation set after each round.
Finally, the stored best model is evaluated on the test set to obtain the final recognition accuracy.
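nn_test itself is not listed in these notes; the following is only a rough sketch of what such a routine typically computes, assuming nn_predict returns one row of output-layer activations per sample (an assumption, not taken from the toolbox code):
function [wrongs, accuracy] = nn_test_sketch(nn, x, y) %illustrative re-implementation only
out = nn_predict(nn, x); %assumed: one output row per sample
[~, predicted] = max(out, [], 2); %predicted class = index of the largest output
[~, truth] = max(y, [], 2); %true class = position of the 1 in the one-hot label
wrongs = sum(predicted ~= truth); %number of misclassified samples
accuracy = 1 - wrongs/size(y,1); %fraction classified correctly
end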
Forward computation: nn_forward
function nn = nn_forward(nn,batch_x,batch_y)
s = size(nn.cost,2) + 1;%index of the next entry of the cost row vector
%together with nn.cost(s) below, this appends a new cost value on every call
batch_x = batch_x';
batch_y = batch_y';
m = size(batch_x,2);%number of columns, i.e. the batch size
nn.a{1} = batch_x;
cost2 = 0;%cost2 accumulates the regularization (weight-decay) term of the cost
for k = 2 : nn.depth
y = nn.W{k-1} * nn.a{k-1} + repmat(nn.b{k-1},1,m);%repmat(A,m,n) tiles m-by-n copies of A
%the batch of m samples is processed as one matrix; the bias is the same for every sample, so b is replicated m times
%this y is the pre-activation z of the derivation
if nn.batch_normalization
nn.E{k-1} = nn.E{k-1}*nn.vecNum + sum(y,2);
nn.S{k-1} = nn.S{k-1}.^2*(nn.vecNum-1) + (m-1)*std(y,0,2).^2;
nn.vecNum = nn.vecNum + m;
nn.E{k-1} = nn.E{k-1}/nn.vecNum;
nn.S{k-1} = sqrt(nn.S{k-1}/(nn.vecNum-1));
y = (y - repmat(nn.E{k-1},1,m))./repmat(nn.S{k-1}+0.0001*ones(size(nn.S{k-1})),1,m);
y = nn.Gamma{k-1}*y+nn.Beta{k-1};
end;
if k == nn.depth%choose the output-layer activation
switch nn.output_function
case 'sigmoid'
nn.a{k} = sigmoid(y);
case 'tanh'
nn.a{k} = tanh(y);
case 'relu'
nn.a{k} = max(y,0);
case 'softmax'
nn.a{k} = softmax(y);
end
else
switch nn.active_function%choose the hidden-layer activation
case 'sigmoid'
nn.a{k} = sigmoid(y);
case 'tanh'
nn.a{k} = tanh(y);
case 'relu'
nn.a{k} = max(y,0);
end
end
cost2 = cost2 + sum(sum(nn.W{k-1}.^2));%accumulate the regularization term
end
if nn.encoder == 1%sparse-autoencoder branch; encoder is 0 here, so this can be skipped
roj = sum(nn.a{2},2)/m;
nn.cost(s) = 0.5 * sum(sum((nn.a{k} - batch_y).^2))/m + 0.5 * nn.weight_decay * cost2 + 3 * sum(nn.sparsity * log(nn.sparsity ./ roj) + ...
(1-nn.sparsity) * log((1-nn.sparsity) ./ (1-roj)));
else
if strcmp(nn.objective_function,'MSE')
nn.cost(s) = 0.5 / m * sum(sum((nn.a{k} - batch_y).^2)) + 0.5 * nn.weight_decay * cost2;
elseif strcmp(nn.objective_function,'Cross Entropy')
nn.cost(s) = -0.5*sum(sum(batch_y.*log(nn.a{k})))/m + 0.5 * nn.weight_decay * cost2;
end
end
y = nn.W{k-1} * nn.a{k-1} + repmat(nn.b{k-1},1,m);%repmat(A,m,n) tiles m-by-n copies of A
switch nn.active_function%choose the hidden-layer activation
case 'sigmoid'
nn.a{k} = sigmoid(y);
case 'tanh'
nn.a{k} = tanh(y);
case 'relu'
nn.a{k} = max(y,0);
end
The output of layer k-1 is multiplied by the weights, the bias is added, and the activation function is applied to obtain the output of layer k.
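A self-contained illustration of this single step with made-up sizes (not part of the toolbox):
a_prev = rand(6, 100); %outputs of layer k-1: 6 neurons, batch of 100 samples
W = randn(10, 6); b = randn(10, 1); %weights and biases of layer k (10 neurons)
z = W*a_prev + repmat(b, 1, size(a_prev,2)); %pre-activation, the y in the code above
a = max(z, 0); %ReLU activation, as in the 'relu' branch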
Backpropagation: nn_backpropagation
function nn = nn_backpropagation(nn,batch_y)
batch_y = batch_y';
m = size(nn.a{1},2);%m is the number of columns, i.e. the batch size
nn.theta{1} = 0;%nn.theta is the delta (δ) of the derivation; nn.theta{nn.depth} is δ of the output layer
%as before, the cell array { } stores δ for every layer, which is the whole point of backpropagation
switch nn.output_function
case 'sigmoid'
nn.theta{nn.depth} = -(batch_y-nn.a{nn.depth}) .* nn.a{nn.depth} .* (1 - nn.a{nn.depth});
case 'tanh'
nn.theta{nn.depth} = -(batch_y-nn.a{nn.depth}) .* (1 - nn.a{nn.depth}.^2);
case 'softmax'
nn.theta{nn.depth} = nn.a{nn.depth} - batch_y;
end
if nn.batch_normalization
x = nn.W{nn.depth-1} * nn.a{nn.depth-1} + repmat(nn.b{nn.depth-1},1,m);
x = (x - repmat(nn.E{nn.depth-1},1,m))./repmat(nn.S{nn.depth-1}+0.0001*ones(size(nn.S{nn.depth-1})),1,m);
temp = nn.theta{nn.depth}.*x;
nn.Gamma_grad{nn.depth-1} = sum(mean(temp,2));
nn.Beta_grad{nn.depth-1} = sum(mean(nn.theta{nn.depth},2));
nn.theta{nn.depth} = nn.Gamma{nn.depth-1}*nn.theta{nn.depth}./repmat((nn.S{nn.depth-1}+0.0001),1,m);
end;
%the second term comes from differentiating the regularization term: the coefficient nn.weight_decay times W
nn.W_grad{nn.depth-1} = nn.theta{nn.depth}*nn.a{nn.depth-1}'/m + nn.weight_decay*nn.W{nn.depth-1};
nn.b_grad{nn.depth-1} = sum(nn.theta{nn.depth},2)/m;
%sum without a dimension argument sums each column into a row vector; sum(...,2) sums each row into a column vector
%both expressions average the gradients over the m samples
switch nn.active_function
case 'sigmoid'
if nn.encoder == 0;
for ll = 2 : nn.depth - 1
k = nn.depth - ll + 1;%the essence of backpropagation: recurse from the last layer backwards to compute δ
%δ is written in matrix form (whole batch at once) rather than component-wise, so the update is a plain matrix product
nn.theta{k} = ((nn.W{k}'*nn.theta{k+1})) .* nn.a{k} .* (1 - nn.a{k});
if nn.batch_normalization%same batch-normalization correction as for the output layer
x = nn.W{k-1} * nn.a{k-1} + repmat(nn.b{k-1},1,m);
x = (x - repmat(nn.E{k-1},1,m))./repmat(nn.S{k-1}+0.0001*ones(size(nn.S{k-1})),1,m);
temp = nn.theta{k}.*x;
nn.Gamma_grad{k-1} = sum(mean(temp,2));
nn.Beta_grad{k-1} = sum(mean(nn.theta{k},2));
nn.theta{k} = nn.Gamma{k-1}*nn.theta{k}./repmat((nn.S{k-1}+0.0001),1,m);
end;
nn.W_grad{k-1} = nn.theta{k}*nn.a{k-1}'/m + nn.weight_decay*nn.W{k-1};
nn.b_grad{k-1} = sum(nn.theta{k},2)/m;%explained above
end
else
roj = sum(nn.a{2},2)/m;
temp = (-nn.sparsity./roj+(1-nn.sparsity)./(1-roj));
nn.theta{2} = ((nn.W{2}'*nn.theta{3}) + nn.beta*repmat(temp,1,m)) .* nn.a{2} .* (1 - nn.a{2});
nn.W_grad{1} = nn.theta{2}*nn.a{1}'/m + nn.weight_decay*nn.W{1};
nn.b_grad{1} = sum(nn.theta{2},2)/m;
end
case 'tanh'
for ll = 2 : nn.depth - 1
if nn.encoder == 0;
k = nn.depth - ll + 1;
nn.theta{k} = ((nn.W{k}'*nn.theta{k+1})) .* (1-nn.a{k}.^2);
if nn.batch_normalization
x = nn.W{k-1} * nn.a{k-1} + repmat(nn.b{k-1},1,m);
x = (x - repmat(nn.E{k-1},1,m))./repmat(nn.S{k-1}+0.0001*ones(size(nn.S{k-1})),1,m);
temp = nn.theta{k}.*x;
nn.Gamma_grad{k-1} = sum(mean(temp,2));
nn.Beta_grad{k-1} = sum(mean(nn.theta{k},2));
nn.theta{k} = nn.Gamma{k-1}*nn.theta{k}./repmat((nn.S{k-1}+0.0001),1,m);
end;
nn.W_grad{k-1} = nn.theta{k}*nn.a{k-1}'/m + nn.weight_decay*nn.W{k-1};
nn.b_grad{k-1} = sum(nn.theta{k},2)/m;
else
roj = sum(nn.a{2},2)/m;
temp = (-nn.sparsity./roj+(1-nn.sparsity)./(1-roj));
nn.theta{2} = ((nn.W{2}'*nn.theta{3}) + nn.beta*repmat(temp,1,m)) .* (1-nn.a{2}.^2);
nn.W_grad{1} = nn.theta{2}*nn.a{1}'/m + nn.weight_decay*nn.W{1};
nn.b_grad{1} = sum(nn.theta{2},2)/m;
end
end
case 'relu'
if nn.encoder == 0;
for ll = 2 : nn.depth - 1
k = nn.depth - ll + 1;
nn.theta{k} = ((nn.W{k}'*nn.theta{k+1})).*(nn.a{k}>0);
if nn.batch_normalization
x = nn.W{k-1} * nn.a{k-1} + repmat(nn.b{k-1},1,m);
x = (x - repmat(nn.E{k-1},1,m))./repmat(nn.S{k-1}+0.0001*ones(size(nn.S{k-1})),1,m);
temp = nn.theta{k}.*x;
nn.Gamma_grad{k-1} = sum(mean(temp,2));
nn.Beta_grad{k-1} = sum(mean(nn.theta{k},2));
nn.theta{k} = nn.Gamma{k-1}*nn.theta{k}./repmat((nn.S{k-1}+0.0001),1,m);
end;
nn.W_grad{k-1} = nn.theta{k}*nn.a{k-1}'/m + nn.weight_decay*nn.W{k-1};
nn.b_grad{k-1} = sum(nn.theta{k},2)/m;
end
else
roj = sum(nn.a{2},2)/m;
temp = (-nn.sparsity./roj+(1-nn.sparsity)./(1-roj));
M = max(nn.a{2},0);
M = M./max(M,0.001);
nn.theta{2} = ((nn.W{2}'*nn.theta{3}) + nn.beta*repmat(temp,1,m)) .* M;
nn.W_grad{1} = nn.theta{2}*nn.a{1}'/m + nn.weight_decay*nn.W{1};
nn.b_grad{1} = sum(nn.theta{2},2)/m;
end
end
end
switch nn.output_function
case 'sigmoid'
nn.theta{nn.depth} = -(batch_y-nn.a{nn.depth}) .* nn.a{nn.depth} .* (1 - nn.a{nn.depth});
case 'tanh'
nn.theta{nn.depth} = -(batch_y-nn.a{nn.depth}) .* (1 - nn.a{nn.depth}.^2);
case 'softmax'
nn.theta{nn.depth} = nn.a{nn.depth} - batch_y;
end
if nn.batch_normalization
x = nn.W{nn.depth-1} * nn.a{nn.depth-1} + repmat(nn.b{nn.depth-1},1,m);
x = (x - repmat(nn.E{nn.depth-1},1,m))./repmat(nn.S{nn.depth-1}+0.0001*ones(size(nn.S{nn.depth-1})),1,m);
temp = nn.theta{nn.depth}.*x;
nn.Gamma_grad{nn.depth-1} = sum(mean(temp,2));
nn.Beta_grad{nn.depth-1} = sum(mean(nn.theta{nn.depth},2));
nn.theta{nn.depth} = nn.Gamma{nn.depth-1}*nn.theta{nn.depth}./repmat((nn.S{nn.depth-1}+0.0001),1,m);
end;
%the second term comes from differentiating the regularization term: the coefficient nn.weight_decay times W
This is how the partial derivatives (the gradients of the cost with respect to the parameters) are obtained.
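A common sanity check for these hand-derived gradients is a finite-difference comparison. Below is a minimal sketch for a single weight, assuming batch normalization is switched off, that batch_x/batch_y hold one mini-batch with one sample per row, and that nn.cost(end) is the cost of the last forward pass:
epsilonFD = 1e-5; %finite-difference step
nnPlus = nn; nnPlus.W{1}(1,1) = nnPlus.W{1}(1,1) + epsilonFD;
nnMinus = nn; nnMinus.W{1}(1,1) = nnMinus.W{1}(1,1) - epsilonFD;
nnPlus = nn_forward(nnPlus, batch_x, batch_y);
nnMinus = nn_forward(nnMinus, batch_x, batch_y);
numericalGrad = (nnPlus.cost(end) - nnMinus.cost(end)) / (2*epsilonFD);
%numericalGrad should be close to the analytic nn.W_grad{1}(1,1) from nn_backpropagation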
Updating the parameters of the multilayer network: nn_applygradient
function nn = nn_applygradient(nn)%implements the various optimization methods ('normal' is plain gradient descent; the main script above uses 'Adam')
if strcmp(nn.optimization_method, 'AdaGrad') || strcmp(nn.optimization_method, 'RMSProp') || strcmp(nn.optimization_method, 'Adam')
grad_squared = 0;
if nn.batch_normalization == 0
for k = 1 : nn.depth-1
grad_squared = grad_squared + sum(sum(nn.W_grad{k}.^2))+sum(nn.b_grad{k}.^2);
end;
else
for k = 1 : nn.depth-1
grad_squared = grad_squared + sum(sum(nn.W_grad{k}.^2))+sum(nn.b_grad{k}.^2)+nn.Gamma{k}^2 + nn.Beta{k}^2;
end;
end;
end;
for k = 1 : nn.depth-1
if nn.batch_normalization == 0
if strcmp(nn.optimization_method, 'normal')
nn.W{k} = nn.W{k} - nn.learning_rate*nn.W_grad{k};
nn.b{k} = nn.b{k} - nn.learning_rate*nn.b_grad{k};
elseif strcmp(nn.optimization_method, 'AdaGrad')
nn.rW{k} = nn.rW{k} + nn.W_grad{k}.^2;
nn.rb{k} = nn.rb{k} + nn.b_grad{k}.^2;
nn.W{k} = nn.W{k} - nn.learning_rate*nn.W_grad{k}./(sqrt(nn.rW{k})+0.001);
nn.b{k} = nn.b{k} - nn.learning_rate*nn.b_grad{k}./(sqrt(nn.rb{k})+0.001);
elseif strcmp(nn.optimization_method, 'Momentum')
rho = 0.1;%momentum coefficient
nn.vW{k} = rho*nn.vW{k} - nn.learning_rate*nn.W_grad{k};
nn.vb{k} = rho*nn.vb{k} - nn.learning_rate*nn.b_grad{k};
nn.W{k} = nn.W{k} + nn.vW{k};
nn.b{k} = nn.b{k} + nn.vb{k};
elseif strcmp(nn.optimization_method, 'RMSProp')
rho = 0.9; %decay rate of the squared-gradient running average
nn.rW{k} = rho*nn.rW{k} + 0.1*nn.W_grad{k}.^2;
nn.rb{k} = rho*nn.rb{k} + 0.1*nn.b_grad{k}.^2;
nn.W{k} = nn.W{k} - nn.learning_rate*nn.W_grad{k}./(sqrt(nn.rW{k})+0.001);
nn.b{k} = nn.b{k} - nn.learning_rate*nn.b_grad{k}./(sqrt(nn.rb{k})+0.001); %rho = 0.9
elseif strcmp(nn.optimization_method, 'Adam')
rho1 = 0.9;
rho2 = 0.999;
nn.sW{k} = rho1*nn.sW{k} + (1-rho1)*nn.W_grad{k};
nn.sb{k} = rho1*nn.sb{k} + (1-rho1)*nn.b_grad{k};
nn.rW{k} = rho2*nn.rW{k} + (1-rho2)*nn.W_grad{k}.^2;
nn.rb{k} = rho2*nn.rb{k} + (1-rho2)*nn.b_grad{k}.^2;
newS = nn.sW{k}/(1-rho1^nn.AdamTime);
newR = nn.rW{k}/(1-rho2^nn.AdamTime);
nn.W{k} = nn.W{k} - nn.learning_rate*newS./sqrt(newR+0.00001);
newS = nn.sb{k}/(1-rho1^nn.AdamTime);
newR = nn.rb{k}/(1-rho2^nn.AdamTime);
nn.b{k} = nn.b{k} - nn.learning_rate*newS./sqrt(newR+0.00001); %rho1 = 0.9, rho2 = 0.999, delta = 0.00001
end;
else
if strcmp(nn.optimization_method, 'normal')
nn.W{k} = nn.W{k} - nn.learning_rate*nn.W_grad{k};
nn.b{k} = nn.b{k} - nn.learning_rate*nn.b_grad{k};
nn.Gamma{k} = nn.Gamma{k} - nn.learning_rate*nn.Gamma_grad{k};
nn.Beta{k}= nn.Beta{k} - nn.learning_rate*nn.Beta_grad{k};
elseif strcmp(nn.optimization_method, 'AdaGrad')
nn.rW{k} = nn.rW{k} + nn.W_grad{k}.^2;
nn.rb{k} = nn.rb{k} + nn.b_grad{k}.^2;
nn.rGamma{k} = nn.rGamma{k} + nn.Gamma_grad{k}^2;
nn.rBeta{k} = nn.rBeta{k} + nn.Beta_grad{k}^2;
nn.W{k} = nn.W{k} - nn.learning_rate*nn.W_grad{k}./(sqrt(nn.rW{k})+0.001);
nn.b{k} = nn.b{k} - nn.learning_rate*nn.b_grad{k}./(sqrt(nn.rb{k})+0.001);
nn.Gamma{k} = nn.Gamma{k} - nn.learning_rate*nn.Gamma_grad{k}/(sqrt(nn.rGamma{k})+0.001);
nn.Beta{k}= nn.Beta{k} - nn.learning_rate*nn.Beta_grad{k}/(sqrt(nn.rBeta{k})+0.001);
elseif strcmp(nn.optimization_method, 'RMSProp')
nn.rW{k} = 0.9*nn.rW{k} + 0.1*nn.W_grad{k}.^2;
nn.rb{k} = 0.9*nn.rb{k} + 0.1*nn.b_grad{k}.^2;
nn.rGamma{k} = 0.9*nn.rGamma{k} + 0.1*nn.Gamma_grad{k}^2;
nn.rBeta{k} = 0.9*nn.rBeta{k} + 0.1*nn.Beta_grad{k}^2;
nn.W{k} = nn.W{k} - nn.learning_rate*nn.W_grad{k}./(sqrt(nn.rW{k})+0.001);
nn.b{k} = nn.b{k} - nn.learning_rate*nn.b_grad{k}./(sqrt(nn.rb{k})+0.001);
nn.Gamma{k} = nn.Gamma{k} - nn.learning_rate*nn.Gamma_grad{k}/(sqrt(nn.rGamma{k})+0.001);
nn.Beta{k}= nn.Beta{k} - nn.learning_rate*nn.Beta_grad{k}/(sqrt(nn.rBeta{k})+0.001); %rho = 0.9
elseif strcmp(nn.optimization_method, 'Momentum')
rho = 0.1;%momentum coefficient
nn.vW{k} = rho*nn.vW{k} - nn.learning_rate*nn.W_grad{k};
nn.vb{k} = rho*nn.vb{k} - nn.learning_rate*nn.b_grad{k};
nn.vGamma{k} = rho*nn.vGamma{k} - nn.learning_rate*nn.Gamma_grad{k};
nn.vBeta{k} = rho*nn.vBeta{k} - nn.learning_rate*nn.Beta_grad{k};
nn.W{k} = nn.W{k} + nn.vW{k};
nn.b{k} = nn.b{k} + nn.vb{k};
nn.Gamma{k} = nn.Gamma{k} + nn.vGamma{k};
nn.Beta{k} = nn.Beta{k} + nn.vBeta{k};
elseif strcmp(nn.optimization_method, 'Adam') %rho1=0.9,rho2 =0.999
rho1=0.9;
rho2 =0.999;
nn.sW{k} = rho1*nn.sW{k} + (1-rho1)*nn.W_grad{k};
nn.sb{k} = rho1*nn.sb{k} + (1-rho1)*nn.b_grad{k};
nn.sGamma{k} = rho1*nn.sGamma{k} + (1-rho1)*nn.Gamma_grad{k};
nn.sBeta{k} = rho1*nn.sBeta{k} + (1-rho1)*nn.Beta_grad{k};
nn.rW{k} = rho2*nn.rW{k} + (1-rho2)*nn.W_grad{k}.^2;
nn.rb{k} = rho2*nn.rb{k} + (1-rho2)*nn.b_grad{k}.^2;
nn.rBeta{k} = rho2*nn.rBeta{k} + (1-rho2)*nn.Beta_grad{k}.^2;
nn.rGamma{k} = rho2*nn.rGamma{k} + (1-rho2)*nn.Gamma_grad{k}.^2;
newS = nn.sW{k}/(1-rho1^nn.AdamTime);
newR = nn.rW{k}/(1-rho2^nn.AdamTime);
nn.W{k} = nn.W{k} - nn.learning_rate*newS./sqrt(newR+0.00001);
newS = nn.sb{k}/(1-rho1^nn.AdamTime);
newR = nn.rb{k}/(1-rho2^nn.AdamTime);
nn.b{k} = nn.b{k} - nn.learning_rate*newS./sqrt(newR+0.00001);
newS = nn.sGamma{k}/(1-rho1^nn.AdamTime);
newR = nn.rGamma{k}/(1-rho2^nn.AdamTime);
nn.Gamma{k} = nn.Gamma{k} - nn.learning_rate*newS./sqrt(newR+0.00001);
newS = nn.sBeta{k}/(1-rho1^nn.AdamTime);
newR = nn.rBeta{k}/(1-rho2^nn.AdamTime);
nn.Beta{k} = nn.Beta{k} - nn.learning_rate*newS./sqrt(newR+0.00001);%rho1 = 0.9, rho2 = 0.999, delta = 0.00001
end;
end
end
if strcmp(nn.optimization_method, 'normal')
nn.W{k} = nn.W{k} - nn.learning_rate*nn.W_grad{k};
nn.b{k} = nn.b{k} - nn.learning_rate*nn.b_grad{k};
This yields the updated w and b.
Note: reading the files in the following order may make the whole flow easier to follow.
Only the steps needed to understand the overall pipeline are annotated; the optimization methods are skipped.
test_fot_NN: the main file, covering creation, training, and testing of the network via:
1. nn_create: builds the neural-network struct
2. nn_train: trains the network, which calls
(1) nn_forward: the forward pass
(2) nn_backpropagation: backpropagation
(3) nn_applygradient: the gradient-descent parameter update
3. nn_test: tests the network, which calls
(1) nn_predict: computes the outputs from the inputs
Three suggestions for training a neural network
(1) Normally the average value of the objective function on the training set (the cost) keeps decreasing as training proceeds; if it starts to increase, stop.
There are two possible reasons: the model is not expressive enough to fully fit the training set,
or the network is already well trained.
Stop and evaluate on held-out data to tell which case applies, and then act accordingly.
(2) Set aside a validation set. The real goal of training is to obtain the highest recognition accuracy on the validation set, so we keep the model with the highest validation accuracy as the final result.
(3) Watch the learning rate: if the cost increases after only a few training steps, the learning rate is usually too high;
if the cost barely changes from step to step, the learning rate is too low (see the sketch after this list).
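One simple way to act on this advice inside the training loop of the main script (a sketch, not part of the original code) is to shrink the learning rate whenever the average cost rises:
%possible addition inside the while-loop, right after totalCost(iteration) is computed
if iteration > 1 && totalCost(iteration) > totalCost(iteration-1)
nn.learning_rate = nn.learning_rate * 0.5; %cost went up: halve the learning rate
end;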
Practical tips for training neural networks
(1) A regularization term can be added to the objective function.
The objective becomes L(w,b) + (λ/2)·Σ‖w‖², where L(w,b) is the original objective and the second sum is the regularization term.
λ is the weight-decay coefficient: the optimization now not only makes the original objective as small as possible but also keeps
the norm of w small, which prevents the overfitting caused by weights with very large absolute values and makes the system more robust.
In the code this appears in nn_forward:
if strcmp(nn.objective_function,'MSE')
nn.cost(s) = 0.5 / m * sum(sum((nn.a{k} - batch_y).^2)) + 0.5 * nn.weight_decay * cost2;
elseif strcmp(nn.objective_function,'Cross Entropy')
nn.cost(s) = -0.5*sum(sum(batch_y.*log(nn.a{k})))/m + 0.5 * nn.weight_decay * cost2;
and in the backpropagation, nn_backpropagation:
%the second term comes from differentiating the regularization term: the coefficient nn.weight_decay times W
nn.W_grad{nn.depth-1} = nn.theta{nn.depth}*nn.a{nn.depth-1}'/m + nn.weight_decay*nn.W{nn.depth-1};
(2) Normalization of the training data
This ensures that every input dimension falls into a comparable range (see the sketch below).
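A minimal sketch of the z-score normalization used in the main script; the key point is that the validation and test data must be normalized with the mean and standard deviation computed on the training set (xNew below stands for any data matrix with one sample per row):
avgX = mean(xTraining); %per-dimension mean, computed on the training set only
sigma = std(xTraining); %per-dimension standard deviation, training set only
xNew = (xNew - repmat(avgX, size(xNew,1), 1)) ./ repmat(sigma, size(xNew,1), 1);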
(3) Initialization of the parameters w and b
The first step of stochastic gradient descent is to choose all (w,b) at random, but a poor random choice can cause vanishing gradients.
Activation functions such as sigmoid and tanh have very small gradients where the absolute value of their input is large, so if the initial pre-activations are large the gradients will be close to 0 and training will be slow; we therefore want the initial pre-activations to fall near 0.
A simple and effective method is to initialize (w,b) uniformly at random on the interval [-1/sqrt(d), 1/sqrt(d)], where d is the number of neurons in the layer feeding (w,b) (the fan-in), exactly as nn_create does below.
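A two-line sketch of this initialization, matching the nn_create code further below (the layer sizes here are illustrative):
d = 6; %fan-in: number of neurons in the preceding layer
W1 = 2*rand(10, d)/sqrt(d) - 1/sqrt(d); %each entry uniform on [-1/sqrt(d), 1/sqrt(d)]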
nn_create
function nn = nn_create(varargin)%varargin is the variable-length argument list (like argv[] in C)
%the struct nn stores all parameters and method choices
SIZE = varargin{1};
nn.keep_probability = 1;
nn.size = SIZE;
nn.depth = numel(SIZE);%numel gives the number of elements, i.e. the number of layers
nn.active_function = 'sigmoid';
nn.output_function = 'sigmoid';
nn.learning_rate = 1.5;%default learning rate
nn.weight_decay = 0;%coefficient of the regularization term, used to prevent overfitting
nn.cost = [];
nn.encoder = 0;%sparse-autoencoder flag; 0 here
nn.sparsity = 0.03;%sparsity target, only used by the autoencoder branch
nn.beta = 3;%weight of the sparsity penalty, only used by the autoencoder branch
nn.batch_normalization = 0;
nn.grad_squared = 0;
nn.r = 0;
nn.optimization_method = 'normal';
nn.objective_function = 'MSE';%default objective: mean squared error
for i = 2:length(varargin)%parse the optional name-value pairs (MATLAB indices start at 1; varargin{1} is the layer-size vector)
if strcmp('active function',varargin{i})
nn.active_function = varargin{i+1};
elseif strcmp('output function',varargin{i})
nn.output_function = varargin{i+1};
elseif strcmp('learning rate',varargin{i})
nn.learning_rate = varargin{i+1};
elseif strcmp('weight decay',varargin{i})
nn.weight_decay = varargin{i+1};
elseif strcmp('sparsity',varargin{i})
nn.sparsity = varargin{i+1};
elseif strcmp('beta',varargin{i})
nn.beta = varargin{i+1};
elseif strcmp('batch normalization',varargin{i})
nn.batch_normalization = varargin{i+1};
elseif strcmp('optimization method',varargin{i})
nn.optimization_method = varargin{i+1};
elseif strcmp('objective function', varargin{i})
nn.objective_function = varargin{i+1};
elseif strcmp('weight decay', varargin{i})
nn.weight_decay = varargin{i+1};
elseif strcmp('keep probability',varargin{i})
nn.keep_probability = varargin{i+1};
end;
end;
if strcmp(nn.objective_function, 'Cross Entropy')%a cross-entropy objective forces a softmax output layer
nn.output_function = 'softmax';
end;
for k = 1 : nn.depth-1
width = nn.size(k);
height = nn.size(k+1);
%nn.W{k} = (rand(height, width) - 0.5) * 2 * sqrt(6 / (height + width + 1)) - sqrt(6 / (height + width + 1));
nn.W{k} = 2*rand(height, width)/sqrt(width)-1/sqrt(width);%initialize W uniformly on [-1/sqrt(width), 1/sqrt(width)]
if abs(nn.keep_probability-1)>0.001
nn.WMask{k} = ones(height,width);
end;
%nn.W{k} = 2*rand(height, width)-1;
%Xavier initialization
if strcmp(nn.active_function, 'relu')
nn.b{k} = rand(height,1)+0.01;
else
nn.b{k} = 2*rand(height, 1)/sqrt(width)-1/sqrt(width);%initialize the bias b the same way
end;
%parameters for moments
if strcmp(nn.optimization_method,'Momentum')%the remaining branches initialize state for the various optimizers; details can be skipped
nn.vW{k} = zeros(height,width);
nn.vb{k} = zeros(height,1);
end;
if strcmp(nn.optimization_method,'AdaGrad') ||strcmp(nn.optimization_method,'RMSProp') || strcmp(nn.optimization_method,'Adam')
nn.rW{k} = zeros(height,width);
nn.rb{k} = zeros(height,1);
end;
if strcmp(nn.optimization_method,'Adam')
nn.sW{k} = zeros(height,width);
nn.sb{k} = zeros(height,1);
end;
%parameters for batch normalization.
if nn.batch_normalization
nn.E{k} = zeros(height,1);
nn.S{k} = zeros(height,1);
nn.Gamma{k} = 1;
nn.Beta{k} = 0;
if strcmp(nn.optimization_method,'Momentum')
nn.vGamma{k} = 1;
nn.vBeta{k} = 0;
end;
if strcmp(nn.optimization_method,'AdaGrad') ||strcmp(nn.optimization_method,'RMSProp') || strcmp(nn.optimization_method,'Adam')
nn.rW{k} = zeros(height,width);
nn.rb{k} = zeros(height,1);
nn.rGamma{k} = 0;
nn.rBeta{k} = 0;
end;
if strcmp(nn.optimization_method,'Adam')
nn.sGamma{k} = 1;
nn.sBeta{k} = 0;
end;
nn.vecNum = 0;
end;
nn.W_grad{k} = zeros(height,width);
end
if strcmp(nn.optimization_method,'Adam')
nn.AdamTime = 0;
end;
nn.W{k} = 2*rand(height, width)/sqrt(width)-1/sqrt(width);%initialize W uniformly on [-1/sqrt(width), 1/sqrt(width)]
if abs(nn.keep_probability-1)>0.001
nn.WMask{k} = ones(height,width);
end;
%nn.W{k} = 2*rand(height, width)-1;
%Xavier initialization
if strcmp(nn.active_function, 'relu')
nn.b{k} = rand(height,1)+0.01;
else
nn.b{k} = 2*rand(height, 1)/sqrt(width)-1/sqrt(width);%initialize the bias b the same way
This is the initialization of w and b.
(4) Batch normalization
Basic idea: since we want the values at every layer to stay near 0 in order to avoid vanishing gradients, why not directly normalize each layer's values using their mean and variance?
We can normalize each layer's input X by subtracting the mean and dividing by the standard deviation (see the sketch below).
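A simplified sketch of that normalization for one layer's pre-activations z (columns = samples), using per-batch statistics; z, gamma, and beta are hypothetical names here, and the nn_forward code above additionally maintains running estimates nn.E and nn.S:
mu = mean(z, 2); %per-neuron mean over the batch
s = std(z, 0, 2); %per-neuron standard deviation over the batch
zHat = (z - repmat(mu, 1, size(z,2))) ./ repmat(s + 0.0001, 1, size(z,2)); %normalize
zBN = gamma*zHat + beta; %learned scale and shift, scalars as in nn_create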
(5) Parameter-update strategies
The gradient magnitudes of the different components of (w,b) can differ widely; in some cases this forces the optimization path into a zig-zag shape.
The AdaGrad method addresses this by dividing each component's step by the square root of its accumulated squared gradients, so components with persistently large gradients take smaller steps.
The problem with SGD
SGD's gradient estimate, taken from a single mini-batch, is very noisy.
Introducing momentum
Each update considers not only the current gradient direction but also the direction of the previous update, and combines the two with a manually chosen weighting to obtain the final update direction.
The Adam algorithm combines AdaGrad and Momentum, addressing both problems at once, and also introduces a mechanism that gradually reduces the effective step size, so that steps are larger at the beginning and smaller later on (see the sketch below).
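For reference, the update rules discussed in this subsection written out for a single weight matrix W with gradient g and learning rate lr; this is a sketch using the hyper-parameter values from nn_applygradient, with v, r, s, and the step counter t as the optimizer's state:
%Momentum: blend the previous update direction with the current gradient
v = 0.1*v - lr*g;
W = W + v;
%AdaGrad: accumulate squared gradients and shrink the step where they are large
r = r + g.^2;
W = W - lr*g./(sqrt(r) + 0.001);
%Adam: running averages of the gradient (s) and its square (r), with bias correction
s = 0.9*s + 0.1*g;
r = 0.999*r + 0.001*g.^2;
sHat = s/(1 - 0.9^t);
rHat = r/(1 - 0.999^t);
W = W - lr*sHat./sqrt(rHat + 0.00001);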