Self-Study Machine Learning Notes (10)

Solving the King-Pawn ("兵王") Endgame Problem with an Artificial Neural Network

Main script

clear all;
fid  =  fopen('krkopt.DATA');
c = fread(fid, 3);


vec = zeros(6,1);
xapp = [];
yapp = [];
flag = 0;  % counter for the lines read from the file
while ~feof(fid)
    string = [];
    c = fread(fid,1);
    flag = flag+1;
    while c~=13
        string = [string, c];
        c=fread(fid,1);
    end;
    fread(fid,1);  % skip the line-feed character that follows the carriage return
    if length(string)>10
        % the six features are the file/rank coordinates of the three pieces:
        % letters 'a'-'h' map to 1-8 (ASCII 'a' = 97), digits '1'-'8' map to 1-8 (ASCII '0' = 48)
        vec(1) = string(1) - 96;
        vec(2) = string(3) - 48;
        vec(3) = string(5) - 96;
        vec(4) = string(7) - 48;
        vec(5) = string(9) - 96;
        vec(6) = string(11) - 48;
        xapp = [xapp,vec];
        if string(13) == 100
            yapp = [yapp,[1,0]'];
        else
            yapp = [yapp,[0,1]'];
        end;
    end;
end;
fclose(fid);

[N,M] = size(xapp);
p = randperm(M); %Shuffle the samples: random permutation of the column indices
ratioTraining = 0.15; 
ratioValidation = 0.05;
ratioTesting = 0.8;
xTraining = [];
yTraining = [];
for i=1:floor(ratioTraining*M)
    xTraining  = [xTraining,xapp(:,p(i))];
    yTraining = [yTraining,yapp(:,p(i))];
end;
xTraining = xTraining';
yTraining = yTraining';


[U,V] = size(xTraining);
avgX = mean(xTraining);
sigma = std(xTraining);
xTraining = (xTraining - repmat(avgX,U,1))./repmat(sigma,U,1);

xValidation = [];
yValidation = [];
for i=floor(ratioTraining*M)+1:floor((ratioTraining+ratioValidation)*M)
    xValidation  = [xValidation,xapp(:,p(i))];
    yValidation = [yValidation,yapp(:,p(i))];
end;
xValidation= xValidation';
yValidation = yValidation';

[U,V] = size(xValidation);
xValidation = (xValidation - repmat(avgX,U,1))./repmat(sigma,U,1);

xTesting = [];
yTesting = [];
for i=floor((ratioTraining+ratioValidation)*M)+1:M
    xTesting  = [xTesting,xapp(:,p(i))];
    yTesting = [yTesting,yapp(:,p(i))];
end;
xTesting = xTesting';
yTesting = yTesting';
[U,V] = size(xTesting);
xTesting = (xTesting - repmat(avgX,U,1))./repmat(sigma,U,1);

%create a neural net
clear nn;

nn = nn_create([6,10,10,10,10,10,10,10,10,10,10,2],'active function','relu','learning rate',0.005, 'batch normalization',1,'optimization method','Adam', 'objective function', 'Cross Entropy');



%train
option.batch_size = 100;
option.iteration = 1;

iteration = 0;
maxAccuracy = 0;
totalAccuracy = [];
maxIteration = 10000;
while(iteration<=maxIteration)
    iteration = iteration +1; 
    nn = nn_train(nn,option,xTraining,yTraining);
    totalCost(iteration) = sum(nn.cost)/length(nn.cost);
    [wrongs,accuracy] = nn_test(nn,xValidation,yValidation);
    totalAccuracy = [totalAccuracy,accuracy];
    if accuracy>maxAccuracy
        maxAccuracy = accuracy;
        storedNN = nn;
    end;
    cost = totalCost(iteration);
    accuracy
    cost
end;
[wrongs,accuracy] = nn_test(storedNN,xTesting,yTesting);

Reading in the data

clear all;
fid  =  fopen('krkopt.DATA');
c = fread(fid, 3);


vec = zeros(6,1);
xapp = [];
yapp = [];
flag = 0;  % counter for the lines read from the file
while ~feof(fid)
    string = [];
    c = fread(fid,1);
    flag = flag+1;
    while c~=13
        string = [string, c];
        c=fread(fid,1);
    end;
    fread(fid,1);  % skip the line-feed character that follows the carriage return
    if length(string)>10
        vec(1) = string(1) - 96;
        vec(2) = string(3) - 48;
        vec(3) = string(5) - 96;
        vec(4) = string(7) - 48;
        vec(5) = string(9) - 96;
        vec(6) = string(11) - 48;
        xapp = [xapp,vec];
        if string(13) == 100
            yapp = [yapp,[1,0]'];
        else
            yapp = [yapp,[0,1]'];
        end;
    end;
end;
fclose(fid);

Note

 if string(13) == 100
            yapp = [yapp,[1,0]'];
        else
            yapp = [yapp,[0,1]'];
        end;

As mentioned earlier, for an N-class problem the labels Y are stored as one-hot vectors. Here string(13) == 100 tests for the ASCII code of 'd', i.e. the "draw" label, so draws are encoded as [1,0]' and every other outcome as [0,1]'.
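
As a small illustration (a hypothetical snippet, not part of the original script), a one-hot label for an N-class problem can be taken as a column of an identity matrix:

N = 3;                  % hypothetical number of classes
onehot = eye(N);        % column j of eye(N) is the one-hot vector for class j
yClass2 = onehot(:,2);  % [0;1;0] encodes class 2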

Splitting the dataset

[N,M] = size(xapp);
p = randperm(M); %Shuffle the samples: random permutation of the column indices
ratioTraining = 0.15; 
ratioValidation = 0.05;
ratioTesting = 0.8;
xTraining = [];
yTraining = [];
for i=1:floor(ratioTraining*M)
    xTraining  = [xTraining,xapp(:,p(i))];
    yTraining = [yTraining,yapp(:,p(i))];
end;
xTraining = xTraining';
yTraining = yTraining';
ratioTraining = 0.15; 
ratioValidation = 0.05;
ratioTesting = 0.8;

We split the whole dataset into a training set, a validation set, and a test set.

After each round of adjustment, we use the validation set to check whether the adjustment helped and to decide whether to stop training.

Normalizing the training, validation and test data

[U,V] = size(xTraining);
avgX = mean(xTraining);
sigma = std(xTraining);
xTraining = (xTraining - repmat(avgX,U,1))./repmat(sigma,U,1);

xValidation = [];
yValidation = [];
for i=floor(ratioTraining*M)+1:floor((ratioTraining+ratioValidation)*M)
    xValidation  = [xValidation,xapp(:,p(i))];
    yValidation = [yValidation,yapp(:,p(i))];
end;
xValidation= xValidation';
yValidation = yValidation';

[U,V] = size(xValidation);
xValidation = (xValidation - repmat(avgX,U,1))./repmat(sigma,U,1);

xTesting = [];
yTesting = [];
for i=floor((ratioTraining+ratioValidation)*M)+1:M
    xTesting  = [xTesting,xapp(:,p(i))];
    yTesting = [yTesting,yapp(:,p(i))];
end;
xTesting = xTesting';
yTesting = yTesting';
[U,V] = size(xTesting);
xTesting = (xTesting - repmat(avgX,U,1))./repmat(sigma,U,1);

Creating the neural network

clear nn;

nn = nn_create([6,10,10,10,10,10,10,10,10,10,10,2],'active function','relu','learning rate',0.005, 'batch normalization',1,'optimization method','Adam', 'objective function', 'Cross Entropy');

The nn_create function builds a neural network.

The first argument lists the number of neurons in each layer: the king-pawn problem has 6 input dimensions, so the vector starts with 6; the output has 2 dimensions, so it ends with 2; in between there are 10 hidden layers with 10 neurons each.

The remaining name-value pairs configure the network: the activation function is relu and the learning rate is 0.005.

Training

option.batch_size = 100;
option.iteration = 1;

iteration = 0;
maxAccuracy = 0;
totalAccuracy = [];
maxIteration = 10000;
while(iteration<=maxIteration)
    iteration = iteration +1; 
    nn = nn_train(nn,option,xTraining,yTraining);
    totalCost(iteration) = sum(nn.cost)/length(nn.cost);
    [wrongs,accuracy] = nn_test(nn,xValidation,yValidation);
    totalAccuracy = [totalAccuracy,accuracy];
    if accuracy>maxAccuracy
        maxAccuracy = accuracy;
        storedNN = nn;
    end;
    cost = totalCost(iteration);
    accuracy
    cost
end;
[wrongs,accuracy] = nn_test(storedNN,xTesting,yTesting);

objective function is the objective (loss) function; here we choose cross entropy.

option.batch_size = 100;
option.iteration = 1;

This means that each batch contains 100 training samples.

maxIteration = 10000;

This sets the maximum number of training rounds to 10000.

 totalCost(iteration) = sum(nn.cost)/length(nn.cost);

The average value of the loss function over the batches.

 [wrongs,accuracy] = nn_test(nn,xValidation,yValidation);

Test the recognition rate on the validation set.

Finally, the stored best model is tested on the test set to obtain the final recognition rate.

Forward pass: nn_forward

function nn = nn_forward(nn,batch_x,batch_y)    
    s = size(nn.cost) + 1;% s is the size of the cost vector plus 1 in each dimension; together with the
                          % nn.cost(s) assignment below, the practical effect is to append one new value
                          % to the row vector nn.cost on every call
    batch_x = batch_x';
    batch_y = batch_y';
    m = size(batch_x,2);% size(X,2) returns the number of columns, i.e. the batch size
    nn.a{1} = batch_x;
    cost2 = 0;% cost2 is the second term of the cost, i.e. the added regularization term
    for k = 2 : nn.depth
        y = nn.W{k-1} * nn.a{k-1} + repmat(nn.b{k-1},1,m);% repmat(A,m,n) tiles A into an m-by-n block matrix
        % the m samples of the batch are stored as matrix columns and processed together; the bias is the same
        % for every sample, so b is replicated m times. Here y is the z of the derivation.
        if nn.batch_normalization
            nn.E{k-1} = nn.E{k-1}*nn.vecNum + sum(y,2);
            nn.S{k-1} = nn.S{k-1}.^2*(nn.vecNum-1) + (m-1)*std(y,0,2).^2;
            nn.vecNum = nn.vecNum + m;
            nn.E{k-1} = nn.E{k-1}/nn.vecNum;
            nn.S{k-1} = sqrt(nn.S{k-1}/(nn.vecNum-1));
            y = (y - repmat(nn.E{k-1},1,m))./repmat(nn.S{k-1}+0.0001*ones(size(nn.S{k-1})),1,m);
            y = nn.Gamma{k-1}*y+nn.Beta{k-1};
        end;
        if k == nn.depth% choose the output-layer activation function
            switch nn.output_function
                case 'sigmoid'
                    nn.a{k} = sigmoid(y);
                case 'tanh'
                    nn.a{k} = tanh(y);
                case 'relu'
                    nn.a{k} = max(y,0);
                case 'softmax'
                    nn.a{k} = softmax(y);
            end
        else 
            switch nn.active_function% choose the hidden-layer activation function
                case 'sigmoid'
                    nn.a{k} = sigmoid(y);
                case 'tanh'
                    nn.a{k} = tanh(y);
                case 'relu'
                    nn.a{k} = max(y,0);
            end
        end
        cost2 = cost2 +  sum(sum(nn.W{k-1}.^2));% accumulate the regularization term
    end
    if nn.encoder == 1% encoder is 0 in this example, so this branch can be skipped
        roj = sum(nn.a{2},2)/m;
        nn.cost(s) = 0.5 * sum(sum((nn.a{k} - batch_y).^2))/m + 0.5 * nn.weight_decay * cost2 + 3 * sum(nn.sparsity * log(nn.sparsity ./ roj) + ...
            (1-nn.sparsity) * log((1-nn.sparsity) ./ (1-roj)));
    else
        if strcmp(nn.objective_function,'MSE')
            nn.cost(s) = 0.5 / m * sum(sum((nn.a{k} - batch_y).^2)) + 0.5 * nn.weight_decay * cost2;
        elseif strcmp(nn.objective_function,'Cross Entropy')
            nn.cost(s) = -0.5*sum(sum(batch_y.*log(nn.a{k})))/m + 0.5 * nn.weight_decay * cost2;
        
    end
    
end
 y = nn.W{k-1} * nn.a{k-1} + repmat(nn.b{k-1},1,m);% repmat(A,m,n) tiles A into an m-by-n block matrix
   switch nn.active_function% choose the hidden-layer activation function
                case 'sigmoid'
                    nn.a{k} = sigmoid(y);
                case 'tanh'
                    nn.a{k} = tanh(y);
                case 'relu'
                    nn.a{k} = max(y,0);
            end

The output of layer k-1 is multiplied by the weights, the bias is added, and the result is passed through the activation function to produce the output of layer k, as summarized below.
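
In the notation of the derivation, this is a compact summary of what the loop above computes (f is the hidden-layer activation, here relu; the output layer uses softmax because the objective is cross entropy):

$$z^{(k)} = W^{(k-1)}a^{(k-1)} + b^{(k-1)}, \qquad a^{(k)} = f\big(z^{(k)}\big)$$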

Backpropagation: nn_backpropagation

function nn = nn_backpropagation(nn,batch_y)
    batch_y = batch_y';
    m = size(nn.a{1},2);% m is the number of columns, i.e. the batch size
    nn.theta{1} = 0;% nn.theta is the δ of the derivation; nn.theta{nn.depth} corresponds to δ(L) of the output layer
                    % as before, the cell array { } stores δ for every layer, which is the whole point of backpropagation
    switch nn.output_function 
        case 'sigmoid'
            nn.theta{nn.depth} = -(batch_y-nn.a{nn.depth}) .* nn.a{nn.depth} .* (1 - nn.a{nn.depth});
        case 'tanh'
            nn.theta{nn.depth} = -(batch_y-nn.a{nn.depth}) .* (1 - nn.a{nn.depth}.^2);
        case 'softmax'
            nn.theta{nn.depth} = nn.a{nn.depth} - batch_y;
    end
    if nn.batch_normalization
        x = nn.W{nn.depth-1} * nn.a{nn.depth-1} + repmat(nn.b{nn.depth-1},1,m);
        x = (x - repmat(nn.E{nn.depth-1},1,m))./repmat(nn.S{nn.depth-1}+0.0001*ones(size(nn.S{nn.depth-1})),1,m);
        temp = nn.theta{nn.depth}.*x;
        nn.Gamma_grad{nn.depth-1} = sum(mean(temp,2));
        nn.Beta_grad{nn.depth-1} = sum(mean(nn.theta{nn.depth},2));
        nn.theta{nn.depth} = nn.Gamma{nn.depth-1}*nn.theta{nn.depth}./repmat((nn.S{nn.depth-1}+0.0001),1,m);
    end;
    % the second term comes from differentiating the regularization term: the coefficient nn.weight_decay times W
    nn.W_grad{nn.depth-1} = nn.theta{nn.depth}*nn.a{nn.depth-1}'/m + nn.weight_decay*nn.W{nn.depth-1};
    nn.b_grad{nn.depth-1} = sum(nn.theta{nn.depth},2)/m;
    % sum(X) with no dimension argument sums each column and returns a row vector; sum(X,2) sums each row and returns a column vector
    % both expressions average the gradients over the m samples of the batch
    switch nn.active_function
        case 'sigmoid'
            if nn.encoder == 0;
                for ll = 2 : nn.depth - 1
                    k = nn.depth - ll + 1;% the essence of backpropagation: recurse from the back towards the front to compute δ
                    % the expressions below write δ in matrix form (not element-wise δi as in the lecture derivation), so they are direct matrix products
                    nn.theta{k} = ((nn.W{k}'*nn.theta{k+1})) .* nn.a{k} .* (1 - nn.a{k});
                    if nn.batch_normalization% same batch-normalization bookkeeping as for the output layer
                        x = nn.W{k-1} * nn.a{k-1} + repmat(nn.b{k-1},1,m);
                        x = (x - repmat(nn.E{k-1},1,m))./repmat(nn.S{k-1}+0.0001*ones(size(nn.S{k-1})),1,m);
                        temp = nn.theta{k}.*x;
                        nn.Gamma_grad{k-1} = sum(mean(temp,2));
                        nn.Beta_grad{k-1} = sum(mean(nn.theta{k},2));
                        nn.theta{k} = nn.Gamma{k-1}*nn.theta{k}./repmat((nn.S{k-1}+0.0001),1,m);
                    end;
                    nn.W_grad{k-1} = nn.theta{k}*nn.a{k-1}'/m + nn.weight_decay*nn.W{k-1};
                    nn.b_grad{k-1} = sum(nn.theta{k},2)/m;% averaged over the batch, as explained above
                end
            else
                roj = sum(nn.a{2},2)/m;
                temp = (-nn.sparsity./roj+(1-nn.sparsity)./(1-roj));
                nn.theta{2} = ((nn.W{2}'*nn.theta{3}) + nn.beta*repmat(temp,1,m)) .* nn.a{2} .* (1 - nn.a{2});
                nn.W_grad{1} = nn.theta{2}*nn.a{1}'/m + nn.weight_decay*nn.W{1};
                nn.b_grad{1} = sum(nn.theta{2},2)/m;
            end
        

            
        case 'tanh'
            for ll = 2 : nn.depth - 1
                if nn.encoder == 0;
                    k = nn.depth - ll + 1;
                    nn.theta{k} = ((nn.W{k}'*nn.theta{k+1})) .* (1-nn.a{k}.^2);
                    if nn.batch_normalization
                        x = nn.W{k-1} * nn.a{k-1} + repmat(nn.b{k-1},1,m);
                        x = (x - repmat(nn.E{k-1},1,m))./repmat(nn.S{k-1}+0.0001*ones(size(nn.S{k-1})),1,m);
                        temp = nn.theta{k}.*x;
                        nn.Gamma_grad{k-1} = sum(mean(temp,2));
                        nn.Beta_grad{k-1} = sum(mean(nn.theta{k},2));
                        nn.theta{k} = nn.Gamma{k-1}*nn.theta{k}./repmat((nn.S{k-1}+0.0001),1,m);
                    end;
                    nn.W_grad{k-1} = nn.theta{k}*nn.a{k-1}'/m + nn.weight_decay*nn.W{k-1};
                    nn.b_grad{k-1} = sum(nn.theta{k},2)/m;
                else
                    roj = sum(nn.a{2},2)/m;
                    temp = (-nn.sparsity./roj+(1-nn.sparsity)./(1-roj));
                    nn.theta{2} = ((nn.W{2}'*nn.theta{3}) + nn.beta*repmat(temp,1,m)) .* (1-nn.a{2}.^2);
                    nn.W_grad{1} = nn.theta{2}*nn.a{1}'/m + nn.weight_decay*nn.W{1};
                    nn.b_grad{1} = sum(nn.theta{2},2)/m;
                end
            end
            
        case 'relu'
            if nn.encoder == 0;
                for ll = 2 : nn.depth - 1
                    k = nn.depth - ll + 1;
                  
                    nn.theta{k} = ((nn.W{k}'*nn.theta{k+1})).*(nn.a{k}>0);
                    if nn.batch_normalization
                        x = nn.W{k-1} * nn.a{k-1} + repmat(nn.b{k-1},1,m);
                        x = (x - repmat(nn.E{k-1},1,m))./repmat(nn.S{k-1}+0.0001*ones(size(nn.S{k-1})),1,m);
                        temp = nn.theta{k}.*x;
                        nn.Gamma_grad{k-1} = sum(mean(temp,2));
                        nn.Beta_grad{k-1} = sum(mean(nn.theta{k},2));
                        nn.theta{k} = nn.Gamma{k-1}*nn.theta{k}./repmat((nn.S{k-1}+0.0001),1,m);
                    end;
                    nn.W_grad{k-1} = nn.theta{k}*nn.a{k-1}'/m + nn.weight_decay*nn.W{k-1};
                    nn.b_grad{k-1} = sum(nn.theta{k},2)/m;
                end
            else
                roj = sum(nn.a{2},2)/m;
                temp = (-nn.sparsity./roj+(1-nn.sparsity)./(1-roj));
                M = max(nn.a{2},0);
                M = M./max(M,0.001);
                    
                nn.theta{2} = ((nn.W{2}'*nn.theta{3}) + nn.beta*repmat(temp,1,m)) .* M;
                nn.W_grad{1} = nn.theta{2}*nn.a{1}'/m + nn.weight_decay*nn.W{1};
                nn.b_grad{1} = sum(nn.theta{2},2)/m;
            end
    end
    
end
 switch nn.output_function 
        case 'sigmoid'
            nn.theta{nn.depth} = -(batch_y-nn.a{nn.depth}) .* nn.a{nn.depth} .* (1 - nn.a{nn.depth});
        case 'tanh'
            nn.theta{nn.depth} = -(batch_y-nn.a{nn.depth}) .* (1 - nn.a{nn.depth}.^2);
        case 'softmax'
            nn.theta{nn.depth} = nn.a{nn.depth} - batch_y;
    end
    if nn.batch_normalization
        x = nn.W{nn.depth-1} * nn.a{nn.depth-1} + repmat(nn.b{nn.depth-1},1,m);
        x = (x - repmat(nn.E{nn.depth-1},1,m))./repmat(nn.S{nn.depth-1}+0.0001*ones(size(nn.S{nn.depth-1})),1,m);
        temp = nn.theta{nn.depth}.*x;
        nn.Gamma_grad{nn.depth-1} = sum(mean(temp,2));
        nn.Beta_grad{nn.depth-1} = sum(mean(nn.theta{nn.depth},2));
        nn.theta{nn.depth} = nn.Gamma{nn.depth-1}*nn.theta{nn.depth}./repmat((nn.S{nn.depth-1}+0.0001),1,m);
    end;
    % the second term comes from differentiating the regularization term: the coefficient nn.weight_decay times W

This is the step that computes the partial derivatives: the output-layer error δ, followed by the gradients of W and b, written out below.
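
Written out for the configuration used here (softmax output, cross-entropy objective, ReLU hidden layers; m is the batch size and λ = nn.weight_decay):

$$\delta^{(L)} = a^{(L)} - y, \qquad \delta^{(k)} = \big((W^{(k)})^{T}\delta^{(k+1)}\big)\odot \mathbf{1}\big[a^{(k)} > 0\big]$$

$$\frac{\partial J}{\partial W^{(k-1)}} = \frac{1}{m}\,\delta^{(k)}\big(a^{(k-1)}\big)^{T} + \lambda W^{(k-1)}, \qquad \frac{\partial J}{\partial b^{(k-1)}} = \frac{1}{m}\sum_{i=1}^{m}\delta^{(k)}_{:,i}$$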

Updating the parameters of the multi-layer network: nn_applygradient

function nn = nn_applygradient(nn)% applies the gradients; all optimization methods are implemented here (the main script above selects Adam)

if strcmp(nn.optimization_method, 'AdaGrad') || strcmp(nn.optimization_method, 'RMSProp') || strcmp(nn.optimization_method, 'Adam')
    grad_squared = 0;
    if nn.batch_normalization == 0
        for k = 1 : nn.depth-1
            grad_squared = grad_squared + sum(sum(nn.W_grad{k}.^2))+sum(nn.b_grad{k}.^2);
        end;
    else
        for k = 1 : nn.depth-1
            grad_squared = grad_squared + sum(sum(nn.W_grad{k}.^2))+sum(nn.b_grad{k}.^2)+nn.Gamma{k}^2 + nn.Beta{k}^2;
        end;
    end;
end;

for k = 1 : nn.depth-1
    if nn.batch_normalization == 0
        if strcmp(nn.optimization_method, 'normal')
            nn.W{k} = nn.W{k} - nn.learning_rate*nn.W_grad{k};
            nn.b{k} = nn.b{k} - nn.learning_rate*nn.b_grad{k};
        elseif strcmp(nn.optimization_method, 'AdaGrad')
            nn.rW{k} = nn.rW{k} + nn.W_grad{k}.^2;
            nn.rb{k} = nn.rb{k} + nn.b_grad{k}.^2;
            nn.W{k} = nn.W{k} - nn.learning_rate*nn.W_grad{k}./(sqrt(nn.rW{k})+0.001);
            nn.b{k} = nn.b{k} - nn.learning_rate*nn.b_grad{k}./(sqrt(nn.rb{k})+0.001);
        elseif strcmp(nn.optimization_method, 'Momentum')
            rho = 0.1;%rho = 0.1;
            nn.vW{k} = rho*nn.vW{k} - nn.learning_rate*nn.W_grad{k};
            nn.vb{k} = rho*nn.vb{k} - nn.learning_rate*nn.b_grad{k};
            nn.W{k} = nn.W{k} + nn.vW{k};
            nn.b{k} = nn.b{k} + nn.vb{k}; 

        elseif strcmp(nn.optimization_method, 'RMSProp')
            rho = 0.9; %rho=0.9
            nn.rW{k} = rho*nn.rW{k} + 0.1*nn.W_grad{k}.^2;
            nn.rb{k} = rho*nn.rb{k} + 0.1*nn.b_grad{k}.^2;

            nn.W{k} = nn.W{k} - nn.learning_rate*nn.W_grad{k}./(sqrt(nn.rW{k})+0.001);
            nn.b{k} = nn.b{k} - nn.learning_rate*nn.b_grad{k}./(sqrt(nn.rb{k})+0.001); %rho = 0.9
        elseif strcmp(nn.optimization_method, 'Adam')
            rho1 = 0.9;
            rho2 = 0.999;
            nn.sW{k} = rho1*nn.sW{k} + (1-rho1)*nn.W_grad{k};
            nn.sb{k} = rho1*nn.sb{k} + (1-rho1)*nn.b_grad{k};
            nn.rW{k} = rho2*nn.rW{k} + (1-rho2)*nn.W_grad{k}.^2;
            nn.rb{k} = rho2*nn.rb{k} + (1-rho2)*nn.b_grad{k}.^2;
            
            newS = nn.sW{k}/(1-rho1^nn.AdamTime);
            newR = nn.rW{k}/(1-rho2^nn.AdamTime);
            nn.W{k} = nn.W{k} - nn.learning_rate*newS./sqrt(newR+0.00001);
            newS = nn.sb{k}/(1-rho1^nn.AdamTime);
            newR = nn.rb{k}/(1-rho2^nn.AdamTime);
            nn.b{k} = nn.b{k} - nn.learning_rate*newS./sqrt(newR+0.00001);  %rho1 = 0.9, rho2 = 0.999, delta = 0.00001
        end;
    else
        if strcmp(nn.optimization_method, 'normal')
            nn.W{k} = nn.W{k} - nn.learning_rate*nn.W_grad{k};
            nn.b{k} = nn.b{k} - nn.learning_rate*nn.b_grad{k};
            nn.Gamma{k} = nn.Gamma{k} - nn.learning_rate*nn.Gamma_grad{k};
            nn.Beta{k}= nn.Beta{k} - nn.learning_rate*nn.Beta_grad{k};
        elseif strcmp(nn.optimization_method, 'AdaGrad')
            nn.rW{k} = nn.rW{k} + nn.W_grad{k}.^2;
            nn.rb{k} = nn.rb{k} + nn.b_grad{k}.^2;
            nn.rGamma{k} = nn.rGamma{k} +  nn.Gamma_grad{k}^2;
            nn.rBeta{k} = nn.rBeta{k} +  nn.Beta_grad{k}^2;
            nn.W{k} = nn.W{k} - nn.learning_rate*nn.W_grad{k}./(sqrt(nn.rW{k})+0.001);
            nn.b{k} = nn.b{k} - nn.learning_rate*nn.b_grad{k}./(sqrt(nn.rb{k})+0.001);
            nn.Gamma{k} = nn.Gamma{k} - nn.learning_rate*nn.Gamma_grad{k}/(sqrt(nn.rGamma{k})+0.001);
            nn.Beta{k}= nn.Beta{k} - nn.learning_rate*nn.Beta_grad{k}/(sqrt(nn.rBeta{k})+0.001);
        elseif strcmp(nn.optimization_method, 'RMSProp')
            nn.rW{k} = 0.9*nn.rW{k} + 0.1*nn.W_grad{k}.^2;
            nn.rb{k} = 0.9*nn.rb{k} + 0.1*nn.b_grad{k}.^2;
            nn.rGamma{k} = 0.9*nn.rGamma{k} +  0.1*nn.Gamma_grad{k}^2;
            nn.rBeta{k} = 0.9*nn.rBeta{k} +  0.1*nn.Beta_grad{k}^2;
            nn.W{k} = nn.W{k} - nn.learning_rate*nn.W_grad{k}./(sqrt(nn.rW{k})+0.001);
            nn.b{k} = nn.b{k} - nn.learning_rate*nn.b_grad{k}./(sqrt(nn.rb{k})+0.001);
            nn.Gamma{k} = nn.Gamma{k} - nn.learning_rate*nn.Gamma_grad{k}/(sqrt(nn.rGamma{k})+0.001);
            nn.Beta{k}= nn.Beta{k} - nn.learning_rate*nn.Beta_grad{k}/(sqrt(nn.rBeta{k})+0.001); %rho = 0.9
        elseif strcmp(nn.optimization_method, 'Momentum')
            rho = 0.1;%rho = 0.1;
            nn.vW{k} = rho*nn.vW{k} - nn.learning_rate*nn.W_grad{k};
            nn.vb{k} = rho*nn.vb{k} - nn.learning_rate*nn.b_grad{k};
            nn.vGamma{k} = rho*nn.vGamma{k} - nn.learning_rate*nn.Gamma_grad{k};
            nn.vBeta{k} = rho*nn.vBeta{k} - nn.learning_rate*nn.Beta_grad{k};
            nn.W{k} = nn.W{k} + nn.vW{k};
            nn.b{k} = nn.b{k} + nn.vb{k}; 
            nn.Gamma{k} = nn.Gamma{k} + nn.vGamma{k};
            nn.Beta{k} = nn.Beta{k} + nn.vBeta{k};
            
        elseif strcmp(nn.optimization_method, 'Adam') %rho1=0.9,rho2 =0.999
            rho1=0.9;
            rho2 =0.999;
            nn.sW{k} = rho1*nn.sW{k} + (1-rho1)*nn.W_grad{k};
            nn.sb{k} = rho1*nn.sb{k} + (1-rho1)*nn.b_grad{k};
            nn.sGamma{k} = rho1*nn.sGamma{k} + (1-rho1)*nn.Gamma_grad{k};
            nn.sBeta{k} = rho1*nn.sBeta{k} + (1-rho1)*nn.Beta_grad{k};
            nn.rW{k} = rho2*nn.rW{k} + (1-rho2)*nn.W_grad{k}.^2;
            nn.rb{k} = rho2*nn.rb{k} + (1-rho2)*nn.b_grad{k}.^2;
            nn.rBeta{k} = rho2*nn.rBeta{k} + (1-rho2)*nn.Beta_grad{k}.^2;
            nn.rGamma{k} = rho2*nn.rGamma{k} + (1-rho2)*nn.Gamma_grad{k}.^2;   
            
            newS = nn.sW{k}/(1-rho1^nn.AdamTime);
            newR =  nn.rW{k}/(1-rho2^nn.AdamTime);
            nn.W{k} = nn.W{k} - nn.learning_rate*newS./sqrt(newR+0.00001);
            
            newS = nn.sb{k}/(1-rho1^nn.AdamTime);
            newR =  nn.rb{k}/(1-rho2^nn.AdamTime);
            nn.b{k} = nn.b{k} - nn.learning_rate*newS./sqrt(newR+0.00001); 
            
            newS = nn.sGamma{k}/(1-rho1^nn.AdamTime);
            newR =  nn.rGamma{k}/(1-rho2^nn.AdamTime);
            nn.Gamma{k} = nn.Gamma{k} - nn.learning_rate*newS./sqrt(newR+0.00001);
            
            newS = nn.sBeta{k}/(1-rho1^nn.AdamTime);
            newR =  nn.rBeta{k}/(1-rho2^nn.AdamTime);
            nn.Beta{k} = nn.Beta{k} - nn.learning_rate*newS./sqrt(newR+0.00001);%rho1 = 0.9, rho2 = 0.999, delta = 0.00001
        end;

    end
end
 if strcmp(nn.optimization_method, 'normal')
            nn.W{k} = nn.W{k} - nn.learning_rate*nn.W_grad{k};
            nn.b{k} = nn.b{k} - nn.learning_rate*nn.b_grad{k};

This is the plain ('normal') gradient-descent step that produces the new W and b, shown symbolically below.
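
Symbolically, with η = nn.learning_rate:

$$W^{(k)} \leftarrow W^{(k)} - \eta\,\frac{\partial J}{\partial W^{(k)}}, \qquad b^{(k)} \leftarrow b^{(k)} - \eta\,\frac{\partial J}{\partial b^{(k)}}$$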

Note: the code may be easier to follow in the order below.
    Only the steps needed to understand the overall flow are commented; the optimization methods are skipped.
1. test_fot_NN: the main file, covering creation, training and testing of the network
2. nn_create: builds the neural-network structure
3. nn_train: trains the network, consisting of
(1) nn_forward: the forward pass
(2) nn_backpropagation: backpropagation
(3) nn_applygradient: the gradient-based parameter update
4. nn_test: evaluates the network, consisting of
(1) nn_predict: computes the output from the input

Three suggestions for training neural networks

(1) Normally the average value of the objective function on the training set (the cost) keeps decreasing as training goes on; if this quantity starts to increase, stop.

There are two possible situations: the model is not complex enough to fit the training set completely;

or the network is already well trained.

So stop and evaluate on held-out data to find out which case you are in, and then take the appropriate action.

(2) Hold out a validation set. The real goal of training is to obtain the highest recognition rate on the validation set, so we take the model with the highest validation accuracy as the final result.

(3) Pay attention to the learning rate: if the cost increases after only a few training steps, the learning rate is usually too high;

if the cost changes very little from step to step, the learning rate is too low.

Assorted practical experience for training neural networks

(1) A regularization term can be added to the objective function.

L(w,b) is the original objective function, and the term added after it is the regularization term.

λ is the weight-decay coefficient. The optimization then not only makes the original objective as small as possible, but also keeps the norm of w small; this prevents the overfitting caused by individual weights with very large absolute values and makes the system more robust.
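
In formula form (this matches the 0.5 * nn.weight_decay * cost2 term in the code, with λ = nn.weight_decay):

$$\tilde{L}(w,b) = L(w,b) + \frac{\lambda}{2}\sum_{k}\big\lVert W^{(k)}\big\rVert_F^{2}$$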

In the code, this is the corresponding part of nn_forward:

   if strcmp(nn.objective_function,'MSE')
            nn.cost(s) = 0.5 / m * sum(sum((nn.a{k} - batch_y).^2)) + 0.5 * nn.weight_decay * cost2;
        elseif strcmp(nn.objective_function,'Cross Entropy')
            nn.cost(s) = -0.5*sum(sum(batch_y.*log(nn.a{k})))/m + 0.5 * nn.weight_decay * cost2;

and in the backward pass, nn_backpropagation:

 % the second term comes from differentiating the regularization term: the coefficient nn.weight_decay times W
    nn.W_grad{nn.depth-1} = nn.theta{nn.depth}*nn.a{nn.depth-1}'/m + nn.weight_decay*nn.W{nn.depth-1};

(2) Normalizing the training data

This keeps every feature dimension within a comparable range; see the formula below.
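
Concretely, each feature dimension j is standardized using the mean and standard deviation computed on the training set only (avgX and sigma in the code); the same avgX and sigma are then reused for the validation and test sets:

$$\tilde{x}_{j} = \frac{x_{j} - \mu_{j}}{\sigma_{j}}$$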

 (3) Initializing the parameters w and b

In stochastic gradient descent the first step is to choose all (w,b) at random, but this can cause vanishing gradients.

Activation functions such as sigmoid and tanh have very small gradients where the input is large in absolute value, so if the initial values are large the gradients will be close to 0 and training will be very slow; we therefore want the initial values to fall near 0.

A simple and effective method is to initialize (w,b) uniformly at random from the interval [-1/sqrt(d), 1/sqrt(d)], where d is the number of neurons feeding into that layer (width in nn_create), as written out below.
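
This corresponds to the initialization actually used in nn_create, i.e. 2*rand(height,width)/sqrt(width) - 1/sqrt(width):

$$W_{ij},\; b_{i} \sim U\!\left[-\frac{1}{\sqrt{d}},\ \frac{1}{\sqrt{d}}\right]$$

(The one exception in the code is the ReLU case, where b is drawn from rand(height,1)+0.01, i.e. a small positive value.)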

nn_create

function nn = nn_create(varargin)% varargin plays the role of argv[] in C
% the struct nn stores all parameters and the choices of methods/settings
SIZE = varargin{1};
nn.keep_probability     =               1;
nn.size                 =               SIZE;
nn.depth                =               numel(SIZE);% numel returns the number of elements, i.e. the number of layers
nn.active_function      =               'sigmoid';
nn.output_function      =               'sigmoid';
nn.learning_rate        =               1.5;% default learning rate
nn.weight_decay         =               0;% coefficient of the regularization term, used to prevent overfitting
nn.cost                 =               [];
nn.encoder              =               0;% autoencoder flag; 0 in this example
nn.sparsity             =               0.03;% sparsity target for the sparse-autoencoder branch, can be ignored here
nn.beta                 =               3;% weight of the sparsity penalty in the objective, can be ignored here
nn.batch_normalization  =               0;
nn.grad_squared         =               0;
nn.r                    =               0;
nn.optimization_method  =               'normal';
nn.objective_function   =               'MSE';% default objective: mean squared error



for i = 2:length(varargin)% MATLAB arrays are 1-based; this loop parses the name-value pairs that follow the layer-size vector
    if strcmp('active function',varargin{i})
        nn.active_function = varargin{i+1};
    elseif strcmp('output function',varargin{i})
        nn.output_function = varargin{i+1};
    elseif strcmp('learning rate',varargin{i})
        nn.learning_rate = varargin{i+1};
    elseif strcmp('weight decay',varargin{i})
        nn.weight_decay = varargin{i+1};
    elseif strcmp('sparsity',varargin{i})
        nn.sparsity = varargin{i+1};
    elseif strcmp('beta',varargin{i})
        nn.beta = varargin{i+1};
    elseif strcmp('batch normalization',varargin{i})
        nn.batch_normalization = varargin{i+1};
    elseif strcmp('optimization method',varargin{i})
        nn.optimization_method = varargin{i+1};
    elseif strcmp('objective function', varargin{i})
        nn.objective_function = varargin{i+1};
    elseif strcmp('weight decay', varargin{i})
        nn.weight_decay = varargin{i+1};
    elseif strcmp('keep probability',varargin{i})
        nn.keep_probability = varargin{i+1};
    end;
end;

if strcmp(nn.objective_function, 'Cross Entropy')% with a cross-entropy objective, the output layer is forced to softmax
    nn.output_function = 'softmax';
end;

for k = 1 : nn.depth-1
    width = nn.size(k);
    height = nn.size(k+1);
    %nn.W{k} = (rand(height, width) - 0.5) * 2 * sqrt(6 / (height + width + 1)) - sqrt(6 / (height + width + 1));
    
    nn.W{k} = 2*rand(height, width)/sqrt(width)-1/sqrt(width);% initialize the weight matrix W uniformly in [-1/sqrt(width), 1/sqrt(width)]
    if abs(nn.keep_probability-1)>0.001
        nn.WMask{k} = ones(height,width);
    end;
    %nn.W{k} = 2*rand(height, width)-1;
    %Xavier initialization 
    if strcmp(nn.active_function, 'relu')
        nn.b{k} = rand(height,1)+0.01;
    else
        nn.b{k} = 2*rand(height, 1)/sqrt(width)-1/sqrt(width);% initialize the bias (threshold) b the same way
    end;
    
        
    %parameters for moments 
    if strcmp(nn.optimization_method,'Momentum')% the blocks below allocate state for the various optimization methods and can be skimmed
        nn.vW{k} = zeros(height,width);
        nn.vb{k} = zeros(height,1);
    end; 
    if strcmp(nn.optimization_method,'AdaGrad')  ||strcmp(nn.optimization_method,'RMSProp') || strcmp(nn.optimization_method,'Adam')
        nn.rW{k} = zeros(height,width);
        nn.rb{k} = zeros(height,1);
    end;
    if strcmp(nn.optimization_method,'Adam')
        nn.sW{k} = zeros(height,width);
        nn.sb{k} = zeros(height,1);
    end; 
       %parameters for batch normalization.
    if nn.batch_normalization
        nn.E{k} = zeros(height,1);
        nn.S{k} = zeros(height,1);
        nn.Gamma{k} = 1;
        nn.Beta{k} = 0;
        if  strcmp(nn.optimization_method,'Momentum')
            nn.vGamma{k} = 1;
            nn.vBeta{k} = 0;
        end;
        if strcmp(nn.optimization_method,'AdaGrad')  ||strcmp(nn.optimization_method,'RMSProp') || strcmp(nn.optimization_method,'Adam')
            nn.rW{k} = zeros(height,width);
            nn.rb{k} = zeros(height,1);
            nn.rGamma{k} = 0;
            nn.rBeta{k} = 0;
        end;
        if  strcmp(nn.optimization_method,'Adam')
            nn.sGamma{k} = 1;
            nn.sBeta{k} = 0;
        end;
    
        nn.vecNum = 0;
    end;
    nn.W_grad{k} = zeros(height,width);
end
if  strcmp(nn.optimization_method,'Adam')
    nn.AdamTime = 0;
end;
nn.W{k} = 2*rand(height, width)/sqrt(width)-1/sqrt(width);% initialize the weight matrix W uniformly in [-1/sqrt(width), 1/sqrt(width)]
    if abs(nn.keep_probability-1)>0.001
        nn.WMask{k} = ones(height,width);
    end;
    %nn.W{k} = 2*rand(height, width)-1;
    %Xavier initialization 
    if strcmp(nn.active_function, 'relu')
        nn.b{k} = rand(height,1)+0.01;
    else
        nn.b{k} = 2*rand(height, 1)/sqrt(width)-1/sqrt(width);% initialize the bias (threshold) b the same way

 Initialization of W and b.

(4)BATCH NORMALIZATION

 Basic idea: since we want the values arriving at each layer to stay near 0 so as to avoid vanishing gradients, why not directly normalize the values of every layer using their mean and variance?

We can normalize the input of each layer by subtracting the mean and dividing by the standard deviation, as sketched below.
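
As implemented in nn_forward (E and S are the running mean and standard deviation of each layer's pre-activations, ε = 0.0001 keeps the denominator away from zero, and γ = Gamma and β = Beta are learned per layer):

$$\hat{z} = \frac{z - E}{S + \epsilon}, \qquad y = \gamma\,\hat{z} + \beta$$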

(5) Parameter update strategies

The gradient components of (w,b) can differ greatly in absolute value; in some cases this forces the optimization path into a zigzag shape.

The AdaGrad method
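
A sketch of the AdaGrad update as it appears in nn_applygradient (g is the current gradient, r accumulates the squared gradients, and δ = 0.001 in the code):

$$r \leftarrow r + g^{2}, \qquad w \leftarrow w - \frac{\eta}{\sqrt{r} + \delta}\, g$$

Each component of w therefore gets its own effective step size, which counteracts the zigzag behaviour described above.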

 Problems with SGD

The way SGD estimates the gradient is too random (each step uses only a small random batch).

Introducing momentum (Momentum)

 Each update considers not only the direction of the current gradient but also the direction of the previous update; the two directions are combined with a manually chosen weighting to obtain the final update direction, as written below.
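
In the code this reads (with ρ = 0.1, g the current gradient, and v the velocity kept in nn.vW / nn.vb):

$$v \leftarrow \rho\, v - \eta\, g, \qquad w \leftarrow w + v$$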

The Adam algorithm combines AdaGrad and Momentum, so it addresses both problems at once; it also introduces a mechanism that gradually reduces the step size of the gradient search, so that steps are larger at the beginning and smaller later on. A sketch of the update follows.
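
As implemented in nn_applygradient (ρ1 = 0.9, ρ2 = 0.999, δ = 0.00001, and t = nn.AdamTime counts the updates):

$$s \leftarrow \rho_{1} s + (1-\rho_{1})\, g, \qquad r \leftarrow \rho_{2} r + (1-\rho_{2})\, g^{2}$$

$$\hat{s} = \frac{s}{1-\rho_{1}^{\,t}}, \qquad \hat{r} = \frac{r}{1-\rho_{2}^{\,t}}, \qquad w \leftarrow w - \eta\,\frac{\hat{s}}{\sqrt{\hat{r} + \delta}}$$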
