在前面的两篇文章中介绍了其使用的基本数据结构Vol和神经网络组件Layer。最后两个就是convnet_net.js和convnet_trainers.js。
2.6 convnet_net.js
在第二篇中曾经初步涉及过这里面的内容,主要是接受Layer类的array,形成一个健全的神经网络。以下是文件convnet_net.js的全部代码。其中中文注释为自己添加的,英文注释为作者添加的。
(function(global) {
  "use strict";
  var assert = global.assert;

  // Maps a layer `type` / `layer_type` string to the name of its constructor on `global`.
  // Only the names are stored: the constructor itself is looked up on `global` at call
  // time, exactly like the original `new global.XxxLayer(...)` expressions did.
  var LAYER_CLASS_NAMES = {
    fc: 'FullyConnLayer',
    lrn: 'LocalResponseNormalizationLayer',
    dropout: 'DropoutLayer',
    input: 'InputLayer',
    softmax: 'SoftmaxLayer',
    regression: 'RegressionLayer',
    conv: 'ConvLayer',
    pool: 'PoolLayer',
    relu: 'ReluLayer',
    sigmoid: 'SigmoidLayer',
    tanh: 'TanhLayer',
    maxout: 'MaxoutLayer',
    svm: 'SVMLayer'
  };

  // Returns the constructor registered for `type`, or null when `type` is not one of the
  // known layer type strings. The own-property check keeps inherited Object.prototype
  // keys such as 'constructor' from being mistaken for layer types.
  function lookupLayerClass(type) {
    if(typeof type !== 'string' || !Object.prototype.hasOwnProperty.call(LAYER_CLASS_NAMES, type)) {
      return null;
    }
    return global[LAYER_CLASS_NAMES[type]];
  }

  // Expands the user-supplied layer definitions into the full list of layers to build:
  //  - an 'fc' definition is inserted in front of every softmax / svm / regression layer,
  //  - fc / conv definitions without bias_pref get a default one (written onto the def),
  //  - an `activation` property becomes a separate activation layer definition,
  //  - a `drop_prob` property becomes a separate dropout layer definition.
  function desugar(defs) {
    var new_defs = [];
    for(var i=0;i<defs.length;i++) {
      var def = defs[i];

      if(def.type==='softmax' || def.type==='svm') {
        // add an fc layer here, there is no reason the user should
        // have to worry about this and we almost always want to
        new_defs.push({type:'fc', num_neurons: def.num_classes});
      }

      if(def.type==='regression') {
        // add an fc layer here, there is no reason the user should
        // have to worry about this and we almost always want to
        new_defs.push({type:'fc', num_neurons: def.num_neurons});
      }

      if((def.type==='fc' || def.type==='conv')
          && typeof(def.bias_pref) === 'undefined'){
        def.bias_pref = 0.0;
        if(typeof def.activation !== 'undefined' && def.activation === 'relu') {
          def.bias_pref = 0.1; // relus like a bit of positive bias to get gradients early
          // otherwise it's technically possible that a relu unit will never turn on (by chance)
          // and will never get any gradient and never contribute any computation. Dead relu.
        }
      }

      new_defs.push(def);

      if(typeof def.activation !== 'undefined') {
        if(def.activation==='relu') { new_defs.push({type:'relu'}); }
        else if (def.activation==='sigmoid') { new_defs.push({type:'sigmoid'}); }
        else if (def.activation==='tanh') { new_defs.push({type:'tanh'}); }
        else if (def.activation==='maxout') {
          // create maxout activation, and pass along group size, if provided.
          // The check must be on `typeof`: comparing the value itself against the
          // string 'undefined' is always true, so the default of 2 was never applied.
          var gs = typeof def.group_size !== 'undefined' ? def.group_size : 2;
          new_defs.push({type:'maxout', group_size:gs});
        }
        else { console.log('ERROR unsupported activation ' + def.activation); }
      }

      if(typeof def.drop_prob !== 'undefined' && def.type !== 'dropout') {
        new_defs.push({type:'dropout', drop_prob: def.drop_prob});
      }
    }
    return new_defs;
  }

  // Net manages a set of layers
  // For now constraints: Simple linear order of layers, first layer input last layer a cost layer
  var Net = function(options) {
    // Array of layer instances; makeLayers() / fromJSON() fill it in.
    this.layers = [];
  };

  Net.prototype = {

    // Takes a list of layer definitions and creates the network layer objects,
    // storing them in this.layers. The definitions are desugared first (see desugar).
    // Every definition after the first receives in_sx / in_sy / in_depth copied from the
    // out_sx / out_sy / out_depth of the layer built just before it.
    makeLayers: function(defs) {

      // few checks
      assert(defs.length >= 2, 'Error! At least one input layer and one loss layer are required.');
      assert(defs[0].type === 'input', 'Error! First layer must be the input layer, to declare size of inputs');

      // desugar layer_defs for adding activation, dropout layers etc
      defs = desugar(defs);

      // create the layers
      this.layers = [];
      for(var i=0;i<defs.length;i++) {
        var def = defs[i];
        if(i>0) {
          var prev = this.layers[i-1];
          def.in_sx = prev.out_sx;
          def.in_sy = prev.out_sy;
          def.in_depth = prev.out_depth;
        }

        var LayerClass = lookupLayerClass(def.type);
        if(LayerClass === null) {
          console.log('ERROR: UNRECOGNIZED LAYER TYPE: ' + def.type);
        } else {
          this.layers.push(new LayerClass(def));
        }
      }
    },

    // Forward prop the network: V is handed to the first layer's forward(), and each
    // layer's return value is passed as the input of the next layer's forward().
    // Returns whatever the last layer returns.
    // The trainer class passes is_training = true, but when this function is
    // called from outside (not from the trainer), it defaults to prediction mode
    forward: function(V, is_training) {
      if(typeof(is_training) === 'undefined') is_training = false;
      var act = this.layers[0].forward(V, is_training);
      for(var i=1;i<this.layers.length;i++) {
        act = this.layers[i].forward(act, is_training);
      }
      return act;
    },

    // Runs a forward pass on V in prediction mode and returns the value produced by the
    // last layer's backward(y), i.e. the loss for (V, y) under the current weights.
    getCostLoss: function(V, y) {
      this.forward(V, false);
      var N = this.layers.length;
      var loss = this.layers[N-1].backward(y);
      return loss;
    },

    // backprop: compute gradients wrt all parameters.
    // The work is delegated to each layer's own backward(), from last layer to first.
    // Returns the value returned by the last layer's backward(y).
    backward: function(y) {
      var N = this.layers.length;
      var loss = this.layers[N-1].backward(y); // last layer assumed to be loss layer
      for(var i=N-2;i>=0;i--) { // first layer assumed input
        this.layers[i].backward();
      }
      return loss;
    },

    // Accumulate parameters and gradients for the entire network: concatenates the
    // arrays returned by every layer's getParamsAndGrads(), in layer order.
    getParamsAndGrads: function() {
      var response = [];
      for(var i=0;i<this.layers.length;i++) {
        var layer_reponse = this.layers[i].getParamsAndGrads();
        for(var j=0;j<layer_reponse.length;j++) {
          response.push(layer_reponse[j]);
        }
      }
      return response;
    },

    // this is a convenience function for returning the argmax
    // prediction, assuming the last layer of the net is a softmax.
    // Reads the last layer's out_act.w; ties resolve to the lowest index.
    getPrediction: function() {
      var S = this.layers[this.layers.length-1];
      assert(S.layer_type === 'softmax', 'getPrediction function assumes softmax as last layer of the net!');

      var p = S.out_act.w;
      var maxv = p[0];
      var maxi = 0;
      for(var i=1;i<p.length;i++) {
        if(p[i] > maxv) { maxv = p[i]; maxi = i;}
      }
      return maxi; // return index of the class with highest class probability
    },

    // Returns {layers: [...]} holding each layer's toJSON() result, in layer order.
    toJSON: function() {
      var json = {};
      json.layers = [];
      for(var i=0;i<this.layers.length;i++) {
        json.layers.push(this.layers[i].toJSON());
      }
      return json;
    },

    // Rebuilds this.layers from an object shaped like the output of toJSON(): for every
    // entry a layer of the class named by entry.layer_type is constructed without
    // arguments and then populated through its own fromJSON(entry).
    // Throws an Error on an unrecognized layer_type. (Previously such an entry silently
    // re-used the layer object built for the preceding entry, or failed with a TypeError
    // when it was the first entry.) The new list is assigned only after every entry has
    // loaded, so a failed load leaves this.layers as it was.
    fromJSON: function(json) {
      var layers = [];
      for(var i=0;i<json.layers.length;i++) {
        var Lj = json.layers[i];
        var t = Lj.layer_type;
        var LayerClass = lookupLayerClass(t);
        if(LayerClass === null) {
          throw new Error('Net.fromJSON: unrecognized layer_type "' + t + '" at index ' + i);
        }
        var L = new LayerClass();
        L.fromJSON(Lj);
        layers.push(L);
      }
      this.layers = layers;
    }
  };

  global.Net = Net;
})(convnetjs);
2.7 convnet_trainers.js
最后一个文件是类Trainer的定义文件,在文件中仅仅有一个方法train()。输入是之前的神经网络,和一些训练时指定的参数。
以下为convnet_trainers.js文件的全部代码,中文为我自己加的注释。从下面的分析,可以知道作者这里的训练方式是BP算法。与此不同的是,一般的深度神经网络训练通常会先对网络进行无监督预训练,用无监督训练得到的权值初始化网络权值,然后再使用BP算法进行微调。
作者使用的默认训练方式是Stochastic Gradient Descent(SGD,随机梯度下降)。其梯度公式由标准的参数梯度、动量项以及L1、L2正则项组成。
(function(global) {
  "use strict";

  // Returns options[name], or `fallback` when that option is undefined.
  function optionOr(options, name, fallback) {
    return typeof options[name] !== 'undefined' ? options[name] : fallback;
  }

  // Trainer updates the parameters of `net` from the gradients its layers accumulate.
  // `options` (all optional): learning_rate, l1_decay, l2_decay, batch_size, method,
  // momentum, ro, eps, beta1, beta2. Defaults are listed below.
  var Trainer = function(net, options) {
    this.net = net;

    options = options || {};
    this.learning_rate = optionOr(options, 'learning_rate', 0.01);
    this.l1_decay = optionOr(options, 'l1_decay', 0.0);
    this.l2_decay = optionOr(options, 'l2_decay', 0.0);
    this.batch_size = optionOr(options, 'batch_size', 1);
    this.method = optionOr(options, 'method', 'sgd'); // sgd/adam/adagrad/adadelta/windowgrad/nesterov

    this.momentum = optionOr(options, 'momentum', 0.9);
    this.ro = optionOr(options, 'ro', 0.95); // used in adadelta
    this.eps = optionOr(options, 'eps', 1e-8); // used in adam or adadelta
    this.beta1 = optionOr(options, 'beta1', 0.9); // used in adam
    this.beta2 = optionOr(options, 'beta2', 0.999); // used in adam

    this.k = 0; // iteration counter (number of train() calls so far)
    this.gsum = []; // last iteration gradients (used for momentum calculations)
    this.xsum = []; // used in adam or adadelta

    // check if regression is expected
    this.regression = this.net.layers[this.net.layers.length - 1].layer_type === "regression";
  };

  Trainer.prototype = {
    // Runs one forward and one backward pass of the net on the example (x, y).
    // Every batch_size-th call additionally applies a parameter update, using the
    // configured method, to everything returned by net.getParamsAndGrads(), and then
    // zeroes the gradients.
    // Returns {fwd_time, bwd_time, l2_decay_loss, l1_decay_loss, cost_loss,
    // softmax_loss, loss}. Times are in milliseconds; the decay losses are only non-zero
    // on calls that perform an update.
    train: function(x, y) {

      var start = new Date().getTime();
      this.net.forward(x, true); // also set the flag that lets the net know we're just training
      var fwd_time = new Date().getTime() - start;

      start = new Date().getTime();
      var cost_loss = this.net.backward(y);
      var bwd_time = new Date().getTime() - start;

      var l2_decay_loss = 0.0;
      var l1_decay_loss = 0.0;

      if(this.regression && y.constructor !== Array)
        console.log("Warning: a regression net requires an array as training output vector.");

      this.k++;
      if(this.k % this.batch_size === 0) {

        // parameters and gradients of every layer of the net
        var pglist = this.net.getParamsAndGrads();

        // initialize lists for accumulators. Will only be done once on first iteration
        if(this.gsum.length === 0 && (this.method !== 'sgd' || this.momentum > 0.0)) {
          // only vanilla sgd doesnt need either lists
          // momentum needs gsum
          // adagrad needs gsum
          // adam and adadelta needs gsum and xsum
          for(var i=0;i<pglist.length;i++) {
            // accumulator with the same length as the parameter vector
            this.gsum.push(global.zeros(pglist[i].params.length));
            if(this.method === 'adam' || this.method === 'adadelta') {
              this.xsum.push(global.zeros(pglist[i].params.length));
            } else {
              this.xsum.push([]); // conserve memory
            }
          }
        }

        // Adam bias correction: the moment estimates are DIVIDED by (1 - beta^t), where
        // t counts parameter updates. this.k counts examples, so divide by batch_size
        // (an integer here, because updates only happen when k % batch_size === 0).
        var update_step = this.k / this.batch_size;
        var adam_corr1 = 1 - Math.pow(this.beta1, update_step);
        var adam_corr2 = 1 - Math.pow(this.beta2, update_step);

        // perform an update for all sets of weights
        for(var i=0;i<pglist.length;i++) {
          var pg = pglist[i]; // param, gradient, other options in future (custom learning rate etc)
          var p = pg.params; // the weights being trained
          var g = pg.grads; // the gradients accumulated for those weights

          // per-parameter-set multipliers for the regularization strengths
          var l2_decay_mul = typeof pg.l2_decay_mul !== 'undefined' ? pg.l2_decay_mul : 1.0;
          var l1_decay_mul = typeof pg.l1_decay_mul !== 'undefined' ? pg.l1_decay_mul : 1.0;
          var l2_decay = this.l2_decay * l2_decay_mul;
          var l1_decay = this.l1_decay * l1_decay_mul;

          // undefined for vanilla sgd (no momentum), which never reads them
          var gsumi = this.gsum[i];
          var xsumi = this.xsum[i];

          var plen = p.length;
          for(var j=0;j<plen;j++) {
            // accumulate the L2 and L1 weight decay losses
            l2_decay_loss += l2_decay*p[j]*p[j]/2;
            l1_decay_loss += l1_decay*Math.abs(p[j]);
            // gradients of the two regularization terms
            var l1grad = l1_decay * (p[j] > 0 ? 1 : -1);
            var l2grad = l2_decay * (p[j]);

            var gij = (l2grad + l1grad + g[j]) / this.batch_size; // raw batch gradient

            var dx; // step applied to p[j] by the selected method
            if(this.method === 'adam') {
              // adam update
              gsumi[j] = gsumi[j] * this.beta1 + (1- this.beta1) * gij; // update biased first moment estimate
              xsumi[j] = xsumi[j] * this.beta2 + (1-this.beta2) * gij * gij; // update biased second moment estimate
              var biasCorr1 = gsumi[j] / adam_corr1; // correct bias first moment estimate
              var biasCorr2 = xsumi[j] / adam_corr2; // correct bias second moment estimate
              dx = - this.learning_rate * biasCorr1 / (Math.sqrt(biasCorr2) + this.eps);
            } else if(this.method === 'adagrad') {
              // adagrad update
              gsumi[j] = gsumi[j] + gij * gij;
              dx = - this.learning_rate / Math.sqrt(gsumi[j] + this.eps) * gij;
            } else if(this.method === 'windowgrad') {
              // this is adagrad but with a moving window weighted average
              // so the gradient is not accumulated over the entire history of the run.
              // it's also referred to as Idea #1 in Zeiler paper on Adadelta. Seems reasonable to me!
              gsumi[j] = this.ro * gsumi[j] + (1-this.ro) * gij * gij;
              dx = - this.learning_rate / Math.sqrt(gsumi[j] + this.eps) * gij; // eps added for better conditioning
            } else if(this.method === 'adadelta') {
              gsumi[j] = this.ro * gsumi[j] + (1-this.ro) * gij * gij;
              dx = - Math.sqrt((xsumi[j] + this.eps)/(gsumi[j] + this.eps)) * gij;
              xsumi[j] = this.ro * xsumi[j] + (1-this.ro) * dx * dx; // yes, xsum lags behind gsum by 1.
            } else if(this.method === 'nesterov') {
              var prev_gsum = gsumi[j];
              gsumi[j] = gsumi[j] * this.momentum + this.learning_rate * gij;
              dx = this.momentum * prev_gsum - (1.0 + this.momentum) * gsumi[j];
            } else {
              // assume SGD
              if(this.momentum > 0.0) {
                // momentum update
                dx = this.momentum * gsumi[j] - this.learning_rate * gij; // step
                gsumi[j] = dx; // back this up for next iteration of momentum
              } else {
                // vanilla sgd
                dx = - this.learning_rate * gij;
              }
            }
            p[j] += dx;

            g[j] = 0.0; // zero out gradient so that we can begin accumulating anew
          }
        }
      }

      // appending softmax_loss for backwards compatibility, but from now on we will always use cost_loss
      // in future, TODO: have to completely redo the way loss is done around the network as currently
      // loss is a bit of a hack. Ideally, user should specify arbitrary number of loss functions on any layer
      // and it should all be computed correctly and automatically.
      return {fwd_time: fwd_time, bwd_time: bwd_time,
              l2_decay_loss: l2_decay_loss, l1_decay_loss: l1_decay_loss,
              cost_loss: cost_loss, softmax_loss: cost_loss,
              loss: cost_loss + l1_decay_loss + l2_decay_loss};
    }
  };

  global.Trainer = Trainer;
  global.SGDTrainer = Trainer; // backwards compatibility
})(convnetjs);
截止到目前为止,作者使用的基本API功能已经全部分析完了。在src中还有一个js文件convnet_magicnet.js。其中定义了magicnet. 作者对其的描述是:
The MagicNet class performs fully-automatic prediction on your data. You don't have to worry about anything except providing your data and letting it train for a while. Internally, the MagicNet tries out many different types of networks, performs n-fold cross-validations of network hyper-parameters across folds of your data, and creates a final classifier ensemble by model averaging the best architectures.
从中可以看出这是对以上基本API的一个二次封装,用户使用的时候不需要担心内部的模型选择。这个magicnet.js会在后续解析ConvnetJS应用的文章中进行详细说明。