// NOTE
// Update the next available ID when you add a new SolverParameter field.
//
// SolverParameter next available ID: 44 (last added: plateau_winsize)
message SolverParameter {
//
// Specifying the train and test networks
//
// Exactly one train net must be specified using one of the following fields:
//     train_net_param, train_net, net_param, net
// One or more test nets may be specified using any of the following fields:
//     test_net_param, test_net, net_param, net
// If more than one test net field is specified (e.g., both net and
// test_net are specified), they will be evaluated in the field order given
// above: (1) test_net_param, (2) test_net, (3) net_param/net.
// A test_iter must be specified for each test_net.
// A test_level and/or a test_stage may also be specified for each test_net.
//
// Network parameters and how they are evaluated.
// Proto filename for the train net, possibly combined with one or more
// test nets.
optional string net = 24;
// Inline train net param, possibly combined with one or more test nets.
optional NetParameter net_param = 25;
optional string train_net = 1; // Proto filename for the train net.
repeated string test_net = 2; // Proto filenames for the test nets.
optional NetParameter train_net_param = 21; // Inline train net params.
repeated NetParameter test_net_param = 22; // Inline test net params.
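// A minimal example of the solver prototxt these fields accept (hypothetical
// file name and values, shown only as a sketch of the configuration format):
//   net: "examples/mnist/lenet_train_test.prototxt"
//   test_iter: 100      # see test_iter below
//   test_interval: 500  # see test_interval below
// Here a single net file provides both the train net and the test net(s).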
// The states for the train/test nets. Must be unspecified or
// specified once per net.
//
// By default, all states will have solver = true;
// train_state will have phase = TRAIN,
// and all test_state's will have phase = TEST.
// Other defaults are set according to the NetState defaults.
optional NetState train_state = 26;
repeated NetState test_state = 27;
// Evaluation type.
optional string eval_type = 41 [default = "classification"];
// ap_version: different ways of computing Average Precision.
// Check https://sanchom.wordpress.com/tag/average-precision/ for details.
//   11point: the 11-point interpolated average precision. Used in VOC2007.
//   MaxIntegral: maximally interpolated AP. Used in VOC2012/ILSVRC.
//   Integral: the natural integral of the precision-recall curve.
optional string ap_version = 42 [default = "Integral"];
// If true, display per-class results.
optional bool show_per_class_result = 44 [default = false];
// The number of iterations for each test net.
repeated int32 test_iter = 3;
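// Example (hypothetical values): with 10,000 test images and a test-net batch
// size of 100, set test_iter: 100 so that one testing phase covers the whole
// test set (test_iter * batch_size = 10,000).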
// The number of iterations between two testing phases.
optional int32 test_interval = 4 [default = 0];
optional bool test_compute_loss = 19 [default = false];
// If true, run an initial test pass before the first iteration,
// ensuring memory availability and printing the starting value of the loss.
optional bool test_initialization = 32 [default = true];
optional float base_lr = 5; // The base learning rate (used together with the decay policy below).
// The number of iterations between displaying info. If display = 0, no info
// will be displayed.
optional int32 display = 6;
// Display the loss averaged over the last average_loss iterations.
optional int32 average_loss = 33 [default = 1];
optional int32 max_iter = 7; // The maximum number of iterations.
// Accumulate gradients over `iter_size` x `batch_size` instances.
optional int32 iter_size = 36 [default = 1];
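// Example (hypothetical values): with batch_size: 32 in the net and
// iter_size: 4, gradients are accumulated over 4 forward/backward passes, so
// each solver update effectively uses 32 * 4 = 128 instances.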
// Learning-rate decay.
// The learning rate decay policy. The currently implemented learning rate
// policies are as follows:
//   - fixed: always return base_lr.
//   - step: return base_lr * gamma ^ (floor(iter / stepsize))
//   - exp: return base_lr * gamma ^ iter
//   - inv: return base_lr * (1 + gamma * iter) ^ (-power)
//   - multistep: similar to step, but allows non-uniform steps defined by
//     stepvalue
//   - poly: the effective learning rate follows a polynomial decay, reaching
//     zero by max_iter: return base_lr * (1 - iter/max_iter) ^ power
//   - sigmoid: the effective learning rate follows a sigmoid decay:
//     return base_lr * (1 / (1 + exp(-gamma * (iter - stepsize))))
//   - plateau: decreases the learning rate
//     if the minimum loss isn't updated for plateau_winsize iters
//
// where base_lr, max_iter, gamma, stepsize, stepvalue and power are defined
// in the solver parameter protocol buffer, and iter is the current iteration.
optional string lr_policy = 8;
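// Worked example for the "step" policy (hypothetical values): with
// base_lr = 0.01, gamma = 0.1, and stepsize = 100000, the learning rate is
// 0.01 for iterations [0, 100000), 0.001 for [100000, 200000), 0.0001 for
// [200000, 300000), and so on, since lr = base_lr * gamma ^ floor(iter / stepsize).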
optional float gamma = 9; // The parameter to compute the learning rate.
optional float power = 10; // The parameter to compute the learning rate.
optional float momentum = 11; // The momentum value.
optional float weight_decay = 12; // The weight decay.
// Regularization types supported: L1 and L2,
// controlled by weight_decay.
optional string regularization_type = 29 [default = "L2"];
// The stepsize for learning rate policy "step".
optional int32 stepsize = 13;
// The step values for learning rate policy "multistep".
repeated int32 stepvalue = 34;
// The window size (in iterations) for learning rate policy "plateau".
repeated int32 plateau_winsize = 43;
// Set clip_gradients to >= 0 to clip parameter gradients to that L2 norm,
// whenever their actual L2 norm is larger.
optional float clip_gradients = 35 [default = -1];
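// Example (hypothetical values): with clip_gradients = 10 and a global
// gradient L2 norm of 25, all gradients are scaled by 10 / 25 = 0.4 before
// the update; if the norm is already <= 10, they are left unchanged.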
// Snapshot-related settings.
optional int32 snapshot = 14 [default = 0]; // The snapshot interval
optional string snapshot_prefix = 15; // The prefix for the snapshot.
// Whether to snapshot diff in the results or not. Snapshotting diff will help
// debugging, but the final protocol buffer size will be much larger.
optional bool snapshot_diff = 16 [default = false];
enum SnapshotFormat {
HDF5 = 0;
BINARYPROTO = 1;
}
optional SnapshotFormat snapshot_format = 37 [default = BINARYPROTO];
// The mode the solver will use: 0 for CPU and 1 for GPU. GPU is the default.
enum SolverMode {
CPU = 0;
GPU = 1;
}
// The optimization mode (CPU or GPU).
optional SolverMode solver_mode = 17 [default = GPU];
// The device_id that will be used in GPU mode. Defaults to device_id = 0.
optional int32 device_id = 18 [default = 0];
// If non-negative, the seed with which the Solver will initialize the Caffe
// random number generator -- useful for reproducible results. Otherwise
// (and by default), initialize using a seed derived from the system clock.
optional int64 random_seed = 20 [default = -1];
// Type of the solver.
optional string type = 40 [default = "SGD"];
// Numerical stability term for RMSProp, AdaGrad, AdaDelta, and Adam.
optional float delta = 31 [default = 1e-8];
// Parameters for the Adam solver.
optional float momentum2 = 39 [default = 0.999];
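// For reference, the Adam update these parameters feed into (momentum acts as
// beta1, momentum2 as beta2, and delta as epsilon in the Adam paper):
//   m(t) = beta1 * m(t-1) + (1 - beta1) * gradient(t)
//   v(t) = beta2 * v(t-1) + (1 - beta2) * gradient(t)^2
//   update(t) = lr * m(t) / (sqrt(v(t)) + epsilon)   (plus bias correction)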
// RMSProp decay value:
// MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t)
optional float rms_decay = 38 [default = 0.99];
// If true, print information about the state of the net that may help with
// debugging learning problems.
optional bool debug_info = 23 [default = false];
// If false, don't save a snapshot after training finishes.
optional bool snapshot_after_train = 28 [default = true];
// DEPRECATED: old solver enum types, use string instead
enum SolverType {
SGD = 0;
NESTEROV = 1;
ADAGRAD = 2;
RMSPROP = 3;
ADADELTA = 4;
ADAM = 5;
}
// DEPRECATED: use type instead of solver_type
optional SolverType solver_type = 30 [default = SGD];
}
Addendum: notes on SGD

Stochastic gradient descent: the "stochastic" here actually refers to randomness in the samples, i.e., gradient descent computed over mini-batches. The parameter update is: new parameter = old parameter - learning rate * gradient.

Momentum makes SGD-based deep learning more stable and faster. According to the paper "ImageNet Classification with Deep Convolutional Neural Networks", each update retains a fraction of the previous update, which indeed has a stabilizing effect; why it is also faster, I still don't understand.

The same paper also discusses the role of weight_decay; see Section 5 of the paper. Online discussions mostly describe it as some kind of penalty / regularization term.
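
To make the update rule above concrete, here is a minimal NumPy sketch of SGD with momentum and L2 weight decay (an illustration of the Caffe-style rule, not Caffe's actual code; the function and variable names are hypothetical):

    import numpy as np

    def sgd_momentum_step(w, grad, history, lr=0.01, momentum=0.9, weight_decay=0.0005):
        # L2 weight decay is folded into the gradient, matching the
        # "penalty / regularization term" reading mentioned above.
        g = grad + weight_decay * w
        # The update keeps a fraction (momentum) of the previous update,
        # which is what gives the stabilizing effect.
        history = momentum * history - lr * g
        # new parameter = old parameter + update
        return w + history, history

    # Usage: keep one zero-initialized history buffer per parameter array.
    w = np.random.randn(10)
    hist = np.zeros_like(w)
    w, hist = sgd_momentum_step(w, np.random.randn(10), hist)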