1. First, an example from the Caffe Model Zoo
cd into the Caffe root directory and fetch the ImageNet metadata via the data\ilsvrc12\get_ilsvrc_aux.sh script. Windows users can instead download the package directly from the URL inside that script, http://dl.caffe.berkeleyvision.org/caffe_ilsvrc12.tar.gz, unpack it, and place imagenet_mean.binaryproto and synset_words.txt under data\ilsvrc12\.
Download the CaffeNet model from http://dl.caffe.berkeleyvision.org/bvlc_reference_caffenet.caffemodel and put the file in "D:\VS2012\Projects\caffe-windows\models\bvlc_reference_caffenet".
Run the command:
D:\VS2012\Projects\caffe-windows>Build\x64\Release\classification.exe models\bvlc_reference_caffenet\deploy.prototxt models\bvlc_reference_caffenet\bvlc_reference_caffenet.caffemodel data\ilsvrc12\imagenet_mean.binaryproto data\ilsvrc12\synset_words.txt "examples\images\cat.jpg"
which prints:
---------- Prediction for examples\images\cat.jpg ----------
0.3134 - "n02123045 tabby, tabby cat"
0.2380 - "n02123159 tiger cat"
0.1235 - "n02124075 Egyptian cat"
0.1003 - "n02119022 red fox, Vulpes vulpes"
0.0715 - "n02127052 lynx, catamount"
Looking at main() in classification.cpp shows that the command takes five arguments (argc is checked against 6 because argv[0] is the program name):
int main(int argc, char** argv) {
  if (argc != 6) {
    std::cerr << "Usage: " << argv[0]
              << " deploy.prototxt network.caffemodel"
              << " mean.binaryproto labels.txt img.jpg" << std::endl;
    return 1;
  }

  ::google::InitGoogleLogging(argv[0]);

  string model_file   = argv[1];
  string trained_file = argv[2];
  string mean_file    = argv[3];
  string label_file   = argv[4];
  Classifier classifier(model_file, trained_file, mean_file, label_file);

  string file = argv[5];

  std::cout << "---------- Prediction for "
            << file << " ----------" << std::endl;
  …
}
2. What is a solver?
The solver is responsible for optimizing the model; its one performance metric is driving the loss function toward the (ideally global) minimum.
The solver:
- scaffolds the bookkeeping of the optimization process, and creates the training network (for learning) and the test network(s) (for evaluating progress)
- iteratively optimizes the model by calling Forward, then Backward, then updating the weights
- periodically evaluates the test network(s)
- snapshots the model and the solver state throughout optimization
To move the weights from their initial state toward a better model, the solver does the following in each iteration:
- calls the net's forward pass to compute the output and the loss
- calls the net's backward pass to compute the gradients
- converts the gradients into a weight update, according to the solver method
- updates the solver state according to the learning rate, the update history, and the chosen method
When configuring a solver, the learning-rate parameters mainly concern the convolutional and fully connected layers (collectively, the weight layers), each of which fine-tunes on top of the base rate.
The learning rate is the weight given to the negative gradient, while momentum is the weight given to the history of previous updates. Intuitively, the learning rate controls how much new knowledge is absorbed today, and momentum controls how much of what was learned earlier is still remembered.
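In formula form (this is the standard momentum update, as described in Caffe's SGD solver documentation, with learning rate α and momentum μ):

V_{t+1} = μ * V_t - α * ∇L(W_t)
W_{t+1} = W_t + V_{t+1}

α weights the fresh negative gradient and μ weights the accumulated history V_t, exactly matching the analogy above.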
3. Solver implementation
Start with the SolverParameter message in caffe.proto:
message SolverParameter {
  // Specifying the train and test networks
  //
  // Exactly one train net must be specified using one of the following fields:
  //     train_net_param, train_net, net_param, net
  // One or more test nets may be specified using any of the following fields:
  //     test_net_param, test_net, net_param, net
  // If more than one test net field is specified (e.g., both net and
  // test_net are specified), they will be evaluated in the field order given
  // above: (1) test_net_param, (2) test_net, (3) net_param/net.
  // A test_iter must be specified for each test_net.
  // A test_level and/or a test_stage may also be specified for each test_net.

  // Proto filename for the train net, possibly combined with one or more
  // test nets.
  optional string net = 24;
  // Inline train net param, possibly combined with one or more test nets.
  optional NetParameter net_param = 25;

  optional string train_net = 1; // Proto filename for the train net.
  repeated string test_net = 2; // Proto filenames for the test nets.
  optional NetParameter train_net_param = 21; // Inline train net params.
  repeated NetParameter test_net_param = 22; // Inline test net params.

  // The states for the train/test nets. Must be unspecified or
  // specified once per net.
  //
  // By default, all states will have solver = true;
  // train_state will have phase = TRAIN,
  // and all test_state's will have phase = TEST.
  // Other defaults are set according to the NetState defaults.
  optional NetState train_state = 26;
  repeated NetState test_state = 27;

  // The number of iterations for each test net.
  repeated int32 test_iter = 3;

  // The number of iterations between two testing phases.
  optional int32 test_interval = 4 [default = 0];
  optional bool test_compute_loss = 19 [default = false];
  // If true, run an initial test pass before the first iteration,
  // ensuring memory availability and printing the starting value of the loss.
  optional bool test_initialization = 32 [default = true];
  optional float base_lr = 5; // The base learning rate
  // the number of iterations between displaying info. If display = 0, no info
  // will be displayed.
  optional int32 display = 6;
  // Display the loss averaged over the last average_loss iterations
  optional int32 average_loss = 33 [default = 1];
  optional int32 max_iter = 7; // the maximum number of iterations
  // accumulate gradients over `iter_size` x `batch_size` instances
  optional int32 iter_size = 36 [default = 1];

  // The learning rate decay policy. The currently implemented learning rate
  // policies are as follows:
  //    - fixed: always return base_lr.
  //    - step: return base_lr * gamma ^ (floor(iter / step))
  //    - exp: return base_lr * gamma ^ iter
  //    - inv: return base_lr * (1 + gamma * iter) ^ (- power)
  //    - multistep: similar to step but it allows non uniform steps defined by
  //      stepvalue
  //    - poly: the effective learning rate follows a polynomial decay, to be
  //      zero by the max_iter. return base_lr (1 - iter/max_iter) ^ (power)
  //    - sigmoid: the effective learning rate follows a sigmod decay
  //      return base_lr ( 1/(1 + exp(-gamma * (iter - stepsize))))
  //
  // where base_lr, max_iter, gamma, step, stepvalue and power are defined
  // in the solver parameter protocol buffer, and iter is the current iteration.
  optional string lr_policy = 8;
  optional float gamma = 9; // The parameter to compute the learning rate.
  optional float power = 10; // The parameter to compute the learning rate.
  optional float momentum = 11; // The momentum value.
  optional float weight_decay = 12; // The weight decay.
  // regularization types supported: L1 and L2
  // controlled by weight_decay
  optional string regularization_type = 29 [default = "L2"];
  // the stepsize for learning rate policy "step"
  optional int32 stepsize = 13;
  // the stepsize for learning rate policy "multistep"
  repeated int32 stepvalue = 34;

  // Set clip_gradients to >= 0 to clip parameter gradients to that L2 norm,
  // whenever their actual L2 norm is larger.
  optional float clip_gradients = 35 [default = -1];

  optional int32 snapshot = 14 [default = 0]; // The snapshot interval
  optional string snapshot_prefix = 15; // The prefix for the snapshot.
  // whether to snapshot diff in the results or not. Snapshotting diff will help
  // debugging but the final protocol buffer size will be much larger.
  optional bool snapshot_diff = 16 [default = false];
  enum SnapshotFormat {
    HDF5 = 0;
    BINARYPROTO = 1;
  }
  optional SnapshotFormat snapshot_format = 37 [default = BINARYPROTO];
  // the mode solver will use: 0 for CPU and 1 for GPU. Use GPU in default.
  enum SolverMode {
    CPU = 0;
    GPU = 1;
  }
  optional SolverMode solver_mode = 17 [default = GPU];
  // the device_id will that be used in GPU mode. Use device_id = 0 in default.
  optional int32 device_id = 18 [default = 0];
  // If non-negative, the seed with which the Solver will initialize the Caffe
  // random number generator -- useful for reproducible results. Otherwise,
  // (and by default) initialize using a seed derived from the system clock.
  optional int64 random_seed = 20 [default = -1];

  // type of the solver
  optional string type = 40 [default = "SGD"];

  // numerical stability for RMSProp, AdaGrad and AdaDelta and Adam
  optional float delta = 31 [default = 1e-8];
  // parameters for the Adam solver
  optional float momentum2 = 39 [default = 0.999];

  // RMSProp decay value
  // MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t)
  optional float rms_decay = 38 [default = 0.99];

  // If true, print information about the state of the net that may help with
  // debugging learning problems.
  optional bool debug_info = 23 [default = false];

  // If false, don't save a snapshot after training finishes.
  optional bool snapshot_after_train = 28 [default = true];

  // DEPRECATED: old solver enum types, use string instead
  enum SolverType {
    SGD = 0;
    NESTEROV = 1;
    ADAGRAD = 2;
    RMSPROP = 3;
    ADADELTA = 4;
    ADAM = 5;
  }
  // DEPRECATED: use type instead of solver_type
  optional SolverType solver_type = 30 [default = SGD];
}
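On the C++ side, a solver definition file is parsed into this message using protobuf's text format, which is what the Solver constructor does internally. A minimal sketch using Caffe's I/O helper (the function name LoadSolverParam is mine, for illustration):

#include "caffe/proto/caffe.pb.h"
#include "caffe/util/io.hpp"

// Parse a solver definition written in protobuf text format.
caffe::SolverParameter LoadSolverParam(const std::string& path) {
  caffe::SolverParameter solver_param;
  // Caffe's wrapper around google::protobuf::TextFormat; aborts on failure.
  caffe::ReadProtoFromTextFileOrDie(path, &solver_param);
  return solver_param;
}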
Next, an example solver.prototxt (comments translated inline):
net: "models/bvlc_reference_caffenet/train_val.prototxt" test_iter: 1000 test_interval: 1000 base_lr: 0.01 //基准学习速率为0.01,每个layer会在基准上进行细调 lr_policy: "step" //学习速率衰减策略,step为步进方式,即每进行step次迭代,学习速率更新一次 gamma: 0.1 //学习速率衰减常数,每次更新学习速率都是乘上这个固定常数 stepsize: 100000 //每10万次迭代,对学习速率进行一次更新 display: 20 //每次打印信息的间隔,单位为迭代次数 max_iter: 450000 //训练总共需要45万次迭代 momentum: 0.9 //遗忘因子 weight_decay: 0.0005 //权值衰减常数 snapshot: 10000 //打快照的间隔,单位为迭代次数 snapshot_prefix: "models/bvlc_reference_caffenet/caffenet_train" solver_mode: GPU
Look at include/caffe/solver.hpp:
template <typename Dtype>
class Solver {
 public:
  explicit Solver(const SolverParameter& param,
      const Solver* root_solver = NULL);
  explicit Solver(const string& param_file, const Solver* root_solver = NULL);
  void Init(const SolverParameter& param);
  void InitTrainNet();
  void InitTestNets();

  // Client of the Solver optionally may call this in order to set the function
  // that the solver uses to see what action it should take (e.g. snapshot or
  // exit training early).
  void SetActionFunction(ActionCallback func);
  SolverAction::Enum GetRequestedAction();

  // The main entry of the solver function. In default, iter will be zero. Pass
  // in a non-zero iter number to resume training for a pre-trained net.
  virtual void Solve(const char* resume_file = NULL);
  inline void Solve(const string resume_file) { Solve(resume_file.c_str()); }
  void Step(int iters);
  // The Restore method simply dispatches to one of the
  // RestoreSolverStateFrom___ protected methods. You should implement these
  // methods to restore the state from the appropriate snapshot type.
  void Restore(const char* resume_file);
  // The Solver::Snapshot function implements the basic snapshotting utility
  // that stores the learned net. You should implement the SnapshotSolverState()
  // function that produces a SolverState protocol buffer that needs to be
  // written to disk together with the learned net.
  void Snapshot();
  virtual ~Solver() {}
  inline const SolverParameter& param() const { return param_; }
  inline shared_ptr<Net<Dtype> > net() { return net_; }
  inline const vector<shared_ptr<Net<Dtype> > >& test_nets() {
    return test_nets_;
  }
  int iter() { return iter_; }

  // Invoked at specific points during an iteration
  class Callback {
   protected:
    virtual void on_start() = 0;
    virtual void on_gradients_ready() = 0;

    template <typename T>
    friend class Solver;
  };
  const vector<Callback*>& callbacks() const { return callbacks_; }
  void add_callback(Callback* value) { callbacks_.push_back(value); }

  void CheckSnapshotWritePermissions();
  /**
   * @brief Returns the solver type.
   */
  virtual inline const char* type() const { return ""; }

 protected:
  // Make and apply the update value for the current iteration.
  virtual void ApplyUpdate() = 0;
  string SnapshotFilename(const string extension);
  string SnapshotToBinaryProto();
  string SnapshotToHDF5();
  // The test routine
  void TestAll();
  void Test(const int test_net_id = 0);
  virtual void SnapshotSolverState(const string& model_filename) = 0;
  virtual void RestoreSolverStateFromHDF5(const string& state_file) = 0;
  virtual void RestoreSolverStateFromBinaryProto(const string& state_file) = 0;
  void DisplayOutputBlobs(const int net_id);
  void UpdateSmoothedLoss(Dtype loss, int start_iter, int average_loss);

  SolverParameter param_;
  int iter_;
  int current_step_;
  shared_ptr<Net<Dtype> > net_;
  vector<shared_ptr<Net<Dtype> > > test_nets_;
  vector<Callback*> callbacks_;
  vector<Dtype> losses_;
  Dtype smoothed_loss_;

  // The root solver that holds root nets (actually containing shared layers)
  // in data parallelism
  const Solver* const root_solver_;

  // A function that can be set by a client of the Solver to provide indication
  // that it wants a snapshot saved and/or to exit early.
  ActionCallback action_request_function_;

  // True iff a request to stop early was received.
  bool requested_early_exit_;

  DISABLE_COPY_AND_ASSIGN(Solver);
};
The SGD solver derives from Solver; see sgd_solver.hpp. Its siblings include RMSPropSolver, AdaDeltaSolver, AdamSolver, and so on.
template <typename Dtype>
class SGDSolver : public Solver<Dtype> {
 public:
  explicit SGDSolver(const SolverParameter& param)
      : Solver<Dtype>(param) { PreSolve(); }
  explicit SGDSolver(const string& param_file)
      : Solver<Dtype>(param_file) { PreSolve(); }
  virtual inline const char* type() const { return "SGD"; }

  const vector<shared_ptr<Blob<Dtype> > >& history() { return history_; }

 protected:
  void PreSolve();
  Dtype GetLearningRate();
  virtual void ApplyUpdate();
  virtual void Normalize(int param_id);
  virtual void Regularize(int param_id);
  virtual void ComputeUpdateValue(int param_id, Dtype rate);
  virtual void ClipGradients();
  virtual void SnapshotSolverState(const string& model_filename);
  virtual void SnapshotSolverStateToBinaryProto(const string& model_filename);
  virtual void SnapshotSolverStateToHDF5(const string& model_filename);
  virtual void RestoreSolverStateFromHDF5(const string& state_file);
  virtual void RestoreSolverStateFromBinaryProto(const string& state_file);
  // history maintains the historical momentum data.
  // update maintains update related data and is not needed in snapshots.
  // temp maintains other information that might be needed in computation
  //   of gradients/updates and is not needed in snapshots
  vector<shared_ptr<Blob<Dtype> > > history_, update_, temp_;

  DISABLE_COPY_AND_ASSIGN(SGDSolver);
};
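Putting these together, driving training from your own C++ code amounts to constructing a solver and calling Solve(). A minimal sketch (the solver.prototxt path is a placeholder, and error handling is kept to a minimum):

#include "caffe/caffe.hpp"  // pulls in Solver, SGDSolver, Caffe, etc.

int main(int argc, char** argv) {
  ::google::InitGoogleLogging(argv[0]);
  caffe::Caffe::set_mode(caffe::Caffe::GPU);  // or caffe::Caffe::CPU

  // The constructor parses the prototxt and builds the train/test nets.
  caffe::SGDSolver<float> solver("models/bvlc_reference_caffenet/solver.prototxt");

  // Run the optimization loop (Solve -> Step -> ForwardBackward/ApplyUpdate).
  // Pass a .solverstate file instead of the default NULL to resume from a snapshot.
  solver.Solve();
  return 0;
}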
Now look at the solver's core routine, Solver::Solve():
template <typename Dtype>
void Solver<Dtype>::Solve(const char* resume_file) {
  CHECK(Caffe::root_solver());
  LOG(INFO) << "Solving " << net_->name();
  LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy();

  // Initialize to false every time we start solving.
  requested_early_exit_ = false;

  if (resume_file) {  // if a snapshot file was given, restore the training state from it
    LOG(INFO) << "Restoring previous solver status from " << resume_file;
    Restore(resume_file);
  }

  // For a network that is trained by the solver, no bottom or top vecs
  // should be given, and we will just provide dummy vecs.
  int start_iter = iter_;
  Step(param_.max_iter() - iter_);  // the key call
  // If we haven't already, save a snapshot after optimization, unless
  // overridden by setting snapshot_after_train := false
  if (param_.snapshot_after_train()
      && (!param_.snapshot() || iter_ % param_.snapshot() != 0)) {
    Snapshot();
  }
  if (requested_early_exit_) {
    LOG(INFO) << "Optimization stopped early.";
    return;
  }
  // After the optimization is done, run an additional train and test pass to
  // display the train and test loss/outputs if appropriate (based on the
  // display and test_interval settings, respectively). Unlike in the rest of
  // training, for the train net we only run a forward pass as we've already
  // updated the parameters "max_iter" times -- this final pass is only done to
  // display the loss, which is computed in the forward pass.
  if (param_.display() && iter_ % param_.display() == 0) {
    int average_loss = this->param_.average_loss();
    Dtype loss;
    net_->Forward(&loss);
    UpdateSmoothedLoss(loss, start_iter, average_loss);
    LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss_;
  }
  if (param_.test_interval() && iter_ % param_.test_interval() == 0) {
    TestAll();
  }
  LOG(INFO) << "Optimization Done.";
}
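As an aside, this is exactly the path the caffe command-line tool drives: caffe train --solver=... starts Solve() from scratch, and adding --snapshot=..._iter_10000.solverstate resumes training through the Restore() branch shown above (the iteration count in the filename is illustrative; it follows snapshot_prefix).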
Drilling into Step():
template <typename Dtype>
void Solver<Dtype>::Step(int iters) {
  const int start_iter = iter_;
  const int stop_iter = iter_ + iters;
  int average_loss = this->param_.average_loss();
  losses_.clear();
  smoothed_loss_ = 0;

  while (iter_ < stop_iter) {
    // zero-init the params
    net_->ClearParamDiffs();
    if (param_.test_interval() && iter_ % param_.test_interval() == 0
        && (iter_ > 0 || param_.test_initialization())
        && Caffe::root_solver()) {
      TestAll();
      if (requested_early_exit_) {
        // Break out of the while loop because stop was requested while testing.
        break;
      }
    }

    for (int i = 0; i < callbacks_.size(); ++i) {
      callbacks_[i]->on_start();
    }
    const bool display = param_.display() && iter_ % param_.display() == 0;
    net_->set_debug_info(display && param_.debug_info());
    // accumulate the loss and gradient
    Dtype loss = 0;
    for (int i = 0; i < param_.iter_size(); ++i) {
      loss += net_->ForwardBackward();
    }
    loss /= param_.iter_size();
    // average the loss across iterations for smoothed reporting
    UpdateSmoothedLoss(loss, start_iter, average_loss);
    if (display) {
      LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << iter_
          << ", loss = " << smoothed_loss_;
      const vector<Blob<Dtype>*>& result = net_->output_blobs();
      int score_index = 0;
      for (int j = 0; j < result.size(); ++j) {
        const Dtype* result_vec = result[j]->cpu_data();
        const string& output_name =
            net_->blob_names()[net_->output_blob_indices()[j]];
        const Dtype loss_weight =
            net_->blob_loss_weights()[net_->output_blob_indices()[j]];
        for (int k = 0; k < result[j]->count(); ++k) {
          ostringstream loss_msg_stream;
          if (loss_weight) {
            loss_msg_stream << " (* " << loss_weight
                            << " = " << loss_weight * result_vec[k] << " loss)";
          }
          LOG_IF(INFO, Caffe::root_solver()) << "    Train net output #"
              << score_index++ << ": " << output_name << " = "
              << result_vec[k] << loss_msg_stream.str();
        }
      }
    }
    for (int i = 0; i < callbacks_.size(); ++i) {
      callbacks_[i]->on_gradients_ready();
    }
    ApplyUpdate();

    // Increment the internal iter_ counter -- its value should always indicate
    // the number of times the weights have been updated.
    ++iter_;

    SolverAction::Enum request = GetRequestedAction();

    // Save a snapshot if needed.
    if ((param_.snapshot()
         && iter_ % param_.snapshot() == 0
         && Caffe::root_solver()) ||
         (request == SolverAction::SNAPSHOT)) {
      Snapshot();
    }
    if (SolverAction::STOP == request) {
      requested_early_exit_ = true;
      // Break out of training loop.
      break;
    }
  }
}
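Two details in Step() are easy to miss. First, the inner iter_size loop accumulates gradients over iter_size forward/backward passes before a single weight update, so the effective batch size is iter_size × batch_size (handy when the desired batch does not fit in GPU memory). Second, the loss that gets logged is not the raw per-iteration loss but a sliding average over the last average_loss iterations, maintained by UpdateSmoothedLoss(); in this codebase the helper is essentially:

template <typename Dtype>
void Solver<Dtype>::UpdateSmoothedLoss(Dtype loss, int start_iter,
    int average_loss) {
  if (losses_.size() < average_loss) {
    // Still filling the window: grow it and take the running mean.
    losses_.push_back(loss);
    int size = losses_.size();
    smoothed_loss_ = (smoothed_loss_ * (size - 1) + loss) / size;
  } else {
    // Window full: replace the oldest entry, adjusting the mean incrementally.
    int idx = (iter_ - start_iter) % average_loss;
    smoothed_loss_ += (loss - losses_[idx]) / average_loss;
    losses_[idx] = loss;
  }
}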
ApplyUpdate() is a pure virtual function, so look at SGDSolver's implementation:
template <typename Dtype>
void SGDSolver<Dtype>::ApplyUpdate() {
  CHECK(Caffe::root_solver());
  Dtype rate = GetLearningRate();
  if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
    LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate;
  }
  ClipGradients();  // clip the gradients' L2 norm
  // For every learnable weight blob in the training net, perform three steps:
  // normalize, regularize, and compute the update value.
  for (int param_id = 0; param_id < this->net_->learnable_params().size();
       ++param_id) {
    Normalize(param_id);
    Regularize(param_id);
    ComputeUpdateValue(param_id, rate);
  }
  this->net_->Update();
}
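ComputeUpdateValue() is where the momentum formula from section 2 lands in code. The real implementation folds the per-blob learning-rate multiplier into local_rate and dispatches to BLAS (caffe_cpu_axpby) or a GPU kernel; a simplified CPU-only sketch of the arithmetic (function and parameter names here are illustrative):

// Sketch of the SGD update for one blob. history holds V_t and diff holds
// the gradient accumulated by Backward(). The solver computes
//   V_{t+1} = momentum * V_t + local_rate * gradient
// and writes V_{t+1} back into diff; Net::Update() then subtracts diff from
// the weights, i.e. W_{t+1} = W_t - V_{t+1} (the formula from section 2 with
// the gradient's sign folded into the subtraction).
template <typename Dtype>
void ComputeUpdateValueSketch(int count, Dtype local_rate, Dtype momentum,
                              Dtype* history, Dtype* diff) {
  for (int i = 0; i < count; ++i) {
    history[i] = momentum * history[i] + local_rate * diff[i];
    diff[i] = history[i];
  }
}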
At fixed intervals, the solver evaluates the network being trained: it switches to held-out data via test_nets_ and runs predictions, calling TestAll():
template <typename Dtype>
void Solver<Dtype>::TestAll() {
  for (int test_net_id = 0;
       test_net_id < test_nets_.size() && !requested_early_exit_;
       ++test_net_id) {
    Test(test_net_id);
  }
}
A single test net is evaluated by Test():
template <typename Dtype>
void Solver<Dtype>::Test(const int test_net_id) {
  CHECK(Caffe::root_solver());
  LOG(INFO) << "Iteration " << iter_
            << ", Testing net (#" << test_net_id << ")";
  CHECK_NOTNULL(test_nets_[test_net_id].get())->
      ShareTrainedLayersWith(net_.get());
  vector<Dtype> test_score;
  vector<int> test_score_output_id;
  const shared_ptr<Net<Dtype> >& test_net = test_nets_[test_net_id];
  Dtype loss = 0;
  for (int i = 0; i < param_.test_iter(test_net_id); ++i) {
    SolverAction::Enum request = GetRequestedAction();
    // Check to see if stoppage of testing/training has been requested.
    while (request != SolverAction::NONE) {
      if (SolverAction::SNAPSHOT == request) {
        Snapshot();
      } else if (SolverAction::STOP == request) {
        requested_early_exit_ = true;
      }
      request = GetRequestedAction();
    }
    if (requested_early_exit_) {
      // break out of test loop.
      break;
    }

    Dtype iter_loss;
    const vector<Blob<Dtype>*>& result = test_net->Forward(&iter_loss);
    if (param_.test_compute_loss()) {
      loss += iter_loss;
    }
    if (i == 0) {
      for (int j = 0; j < result.size(); ++j) {
        const Dtype* result_vec = result[j]->cpu_data();
        for (int k = 0; k < result[j]->count(); ++k) {
          test_score.push_back(result_vec[k]);
          test_score_output_id.push_back(j);
        }
      }
    } else {
      int idx = 0;
      for (int j = 0; j < result.size(); ++j) {
        const Dtype* result_vec = result[j]->cpu_data();
        for (int k = 0; k < result[j]->count(); ++k) {
          test_score[idx++] += result_vec[k];
        }
      }
    }
  }
  if (requested_early_exit_) {
    LOG(INFO) << "Test interrupted.";
    return;
  }
  if (param_.test_compute_loss()) {
    loss /= param_.test_iter(test_net_id);
    LOG(INFO) << "Test loss: " << loss;
  }
  for (int i = 0; i < test_score.size(); ++i) {
    const int output_blob_index =
        test_net->output_blob_indices()[test_score_output_id[i]];
    const string& output_name = test_net->blob_names()[output_blob_index];
    const Dtype loss_weight = test_net->blob_loss_weights()[output_blob_index];
    ostringstream loss_msg_stream;
    const Dtype mean_score = test_score[i] / param_.test_iter(test_net_id);
    if (loss_weight) {
      loss_msg_stream << " (* " << loss_weight
                      << " = " << loss_weight * mean_score << " loss)";
    }
    LOG(INFO) << "    Test net output #" << i << ": " << output_name << " = "
              << mean_score << loss_msg_stream.str();
  }
}