啊,起这么唬人的名字确实不符合做科研应该有的心态,但是作为一个菜鸟,让我“揭秘”一下吧,如果某一天我成了大牛,我一定会低调的。
看caffe的源代码的时候,在网上看到了很多不错的资料,我觉得知乎上甘宇飞的回答对我帮助最大,但是有些地方对我来说还是太简洁了,希望把我的经验分享出来,也做个记录,代码肯定还是要自己看的,看的时候拿这个做一个参考还是可以的,废话不多说,把我看的流程展示如下:
就是按照cnn运行的过程来看的代码:
(1)首先在$root_caffe/tools/caffe.cpp,这个文件里边包含了主函数,由于train和test的过程差别不大(其实我没有细看test的过程),这里只分析train的过程
// Entry point for the "caffe train" brew command (tools/caffe.cpp).
// Reads the solver definition from --solver, selects CPU/GPU mode,
// constructs the Solver, and runs optimization -- optionally resuming
// from a snapshot or finetuning from pretrained weights.
95 int train() {
96 CHECK_GT(FLAGS_solver.size(), 0) << "Need a solver definition to train.";
// --snapshot and --weights are mutually exclusive start options.
97 CHECK(!FLAGS_snapshot.size() || !FLAGS_weights.size())
98 << "Give a snapshot to resume training or weights to finetune "
99 "but not both.";
100
101 caffe::SolverParameter solver_param;
102 caffe::ReadProtoFromTextFileOrDie(FLAGS_solver, &solver_param);
103
104 // If the gpu flag is not provided, allow the mode and device to be set
105 // in the solver prototxt.
106 if (FLAGS_gpu < 0
107 && solver_param.solver_mode() == caffe::SolverParameter_SolverMode_GPU) {
108 FLAGS_gpu = solver_param.device_id();
109 }
110
111 // Set device id and mode
112 if (FLAGS_gpu >= 0) {
113 LOG(INFO) << "Use GPU with device ID " << FLAGS_gpu;
114 Caffe::SetDevice(FLAGS_gpu);
115 Caffe::set_mode(Caffe::GPU);
116 } else {
117 LOG(INFO) << "Use CPU.";
118 Caffe::set_mode(Caffe::CPU);
119 }
120
121 LOG(INFO) << "Starting Optimization";
// Building the solver also builds the train/test nets (see Solver::Init).
122 shared_ptr<caffe::Solver<float> >
123 solver(caffe::GetSolver<float>(solver_param));
124
// Three mutually exclusive start modes: resume from a solver snapshot,
// finetune from pretrained weights, or start a fresh run.
125 if (FLAGS_snapshot.size()) {
126 LOG(INFO) << "Resuming from " << FLAGS_snapshot;
127 solver->Solve(FLAGS_snapshot);
128 } else if (FLAGS_weights.size()) {
129 CopyLayers(&*solver, FLAGS_weights);
130 solver->Solve();
131 } else {
132 solver->Solve();
133 }
134 LOG(INFO) << "Optimization Done.";
135 return 0;
136 }
// Registers train() so the command-line dispatcher can find it by name.
137 RegisterBrewFunction(train);
这里主要的函数包括两个,一个是Solver的构造函数(负责初始化),另一个就是cnn进行迭代学习的主函数solver->Solve(),按照顺序来介绍。
(2)首先是Solver的构造函数,构造函数会调用Init(),Init()如下:
// Shared solver setup: stores the SolverParameter, seeds the RNG when a
// non-negative random_seed is given, builds the training and testing
// networks, and resets the iteration counters.
template <typename Dtype>
31 void Solver<Dtype>::Init(const SolverParameter& param) {
32 LOG(INFO) << "Initializing solver from parameters: " << std::endl
33 << param.DebugString();
34 param_ = param;
35 CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative.";
36 if (param_.random_seed() >= 0) {
37 Caffe::set_random_seed(param_.random_seed());
38 }
39 // Scaffolding code
40 InitTrainNet();
41 InitTestNets();
42 LOG(INFO) << "Solver scaffolding done.";
43 iter_ = 0;
44 current_step_ = 0;
45 }
主要是InitTrainNet()和InitTestNets()这两个函数,如名字所示这两个就分别初始化训练网络和测试网络,下边看InitTrainNet()
// Builds the training net. Exactly one of {net, net_param, train_net,
// train_net_param} may be set in the SolverParameter; the NetParameter
// is loaded from that source, its NetState is merged with increasing
// precedence (solver default < net_param.state < train_state), and the
// Net is constructed from the result.
template <typename Dtype>
48 void Solver<Dtype>::InitTrainNet() {
// has_*() returns bool; summing them counts how many sources were given.
49 const int num_train_nets = param_.has_net() + param_.has_net_param() +
50 param_.has_train_net() + param_.has_train_net_param();
51 const string& field_names = "net, net_param, train_net, train_net_param";
52 CHECK_GE(num_train_nets, 1) << "SolverParameter must specify a train net "
53 << "using one of these fields: " << field_names;
54 CHECK_LE(num_train_nets, 1) << "SolverParameter must not contain more than "
55 << "one of these fields specifying a train_net: " << field_names;
56 NetParameter net_param;
57 if (param_.has_train_net_param()) {
58 LOG(INFO) << "Creating training net specified in train_net_param.";
59 net_param.CopyFrom(param_.train_net_param());
60 } else if (param_.has_train_net()) {
61 LOG(INFO) << "Creating training net from train_net file: "
62 << param_.train_net();
63 ReadNetParamsFromTextFileOrDie(param_.train_net(), &net_param);
64 }
65 if (param_.has_net_param()) {
66 LOG(INFO) << "Creating training net specified in net_param.";
67 net_param.CopyFrom(param_.net_param());
68 }
69 if (param_.has_net()) {
70 LOG(INFO) << "Creating training net from net file: " << param_.net();
71 ReadNetParamsFromTextFileOrDie(param_.net(), &net_param);
72 }
73 // Set the correct NetState. We start with the solver defaults (lowest
74 // precedence); then, merge in any NetState specified by the net_param itself;
75 // finally, merge in any NetState specified by the train_state (highest
76 // precedence).
77 NetState net_state;
78 net_state.set_phase(TRAIN);
79 net_state.MergeFrom(net_param.state());
80 net_state.MergeFrom(param_.train_state());
81 net_param.mutable_state()->CopyFrom(net_state);
// Constructing the Net runs Net::Init(), which wires up all layers/blobs.
82 net_.reset(new Net<Dtype>(net_param));
83 }
84
代码看起来很长,可是干货也就是最后一条命令,net_.reset(new Net(net_param));这一条命令实现了net的初始化,调用了net::Init(),net::Init()代码就比较复杂,我看这个看了很久,这里主要是搭建整个网络,连接各个layer,说具体点儿就是net这个类中包含了很多变量
/// @brief The network name
213 string name_;
214 /// @brief The phase: TRAIN or TEST
215 Phase phase_;
216 /// @brief Individual layers in the net
217 vector<shared_ptr<Layer<Dtype> > > layers_;
/// Layer names and a name-to-index lookup for them.
218 vector<string> layer_names_;
219 map<string, int> layer_names_index_;
/// Per-layer flag: whether the layer needs a backward computation.
220 vector<bool> layer_need_backward_;
221 /// @brief the blobs storing intermediate results between the layer.
222 vector<shared_ptr<Blob<Dtype> > > blobs_;
/// Blob names, a name-to-index lookup, and per-blob backward flags.
223 vector<string> blob_names_;
224 map<string, int> blob_names_index_;
225 vector<bool> blob_need_backward_;
226 /// bottom_vecs stores the vectors containing the input for each layer.
227 /// They don't actually host the blobs (blobs_ does), so we simply store
228 /// pointers.
229 vector<vector<Blob<Dtype>*> > bottom_vecs_;
/// For each layer: the blob ids of its bottoms and whether each bottom
/// needs a backward pass.
230 vector<vector<int> > bottom_id_vecs_;
231 vector<vector<bool> > bottom_need_backward_;
232 /// top_vecs stores the vectors containing the output for each layer
233 vector<vector<Blob<Dtype>*> > top_vecs_;
/// For each layer: the blob ids of its tops.
234 vector<vector<int> > top_id_vecs_;
235 /// Vector of weight in the loss (or objective) function of each net blob,
236 /// indexed by blob_id.
237 vector<Dtype> blob_loss_weights_;
/// Learnable-parameter bookkeeping: per-layer param ids, the owner index
/// of each param (used when params are shared between layers), display
/// names, (layer, param) index pairs, and a name-to-index lookup.
238 vector<vector<int> > param_id_vecs_;
239 vector<int> param_owners_;
240 vector<string> param_display_names_;
241 vector<pair<int, int> > param_layer_indices_;
242 map<string, int> param_names_index_;
243 /// blob indices for the input and the output of the net
244 vector<int> net_input_blob_indices_;
245 vector<int> net_output_blob_indices_;
/// Raw pointers to the net's input/output blobs (non-owning; blobs_ owns).
246 vector<Blob<Dtype>*> net_input_blobs_;
247 vector<Blob<Dtype>*> net_output_blobs_;
248 /// The parameters in the network.
249 vector<shared_ptr<Blob<Dtype> > > params_;
250 /// the learning rate multipliers
251 vector<float> params_lr_;
252 /// the weight decay multipliers
253 vector<float> params_weight_decay_;
254 /// The bytes of memory used by this net
255 size_t memory_used_;
主要就是根据各个layer的参数,初始化这些变量,定义好每个layer的bottom以及top,定义好每个layer的param,以及整个net的output.这里有个大的循环,是一层一层初始化的,等所有的layer都初始化好之后,net的output也就出来了,其实就是所有layer的output减去input.
初始化完了之后solver()也就基本上介绍完了,下边该solver->Solve()登场了:
// Runs the full optimization: optionally restores solver state from
// resume_file, iterates via Step() up to max_iter, then saves a final
// snapshot and reports one last train loss / test pass when the display
// and test_interval settings call for it.
220 template <typename Dtype>
221 void Solver<Dtype>::Solve(const char* resume_file) {
222 LOG(INFO) << "Solving " << net_->name();
223 LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy();
224
// resume_file is null for a fresh run (the no-arg Solve() overload).
225 if (resume_file) {
226 LOG(INFO) << "Restoring previous solver status from " << resume_file;
227 Restore(resume_file);
228 }
229
230 // For a network that is trained by the solver, no bottom or top vecs
231 // should be given, and we will just provide dummy vecs.
232 Step(param_.max_iter() - iter_);
233 // If we haven't already, save a snapshot after optimization, unless
234 // overridden by setting snapshot_after_train := false
235 if (param_.snapshot_after_train()
236 && (!param_.snapshot() || iter_ % param_.snapshot() != 0)) {
237 Snapshot();
238 }
239 // After the optimization is done, run an additional train and test pass to
240 // display the train and test loss/outputs if appropriate (based on the
241 // display and test_interval settings, respectively). Unlike in the rest of
242 // training, for the train net we only run a forward pass as we've already
243 // updated the parameters "max_iter" times -- this final pass is only done to
244 // display the loss, which is computed in the forward pass.
245 if (param_.display() && iter_ % param_.display() == 0) {
246 Dtype loss;
247 net_->ForwardPrefilled(&loss);
248 LOG(INFO) << "Iteration " << iter_ << ", loss = " << loss;
249 }
250 if (param_.test_interval() && iter_ % param_.test_interval() == 0) {
251 TestAll();
252 }
253 LOG(INFO) << "Optimization Done.";
254 }
其实吧,这里边开头和结尾是处理一些输出的信息,主要的循环是在Step(param_.max_iter() - iter_);这个函数里:
// The main training loop: runs `iters` iterations. Each iteration
// optionally runs the test nets (TestAll), does one forward/backward
// pass, maintains a loss smoothed over the last `average_loss`
// iterations, optionally logs the net outputs, computes and applies the
// parameter update, and snapshots every `snapshot` iterations.
161 template <typename Dtype>
162 void Solver<Dtype>::Step(int iters) {
163 vector<Blob<Dtype>*> bottom_vec;
164 const int start_iter = iter_;
165 const int stop_iter = iter_ + iters;
166 int average_loss = this->param_.average_loss();
// `losses` acts as a ring buffer of the last `average_loss` raw losses.
167 vector<Dtype> losses;
168 Dtype smoothed_loss = 0;
169
170 for (; iter_ < stop_iter; ++iter_) {
// Test at iter 0 only if test_initialization is set.
171 if (param_.test_interval() && iter_ % param_.test_interval() == 0
172 && (iter_ > 0 || param_.test_initialization())) {
173 TestAll();
174 }
175
176 const bool display = param_.display() && iter_ % param_.display() == 0;
177 net_->set_debug_info(display && param_.debug_info());
// One forward/backward pass over the whole net; returns the total loss.
178 Dtype loss = net_->ForwardBackward(bottom_vec);
// Until the buffer is full, grow the running average incrementally...
179 if (losses.size() < average_loss) {
180 losses.push_back(loss);
181 int size = losses.size();
182 smoothed_loss = (smoothed_loss * (size - 1) + loss) / size;
183 } else {
// ...then replace the oldest entry and adjust the average in O(1).
184 int idx = (iter_ - start_iter) % average_loss;
185 smoothed_loss += (loss - losses[idx]) / average_loss;
186 losses[idx] = loss;
187 }
188 if (display) {
189 LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss;
// Also print every scalar in every net output blob, with its loss
// weight contribution when the blob participates in the objective.
190 const vector<Blob<Dtype>*>& result = net_->output_blobs();
191 int score_index = 0;
192 for (int j = 0; j < result.size(); ++j) {
193 const Dtype* result_vec = result[j]->cpu_data();
194 const string& output_name =
195 net_->blob_names()[net_->output_blob_indices()[j]];
196 const Dtype loss_weight =
197 net_->blob_loss_weights()[net_->output_blob_indices()[j]];
198 for (int k = 0; k < result[j]->count(); ++k) {
199 ostringstream loss_msg_stream;
200 if (loss_weight) {
201 loss_msg_stream << " (* " << loss_weight
202 << " = " << loss_weight * result_vec[k] << " loss)";
203 }
204 LOG(INFO) << " Train net output #"
205 << score_index++ << ": " << output_name << " = "
206 << result_vec[k] << loss_msg_stream.str();
207 }
208 }
209 }
// Turn the accumulated diffs into an update (lr, momentum, weight decay)
// and apply it to the network weights.
210 ComputeUpdateValue();
211 net_->Update();
212
213 // Save a snapshot if needed.
214 if (param_.snapshot() && (iter_ + 1) % param_.snapshot() == 0) {
215 Snapshot();
216 }
217 }
218 }
这里首先判断是否该测试了,如果要测试就调用TestAll()显示test的信息。train的过程呢主要是调用了net_->ForwardBackward(bottom_vec);来产生loss,而ForwardBackward()主要调用了net_->Forward()和net_->Backward(),而这两个函数分别调用了各个layer的Forward()和Backward(),Forward()之后就可以产生loss,最后的loss是将所有层的loss加到了一起作为总的loss。当然大部分层是不会产生loss的。
那么好了,有了loss之后就是判断是否该显示loss,这里的显示就是迭代过程中看到的loss输出的那一块儿。虽然backward并不影响loss的结果,但是backward()却会产生diff,也就是求导数,这个对后来的参数更新起到了重要的作用。
net_->ForwardBackward(bottom_vec)之后,显示了loss,再接下来就是这两个函数:ComputeUpdateValue(); 和 net_->Update();
这两个函数主要是用来更新参数的,毕竟前向也传播了,后向也传播了,再不更新参数就白算了,所以这里主要是按照net里边设置的weight_decay以及momentum,和lr(学习率)来对学到的diff进行修正,最后的Update()将修正的diff更新到网络中。达到了最大迭代次数,Step()也就结束了。然后再看看是否需要再显示一下train和test的信息,最后判断是否需要snapshot,如果需要的话就保存一下。
好了基本上就是这个样子。
本文主要介绍了caffe训练的流程,并没有涉及具体的layer。每个layer主要涉及三部分setup(),forward(),backward(),这里就不再多说了。