Caffe源码阅读(粗读)–网络训练
上一篇博文中我们简单了解了网络初始化过程,接下来我们仍将粗略理解网络训练过程。在网络初始化完成之后,生成的Solver类对象solver将会调用Solve()函数训练网络并保存训练好的网络模型。
src/caffe/solver.cpp
Solve()函数首先根据resume_file是否为空来判断是否需要finetuning网络模型,如果resume_file不为空,就从之前训练好的网络中直接拷贝网络参数。然后设置起始迭代位置,这个是为了从之前突然中断的网络训练中恢复而设定的。接着调用该类的Step()函数。
Step()函数需要预先根据参数输入设定训练起始和终结位置,并从用户参数中恢复average loss。这个参数是设定对多少迭代范围内的loss值做一次平滑处理,这个平滑操作是在UpdateSmoothedLoss()函数调用时进行的。接着在一个while循环内,开始迭代优化网络参数。
// Main training loop: one parameter update per pass, until stop_iter.
while (iter_ < stop_iter) {
// First, clear the parameter gradients left over from the previous iteration.
net_->ClearParamDiffs();
// Decide whether to run a test pass: test_interval (user-configured) is the
// testing period in iterations; test_initialization controls whether a test
// runs before the first update (iter_ == 0). Only the root solver tests.
if (param_.test_interval() && iter_ % param_.test_interval() == 0
&& (iter_ > 0 || param_.test_initialization())
&& Caffe::root_solver()) {
// (Testing logic omitted in this excerpt.)
}
// Callbacks that keep multiple GPUs in sync at the start of an iteration;
// details skipped here.
for (int i = 0; i < callbacks_.size(); ++i) {
callbacks_[i]->on_start();
}
// Decide whether to print training info this iteration; `display` is the
// user-configured print interval in iterations.
const bool display = param_.display() && iter_ % param_.display() == 0;
net_->set_debug_info(display && param_.debug_info());
//printf("The display is %d, and the debug_info is %d...\n", int(display), param_.debug_info());
// accumulate the loss and gradient
Dtype loss = 0;
// `iter_size` is how many forward/backward passes are accumulated within a
// single training iteration before one parameter update is applied.
for (int i = 0; i < param_.iter_size(); ++i) {
// The core step: one forward pass followed by backpropagation.
loss += net_->ForwardBackward();
}
loss /= param_.iter_size();
// Smooth the loss value for display only; this does not affect training.
UpdateSmoothedLoss(loss, start_iter, average_loss);
if (display) {
LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << iter_
<< ", loss = " << smoothed_loss_;
// Print every scalar of every network output blob, annotating outputs
// that contribute to the loss with their weighted contribution.
const vector<Blob<Dtype>*>& result = net_->output_blobs();
int score_index = 0;
for (int j = 0; j < result.size(); ++j) {
const Dtype* result_vec = result[j]->cpu_data();
const string& output_name =
net_->blob_names()[net_->output_blob_indices()[j]];
const Dtype loss_weight =
net_->blob_loss_weights()[net_->output_blob_indices()[j]];
for (int k = 0; k < result[j]->count(); ++k) {
ostringstream loss_msg_stream;
if (loss_weight) {
loss_msg_stream << " (* " << loss_weight
<< " = " << loss_weight * result_vec[k] << " loss)";
}
LOG_IF(INFO, Caffe::root_solver()) << "    Train net output #"
<< score_index++ << ": " << output_name << " = "
<< result_vec[k] << loss_msg_stream.str();
}
}
}
// Callbacks fired once gradients are ready (e.g. multi-GPU gradient sync).
for (int i = 0; i < callbacks_.size(); ++i) {
callbacks_[i]->on_gradients_ready();
}
// Apply the accumulated gradients to the network parameters.
ApplyUpdate();
// Increment the internal iter_ counter -- its value should always indicate
// the number of times the weights have been updated.
++iter_;
SolverAction::Enum request = GetRequestedAction();
// Save a snapshot every `snapshot` iterations (root solver only), or when a
// snapshot was explicitly requested.
if ((param_.snapshot()
&& iter_ % param_.snapshot() == 0
&& Caffe::root_solver()) ||
(request == SolverAction::SNAPSHOT)) {
Snapshot();
}
if (SolverAction::STOP == request) {
requested_early_exit_ = true;
// Break out of training loop.
break;
}
}