This post walks through how Caffe solves, i.e. trains, a network.
Inside the member function Solve():
template <typename Dtype>
void Solver<Dtype>::Solve(const char* resume_file) {
......
// For a network that is trained by the solver, no bottom or top vecs
// should be given, and we will just provide dummy vecs.
int start_iter = iter_;
// Start iterating
Step(param_.max_iter() - iter_);
......
}
Now let's look at the implementation of Solver::Step():
template <typename Dtype>
void Solver<Dtype>::Step(int iters)
{
// Starting iteration count
const int start_iter = iter_;
// Iteration count at which to stop
const int stop_iter = iter_ + iters;
// Loop until the configured number of iterations has been completed
while (iter_ < stop_iter)
{
// Zero out the gradients of all Blob parameters in net_
net_->ClearParamDiffs();
...
// accumulate the loss and gradient
Dtype loss = 0;
for (int i = 0; i < param_.iter_size(); ++i)
{
// Forward and backward passes, accumulating the loss
loss += net_->ForwardBackward();
}
loss /= param_.iter_size();
// To smooth the reported loss, average the most recent average_loss values into the member variable smoothed_loss_
UpdateSmoothedLoss(loss, start_iter, average_loss);
// Update the weights using the gradients just computed by backpropagation
ApplyUpdate();
// Increment the internal iter_ counter -- its value should always indicate
// the number of times the weights have been updated.
++iter_;
}
}
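For reference, UpdateSmoothedLoss() maintains a sliding window of the last average_loss loss values (average_loss is read from the solver prototxt and defaults to 1). A sketch of the logic, close to Caffe's actual implementation:
template <typename Dtype>
void Solver<Dtype>::UpdateSmoothedLoss(Dtype loss, int start_iter,
    int average_loss) {
  if (losses_.size() < average_loss) {
    // The window is not full yet: append and recompute the running mean.
    losses_.push_back(loss);
    int size = losses_.size();
    smoothed_loss_ = (smoothed_loss_ * (size - 1) + loss) / size;
  } else {
    // The window is full: overwrite the oldest entry and adjust the mean
    // incrementally.
    int idx = (iter_ - start_iter) % average_loss;
    smoothed_loss_ += (loss - losses_[idx]) / average_loss;
    losses_[idx] = loss;
  }
}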
Inside the while loop, the network member function Net::ForwardBackward() is called first to run the forward and backward passes and compute the loss:
Dtype ForwardBackward() {
Dtype loss;
// Forward pass
Forward(&loss);
// Backward pass
Backward();
return loss;
}
Forward() in turn calls ForwardFromTo():
template <typename Dtype>
const vector<Blob<Dtype>*>& Net<Dtype>::Forward(Dtype* loss) {
if (loss != NULL) {
*loss = ForwardFromTo(0, layers_.size() - 1);
} else {
ForwardFromTo(0, layers_.size() - 1);
}
return net_output_blobs_;
}
ForwardFromTo() then calls each layer's Forward():
template <typename Dtype>
Dtype Net<Dtype>::ForwardFromTo(int start, int end) {
CHECK_GE(start, 0);
CHECK_LT(end, layers_.size());
Dtype loss = 0;
for (int i = start; i <= end; ++i) {
// LOG(ERROR) << "Forwarding " << layer_names_[i];
// Forward pass of each individual layer
Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]);
loss += layer_loss;
if (debug_info_) { ForwardDebugInfo(i); }
}
return loss;
}
Although the Forward() function of the base class Layer is not virtual, it wraps the virtual functions Forward_cpu() and Forward_gpu(), corresponding to the CPU and GPU versions respectively. Forward_cpu() is a pure virtual function of the parent class Layer and must be overridden by every subclass, whereas the parent class's implementation of Forward_gpu() simply calls Forward_cpu(), so overriding it is optional. In short, these two virtual functions are what allow different layer types to define their own forward computations (a minimal example follows the wrapper below).
// Forward and backward wrappers. You should implement the cpu and
// gpu specific implementations instead, and should not change these
// functions.
template <typename Dtype>
inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
// Lock during forward to ensure sequential forward
Lock();
Dtype loss = 0;
Reshape(bottom, top);
switch (Caffe::mode()) {
case Caffe::CPU:
// Dispatch to the subclass's Forward_cpu() implementation
Forward_cpu(bottom, top);
for (int top_id = 0; top_id < top.size(); ++top_id) {
if (!this->loss(top_id)) { continue; }
const int count = top[top_id]->count();
const Dtype* data = top[top_id]->cpu_data();
const Dtype* loss_weights = top[top_id]->cpu_diff();
loss += caffe_cpu_dot(count, data, loss_weights);
}
break;
case Caffe::GPU:
Forward_gpu(bottom, top);
#ifndef CPU_ONLY
for (int top_id = 0; top_id < top.size(); ++top_id) {
if (!this->loss(top_id)) { continue; }
const int count = top[top_id]->count();
const Dtype* data = top[top_id]->gpu_data();
const Dtype* loss_weights = top[top_id]->gpu_diff();
Dtype blob_loss = 0;
caffe_gpu_dot(count, data, loss_weights, &blob_loss);
loss += blob_loss;
}
#endif
break;
default:
LOG(FATAL) << "Unknown caffe mode.";
}
Unlock();
return loss;
}
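As promised, here is a minimal sketch of a hypothetical layer (ScaleByTwoLayer is not a real Caffe layer; it simply doubles its input). Only the required pure virtual functions Reshape(), Forward_cpu(), and Backward_cpu() are overridden; Forward_gpu()/Backward_gpu() are omitted because the base class defaults fall back to the CPU versions:
#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

namespace caffe {

template <typename Dtype>
class ScaleByTwoLayer : public Layer<Dtype> {
 public:
  explicit ScaleByTwoLayer(const LayerParameter& param)
      : Layer<Dtype>(param) {}
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
    top[0]->ReshapeLike(*bottom[0]);
  }
  virtual inline const char* type() const { return "ScaleByTwo"; }

 protected:
  // Required: Forward_cpu() is pure virtual in Layer.
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
    const Dtype* in = bottom[0]->cpu_data();
    Dtype* out = top[0]->mutable_cpu_data();
    for (int i = 0; i < bottom[0]->count(); ++i) {
      out[i] = in[i] * Dtype(2);
    }
  }
  // Required: Backward_cpu() is pure virtual as well. propagate_down[0]
  // is the bottom_need_backward_ flag passed down from Net.
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down,
      const vector<Blob<Dtype>*>& bottom) {
    if (!propagate_down[0]) { return; }
    const Dtype* top_diff = top[0]->cpu_diff();
    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
    for (int i = 0; i < top[0]->count(); ++i) {
      bottom_diff[i] = top_diff[i] * Dtype(2);
    }
  }
};

}  // namespace caffe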
The backward pass, Net::Backward(), calls BackwardFromTo(int start, int end):
template <typename Dtype>
void Net<Dtype>::Backward()
{
BackwardFromTo(layers_.size() - 1, 0);
}
template <typename Dtype>
void Net<Dtype>::BackwardFromTo(int start, int end)
{
CHECK_GE(end, 0);
CHECK_LT(start, layers_.size());
// Propagate layer by layer, in reverse order
for (int i = start; i >= end; --i)
{
if (layer_need_backward_[i])
{
// As with the forward pass, Backward() is not virtual, but it wraps the
// virtual functions Backward_cpu() and Backward_gpu(), so each layer type
// has its own backward computation.
// Note the extra argument bottom_need_backward_[i] compared to Forward():
// it tells the layer, per bottom blob, whether a gradient needs to be
// propagated down, so unneeded gradient computations can be skipped.
layers_[i]->Backward(top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]);
if (debug_info_)
{
BackwardDebugInfo(i);
}
}
}
}
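For completeness, the Backward() wrapper in Layer mirrors the forward one; it simply dispatches on the current mode (this is essentially the wrapper in include/caffe/layer.hpp):
template <typename Dtype>
inline void Layer<Dtype>::Backward(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  switch (Caffe::mode()) {
  case Caffe::CPU:
    // propagate_down[i] tells the layer whether to compute the gradient
    // with respect to bottom[i].
    Backward_cpu(top, propagate_down, bottom);
    break;
  case Caffe::GPU:
    Backward_gpu(top, propagate_down, bottom);
    break;
  default:
    LOG(FATAL) << "Unknown caffe mode.";
  }
}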
After the forward and backward passes finish, the member function SGDSolver::ApplyUpdate() is called to update the weights.
template <typename Dtype>
void SGDSolver<Dtype>::ApplyUpdate()
{
// Get the current learning rate
Dtype rate = GetLearningRate();
if (this->param_.display() && this->iter_ % this->param_.display() == 0)
{
LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate;
}
// If the L2 norm of the gradients exceeds the threshold clip_gradients,
// scale all gradients down so the norm equals that threshold.
// The default threshold is -1, which disables clipping.
ClipGradients();
// Process every learnable parameter blob in the network
for (int param_id = 0; param_id < this->net_->learnable_params().size();
++param_id)
{
// Normalize the gradient (divide by iter_size when gradients were accumulated)
Normalize(param_id);
// Add the weight-decay (L2 regularization) term to the gradient
Regularize(param_id);
// Compute the update value with stochastic gradient descent
ComputeUpdateValue(param_id, rate);
}
// Apply the updates to the weights
this->net_->Update();
}
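Inside ComputeUpdateValue(), plain SGD blends the current gradient with a history (momentum) term and writes the result back into each blob's diff; Net::Update() then calls Blob::Update() on every learnable blob, which performs data := data - diff. A sketch of the CPU branch, close to SGDSolver's actual implementation (the GPU branch does the same with the caffe_gpu_* counterparts):
template <typename Dtype>
void SGDSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
  const vector<float>& net_params_lr = this->net_->params_lr();
  Dtype momentum = this->param_.momentum();
  // Per-parameter multiplier (lr_mult in the prototxt) times the global rate.
  Dtype local_rate = rate * net_params_lr[param_id];
  // history := momentum * history + local_rate * diff
  caffe_cpu_axpby(net_params[param_id]->count(), local_rate,
      net_params[param_id]->cpu_diff(), momentum,
      history_[param_id]->mutable_cpu_data());
  // Copy the blended value back into diff; Net::Update() will subtract it.
  caffe_copy(net_params[param_id]->count(),
      history_[param_id]->cpu_data(),
      net_params[param_id]->mutable_cpu_diff());
}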
Finally, the iteration counter is incremented (++iter_) and the while loop continues until the configured number of iterations has been completed.
That is the entire training process of the network.
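As a usage sketch, this is roughly how a training entry point (similar to tools/caffe.cpp) drives the whole process; "solver.prototxt" is a placeholder path:
#include <boost/shared_ptr.hpp>

#include "caffe/caffe.hpp"
#include "caffe/util/upgrade_proto.hpp"

int main() {
  // Parse the solver definition; the path is a placeholder.
  caffe::SolverParameter solver_param;
  caffe::ReadSolverParamsFromTextFileOrDie("solver.prototxt", &solver_param);
  // Create the solver of the type named in the prototxt (e.g. SGD)
  // through the solver registry.
  boost::shared_ptr<caffe::Solver<float> > solver(
      caffe::SolverRegistry<float>::CreateSolver(solver_param));
  // Runs Solve(), which calls Step() as walked through above.
  solver->Solve();
  return 0;
}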
Thanks to Rolin's blog (Rolin的专栏).