Source-code walkthrough of the Caffe training flow

Reference blogs: https://blog.csdn.net/mounty_fsc/article/details/51090114
and http://alanse7en.github.io/caffedai-ma-jie-xi-1/ (caffe code analysis, parts 1-4)
//Here we only analyze the training process, pulling together the key points of the training flow from the blogs above; the write-up is fairly rough.
//In src\caffe\solvers there are six solvers; each solver source file ends with REGISTER_SOLVER_CLASS(XXX). Take REGISTER_SOLVER_CLASS(SGD) as an example.
//This first triggers the REGISTER_SOLVER_CREATOR macro shown below.
/*
This defines static float and double instances of the SolverRegisterer template class,
which runs their constructors. The SolverRegisterer constructor calls the AddCreator
function of the SolverRegistry class, and AddCreator stores a pointer to the
Creator_SGDSolver function (defined further below) in the map pointed to by g_registry_.
*/
#define REGISTER_SOLVER_CREATOR(type, creator)                                 \
  static SolverRegisterer<float> g_creator_f_##type(#type, creator<float>);    \
  static SolverRegisterer<double> g_creator_d_##type(#type, creator<double>)   \
 
template <typename Dtype>
class SolverRegisterer {
 public:
  SolverRegisterer(const string& type,
            //pointer to a Creator function
                     Solver<Dtype>* (*creator)(const SolverParameter&))
  {
    // LOG(INFO) << "Registering solver type: " << type;
    SolverRegistry<Dtype>::AddCreator(type, creator);
  }
};
// SolverRegistry<Dtype>::AddCreator then stores the creator in the registry map:
  static void AddCreator(const string& type, Creator creator) {
    CreatorRegistry& registry = Registry();
    CHECK_EQ(registry.count(type), 0)
        << "Solver type " << type << " already registered.";
    registry[type] = creator;
  }
/*
This macro defines a function named Creator_SGDSolver; it is the function that a Creator
pointer points to. Inside it an SGDSolver object is constructed and its address is returned.
That is exactly what a Creator function does: construct a Solver of the corresponding type
and return its address. The macro then invokes the REGISTER_SOLVER_CREATOR macro above.
(A standalone sketch of the whole register-then-create round trip follows after the
CreateSolver walkthrough below.)
*/
#define REGISTER_SOLVER_CLASS(type)                                            \
  template <typename Dtype>                                                    \
  Solver<Dtype>* Creator_##type##Solver(                                       \
      const SolverParameter& param)                                            \
  {                                                                            \
    return new type##Solver<Dtype>(param);                                     \
  }                                                                            \
  REGISTER_SOLVER_CREATOR(type, Creator_##type##Solver)
}  // namespace caffe
//In int train() in tools/caffe.cpp
//SolverParameter is a class auto-generated by Google Protocol Buffers
caffe::SolverParameter solver_param;
shared_ptr<caffe::Solver<float> > //construct the solver
      solver(caffe::SolverRegistry<float>::CreateSolver(solver_param));
 
---------------------------------------------------------------------------------------------- 
// In solver_factory.hpp
//Creator is a function-pointer type: the pointed-to function takes a SolverParameter and returns Solver<Dtype>*
typedef Solver<Dtype>* (*Creator)(const SolverParameter&);
typedef std::map<string, Creator> CreatorRegistry;
  static Solver<Dtype>* CreateSolver(const SolverParameter& param) {
 // The string variable type holds the solver type ("SGD", "Nesterov", etc.)
 // It defaults to SGD
    const string& type = param.type();
    // registry is a map with string keys and Creator values,
    // returned as a reference to a static object
    CreatorRegistry& registry = Registry();
 
 
 //This in turn calls the Registry() function in solver_factory.hpp, which is:
  static CreatorRegistry& Registry() {
 //function-local static: constructed on first use
    static CreatorRegistry* g_registry_ = new CreatorRegistry();
    return *g_registry_;
  }
----------------------------------------------------------------------------------------------
    for (typename CreatorRegistry::iterator iter = registry.begin();
             iter != registry.end(); ++iter)
    {
         std::cout<<"key:"<<iter->first<<" "
             <<"value:"<<iter->second<<std::endl;}
    /*
     * If this solver type has already been registered, registry.count(type) is 1;
     * the registry map then yields the Creator for the requested type, the Creator
     * is called, and the Solver<Dtype>* it returns is passed back.
     */
    CHECK_EQ(registry.count(type), 1) << "Unknown solver type: " << type
        << " (known types: " << SolverTypeListString() << ")";
    //Look up the creator function pointer for this type in the static g_registry_
    return registry[type](param);//returns Solver<Dtype>*: the address of the derived SGDSolver object, handed back through the base-class pointer
  }
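To make the mechanism concrete, here is a minimal standalone sketch (hypothetical, simplified names; not the real Caffe classes, and without the Dtype template) of the same register-then-create round trip: static registerer objects fill a function-local static map before main(), and a CreateSolver-style lookup then returns a base-class pointer whose dynamic type is the derived solver.

//---------------- sketch (not Caffe code): solver registry round trip ----------------
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct SolverParameter { std::string type; };   // stand-in for the protobuf-generated class

struct Solver {
  explicit Solver(const SolverParameter&) {}
  virtual ~Solver() {}
  virtual const char* name() const = 0;
};

class SolverRegistry {
 public:
  typedef Solver* (*Creator)(const SolverParameter&);
  typedef std::map<std::string, Creator> CreatorRegistry;

  // "Construct on first use": the map lives behind a function-local static pointer.
  static CreatorRegistry& Registry() {
    static CreatorRegistry* g_registry_ = new CreatorRegistry();
    return *g_registry_;
  }
  static void AddCreator(const std::string& type, Creator creator) {
    Registry()[type] = creator;             // the real code also CHECKs against duplicates
  }
  static Solver* CreateSolver(const SolverParameter& param) {
    return Registry()[param.type](param);   // the real code CHECKs that the type exists
  }
};

// The registerer: every static instance runs its constructor before main().
struct SolverRegisterer {
  SolverRegisterer(const std::string& type, SolverRegistry::Creator creator) {
    SolverRegistry::AddCreator(type, creator);
  }
};

// Roughly what REGISTER_SOLVER_CLASS(SGD) boils down to in this simplified setting:
#define REGISTER_SOLVER_CLASS(type)                                        \
  Solver* Creator_##type##Solver(const SolverParameter& param) {           \
    return new type##Solver(param);                                        \
  }                                                                        \
  static SolverRegisterer g_creator_##type(#type, Creator_##type##Solver)

struct SGDSolver : public Solver {
  explicit SGDSolver(const SolverParameter& p) : Solver(p) {}
  const char* name() const { return "SGDSolver"; }
};
REGISTER_SOLVER_CLASS(SGD);   // registers "SGD" -> Creator_SGDSolver at start-up

int main() {
  SolverParameter param;
  param.type = "SGD";
  // Base-class pointer, dynamic type SGDSolver -- the same shape as CreateSolver in Caffe.
  std::unique_ptr<Solver> solver(SolverRegistry::CreateSolver(param));
  std::cout << "created " << solver->name() << std::endl;   // prints "SGDSolver"
  return 0;
}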
 //--------------------------------------------------------------------------------------------------
//return registry["SGD"](param) ends up calling new SGDSolver<Dtype>(param) from the REGISTER_SOLVER_CLASS macro above.
//SGDSolver derives from the base class Solver, so the base-class constructor runs first. sgd_solvers.hpp defines
explicit SGDSolver(const SolverParameter& param): Solver<Dtype>(param) { PreSolve(); }
//The derived-class constructor is shown here first, because the base-class constructor is long and calls many other functions.
template <typename Dtype>
void SGDSolver<Dtype>::PreSolve() {
  // Initialize the history
  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
  history_.clear();
  update_.clear();
  temp_.clear();
  for (int i = 0; i < net_params.size(); ++i) {
    const vector<int>& shape = net_params[i]->shape();
    history_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
    update_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
    temp_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(shape)));
  }
}
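PreSolve just allocates history_, update_ and temp_ blobs with the same shape as each learnable parameter; history_ later carries the momentum term when SGDSolver computes its update. A simplified sketch of one momentum-SGD step on plain vectors (assumed form V = momentum * V + lr * grad, W = W - V; not Caffe's ComputeUpdateValue, which also handles lr_mult, weight decay, gradient normalization, etc.):

//---------------- sketch (not Caffe code): the role of history_ in momentum SGD ----------------
#include <cstdio>
#include <vector>

void sgd_momentum_step(std::vector<float>& weights,
                       const std::vector<float>& grad,
                       std::vector<float>& history,   // one entry per weight, like history_
                       float lr, float momentum) {
  for (size_t i = 0; i < weights.size(); ++i) {
    // history = momentum * history + lr * grad   (the update value)
    history[i] = momentum * history[i] + lr * grad[i];
    // weights = weights - update value
    weights[i] -= history[i];
  }
}

int main() {
  std::vector<float> w(2, 1.0f);               // toy weights
  std::vector<float> g(2, 0.2f);               // toy (constant) gradient
  std::vector<float> h(w.size(), 0.0f);        // zero-initialized, like the history_ blobs
  for (int iter = 0; iter < 3; ++iter) {
    sgd_momentum_step(w, g, h, 0.1f, 0.9f);
    std::printf("iter %d: w[0] = %.4f, h[0] = %.4f\n", iter, w[0], h[0]);
  }
  return 0;
}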
solver.hpp declares
explicit Solver(const SolverParameter& param,const Solver* root_solver = NULL);
In solver.cpp:
//the constructor calls Init() to do the initialization, i.e. the solver scaffolding
template <typename Dtype>
Solver<Dtype>::Solver(const SolverParameter& param, const Solver* root_solver)
    : net_(), callbacks_(), root_solver_(root_solver), requested_early_exit_(false)
{
  Init(param);
}
template <typename Dtype>
void Solver<Dtype>::Init(const SolverParameter& param) {
 // Check whether this is the root_solver (in multi-GPU mode only the root solver runs this part)
 //Caffe::root_solver() should be true at this point
 CHECK(Caffe::root_solver() || root_solver_)
      << "root_solver_ needs to be set for all non-root solvers";
  LOG_IF(INFO, Caffe::root_solver()) << "Initializing solver from parameters: "
    << std::endl << param.DebugString();//DebugString() is a protobuf-generated Message method
  //Assign to the solver's data member param_
  param_ = param;
  // average_loss defaults to 1
  CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative.";
  //Check write permission for snapshots
  CheckSnapshotWritePermissions();
  //random_seed defaults to -1
  if (Caffe::root_solver() && param_.random_seed() >= 0) {
  //Calls the static Caffe::set_random_seed function;
  //param_.random_seed() is the protobuf-generated accessor returning ::google::protobuf::int64
   Caffe::set_random_seed(param_.random_seed());
  }
  // Scaffolding code
  // Build the network structure
  InitTrainNet();
  if (Caffe::root_solver()) {
    InitTestNets();
    //LOG(INFO) << "Solver scaffolding done.";
  }
// iter_ is initialized to 0
  iter_ = 0;
  current_step_ = 0;
}
//InitTrainNet() is as follows:
// Initialize the training net
template <typename Dtype>
void Solver<Dtype>::InitTrainNet() {
  const int num_train_nets = param_.has_net() + param_.has_net_param() +
      param_.has_train_net() + param_.has_train_net_param();
  const string& field_names = "net, net_param, train_net, train_net_param";
  //Exactly one train net must be specified
  CHECK_GE(num_train_nets, 1) << "SolverParameter must specify a train net "
      << "using one of these fields: " << field_names;
  CHECK_LE(num_train_nets, 1) << "SolverParameter must not contain more than "
      << "one of these fields specifying a train_net: " << field_names;
// Read the training network's structure parameters
  NetParameter net_param;
  if (param_.has_train_net_param()) {
    LOG_IF(INFO, Caffe::root_solver())
        << "Creating training net specified in train_net_param.";
    net_param.CopyFrom(param_.train_net_param());
  }
  else if (param_.has_train_net()) {
    LOG_IF(INFO, Caffe::root_solver())
        << "Creating training net from train_net file: " << param_.train_net();
    ReadNetParamsFromTextFileOrDie(param_.train_net(), &net_param);
  }
  if (param_.has_net_param()) {
    LOG_IF(INFO, Caffe::root_solver())
        << "Creating training net specified in net_param.";
    net_param.CopyFrom(param_.net_param());
  }
  if (param_.has_net()) {
    LOG_IF(INFO, Caffe::root_solver())
        << "Creating training net from net file: " << param_.net();
    ReadNetParamsFromTextFileOrDie(param_.net(), &net_param);
  }
  // Set the correct NetState.  We start with the solver defaults (lowest
  // precedence); then, merge in any NetState specified by the net_param itself;
  // finally, merge in any NetState specified by the train_state (highest
  // precedence).
  //Set the correct NetState: start from the solver defaults (lowest precedence),
  //then merge in any state specified by the net itself, and finally merge in
  //the train_state (highest precedence).
  NetState net_state;
  net_state.set_phase(TRAIN);//set_phase is a protobuf-generated setter (caffe.pb.h)
  net_state.MergeFrom(net_param.state());
  //States are merged from lowest to highest precedence; the train_state from the
  //SolverParameter is merged last and overrides any state obtained earlier
  //(see the merge-precedence sketch after this function).
  net_state.MergeFrom(param_.train_state());
  //The resulting state is copied into the NetParameter's state; each LayerParameter's
  //include and exclude rules are then evaluated against it to decide whether the
  //layer belongs to this network.
  net_param.mutable_state()->CopyFrom(net_state);
  //This is part of initializing the train net; InitTestNets works the same way.
  if (Caffe::root_solver()) {
  //Invoke the Net constructor (a class template) to initialize the network
    net_.reset(new Net<Dtype>(net_param));
  }
  else {
    net_.reset(new Net<Dtype>(net_param, root_solver_->net_.get()));
  }
}
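The MergeFrom/CopyFrom calls above implement a precedence order. Below is a simplified, non-protobuf stand-in that mimics how MergeFrom only overwrites the fields the source has actually set, so the last merge (the SolverParameter's train_state) has the highest precedence:

//---------------- sketch (not Caffe/protobuf code): NetState merge precedence ----------------
#include <iostream>
#include <string>

struct NetStateLike {
  bool has_phase = false;  std::string phase;
  bool has_stage = false;  std::string stage;
  // Copy only the fields that 'other' has set (protobuf-like behavior for singular fields).
  void MergeFrom(const NetStateLike& other) {
    if (other.has_phase) { phase = other.phase; has_phase = true; }
    if (other.has_stage) { stage = other.stage; has_stage = true; }
  }
};

int main() {
  NetStateLike state;                        // solver default (lowest precedence)
  state.has_phase = true;  state.phase = "TRAIN";

  NetStateLike from_net_param;               // state carried by the net definition itself
  from_net_param.has_stage = true;  from_net_param.stage = "pretrain";

  NetStateLike train_state;                  // SolverParameter::train_state (highest precedence)
  train_state.has_stage = true;  train_state.stage = "finetune";

  state.MergeFrom(from_net_param);           // stage becomes "pretrain"
  state.MergeFrom(train_state);              // stage overridden to "finetune"; phase stays TRAIN
  std::cout << state.phase << " / " << state.stage << std::endl;   // TRAIN / finetune
  return 0;
}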
 The Net constructor invoked by net_.reset(new Net<Dtype>(net_param)) above is declared in net.hpp as:
 explicit Net(const NetParameter& param, const Net* root_net = NULL);
 and defined in net.cpp:
template <typename Dtype>
Net<Dtype>::Net(const NetParameter& param, const Net* root_net)
    : root_net_(root_net) {
  Init(param);
}
//----------------------- Network initialization starts here, via the Net constructor ---------------------
template <typename Dtype>
void Net<Dtype>::Init(const NetParameter& in_param) {
  CHECK(Caffe::root_solver() || root_net_)
      << "root_net_ needs to be set for all non-root solvers";
  // Record whether this net is in the train or test phase
  phase_ = in_param.state().phase();

  /*
   * When an argument is passed by reference, the function receives the actual object
   * rather than a copy, so modifying the parameter modifies the argument; passing by
   * reference therefore saves both time and space.
   */

  // Filter layers based on their include/exclude rules and
  // the current NetState.
  // Given the network parameters, use each LayerParameter's include/exclude rules to
  // decide whether that layer belongs to this network, and return the filtered parameters
  NetParameter filtered_param;
  FilterNet(in_param, &filtered_param);
  LOG_IF(INFO, Caffe::root_solver())
      << "Initializing net from parameters: " << std::endl
      << filtered_param.DebugString();
  // Create a copy of filtered_param with splits added where necessary.
  /*
   * InsertSplits: if a layer's top (output) is consumed, in whole or in part, by two or
   * more layers, one or more SplitLayers are inserted alongside that layer. (Not examined
   * in detail here.)
   */
  NetParameter param;
  InsertSplits(filtered_param, &param);
  // Basically, build all the layers and set up their connections.
  name_ = param.name();
  map<string, int> blob_name_to_idx;
  set<string> available_blobs;
  memory_used_ = 0;
  // For each layer, set up its input and output
  // resize changes the container's size, default-constructing the new elements
  // Initialize the per-layer containers
  bottom_vecs_.resize(param.layer_size());//pointers to each layer's input (bottom) blobs
  top_vecs_.resize(param.layer_size());//pointers to each layer's output (top) blobs
  bottom_id_vecs_.resize(param.layer_size());//ids of each layer's input (bottom) blobs
  param_id_vecs_.resize(param.layer_size());//ids of each layer's parameter blobs
  top_id_vecs_.resize(param.layer_size());//ids of each layer's output (top) blobs
  bottom_need_backward_.resize(param.layer_size());//bool flags: whether each bottom blob needs backward

  // Loop over every layer
  for (int layer_id = 0; layer_id < param.layer_size(); ++layer_id) {
    // For non-root solvers, whether this layer is shared from root_net_.
 // false by default (single-GPU / root-solver case)
    bool share_from_root = !Caffe::root_solver()
        && root_net_->layers_[layer_id]->ShareInParallel();
    // Inherit phase from net if unset.
    // If this layer does not set its phase, inherit it from the net
    if (!param.layer(layer_id).has_phase()) {
      param.mutable_layer(layer_id)->set_phase(phase_);
    }
    // Setup layer.
    // Constant reference to this layer's parameters
    const LayerParameter& layer_param = param.layer(layer_id);
    // Whether propagate_down was specified; see the propagate_down field of LayerParameter in caffe.proto
    if (layer_param.propagate_down_size() > 0) {
      CHECK_EQ(layer_param.propagate_down_size(),
          layer_param.bottom_size())
          << "propagate_down param must be specified "
          << "either 0 or bottom_size times ";
    }

    if (share_from_root) {
      LOG(INFO) << "Sharing layer " << layer_param.name() << " from root net";
      layers_.push_back(root_net_->layers_[layer_id]);
      layers_[layer_id]->SetShared(true);
    }
    else {
      //Store the pointer to this concrete layer in the container
      layers_.push_back(LayerRegistry<Dtype>::CreateLayer(layer_param));
    }
    //Store the name of each layer in the network
    layer_names_.push_back(layer_param.name());
    LOG_IF(INFO, Caffe::root_solver())
        << "Creating Layer " << layer_param.name();
    // Whether this layer needs backpropagation
    bool need_backward = false;
    // Figure out this layer's input and output
    // Determine this layer's inputs and outputs. Note that the first layer has no
    // bottom blobs, so this loop does not execute for it.
    for (int bottom_id = 0; bottom_id < layer_param.bottom_size();
         ++bottom_id) {
      const int blob_id = AppendBottom(param, layer_id, bottom_id,
                                       &available_blobs, &blob_name_to_idx);
      need_backward |= blob_need_backward_[blob_id];
    }
    // Number of top blobs for this layer
    int num_top = layer_param.top_size();

    // For each of this layer's top blobs
    for (int top_id = 0; top_id < num_top; ++top_id) {
      AppendTop(param, layer_id, top_id, &available_blobs, &blob_name_to_idx);
      // Collect Input layer tops as Net inputs.
      if (layer_param.type() == "Input") {
        const int blob_id = blobs_.size() - 1;
        net_input_blob_indices_.push_back(blob_id);
        net_input_blobs_.push_back(blobs_[blob_id].get());
      }
    }
    // If the layer specifies that AutoTopBlobs() -> true and the LayerParameter
    // specified fewer than the required number (as specified by
    // ExactNumTopBlobs() or MinTopBlobs()), allocate them here.
    Layer<Dtype>* layer = layers_[layer_id].get();
    if (layer->AutoTopBlobs()) {
      const int needed_num_top =
          std::max(layer->MinTopBlobs(), layer->ExactNumTopBlobs());
      for (; num_top < needed_num_top; ++num_top) {
        // Add "anonymous" top blobs -- do not modify available_blobs or
        // blob_name_to_idx as we don't want these blobs to be usable as input
        // to other layers.
        AppendTop(param, layer_id, num_top, NULL, NULL);
      }
    }
    // After this layer is connected, set it up.
    if (share_from_root) {
      // Set up size of top blobs using root_net_
      const vector<Blob<Dtype>*>& base_top = root_net_->top_vecs_[layer_id];
      const vector<Blob<Dtype>*>& this_top = this->top_vecs_[layer_id];
      for (int top_id = 0; top_id < base_top.size(); ++top_id) {
        this_top[top_id]->ReshapeLike(*base_top[top_id]);
        LOG(INFO) << "Created top blob " << top_id << " (shape: "
            << this_top[top_id]->shape_string() <<  ") for shared layer "
            << layer_param.name();
      }
    }
    else {
      // Set up the layer instance: call Layer::SetUp with this layer's bottom and top blobs.
      // This sizes every blob and creates the layer's learnable parameters,
      // which are stored in the layer's blobs_ member.
      layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]);
    }
    LOG_IF(INFO, Caffe::root_solver())
        << "Setting up " << layer_names_[layer_id];
    // Loop over this layer's top blobs
    for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
      if (blob_loss_weights_.size() <= top_id_vecs_[layer_id][top_id]) {
        blob_loss_weights_.resize(top_id_vecs_[layer_id][top_id] + 1, Dtype(0));
      }
      blob_loss_weights_[top_id_vecs_[layer_id][top_id]] = layer->loss(top_id);
      LOG_IF(INFO, Caffe::root_solver())
          << "Top shape: " << top_vecs_[layer_id][top_id]->shape_string();
      if (layer->loss(top_id)) {
        LOG_IF(INFO, Caffe::root_solver())
            << "    with loss weight " << layer->loss(top_id);
      }
      // Accumulate the element count for the memory report (bytes = count * sizeof(Dtype))
      memory_used_ += top_vecs_[layer_id][top_id]->count();
    }
    // Log the memory required so far
    LOG_IF(INFO, Caffe::root_solver())
        << "Memory required for data: " << memory_used_ * sizeof(Dtype);
    // The param entries typically carry learning-rate multipliers and similar settings;
    // there may be at most as many param entries as learnable parameter blobs
    const int param_size = layer_param.param_size();
    //number of learnable parameter blobs
    const int num_param_blobs = layers_[layer_id]->blobs().size();
    CHECK_LE(param_size, num_param_blobs)
        << "Too many params specified for layer " << layer_param.name();
    ParamSpec default_param_spec;
    // Loop over every learnable parameter blob
    for (int param_id = 0; param_id < num_param_blobs; ++param_id) {
      // If the prototxt provides no param entry for this blob, use the default ParamSpec
      const ParamSpec* param_spec = (param_id < param_size) ?
          &layer_param.param(param_id) : &default_param_spec;
      // A non-zero learning-rate multiplier means this parameter needs gradients
      const bool param_need_backward = param_spec->lr_mult() != 0;
      need_backward |= param_need_backward;
      layers_[layer_id]->set_param_propagate_down(param_id,
                                                  param_need_backward);
    }
    // Next, push pointers to each layer's parameters into params_ (and in particular learnable_params_).
    // Loop over every learnable parameter of this layer
    for (int param_id = 0; param_id < num_param_blobs; ++param_id) {
     // param: the whole network's parameters; layer_id: layer index; param_id: learnable-parameter index
     // Record this parameter's settings: learning rate, weight decay, parameter id, etc.
      AppendParam(param, layer_id, param_id);
    }
    // Finally, set the backward flag for this layer
    layer_need_backward_.push_back(need_backward);
    if (need_backward) {
      for (int top_id = 0; top_id < top_id_vecs_[layer_id].size(); ++top_id) {
        blob_need_backward_[top_id_vecs_[layer_id][top_id]] = true;
      }
    }
  }
// The per-layer loop ends here

  // Go through the net backwards to determine which blobs contribute to the
  // loss.  We can skip backward computation for blobs that don't contribute
  // to the loss.
  // Determine which blobs contribute to the final loss; a blob that makes no
  // contribution to the loss does not need a gradient
  // Also checks if all bottom blobs don't need backward computation (possible
  // because the skip_propagate_down param) and so we can skip backward
  // computation for the entire layer
  // Also check whether all of a layer's bottom blobs can skip gradient computation
  set<string> blobs_under_loss;
  set<string> blobs_skip_backp;
  // Loop over the layers from last to first
  for (int layer_id = layers_.size() - 1; layer_id >= 0; --layer_id) {
    bool layer_contributes_loss = false;
    bool layer_skip_propagate_down = true;
    // Loop over this layer's top blobs
    for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
      const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]];
      if (layers_[layer_id]->loss(top_id) ||
          (blobs_under_loss.find(blob_name) != blobs_under_loss.end())) {
        layer_contributes_loss = true;
      }
      if (blobs_skip_backp.find(blob_name) == blobs_skip_backp.end()) {
        layer_skip_propagate_down = false;
      }
      // If even one top blob contributes to the loss, the layer contributes to the loss
      if (layer_contributes_loss && !layer_skip_propagate_down)
        break;
    }
    // If this layer can skip backward computation, none of its bottom blobs
    // need backpropagation either
    if (layer_need_backward_[layer_id] && layer_skip_propagate_down) {
      layer_need_backward_[layer_id] = false;
      for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size();
               ++bottom_id) {
        bottom_need_backward_[layer_id][bottom_id] = false;
      }
    }
    if (!layer_contributes_loss) { layer_need_backward_[layer_id] = false; }
    if (Caffe::root_solver()) {
      if (layer_need_backward_[layer_id]) {
        LOG(INFO) << layer_names_[layer_id] << " needs backward computation.";
      }
      else {
        LOG(INFO) << layer_names_[layer_id]
            << " does not need backward computation.";
      }
    }
    for (int bottom_id = 0; bottom_id < bottom_vecs_[layer_id].size();
         ++bottom_id) {
      if (layer_contributes_loss) {
        const string& blob_name =
            blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
        blobs_under_loss.insert(blob_name);
      }
      else {
        bottom_need_backward_[layer_id][bottom_id] = false;
      }
      if (!bottom_need_backward_[layer_id][bottom_id]) {
        const string& blob_name =
                   blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
        blobs_skip_backp.insert(blob_name);
      }
    }
  }
  // End of the last-to-first loop

  // Handle force_backward if needed.
  // If forced gradient computation is requested
  if (param.force_backward()) {
    for (int layer_id = 0; layer_id < layers_.size(); ++layer_id) {
      layer_need_backward_[layer_id] = true;
      for (int bottom_id = 0;
           bottom_id < bottom_need_backward_[layer_id].size(); ++bottom_id) {
        bottom_need_backward_[layer_id][bottom_id] =
            bottom_need_backward_[layer_id][bottom_id] ||
            layers_[layer_id]->AllowForceBackward(bottom_id);
        blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] =
            blob_need_backward_[bottom_id_vecs_[layer_id][bottom_id]] ||
            bottom_need_backward_[layer_id][bottom_id];
      }
      for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
           ++param_id) {
        layers_[layer_id]->set_param_propagate_down(param_id, true);
      }
    }
  }

  // In the end, all remaining blobs are considered output blobs.
  // Every blob still left in available_blobs (i.e. never consumed as a bottom) becomes a network output, e.g. the loss blob
  for (set<string>::iterator it = available_blobs.begin();
      it != available_blobs.end(); ++it) {
    LOG_IF(INFO, Caffe::root_solver())
        << "This network produces output " << *it;
    net_output_blobs_.push_back(blobs_[blob_name_to_idx[*it]].get());
    net_output_blob_indices_.push_back(blob_name_to_idx[*it]);
  }
  for (size_t blob_id = 0; blob_id < blob_names_.size(); ++blob_id) {
    blob_names_index_[blob_names_[blob_id]] = blob_id;
  }
  for (size_t layer_id = 0; layer_id < layer_names_.size(); ++layer_id) {
    layer_names_index_[layer_names_[layer_id]] = layer_id;
  }
  ShareWeights();
  debug_info_ = param.debug_info();
  LOG_IF(INFO, Caffe::root_solver()) << "Network initialization done.";
}
//Net<Dtype>::Init relies on a few important members and functions, listed below.
// Each layer's parameters and the layer container:
LayerParameter layer_param_;
vector<shared_ptr<Layer<Dtype> > > layers_;
const LayerParameter& layer_param = param.layer(layer_id);
layers_.push_back(LayerRegistry<Dtype>::CreateLayer(layer_param));//store the pointer to the concrete layer in the container
// Get a layer using a LayerParameter:
  // return a smart pointer to an instance of the requested layer type
  static shared_ptr<Layer<Dtype> > CreateLayer(const LayerParameter& param) {
    if (Caffe::root_solver()) {
      LOG(INFO) << "Creating layer " << param.name();
    }
    const string& type = param.type();
    CreatorRegistry& registry = Registry();
    CHECK_EQ(registry.count(type), 1) << "Unknown layer type: " << type
        << " (known types: " << LayerTypeListString() << ")";
    return registry[type](param);
  }
  return registry[type](param) ends up calling return shared_ptr<Layer<Dtype> >(new type##Layer<Dtype>(param)); from the REGISTER_LAYER_CLASS macro below.
  type##Layer derives from Layer, so the base-class Layer constructor runs first, as follows:
 
 // The constructor only copies the values from the layer description; if the description provides weight and bias blobs, those are copied too
 // Subclasses of Layer invoke this base constructor explicitly
  explicit Layer(const LayerParameter& param)
    : layer_param_(param), is_shared_(false) {
      // Set phase and copy blobs (if there are any).
      phase_ = param.phase();   //train or test phase
      // Only runs if blobs_size() > 0; prototxt files normally do not provide
      // blobs here, so this block usually does not execute
      if (layer_param_.blobs_size() > 0) {
        blobs_.resize(layer_param_.blobs_size());
        for (int i = 0; i < layer_param_.blobs_size(); ++i) {
          blobs_[i].reset(new Blob<Dtype>());
          blobs_[i]->FromProto(layer_param_.blobs(i));
        }
      }
    }
 
//Layer registration works just like solver registration: each layer source file in src\caffe\layers\ ends with REGISTER_LAYER_CLASS(XXX). In layer_factory.hpp:
/*
 * The macro REGISTER_LAYER_CLASS generates a create function for every type and registers it,
 * together with the type name, in LayerRegistry's map.
 */
#define REGISTER_LAYER_CLASS(type)                                             \
  template <typename Dtype>                                                    \
  shared_ptr<Layer<Dtype> > Creator_##type##Layer(const LayerParameter& param) \
  {                                                                            \
    return shared_ptr<Layer<Dtype> >(new type##Layer<Dtype>(param));           \
  }                                                                            \
  REGISTER_LAYER_CREATOR(type, Creator_##type##Layer)
}  // namespace caffe
#define REGISTER_LAYER_CREATOR(type, creator)                                  \
  static LayerRegisterer<float> g_creator_f_##type(#type, creator<float>);     \
  static LayerRegisterer<double> g_creator_d_##type(#type, creator<double>)    \
template <typename Dtype>
class LayerRegisterer {
 public:
  LayerRegisterer(const string& type,
                  shared_ptr<Layer<Dtype> > (*creator)(const LayerParameter&))
 {
    // LOG(INFO) << "Registering layer type: " << type;
    LayerRegistry<Dtype>::AddCreator(type, creator);
  }
};
 // Adds a creator: given a type name and a function pointer, add it to the registry
  static void AddCreator(const string& type, Creator creator) {
    CreatorRegistry& registry = Registry();
 //Registry() (shown here again, for the layer registry) returns the static map:
 static CreatorRegistry& Registry() {
    static CreatorRegistry* g_registry_ = new CreatorRegistry();
    return *g_registry_;
  }
    /*
     for (typename CreatorRegistry::iterator iter = registry.begin();
             iter != registry.end(); ++iter) {
          std::cout<<"Layer names:"<<(iter->first)<<std::endl;
     }
     */
    CHECK_EQ(registry.count(type), 0)
        << "Layer type " << type << " already registered.";
    registry[type] = creator;
  }
  Another important call inside Net<Dtype>::Init:
  // Set up the layer instance:
      // call Layer::SetUp with this layer's bottom and top blobs;
      // this sizes every blob
      // and creates the layer's learnable parameters, stored in the layer's blobs_ member
      layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]);
//In layer.hpp
// 1. Check that the numbers of bottom and top blobs are acceptable; each layer type handles different counts
//   * 2. Call LayerSetUp to initialize the specific layer; every Layer subclass overrides it for its own setup
//   * 3. Call Reshape to allocate appropriately sized storage for the top blobs
//   * 4. Set the loss-weight multiplier for each top blob; for non-loss layers it is zero
//   *
//   * SetUp itself is non-virtual and is never overridden; the sequence is fixed (a template-method pattern; see the sketch after this snippet)
//bottom: the layer's input data; storage in these blobs has already been allocated
//top: the layer's output data; the blob objects are constructed, but their storage has not yet been allocated
//The required sizes depend on the bottom blob shapes and layer_param_, and are set in Reshape.
 void SetUp(const vector<Blob<Dtype>*>& bottom,   //called during model initialization to set up the layer and its connections
      const vector<Blob<Dtype>*>& top) {
    InitMutex();
    CheckBlobCounts(bottom, top);
    LayerSetUp(bottom, top);
    Reshape(bottom, top);
    SetLossWeights(top);
  }
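Because SetUp is non-virtual and only the hooks it calls are virtual, the call order is the same for every layer (a classic template-method pattern). A standalone sketch of that structure with simplified stand-in types (not the real Caffe Layer interface):

//---------------- sketch (not Caffe code): the fixed SetUp sequence as a template method ----------------
#include <iostream>
#include <vector>

struct BlobLike { std::vector<int> shape; };

class LayerLike {
 public:
  virtual ~LayerLike() {}
  // Non-virtual: subclasses never override SetUp, only the hooks below.
  void SetUp(const std::vector<BlobLike*>& bottom, const std::vector<BlobLike*>& top) {
    CheckBlobCounts(bottom, top);  // 1. validate bottom/top counts
    LayerSetUp(bottom, top);       // 2. layer-specific one-time setup
    Reshape(bottom, top);          // 3. size the top blobs
    SetLossWeights(top);           // 4. loss weights (zero for non-loss layers)
  }
 protected:
  virtual void CheckBlobCounts(const std::vector<BlobLike*>&, const std::vector<BlobLike*>&) {}
  virtual void LayerSetUp(const std::vector<BlobLike*>&, const std::vector<BlobLike*>&) {}
  virtual void Reshape(const std::vector<BlobLike*>& bottom, const std::vector<BlobLike*>& top) = 0;
  virtual void SetLossWeights(const std::vector<BlobLike*>&) {}
};

// A toy "identity" layer: its only customization is Reshape.
class IdentityLayer : public LayerLike {
 protected:
  void Reshape(const std::vector<BlobLike*>& bottom, const std::vector<BlobLike*>& top) {
    top[0]->shape = bottom[0]->shape;   // the top blob mirrors the bottom blob's shape
    std::cout << "top reshaped to " << top[0]->shape.size() << " axes" << std::endl;
  }
};

int main() {
  BlobLike in;   in.shape = std::vector<int>(4, 1);   // e.g. a 4-axis blob
  BlobLike out;                                       // constructed but not yet sized
  IdentityLayer layer;
  std::vector<BlobLike*> bottom(1, &in), top(1, &out);
  layer.SetUp(bottom, top);                           // the fixed sequence runs the hooks
  return 0;
}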
 // ---------------------------------- Network initialization complete --------------------------------------------
 //In int train() in tools/caffe.cpp, right after
 shared_ptr<caffe::Solver<float> >  solver(caffe::SolverRegistry<float>::CreateSolver(solver_param)); comes
 solver->SetActionFunction(signal_handler.GetActionFunction()); (see the blogs above for details), and after that:
 solver->Solve();
 solver.hpp declares:
 // The main entry point of the solver. iter defaults to 0; a non-zero iter means training continues from a pretrained network.
  virtual void Solve(const char* resume_file = NULL);//note: not a constructor
  In solver.cpp:
  /*
Training the whole network (i.e. running Caffe to train a model) actually runs the train()
function in caffe.cpp, which instantiates a Solver object, initializes it, and then calls
Solver::Solve(). Solve() trains the network and calls Step() to iterate, for
param_.max_iter() - iter_ iterations.
*/
template <typename Dtype>
void Solver<Dtype>::Solve(const char* resume_file) {
// Check whether this is the root_solver (in multi-GPU mode only the root solver runs this part)
  CHECK(Caffe::root_solver());//true here
  LOG(INFO) << "Solving " << net_->name();
  LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy();
  // Initialize to false every time we start solving.
  // requested_early_exit_ starts as false, i.e. no early-exit request has been made yet
  requested_early_exit_ = false;
  // If the resume_file pointer is not NULL,
  //restore the previous training state from the path it holds
  if (resume_file) {
    LOG(INFO) << "Restoring previous solver status from " << resume_file;
    Restore(resume_file);
  }
  // For a network that is trained by the solver, no bottom or top vecs
  // should be given, and we will just provide dummy vecs.
  int start_iter = iter_;
  //For a net trained by the solver, no bottom or top vecs are given; only dummy vecs are provided
  // Then call Step(), which performs the actual step-by-step iteration
  // for at most param_.max_iter() - iter_ iterations
  Step(param_.max_iter() - iter_);
  // If we haven't already, save a snapshot after optimization, unless
  // overridden by setting snapshot_after_train := false
  // After the iterations finish (or stop early on a system signal), decide whether to
  // snapshot at the end of training; this can be set in solver.prototxt
  if (param_.snapshot_after_train()
      && (!param_.snapshot() || iter_ % param_.snapshot() != 0)) {
    Snapshot();
  }
  // If a system signal arrived during Step() and the handling policy is STOP,
  // requested_early_exit_ has been set to true; the iteration ended early, so log it and return
  if (requested_early_exit_) {
    LOG(INFO) << "Optimization stopped early.";
    return;
  }
  // After the optimization is done, run an additional train and test pass to
  // display the train and test loss/outputs if appropriate (based on the
  // display and test_interval settings, respectively).  Unlike in the rest of
  // training, for the train net we only run a forward pass as we've already
  // updated the parameters "max_iter" times -- this final pass is only done to
  // display the loss, which is computed in the forward pass.
  // After optimization, run an extra forward pass (and possibly a test pass) to display the final loss/outputs.
  // Decide whether to display the final loss
  if (param_.display() && iter_ % param_.display() == 0) {
    int average_loss = this->param_.average_loss();
    Dtype loss;
    net_->Forward(&loss);
    UpdateSmoothedLoss(loss, start_iter, average_loss);
    LOG(INFO) << "Iteration " << iter_ << ", loss = " << smoothed_loss_;
  }
  // Decide whether to run a final test pass
  if (param_.test_interval() && iter_ % param_.test_interval() == 0) {
    TestAll();
  }
  LOG(INFO) << "Optimization Done.";
}
//----------------------------------- Step(param_.max_iter() - iter_) -----------------------------------
Step(param_.max_iter() - iter_) is also defined in solver.cpp:
template <typename Dtype>
void Solver<Dtype>::Step(int iters) {
  // Record the starting iteration (if resumed from a snapshot, iter_ equals the
  // iteration at which the snapshot was taken) and the stopping iteration
  const int start_iter = iter_;
  // iters = param_.max_iter() - iter_
  const int stop_iter = iter_ + iters;
  // The reported loss is the average of the last average_loss losses (set in solver.prototxt, default 1);
  // losses_ stores the last average_loss losses, and smoothed_loss_ is the mean that gets reported
  int average_loss = this->param_.average_loss();//defaults to 1
  losses_.clear();
  smoothed_loss_ = 0;
  //Iterate
  while (iter_ < stop_iter) {
    // zero-init the params
 // Clear all parameter gradients left over from the previous iteration
    net_->ClearParamDiffs();
    // test_initialization defaults to true
    // Decide whether a test pass is needed
    if (param_.test_interval() && iter_ % param_.test_interval() == 0
        && (iter_ > 0 || param_.test_initialization())
        && Caffe::root_solver()) {
      TestAll();
      // Check whether an early exit was requested
      if (requested_early_exit_) {
        // Break out of the while loop because stop was requested while testing.
        break;
      }
    }
    for (int i = 0; i < callbacks_.size(); ++i) {
      callbacks_[i]->on_start();
    }
    // Decide whether this iteration should display the loss and other info
    const bool display = param_.display() && iter_ % param_.display() == 0;
    net_->set_debug_info(display && param_.debug_info());
    // accumulate the loss and gradient
    Dtype loss = 0;
    // iter_size is also set in solver.prototxt; the effective batch_size is iter_size * (the batch_size in the net definition),
    // so each iteration's loss is the sum over iter_size forward/backward passes divided by iter_size;
    // the loss comes from Net::ForwardBackward.
    // This is mainly useful when GPU memory is too small: if batch_size=128 runs out of memory,
    // setting batch_size=32 and iter_size=4 still processes 128 samples per iteration
    // (see the gradient-accumulation sketch after this function).
    // accumulate gradients over `iter_size` x `batch_size` instances
    for (int i = 0; i < param_.iter_size(); ++i) {
    /*
     * Calls into Net: the forward pass computes the model's final output and the loss,
     * the backward pass computes the gradients for every layer and every parameter.
     */
      loss += net_->ForwardBackward();
    }
    //Gradients are accumulated over `iter_size` x `batch_size` instances.
    //By default iter_size = 1, i.e. one batch per iteration.
    loss /= param_.iter_size();
    // Compute the smoothed_loss to report: if losses_ does not yet hold average_loss
    // entries, append the current loss; otherwise replace the oldest entry
    // average the loss across iterations for smoothed reporting
    /*
     * This smooths the reported loss. Since training uses SGD, we cannot feed all the data
     * at once, so the loss of one mini-batch can differ from the full-sample average loss;
     * averaging the current loss with the recent history reduces the oscillation of the
     * reported loss.
     */
    UpdateSmoothedLoss(loss, start_iter, average_loss);
    //Log information for the current iteration
    if (display) {
      LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << iter_
          << ", loss = " << smoothed_loss_;
      const vector<Blob<Dtype>*>& result = net_->output_blobs();
      int score_index = 0;
      for (int j = 0; j < result.size(); ++j) {
        const Dtype* result_vec = result[j]->cpu_data();
        const string& output_name =
            net_->blob_names()[net_->output_blob_indices()[j]];
        const Dtype loss_weight =
            net_->blob_loss_weights()[net_->output_blob_indices()[j]];
        for (int k = 0; k < result[j]->count(); ++k) {
          ostringstream loss_msg_stream;
          if (loss_weight) {
            loss_msg_stream << " (* " << loss_weight
                            << " = " << loss_weight * result_vec[k] << " loss)";
          }
          LOG_IF(INFO, Caffe::root_solver()) << "    Train net output #"
              << score_index++ << ": " << output_name << " = "
              << result_vec[k] << loss_msg_stream.str();
        }
      }
    }

    for (int i = 0; i < callbacks_.size(); ++i) {
      callbacks_[i]->on_gradients_ready();
    }
    // Apply the parameter update. This is not implemented in the base Solver class;
    // each subclass provides its own implementation (SGDSolver's is analyzed in the referenced blogs).
    ApplyUpdate();
    // Increment the internal iter_ counter -- its value should always indicate
    // the number of times the weights have been updated.
    // Increment the iteration counter
    ++iter_;
    // GetRequestedAction goes through the action_request_function_ pointer to the signal_handler's
    // CheckForSignals function, set earlier via SetActionFunction. It returns how to proceed,
    // based on whether a system signal was received, its type, and the configured (or default) policy.
    SolverAction::Enum request = GetRequestedAction();
    // Save a snapshot if needed.
    // Snapshot if this iteration is a snapshot point, or if the requested action is SNAPSHOT
    if ((param_.snapshot()
         && iter_ % param_.snapshot() == 0
         && Caffe::root_solver()) ||
         (request == SolverAction::SNAPSHOT)) {
      Snapshot();
    }
    // If the request is STOP, set requested_early_exit_ to true and end the iteration early
    if (SolverAction::STOP == request) {
      requested_early_exit_ = true;
      // Break out of training loop.
      break;
    }
  }
}
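The iter_size mechanism mentioned above can be illustrated outside Caffe: the gradients of iter_size small batches are accumulated before a single parameter update, which (for a plain averaged loss) behaves like one update on a batch of iter_size * batch_size samples. A minimal sketch under those assumptions, with a toy one-parameter least-squares model standing in for the network:

//---------------- sketch (not Caffe code): gradient accumulation a la iter_size ----------------
#include <cstdio>
#include <vector>

// Gradient of 0.5*(w*x - y)^2 with respect to w, averaged over one mini-batch.
float batch_grad(float w, const std::vector<float>& x, const std::vector<float>& y) {
  float g = 0.0f;
  for (size_t i = 0; i < x.size(); ++i) g += (w * x[i] - y[i]) * x[i];
  return g / x.size();
}

int main() {
  const int iter_size = 4;           // as set in solver.prototxt
  const int batch_size = 2;          // as set in the net definition
  // 8 samples of y = 2x: effective batch = iter_size * batch_size = 8.
  std::vector<float> xs, ys;
  for (int i = 1; i <= iter_size * batch_size; ++i) { xs.push_back(i); ys.push_back(2.0f * i); }

  float w = 0.0f, lr = 0.01f, grad_accum = 0.0f;
  for (int i = 0; i < iter_size; ++i) {                   // the inner loop in Solver::Step
    std::vector<float> bx(xs.begin() + i * batch_size, xs.begin() + (i + 1) * batch_size);
    std::vector<float> by(ys.begin() + i * batch_size, ys.begin() + (i + 1) * batch_size);
    grad_accum += batch_grad(w, bx, by);                  // gradients add up, like blob diffs
  }
  grad_accum /= iter_size;            // analogous to loss /= param_.iter_size()
  w -= lr * grad_accum;               // one parameter update for the whole effective batch
  std::printf("w after one accumulated update: %.4f\n", w);
  return 0;
}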
//------------------------------------ loss += net_->ForwardBackward() --------------------------
loss += net_->ForwardBackward()
In net.hpp:
// One forward pass followed by one backward pass
  Dtype ForwardBackward() {
    Dtype loss;
    Forward(&loss);
    Backward();
    return loss;
  }
// Forward pass
template <typename Dtype>
const vector<Blob<Dtype>*>& Net<Dtype>::Forward(Dtype* loss) {
  //This is the forward pass used during training
  if (loss != NULL) {
    *loss = ForwardFromTo(0, layers_.size() - 1);  // e.g. with 12 layers the range is 0..11
  }
  else {
    ForwardFromTo(0, layers_.size() - 1);
  }
  return net_output_blobs_;
}
//---------------------------------- ForwardFromTo(0, layers_.size() - 1) -----------------------
In net.cpp:
template <typename Dtype>
Dtype Net<Dtype>::ForwardFromTo(int start, int end) {
  CHECK_GE(start, 0);
  CHECK_LT(end, layers_.size());
  Dtype loss = 0;
  for (int i = start; i <= end; ++i) {
    // LOG(ERROR) << "Forwarding " << layer_names_[i];
    // Run the forward computation for each layer and accumulate its loss; in practice only the loss layer(s) contribute a non-zero loss
    Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]);
    loss += layer_loss;
    if (debug_info_) { ForwardDebugInfo(i); }
  }
  return loss;
}
//---------------------------- layers_[i]->Forward(bottom_vecs_[i], top_vecs_[i]) ----------------
In layer.hpp:
// The forward/backward interface. Every Layer subclass must implement Forward_cpu()
template <typename Dtype>
inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  // Lock during forward to ensure sequential forward
  Lock();
  Dtype loss = 0;
  Reshape(bottom, top);
  switch (Caffe::mode()) {
  case Caffe::CPU:
    Forward_cpu(bottom, top);
    // Compute this layer's loss contribution
    for (int top_id = 0; top_id < top.size(); ++top_id) {
      if (!this->loss(top_id)) { continue; }
      const int count = top[top_id]->count();
      const Dtype* data = top[top_id]->cpu_data();
      const Dtype* loss_weights = top[top_id]->cpu_diff();
      loss += caffe_cpu_dot(count, data, loss_weights);
    }
    break;
  case Caffe::GPU:
    Forward_gpu(bottom, top);
#ifndef CPU_ONLY
    for (int top_id = 0; top_id < top.size(); ++top_id) {
      if (!this->loss(top_id)) { continue; }
      const int count = top[top_id]->count();
      const Dtype* data = top[top_id]->gpu_data();
      const Dtype* loss_weights = top[top_id]->gpu_diff();
      Dtype blob_loss = 0;
      caffe_gpu_dot(count, data, loss_weights, &blob_loss);
      loss += blob_loss;
    }
#endif
    break;
  default:
    LOG(FATAL) << "Unknown caffe mode.";
  }
  Unlock();
  return loss;
}
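The loss accumulation above works because SetLossWeights stores each top blob's loss weight in the diff part of the blob, so a blob's loss contribution is simply dot(data, diff). A simplified illustration, with std::inner_product standing in for caffe_cpu_dot:

//---------------- sketch (not Caffe code): loss as dot(data, loss_weights) ----------------
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  // A loss layer's top blob typically holds a single value (the loss)...
  std::vector<float> top_data(1, 0.6931f);
  // ...and SetLossWeights filled its diff with the loss weight (1 by default).
  std::vector<float> top_diff(1, 1.0f);
  float loss = std::inner_product(top_data.begin(), top_data.end(), top_diff.begin(), 0.0f);
  std::cout << "loss-layer contribution = " << loss << std::endl;       // 0.6931

  // A non-loss top blob has loss weight 0, so it contributes nothing to the loss:
  std::vector<float> feat_data(3, 1.5f);
  std::vector<float> feat_diff(feat_data.size(), 0.0f);
  loss = std::inner_product(feat_data.begin(), feat_data.end(), feat_diff.begin(), 0.0f);
  std::cout << "non-loss contribution = " << loss << std::endl;         // 0
  return 0;
}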
//In CPU mode, Forward_cpu(bottom, top) is called.
In layer.hpp:
//--------------- Pure virtual function; subclasses must implement the CPU forward computation -------------
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) = 0;//from here the concrete layer's forward function takes over
//------------------- Backward() ----------------
In net.cpp:
   template <typename Dtype>
void Net<Dtype>::Backward() {
  BackwardFromTo(layers_.size() - 1, 0);
  if (debug_info_) {
    Dtype asum_data = 0, asum_diff = 0, sumsq_data = 0, sumsq_diff = 0;
    for (int i = 0; i < learnable_params_.size(); ++i) {
      asum_data += learnable_params_[i]->asum_data();
      asum_diff += learnable_params_[i]->asum_diff();
      sumsq_data += learnable_params_[i]->sumsq_data();
      sumsq_diff += learnable_params_[i]->sumsq_diff();
    }
    const Dtype l2norm_data = std::sqrt(sumsq_data);
    const Dtype l2norm_diff = std::sqrt(sumsq_diff);
    LOG(ERROR) << "    [Backward] All net params (data, diff): "
               << "L1 norm = (" << asum_data << ", " << asum_diff << "); "
               << "L2 norm = (" << l2norm_data << ", " << l2norm_diff << ")";
  }
}
//-------------------------- BackwardFromTo(layers_.size() - 1, 0) -----------------
// As with the forward pass, several related functions exist for the backward pass, but they are all wrappers around BackwardFromTo(int start, int end)
template <typename Dtype>
void Net<Dtype>::BackwardFromTo(int start, int end) {
  CHECK_GE(end, 0);
  CHECK_LT(start, layers_.size());
  for (int i = start; i >= end; --i) {
    if (layer_need_backward_[i]) {
    // Run the backward computation for each layer
      layers_[i]->Backward(
          top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]);
      if (debug_info_) { BackwardDebugInfo(i); }
    }
  }
}
//---------- layers_[i]->Backward(top_vecs_[i], bottom_need_backward_[i], bottom_vecs_[i]) --------
In layer.hpp:
//Given the gradient with respect to the top outputs, compute the gradient with respect to the inputs and propagate it to the bottom blobs.
//A layer with parameters also computes the gradients with respect to each parameter and stores them internally.
inline void Backward(const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,const vector<Blob<Dtype>*>& bottom);
template <typename Dtype>
inline void Layer<Dtype>::Backward(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  switch (Caffe::mode()) {
  case Caffe::CPU:
    Backward_cpu(top, propagate_down, bottom);
    break;
  case Caffe::GPU:
    Backward_gpu(top, propagate_down, bottom);
    break;
  default:
    LOG(FATAL) << "Unknown caffe mode.";
  }
}
//Assume CPU mode
//---------------------- Backward_cpu(top, propagate_down, bottom) --------
In layer.hpp
//Pure virtual function; derived classes must implement it
 virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down,
      const vector<Blob<Dtype>*>& bottom) = 0;//from here the concrete layer's backward function takes over; this completes the forward and backward passes
  
//UpdateSmoothedLoss(loss, start_iter, average_loss) and ApplyUpdate(), both called from Step(),
//are analyzed clearly in the referenced blogs; a simplified sketch of the loss smoothing follows.
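For reference, a simplified sketch of what the loss smoothing amounts to: keep the last average_loss losses and report their mean. (Caffe's UpdateSmoothedLoss achieves the same result incrementally, with a ring buffer indexed by (iter_ - start_iter) % average_loss.)

//---------------- sketch (not Caffe code): smoothing the reported loss ----------------
#include <cstdio>
#include <deque>
#include <numeric>

class LossSmoother {
 public:
  explicit LossSmoother(int average_loss) : average_loss_(average_loss) {}
  // Add this iteration's loss and return the mean of the last average_loss losses.
  float Update(float loss) {
    losses_.push_back(loss);
    if (static_cast<int>(losses_.size()) > average_loss_) losses_.pop_front();
    float sum = std::accumulate(losses_.begin(), losses_.end(), 0.0f);
    return sum / losses_.size();
  }
 private:
  int average_loss_;
  std::deque<float> losses_;
};

int main() {
  LossSmoother smoother(3);                          // average_loss = 3
  const float raw[] = {2.0f, 0.5f, 1.5f, 1.0f};      // noisy per-iteration losses
  for (int i = 0; i < 4; ++i)
    std::printf("iter %d: raw %.2f  smoothed %.3f\n", i, raw[i], smoother.Update(raw[i]));
  return 0;
}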