caffe并不是thread safe的,在使用深度学习框架部署的实践中,一定要注意这个问题。其他的框架如mxnet、tensorflow也是如此。那么如何解决caffe的这个问题?先明确线程安全相关的概念:
1. boost::thread_specific_ptr
什么是线程不安全呢?多个线程执行同一段代码或接口,如果代码或接口中含有共享的变量,那么会引起线程竞争,引发conflict.
boost中的thread_specific_ptr具有局部线程存储的属性,被其修饰过的变量,在被多线程使用的过程中,会被不同的线程拷贝一份到自己的context中,从而使得不会引发memory conflict或竞争,如下例:
#include <boost/thread/thread.hpp>
#include <boost/thread/mutex.hpp>
#include <boost/thread/tss.hpp>
#include <iostream>
// Serializes access to std::cout so output lines from the two threads
// do not interleave mid-line.
boost::mutex io_mutex;
// Thread-local counter: every thread that touches `price` gets its own
// independent int (thread-specific storage).
boost::thread_specific_ptr<int> price;
// Worker functor run on its own thread: increments the thread-local
// `price` counter ten times, printing it after every increment.
// Because `price` is a boost::thread_specific_ptr, each thread operates
// on its own copy and the counters never interfere.
class Item
{
public:
// explicit: a bare int should not silently convert to an Item.
explicit Item(int id) : id(id){}
void operator()()
{
// First use in this thread: allocate this thread's own counter.
// (nullptr instead of literal 0 for the pointer comparison.)
if (price.get() == nullptr)
{
price.reset(new int(0));
}
for (int i = 0; i < 10; ++i)
{
(*price)++;
// The lock only serializes console output; the counter itself needs
// no locking because it is thread-local.
boost::mutex::scoped_lock lock(io_mutex);
std::cout << id << ": " << *price << std::endl;
}
}
private:
int id;  // worker id shown in the output
};
int main(int argc, char* argv[])
{
boost::thread thrd1(Item(1));
boost::thread thrd2(Item(2));
thrd1.join();
thrd2.join();
return 0;
}
结果如下:
1: 1
1: 2
1: 3
1: 4
1: 5
1: 6
1: 7
1: 8
1: 9
1: 10
2: 1
2: 2
2: 3
2: 4
2: 5
2: 6
2: 7
2: 8
2: 9
2: 10
两个线程对price的修改,彼此不影响.
2. caffe中thread_specific_ptr的使用情况:
在早期的caffe版本中,caffe实例是以单件的形式存在的,如下(v0.9):
shared_ptr<Caffe> Caffe::singleton_;
整个libcaffe.so中只有一份caffe instance,是线程不安全(thread-unsafe)的:在SetDevice(创建cuBlas等context)时,只有一份cuda context,如果多线程调用,多个线程间就会发生cuda context冲突.
在新版本caffe中,在src/caffe/common.cpp中,caffe实例是以如下形式存在的:
// Make sure each thread can have different values.
static boost::thread_specific_ptr<Caffe> thread_instance_;
解决了多个线程之间的cuda context冲突(by the way, 线程切换时,有构造cuda context的开销).
3. thread safe解决的不彻底:
当你基于caffe封装inference sdk之后,经常在部署生产环境的时候,会引发多线程调用同一个inference接口的问题,此时,是由于共享同一份net引起的不安全,那么需要如下解决:
(1)修改syncedmem
这里主要是记录一下device id,防止cpu gpu内存在多线程alloc和release时的冲突(由于device不一致导致)
/*syncedmem.hpp*/
// Allocates `size` bytes of host memory.
// GPU mode: allocates pinned memory via cudaMallocHost, sets *use_cuda to
// true, and records in *alloc_device which CUDA device was current at
// allocation time so the matching CaffeFreeHost can restore it.
// CPU mode (or CPU_ONLY builds): plain malloc, *use_cuda = false.
// NOTE(review): the CPU path leaves *alloc_device untouched — callers are
// expected to have pre-initialized it (SyncedMemory initializes it to -1).
inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda, int *alloc_device) {
#ifndef CPU_ONLY
if (Caffe::mode() == Caffe::GPU) {
// Remember which device the pinned buffer is allocated under.
CUDA_CHECK(cudaGetDevice(alloc_device));
CUDA_CHECK(cudaMallocHost(ptr, size));
*use_cuda = true;
return;
}
#endif
*ptr = malloc(size);
*use_cuda = false;
CHECK(*ptr) << "host allocation of size " << size << " failed";
}
// Frees host memory allocated by CaffeMallocHost.
// @param ptr          pointer returned by CaffeMallocHost
// @param use_cuda     true if the buffer was pinned (cudaMallocHost)
// @param alloc_device device that was current at allocation time, or -1
//                     if unknown; the free runs with that device current
//                     so multi-threaded / multi-GPU alloc-release pairs do
//                     not conflict, then the caller's device is restored.
inline void CaffeFreeHost(void* ptr, bool use_cuda, int alloc_device) {
#ifndef CPU_ONLY
  if (use_cuda) {
    int initial_device;
    cudaGetDevice(&initial_device);
    // Switch to the allocating device before freeing the pinned buffer.
    if (alloc_device != -1) {
      CUDA_CHECK(cudaSetDevice(alloc_device));
    }
    CUDA_CHECK(cudaFreeHost(ptr));
    // BUGFIX: printing a void* with %ld is undefined behavior; %p is the
    // correct conversion specifier for pointers.
    printf("After cudaFreeHost %p %d\n", ptr, alloc_device);
    // Restore whatever device the calling thread had current.
    cudaSetDevice(initial_device);
    return;
  }
#endif
  free(ptr);
}
// Manages one buffer mirrored between host (CPU) and device (GPU) memory,
// lazily synchronizing in whichever direction the next access requires
// (see the SyncedHead state machine below).
class SyncedMemory {
public:
SyncedMemory()
: cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED),
own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false),
gpu_device_(-1), alloc_device_(-1) {}
explicit SyncedMemory(size_t size)
: cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED),
own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false),
gpu_device_(-1), alloc_device_(-1) {}
~SyncedMemory();
// Read-only accessors trigger a sync to the requested side if needed.
const void* cpu_data();
void set_cpu_data(void* data);
const void* gpu_data();
void set_gpu_data(void* data);
// Mutable accessors additionally mark the requested side as the owner of
// the freshest data (head_ moves to HEAD_AT_CPU / HEAD_AT_GPU).
void* mutable_cpu_data();
void* mutable_gpu_data();
// Which side currently holds the authoritative copy.
enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED };
SyncedHead head() { return head_; }
size_t size() { return size_; }
#ifndef CPU_ONLY
void async_gpu_push(const cudaStream_t& stream);
#endif
private:
void to_cpu();
void to_gpu();
void* cpu_ptr_;
void* gpu_ptr_;
size_t size_;
SyncedHead head_;
bool own_cpu_data_;
// True if cpu_ptr_ came from cudaMallocHost and must be freed accordingly.
bool cpu_malloc_use_cuda_;
bool own_gpu_data_;
// Device on which gpu_ptr_ was allocated (-1 if none).
int gpu_device_;
// device used when cpu_ptr_ is allocated
int alloc_device_;
DISABLE_COPY_AND_ASSIGN(SyncedMemory);
}; // class SyncedMemory
// Releases both the host and device copies of the buffer (when owned),
// taking care to free each on the device it was allocated under, and
// restoring the calling thread's current device afterwards.
SyncedMemory::~SyncedMemory() {
// Host side: CaffeFreeHost switches to alloc_device_ internally when the
// buffer is pinned, so multi-device teardown stays consistent.
if (cpu_ptr_ && own_cpu_data_) {
CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_, alloc_device_);
}
#ifndef CPU_ONLY
if (gpu_ptr_ && own_gpu_data_) {
int initial_device;
cudaGetDevice(&initial_device);
// Free the device buffer on the device that allocated it.
if (gpu_device_ != -1) {
CUDA_CHECK(cudaSetDevice(gpu_device_));
}
CUDA_CHECK(cudaFree(gpu_ptr_));
// Restore whatever device the calling thread had current.
cudaSetDevice(initial_device);
}
#endif // CPU_ONLY
}
// Ensures the host copy of the buffer exists and is up to date, advancing
// the head_ state machine accordingly.
inline void SyncedMemory::to_cpu() {
switch (head_) {
case UNINITIALIZED:
// First touch: allocate host memory (recording the current device in
// alloc_device_ for pinned allocations) and zero-fill it.
CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_, &alloc_device_);
caffe_memset(size_, 0, cpu_ptr_);
head_ = HEAD_AT_CPU;
own_cpu_data_ = true;
break;
case HEAD_AT_GPU:
#ifndef CPU_ONLY
// GPU holds the freshest data: allocate host memory if necessary,
// then copy device -> host; both sides now agree.
if (cpu_ptr_ == NULL) {
CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_, &alloc_device_);
own_cpu_data_ = true;
}
caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_);
head_ = SYNCED;
#else
NO_GPU;
#endif
break;
case HEAD_AT_CPU:
case SYNCED:
// Host copy already current: nothing to do.
break;
}
}
(2)处理net共享问题:
// Shared across all threads: the parsed network definition and the master
// net that owns the trained weights.
std::shared_ptr<caffe::NetParameter> param_;
std::shared_ptr<caffe::Net<float> > net_;
// Per-thread: each thread lazily builds its own predictor net (sharing
// weights with net_) and its own scratch input blob, so concurrent
// Inference() calls do not race on net state.
boost::thread_specific_ptr<caffe::Net<float> > predictors_;
boost::thread_specific_ptr<Blob<float> > tmp_blob;
// Parses a network definition (deploy prototxt) from `model_file` and
// upgrades it in place if it was written for an older proto schema.
// @param model_file path to the .prototxt file (read-only — taken by
//                   const reference; the original non-const reference
//                   wrongly implied mutation)
// @return owning pointer to the parsed NetParameter
std::unique_ptr<caffe::NetParameter> LoadNetFromFile(const std::string &model_file)
{
    auto prototxt = std::make_unique<caffe::NetParameter>();
    caffe::ReadProtoFromTextFile(model_file.c_str(), prototxt.get());
    caffe::UpgradeNetAsNeeded(model_file, prototxt.get());
    return prototxt;
}
// Loads trained weights (binary .caffemodel) from `weight_file` and
// upgrades them in place if needed.
// @param weight_file path to the binary proto file (read-only — taken by
//                    const reference, consistent with LoadNetFromFile's
//                    in-parameter contract)
// @return owning pointer to the weight-carrying NetParameter
std::unique_ptr<caffe::NetParameter> LoadWeightsFromFile(const std::string &weight_file)
{
    auto weights = std::make_unique<caffe::NetParameter>();
    caffe::ReadProtoFromBinaryFile(weight_file.c_str(), weights.get());
    caffe::UpgradeNetAsNeeded(weight_file, weights.get());
    return weights;
}
bool Init(int dev_id, int dev_type, const char *model, const char *weights)
{
dev_id_ = dev_id;
dev_type_ = dev_type;
model_ = model;
weights_ = weights;
Caffe::SetDevice(dev_id_);
if(dev_type_ == 0)
{
Caffe::set_mode(Caffe::CPU);
}
else
{
Caffe::set_mode(Caffe::GPU);
}
auto prototxt_net_param = LoadNetFromFile(model_);
auto weights_net_param = LoadWeightsFromFile(weights_);
param_ = std::make_shared<caffe::NetParameter>(*prototxt_net_param);
param_->mutable_state()->set_phase(caffe::TEST);
net_ = std::make_shared<caffe::Net<float> >(*param_);
net_->CopyTrainedLayersFrom(*weights_net_param);
return true;
}
// Runs one forward pass on `img` using this thread's private predictor
// net, creating it (and the thread's scratch blob) on first use.
// Thread safety comes from predictors_/tmp_blob being
// boost::thread_specific_ptr: every calling thread gets its own Net and
// Blob, while trained weights stay shared with the master net_.
void Inference(cv::Mat &img)
{
// Lazily build this thread's predictor from the shared NetParameter and
// point its layers at the master net's trained weights (no weight copy).
if(!predictors_.get())
{
auto predictor = std::make_unique<caffe::Net<float> >(*param_);
predictor->ShareTrainedLayersWith(net_.get());
predictors_.reset(predictor.release());
}
auto *predictor = predictors_.get();
Blob<float> *input_blobs = predictor->input_blobs()[0];
int num_channels = input_blobs->channels();
int input_width = input_blobs->width();
int input_height = input_blobs->height();
// Lazily create this thread's scratch input blob.
if(!tmp_blob.get())
{
auto blob = new Blob<float>();
tmp_blob.reset(blob);
}
Blob<float> *InBlob = tmp_blob.get();
// Single image per pass: batch size fixed at 1.
InBlob->Reshape(1, num_channels, input_height, input_width);
std::vector<cv::Mat> input_channels;
// NOTE(review): wrapInputLayer/preprocess are defined elsewhere —
// presumably they wrap the net's input buffer as cv::Mat planes and write
// the resized/normalized image into them; confirm against their source.
wrapInputLayer(&input_channels, predictor);
preprocess(img, &input_channels, num_channels, input_width, input_height);
CHECK_EQ(1, predictor->input_blobs().size());
// Hand the thread-local blob's host buffer to the net's input blob.
predictor->input_blobs()[0]->ReshapeLike(*InBlob);
predictor->input_blobs()[0]->set_cpu_data(InBlob->mutable_cpu_data());
predictor->Reshape();
predictor->Forward();
}