Solving Caffe's thread-safety problem

 

Caffe is not thread safe. Keep this in mind whenever you deploy a deep learning framework in practice; the same caveat applies to other frameworks such as MXNet and TensorFlow. So how do we work around this in Caffe? First, let's get one threading concept clear:

1. boost::thread_specific_ptr

What does thread-unsafe mean? When multiple threads execute the same code or interface, and that code touches shared variables, the threads race on those variables and conflicts follow.
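A minimal sketch of such a race: two threads bump one shared counter with no synchronization, and increments get lost (the counter name and loop count are illustrative).

#include <boost/thread/thread.hpp>
#include <iostream>

int counter = 0;  // shared by both threads -> data race

void work()
{
    for (int i = 0; i < 100000; ++i)
        ++counter;  // read-modify-write, not atomic
}

int main()
{
    boost::thread t1(work);
    boost::thread t2(work);
    t1.join();
    t2.join();
    // Often prints less than 200000 because concurrent increments are lost.
    std::cout << counter << std::endl;
    return 0;
}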

boost's thread_specific_ptr provides thread-local storage: a variable it manages is instantiated once per thread, in that thread's own context, so concurrent use causes no memory conflicts or races. For example:

#include <boost/thread/thread.hpp>
#include <boost/thread/mutex.hpp>
#include <boost/thread/tss.hpp>
#include <iostream>

boost::mutex io_mutex;
// Each thread gets its own int behind this pointer.
boost::thread_specific_ptr<int> price;

class Item
{
public:

    Item(int id) : id(id) {}

    void operator()()
    {
        // First use in this thread: install this thread's private counter.
        if (price.get() == NULL)
        {
            price.reset(new int(0));
        }

        for (int i = 0; i < 10; ++i)
        {
            (*price)++;  // increments only this thread's copy
            boost::mutex::scoped_lock lock(io_mutex);  // serializes cout only
            std::cout << id << ": " << *price << std::endl;
        }
    }

private:

    int id;
};

int main(int argc, char* argv[])
{
    boost::thread thrd1(Item(1));
    boost::thread thrd2(Item(2));
    thrd1.join();
    thrd2.join();
    return 0;
}

One possible output (each thread counts from 1 to 10 on its own copy of price):

1: 1
1: 2
1: 3
1: 4
1: 5
1: 6
1: 7
1: 8
1: 9
1: 10
2: 1
2: 2
2: 3
2: 4
2: 5
2: 6
2: 7
2: 8
2: 9
2: 10

The two threads' modifications to price do not affect each other.

2. How thread_specific_ptr is used in Caffe:

In early versions of Caffe, the Caffe instance was a singleton (v0.9):

shared_ptr<Caffe> Caffe::singleton_;

The whole libcaffe.so held exactly one Caffe instance, which is not thread safe: SetDevice (which creates the cuBLAS handle and other per-device state) set up a single CUDA context, so when multiple threads called in, they clashed over that one context.

In newer versions of Caffe, in src/caffe/common.cpp, the instance is declared like this instead:

 // Make sure each thread can have different values.
 static boost::thread_specific_ptr<Caffe> thread_instance_;

This resolves the CUDA context conflicts between threads (note, though, that each new thread incurs the overhead of constructing its own CUDA context).
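For reference, the lazy accessor in src/caffe/common.cpp looks roughly like this: the first call from each thread constructs that thread's own Caffe instance, with its own cuBLAS/cuRAND handles.

// src/caffe/common.cpp (roughly)
static boost::thread_specific_ptr<Caffe> thread_instance_;

Caffe& Caffe::Get() {
  if (!thread_instance_.get()) {
    thread_instance_.reset(new Caffe());  // first call from this thread
  }
  return *(thread_instance_.get());
}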

 

3. The thread-safety fix is still incomplete:

Once you wrap an inference SDK on top of Caffe and deploy it to production, you routinely have multiple threads calling the same inference interface. The remaining unsafety comes from those threads sharing one net, and fixing it takes the following changes:
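To make the hazard concrete, here is a sketch of the unsafe pattern (the names are illustrative, not from the SDK): every thread runs Forward() on the one shared net, so they all race on the same input, output, and intermediate blobs.

// Unsafe: all threads share one net and trample each other's blobs.
std::shared_ptr<caffe::Net<float> > shared_net_;

void UnsafeInference(const cv::Mat &img)
{
    // ... copy img into shared_net_->input_blobs()[0] ...
    shared_net_->Forward();  // concurrent callers overwrite activations
    // ... read shared_net_->output_blobs()[0] while another thread writes it ...
}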

(1) Modify syncedmem

The point here is to record the device id at allocation time, so that CPU/GPU memory allocated in one thread is not released against the wrong device in another (the conflict comes from a device mismatch).

/* syncedmem.hpp */

inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda, int *alloc_device) {
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU) {
    // Record which device was current when the pinned memory was allocated.
    CUDA_CHECK(cudaGetDevice(alloc_device));
    CUDA_CHECK(cudaMallocHost(ptr, size));
    *use_cuda = true;
    return;
  }
#endif
  *ptr = malloc(size);
  *use_cuda = false;
  CHECK(*ptr) << "host allocation of size " << size << " failed";
}

inline void CaffeFreeHost(void* ptr, bool use_cuda, int alloc_device) {
#ifndef CPU_ONLY
  if (use_cuda) {
    // Free against the device that did the allocation, then restore
    // whatever device was current on entry.
    int initial_device;
    cudaGetDevice(&initial_device);
    if (alloc_device != -1) {
      CUDA_CHECK(cudaSetDevice(alloc_device));
    }
    CUDA_CHECK(cudaFreeHost(ptr));
    cudaSetDevice(initial_device);
    return;
  }
#endif
  free(ptr);
}

class SyncedMemory {
 public:
  SyncedMemory()
      : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED),
        own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false),
        gpu_device_(-1), alloc_device_(-1) {}
  explicit SyncedMemory(size_t size)
      : cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED),
        own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false),
        gpu_device_(-1), alloc_device_(-1) {}
  ~SyncedMemory();
  const void* cpu_data();
  void set_cpu_data(void* data);
  const void* gpu_data();
  void set_gpu_data(void* data);
  void* mutable_cpu_data();
  void* mutable_gpu_data();
  enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED };
  SyncedHead head() { return head_; }
  size_t size() { return size_; }

#ifndef CPU_ONLY
  void async_gpu_push(const cudaStream_t& stream);
#endif

 private:
  void to_cpu();
  void to_gpu();
  void* cpu_ptr_;
  void* gpu_ptr_;
  size_t size_;
  SyncedHead head_;
  bool own_cpu_data_;
  bool cpu_malloc_use_cuda_;
  bool own_gpu_data_;
  int gpu_device_;
  // device used when cpu_ptr_ is allocated
  int alloc_device_;

  DISABLE_COPY_AND_ASSIGN(SyncedMemory);
};  // class SyncedMemory




/* syncedmem.cpp */

SyncedMemory::~SyncedMemory() {
  if (cpu_ptr_ && own_cpu_data_) {
    CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_, alloc_device_);
  }

#ifndef CPU_ONLY
  if (gpu_ptr_ && own_gpu_data_) {
    int initial_device;
    cudaGetDevice(&initial_device);
    if (gpu_device_ != -1) {
      CUDA_CHECK(cudaSetDevice(gpu_device_));
    }
    CUDA_CHECK(cudaFree(gpu_ptr_));
    cudaSetDevice(initial_device);
  }
#endif  // CPU_ONLY
}

inline void SyncedMemory::to_cpu() {
  switch (head_) {
  case UNINITIALIZED:
    CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_, &alloc_device_);
    caffe_memset(size_, 0, cpu_ptr_);
    head_ = HEAD_AT_CPU;
    own_cpu_data_ = true;
    break;
  case HEAD_AT_GPU:
#ifndef CPU_ONLY
    if (cpu_ptr_ == NULL) {
      CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_, &alloc_device_);
      own_cpu_data_ = true;
    }
    caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_);
    head_ = SYNCED;
#else
    NO_GPU;
#endif
    break;
  case HEAD_AT_CPU:
  case SYNCED:
    break;
  }
}

(2) Handle the shared-net problem:

// Shared by every thread: the parsed net definition and one master net
// that owns the trained weights.
std::shared_ptr<caffe::NetParameter> param_;
std::shared_ptr<caffe::Net<float> > net_;

// Per-thread: each thread lazily creates its own predictor net and scratch blob.
boost::thread_specific_ptr<caffe::Net<float> > predictors_;
boost::thread_specific_ptr<Blob<float> > tmp_blob;


std::unique_ptr<caffe::NetParameter> LoadNetFromFile(const std::string &model_file)
{
    auto prototxt = std::make_unique<caffe::NetParameter>();
    caffe::ReadProtoFromTextFile(model_file.c_str(), prototxt.get());
    caffe::UpgradeNetAsNeeded(model_file, prototxt.get());
    return prototxt;
}

std::unique_ptr<caffe::NetParameter> LoadWeightsFromFile(const std::string &weight_file)
{
    auto weights = std::make_unique<caffe::NetParameter>();
    caffe::ReadProtoFromBinaryFile(weight_file.c_str(), weights.get());
    caffe::UpgradeNetAsNeeded(weight_file, weights.get());
    return weights;
}

bool Init(int dev_id, int dev_type, const char *model, const char *weights)
{
    dev_id_ = dev_id;
    dev_type_ = dev_type;
    model_ = model;
    weights_ = weights;

    Caffe::SetDevice(dev_id_);
    if (dev_type_ == 0)
    {
        Caffe::set_mode(Caffe::CPU);
    }
    else
    {
        Caffe::set_mode(Caffe::GPU);
    }

    // Build the master net once in TEST phase and load the trained weights.
    auto prototxt_net_param = LoadNetFromFile(model_);
    auto weights_net_param = LoadWeightsFromFile(weights_);
    param_ = std::make_shared<caffe::NetParameter>(*prototxt_net_param);
    param_->mutable_state()->set_phase(caffe::TEST);
    net_ = std::make_shared<caffe::Net<float> >(*param_);
    net_->CopyTrainedLayersFrom(*weights_net_param);

    return true;
}

void Inference(cv::Mat &img)
{
    // First call from this thread: build a private predictor that shares
    // the trained weights with the master net.
    if (!predictors_.get())
    {
        auto predictor = std::make_unique<caffe::Net<float> >(*param_);
        predictor->ShareTrainedLayersWith(net_.get());
        predictors_.reset(predictor.release());
    }
    auto *predictor = predictors_.get();

    Blob<float> *input_blobs = predictor->input_blobs()[0];
    int num_channels = input_blobs->channels();
    int input_width = input_blobs->width();
    int input_height = input_blobs->height();

    // Per-thread scratch blob that holds the preprocessed input.
    if (!tmp_blob.get())
    {
        tmp_blob.reset(new Blob<float>());
    }
    Blob<float> *InBlob = tmp_blob.get();
    InBlob->Reshape(1, num_channels, input_height, input_width);

    std::vector<cv::Mat> input_channels;
    wrapInputLayer(&input_channels, predictor);
    preprocess(img, &input_channels, num_channels, input_width, input_height);

    CHECK_EQ(1, predictor->input_blobs().size());
    predictor->input_blobs()[0]->ReshapeLike(*InBlob);
    predictor->input_blobs()[0]->set_cpu_data(InBlob->mutable_cpu_data());
    predictor->Reshape();
    predictor->Forward();
}
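A hypothetical multi-threaded caller of the wrappers above (Init/Inference as defined in this section; the file names are placeholders). Because Caffe's state is per-thread (section 2), each worker thread must set its own mode and device before calling Inference:

#include <boost/thread/thread.hpp>
#include <opencv2/opencv.hpp>

int main()
{
    // One-time setup in the main thread: parse the prototxt, load weights.
    Init(/*dev_id=*/0, /*dev_type=*/1, "deploy.prototxt", "weights.caffemodel");

    cv::Mat img1 = cv::imread("a.jpg");
    cv::Mat img2 = cv::imread("b.jpg");

    // Each worker lazily builds its own predictor via thread_specific_ptr,
    // but all predictors share one copy of the weights
    // (ShareTrainedLayersWith), so memory cost stays low.
    boost::thread t1([&] {
        Caffe::SetDevice(0);
        Caffe::set_mode(Caffe::GPU);  // Caffe state is thread-local
        Inference(img1);
    });
    boost::thread t2([&] {
        Caffe::SetDevice(0);
        Caffe::set_mode(Caffe::GPU);
        Inference(img2);
    });
    t1.join();
    t2.join();
    return 0;
}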

 
