caffe并不是thread safe的,在使用深度学习框架部署的实践中,一定要注意这个问题。其他的框架如mxnet、tensorflow也是如此。那么如何解决caffe的这个问题?先明确线程安全相关的概念:
1. boost::thread_specific_ptr
什么是线程不安全呢?多个线程执行同一段代码或接口,如果代码或接口中含有共享的变量,那么会引起线程竞争,引发conflict.
boost中的thread_specific_ptr具有局部线程存储的属性,被其修饰过的变量,在被多线程使用的过程中,会被不同的线程拷贝一份到自己的context中,从而使得不会引发memory conflict或竞争,如下例:
#include <boost/thread/thread.hpp>
#include <boost/thread/mutex.hpp>
#include <boost/thread/tss.hpp>
#include <iostream>
// Serializes access to std::cout so output lines from the two threads
// do not interleave mid-line.
boost::mutex io_mutex;
// Thread-local counter: every thread that touches `price` gets its own
// independent int (thread-specific storage).
boost::thread_specific_ptr<int> price;
// Worker functor run on its own thread: increments the thread-local
// `price` counter ten times, printing it after every increment.
// Because `price` is a boost::thread_specific_ptr, each thread operates
// on its own copy and the counters never interfere.
class Item
{
public:
// explicit: a bare int should not silently convert to an Item.
explicit Item(int id) : id(id){}
void operator()()
{
// First use in this thread: allocate this thread's own counter.
// (nullptr instead of literal 0 for the pointer comparison.)
if (price.get() == nullptr)
{
price.reset(new int(0));
}
for (int i = 0; i < 10; ++i)
{
(*price)++;
// The lock only serializes console output; the counter itself needs
// no locking because it is thread-local.
boost::mutex::scoped_lock lock(io_mutex);
std::cout << id << ": " << *price << std::endl;
}
}
private:
int id;  // worker id shown in the output
};
int main(int argc, char* argv[])
{
boost::thread thrd1(Item(1));
boost::thread thrd2(Item(2));
thrd1.join();
thrd2.join();
return 0;
}
结果如下:
1: 1
1: 2
1: 3
1: 4
1: 5
1: 6
1: 7
1: 8
1: 9
1: 10
2: 1
2: 2
2: 3
2: 4
2: 5
2: 6
2: 7
2: 8
2: 9
2: 10
两个线程对price的修改,彼此不影响.
2. caffe中thread_specific_ptr的使用情况:
在早期的caffe版本中,caffe实例是以单件的形式存在的,如下(v0.9):
shared_ptr<Caffe> Caffe::singleton_;
整个libcaffe.so中只有一份caffe instance,是线程不安全(thread-unsafe)的:在SetDevice(创建cuBlas等context)时,只有一份cuda context,如果多线程调用,多个线程间就会发生cuda context冲突.
在新版本caffe中,在src/caffe/common.cpp中,caffe实例是以如下形式存在的:
// Make sure each thread can have different values.
static boost::thread_specific_ptr<Caffe> thread_instance_;
解决了多个线程之间的cuda context冲突(by the way, 线程切换时,有构造cuda context的开销).
3. thread safe解决的不彻底:
当你基于caffe封装inference sdk之后,经常在部署生产环境的时候,会引发多线程调用同一个inference接口的问题,此时,是由于共享同一份net引起的不安全,那么需要如下解决:
(1)修改syncedmem
这里主要是记录一下device id,防止cpu gpu内存在多线程alloc和release时的冲突(由于device不一致导致)
/*syncedmem.hpp*/
// Allocates `size` bytes of host memory.
// GPU mode: allocates pinned memory via cudaMallocHost, sets *use_cuda to
// true, and records in *alloc_device which CUDA device was current at
// allocation time so the matching CaffeFreeHost can restore it.
// CPU mode (or CPU_ONLY builds): plain malloc, *use_cuda = false.
// NOTE(review): the CPU path leaves *alloc_device untouched — callers are
// expected to have pre-initialized it (SyncedMemory initializes it to -1).
inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda, int *alloc_device) {
#ifndef CPU_ONLY
if (Caffe::mode() == Caffe::GPU) {
// Remember which device the pinned buffer is allocated under.
CUDA_CHECK(cudaGetDevice(alloc_device));
CUDA_CHECK(cudaMallocHost(ptr, size));
*use_cuda = true;
return;
}
#endif
*ptr = malloc(size);
*use_cuda = false;
CHECK(*ptr) << "host allocation of size " << size << " failed";
}
// Frees host memory allocated by CaffeMallocHost.
// @param ptr          pointer returned by CaffeMallocHost
// @param use_cuda     true if the buffer was pinned (cudaMallocHost)
// @param alloc_device device that was current at allocation time, or -1
//                     if unknown; the free runs with that device current
//                     so multi-threaded / multi-GPU alloc-release pairs do
//                     not conflict, then the caller's device is restored.
inline void CaffeFreeHost(void* ptr, bool use_cuda, int alloc_device) {
#ifndef CPU_ONLY
  if (use_cuda) {
    int initial_device;
    cudaGetDevice(&initial_device);
    // Switch to the allocating device before freeing the pinned buffer.
    if (alloc_device != -1) {
      CUDA_CHECK(cudaSetDevice(alloc_device));
    }
    CUDA_CHECK(cudaFreeHost(ptr));
    // BUGFIX: printing a void* with %ld is undefined behavior; %p is the
    // correct conversion specifier for pointers.
    printf("After cudaFreeHost %p %d\n", ptr, alloc_device);
    // Restore whatever device the calling thread had current.
    cudaSetDevice(initial_device);
    return;
  }
#endif
  free(ptr);
}
// Manages one buffer mirrored between host (CPU) and device (GPU) memory,
// lazily synchronizing in whichever direction the next access requires
// (see the SyncedHead state machine below).
class SyncedMemory {
public:
SyncedMemory()
: cpu_ptr_(NULL), gpu_ptr_(NULL), size_(0), head_(UNINITIALIZED),
own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false),
gpu_device_(-1), alloc_device_(-1) {}
explicit SyncedMemory(size_t size)
: cpu_ptr_(NULL), gpu_ptr_(NULL), size_(size), head_(UNINITIALIZED),
own_cpu_data_(false), cpu_malloc_use_cuda_(false), own_gpu_data_(false),
gpu_device_(-1), alloc_device_(-1) {}
~SyncedMemory();
// Read-only accessors trigger a sync to the requested side if needed.
const void* cpu_data();
void set_cpu_data(void* data);
const void* gpu_data();
void set_gpu_data(void* data);
// Mutable accessors additionally mark the requested side as the owner of
// the freshest data (head_ moves to HEAD_AT_CPU / HEAD_AT_GPU).
void* mutable_cpu_data();
void* mutable_gpu_data();
// Which side currently holds the authoritative copy.
enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED };
SyncedHead head() { return head_; }
size_t size() { return size_; }
#ifndef CPU_ONLY
void async_gpu_push(const cudaStream_t& stream);
#endif
private:
void to_cpu();
void to_gpu();
void* cpu_ptr_;
void* gpu_ptr_;
size_t size_;
SyncedHead head_;
bool own_cpu_data_;
// True if cpu_ptr_ came from cudaMallocHost and must be freed accordingly.
bool cpu_malloc_use_cuda_;
bool own_gpu_data_;
// Device on which gpu_ptr_ was allocated (-1 if none).
int gpu_device_;
// device used when cpu_ptr_ is allocated
int alloc_device_;
DISABLE_COPY_AND_ASSIGN(SyncedMemory);
}; // class SyncedMemory
// Releases both the host and device copies of the buffer (when owned),
// taking care to free each on the device it was allocated under, and
// restoring the calling thread's current device afterwards.
SyncedMemory::~SyncedMemory() {
// Host side: CaffeFreeHost switches to alloc_device_ internally when the
// buffer is pinned, so multi-device teardown stays consistent.
if (cpu_ptr_ && own_cpu_data_) {
CaffeFreeHost(cpu_ptr_, cpu_malloc_use_cuda_, alloc_device_);
}
#ifndef CPU_ONLY
if (gpu_ptr_ && own_gpu_data_) {
int initial_device;
cudaGetDevice(&initial_device);
// Free the device buffer on the device that allocated it.
if (gpu_device_ != -1) {
CUDA_CHECK(cudaSetDevice(gpu_device_));
}
CUDA_CHECK(cudaFree(gpu_ptr_));
// Restore whatever device the calling thread had current.
cudaSetDevice(initial_device);
}
#endif // CPU_ONLY
}
// Ensures the host copy of the buffer exists and is up to date, advancing
// the head_ state machine accordingly.
inline void SyncedMemory::to_cpu() {
switch (head_) {
case UNINITIALIZED:
// First touch: allocate host memory (recording the current device in
// alloc_device_ for pinned allocations) and zero-fill it.
CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_, &alloc_device_);
caffe_memset(size_, 0, cpu_ptr_);
head_ = HEAD_AT_CPU;
own_cpu_data_ = true;
break;
case HEAD_AT_GPU:
#ifndef CPU_ONLY
// GPU holds the freshest data: allocate host memory if necessary,
// then copy device -> host; both sides now agree.
if (cpu_ptr_ == NULL) {
CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_, &alloc_device_);
own_cpu_data_ = true;
}
caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_);
head_ = SYNCED;
#else
NO_GPU;
#endif
break;
case HEAD_AT_CPU:
case SYNCED:
// Host copy already current: nothing to do.
break;
}
}
(2)处理net共享问题:
// Shared across all threads: the parsed network definition and the master
// net that owns the trained weights.
std::shared_ptr<caffe::NetParameter> param_;
std::shared_ptr<caffe::Net<float> > net_;
// Per-thread: each thread lazily builds its own predictor net (sharing
// weights with net_) and its own scratch input blob, so concurrent
// Inference() calls do not race on net state.
boost::thread_specific_ptr<caffe::Net<float> > predictors_;
boost::thread_specific_ptr<Blob<float> > tmp_blob;
// Parses a network definition (deploy prototxt) from `model_file` and
// upgrades it in place if it was written for an older proto schema.
// @param model_file path to the .prototxt file (read-only — taken by
//                   const reference; the original non-const reference
//                   wrongly implied mutation)
// @return owning pointer to the parsed NetParameter
std::unique_ptr<caffe::NetParameter> LoadNetFromFile(const std::string &model_file)
{
    auto prototxt = std::make_unique<caffe::NetParameter>();
    caffe::ReadProtoFromTextFile(model_file.c_str(), prototxt.get());
    caffe::UpgradeNetAsNeeded(model_file, prototxt.get());
    return prototxt;
}
// Loads trained weights (binary .caffemodel) from `weight_file` and
// upgrades them in place if needed.
// @param weight_file path to the binary proto file (read-only — taken by
//                    const reference, consistent with LoadNetFromFile's
//                    in-parameter contract)
// @return owning pointer to the weight-carrying NetParameter
std::unique_ptr<caffe::NetParameter> LoadWeightsFromFile(const std::string &weight_file)
{
    auto weights = std::make_unique<caffe::NetParameter>();
    caffe::ReadProtoFromBinaryFile(weight_file.c_str(), weights.get());
    caffe::UpgradeNetAsNeeded(weight_file, weights.get());
    return weights;
}
bool Init(int dev_id, int dev_type, const char *model, const char *weights)
{
dev_id_ = dev_id;
dev_type_ = dev_type;
model_ = model;
weights_ = weights;
Caffe::SetDevice(dev_id_);
if(dev_type_ == 0)
{
Caffe::set_mode(Caffe::CPU);
}
else
{
Caffe::set_mode(Caffe::GPU);
}
auto prototxt_net_param = LoadNetFromFile(model_);
auto weights_net_param = LoadWeightsFromFile(weights_);
param_ = std::make_shared<caffe::NetParameter>(*prototxt_net_param);
param_->mutable_state()->set_phase(caffe::TEST);
net_ = std::make_shared<caffe::Net<float> >(*param_);
net_->CopyTrainedLayersFrom(*weights_net_param);
return true;
}
// Runs one forward pass on `img` using this thread's private predictor
// net, creating it (and the thread's scratch blob) on first use.
// Thread safety comes from predictors_/tmp_blob being
// boost::thread_specific_ptr: every calling thread gets its own Net and
// Blob, while trained weights stay shared with the master net_.
void Inference(cv::Mat &img)
{
// Lazily build this thread's predictor from the shared NetParameter and
// point its layers at the master net's trained weights (no weight copy).
if(!predictors_.get())
{
auto predictor = std::make_unique<caffe::Net<float> >(*param_);
predictor->ShareTrainedLayersWith(net_.get());
predictors_.reset(predictor.release());
}
auto *predictor = predictors_.get();
Blob<float> *input_blobs = predictor->input_blobs()[0];
int num_channels = input_blobs->channels();
int input_width = input_blobs->width();
int input_height = input_blobs->height();
// Lazily create this thread's scratch input blob.
if(!tmp_blob.get())
{
auto blob = new Blob<float>();
tmp_blob.reset(blob);
}
Blob<float> *InBlob = tmp_blob.get();
// Single image per pass: batch size fixed at 1.
InBlob->Reshape(1, num_channels, input_height, input_width);
std::vector<cv::Mat> input_channels;
// NOTE(review): wrapInputLayer/preprocess are defined elsewhere —
// presumably they wrap the net's input buffer as cv::Mat planes and write
// the resized/normalized image into them; confirm against their source.
wrapInputLayer(&input_channels, predictor);
preprocess(img, &input_channels, num_channels, input_width, input_height);
CHECK_EQ(1, predictor->input_blobs().size());
// Hand the thread-local blob's host buffer to the net's input blob.
predictor->input_blobs()[0]->ReshapeLike(*InBlob);
predictor->input_blobs()[0]->set_cpu_data(InBlob->mutable_cpu_data());
predictor->Reshape();
predictor->Forward();
}