TensorRT只支持对channel通道的softmax,对其他通道不支持,而SSD中的softmax不是对channel通道的分类,故要想实现对SSD的TensorRT加速,需要手动编写softmax层的IPlugin代码。
//Softmax layer . TensorRT softmax only support cross channel class SoftmaxPlugin : public IPlugin { public: SoftmaxPlugin(int softmax_axis) {// 通过构造函数,将prototxt中SoftMax层的参数(分类的维度索引)传进来 softmax_axis_ = softmax_axis; } SoftmaxPlugin(const void* buffer, size_t size) {// 将通过serialize函数保存到engine文件中的内容解析出来 assert(size == 3 * sizeof(int)); const int* d = reinterpret_cast<const int*>(buffer); outer_num_ = d[0]; shape_softmax_axis_ = d[1]; inner_num_ = d[2]; } inline int getNbOutputs() const override { return 1; }//第一步 Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override {//第二步 assert(1 == nbInputDims); assert(0 == index); assert(3 == inputs[index].nbDims); // softmax层的输入输出维度完全一致 return DimsCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]); } void configure(const Dims*inputs, int nbInputs, const Dims* outputs, int nbOutputs, int maxBatchSize) override {//第三步 assert(softmax_axis_ < 4); // softmax层的数据维度以维度softmax_axis_为中心分割开,参考caffe softmax层的源码 int count = 1; for (int i = 0; i < softmax_axis_; ++i) { if(0 == i) count *= maxBatchSize; else count *= inputs[0].d[i-1]; } outer_num_ = count; if(0 == softmax_axis_) shape_softmax_axis_ = maxBatchSize; else shape_softmax_axis_ = inputs[0].d[softmax_axis_-1]; count = 1; for (int i = softmax_axis_+1; i < 4; ++i) { if(0 == i) count *= maxBatchSize; else count *= inputs[0].d[i-1]; } inner_num_ = count; } int initialize() override {//第四步 // Initialize CUDNN. 
CUDNN_CHECK(cudnnCreate(&handle_)); cudnn::createTensor4dDesc<float>(&bottom_desc_); cudnn::createTensor4dDesc<float>(&top_desc_); int N = outer_num_; int K = shape_softmax_axis_; int H = inner_num_; int W = 1; cudnn::setTensor4dDesc<float>(&bottom_desc_, N, K, H, W); cudnn::setTensor4dDesc<float>(&top_desc_, N, K, H, W); handles_setup_ = true; return 0; } size_t getSerializationSize() override { return 3 * sizeof(int); } void serialize(void* buffer) override {//第五步 int* d = reinterpret_cast<int*>(buffer); d[0] = outer_num_; d[1] = shape_softmax_axis_; d[2] = inner_num_; } inline void terminate() override {//第六步 // Check that handles have been setup before destroying. if (!handles_setup_) { return; } cudnnDestroyTensorDescriptor(bottom_desc_); cudnnDestroyTensorDescriptor(top_desc_); cudnnDestroy(handle_); } inline size_t getWorkspaceSize(int) const override { return 0; } int enqueue(int batchSize, const void*const *inputs, void** outputs, void*, cudaStream_t stream) override { // 参考caffe softmax层cudnn前向计算代码 const float* bottom_data = (float*)inputs[0]; float* top_data = (float*)outputs[0]; CUDNN_CHECK(cudnnSoftmaxForward(handle_, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, cudnn::dataType<float>::one, bottom_desc_, bottom_data, cudnn::dataType<float>::zero, top_desc_, top_data)); return 0; } protected: int outer_num_; int inner_num_; int softmax_axis_; int shape_softmax_axis_; bool handles_setup_; cudnnHandle_t handle_; cudnnTensorDescriptor_t bottom_desc_; cudnnTensorDescriptor_t top_desc_; };