Deformable Convolution: Principle, Implementation, and Production Deployment

Introduction

This article uses the object detection algorithm CenterNet as a vehicle to introduce the principle of deformable convolution, its Python implementation, and its production deployment.

The Principle of Deformable Convolution

Deformable convolution currently exists in two versions: DCN V1 and DCN V2, which was developed as an improvement on V1. For the detailed algorithm, see the original papers or this reference:

https://cloud.tencent.com/developer/article/1679389
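
In brief: a standard convolution samples the input feature map x on a fixed grid, so for an output location p0 and the K kernel offsets pk it computes y(p0) = Σk wk · x(p0 + pk). DCN V1 adds a learned offset Δpk to every sampling point, and DCN V2 additionally multiplies every sample by a learned modulation scalar mk ∈ [0, 1]:

y(p0) = Σk wk · x(p0 + pk + Δpk) · mk

Both the offsets and the modulation scalars are predicted from the same input by an extra convolution branch; because p0 + pk + Δpk is generally fractional, x is sampled with bilinear interpolation.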

Implementing Deformable Convolution

The explanation here mainly follows the walkthrough in: https://cloud.tencent.com/developer/article/1638363
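
To make this section self-contained, here is a minimal sketch of a modulated deformable convolution (DCN V2) layer built on torchvision.ops.deform_conv2d (assumed available, torchvision >= 0.9); it is a reference illustration under those assumptions, not the optimized CUDA implementation discussed in the linked article:

import math
import torch
import torch.nn as nn
from torchvision.ops import deform_conv2d

class DCNv2(nn.Module):
    def __init__(self, in_ch, out_ch, kernel_size=3, stride=1, padding=1,
                 dilation=1, deformable_groups=1):
        super().__init__()
        self.stride, self.padding, self.dilation = stride, padding, dilation
        self.weight = nn.Parameter(torch.empty(out_ch, in_ch, kernel_size, kernel_size))
        self.bias = nn.Parameter(torch.zeros(out_ch))
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        # A single conv predicts 2*k*k offset channels plus k*k mask channels
        # per deformable group, at the same spatial resolution as the output.
        self.conv_offset_mask = nn.Conv2d(
            in_ch, deformable_groups * 3 * kernel_size * kernel_size,
            kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation)
        # Zero-initializing the branch makes the layer start out as a plain conv.
        nn.init.zeros_(self.conv_offset_mask.weight)
        nn.init.zeros_(self.conv_offset_mask.bias)
        self.split = deformable_groups * 2 * kernel_size * kernel_size

    def forward(self, x):
        out = self.conv_offset_mask(x)
        offset = out[:, :self.split]               # (N, 2*g*k*k, H', W')
        mask = torch.sigmoid(out[:, self.split:])  # modulation scalars in [0, 1]
        return deform_conv2d(x, offset, self.weight, self.bias,
                             stride=(self.stride, self.stride),
                             padding=(self.padding, self.padding),
                             dilation=(self.dilation, self.dilation),
                             mask=mask)

x = torch.randn(1, 64, 32, 32)
print(DCNv2(64, 128)(x).shape)  # torch.Size([1, 128, 32, 32])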

Production Deployment of DCN V2 in CenterNet

The code referenced here is mainly: https://github.com/CaoWGG/TensorRT-CenterNet

A few points need to be explained up front:

  • To quantize and deploy the model, CenterNet must first be converted to ONNX, and then TensorRT's official onnx-tensorrt library is used to convert the ONNX model into a TensorRT engine (a sketch of the export step follows this list)

  • TensorRT has no native deformable convolution operation, so it has to be implemented as a custom plugin
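
As mentioned in the first point, the custom op must appear in the exported ONNX graph under a name the importer recognizes. Below is a hedged sketch (the class name DCNv2Op is hypothetical; the actual Function in the TensorRT-CenterNet repository may differ in details) of emitting such a node with a torch.autograd.Function whose symbolic() produces a "DCNv2" node carrying the attribute names that the importer shown later reads:

import torch
from torch.autograd import Function
from torchvision.ops import deform_conv2d

class DCNv2Op(Function):
    @staticmethod
    def symbolic(g, input, offset, mask, weight, bias,
                 stride, padding, dilation, groups, deformable_group):
        # Emit a custom "DCNv2" ONNX node; the input order and attribute
        # names mirror what the onnx-tensorrt importer reads via OnnxAttrs.
        return g.op("DCNv2", input, offset, mask, weight, bias,
                    stride_i=stride, padding_i=padding, dilation_i=dilation,
                    groups_i=groups, deformable_group_i=deformable_group)

    @staticmethod
    def forward(ctx, input, offset, mask, weight, bias,
                stride, padding, dilation, groups, deformable_group):
        # At export/trace time any numerically correct reference works;
        # actual inference runs inside the TensorRT plugin.
        return deform_conv2d(input, offset, weight, bias,
                             stride=(stride, stride), padding=(padding, padding),
                             dilation=(dilation, dilation), mask=mask)

# Usage during export (bias assumed present here):
# y = DCNv2Op.apply(x, offset, mask, weight, bias, 1, 1, 1, 1, 1)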

CenterNet's DCNv2 plugin is defined in the onnx-tensorrt folder across four files: dcn_v2_im2col_cuda.cu and dcn_v2_im2col_cuda.h (the CUDA im2col kernels), plus DCNv2.cpp and DCNv2.h, which are the custom TensorRT plugin proper. The plugin code is as follows:

DCNv2Plugin::DCNv2Plugin(int in_channel,
                         int out_channel,
                         int kernel_H,
                         int kernel_W,
                         int deformable_group,
                         int dilation,
                         int groups,
                         int padding,
                         int stride,
                         nvinfer1::Weights const &weight, nvinfer1::Weights const &bias)
    : _in_channel(in_channel), _out_channel(out_channel),
      _kernel_H(kernel_H), _kernel_W(kernel_W),
      _deformable_group(deformable_group), _dilation(dilation),
      _groups(groups), _padding(padding), _stride(stride),
      _initialized(false) {
    // Copy the FP32 weights and bias into host-side vectors;
    // they are uploaded to the GPU in initialize().
    if (weight.type == nvinfer1::DataType::kFLOAT) {
        _h_weight.assign((float*)weight.values, (float*)weight.values + weight.count);
    } else {
        throw std::runtime_error("Unsupported weight dtype");
    }
    if (bias.type == nvinfer1::DataType::kFLOAT) {
        _h_bias.assign((float*)bias.values, (float*)bias.values + bias.count);
    } else {
        throw std::runtime_error("Unsupported bias dtype");
    }
}

// Initialization: allocate device memory for the parameters ahead of time
int DCNv2Plugin::initialize() {
    if(_initialized) return 0;
    auto _output_dims = this->getOutputDimensions(0, &this->getInputDims(0), 3);
    assert(is_CHW(this->getInputDims(0)));
    assert(is_CHW(_output_dims));
    size_t ones_size = _output_dims.d[1] * _output_dims.d[2] * sizeof(float);
    size_t weight_size = _h_weight.size() * sizeof(float);
    size_t bias_size = _h_bias.size() * sizeof(float);
    float *ones_cpu = new float[ones_size / sizeof(float)];
    for (int i = 0; i < ones_size / sizeof(float); i++) {
        ones_cpu[i] = 1.0;
    }
    CHECK_CUDA(cudaMalloc((void**)&_d_columns, _in_channel * _kernel_H * _kernel_W * ones_size));
    CHECK_CUDA(cudaMalloc((void**)&_d_ones, ones_size));
    CHECK_CUDA(cudaMalloc((void**)&_d_weight, weight_size));
    CHECK_CUDA(cudaMalloc((void**)&_d_bias, bias_size));
    CHECK_CUDA(cudaMemcpy(_d_ones, ones_cpu, ones_size, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(_d_weight, _h_weight.data(), weight_size, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(_d_bias, _h_bias.data(), bias_size, cudaMemcpyHostToDevice));
    delete[] ones_cpu;
    _initialized = true;

    return 0;
}
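
Note the sizing of _d_columns: ones_size equals out_H * out_W * sizeof(float), so the buffer holds an (in_channel * kernel_H * kernel_W) x (out_H * out_W) im2col matrix, which is exactly the shape the GEMM in enqueue() consumes. The ones vector itself is only used to broadcast the bias across all spatial positions.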

// Releases the device memory allocated in initialize()
void DCNv2Plugin::terminate() {
    if (!_initialized) {
        return;
    }
    cudaFree(_d_columns);
    cudaFree(_d_bias);
    cudaFree(_d_weight);
    cudaFree(_d_ones);
    _initialized = false;
}

DCNv2Plugin::~DCNv2Plugin() {
    terminate();
}

// Reports which data types/formats the plugin supports: only FP32 (kFLOAT) here
bool DCNv2Plugin::supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const {
    return (type == nvinfer1::DataType::kFLOAT);
}
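
Since only kFLOAT is accepted, the plugin runs in FP32; building an FP16 or INT8 engine through this path would require extending supportsFormat and the underlying kernels accordingly.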

// Reports the per-sample output dimensions; in TensorRT's implicit-batch plugin API
// the batch dimension is excluded here and handled by TensorRT itself
nvinfer1::Dims DCNv2Plugin::getOutputDimensions(int index, const nvinfer1::Dims *inputDims, int nbInputs) {
    assert(index == 0);
    assert(inputDims);
    assert(nbInputs == 3);
    nvinfer1::Dims const& input = inputDims[0];
    assert(is_CHW(input));
    nvinfer1::Dims output;
    output.nbDims = input.nbDims;
    for( int d=0; d<input.nbDims; ++d ) {
        output.type[d] = input.type[d];
        output.d[d] = input.d[d];
    }
    output.d[0] = _out_channel;
    output.d[1] = (output.d[1] + 2 * _padding - (_dilation * (_kernel_H - 1) + 1)) / _stride + 1;
    output.d[2] = (output.d[2] + 2 * _padding - (_dilation * (_kernel_W - 1) + 1)) / _stride + 1;
    return output;
}
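
As a quick sanity check of the formula: with a 3x3 kernel, stride 1, padding 1, and dilation 1 (the typical DCN configuration in CenterNet's upsampling head), the output spatial size is (H + 2*1 - 3) / 1 + 1 = H, i.e. the resolution is preserved. Note the width formula uses _kernel_W, matching the computation in enqueue() below.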

// Returns the scratch workspace size this plugin requests from TensorRT;
// zero here, because all temporary buffers were allocated up front in initialize()
size_t DCNv2Plugin::getWorkspaceSize(int maxBatchSize) const {
    return 0;
}
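
An alternative design would be to report the im2col buffer size here and use the workspace pointer TensorRT passes to enqueue(), letting TensorRT manage the scratch memory instead of the plugin allocating it itself.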

// The actual execution of the op defined by this plugin
int DCNv2Plugin::enqueue(int batchSize, const void *const *inputs, void **outputs, void *workspace,
                         cudaStream_t stream) {
    float alpha, beta;
    int m, n, k;

    cublasHandle_t handle = blas_handle();
    const float* input = static_cast<const float *>(inputs[0]);
    const float* offset = static_cast<const float *>(inputs[1]);
    const float* mask = static_cast<const float *>(inputs[2]);
    float * output = static_cast<float *>(outputs[0]);
    nvinfer1::Dims input_dims = this->getInputDims(0);
    assert(batchSize==1);
    int h = input_dims.d[1];
    int w = input_dims.d[2];
    int height_out = (h + 2 * _padding - (_dilation * (_kernel_H - 1) + 1)) / _stride + 1;
    int width_out = (w + 2 * _padding - (_dilation * (_kernel_W - 1) + 1)) / _stride + 1;
    m = _out_channel;
    n = height_out * width_out;
    k = 1;
    alpha = 1.0;
    beta = 0.0;
    /// output  nxm
    /// ones    1xn  T ->> nx1
    /// bias    1xm
    /// ones x bias = nxm
    //  add bias
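    // cuBLAS is column-major, so this GEMM computes output(n x m) = ones(n x 1) x bias(1 x m),
    // pre-filling each of the m output channels with its bias value at all
    // n = height_out * width_out spatial positions.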
    cublasSgemm(handle,
                CUBLAS_OP_T, CUBLAS_OP_N,
                n, m, k,&alpha,
                _d_ones, k,
                _d_bias, k,&beta,
                output, n);
    // im2col (offset and mask)
    modulated_deformable_im2col_cuda(stream,input,offset,mask,
                                     1, _in_channel, h, w,
                                     height_out, width_out, _kernel_H, _kernel_W,
                                     _padding, _padding, _stride, _stride, _dilation, _dilation,
                                     _deformable_group, _d_columns);
    m = _out_channel;
    n = height_out * width_out;
    k = _in_channel * _kernel_H * _kernel_W;
    alpha = 1.0;
    beta = 1.0;
    // im2col conv
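    // Column-major again: output(n x m) += columns(n x k) x weight(k x m), i.e. the
    // im2col matrix times the flattened kernels; beta = 1.0 accumulates onto the
    // bias values written by the previous GEMM.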
    cublasSgemm(handle,
                CUBLAS_OP_N, CUBLAS_OP_N,
                n, m, k,&alpha,
                _d_columns, n,
                _d_weight, k,
                &beta,
                output, n);
    return 0;
}
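
Two details are worth calling out: the deformable convolution is realized as the classic im2col + GEMM pair (modulated_deformable_im2col_cuda gathers the offset-shifted, mask-weighted samples into _d_columns, and the second cublasSgemm multiplies them with the flattened weights), and assert(batchSize == 1) means the plugin as written only supports engines built with a batch size of 1.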

The op defined by the plugin above then needs to be registered in onnx-tensorrt's builtin_op_importers.cpp so the parser knows how to import the DCNv2 node:

DEFINE_BUILTIN_OP_IMPORTER(DCNv2) {
    ASSERT(inputs.at(0).is_tensor(), ErrorCode::kUNSUPPORTED_NODE); // input
    ASSERT(inputs.at(1).is_tensor(), ErrorCode::kUNSUPPORTED_NODE); // offset
    ASSERT(inputs.at(2).is_tensor(), ErrorCode::kUNSUPPORTED_NODE); // mask
    ASSERT(inputs.at(3).is_weights(), ErrorCode::kUNSUPPORTED_NODE); // weight

    auto kernel_weights = inputs.at(3).weights();
    nvinfer1::Weights bias_weights;
    if( inputs.size() == 5 ) {
        ASSERT(inputs.at(4).is_weights(), ErrorCode::kUNSUPPORTED_NODE);
        auto shaped_bias_weights = inputs.at(4).weights();
        ASSERT(shaped_bias_weights.shape.nbDims == 1, ErrorCode::kINVALID_NODE);
        ASSERT(shaped_bias_weights.shape.d[0] == kernel_weights.shape.d[0], ErrorCode::kINVALID_NODE);
        bias_weights = shaped_bias_weights;
    } else {
        bias_weights = ShapedWeights::empty(kernel_weights.type);
    }
    int out_channel,in_channel,kernel_H,kernel_W,deformable_group,dilation,groups,padding,stride;
    out_channel = kernel_weights.shape.d[0];
    in_channel = kernel_weights.shape.d[1];
    kernel_H = kernel_weights.shape.d[2];
    kernel_W = kernel_weights.shape.d[3];

    OnnxAttrs attrs(node);
    deformable_group = attrs.get("deformable_group", 1);
    dilation = attrs.get("dilation", 1);
    groups = attrs.get("groups", 1);
    padding = attrs.get("padding", 1);
    stride = attrs.get("stride", 1);
    RETURN_FIRST_OUTPUT(
            ctx->addPlugin(
                    new DCNv2Plugin(in_channel,out_channel,kernel_H,kernel_W,deformable_group,
                            dilation,groups,padding,stride, kernel_weights, bias_weights),
                    {&inputs.at(0).tensor(),&inputs.at(1).tensor(),&inputs.at(2).tensor()}));
}
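
Note how the node is wired up: the first three inputs (input, offset, mask) become the plugin's runtime tensor inputs, while the weight and bias initializers are read at parse time and baked into the DCNv2Plugin instance as constants; the remaining hyperparameters come from the node attributes, with the channel counts and kernel size inferred from the weight shape.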

The plugin also has to be registered in builtin_plugins.cpp:

REGISTER_BUILTIN_PLUGIN("DCNv2", DCNv2Plugin);

Then the plugin source files need to be added to onnx-tensorrt's CMakeLists.txt so they are compiled and linked into the parser's plugin library:

# Plugin sources
set(PLUGIN_SOURCES
  FancyActivation.cu
  ResizeNearest.cu
  Split.cu
  dcn_v2_im2col_cuda.cu
  InstanceNormalization.cpp
  DCNv2.cpp
  plugin.cpp
  )
  
# Build the plugin library (a static library here) and link its dependencies
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC --expt-extended-lambda")
if(${CMAKE_VERSION} VERSION_LESS ${CMAKE_VERSION_THRESHOLD})
  CUDA_INCLUDE_DIRECTORIES(${CUDNN_INCLUDE_DIR} ${TENSORRT_INCLUDE_DIR})
  CUDA_ADD_LIBRARY(nvonnxparser_plugin STATIC ${PLUGIN_SOURCES})
else()
  include_directories(${CUDNN_INCLUDE_DIR} ${TENSORRT_INCLUDE_DIR})
  add_library(nvonnxparser_plugin STATIC ${PLUGIN_SOURCES})
endif()
target_include_directories(nvonnxparser_plugin PUBLIC ${CUDA_INCLUDE_DIRS} ${ONNX_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR} ${CUDNN_INCLUDE_DIR})
target_link_libraries(nvonnxparser_plugin ${TENSORRT_LIBRARY} cuda cudart cublas)

Finally, link the libraries built above into your application and use the standard TensorRT C++ API to perform the model conversion and forward inference.
