A First Look at TensorRT (2) — group conv and depthwise_conv

First, note that ordinary conv, group conv, and depthwise conv all use the same addConvolutionNd interface. For background, see the previous article, TensorRT Learning — conv2d: IConvolutionLayer.

Ordinary Convolution

(Figure: ordinary convolution)

Grouped Convolution

(Figure: grouped convolution)
Each group of input feature maps has shape C/g × H × W; there are g groups in total.
Each kernel is likewise split along the channel dimension, so each per-group kernel has shape C/g × k × k.
When g = C, this becomes depthwise convolution (the channel-wise first stage of a depthwise separable convolution).
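
As a quick sanity check, the filter tensor a grouped convolution expects holds output_channels × (input_channels / g) × k × k values. A minimal sketch of this bookkeeping (the helper name is my own, not part of the TensorRT API):

#include <cassert>
#include <cstdint>

// Number of filter weights a (possibly grouped) convolution expects.
int64_t ExpectedWeightCount(int64_t in_c, int64_t out_c, int64_t g,
                            int64_t k_h, int64_t k_w) {
  assert(in_c % g == 0 && out_c % g == 0);
  return out_c * (in_c / g) * k_h * k_w;
}

// With in_c = 9 and a 2x2 kernel, as in the examples below:
//   g = 1, out_c = 1 -> 1 * 9 * 2 * 2 = 36  (ordinary conv)
//   g = 3, out_c = 3 -> 3 * 3 * 2 * 2 = 36  (grouped conv)
//   g = 9, out_c = 9 -> 9 * 1 * 2 * 2 = 36  (depthwise conv)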

Depthwise Convolution

(Figure: depthwise convolution)

Code Walkthrough

The full program is given in the appendix.

Ordinary conv

// Create a Network Definition
  IBuilder* builder = createInferBuilder(logger);
  uint32_t flag = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
  INetworkDefinition* network = builder->createNetworkV2(flag);
  
  Dims4 input_shape{1, 9, 4, 4};
  Dims4 filter_shape{1, 9, 2, 2};
  DimsHW kernel_size{2, 2};
  DimsHW stride{1, 1};

  // Add the Input layer to the network
  auto input_data = network->addInput("input", DataType::kFLOAT, input_shape);
  
  // Add the Convolution layer with hidden layer input nodes, strides and weights for filter and bias.
  std::vector<float> filter(ProductOfDims(filter_shape), DEFAULT_VALUE);
  Weights filter_w{DataType::kFLOAT, filter.data(), static_cast<int64_t>(filter.size())};
  Weights bias_w{DataType::kFLOAT, nullptr, 0}; // no bias
  int32_t output_channel = filter_shape.d[0];
  auto conv2d = network->addConvolutionNd(*input_data, output_channel, kernel_size, filter_w, bias_w);
  conv2d->setStrideNd(stride);
  conv2d->setNbGroups(1);

Here the group count is 1 (setNbGroups(1)). With input_shape {1, 9, 4, 4} and filter_shape {1, 9, 2, 2}, the output shape is {1, 1, 3, 3}. For an ordinary conv, the input channel count and the filter channel count must be equal.
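
For reference, with no padding and no dilation the output spatial size is (input − kernel) / stride + 1. A one-line sketch of the arithmetic (the helper name is my own):

// Output spatial size with no padding and no dilation.
int OutputSize(int input, int kernel, int stride) {
  return (input - kernel) / stride + 1;
}
// Here: OutputSize(4, 2, 1) = 3, so the output is {1, 1, 3, 3}.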

Grouped convolution

// Create a Network Definition
  IBuilder* builder = createInferBuilder(logger);
  uint32_t flag = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
  INetworkDefinition* network = builder->createNetworkV2(flag);
  
  Dims4 input_shape{1, 9, 4, 4};
  Dims4 filter_shape{3, 3, 2, 2};
  DimsHW kernel_size{2, 2};
  DimsHW stride{1, 1};

  // Add the Input layer to the network
  auto input_data = network->addInput("input", DataType::kFLOAT, input_shape);
  
  // Add the Convolution layer with hidden layer input nodes, strides and weights for filter and bias.
  std::vector<float> filter(ProductOfDims(filter_shape), DEFAULT_VALUE);
  Weights filter_w{DataType::kFLOAT, filter.data(), static_cast<int64_t>(filter.size())};
  Weights bias_w{DataType::kFLOAT, nullptr, 0}; // no bias
  int32_t output_channel = filter_shape.d[0];
  auto conv2d = network->addConvolutionNd(*input_data, output_channel, kernel_size, filter_w, bias_w);
  conv2d->setStrideNd(stride);
  conv2d->setNbGroups(3);

Here the group count is 3, so the input is split into 3 groups of 9/3 = 3 channels each. The filters are likewise split into 3 groups, and each filter now has 9/3 = 3 channels. The filter shape therefore changes from {1, 9, 2, 2} to {3, 3, 2, 2}, and the output shape is {1, 3, 3, 3}.
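
To make the grouping concrete: each output channel is produced only from the input channels of its own group. A small illustration of the mapping (the helper is my own, not a TensorRT API):

#include <utility>

// First and last input channel feeding output channel `oc` in a grouped conv.
std::pair<int, int> InputChannelRange(int oc, int out_c, int in_c, int g) {
  int per_out = out_c / g;   // output channels per group
  int per_in = in_c / g;     // input channels per group
  int group = oc / per_out;  // which group this output channel belongs to
  return {group * per_in, group * per_in + per_in - 1};
}
// With in_c = 9, out_c = 3, g = 3:
//   output channel 0 <- input channels 0..2
//   output channel 1 <- input channels 3..5
//   output channel 2 <- input channels 6..8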

Depthwise convolution

This works like grouped convolution, except the group count is set equal to the input channel count, so the filter shape becomes {9, 1, 2, 2}. The output shape is then {1, 9, 3, 3}: one output channel per input channel.

// Create a Network Definition
  IBuilder* builder = createInferBuilder(logger);
  uint32_t flag = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
  INetworkDefinition* network = builder->createNetworkV2(flag);
  
  Dims4 input_shape{1, 9, 4, 4};
  Dims4 filter_shape{9, 1, 2, 2};
  DimsHW kernel_size{2, 2};
  DimsHW stride{1, 1};

  // Add the Input layer to the network
  auto input_data = network->addInput("input", DataType::kFLOAT, input_shape);
  
  // Add the Convolution layer with hidden layer input nodes, strides and weights for filter and bias.
  std::vector<float> filter(ProductOfDims(filter_shape), DEFAULT_VALUE);
  Weights filter_w{DataType::kFLOAT, filter.data(), static_cast<int64_t>(filter.size())};
  Weights bias_w{DataType::kFLOAT, nullptr, 0}; // no bias
  int32_t output_channel = filter_shape.d[0];
  auto conv2d = network->addConvolutionNd(*input_data, output_channel, kernel_size, filter_w, bias_w);
  conv2d->setStrideNd(stride);
  conv2d->setNbGroups(9);
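
A handy way to verify all three variants: the demo fills both inputs and weights with DEFAULT_VALUE (1.0) and uses no bias, so every output element simply equals the number of multiply-accumulates that produced it: 9 × 2 × 2 = 36 for the ordinary conv, 3 × 2 × 2 = 12 for the grouped conv, and 1 × 2 × 2 = 4 for the depthwise conv.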

Appendix

Full code

#include "NvInfer.h"
#include <iostream>
#include <cuda_runtime_api.h>
#include <vector>
#include <sstream>
#include <assert.h>

using namespace nvinfer1;

#define DEFAULT_VALUE 1.0

class Logger : public ILogger
{
public:
    void log(Severity severity, const char* msg) noexcept override
    {
        // suppress info-level messages
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
};

size_t ProductOfDims(Dims dims) {
  size_t result = 1;
  for(int32_t i = 0; i < dims.nbDims; i++) {
    result *= dims.d[i];
  }
  return result;
}

std::string DimsToStr(Dims dims) {
  std::stringstream ss;
  for(int32_t i = 0; i < dims.nbDims; i++) {
    ss << dims.d[i] << " ";
  }
  return ss.str();
}

int main() {
  Logger logger;

  // Create a Network Definition
  IBuilder* builder = createInferBuilder(logger);
  uint32_t flag = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
  INetworkDefinition* network = builder->createNetworkV2(flag);
  
  Dims4 input_shape{1, 9, 4, 4};
  Dims4 filter_shape{3, 3, 2, 2};
  DimsHW kernel_size{2, 2};
  DimsHW stride{1, 1};

  // Add the Input layer to the network
  auto input_data = network->addInput("input", DataType::kFLOAT, input_shape);
  
  // Add the Convolution layer with hidden layer input nodes, strides and weights for filter and bias.
  std::vector<float> filter(ProductOfDims(filter_shape), DEFAULT_VALUE);
  Weights filter_w{DataType::kFLOAT, filter.data(), static_cast<int64_t>(filter.size())};
  Weights bias_w{DataType::kFLOAT, nullptr, 0}; // no bias
  int32_t output_channel = filter_shape.d[0];
  auto conv2d = network->addConvolutionNd(*input_data, output_channel, kernel_size, filter_w, bias_w);
  conv2d->setStrideNd(stride);
  conv2d->setNbGroups(3);

  // Add a name for the output of the conv2d layer so that the tensor can be bound to a memory buffer at inference time:
  conv2d->getOutput(0)->setName("output");
  // Mark it as the output of the entire network:
  network->markOutput(*conv2d->getOutput(0));
  Dims output_shape = network->getOutput(0)->getDimensions();

  // Building an Engine(optimize the network)
  IBuilderConfig* config = builder->createBuilderConfig();
  IHostMemory*  serializedModel = builder->buildSerializedNetwork(*network, *config);
  IRuntime* runtime = createInferRuntime(logger);
  ICudaEngine* engine = runtime->deserializeCudaEngine(serializedModel->data(), serializedModel->size());

  // Prepare input_data
  int32_t inputIndex = engine->getBindingIndex("input");
  int32_t outputIndex = engine->getBindingIndex("output");
  std::vector<float> input(ProductOfDims(input_shape), DEFAULT_VALUE);
  std::vector<float> output(ProductOfDims(output_shape));
  void *GPU_input_Buffer_ptr;  // a host ptr point to a GPU buffer
  void *GPU_output_Buffer_ptr;  // a host ptr point to a GPU buffer
  void* buffers[2];
  cudaMalloc(&GPU_input_Buffer_ptr, sizeof(float)*input.size()); //malloc gpu buffer for input
  cudaMalloc(&GPU_output_Buffer_ptr, sizeof(float)*output.size()); //malloc gpu buffer for output
  cudaMemcpy(GPU_input_Buffer_ptr, input.data(), input.size()*sizeof(float), cudaMemcpyHostToDevice); // copy input data from cpu to gpu
  buffers[inputIndex] = static_cast<void*>(GPU_input_Buffer_ptr);
  buffers[outputIndex] = static_cast<void*>(GPU_output_Buffer_ptr);

  // Performing Inference
  IExecutionContext *context = engine->createExecutionContext();
  context->executeV2(buffers);

  // copy result data from gpu to cpu
  cudaMemcpy(output.data(), GPU_output_Buffer_ptr, output.size()*sizeof(float), cudaMemcpyDeviceToHost); 

  // display output
  std::cout << "output shape : " << DimsToStr(output_shape) << "\n";
  std::cout << "output data : \n";
  for(auto i : output)
    std::cout << i << " ";
  std::cout << std::endl;
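
  // Free the GPU buffers and destroy the TensorRT objects. Since TensorRT 8.0,
  // plain `delete` replaces the deprecated destroy() methods.
  cudaFree(GPU_input_Buffer_ptr);
  cudaFree(GPU_output_Buffer_ptr);
  delete context;
  delete engine;
  delete runtime;
  delete serializedModel;
  delete config;
  delete network;
  delete builder;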
}

Building

cmake_minimum_required(VERSION 3.16)
project(tensorrt_demo)

set(TRT_PATH /weishengying/DownLoads/TensorRT-8.2.3.0)
set(CUDA_PATH /usr/local/cuda)

include_directories(${TRT_PATH}/include)
include_directories(${CUDA_PATH}/include)

function(compile_demo target source)
  add_executable(${target} ${source})
  target_link_libraries(${target} ${CUDA_PATH}/lib64/libcudart.so)
  target_link_libraries(${target} ${TRT_PATH}/lib/libnvinfer.so)
  target_link_libraries(${target} /usr/lib/x86_64-linux-gnu/libcudnn.so)
endfunction(compile_demo)

compile_demo(depthwise_conv2d depthwise_conv2d.cc)
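
Assuming the source file is saved as depthwise_conv2d.cc next to this CMakeLists.txt (and TRT_PATH/CUDA_PATH above are adjusted to your installation), a typical build and run looks like:

  cmake -B build && cmake --build build
  ./build/depthwise_conv2d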