A First Look at TensorRT (1): conv2d with IConvolutionLayer

Environment Setup

For environment setup, see the companion article TensorRT使用入门 (Getting Started with TensorRT).

Introduction

This article shows how to compute a conv2d with TensorRT, focusing on the addConvolutionNd API and on how the parameters of IConvolutionLayer are set.

Code

It is best to first work through the demo in the environment-setup article to get familiar with the basic C++ API. The key code follows.
First, include the necessary headers:

#include "NvInfer.h"
#include <iostream>
#include <cuda_runtime_api.h>
#include <vector>
#include <sstream>
#include <assert.h>

using namespace nvinfer1;

#define DEFAULT_VALUE 1.0

Define a logger:

class Logger : public ILogger
{
public:
    void log(Severity severity, const char* msg) noexcept override
    {
        // suppress info-level messages
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
};

Define some helper functions:

size_t ProductOfDims(Dims dims) {
  size_t result = 1;
  for(int32_t i = 0; i < dims.nbDims; i++) {
    result *= dims.d[i];
  }
  return result;
}

std::string DimsToStr(Dims dims) {
  std::stringstream ss;
  for(int32_t i = 0; i < dims.nbDims; i++) {
    ss << dims.d[i] << " ";
  }
  return ss.str();
}

Dims4 CalculateConv2dOutput(Dims4 input, Dims4 filter, DimsHW kernel_size, DimsHW stride, DimsHW paddings) {
  /******dataformat is NCHW******/
  assert(input.d[1] == filter.d[1]); // assert same channel value
  assert(filter.d[2] == kernel_size.d[0]);
  assert(filter.d[3] == kernel_size.d[1]);

  Dims4 output;
  output.d[0] = input.d[0]; //batch
  output.d[1] = filter.d[0]; //output channel

  int32_t kh = kernel_size.d[0];
  int32_t kw = kernel_size.d[1];
  int32_t sh = stride.d[0];
  int32_t sw = stride.d[1];
  
  output.d[2] = ((input.d[2] + 2 * paddings.d[0] - kh) / sh) + 1; // output height
  output.d[3] = ((input.d[3] + 2 * paddings.d[1] - kw) / sw) + 1; // output width

  return output;
}

The three functions are straightforward: ProductOfDims computes the product of a Dims' dimensions (i.e., the element count), DimsToStr renders a Dims as a string, and CalculateConv2dOutput computes the output shape of the conv2d. Note: CalculateConv2dOutput only handles symmetric paddings for now; it does not yet account for prepadding, postpadding, or padding mode. These are discussed later.
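
As a quick check of the formula in CalculateConv2dOutput, here is the computation for the demo configuration used below (input 3x3x8x8, kernel 2x2, stride 1, symmetric padding 1):

O_H = (I_H + 2*P_H - K_H) / S_H + 1 = (8 + 2*1 - 2) / 1 + 1 = 9
O_W = (I_W + 2*P_W - K_W) / S_W + 1 = (8 + 2*1 - 2) / 1 + 1 = 9

so the output shape is 3 3 9 9 in NCHW.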

Create a network:

int main() {
  Logger logger;

  // Create a Network Definition
  IBuilder* builder = createInferBuilder(logger);
  uint32_t flag = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
  INetworkDefinition* network = builder->createNetworkV2(flag);

Define the conv2d input shape and the shape information for the other required attributes:

  Dims4 input_shape{3, 3, 8, 8};
  Dims4 filter_shape{3, 3, 2, 2};
  DimsHW kernel_size{2, 2};
  DimsHW stride{1, 1};
  DimsHW paddings{1, 1}; //2d padding (padding is symmetric)
  Dims4 output_shape = CalculateConv2dOutput(input_shape, filter_shape, kernel_size, stride, paddings);

  // Add the Input layer to the network
  auto input_data = network->addInput("input", DataType::kFLOAT, input_shape);

Add a convolution layer and set its stride and padding. Note that the filter weights are passed as one contiguous array in KCRS order (output channels, input channels, kernel height, kernel width); here every element is simply DEFAULT_VALUE, so the layout does not affect the result.

// Add the Convolution layer with hidden layer input nodes, strides and weights for filter and bias.
  std::vector<float> filter(ProductOfDims(filter_shape), DEFAULT_VALUE);
  Weights filter_w{DataType::kFLOAT, filter.data(), static_cast<int64_t>(filter.size())};
  Weights bias_w{DataType::kFLOAT, nullptr, 0}; // no bias
  int32_t output_channel = filter_shape.d[0];
  auto conv2d = network->addConvolutionNd(*input_data, output_channel, kernel_size, filter_w, bias_w);
  conv2d->setStrideNd(stride);
  conv2d->setPaddingNd(paddings);
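
As a quick cross-check (a small addition of our own, not part of the original demo), TensorRT can report the output shape it derived for the layer, which should match CalculateConv2dOutput:

  // Sketch: compare our own shape computation against what TensorRT derives.
  // conv2d->getOutput(0) is the layer's output tensor; getDimensions() returns
  // the dims TensorRT inferred from the input shape and the layer parameters.
  Dims derived = conv2d->getOutput(0)->getDimensions();
  std::cout << "TensorRT-derived shape: " << DimsToStr(derived) << "\n"; // expect: 3 3 9 9
  std::cout << "calculated shape      : " << DimsToStr(output_shape) << "\n";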

The complete code is in Appendix 1.

Setting prepadding and postpadding

A 2d convolution can pad in four directions: top, left, bottom, and right.
The padding set in the demo above is two-dimensional, i.e., along H and W only. Because the padding attribute of IConvolutionLayer is symmetric, padding[0] is the top-and-bottom padding and padding[1] is the left-and-right padding.

In practice, however, padding is not always symmetric: the four sides may each need a different amount. In that case, set the prepadding and postpadding attributes of IConvolutionLayer. For a 2d convolution, prepadding[0] and prepadding[1] are the top and left padding, while postpadding[0] and postpadding[1] are the bottom and right padding.
A demo that sets prepadding and postpadding is given in Appendix 2.
We simplify the input:

  Dims4 input_shape{1, 1, 2, 2};
  Dims4 filter_shape{1, 1, 2, 2};
  DimsHW kernel_size{2, 2};
  DimsHW stride{1, 1};
  Dims2 prepadding{1,0};
  Dims2 postpadding{0,1};

This sets the top padding and the right padding to 1. The corresponding attributes are then set on the conv2d layer:

  auto conv2d = network->addConvolutionNd(*input_data, output_channel, kernel_size, filter_w, bias_w);
  conv2d->setStrideNd(stride);
  conv2d->setPrePadding(prepadding);
  conv2d->setPostPadding(postpadding);

Note also how CalculateConv2dOutput changes:

Dims4 CalculateConv2dOutput(Dims4 input, Dims4 filter, DimsHW kernel_size, DimsHW stride, Dims2 prepadding, Dims2 postpadding) {
  /******dataformat is NCHW******/
  assert(input.d[1] == filter.d[1]); // assert same channel value
  assert(filter.d[2] == kernel_size.d[0]);
  assert(filter.d[3] == kernel_size.d[1]);

  Dims4 output;
  output.d[0] = input.d[0]; //batch
  output.d[1] = filter.d[0]; //output channel

  int32_t kh = kernel_size.d[0];
  int32_t kw = kernel_size.d[1];
  int32_t sh = stride.d[0];
  int32_t sw = stride.d[1];
  
  output.d[2] = ((input.d[2] + prepadding.d[0] + postpadding.d[0] - kh) / sh) + 1; // output height
  output.d[3] = ((input.d[3] + prepadding.d[1] + postpadding.d[1] - kw) / sw) + 1; // output width

  return output;
}

Running it produces:

output shape : 1 1 2 2 
output data : 
2 1 4 2 
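
This result is easy to verify by hand. With a top padding and a right padding of 1, the padded input is the 3x3 matrix below; sliding the all-ones 2x2 filter over it with stride 1 produces exactly the four values above:

0 0 0        o(0,0) = 0+0+1+1 = 2
1 1 0   =>   o(0,1) = 0+0+1+0 = 1
1 1 0        o(1,0) = 1+1+1+1 = 4
             o(1,1) = 1+0+1+0 = 2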

The complete code is in Appendix 2.

When prepadding, postpadding, and paddingNd conflict

As introduced above, IConvolutionLayer has three APIs for setting padding in different situations:

  conv2d->setPaddingNd(padding);
  conv2d->setPrePadding(prepadding);
  conv2d->setPostPadding(postpadding);

If all three are set, later calls override earlier ones. In the case above, prepadding and postpadding take effect and padding does not. Readers can verify this themselves. The root cause is that setPaddingNd internally calls setPrePadding and setPostPadding; in other words, setPaddingNd modifies the same underlying data (prepadding and postpadding) as setPrePadding and setPostPadding do.

We can verify this with a small test case:

DimsHW padding{1, 1};
conv2d->setPaddingNd(padding);
std::cout << DimsToStr(conv2d->getPaddingNd()) << "\n";
std::cout << DimsToStr(conv2d->getPrePadding()) << "\n";
std::cout << DimsToStr(conv2d->getPostPadding()) << "\n";

The output is:

1 1 
1 1 
1 1 
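
Conversely (a small experiment of our own, not in the appendix code), setting prepadding and postpadding first and then calling setPaddingNd overwrites both, confirming that the three setters share the same pair of underlying fields:

conv2d->setPrePadding(Dims2{2, 0});
conv2d->setPostPadding(Dims2{0, 2});
conv2d->setPaddingNd(DimsHW{1, 1}); // overwrites both calls above
std::cout << DimsToStr(conv2d->getPrePadding()) << "\n";  // expected: 1 1
std::cout << DimsToStr(conv2d->getPostPadding()) << "\n"; // expected: 1 1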

Setting the padding mode

The IConvolutionLayer API for setting the padding mode is:

conv2d->setPaddingMode(PaddingMode::kEXPLICIT_ROUND_DOWN); // default mode

PaddingMode covers three padding styles (EXPLICIT, SAME, and CAFFE), each with two variants. kEXPLICIT_ROUND_DOWN is the default; with it the output shape is computed exactly as in the CalculateConv2dOutput shown earlier.
For details, see the official PaddingMode documentation.
Note that, per the formulas in the documentation, some padding modes derive prepadding and postpadding automatically (as virtual values; when the output is computed as, e.g., ceil(I/S), the actual pre/post padding values are not needed to determine the output size), and those derived values take priority. (The documentation states: "padding mode takes precedence if setPaddingMode() and setPrePadding() are also used.") That is, when the chosen padding mode derives prepadding and postpadding itself, any values set through setPaddingNd, setPrePadding, or setPostPadding are ignored. Readers can verify this themselves.
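
For reference, the output-size formulas from the PaddingMode documentation, restated here (O = output size, I = input size, B = prepadding, A = postpadding, S = stride, K = kernel size, ignoring dilation):

kEXPLICIT_ROUND_DOWN : O = floor((I + B + A - K) / S) + 1
kEXPLICIT_ROUND_UP   : O = ceil((I + B + A - K) / S) + 1
kSAME_UPPER          : O = ceil(I / S)  (extra padding placed at the end)
kSAME_LOWER          : O = ceil(I / S)  (extra padding placed at the beginning)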

Summary

From the discussion above we know that, for padding, there are essentially only two pieces of internal state: prepadding and postpadding. The three APIs setPaddingNd(), setPrePadding(), and setPostPadding() all write to them, and for certain padding modes TensorRT also sets them automatically, with higher priority. Based on this, we can extend the logic of CalculateConv2dOutput():

Dims4 CalculateConv2dOutput(Dims4 input, Dims4 filter, DimsHW kernel_size, DimsHW stride, Dims2 prepadding, 
                            Dims2 postpadding, PaddingMode paddingmode = PaddingMode::kEXPLICIT_ROUND_DOWN) {
  /******dataformat is NCHW******/
  assert(input.d[1] == filter.d[1]); // assert same channel value
  assert(filter.d[2] == kernel_size.d[0]);
  assert(filter.d[3] == kernel_size.d[1]);

  Dims4 output;
  output.d[0] = input.d[0]; //batch
  output.d[1] = filter.d[0]; //output channel

  int32_t kh = kernel_size.d[0];
  int32_t kw = kernel_size.d[1];
  int32_t sh = stride.d[0];
  int32_t sw = stride.d[1];
  
  int32_t uppadding = prepadding.d[0];
  int32_t leftpadding = prepadding.d[1];
  int32_t downpadding = postpadding.d[0];
  int32_t rightpadding = postpadding.d[1];

  switch (paddingmode)
  {
  case PaddingMode::kEXPLICIT_ROUND_UP:
    output.d[2] = ((input.d[2] + uppadding + downpadding - kh + sh -1 )/ sh) + 1; 
    output.d[3] = ((input.d[3] + leftpadding + rightpadding - kw + sw -1) / sw) + 1;
    break;
  case PaddingMode::kSAME_LOWER:
  case PaddingMode::kSAME_UPPER:
    // Both SAME modes compute O = ceil(I / S); they differ only in whether the
    // extra padding goes at the beginning (kSAME_LOWER) or the end (kSAME_UPPER).
    output.d[2] = (input.d[2] + sh - 1) / sh;
    output.d[3] = (input.d[3] + sw - 1) / sw;
    break;
  default:
    output.d[2] = ((input.d[2] + uppadding + downpadding - kh) / sh) + 1; 
    output.d[3] = ((input.d[3] + leftpadding + rightpadding - kw) / sw) + 1;
    break;
  }
  return output;
}

In the code, (input.d[2] + sh - 1) / sh is the integer form of ceil(I / S) used by both SAME modes: adding S - 1 before the division turns floor into ceiling (for example, with I = 5 and S = 2, (5 + 2 - 1) / 2 = 3 = ceil(5/2)). The same trick gives the rounded-up division in the kEXPLICIT_ROUND_UP branch.
The computations for the remaining padding modes can be added by the reader.
The complete test code is in Appendix 3.

Appendices

Appendix 1

#include "NvInfer.h"
#include <iostream>
#include <cuda_runtime_api.h>
#include <vector>
#include <sstream>
#include <assert.h>

using namespace nvinfer1;

#define DEFAULT_VALUE 1.0

class Logger : public ILogger
{
public:
    void log(Severity severity, const char* msg) noexcept override
    {
        // suppress info-level messages
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
};

size_t ProductOfDims(Dims dims) {
  size_t result = 1;
  for(int32_t i = 0; i < dims.nbDims; i++) {
    result *= dims.d[i];
  }
  return result;
}

std::string DimsToStr(Dims dims) {
  std::stringstream ss;
  for(int32_t i = 0; i < dims.nbDims; i++) {
    ss << dims.d[i] << " ";
  }
  return ss.str();
}

Dims4 CalculateConv2dOutput(Dims4 input, Dims4 filter, DimsHW kernel_size, DimsHW stride, DimsHW paddings) {
  /******dataformat is NCHW******/
  assert(input.d[1] == filter.d[1]); // assert same channel value
  assert(filter.d[2] == kernel_size.d[0]);
  assert(filter.d[3] == kernel_size.d[1]);

  Dims4 output;
  output.d[0] = input.d[0]; //batch
  output.d[1] = filter.d[0]; //output channel

  int32_t kh = kernel_size.d[0];
  int32_t kw = kernel_size.d[1];
  int32_t sh = stride.d[0];
  int32_t sw = stride.d[1];
  
  output.d[2] = ((input.d[2] + 2 * paddings.d[0] - kh) / sh) + 1; // output height
  output.d[3] = ((input.d[3] + 2 * paddings.d[1] - kw) / sw) + 1; // output width

  return output;
}

int main() {
  Logger logger;

  // Create a Network Definition
  IBuilder* builder = createInferBuilder(logger);
  uint32_t flag = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
  INetworkDefinition* network = builder->createNetworkV2(flag);
  
  Dims4 input_shape{3, 3, 8, 8};
  Dims4 filter_shape{3, 3, 2, 2};
  DimsHW kernel_size{2, 2};
  DimsHW stride{1, 1};
  DimsHW paddings{1, 1}; //2d padding (padding is symmetric)
  Dims4 output_shape = CalculateConv2dOutput(input_shape, filter_shape, kernel_size, stride, paddings);

  // Add the Input layer to the network
  auto input_data = network->addInput("input", DataType::kFLOAT, input_shape);
  
  // Add the Convolution layer with hidden layer input nodes, strides and weights for filter and bias.
  std::vector<float> filter(ProductOfDims(filter_shape), DEFAULT_VALUE);
  Weights filter_w{DataType::kFLOAT, filter.data(), static_cast<int64_t>(filter.size())};
  Weights bias_w{DataType::kFLOAT, nullptr, 0}; // no bias
  int32_t output_channel = filter_shape.d[0];
  auto conv2d = network->addConvolutionNd(*input_data, output_channel, kernel_size, filter_w, bias_w);
  conv2d->setStrideNd(stride);
  conv2d->setPaddingNd(paddings);

  // Add a name for the output of the conv2d layer so that the tensor can be bound to a memory buffer at inference time:
  conv2d->getOutput(0)->setName("output");
  // Mark it as the output of the entire network:
  network->markOutput(*conv2d->getOutput(0));

  // Building an Engine(optimize the network)
  IBuilderConfig* config = builder->createBuilderConfig();
  IHostMemory*  serializedModel = builder->buildSerializedNetwork(*network, *config);
  IRuntime* runtime = createInferRuntime(logger);
  ICudaEngine* engine = runtime->deserializeCudaEngine(serializedModel->data(), serializedModel->size());

  // Prepare input_data
  int32_t inputIndex = engine->getBindingIndex("input");
  int32_t outputIndex = engine->getBindingIndex("output");
  std::vector<float> input(ProductOfDims(input_shape), DEFAULT_VALUE);
  std::vector<float> output(ProductOfDims(output_shape));
  void *GPU_input_Buffer_ptr;  // a host ptr point to a GPU buffer
  void *GPU_output_Buffer_ptr;  // a host ptr point to a GPU buffer
  void* buffers[2];
  cudaMalloc(&GPU_input_Buffer_ptr, sizeof(float)*input.size()); //malloc gpu buffer for input
  cudaMalloc(&GPU_output_Buffer_ptr, sizeof(float)*output.size()); //malloc gpu buffer for output
  cudaMemcpy(GPU_input_Buffer_ptr, input.data(), input.size()*sizeof(float), cudaMemcpyHostToDevice); // copy input data from cpu to gpu
  buffers[inputIndex] = static_cast<void*>(GPU_input_Buffer_ptr);
  buffers[outputIndex] = static_cast<void*>(GPU_output_Buffer_ptr);

  // Performing Inference
  IExecutionContext *context = engine->createExecutionContext();
  context->executeV2(buffers);

  // copy result data from gpu to cpu
  cudaMemcpy(output.data(), GPU_output_Buffer_ptr, output.size()*sizeof(float), cudaMemcpyDeviceToHost); 

  // display output
  std::cout << "output shape : " << DimsToStr(output_shape) << "\n";
  std::cout << "output data : \n";
  for(auto i : output)
    std::cout << i << " ";
  std::cout << std::endl;
}

Appendix 2

#include "NvInfer.h"
#include <iostream>
#include <cuda_runtime_api.h>
#include <vector>
#include <sstream>
#include <assert.h>

using namespace nvinfer1;

#define DEFAULT_VALUE 1.0

class Logger : public ILogger
{
public:
    void log(Severity severity, const char* msg) noexcept override
    {
        // suppress info-level messages
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
};

size_t ProductOfDims(Dims dims) {
  size_t result = 1;
  for(int32_t i = 0; i < dims.nbDims; i++) {
    result *= dims.d[i];
  }
  return result;
}

std::string DimsToStr(Dims dims) {
  std::stringstream ss;
  for(int32_t i = 0; i < dims.nbDims; i++) {
    ss << dims.d[i] << " ";
  }
  return ss.str();
}

Dims4 CalculateConv2dOutput(Dims4 input, Dims4 filter, DimsHW kernel_size, DimsHW stride, Dims2 prepadding, Dims2 postpadding) {
  /******dataformat is NCHW******/
  assert(input.d[1] == filter.d[1]); // assert same channel value
  assert(filter.d[2] == kernel_size.d[0]);
  assert(filter.d[3] == kernel_size.d[1]);

  Dims4 output;
  output.d[0] = input.d[0]; //batch
  output.d[1] = filter.d[0]; //output channel

  int32_t kh = kernel_size.d[0];
  int32_t kw = kernel_size.d[1];
  int32_t sh = stride.d[0];
  int32_t sw = stride.d[1];
  
  output.d[2] = ((input.d[2] + prepadding.d[0] + postpadding.d[0] - kh) / sh) + 1; // output height
  output.d[3] = ((input.d[3] + prepadding.d[1] + postpadding.d[1] - kw) / sw) + 1; // output width

  return output;
}

int main() {
  Logger logger;

  // Create a Network Definition
  IBuilder* builder = createInferBuilder(logger);
  uint32_t flag = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
  INetworkDefinition* network = builder->createNetworkV2(flag);
  
  Dims4 input_shape{1, 1, 2, 2};
  Dims4 filter_shape{1, 1, 2, 2};
  DimsHW kernel_size{2, 2};
  DimsHW stride{1, 1};
  Dims2 prepadding{1,0};
  Dims2 postpadding{0,1};
  Dims4 output_shape = CalculateConv2dOutput(input_shape, filter_shape, kernel_size, stride, prepadding, postpadding);

  // Add the Input layer to the network
  auto input_data = network->addInput("input", DataType::kFLOAT, input_shape);
  
  // Add the Convolution layer with hidden layer input nodes, strides and weights for filter and bias.
  std::vector<float> filter(ProductOfDims(filter_shape), DEFAULT_VALUE);
  Weights filter_w{DataType::kFLOAT, filter.data(), static_cast<int64_t>(filter.size())};
  Weights bias_w{DataType::kFLOAT, nullptr, 0}; // no bias
  int32_t output_channel = filter_shape.d[0];
  auto conv2d = network->addConvolutionNd(*input_data, output_channel, kernel_size, filter_w, bias_w);
  conv2d->setStrideNd(stride);
  conv2d->setPrePadding(prepadding);
  conv2d->setPostPadding(postpadding);

  // Add a name for the output of the conv2d layer so that the tensor can be bound to a memory buffer at inference time:
  conv2d->getOutput(0)->setName("output");
  // Mark it as the output of the entire network:
  network->markOutput(*conv2d->getOutput(0));

  // Building an Engine(optimize the network)
  IBuilderConfig* config = builder->createBuilderConfig();
  IHostMemory*  serializedModel = builder->buildSerializedNetwork(*network, *config);
  IRuntime* runtime = createInferRuntime(logger);
  ICudaEngine* engine = runtime->deserializeCudaEngine(serializedModel->data(), serializedModel->size());

  // Prepare input_data
  int32_t inputIndex = engine->getBindingIndex("input");
  int32_t outputIndex = engine->getBindingIndex("output");
  std::vector<float> input(ProductOfDims(input_shape), DEFAULT_VALUE);
  std::vector<float> output(ProductOfDims(output_shape));
  void *GPU_input_Buffer_ptr;  // a host ptr point to a GPU buffer
  void *GPU_output_Buffer_ptr;  // a host ptr point to a GPU buffer
  void* buffers[2];
  cudaMalloc(&GPU_input_Buffer_ptr, sizeof(float)*input.size()); //malloc gpu buffer for input
  cudaMalloc(&GPU_output_Buffer_ptr, sizeof(float)*output.size()); //malloc gpu buffer for output
  cudaMemcpy(GPU_input_Buffer_ptr, input.data(), input.size()*sizeof(float), cudaMemcpyHostToDevice); // copy input data from cpu to gpu
  buffers[inputIndex] = static_cast<void*>(GPU_input_Buffer_ptr);
  buffers[outputIndex] = static_cast<void*>(GPU_output_Buffer_ptr);

  // Performing Inference
  IExecutionContext *context = engine->createExecutionContext();
  context->executeV2(buffers);

  // copy result data from gpu to cpu
  cudaMemcpy(output.data(), GPU_output_Buffer_ptr, output.size()*sizeof(float), cudaMemcpyDeviceToHost); 

  // display output
  std::cout << "output shape : " << DimsToStr(output_shape) << "\n";
  std::cout << "output data : \n";
  for(auto i : output)
    std::cout << i << " ";
  std::cout << std::endl;
}

Appendix 3

#include "NvInfer.h"
#include <iostream>
#include <cuda_runtime_api.h>
#include <vector>
#include <sstream>
#include <assert.h>

using namespace nvinfer1;

#define DEFAULT_VALUE 1.0

class Logger : public ILogger
{
public:
    void log(Severity severity, const char* msg) noexcept override
    {
        // suppress info-level messages
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
};

size_t ProductOfDims(Dims dims) {
  size_t result = 1;
  for(int32_t i = 0; i < dims.nbDims; i++) {
    result *= dims.d[i];
  }
  return result;
}

std::string DimsToStr(Dims dims) {
  std::stringstream ss;
  for(int32_t i = 0; i < dims.nbDims; i++) {
    ss << dims.d[i] << " ";
  }
  return ss.str();
}

Dims4 CalculateConv2dOutput(Dims4 input, Dims4 filter, DimsHW kernel_size, DimsHW stride, Dims2 prepadding, 
                            Dims2 postpadding, PaddingMode paddingmode = PaddingMode::kEXPLICIT_ROUND_DOWN) {
  /******dataformat is NCHW******/
  assert(input.d[1] == filter.d[1]); // assert same channel value
  assert(filter.d[2] == kernel_size.d[0]);
  assert(filter.d[3] == kernel_size.d[1]);

  Dims4 output;
  output.d[0] = input.d[0]; //batch
  output.d[1] = filter.d[0]; //output channel

  int32_t kh = kernel_size.d[0];
  int32_t kw = kernel_size.d[1];
  int32_t sh = stride.d[0];
  int32_t sw = stride.d[1];
  
  int32_t uppadding = prepadding.d[0];
  int32_t leftpadding = prepadding.d[1];
  int32_t downpadding = postpadding.d[0];
  int32_t rightpadding = postpadding.d[1];

  switch (paddingmode)
  {
  case PaddingMode::kEXPLICIT_ROUND_UP:
    output.d[2] = ((input.d[2] + uppadding + downpadding - kh + sh -1 )/ sh) + 1; 
    output.d[3] = ((input.d[3] + leftpadding + rightpadding - kw + sw -1) / sw) + 1;
    break;
  case PaddingMode::kSAME_LOWER:
  case PaddingMode::kSAME_UPPER:
    // Both SAME modes compute O = ceil(I / S); they differ only in whether the
    // extra padding goes at the beginning (kSAME_LOWER) or the end (kSAME_UPPER).
    output.d[2] = (input.d[2] + sh - 1) / sh;
    output.d[3] = (input.d[3] + sw - 1) / sw;
    break;
  default:
    output.d[2] = ((input.d[2] + uppadding + downpadding - kh) / sh) + 1; 
    output.d[3] = ((input.d[3] + leftpadding + rightpadding - kw) / sw) + 1;
    break;
  }
  return output;
}

int main() {
  Logger logger;

  // Create a Network Definition
  IBuilder* builder = createInferBuilder(logger);
  uint32_t flag = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
  INetworkDefinition* network = builder->createNetworkV2(flag);
  
  Dims4 input_shape{1, 1, 4, 4};
  Dims4 filter_shape{1, 1, 2, 2};
  DimsHW kernel_size{2, 2};
  DimsHW stride{1, 1};
  DimsHW padding{1, 1};
  Dims2 prepadding{1,0};
  Dims2 postpadding{0,1};
  Dims4 output_shape = CalculateConv2dOutput(input_shape, filter_shape, kernel_size, stride, prepadding, postpadding, PaddingMode::kSAME_UPPER);

  // Add the Input layer to the network
  auto input_data = network->addInput("input", DataType::kFLOAT, input_shape);
  
  // Add the Convolution layer with hidden layer input nodes, strides and weights for filter and bias.
  std::vector<float> filter(ProductOfDims(filter_shape), DEFAULT_VALUE);
  Weights filter_w{DataType::kFLOAT, filter.data(), static_cast<int64_t>(filter.size())};
  Weights bias_w{DataType::kFLOAT, nullptr, 0}; // no bias
  int32_t output_channel = filter_shape.d[0];
  auto conv2d = network->addConvolutionNd(*input_data, output_channel, kernel_size, filter_w, bias_w);
  conv2d->setStrideNd(stride);
  // conv2d->setPaddingNd(padding);
  // conv2d->setPrePadding(prepadding);
  // conv2d->setPostPadding(postpadding);
  conv2d->setPaddingMode(PaddingMode::kSAME_UPPER); // SAME padding: pre/post padding is derived automatically

  // Add a name for the output of the conv2d layer so that the tensor can be bound to a memory buffer at inference time:
  conv2d->getOutput(0)->setName("output");
  // Mark it as the output of the entire network:
  network->markOutput(*conv2d->getOutput(0));

  // Building an Engine(optimize the network)
  IBuilderConfig* config = builder->createBuilderConfig();
  IHostMemory*  serializedModel = builder->buildSerializedNetwork(*network, *config);
  IRuntime* runtime = createInferRuntime(logger);
  ICudaEngine* engine = runtime->deserializeCudaEngine(serializedModel->data(), serializedModel->size());

  // Prepare input_data
  int32_t inputIndex = engine->getBindingIndex("input");
  int32_t outputIndex = engine->getBindingIndex("output");
  std::vector<float> input(ProductOfDims(input_shape), DEFAULT_VALUE);
  std::vector<float> output(ProductOfDims(output_shape));
  void *GPU_input_Buffer_ptr;  // a host ptr point to a GPU buffer
  void *GPU_output_Buffer_ptr;  // a host ptr point to a GPU buffer
  void* buffers[2];
  cudaMalloc(&GPU_input_Buffer_ptr, sizeof(float)*input.size()); //malloc gpu buffer for input
  cudaMalloc(&GPU_output_Buffer_ptr, sizeof(float)*output.size()); //malloc gpu buffer for output
  cudaMemcpy(GPU_input_Buffer_ptr, input.data(), input.size()*sizeof(float), cudaMemcpyHostToDevice); // copy input data from cpu to gpu
  buffers[inputIndex] = static_cast<void*>(GPU_input_Buffer_ptr);
  buffers[outputIndex] = static_cast<void*>(GPU_output_Buffer_ptr);

  // Performing Inference
  IExecutionContext *context = engine->createExecutionContext();
  context->executeV2(buffers);

  // copy result data from gpu to cpu
  cudaMemcpy(output.data(), GPU_output_Buffer_ptr, output.size()*sizeof(float), cudaMemcpyDeviceToHost); 

  // display output
  std::cout << "output shape : " << DimsToStr(output_shape) << "\n";
  std::cout << "output data : \n";
  for(auto i : output)
    std::cout << i << " ";
  std::cout << std::endl;
}
