tensorrt learn
plugin官方
官方下边有三个类IPluginCreator,IPluginRegistry和IPluginFactory
IPluginCreator
底下有PluginFieldType和PluginFieldCollection派生类。
PluginFieldType的成员变量有name、data、type、size。
PluginFieldCollection包括append()、extend()、insert()、pop()函数,其中的操作对象都是PluginFieldType类型
IPluginCreator
主要的成员变量有tensorrt_version、name、plugin_version、field_names、plugin_namespace
主要的成员函数有create_plugin(const char* name,const nvinfer1::PluginFieldCollection *fc)
name – The name of the plugin.
field_collection – The PluginFieldCollection for this plugin.
Returns
IPluginV2 or None on failure.
这里的create_plugin()返回的是序列化的构造函数
virtual nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLenth) override;
Returns
A new IPluginV2
这里的deserializePlugin返回的是反序列化的构造函数
creator要实现的函数大概如下:
class PReLUPluginCreator : public nvinfer1::IPluginCreator {
public:
PReLUPluginCreator();
// ------------------inherit from IPluginCreator-------------------
// return the plugin type + plugin namesapce
virtual const char* getPluginName() const override;
// return the plugin version
virtual const char* getPluginVersion() const override;
// return a list of fields that needs to be passed to createPlugin
virtual const nvinfer1::PluginFieldCollection* getFieldNames() override;
// return nullptr in case of error
virtual nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection *fc) override;
// Called during deserialization of plugin layer. Return a plugin object.
virtual nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLenth) override;
// Set the namespace of the plugin creator based on the plugin library it belongs to. This can be set while registering the plugin creator
virtual void setPluginNamespace(const char* pluginNamespace) override {}
// Return the namespace of the plugin creator object.
virtual const char* getPluginNamespace() const override;
private:
nvinfer1::PluginFieldCollection mFC;
std::vector<nvinfer1::PluginField> mPluginAttributes;
};
不要忘记注册REGISTER_TENSORRT_PLUGIN(PReLUPluginCreator);
IPluginRegistry
貌似别人实现的接口文件中,没有重写此方法,先保留官方描述
IPluginFactory
factory工厂,大概就是读入模型文件中的层名判断其是否在trt自带的层和plugin层中
IPluginV2* PluginFactory::createPlugin(const char *layerName, const Weights* weights, int nbWeights, const char* libNamespace)
IPluginV2* PluginFactory::createPlugin(const char* layerName, const void* serialData, size_t serialLength) override;
之前v1是有两个实现都是调用自定义类的构造函数,V2中好像参数只有serialized数据buff,但别人的实现好像是weights
主要要实现以下方法
class PluginFactory : public nvcaffeparser1::IPluginFactoryV2 {
public:
PluginFactory(TrtPluginParams params);
virtual ~PluginFactory() {}
// ------------------inherit from IPluginFactoryV2--------------------
// determines if a layer configuration is provided by an IPluginV2
virtual bool isPluginV2(const char* layerName) override;
// create a plugin
virtual IPluginV2* createPlugin(const char* layerName, const Weights* weights, int nbWeights, const char* libNamespace="") override;
private:
//这里是你的plugin中的参数
// yolo-det layer params
int mYoloClassNum;
int mYolo3NetSize;
// upsample layer params
float mUpsampleScale;
};
自定义plugin
plugin类要实现的操作如下
class PReLUPlugin : public nvinfer1::IPluginV2
{
public:
// @参数: weights 和 nbWeight这两个参数是PluginFactory::createPlugin的参数,可以参见PluginFactory.cpp.如果你的自定义层没有权重,那么这两个参数你不要也可以,这个函数主要就是用来将权重和自定义层的其他参数读取到内部变量里面*/
// 个人理解这个构造函数的作用是将weights等参数序列化
PReLUPlugin(const nvinfer1::Weights* weights, int nbWeight);
// 这个就是从序列化数据里面恢复plugin的相关数据,另一个函数serialize,将类的数据写入到序列化数据里面.在IPluginCreator::deserializePlugin里面会调用到这个函数,注意写的顺序跟读的顺序必须是一样的.
// 个人理解这个构造函数的作用是将序列化后的参数反序列化成weights等
PReLUPlugin(const void* data, size_t length);
// 返回在序列化你的自定义插件的时候,需要占用到多少空间,其实就是你的权重和一些必要的成员变量的空间
virtual size_t getSerializationSize() const override;
// 序列化你的自定义插件到buffer,需要保证write的顺序和read的顺序是一样的
virtual void serialize(void* buffer) const override;
PReLUPlugin() = delete;
~PReLUPlugin();
// 返回输出tensor的数量, 比如说prelu,输出个数跟relu一样是1,这个取决于你的自定义层.
virtual int getNbOutputs() const override;
// @描述 返回输出tensor的维度,很多时候都取决于输入维度.对于prelu来说,输出维度等于输入维度.
// @参数 index 输出tensor的index
// @参数 inputs 输出tensors的纬度.注意有可能有多个输入
// @参数 nbInputDims 输出tensors的个数.
virtual nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override;
// 查询对应的datatype和format是否支持, 这个取决于你的自定义层实现是否支持.
virtual bool supportsFormat(const nvinfer1::DataType type, nvinfer1::PluginFormat format) const override;
virtual void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims,
int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format,
int maxBatchSize) override;
// @描述: 初始化你的插件,其实就是将数据初始化到gpu上,随后enqueue()函数将被使用
virtual int initialize() override;
// @描述: 释放内存和显存, 见cpp
virtual void terminate() override;
// @描述: 很难解释, 直接返回0即可.
virtual size_t getWorkspaceSize(int maxBatchSize) const override;
// @描述: 见cpp
virtual const char* getPluginType() const override;
// @描述: 见cpp
virtual const char* getPluginVersion() const override;
// @描述: 调用这个接口来析构
virtual void destroy()
// @描述: 见cpp
virtual nvinfer1::IPluginV2* clone() const override;
// @描述: 不要实现这个方法,留空即可
virtual void setPluginNamespace(const char* pluginNamespace) override {}
// @描述: 见cpp
virtual const char* getPluginNamespace() const override;
// @描述: 见cpp 执行cuda函数实现自定义层的代数运算
virtual int enqueue(int batchSize, const void*const * inputs, void** outputs,
void* workspace, cudaStream_t stream) override;
private:
//一些必要的参数
int mNbInputChannels, mNbInputHeight, mNbInputWidth;
nvinfer1::Weights mWeights;
nvinfer1::DataType mDataType{nvinfer1::DataType::kFLOAT};
void* mDeviceKernel{nullptr};
};
cuda编程
static const char* G_PRELU_TYPE = "PReLU";
static const char* G_PRELU_NAME = "PReLU_TRT";
//plugin_name = plugin_type + plugin_namespace
static const int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int CAFFE_GET_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
// CUDA: grid stride looping
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
// /******** PReLU CUDA function ********/
// CUDA kernele for forward
template <typename Ftype>
__global__ void PReLUForward(const int n, const int channels, const int dim,
const Ftype* slope_data,
const Ftype* in, Ftype* out,
const Ftype zero,
const int div_factor) {
CUDA_KERNEL_LOOP(index, n) {
int c = (index / dim) % channels / div_factor;
if(in[index] > zero) {
out[index] = in[index];
} else {
out[index] = in[index] * slope_data[c];
}
}
}
template <typename Ftype>
cudaError_t Forward_gpu(const int count, const int channels, const int dim,
const Ftype* mDeviceKernel,
const Ftype* bottom_data, Ftype* top_data,
const Ftype zero,
const int div_factor, const cudaStream_t stream) {
PReLUForward<<<CAFFE_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>>
(count, channels, dim, mDeviceKernel, bottom_data, top_data, zero, div_factor);
cudaError_t err = cudaGetLastError();
return err;
}
总结
官方给了trt运行的几个步骤:Defining the network、Enabling custom layers in NvCaffeParser、Building the engine、Serializing and deserializing、 Resource management and execution。详细的请看官方的sampleplugin和api
Defining the network
官网上好像是要实现这两个函数getNbOutputs
returns 1
and getOutputDimensions
Enabling custom layers in NvCaffeParser
官方要实现isPlugin和 createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights)这里返回的是序列化构造函数
Building the engine
调用supportsFormat()和configureWithFormat()函数,这两个函数好像是绑定在一起
Serializing and deserializing
先调用getSerializationSize() 获取序列化大小然后void serialize(void* buffer)将weights写入buffer中,然后engine就配置完成调用createPlugin()返回反序列化的构造函数
Resource management and execution
又是三个配合使用的函数 initialize()、enqueue()、terminate()
The cloned plugin objects are deleted when the network, builder, or engine are destroyed. This is done by invoking the FCPlugin::destroy()
method.
void destroy() { delete this; }
tensorrt5.1.5 sampleplugin阅读
官方给出的自定义接口的流程为Defining the network,Enabling custom layers in NvCaffeParser,Building the engine,Serializing and deserializing,Resource management and execution。document for nvidia
#ifndef PRELU_PLUGIN_H // modidy to your file name
#define PRELU_PLUGIN_H // 改成你自己的文件名
#include "NvInfer.h"
#include "NvInferPlugin.h"
#include "spdlog/spdlog.h"
class PReLUPlugin : public nvinfer1::IPluginV2
{
public:
* @描述: 构造函数
* @参数: weights 和 nbWeight这两个参数是PluginFactory::createPlugin的参数,可以参见PluginFactory.cpp.如果你的自定义层没有权重,那么这两个参数你不要也可以,这个函数主要就是用来将权重和自定义层的其他参数读取到内部变量里面*/
*个人理解这个构造函数的作用是将weights等参数序列化
PReLUPlugin(const nvinfer1::Weights* weights, int nbWeight);
* @描述: 这个就是从序列化数据里面恢复plugin的相关数据,另一个函数serialize 将类的数据写入到序列化数据里面.在IPluginCreator::deserializePlugin里面会调用到这个函数,注意写的顺序跟读的顺序必须是一样的.
*个人理解这个构造函数的作用是将序列化后的参数反序列化成weights等
PReLUPlugin(const void* data, size_t length);
@描述: 返回在序列化你的自定义插件的时候,需要占用到多少空间,其实就是你的权重和 一些必要的成员变量的空间
virtual size_t getSerializationSize() const override;
* @描述: 序列化你的自定义插件到buffer,需要保证write的顺序和read的顺序是一样的
virtual void serialize(void* buffer) const override;
* @描述: 无参构造函数没有意义
PReLUPlugin() = delete;
* @描述:析构函数,释放资源
~PReLUPlugin();
/**
* @description: return the number of output tensors. for prelu return 1 the
* same as relu. it depends on your custom layer
* @描述: 返回输出tensor的数量, 比如说prelu,输出个数跟relu一样是1,这个取决于你的自定义层.
*/
virtual int getNbOutputs() const override;
/**
* @description: return dimensions of output tensor, this might depends on
* input tensors dimensions. for prelu, the output dimension
* is the same as input dimension.
* @描述 返回输出tensor的维度,很多时候都取决于输入维度.对于prelu来说,输出维度等于输入维度.
* @param index The index of the output tensor.
* @参数 index 输出tensor的index
* @param inputs The input tensors.
* @参数 inputs 输出tensors的纬度.注意有可能有多个输入
* @param nbInputDims The number of input tensors.
* @参数 nbInputDims 输出tensors的个数.
*/
virtual nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override;
/**
* @description: query whether a given datatype and plugin format is support,
* it depends on your custom layer implementation.
* @描述 查询对应的datatype和format是否支持, 这个取决于你的自定义层实现是否支持.
*/
virtual bool supportsFormat(const nvinfer1::DataType type, nvinfer1::PluginFormat format) const override;
virtual void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims,
int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format,
int maxBatchSize) override;
/**
* @description: initialize your plugin for execution, for simplicity, you need
* to prepare data in gpu in this function, for example convert
* and copy your weights to gpu.because after this method, enqueue
* will be call.
* @描述: 初始化你的插件,其实就是初始化好gpu context, 将你的权重从内存拷贝到gpu上,如果
* 设定了fp16,当然也要先做转化在拷贝到gpu.
* @return:
*/
virtual int initialize() override;
/**
* @description: free memory, include cpu and gpu, see cpp file.
* @描述: 释放内存和显存, 见cpp
*/
virtual void terminate() override;
/**
* @description: hard to explain, just return 0;
* @描述: 很难解释, 直接返回0即可.
*/
virtual size_t getWorkspaceSize(int maxBatchSize) const override;
/**
* @description: see cpp
* @描述: 见cpp
*/
virtual const char* getPluginType() const override;
/**
* @description: see cpp
* @描述: 见cpp
*/
virtual const char* getPluginVersion() const override;
/**
* @description: the same as ~PReLUPlugin(),just copy my implementation.
* @描述: 调用这个接口来析构,参考我的代码即可.
*/
virtual void destroy();
/**
* @description: see cpp
* @描述: 见cpp
*/
virtual nvinfer1::IPluginV2* clone() const override;
/**
* @description: DO NOT IMPLEMENT THIS FUNCTION
* @描述: 不要实现这个方法,留空即可
*/
virtual void setPluginNamespace(const char* pluginNamespace) override {}
/**
* @description: see cpp
* @描述: 见cpp
*/
virtual const char* getPluginNamespace() const override;
/**
* @description: see cpp
* @描述: 见cpp
*/
virtual int enqueue(int batchSize, const void*const * inputs, void** outputs,
void* workspace, cudaStream_t stream) override;
private:
int mNbInputChannels, mNbInputHeight, mNbInputWidth;
nvinfer1::Weights mWeights;
nvinfer1::DataType mDataType{nvinfer1::DataType::kFLOAT};
void* mDeviceKernel{nullptr};
};
/**
* @description: see cpp and mimic my implementation
* @描述: 直接参见cpp并且直接模仿我的实现即可.
*/
class PReLUPluginCreator : public nvinfer1::IPluginCreator {
public:
PReLUPluginCreator();
// ------------------inherit from IPluginCreator-------------------
// return the plugin type + plugin namesapce
virtual const char* getPluginName() const override;
// return the plugin version
virtual const char* getPluginVersion() const override;
// return a list of fields that needs to be passed to createPlugin
virtual const nvinfer1::PluginFieldCollection* getFieldNames() override;
// return nullptr in case of error
virtual nvinfer1::IPluginV2* createPlugin(const char* name, const nvinfer1::PluginFieldCollection *fc) override;
// Called during deserialization of plugin layer. Return a plugin object.
virtual nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLenth) override;
// Set the namespace of the plugin creator based on the plugin library it belongs to. This can be set while registering the plugin creator
virtual void setPluginNamespace(const char* pluginNamespace) override {}
// Return the namespace of the plugin creator object.
virtual const char* getPluginNamespace() const override;
private:
nvinfer1::PluginFieldCollection mFC;
std::vector<nvinfer1::PluginField> mPluginAttributes;
};
#endif //PLGUIN_SAMPLE_H
Defining the network
Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
{
assert(index == 0 && nbInputDims == 1 && inputs[0].nbDims == 3);
assert(mNbInputChannels == inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2]);
return Dims3(mNbOutputChannels, 1, 1);
}
这个部分主要定义求经过这层网络结构后输出的向量的DIMS。
Enabling custom layers in NvCaffeParser
bool isPlugin(const char* name) override
{
return isPluginExt(name);
}
bool isPluginExt(const char* name) override
{
return !strcmp(name, "ip2");
}
这个函数主要判断你定义的层是否写入tensorrt?目前这么理解
virtual nvinfer1::IPlugin* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights) override
{
// there's no way to pass parameters through from the model definition, so we have to define it here explicitly
static const int NB_OUTPUT_CHANNELS = 10;
assert(isPlugin(layerName) && nbWeights == 2);
assert(mPlugin.get() == nullptr);
mPlugin = std::unique_ptr<FCPlugin>(new FCPlugin(weights, nbWeights, NB_OUTPUT_CHANNELS));
return mPlugin.get();
}
// deserialization plugin implementation
IPlugin* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override
{
assert(isPlugin(layerName));
//This plugin object is destroyed when engine is destroyed by calling
//IPluginExt::destroy()
return new FCPlugin(serialData, serialLength);
}
这个函数通过层名去找该层的权重,将权重copy到该接口的构造器。
Building the engine
bool supportsFormat(DataType type, PluginFormat format) const override { return (type == DataType::kFLOAT || type == DataType::kHALF) && format == PluginFormat::kNCHW; }
void configureWithFormat(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, DataType type, PluginFormat format, int maxBatchSize) override
{
assert((type == DataType::kFLOAT || type == DataType::kHALF) && format == PluginFormat::kNCHW);
mDataType = type;
}
这个函数大概是配置tensorrt的识别数据模式
Serializing and deserializing
virtual size_t getSerializationSize() override
{
return sizeof(mNbInputChannels) + sizeof(mNbOutputChannels) +
sizeof(mBiasWeights.count) + sizeof(mDataType) +
(mKernelWeights.count + mBiasWeights.count) *
type2size(mDataType);
}
计算模型权重序列化所需要的大小
virtual void serialize(void* buffer) override
{
char *d = static_cast<char*>(buffer), *a = d;
write(d, mNbInputChannels);
write(d, mNbOutputChannels);
write(d, mBiasWeights.count);
write(d, mDataType);
convertAndCopyToBuffer(d, mKernelWeights);
convertAndCopyToBuffer(d, mBiasWeights);
assert(d == a + getSerializationSize());
}
将数据写入buff区域,相关的host和device的通信还要看看cuda编程
void destroyPlugin()
{
mPlugin.reset();
}
字面意思理解
上述函数都写在类class PluginFactory : public nvinfer1::IPluginFactory, public nvcaffeparser1::IPluginFactoryExt
cuda-编程常用代码块
编程基础
host 代表的是你的cpu及内存,device是你的GPU及内存。
典型的执行流程如下:
1.分配host内存,并进行数据初始化;
2.分配device内存,并从host将数据拷贝到device上;
3.调用CUDA的核函数在device上完成指定的运算;
4.将device上的运算结果拷贝到host上;
5。释放device和host上分配的内存。
kernel是在device上线程中并行执行的函数,核函数用__global__符号声明,在调用时需要用<<<grid, block>>>来指定kernel要执行的线程数量。一个kernel所启动的所有线程称为一个网格(grid),同一个网格上的线程共享相同的全局内存空间,grid是线程结构的第一层次,而网格又可以分为很多线程块(block),一个线程块里面包含很多线程,这是第二个层次。kernel在调用时也必须通过执行配置<<<grid, block>>>来指定kernel所使用的线程数及结构一个线程需要两个内置的坐标变量(blockIdx,threadIdx)来唯一标识
cudaMalloc() #在设备端分配global memory
cudaFree()#释放存储空间
cudaMemcpy(void *dst, void *src, size_t nbytes,enum cudaMemcpyKind direction)
cudaMemcpyHostToDevice(CPU到GPU)
cudaMemcpyDeviceToHost(GPU到CPU)
cudaMemcpyDeviceToDevice(GPU到GPU)
#include "cuda_runtime.h"
#include <stdlib.h>
#include <iostream>
#include <sys/time.h>
using namespace std;
__global__ void Plus(float A[], float B[], float C[], int n)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
C[i] = A[i] + B[i];
}
int main()
{
struct timeval start, end;
gettimeofday( &start, NULL );
float*A, *Ad, *B, *Bd, *C, *Cd;
int n = 1024 * 1024;
int size = n * sizeof(float);
// CPU端分配内存
A = (float*)malloc(size);
B = (float*)malloc(size);
C = (float*)malloc(size);
// 初始化数组
for(int i=0;i<n;i++)
{
A[i] = 90.0;
B[i] = 10.0;
}
// GPU端分配内存
cudaMalloc((void**)&Ad, size);
cudaMalloc((void**)&Bd, size);
cudaMalloc((void**)&Cd, size);
// CPU的数据拷贝到GPU端
cudaMemcpy(Ad, A, size, cudaMemcpyHostToDevice);
cudaMemcpy(Bd, B, size, cudaMemcpyHostToDevice);
cudaMemcpy(Bd, B, size, cudaMemcpyHostToDevice);
// 定义kernel执行配置,(1024*1024/512)个block,每个block里面有512个线程
dim3 dimBlock(512);
dim3 dimGrid(n/512);
// 执行kernel
Plus<<<dimGrid, dimBlock>>>(Ad, Bd, Cd, n);
// 将在GPU端计算好的结果拷贝回CPU端
cudaMemcpy(C, Cd, size, cudaMemcpyDeviceToHost);
// 校验误差
float max_error = 0.0;
for(int i=0;i<n;i++)
{
max_error += fabs(100.0 - C[i]);
}
cout << "max error is " << max_error << endl;
// 释放CPU端、GPU端的内存
free(A);
free(B);
free(C);
cudaFree(Ad);
cudaFree(Bd);
cudaFree(Cd);
gettimeofday( &end, NULL );
int timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;
cout << "total time is " << timeuse/1000 << "ms" <<endl;
return 0;
}