tensorrt 自定义层类实现详解
- 公用函数
- class PReLUPlugin详解
- 私有成员
- PReLUPlugin(const nvinfer1::Weights* weights, int nbWeight);
- PReLUPlugin::PReLUPlugin(const void *data, size_t length)
- virtual size_t getSerializationSize() const override
- virtual void serialize(void* buffer) const override;
- PReLUPlugin() = delete;
- ~PReLUPlugin();
- virtual int getNbOutputs() const override;
- virtual nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override
- virtual bool supportsFormat(const nvinfer1::DataType type, nvinfer1::PluginFormat format) const override;
- virtual void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims,int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override;
- virtual int initialize() override
- virtual void terminate() override
- virtual size_t getWorkspaceSize(int maxBatchSize) const override
- virtual const char* getPluginType() const override
- virtual const char* getPluginVersion() const override;
- virtual void destroy()
- virtual nvinfer1::IPluginV2* clone() const override;
- virtual void setPluginNamespace(const char* pluginNamespace) override {}
- virtual const char* getPluginNamespace() const override
- virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override
- 核函数调用
公用函数
宏定义
// Identifier strings reported to TensorRT: getPluginType() returns
// G_PRELU_TYPE; G_PRELU_NAME is the registry-facing plugin name.
static const char* G_PRELU_TYPE = "PReLU";
static const char* G_PRELU_NAME = "PReLU_TRT";
宏函数
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads (blockDim.x * gridDim.x) until all
// n elements are covered, so any grid size processes the full range.
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
核相关定义
// Threads per block used by every kernel launch in this file.
static const int CUDA_NUM_THREADS = 512;

// CUDA: number of blocks needed so that N elements are covered when each
// block runs CUDA_NUM_THREADS threads (ceiling division; 0 for N == 0).
inline int CAFFE_GET_BLOCKS(const int N) {
    const int threadsPerBlock = CUDA_NUM_THREADS;
    return (N + threadsPerBlock - 1) / threadsPerBlock;
}
class PReLUPlugin详解
私有成员
输入的CHW,值,类型及GPU端拷贝
// Input tensor C/H/W, recorded in configureWithFormat().
int mNbInputChannels, mNbInputHeight, mNbInputWidth;
// Host-side copy of the PReLU slope weights; owned by this plugin and
// released in the destructor / terminate().
nvinfer1::Weights mWeights;
// Precision selected for this layer (FP32 by default; FP16 also supported).
nvinfer1::DataType mDataType{nvinfer1::DataType::kFLOAT};
// Device-side weight buffer, populated by initialize() and freed with cudaFree.
void* mDeviceKernel{nullptr};
PReLUPlugin(const nvinfer1::Weights* weights, int nbWeight);
这个构造函数的作用是将此层的参数(权重,偏置等参数)读取到内部
将模型文件参数的值复制给私有成员
// Build-time constructor: deep-copies the PReLU slope weights so the plugin
// owns its own host buffer, independent of the caller's memory lifetime.
PReLUPlugin::PReLUPlugin(const nvinfer1::Weights *weights, int nbWeights) {
    mWeights = weights[0];
    const size_t weightBytes = mWeights.count * type2size(mWeights.type);
    void *ownedCopy = malloc(weightBytes);
    memcpy(ownedCopy, weights[0].values, weightBytes);
    mWeights.values = ownedCopy;
}
PReLUPlugin::PReLUPlugin(const void *data, size_t length)
从序列化模型文件里读取数据
将参数的CHW,nvinfer1::Weights结构体内的变量copy到私有成员
最后用 assert 检查实际读取的字节数与 length 是否一致
// Deserialization constructor: rebuilds the plugin from the byte stream that
// serialize() produced. The read order here must stay in lock-step with the
// write order in serialize().
// Fixes: the snippet was missing its closing brace, and assigned
// mWeights.values = nullptr immediately before overwriting it with malloc.
PReLUPlugin::PReLUPlugin(const void *data, size_t length) {
    const char *d = static_cast<const char *>(data), *a = d;
    read<int>(d, mNbInputChannels);
    read<int>(d, mNbInputHeight);
    read<int>(d, mNbInputWidth);
    read<nvinfer1::DataType>(d, mDataType);
    read<int64_t>(d, mWeights.count);
    read<nvinfer1::DataType>(d, mWeights.type);
    // Copy the raw weight blob into host memory owned by this plugin.
    mWeights.values = malloc(mWeights.count * type2size(mWeights.type));
    memcpy(const_cast<void *>(mWeights.values), d, mWeights.count * type2size(mWeights.type));
    d = d + mWeights.count * type2size(mWeights.type);
    // The whole serialized buffer must have been consumed exactly.
    ASSERT(d == a + length);
}
virtual size_t getSerializationSize() const override
返回在buffer里占用的大小,其实就是序列化构造函数里面所有变量的内存大小之和
// Number of bytes serialize() will write: the fixed-size header fields
// followed by the raw weight blob.
size_t PReLUPlugin::getSerializationSize() const {
    size_t total = sizeof(mNbInputChannels);
    total += sizeof(mNbInputWidth);
    total += sizeof(mNbInputHeight);
    total += sizeof(mDataType);
    total += sizeof(mWeights.count);
    total += sizeof(mWeights.type);
    total += mWeights.count * type2size(mWeights.type);
    return total;
}
virtual void serialize(void* buffer) const override;
序列化插件到buffer,跟反序列化构造函数相反:一个读一个写,读写的参数顺序必须相同
// Write the plugin state into `buffer`. The field order must stay in
// lock-step with the read order in the deserialization constructor.
void PReLUPlugin::serialize(void *buffer) const {
char *d = static_cast<char *>(buffer), *a = d;
write(d, mNbInputChannels);
write(d, mNbInputHeight);
write(d, mNbInputWidth);
write(d, mDataType);
write(d, mWeights.count);
write(d, mWeights.type);
// Append the weight blob (converted to its stored type) after the header.
convertAndCopyToBuffer(d, mWeights, mWeights.type);
// Sanity check: exactly getSerializationSize() bytes were written.
ASSERT(d == a + getSerializationSize());
}
PReLUPlugin() = delete;
无参数构造函数无意义
~PReLUPlugin();
析构函数
释放你申请的空间防止段错误
// Destructor: release the host-side weight copy and the device-side weight
// buffer. Pointers are reset to nullptr so a later terminate() is a no-op.
PReLUPlugin::~PReLUPlugin() {
    if (mWeights.values != nullptr) {
        free(const_cast<void *>(mWeights.values));
        mWeights.values = nullptr;
    }
    if (mDeviceKernel != nullptr) {
        cudaFree(mDeviceKernel);
        mDeviceKernel = nullptr;
    }
}
virtual int getNbOutputs() const override;
返回输出tensor的数量, 比如说prelu,输出个数跟relu一样是1,这个取决于你的自定义层,大概是输入一个变成几个,待研究,后续补充
// Number of output tensors this layer produces. PReLU is element-wise:
// one input maps to exactly one output.
int PReLUPlugin::getNbOutputs() const {
return 1;
}
virtual nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override
返回输出tensor的维度,根据输入维度和操作自行计算
index这边有疑问?待解决
// Shape of output `index` given the input shapes. PReLU is element-wise, so
// output 0 has exactly the input's CHW dimensions.
// Fix: the original fell off the end of a non-void function when index != 0
// and ASSERT compiles out (undefined behavior); an unreachable return is added.
nvinfer1::Dims PReLUPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) {
    if (index == 0) {
        return nvinfer1::Dims3(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]);
    }
    // This plugin declares a single output; any other index is a caller bug.
    ASSERT(false);
    return nvinfer1::Dims{}; // unreachable when ASSERT aborts; silences -Wreturn-type
}
virtual bool supportsFormat(const nvinfer1::DataType type, nvinfer1::PluginFormat format) const override;
看看你支持什么格式,half,int
// Report which (dtype, layout) combinations this plugin can run:
// FP32 or FP16 data, linear NCHW layout only.
// Fix: bitwise `|` replaced with logical `||` — it happened to work on bool
// operands but did not express the intended short-circuit logic.
bool PReLUPlugin::supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const {
    return (type == nvinfer1::DataType::kFLOAT || type == nvinfer1::DataType::kHALF)
           && format == nvinfer1::PluginFormat::kNCHW;
}
virtual void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims,int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override;
判断下支持的格式,并给私有成员赋值
// Called by the builder once the engine's type/format for this layer are
// decided: validate them and record the input CHW and precision for enqueue().
// Fix: bitwise `|` replaced with logical `||` in the validation expression.
void PReLUPlugin::configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs,
                                      const nvinfer1::Dims* outputDims, int nbOutputs,
                                      nvinfer1::DataType type, nvinfer1::PluginFormat format,
                                      int maxBatchSize) {
    ASSERT((type == nvinfer1::DataType::kFLOAT || type == nvinfer1::DataType::kHALF)
           && format == nvinfer1::PluginFormat::kNCHW);
    mNbInputChannels = inputDims[0].d[0];
    mNbInputHeight = inputDims[0].d[1];
    mNbInputWidth = inputDims[0].d[2];
    mDataType = type;
}
virtual int initialize() override
初始化你的插件,其实就是初始化好gpu context, 将你的权重从内存拷贝到gpu上,如果设定了fp16,当然也要先做转换再拷贝到gpu
// Copy the host-side slope weights to the GPU (converted to mDataType, e.g.
// FP16) so enqueue() can read them through mDeviceKernel. Returns 0 on success.
// NOTE(review): the helper name is spelled "Deivce" — presumably it matches the
// project's declaration elsewhere; confirm before renaming.
int PReLUPlugin::initialize() {
convertAndCopyToDeivce(mDeviceKernel, mWeights, mDataType);
return 0;
}
virtual void terminate() override
释放内存和显存,跟析构函数一样,只不过由TensorRT在不同时机调用
// Called by TensorRT when the execution context is released: free the GPU
// weight buffer and the host copy. Mirrors the destructor; both releases are
// independent and idempotent (pointers are nulled after freeing).
void PReLUPlugin::terminate() {
    if (mDeviceKernel) {
        cudaFree(mDeviceKernel);
        mDeviceKernel = nullptr;
    }
    if (mWeights.values) {
        free(const_cast<void *>(mWeights.values));
        mWeights.values = nullptr;
    }
}
virtual size_t getWorkspaceSize(int maxBatchSize) const override
很难解释, 直接返回0即可
// Scratch workspace bytes needed at enqueue time; PReLU computes in place
// on its output buffer and needs none.
size_t PReLUPlugin::getWorkspaceSize(int maxBatchSize) const
{
return 0;
}
virtual const char* getPluginType() const override
tensorrt内部定义的
// Plugin type string TensorRT uses to match this plugin with its creator.
const char *PReLUPlugin::getPluginType() const {
return G_PRELU_TYPE;
}
virtual const char* getPluginVersion() const override;
tensorrt内部定义的
// Plugin version string; G_PLUGIN_VERSION is defined elsewhere in the project.
const char *PReLUPlugin::getPluginVersion() const {
return G_PLUGIN_VERSION;
}
virtual void destroy()
tensorrt内部接口来析构
// TensorRT calls this instead of deleting the plugin directly; the object
// frees itself (its destructor releases host and device buffers).
void PReLUPlugin::destroy() {
delete this;
}
virtual nvinfer1::IPluginV2* clone() const override;
// Deep-copy the plugin. The weights constructor allocates its own host
// buffer, so the clone owns memory independent of this instance.
nvinfer1::IPluginV2* PReLUPlugin::clone() const {
return new PReLUPlugin(&mWeights, 1);
}
virtual void setPluginNamespace(const char* pluginNamespace) override {}
不用实现留空
virtual const char* getPluginNamespace() const override
// Namespace string for registry lookups; G_PLUGIN_NAMESPACE is defined
// elsewhere in the project (setPluginNamespace is intentionally a no-op).
const char* PReLUPlugin::getPluginNamespace() const {
return G_PLUGIN_NAMESPACE;
}
virtual int enqueue(int batchSize, const void* const * inputs, void** outputs, void* workspace, cudaStream_t stream) override
此层功能的代码实现,此处调用推理函数
int PReLUPlugin::enqueue(int batchSize, const void *const *inputs, void **outputs, void *workspace, cudaStream_t stream)
{
const int count = batchSize * mNbInputChannels * mNbInputWidth * mNbInputHeight;
const int channels = mNbInputChannels;
const int dim = mNbInputWidth * mNbInputHeight;
const int div_factor = 1;
if (mDataType == nvinfer1::DataType::kFLOAT)
{
const float zerof{0.0f};
CUDA_CHECK(Forward_gpu(count, channels, dim,
reinterpret_cast<const float *>(mDeviceKernel),
reinterpret_cast<const float *>(inputs[0]),
reinterpret_cast<float *>(outputs[0]),
zerof,
div_factor,
stream));
} else {
const __half zeroh = __half(0.0f);
CUDA_CHECK(Forward_gpu(count, channels, dim,
reinterpret_cast<const __half *>(mDeviceKernel),
reinterpret_cast<const __half *>(inputs[0]),
reinterpret_cast<__half *>(outputs[0]),
zeroh,
div_factor,
stream));
}
return 0;
}
核函数调用
// Host-side launcher: cover `count` elements with a 1-D grid of
// CUDA_NUM_THREADS-sized blocks and run the PReLU kernel on `stream`.
// Returns the kernel-launch status from cudaGetLastError().
template <typename Ftype>
cudaError_t Forward_gpu(const int count, const int channels, const int dim,
                        const Ftype* mDeviceKernel,
                        const Ftype* bottom_data, Ftype* top_data,
                        const Ftype zero,
                        const int div_factor, const cudaStream_t stream) {
    const int numBlocks = CAFFE_GET_BLOCKS(count);
    PReLUForward<<<numBlocks, CUDA_NUM_THREADS, 0, stream>>>
        (count, channels, dim, mDeviceKernel, bottom_data, top_data, zero, div_factor);
    return cudaGetLastError();
}
// Element-wise PReLU kernel: out[i] = in[i] if in[i] > 0, else
// slope_data[c] * in[i], where c is derived from the flattened index
// ((i / dim) % channels) divided by div_factor.
template <typename Ftype>
__global__ void PReLUForward(const int n, const int channels, const int dim,
                             const Ftype* slope_data,
                             const Ftype* in, Ftype* out,
                             const Ftype zero,
                             const int div_factor) {
    CUDA_KERNEL_LOOP(index, n) {
        const int c = (index / dim) % channels / div_factor;
        const Ftype v = in[index];
        out[index] = (v > zero) ? v : v * slope_data[c];
    }
}