tensorrt 自定义层类实现详解
- 公用函数
- class PReLUPlugin详解
- 私有成员
- PReLUPlugin(const nvinfer1::Weights* weights, int nbWeight);
- PReLUPlugin::PReLUPlugin(const void *data, size_t length)
- virtual size_t getSerializationSize() const override
- virtual void serialize(void* buffer) const override;
- PReLUPlugin() = delete;
- ~PReLUPlugin();
- virtual int getNbOutputs() const override;
- virtual nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override
- virtual bool supportsFormat(const nvinfer1::DataType type, nvinfer1::PluginFormat format) const override;
- virtual void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims,int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override;
- virtual int initialize() override
- virtual void terminate() override
- virtual size_t getWorkspaceSize(int maxBatchSize) const override
- virtual const char* getPluginType() const override
- virtual const char* getPluginVersion() const override;
- virtual void destroy()
- virtual nvinfer1::IPluginV2* clone() const override;
- virtual void setPluginNamespace(const char* pluginNamespace) override {}
- virtual const char* getPluginNamespace() const override
- virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override
- 核函数调用
公用函数
宏定义
// Identifier strings reported to TensorRT: getPluginType() returns
// G_PRELU_TYPE; G_PRELU_NAME is the registry-facing plugin name.
static const char* G_PRELU_TYPE = "PReLU";
static const char* G_PRELU_NAME = "PReLU_TRT";
宏函数
// Grid-stride loop: each thread starts at its global index and advances by
// the total number of launched threads (blockDim.x * gridDim.x) until all
// n elements are covered, so any grid size processes the full range.
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
核相关定义
// Threads per block used by every kernel launch in this file.
static const int CUDA_NUM_THREADS = 512;

// CUDA: number of blocks needed so that N elements are covered when each
// block runs CUDA_NUM_THREADS threads (ceiling division; 0 for N == 0).
inline int CAFFE_GET_BLOCKS(const int N) {
    const int threadsPerBlock = CUDA_NUM_THREADS;
    return (N + threadsPerBlock - 1) / threadsPerBlock;
}
class PReLUPlugin详解
私有成员
输入的CHW,值,类型及GPU端拷贝
// Input tensor C/H/W, recorded in configureWithFormat().
int mNbInputChannels, mNbInputHeight, mNbInputWidth;
// Host-side copy of the PReLU slope weights; owned by this plugin and
// released in the destructor / terminate().
nvinfer1::Weights mWeights;
// Precision selected for this layer (FP32 by default; FP16 also supported).
nvinfer1::DataType mDataType{nvinfer1::DataType::kFLOAT};
// Device-side weight buffer, populated by initialize() and freed with cudaFree.
void* mDeviceKernel{nullptr};
PReLUPlugin(const nvinfer1::Weights* weights, int nbWeight);
这个构造函数的作用是将此层的参数(权重,偏置等参数)读取到内部
将模型文件参数的值复制给私有成员
// Build-time constructor: deep-copies the PReLU slope weights so the plugin
// owns its own host buffer, independent of the caller's memory lifetime.
PReLUPlugin::PReLUPlugin(const nvinfer1::Weights *weights, int nbWeights) {
    mWeights = weights[0];
    const size_t weightBytes = mWeights.count * type2size(mWeights.type);
    void *ownedCopy = malloc(weightBytes);
    memcpy(ownedCopy, weights[0].values, weightBytes);
    mWeights.values = ownedCopy;
}
PReLUPlugin::PReLUPlugin(const void *data, size_t length)
从序列化模型文件里读取数据
将参数的CHW,nvinfer1::Weights结构体内的变量copy到私有成员
最后用 assert 检查实际读取的字节数与 length 是否一致
// Deserialization constructor: rebuilds the plugin from the byte stream that
// serialize() produced. The read order here must stay in lock-step with the
// write order in serialize().
// Fixes: the snippet was missing its closing brace, and assigned
// mWeights.values = nullptr immediately before overwriting it with malloc.
PReLUPlugin::PReLUPlugin(const void *data, size_t length) {
    const char *d = static_cast<const char *>(data), *a = d;
    read<int>(d, mNbInputChannels);
    read<int>(d, mNbInputHeight);
    read<int>(d, mNbInputWidth);
    read<nvinfer1::DataType>(d, mDataType);
    read<int64_t>(d, mWeights.count);
    read<nvinfer1::DataType>(d, mWeights.type);
    // Copy the raw weight blob into host memory owned by this plugin.
    mWeights.values = malloc(mWeights.count * type2size(mWeights.type));
    memcpy(const_cast<void *>(mWeights.values), d, mWeights.count * type2size(mWeights.type));
    d = d + mWeights.count * type2size(mWeights.type);
    // The whole serialized buffer must have been consumed exactly.
    ASSERT(d == a + length);
}
virtual size_t getSerializationSize() const override
返回在buffer里占用的大小,其实就是序列化构造函数里面所有变量的内存大小之和
// Number of bytes serialize() will write: the fixed-size header fields
// followed by the raw weight blob.
size_t PReLUPlugin::getSerializationSize() const {
    size_t total = sizeof(mNbInputChannels);
    total += sizeof(mNbInputWidth);
    total += sizeof(mNbInputHeight);
    total += sizeof(mDataType);
    total += sizeof(mWeights.count);
    total += sizeof(mWeights.type);
    total += mWeights.count * type2size(mWeights.type);
    return total;
}
virtual void serialize(void* buffer) const override;
序列化插件到buffer,跟反序列化构造函数相反:一个读一个写,读写的参数顺序必须相同
// Write the plugin state into `buffer`. The field order must stay in
// lock-step with the read order in the deserialization constructor.
void PReLUPlugin::serialize(void *buffer) const {
char *d = static_cast<char *>(buffer), *a = d;
write(d, mNbInputChannels);
write(d, mNbInputHeight);
write(d, mNbInputWidth);
write(d, mDataType);
write(d, mWeights.count);
write(d, mWeights.type);
// Append the weight blob (converted to its stored type) after the header.
convertAndCopyToBuffer(d, mWeights, mWeights.type);
// Sanity check: exactly getSerializationSize() bytes were written.
ASSERT(d == a + getSerializationSize());
}
PReLUPlugin() = delete;
无参数构造函数无意义
~PReLUPlugin();
析构函数
释放你申请的空间防止段错误
// Destructor: release the host-side weight copy and the device-side weight
// buffer. Pointers are reset to nullptr so a later terminate() is a no-op.
PReLUPlugin::~PReLUPlugin() {
    if (mWeights.values != nullptr) {
        free(const_cast<void *>(mWeights.values));
        mWeights.values = nullptr;
    }
    if (mDeviceKernel != nullptr) {
        cudaFree(mDeviceKernel);
        mDeviceKernel = nullptr;
    }
}
virtual int getNbOutputs() const override;
返回输出tensor的数量, 比如说prelu,输出个数跟relu一样是1,这个取决于你的自定义层,大概是输入一个变成几个,待研究,后续补充
// Number of output tensors this layer produces. PReLU is element-wise:
// one input maps to exactly one output.
int PReLUPlugin::getNbOutputs() const {
return 1;
}
virtual nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override
返回输出tensor的维度,根据输入维度和操作自行计算
index这边有疑问?待解决
// Shape of output `index` given the input shapes. PReLU is element-wise, so
// output 0 has exactly the input's CHW dimensions.
// Fix: the original fell off the end of a non-void function when index != 0
// and ASSERT compiles out (undefined behavior); an unreachable return is added.
nvinfer1::Dims PReLUPlugin::getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) {
    if (index == 0) {
        return nvinfer1::Dims3(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]);
    }
    // This plugin declares a single output; any other index is a caller bug.
    ASSERT(false);
    return nvinfer1::Dims{}; // unreachable when ASSERT aborts; silences -Wreturn-type
}
virtual bool supportsFormat(const nvinfer1::DataType type, nvinfer1::PluginFormat format) const override;
看看你支持什么格式,half,int
// Report which (dtype, layout) combinations this plugin can run:
// FP32 or FP16 data, linear NCHW layout only.
// Fix: bitwise `|` replaced with logical `||` — it happened to work on bool
// operands but did not express the intended short-circuit logic.
bool PReLUPlugin::supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const {
    return (type == nvinfer1::DataType::kFLOAT || type == nvinfer1::DataType::kHALF)
           && format == nvinfer1::PluginFormat::kNCHW;
}
virtual void configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims,int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override;
判断下支持的格式,并给私有成员赋值
// Called by the builder once the engine's type/format for this layer are
// decided: validate them and record the input CHW and precision for enqueue().
// Fix: bitwise `|` replaced with logical `||` in the validation expression.
void PReLUPlugin::configureWithFormat(const nvinfer1::Dims* inputDims, int nbInputs,
                                      const nvinfer1::Dims* outputDims, int nbOutputs,
                                      nvinfer1::DataType type, nvinfer1::PluginFormat format,
                                      int maxBatchSize) {
    ASSERT((type == nvinfer1::DataType::kFLOAT || type == nvinfer1::DataType::kHALF)
           && format == nvinfer1::PluginFormat::kNCHW);
    mNbInputChannels = inputDims[0].d[0];
    mNbInputHeight = inputDims[0].d[1];
    mNbInputWidth = inputDims[0].d[2];
    mDataType = type;
}
virtual int initialize() override
初始化你的插件,其实就是初始化好gpu context, 将你的权重从内存拷贝到gpu上,如果设定了fp16,当然也要先做转换再拷贝到gpu
// Copy the host-side slope weights to the GPU (converted to mDataType, e.g.
// FP16) so enqueue() can read them through mDeviceKernel. Returns 0 on success.
// NOTE(review): the helper name is spelled "Deivce" — presumably it matches the
// project's declaration elsewhere; confirm before renaming.
int PReLUPlugin::initialize() {
convertAndCopyToDeivce(mDeviceKernel, mWeights, mDataType);
return 0;
}
virtual void terminate() override
释放内存和显存,跟析构函数一样,只不过由TensorRT在不同时机调用
// Called by TensorRT when the execution context is released: free the GPU
// weight buffer and the host copy. Mirrors the destructor; both releases are
// independent and idempotent (pointers are nulled after freeing).
void PReLUPlugin::terminate() {
    if (mDeviceKernel) {
        cudaFree(mDeviceKernel);
        mDeviceKernel = nullptr;
    }
    if (mWeights.values) {
        free(const_cast<void *>(mWeights.values));
        mWeights.values = nullptr;
    }
}
virtual size_t getWorkspaceSize(int maxBatchSize) const override
很难解释, 直接返回0即可
// Scratch workspace bytes needed at enqueue time; PReLU computes in place
// on its output buffer and needs none.
size_t PReLUPlugin::getWorkspaceSize(int maxBatchSize) const
{
return 0;
}
virtual const char* getPluginType() const override
tensorrt内部定义的
// Plugin type string TensorRT uses to match this plugin with its creator.
const char *PReLUPlugin::getPluginType() const {
return G_PRELU_TYPE;
}
virtual const char* getPluginVersion() const override;
tensorrt内部定义的
// Plugin version string; G_PLUGIN_VERSION is defined elsewhere in the project.
const char *PReLUPlugin::getPluginVersion() const {
return G_PLUGIN_VERSION;
}
virtual void destroy()
tensorrt内部接口来析构
// TensorRT calls this instead of deleting the plugin directly; the object
// frees itself (its destructor releases host and device buffers).
void PReLUPlugin::destroy() {
delete this;
}
virtual nvinfer1::IPluginV2* clone() const override;
// Deep-copy the plugin. The weights constructor allocates its own host
// buffer, so the clone owns memory independent of this instance.
nvinfer1::IPluginV2* PReLUPlugin::clone() const {
return new PReLUPlugin(&mWeights, 1);
}
virtual void setPluginNamespace(const char* pluginNamespace) override {}
不用实现留空
virtual const char* getPluginNamespace() const override
// Namespace string for registry lookups; G_PLUGIN_NAMESPACE is defined
// elsewhere in the project (setPluginNamespace is intentionally a no-op).
const char* PReLUPlugin::getPluginNamespace() const {
return G_PLUGIN_NAMESPACE;
}
virtual int enqueue(int batchSize, const void* const * inputs, void** outputs, void* workspace, cudaStream_t stream) override
此层功能的代码实现,此处调用推理函数
int PReLUPlugin::enqueue(int batchSize, const void *const *inputs, void **outputs, void *workspace, cudaStream_t stream)
{
const int count = batchSize * mNbInputChannels * mNbInputWidth * mNbInputHeight;
const int channels = mNbInputChannels;
const int dim = mNbInputWidth * mNbInputHeight;
const int div_factor = 1;
if (mDataType == nvinfer1::DataType::kFLOAT)
{
const float zerof{0.0f};
CUDA_CHECK(Forward_gpu(count, channels, dim,
reinterpret_cast<const float *>(mDeviceKernel),
reinterpret_cast<const float *>(inputs[0]),
reinterpret_cast<float *>(outputs[0]),
zerof,
div_factor,
stream));
} else {
const __half zeroh = __half(0.0f);
CUDA_CHECK(Forward_gpu(count, channels, dim,
reinterpret_cast<const __half *>(mDeviceKernel),
reinterpret_cast<const __half *>(inputs[0]),
reinterpret_cast<__half *>(outputs[0]),
zeroh,
div_factor,
stream));
}
return 0;
}
核函数调用
// Host-side launcher: cover `count` elements with a 1-D grid of
// CUDA_NUM_THREADS-sized blocks and run the PReLU kernel on `stream`.
// Returns the kernel-launch status from cudaGetLastError().
template <typename Ftype>
cudaError_t Forward_gpu(const int count, const int channels, const int dim,
                        const Ftype* mDeviceKernel,
                        const Ftype* bottom_data, Ftype* top_data,
                        const Ftype zero,
                        const int div_factor, const cudaStream_t stream) {
    const int numBlocks = CAFFE_GET_BLOCKS(count);
    PReLUForward<<<numBlocks, CUDA_NUM_THREADS, 0, stream>>>
        (count, channels, dim, mDeviceKernel, bottom_data, top_data, zero, div_factor);
    return cudaGetLastError();
}
// Element-wise PReLU kernel: out[i] = in[i] if in[i] > 0, else
// slope_data[c] * in[i], where c is derived from the flattened index
// ((i / dim) % channels) divided by div_factor.
template <typename Ftype>
__global__ void PReLUForward(const int n, const int channels, const int dim,
                             const Ftype* slope_data,
                             const Ftype* in, Ftype* out,
                             const Ftype zero,
                             const int div_factor) {
    CUDA_KERNEL_LOOP(index, n) {
        const int c = (index / dim) % channels / div_factor;
        const Ftype v = in[index];
        out[index] = (v > zero) ? v : v * slope_data[c];
    }
}