Cpp DenseNet Tensorrt辅助函数
Python版本导引
Cpp OpenVINO版本导引
- Cpp DenseNet OpenVino导出
- Cpp DenseNet OpenVino测试
- Cpp DenseNet OpenVino环境配置
- Cpp DenseNet OpenVino CMake工程
- Cpp DenseNet OpenVino辅助函数
- Cpp DenseNet OpenVino部署主体
CMake工程示例
测试性质的工程结构如下:
Project:
Network
: 存放网络推理相关
- NvLogger: Tensorrt需要手动实现的日志类
- DenseGradeWrapper: 推理主结构
- NetworkTools: 部分CV辅助函数
Script
: 存放前面博客中的导出以及验证Py文件
CMakeLists.txt
: cmake工程文件
main.cpp
: 简单的执行文件
DenseGradeWrapper
肉眼可见的,Trt版本的执行代码,就会比OpenVINO复杂很多
- 需要创建运行时,手动实现Logger
- 模型读取方式需要手动使用io类
- 手动读取模型输入输出,并分配输入的host和device以及输出的host和device
- 推理时需要手动完成数据传输,触发推理函数后需要手动完成数据拷贝回主机的操作
相对的,Trt给了更多的可操作空间,执行速度也是同水平最好的一档,请各位开发者按照自己的需求进行选择、二次开发
#include "DenseGradeWrapper.h"
// Constructor: deserialize a TensorRT engine from `enginePath` and set up
// all device/host buffers via allocateBuffers().
//
// NOTE(review): members initialize in DECLARATION order, not initializer-list
// order. `runtime`'s initializer reads `this->logger`, so `logger` MUST be
// declared before `runtime` in the header — otherwise this reads an
// uninitialized logger (UB). The list below is reordered to reflect the
// required order; confirm the header matches.
DenseGradeWrapper::DenseGradeWrapper(QString enginePath, QObject *parent)
    : QObject{parent},
      logger(nvinfer1::ILogger::Severity::kVERBOSE),
      runtime(nvinfer1::createInferRuntime(this->logger))
{
    // De-serialize engine from file.
    std::ifstream engineFile(enginePath.toStdString(), std::ios::binary);
    if (engineFile.fail())
    {
        // was: silent return — the caller got an object with a null engine
        // and no diagnostic. Log before bailing out.
        qCritical() << "ERROR: cannot open engine file" << enginePath;
        return;
    }
    engineFile.seekg(0, std::ifstream::end);
    auto fsize = engineFile.tellg();
    engineFile.seekg(0, std::ifstream::beg);
    std::vector<char> engineData(static_cast<size_t>(fsize));
    if (!engineFile.read(engineData.data(), fsize))
    {
        // was: read result unchecked — a short read would hand garbage
        // to deserializeCudaEngine.
        qCritical() << "ERROR: failed to read engine file" << enginePath;
        return;
    }
    // Load the serialized plan into the engine owned by this wrapper.
    this->engine.reset(runtime->deserializeCudaEngine(engineData.data(), fsize));
    assert(this->engine.get() != nullptr);
    // Allocate the CUDA stream, execution context and per-tensor device memory.
    this->allocateBuffers();
}
// Full inference pipeline for one image:
// preprocess -> network forward -> softmax + argmax.
// Each stage is wall-clocked with clock() and reported via qDebug.
DenseInferResult DenseGradeWrapper::infer(QString imageSrc){
    double stageStart = (double) clock();
    cv::Mat blob = this->preprocess(imageSrc);
    qDebug() << "Preprocess Done, cost " << ((double) clock() - stageStart) / (CLOCKS_PER_SEC) << " s";

    stageStart = (double) clock();
    QVector<float> logits = this->_infer(blob);
    qDebug() << "Infer Done, cost " << ((double) clock() - stageStart) / (CLOCKS_PER_SEC) << " s";

    stageStart = (double) clock();
    this->softmax(logits);
    DenseInferResult result = this->postProcess(logits);
    qDebug() << "Postprocess Done, cost " << ((double) clock() - stageStart) / (CLOCKS_PER_SEC) << " s";
    return result;
}
// Run one forward pass on the preprocessed blob and return the 5 raw logits.
// NOTE(review): the output size 5 is hard-coded here and in the caller —
// it should match the engine's output tensor; confirm against the export.
QVector<float> DenseGradeWrapper::_infer(cv::Mat transDim){
    // Upload the blob into the device input buffer.
    // (original comment said "copy device data to host" — this is host-to-device)
    if(cudaMemcpyAsync(inputs[0], transDim.data, transDim.total() * transDim.elemSize(), cudaMemcpyHostToDevice, this->stream) != cudaSuccess){
        qCritical() << "copy cv data to device failed";
        // Return a full-size zero vector so postProcess never reads
        // uninitialized memory (was a 1-element {0}).
        return QVector<float>(5, 0.0f);
    }
    // Bind every IO tensor's device address to the execution context.
    for(int32_t i = 0; i < this->engine->getNbIOTensors(); i++){
        this->context->setTensorAddress(this->engine->getIOTensorName(i), bindings[i]);
    }
    // Launch inference; enqueueV3 returns false on failure (was unchecked).
    if(!this->context->enqueueV3(this->stream)){
        qCritical() << "enqueueV3 failed";
        return QVector<float>(5, 0.0f);
    }
    // Download the output back to the host.
    float outputBuffer[5];
    if(cudaMemcpyAsync(outputBuffer, this->outputs[0], 5 * sizeof(float), cudaMemcpyDeviceToHost, this->stream) != cudaSuccess){
        qCritical() << "copy device data to host failed";
        return QVector<float>(5, 0.0f);
    }
    // Wait for the async copy to finish before reading outputBuffer.
    cudaStreamSynchronize(this->stream);
    return QVector<float>(outputBuffer, (outputBuffer + 5));
}
// In-place softmax over the logits.
// The max logit is subtracted before exponentiating for numerical
// stability: softmax(x) == softmax(x - c), but exp(large x) overflows
// to inf and yields NaN probabilities — the original had no guard.
void DenseGradeWrapper::softmax(QVector<float> &buffer){
    if(buffer.isEmpty()){
        return; // nothing to normalize; avoids max_element on an empty range
    }
    const float maxLogit = *std::max_element(buffer.begin(), buffer.end());
    float denominator = 0.0f;
    // QVector::size() is signed — use int to avoid sign-compare warnings.
    for(int i = 0; i < buffer.size(); i++){
        buffer[i] = std::exp(buffer[i] - maxLogit);
        denominator += buffer[i];
    }
    for(int i = 0; i < buffer.size(); i++){
        buffer[i] /= denominator;
    }
}
// Argmax + probability copy into the result struct.
// Single-pass: max_element + distance. The original used
// indexOf(*max_element(...)), which scanned twice, relied on exact float
// equality for the re-lookup, and dereferenced end() on an empty buffer (UB).
DenseInferResult DenseGradeWrapper::postProcess(const QVector<float> &buffer){
    auto maxIt = std::max_element(buffer.begin(), buffer.end());
    DenseInferResult result;
    // distance(begin, end) == 0 for an empty buffer, so index stays 0
    // without ever dereferencing the iterator.
    result.index = static_cast<int>(std::distance(buffer.begin(), maxIt));
    // NOTE(review): assumes result.pie holds at least buffer.size() floats
    // (5 per the rest of this file) — confirm against the header.
    std::copy(buffer.begin(), buffer.end(), result.pie);
    return result;
}
// Turn the source image into a single-image NCHW blob:
// circle-crop, then scale to [0,1] with channel swap (swapRB = true).
cv::Mat DenseGradeWrapper::preprocess(QString imageSrc){
    cv::Mat cropped = this->circleCrop(imageSrc);
    std::vector<cv::Mat> singleBatch{cropped};
    return cv::dnn::blobFromImages(singleBatch, 1.0 / 255.0, cv::Size(),
                                   cv::Scalar(), true);
}
// Two-step crop delegated to NetworkTool: a mask-based crop, then a
// Gaussian-based post-step (see NetworkTool for the exact processing).
cv::Mat DenseGradeWrapper::circleCrop(QString imageSrc){
    cv::Mat masked = NetworkTool::ToolCropImageFromMask(imageSrc);
    return NetworkTool::ToolCropWithGaussian(masked);
}
void DenseGradeWrapper::allocateBuffers(){
// create stream
if(cudaStreamCreate(&this->stream) != cudaSuccess){
qCritical() << "ERROR: cuda stream allocation failed";
return ;
}
// create context
context = std::unique_ptr<nvinfer1::IExecutionContext>(this->engine->createExecutionContext());
// create memory
for(size_t i = 0; i < this->engine->getNbIOTensors(); i++){
// name
const nvinfer1::AsciiChar *tensorName = this->engine->getIOTensorName(i);
// shape
nvinfer1::Dims shape = this->engine->getTensorShape(tensorName);
// size
size_t memSize = this->getMemorySize(shape, sizeof(float));
// cuda ptr
void *memPtr{nullptr};
if (cudaMalloc(&memPtr, memSize) != cudaSuccess)
{
qCritical() << "ERROR: input cuda memory allocation failed, size = " << memSize << " bytes";
return ;
}
// save
if(this->engine->getTensorIOMode(tensorName) == nvinfer1::TensorIOMode::kINPUT){
this->inputs.push_back(memPtr);
}else{
this->outputs.push_back(memPtr);
}
this->bindings.push_back(memPtr);
}
}
// Byte size of a tensor: product of all dims times the element size.
// The accumulator's type is deduced from the INITIAL value, so it must be
// int64_t — the original passed `1` (int), making the running product an
// int and silently truncating each int64_t multiply for large tensors.
size_t DenseGradeWrapper::getMemorySize(const nvinfer1::Dims& dims, const int32_t elem_size){
    return std::accumulate(dims.d, dims.d + dims.nbDims,
                           static_cast<int64_t>(1),
                           std::multiplies<int64_t>()) * elem_size;
}
// Release all device allocations and the CUDA stream.
// `inputs`/`outputs` alias entries of `bindings`, so freeing `bindings`
// frees every buffer exactly once.
DenseGradeWrapper::~DenseGradeWrapper(){
    for(size_t i = 0; i < this->bindings.size(); i++){
        cudaFree(this->bindings[i]);
    }
    // The stream was leaked by the original destructor.
    // NOTE(review): assumes `stream` is zero/null-initialized when
    // cudaStreamCreate never ran (early ctor return) — confirm the
    // member's initializer in the header; cudaStreamDestroy(nullptr)
    // is a safe no-op, garbage is not.
    cudaStreamDestroy(this->stream);
}
main
同样的,给出封装类的执行方式
#include <QCoreApplication>
#include "Network/DenseGradeWrapper.h"

// Minimal driver: load the serialized engine, push one image through the
// wrapper, and print the predicted level plus all five class probabilities.
int main(int argc, char *argv[])
{
    const QString enginePath = "../tensorrt-gpu-win-msvc2019/Script/export_dense121_gpu.engine";
    const QString samplePath = "../tensorrt-gpu-win-msvc2019/Script/1.jpg";

    DenseGradeWrapper wrapper(enginePath);
    const DenseInferResult outcome = wrapper.infer(samplePath);

    qDebug() << "Level is: " << outcome.index;
    qDebug() << "Each possible is: ";
    for(int cls = 0; cls < 5; cls++){
        qDebug() << "\t" << outcome.pie[cls];
    }
    return 0;
}