Cpp DenseNet Tensorrt辅助函数
Python版本导引
Cpp OpenVINO版本导引
- Cpp DenseNet OpenVino导出
- Cpp DenseNet OpenVino测试
- Cpp DenseNet OpenVino环境配置
- Cpp DenseNet OpenVino CMake工程
- Cpp DenseNet OpenVino辅助函数
- Cpp DenseNet OpenVino部署主体
CMake工程示例
测试性质的工程结构如下:
Project:
Network
: 存放网络推理相关
- NvLogger: Tensorrt需要手动实现的日志类
- DenseGradeWrapper: 推理主结构
- NetworkTools: 部分CV辅助函数
Script
: 存放前面博客中的导出以及验证Py文件
CMakeLists.txt
: cmake工程文件
main.cpp
: 简单的执行文件
DenseGradeWrapper
肉眼可见的,Trt版本的执行代码,就会比OpenVINO复杂很多
- 需要创建运行时,手动实现Logger
- 模型读取方式需要手动使用io类
- 手动读取模型输入输出,并分配输入的host和device以及输出的host和device
- 推理时需要手动完成数据传输,触发推理函数后需要手动完成数据拷贝回主机的操作
相对的,Trt给了更多的可操作空间,执行速度也是同水平最好的一档,请各位开发者按照自己的需求进行选择、二次开发
#include "DenseGradeWrapper.h"
// Constructor: deserialize a TensorRT engine from `enginePath` and set up
// all device/host buffers via allocateBuffers().
//
// NOTE(review): members initialize in DECLARATION order, not initializer-list
// order. `runtime`'s initializer reads `this->logger`, so `logger` MUST be
// declared before `runtime` in the header — otherwise this reads an
// uninitialized logger (UB). The list below is reordered to reflect the
// required order; confirm the header matches.
DenseGradeWrapper::DenseGradeWrapper(QString enginePath, QObject *parent)
    : QObject{parent},
      logger(nvinfer1::ILogger::Severity::kVERBOSE),
      runtime(nvinfer1::createInferRuntime(this->logger))
{
    // De-serialize engine from file.
    std::ifstream engineFile(enginePath.toStdString(), std::ios::binary);
    if (engineFile.fail())
    {
        // was: silent return — the caller got an object with a null engine
        // and no diagnostic. Log before bailing out.
        qCritical() << "ERROR: cannot open engine file" << enginePath;
        return;
    }
    engineFile.seekg(0, std::ifstream::end);
    auto fsize = engineFile.tellg();
    engineFile.seekg(0, std::ifstream::beg);
    std::vector<char> engineData(static_cast<size_t>(fsize));
    if (!engineFile.read(engineData.data(), fsize))
    {
        // was: read result unchecked — a short read would hand garbage
        // to deserializeCudaEngine.
        qCritical() << "ERROR: failed to read engine file" << enginePath;
        return;
    }
    // Load the serialized plan into the engine owned by this wrapper.
    this->engine.reset(runtime->deserializeCudaEngine(engineData.data(), fsize));
    assert(this->engine.get() != nullptr);
    // Allocate the CUDA stream, execution context and per-tensor device memory.
    this->allocateBuffers();
}
// Full inference pipeline for one image:
// preprocess -> network forward -> softmax + argmax.
// Each stage is wall-clocked with clock() and reported via qDebug.
DenseInferResult DenseGradeWrapper::infer(QString imageSrc){
    double stageStart = (double) clock();
    cv::Mat blob = this->preprocess(imageSrc);
    qDebug() << "Preprocess Done, cost " << ((double) clock() - stageStart) / (CLOCKS_PER_SEC) << " s";

    stageStart = (double) clock();
    QVector<float> logits = this->_infer(blob);
    qDebug() << "Infer Done, cost " << ((double) clock() - stageStart) / (CLOCKS_PER_SEC) << " s";

    stageStart = (double) clock();
    this->softmax(logits);
    DenseInferResult result = this->postProcess(logits);
    qDebug() << "Postprocess Done, cost " << ((double) clock() - stageStart) / (CLOCKS_PER_SEC) << " s";
    return result;
}
// Run one forward pass on the preprocessed blob and return the 5 raw logits.
// NOTE(review): the output size 5 is hard-coded here and in the caller —
// it should match the engine's output tensor; confirm against the export.
QVector<float> DenseGradeWrapper::_infer(cv::Mat transDim){
    // Upload the blob into the device input buffer.
    // (original comment said "copy device data to host" — this is host-to-device)
    if(cudaMemcpyAsync(inputs[0], transDim.data, transDim.total() * transDim.elemSize(), cudaMemcpyHostToDevice, this->stream) != cudaSuccess){
        qCritical() << "copy cv data to device failed";
        // Return a full-size zero vector so postProcess never reads
        // uninitialized memory (was a 1-element {0}).
        return QVector<float>(5, 0.0f);
    }
    // Bind every IO tensor's device address to the execution context.
    for(int32_t i = 0; i < this->engine->getNbIOTensors(); i++){
        this->context->setTensorAddress(this->engine->getIOTensorName(i), bindings[i]);
    }
    // Launch inference; enqueueV3 returns false on failure (was unchecked).
    if(!this->context->enqueueV3(this->stream)){
        qCritical() << "enqueueV3 failed";
        return QVector<float>(5, 0.0f);
    }
    // Download the output back to the host.
    float outputBuffer[5];
    if(cudaMemcpyAsync(outputBuffer, this->outputs[0], 5 * sizeof(float), cudaMemcpyDeviceToHost, this->stream) != cudaSuccess){
        qCritical() << "copy device data to host failed";
        return QVector<float>(5, 0.0f);
    }
    // Wait for the async copy to finish before reading outputBuffer.
    cudaStreamSynchronize(this->stream);
    return QVector<float>(outputBuffer, (outputBuffer + 5));
}
// In-place softmax over the logits.
// The max logit is subtracted before exponentiating for numerical
// stability: softmax(x) == softmax(x - c), but exp(large x) overflows
// to inf and yields NaN probabilities — the original had no guard.
void DenseGradeWrapper::softmax(QVector<float> &buffer){
    if(buffer.isEmpty()){
        return; // nothing to normalize; avoids max_element on an empty range
    }
    const float maxLogit = *std::max_element(buffer.begin(), buffer.end());
    float denominator = 0.0f;
    // QVector::size() is signed — use int to avoid sign-compare warnings.
    for(int i = 0; i < buffer.size(); i++){
        buffer[i] = std::exp(buffer[i] - maxLogit);
        denominator += buffer[i];
    }
    for(int i = 0; i < buffer.size(); i++){
        buffer[i] /= denominator;
    }
}
// Argmax + probability copy into the result struct.
// Single-pass: max_element + distance. The original used
// indexOf(*max_element(...)), which scanned twice, relied on exact float
// equality for the re-lookup, and dereferenced end() on an empty buffer (UB).
DenseInferResult DenseGradeWrapper::postProcess(const QVector<float> &buffer){
    auto maxIt = std::max_element(buffer.begin(), buffer.end());
    DenseInferResult result;
    // distance(begin, end) == 0 for an empty buffer, so index stays 0
    // without ever dereferencing the iterator.
    result.index = static_cast<int>(std::distance(buffer.begin(), maxIt));
    // NOTE(review): assumes result.pie holds at least buffer.size() floats
    // (5 per the rest of this file) — confirm against the header.
    std::copy(buffer.begin(), buffer.end(), result.pie);
    return result;
}
// Turn the source image into a single-image NCHW blob:
// circle-crop, then scale to [0,1] with channel swap (swapRB = true).
cv::Mat DenseGradeWrapper::preprocess(QString imageSrc){
    cv::Mat cropped = this->circleCrop(imageSrc);
    std::vector<cv::Mat> singleBatch{cropped};
    return cv::dnn::blobFromImages(singleBatch, 1.0 / 255.0, cv::Size(),
                                   cv::Scalar(), true);
}
// Two-step crop delegated to NetworkTool: a mask-based crop, then a
// Gaussian-based post-step (see NetworkTool for the exact processing).
cv::Mat DenseGradeWrapper::circleCrop(QString imageSrc){
    cv::Mat masked = NetworkTool::ToolCropImageFromMask(imageSrc);
    return NetworkTool::ToolCropWithGaussian(masked);
}
void DenseGradeWrapper::allocateBuffers(){
// create stream
if(cudaStreamCreate(&this->stream) != cudaSuccess){
qCritical() << "ERROR: cuda stream allocation failed";
return ;
}
// create context
context = std::unique_ptr<nvinfer1::IExecutionContext>(this->engine->createExecutionContext());
// create memory
for(size_t i = 0; i < this->engine->getNbIOTensors(); i++){
// name
const nvinfer1::AsciiChar *tensorName = this->engine->getIOTensorName(i);
// shape
nvinfer1::Dims shape = this->engine->getTensorShape(tensorName);
// size
size_t memSize = this->getMemorySize(shape, sizeof(float));
// cuda ptr
void *memPtr{nullptr};
if (cudaMalloc(&memPtr, memSize) != cudaSuccess)
{
qCritical() << "ERROR: input cuda memory allocation failed, size = " << memSize << " bytes";
return ;
}
// save
if(this->engine->getTensorIOMode(tensorName) == nvinfer1::TensorIOMode::kINPUT){
this->inputs.push_back(memPtr);
}else{
this->outputs.push_back(memPtr);
}
this->bindings.push_back(memPtr);
}
}
// Byte size of a tensor: product of all dims times the element size.
// The accumulator's type is deduced from the INITIAL value, so it must be
// int64_t — the original passed `1` (int), making the running product an
// int and silently truncating each int64_t multiply for large tensors.
size_t DenseGradeWrapper::getMemorySize(const nvinfer1::Dims& dims, const int32_t elem_size){
    return std::accumulate(dims.d, dims.d + dims.nbDims,
                           static_cast<int64_t>(1),
                           std::multiplies<int64_t>()) * elem_size;
}
// Release all device allocations and the CUDA stream.
// `inputs`/`outputs` alias entries of `bindings`, so freeing `bindings`
// frees every buffer exactly once.
DenseGradeWrapper::~DenseGradeWrapper(){
    for(size_t i = 0; i < this->bindings.size(); i++){
        cudaFree(this->bindings[i]);
    }
    // The stream was leaked by the original destructor.
    // NOTE(review): assumes `stream` is zero/null-initialized when
    // cudaStreamCreate never ran (early ctor return) — confirm the
    // member's initializer in the header; cudaStreamDestroy(nullptr)
    // is a safe no-op, garbage is not.
    cudaStreamDestroy(this->stream);
}
main
同样的,给出封装类的执行方式
#include <QCoreApplication>
#include "Network/DenseGradeWrapper.h"

// Minimal driver: load the serialized engine, push one image through the
// wrapper, and print the predicted level plus all five class probabilities.
int main(int argc, char *argv[])
{
    const QString enginePath = "../tensorrt-gpu-win-msvc2019/Script/export_dense121_gpu.engine";
    const QString samplePath = "../tensorrt-gpu-win-msvc2019/Script/1.jpg";

    DenseGradeWrapper wrapper(enginePath);
    const DenseInferResult outcome = wrapper.infer(samplePath);

    qDebug() << "Level is: " << outcome.index;
    qDebug() << "Each possible is: ";
    for(int cls = 0; cls < 5; cls++){
        qDebug() << "\t" << outcome.pie[cls];
    }
    return 0;
}