1 Experimental environment:
Hardware and OS: Jetson Nano, JetPack 4.4.1
Deep learning library versions: CUDA 10.2, OpenCV 4.5, TensorRT 7.1
2 Goal:
RetinaFace is said to be one of the most accurate and fastest face detection and landmark frameworks, and the TensorRT version can reportedly reach about 4 ms per frame.
See this excellent write-up:
https://juejin.im/post/6844903967059771399
Once I got the model running I found it awkward to call: I am used to throwing together quick demos in Jupyter, yet this model, impressive as it sounds, only ships with a C++ interface.
So I decided to add Python detection bindings to the tensorrt_retinaface project,
and along the way learn how to wrap a C++ library as a Python module, i.e. mixed Python + C++ programming with OpenCV.
GitHub repository:
https://github.com/walletiger/tensorrt_retinaface_with_python
Postscript: after finishing the verification I found that onnx-tensorRT ships Python bindings of its own. Can retinaface_mbv2_sim.onnx be run directly through them? I'll share once I've figured that out.
# readme
# tensorrt_retinaface_with_python
Just adds Python bindings for the tensorrt_retinaface detection framework (supports ONNX-to-TensorRT models).
# TensorRT ONNX with python bindings
Cloned from https://github.com/wuchaodzxx/tensorrt_retinaface
I added Python 3 bindings for the TensorRT ONNX engine (+ OpenCV 4).
#how to run:
#1 ./build_all.sh
#2 cp build/libtensorrt_engine_lib.so /usr/local/lib/
#3 cd python && python3 setup.py install
# you must have CUDA, TensorRT, OpenCV 4.x and Boost.Python installed
#4 convert onnx models to tensorRT
onnx2trt models/retinaface_mbv2_sim.onnx -o models/retinaface.trt
#5 test
cd python && python3 test.py
#details:
tensorRT wrapper : tensorrt_engine_wrap.cc
python Modules : python/trt_engine_retinaface_module.cpp python/setup.py
#python Modules :
references :
https://github.com/PierreFritsch/OpenCV-Python-C-Module-for-Image-Processing/
https://github.com/Algomorph/pyboostcvconverter
The OpenCV-Python-C-Module-for-Image-Processing project is simple, but it does not work in my OpenCV 4 environment.
pyboostcvconverter is a little more involved.
I extracted pyboost_cv4_converter.cpp from pyboostcvconverter and combined it with OpenCV-Python-C-Module-for-Image-Processing.
3 Process
3.1 Wrapping the C++ module
Three interfaces are provided to make porting easy.
The code is in:
include/tensorrt_engine_wrap.h
src/tensorrt_engine_wrap.cc
void * trt_engine_retinaface_create(const char * engine_path);
const char * trt_engine_retinaface_detect(void *h_engine, cv::Mat &img);
void trt_engine_retinaface_destroy(void *h_engine);
# tensorrt_engine_wrap.cc (excerpt of the added code)
#include <assert.h>
#include <cuda_runtime_api.h>
#include <sys/stat.h>
#include <algorithm>
#include <cmath>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <malloc.h>
#include "NvInfer.h"
#include "NvOnnxParser.h"
#include <opencv2/opencv.hpp>
#include "image.h"
#include <sys/time.h>
#include <iterator>
#include <tensorrt_engine.h>
#include "thor/timer.h"
#include "thor/logging.h"
#include "thor/os.h"
#include "thor/structures.h"
#include "../include/batch_stream.h"
#include "../include/tensorrt_utils.h"
#include "../include/entropy_calibrator.h"
// we will use Eigen for some of the computation
#include "eigen3/Eigen/Eigen"
#include "eigen3/Eigen/Core"
#include "tensorrt_engine_wrap.h"
/**
*
* Inference on a new onnx converted trt model
* using standalone TensorRT engine
*
*/
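// Serialize the detected faces into a JSON string in face_json and return its length.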
static int to_json_str(vector<FaceInfo> &face_info, char *face_json)
{
    int len;
    int i;
    int num_face = face_info.size();

    sprintf(face_json, "{\"num_face\": %d", num_face);
    if (num_face > 0) {
        len = strlen(face_json);
        sprintf(face_json + len, ", \"faces\": [");
        for (i = 0; i < num_face; ++i) {
            len = strlen(face_json);
            if (i != 0) {
                sprintf(face_json + len, ",");
                len += 1;
            }
            sprintf(face_json + len, "{"
                    "\"score\": %.3f, \"box\":{\"x1\":%.3f, \"y1\":%.3f, \"x2\":%.3f, \"y2\":%.3f},"
                    "\"landmark\": {\"x\":[%.3f, %.3f, %.3f, %.3f, %.3f], \"y\":[%.3f, %.3f, %.3f, %.3f, %.3f]}"
                    "}",
                    face_info[i].score,
                    face_info[i].box.x1, face_info[i].box.y1, face_info[i].box.x2, face_info[i].box.y2,
                    face_info[i].landmark.x[0], face_info[i].landmark.x[1], face_info[i].landmark.x[2],
                    face_info[i].landmark.x[3], face_info[i].landmark.x[4],
                    face_info[i].landmark.y[0], face_info[i].landmark.y[1], face_info[i].landmark.y[2],
                    face_info[i].landmark.y[3], face_info[i].landmark.y[4]);
        }
        len = strlen(face_json);
        sprintf(face_json + len, " ]");
    }
    len = strlen(face_json);
    sprintf(face_json + len, "}");
    len = strlen(face_json);
    return len;
}
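// Per-engine context: the TensorRT engine and execution context, the RetinaFace anchor priors, and a fixed 16 KB buffer that receives the JSON result.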
typedef struct
{
    tensorrt::TensorRTEngine *h_trt_engine;
    ICudaEngine *h_cuda_engine;
    IExecutionContext *exe_context;
    vector<Box> priors;
    char face_out_str[16384];
} TRTEngineRetinaface_Context;
void *trt_engine_retinaface_create(const char *engine_path)
{
    TRTEngineRetinaface_Context *ctx;

    printf("trt_engine_retinaface_create start ... \n");
    ctx = new TRTEngineRetinaface_Context;
    ctx->h_cuda_engine = NULL;
    ctx->h_trt_engine = NULL;
    ctx->exe_context = NULL;

    ctx->h_trt_engine = new tensorrt::TensorRTEngine(engine_path);
    ctx->h_cuda_engine = ctx->h_trt_engine->getEngine();
    if (!ctx->h_cuda_engine) {
        goto err_out;
    }

    CheckEngine(ctx->h_cuda_engine);
    ctx->exe_context = ctx->h_cuda_engine->createExecutionContext();
    if (!ctx->exe_context) {
        goto err_out;
    }

    ctx->priors = createPriors(min_sizes, steps, cv::Size(INPUT_W, INPUT_H));
    printf("trt_engine_retinaface_create done \n");
    return (void *)ctx;

err_out:
    if (ctx->h_trt_engine) {
        delete ctx->h_trt_engine;
        ctx->h_trt_engine = NULL;
    }
    delete ctx;
    return NULL;
}
const char *trt_engine_retinaface_detect(void *h_engine, cv::Mat &img)
{
    TRTEngineRetinaface_Context *ctx;
    float *data;
    int size;
    int len;

    ctx = (TRTEngineRetinaface_Context *)h_engine;
    cv::Mat resizedImage = cv::Mat::zeros(INPUT_H, INPUT_W, CV_32FC3);
    cv::resize(img, resizedImage, cv::Size(INPUT_W, INPUT_H));
    data = HWC2CHW(resizedImage, kMeans);
    vector<FaceInfo> all_faces = doInference(*ctx->exe_context, data, ctx->priors, 1, 0.4);
    len = to_json_str(all_faces, ctx->face_out_str);
    return ctx->face_out_str;
}
void trt_engine_retinaface_destroy(void *h_engine)
{
    TRTEngineRetinaface_Context *ctx;

    ctx = (TRTEngineRetinaface_Context *)h_engine;
    printf(" trt_engine_retinaface_destroy .. \n");
    if (ctx->h_trt_engine) {
        //delete ctx->h_trt_engine;
        ctx->h_trt_engine = NULL;
    }
    printf(" trt_engine_retinaface_destroy .. done\n");
    delete ctx;
}
3.2 Packaging the Python module
The packaging code is in the python directory.
It draws on:
https://github.com/PierreFritsch/OpenCV-Python-C-Module-for-Image-Processing/
https://github.com/Algomorph/pyboostcvconverter
The two main difficulties were:
converting cv::Mat between Python and C++, which is solved with the numpy <-> cv::Mat converter from Algomorph/pyboostcvconverter;
returning the C++ detection results, whose data structure is fairly involved, so the C++ side simply serializes them to a JSON string and hands that back to Python.
Wrapper code:
#trt_engine_retinaface_module.cpp
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <Python.h>
#include <time.h>
#include <sys/time.h>
#include <opencv2/opencv.hpp>
#include <opencv2/highgui/highgui.hpp>
#include "tensorrt_engine_wrap.h"
#include "pyboostcvconverter/pyboostcvconverter.hpp"
#include <boost/python.hpp>
using namespace cv;
using namespace boost::python;
static unsigned int time_now_ms()
{
    struct timespec t_now;
    int i_sec, i_ms;

    clock_gettime(CLOCK_MONOTONIC, &t_now);
    i_sec = (int)(t_now.tv_sec);
    i_ms = (int)(t_now.tv_nsec / 1000000);
    return (unsigned int)(1000 * i_sec + i_ms);
}
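// Python entry point: TRTRetinaFace.create(trt_path) -> engine handle (the context pointer packed into an unsigned 64-bit int).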
static PyObject *mpyCreate(PyObject *self, PyObject *args)
{
    char *engine_name = NULL;
    int w, h;
    void *ctx = NULL;
    unsigned int t0, t1;

    if (!PyArg_ParseTuple(args, "s", &engine_name))
        return NULL;

    printf("create engine , trt name = %s \n", engine_name);
    t0 = time_now_ms();
    ctx = trt_engine_retinaface_create(engine_name);
    t1 = time_now_ms();
    printf("engine create = %llu, cost %d ms\n", (unsigned long long)ctx, t1 - t0);
    return Py_BuildValue("K", (unsigned long long)ctx);
}
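// Python entry point: TRTRetinaFace.detect(handle, image) -> JSON string; the numpy image is converted to cv::Mat by pyboostcvconverter.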
static PyObject *mpyDetect(PyObject *self, PyObject *args)
{
    void *engine = NULL;
    PyObject *ndArray = NULL;
    const char *ret = NULL;
    unsigned long long v;
    unsigned int t0, t1, t2;

    if (!PyArg_ParseTuple(args, "KO", &v, &ndArray))
        return NULL;

    t0 = time_now_ms();
    Mat mat = pbcvt::fromNDArrayToMat(ndArray);
    t1 = time_now_ms();
    engine = (void *)v;
    ret = trt_engine_retinaface_detect(engine, mat);
    t2 = time_now_ms();
    printf("engine do detect , load numpy array to mat cost %d ms, detect cost %d ms \n", t1 - t0, t2 - t1);
    return Py_BuildValue("s", ret);
}
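// Python entry point: TRTRetinaFace.destroy(handle) releases the engine context.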
static PyObject *mPyDestroy(PyObject *self, PyObject *args)
{
    void *engine = NULL;
    unsigned long long v;

    if (!PyArg_ParseTuple(args, "K", &v))
        return NULL;

    printf(" destroy engine , engine = %llu\n", v);
    engine = (void *)v;
    trt_engine_retinaface_destroy(engine);
    Py_RETURN_NONE;  /* returning NULL here would signal an exception to Python */
}
static PyMethodDef TRTRetinaFaceMeThods[] = {
    {"create", mpyCreate, METH_VARARGS, "Create the engine."},
    {"detect", mpyDetect, METH_VARARGS, "use the engine to detect image"},
    {"destroy", mPyDestroy, METH_VARARGS, "destroy the engine"},
    {NULL, NULL, 0, NULL}
};
static struct PyModuleDef TRTRetinaFaceModule = {
    PyModuleDef_HEAD_INIT,
    "TRTRetinaFace",    /* name of module */
    "",                 /* module documentation, may be NULL */
    -1,                 /* size of per-interpreter state of the module, or -1 if the module keeps state in global variables */
    TRTRetinaFaceMeThods
};
PyMODINIT_FUNC PyInit_TRTRetinaFace(void) {
    printf("init module ... \n");
    return PyModule_Create(&TRTRetinaFaceModule);
}
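The module above is compiled and installed by python/setup.py (step 3 of the README). That file is not reproduced here; purely as a rough sketch, a Boost.Python + OpenCV extension build can be declared along these lines, where the source list, include paths and library names (boost_python3, the OpenCV libs, tensorrt_engine_lib) are my assumptions rather than the repo's actual values:
# setup.py (sketch only -- not the repo's actual file)
from setuptools import setup, Extension

trt_retinaface = Extension(
    "TRTRetinaFace",
    sources=[
        "trt_engine_retinaface_module.cpp",
        "pyboost_cv4_converter.cpp",   # numpy <-> cv::Mat converter taken from pyboostcvconverter
    ],
    include_dirs=["../include", "/usr/include/opencv4"],
    library_dirs=["/usr/local/lib"],
    libraries=[
        "tensorrt_engine_lib",         # libtensorrt_engine_lib.so built by build_all.sh
        "boost_python3",               # may be named boost_python-py36 depending on the distro
        "opencv_core", "opencv_imgproc", "opencv_imgcodecs",
    ],
    extra_compile_args=["-std=c++11"],
)

setup(name="TRTRetinaFace", version="0.1", ext_modules=[trt_retinaface])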
3.3 Verification
Build and run following the README instructions on GitHub.
Test program:
import cv2
import TRTRetinaFace as t

engine = t.create('../retinaface.trt')
for i in range(1, 6):
    img = cv2.imread('/home/walle/x%d.jpg' % i)
    b = t.detect(engine, img)
    print(b)
Output:
My test images are fairly large, all 2K or 4K.
root@walle-desktop:/workspace/face_detect/tensorrt_retinaface_with_python/python# python3 test.py
init module ...
create engine , trt name = ../models/retinaface.trt
trt_engine_retinaface_create start ...
=> checking engine....
=> engine maxBatchSize: 32
=> engine NbBindings: 4
=> BindingName at: 0=input.1 Dims=4
=> BindingName at: 1=487 Dims=3
=> BindingName at: 2=513 Dims=3
=> BindingName at: 3=514 Dims=3
I 11/18 18:37:45.933 ..._engine_wrap.cc createPriors:163] 0.00390625 0.0058997 0.015625 0.0235988
I 11/18 18:37:45.934 ..._engine_wrap.cc createPriors:163] 0.00390625 0.0058997 0.03125 0.0471976
I 11/18 18:37:45.934 ..._engine_wrap.cc createPriors:163] 0.0117188 0.0058997 0.015625 0.0235988
I 11/18 18:37:45.934 ..._engine_wrap.cc createPriors:163] 0.0117188 0.0058997 0.03125 0.0471976
I 11/18 18:37:45.934 ..._engine_wrap.cc createPriors:163] 0.0195312 0.0058997 0.015625 0.0235988
trt_engine_retinaface_create done
engine create = 822455280, cost 4162 ms
engine do detect , load numpy array to mat cost 0 ms, detect cost 1026 ms
{"num_face": 0}
engine do detect , load numpy array to mat cost 0 ms, detect cost 181 ms
{"num_face": 0}
engine do detect , load numpy array to mat cost 0 ms, detect cost 168 ms
{"num_face": 1, "faces": [{"score": 0.994, "box":{"x1":0.307, "y1":0.076, "x2":0.601, "y2":0.526},"landmark": {"x":[0.408, 0.542, 0.473, 0.387, 0.503], "y":[0.214, 0.249, 0.333, 0.385, 0.416]}} ]}
engine do detect , load numpy array to mat cost 0 ms, detect cost 140 ms
{"num_face": 0}
engine do detect , load numpy array to mat cost 0 ms, detect cost 132 ms
{"num_face": 1, "faces": [{"score": 0.989, "box":{"x1":0.415, "y1":0.179, "x2":0.693, "y2":0.600},"landmark": {"x":[0.526, 0.645, 0.616, 0.538, 0.646], "y":[0.337, 0.328, 0.408, 0.479, 0.471]}} ]}