1 Experimental environment:
Hardware and OS: Jetson Nano, JetPack 4.4.1
Deep learning library versions: CUDA 10.2, OpenCV 4.5, TensorRT 7.1
2 Goal:
RetinaFace is said to be one of the most accurate and fastest face detection and landmark frameworks, and the TensorRT version can reportedly reach about 4 ms per frame.
See this excellent write-up:
https://juejin.im/post/6844903967059771399
Once I got the model running I found it awkward to call: I am used to throwing together quick demos in Jupyter, yet this model, impressive as it sounds, only ships with a C++ interface.
So I decided to add Python detection bindings to the tensorrt_retinaface project,
and along the way learn how to wrap a C++ library as a Python module, i.e. mixed Python + C++ programming with OpenCV.
GitHub repository:
https://github.com/walletiger/tensorrt_retinaface_with_python
Postscript: after finishing the verification I found that onnx-tensorRT ships Python bindings of its own. Can retinaface_mbv2_sim.onnx be run directly through them? I'll share once I've figured that out.
# readme
# tensorrt_retinaface_with_python
Just adds Python bindings for the tensorrt_retinaface detection framework (supports ONNX-to-TensorRT models).
# TensorRT ONNX with python bindings
Cloned from https://github.com/wuchaodzxx/tensorrt_retinaface
I added Python 3 bindings for the TensorRT ONNX engine (+ OpenCV 4).
#how to run:
#1 ./build_all.sh
#2 cp build/libtensorrt_engine_lib.so /usr/local/lib/
#3 cd python && python3 setup.py install
# you must have CUDA, TensorRT, OpenCV 4.x and Boost.Python installed
#4 convert onnx models to tensorRT
onnx2trt models/retinaface_mbv2_sim.onnx -o models/retinaface.trt
#5 test
cd python && python3 test.py
#details:
tensorRT wrapper : tensorrt_engine_wrap.cc
python Modules : python/trt_engine_retinaface_module.cpp python/setup.py
#python Modules :
references :
https://github.com/PierreFritsch/OpenCV-Python-C-Module-for-Image-Processing/
https://github.com/Algomorph/pyboostcvconverter
The OpenCV-Python-C-Module-for-Image-Processing project is simple, but it does not work in my OpenCV 4 environment.
pyboostcvconverter is a little more involved.
I extracted pyboost_cv4_converter.cpp from pyboostcvconverter and combined it with OpenCV-Python-C-Module-for-Image-Processing.
3 Process
3.1 Wrapping the C++ module
Three interfaces are provided to make porting easy.
The code is in:
include/tensorrt_engine_wrap.h
src/tensorrt_engine_wrap.cc
void * trt_engine_retinaface_create(const char * engine_path);
const char * trt_engine_retinaface_detect(void *h_engine, cv::Mat &img);
void trt_engine_retinaface_destroy(void *h_engine);
# tensorrt_engine_wrap.cc (excerpt of the added code)
#include <assert.h>
#include <cuda_runtime_api.h>
#include <sys/stat.h>
#include <algorithm>
#include <cmath>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <malloc.h>
#include "NvInfer.h"
#include "NvOnnxParser.h"
#include <opencv2/opencv.hpp>
#include "image.h"
#include <sys/time.h>
#include <iterator>
#include <tensorrt_engine.h>
#include "thor/timer.h"
#include "thor/logging.h"
#include "thor/os.h"
#include "thor/structures.h"
#include "../include/batch_stream.h"
#include "../include/tensorrt_utils.h"
#include "../include/entropy_calibrator.h"
// we will use Eigen for some of the computation
#include "eigen3/Eigen/Eigen"
#include "eigen3/Eigen/Core"
#include "tensorrt_engine_wrap.h"
/**
*
* Inference on a new onnx converted trt model
* using standalone TensorRT engine
*
*/
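// Serialize the detected faces into a JSON string in face_json and return its length.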
static int to_json_str(vector<FaceInfo> &face_info, char *face_json)
{
    int len;
    int i;
    int num_face = face_info.size();

    sprintf(face_json, "{\"num_face\": %d", num_face);
    if (num_face > 0) {
        len = strlen(face_json);
        sprintf(face_json + len, ", \"faces\": [");
        for (i = 0; i < num_face; ++i) {
            len = strlen(face_json);
            if (i != 0) {
                sprintf(face_json + len, ",");
                len += 1;
            }
            sprintf(face_json + len, "{"
                    "\"score\": %.3f, \"box\":{\"x1\":%.3f, \"y1\":%.3f, \"x2\":%.3f, \"y2\":%.3f},"
                    "\"landmark\": {\"x\":[%.3f, %.3f, %.3f, %.3f, %.3f], \"y\":[%.3f, %.3f, %.3f, %.3f, %.3f]}"
                    "}",
                    face_info[i].score,
                    face_info[i].box.x1, face_info[i].box.y1, face_info[i].box.x2, face_info[i].box.y2,
                    face_info[i].landmark.x[0], face_info[i].landmark.x[1], face_info[i].landmark.x[2],
                    face_info[i].landmark.x[3], face_info[i].landmark.x[4],
                    face_info[i].landmark.y[0], face_info[i].landmark.y[1], face_info[i].landmark.y[2],
                    face_info[i].landmark.y[3], face_info[i].landmark.y[4]);
        }
        len = strlen(face_json);
        sprintf(face_json + len, " ]");
    }
    len = strlen(face_json);
    sprintf(face_json + len, "}");
    len = strlen(face_json);
    return len;
}
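// Per-engine context: the TensorRT engine and execution context, the RetinaFace anchor priors, and a fixed 16 KB buffer that receives the JSON result.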
typedef struct
{
    tensorrt::TensorRTEngine *h_trt_engine;
    ICudaEngine *h_cuda_engine;
    IExecutionContext *exe_context;
    vector<Box> priors;
    char face_out_str[16384];
} TRTEngineRetinaface_Context;
void *trt_engine_retinaface_create(const char *engine_path)
{
    TRTEngineRetinaface_Context *ctx;

    printf("trt_engine_retinaface_create start ... \n");
    ctx = new TRTEngineRetinaface_Context;
    ctx->h_cuda_engine = NULL;
    ctx->h_trt_engine = NULL;
    ctx->exe_context = NULL;

    ctx->h_trt_engine = new tensorrt::TensorRTEngine(engine_path);
    ctx->h_cuda_engine = ctx->h_trt_engine->getEngine();
    if (!ctx->h_cuda_engine) {
        goto err_out;
    }

    CheckEngine(ctx->h_cuda_engine);
    ctx->exe_context = ctx->h_cuda_engine->createExecutionContext();
    if (!ctx->exe_context) {
        goto err_out;
    }

    ctx->priors = createPriors(min_sizes, steps, cv::Size(INPUT_W, INPUT_H));
    printf("trt_engine_retinaface_create done \n");
    return (void *)ctx;

err_out:
    if (ctx->h_trt_engine) {
        delete ctx->h_trt_engine;
        ctx->h_trt_engine = NULL;
    }
    delete ctx;
    return NULL;
}
const char *trt_engine_retinaface_detect(void *h_engine, cv::Mat &img)
{
    TRTEngineRetinaface_Context *ctx;
    float *data;
    int size;
    int len;

    ctx = (TRTEngineRetinaface_Context *)h_engine;
    cv::Mat resizedImage = cv::Mat::zeros(INPUT_H, INPUT_W, CV_32FC3);
    cv::resize(img, resizedImage, cv::Size(INPUT_W, INPUT_H));
    data = HWC2CHW(resizedImage, kMeans);
    vector<FaceInfo> all_faces = doInference(*ctx->exe_context, data, ctx->priors, 1, 0.4);
    len = to_json_str(all_faces, ctx->face_out_str);
    return ctx->face_out_str;
}
void trt_engine_retinaface_destroy(void *h_engine)
{
    TRTEngineRetinaface_Context *ctx;

    ctx = (TRTEngineRetinaface_Context *)h_engine;
    printf(" trt_engine_retinaface_destroy .. \n");
    if (ctx->h_trt_engine) {
        //delete ctx->h_trt_engine;
        ctx->h_trt_engine = NULL;
    }
    printf(" trt_engine_retinaface_destroy .. done\n");
    delete ctx;
}
3.2 Packaging the Python module
The packaging code is in the python directory.
It draws on:
https://github.com/PierreFritsch/OpenCV-Python-C-Module-for-Image-Processing/
https://github.com/Algomorph/pyboostcvconverter
The two main difficulties were:
converting cv::Mat between Python and C++, which is solved with the numpy <-> cv::Mat converter from Algomorph/pyboostcvconverter;
returning the C++ detection results, whose data structure is fairly involved, so the C++ side simply serializes them to a JSON string and hands that back to Python.
Wrapper code:
#trt_engine_retinaface_module.cpp
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <Python.h>
#include <time.h>
#include <sys/time.h>
#include <opencv2/opencv.hpp>
#include <opencv2/highgui/highgui.hpp>
#include "tensorrt_engine_wrap.h"
#include "pyboostcvconverter/pyboostcvconverter.hpp"
#include <boost/python.hpp>
using namespace cv;
using namespace boost::python;
static unsigned int time_now_ms()
{
    struct timespec t_now;
    int i_sec, i_ms;

    clock_gettime(CLOCK_MONOTONIC, &t_now);
    i_sec = (int)(t_now.tv_sec);
    i_ms = (int)(t_now.tv_nsec / 1000000);
    return (unsigned int)(1000 * i_sec + i_ms);
}
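// Python entry point: TRTRetinaFace.create(trt_path) -> engine handle (the context pointer packed into an unsigned 64-bit int).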
static PyObject *mpyCreate(PyObject *self, PyObject *args)
{
    char *engine_name = NULL;
    int w, h;
    void *ctx = NULL;
    unsigned int t0, t1;

    if (!PyArg_ParseTuple(args, "s", &engine_name))
        return NULL;

    printf("create engine , trt name = %s \n", engine_name);
    t0 = time_now_ms();
    ctx = trt_engine_retinaface_create(engine_name);
    t1 = time_now_ms();
    printf("engine create = %llu, cost %d ms\n", (unsigned long long)ctx, t1 - t0);
    return Py_BuildValue("K", (unsigned long long)ctx);
}
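// Python entry point: TRTRetinaFace.detect(handle, image) -> JSON string; the numpy image is converted to cv::Mat by pyboostcvconverter.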
static PyObject *mpyDetect(PyObject *self, PyObject *args)
{
    void *engine = NULL;
    PyObject *ndArray = NULL;
    const char *ret = NULL;
    unsigned long long v;
    unsigned int t0, t1, t2;

    if (!PyArg_ParseTuple(args, "KO", &v, &ndArray))
        return NULL;

    t0 = time_now_ms();
    Mat mat = pbcvt::fromNDArrayToMat(ndArray);
    t1 = time_now_ms();
    engine = (void *)v;
    ret = trt_engine_retinaface_detect(engine, mat);
    t2 = time_now_ms();
    printf("engine do detect , load numpy array to mat cost %d ms, detect cost %d ms \n", t1 - t0, t2 - t1);
    return Py_BuildValue("s", ret);
}
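// Python entry point: TRTRetinaFace.destroy(handle) releases the engine context.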
static PyObject *mPyDestroy(PyObject *self, PyObject *args)
{
    void *engine = NULL;
    unsigned long long v;

    if (!PyArg_ParseTuple(args, "K", &v))
        return NULL;

    printf(" destroy engine , engine = %llu\n", v);
    engine = (void *)v;
    trt_engine_retinaface_destroy(engine);
    Py_RETURN_NONE;  /* returning NULL here would signal an exception to Python */
}
static PyMethodDef TRTRetinaFaceMeThods[] = {
    {"create", mpyCreate, METH_VARARGS, "Create the engine."},
    {"detect", mpyDetect, METH_VARARGS, "use the engine to detect image"},
    {"destroy", mPyDestroy, METH_VARARGS, "destroy the engine"},
    {NULL, NULL, 0, NULL}
};
static struct PyModuleDef TRTRetinaFaceModule = {
    PyModuleDef_HEAD_INIT,
    "TRTRetinaFace",    /* name of module */
    "",                 /* module documentation, may be NULL */
    -1,                 /* size of per-interpreter state of the module, or -1 if the module keeps state in global variables */
    TRTRetinaFaceMeThods
};
PyMODINIT_FUNC PyInit_TRTRetinaFace(void) {
    printf("init module ... \n");
    return PyModule_Create(&TRTRetinaFaceModule);
}
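The module above is compiled and installed by python/setup.py (step 3 of the README). That file is not reproduced here; purely as a rough sketch, a Boost.Python + OpenCV extension build can be declared along these lines, where the source list, include paths and library names (boost_python3, the OpenCV libs, tensorrt_engine_lib) are my assumptions rather than the repo's actual values:
# setup.py (sketch only -- not the repo's actual file)
from setuptools import setup, Extension

trt_retinaface = Extension(
    "TRTRetinaFace",
    sources=[
        "trt_engine_retinaface_module.cpp",
        "pyboost_cv4_converter.cpp",   # numpy <-> cv::Mat converter taken from pyboostcvconverter
    ],
    include_dirs=["../include", "/usr/include/opencv4"],
    library_dirs=["/usr/local/lib"],
    libraries=[
        "tensorrt_engine_lib",         # libtensorrt_engine_lib.so built by build_all.sh
        "boost_python3",               # may be named boost_python-py36 depending on the distro
        "opencv_core", "opencv_imgproc", "opencv_imgcodecs",
    ],
    extra_compile_args=["-std=c++11"],
)

setup(name="TRTRetinaFace", version="0.1", ext_modules=[trt_retinaface])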
3.3 Verification
Build and run following the README instructions on GitHub.
Test program:
import cv2
import TRTRetinaFace as t

engine = t.create('../retinaface.trt')
for i in range(1, 6):
    img = cv2.imread('/home/walle/x%d.jpg' % i)
    b = t.detect(engine, img)
    print(b)
Output:
My test images are fairly large, all 2K or 4K.
root@walle-desktop:/workspace/face_detect/tensorrt_retinaface_with_python/python# python3 test.py
init module ...
create engine , trt name = ../models/retinaface.trt
trt_engine_retinaface_create start ...
=> checking engine....
=> engine maxBatchSize: 32
=> engine NbBindings: 4
=> BindingName at: 0=input.1 Dims=4
=> BindingName at: 1=487 Dims=3
=> BindingName at: 2=513 Dims=3
=> BindingName at: 3=514 Dims=3
I 11/18 18:37:45.933 ..._engine_wrap.cc createPriors:163] 0.00390625 0.0058997 0.015625 0.0235988
I 11/18 18:37:45.934 ..._engine_wrap.cc createPriors:163] 0.00390625 0.0058997 0.03125 0.0471976
I 11/18 18:37:45.934 ..._engine_wrap.cc createPriors:163] 0.0117188 0.0058997 0.015625 0.0235988
I 11/18 18:37:45.934 ..._engine_wrap.cc createPriors:163] 0.0117188 0.0058997 0.03125 0.0471976
I 11/18 18:37:45.934 ..._engine_wrap.cc createPriors:163] 0.0195312 0.0058997 0.015625 0.0235988
trt_engine_retinaface_create done
engine create = 822455280, cost 4162 ms
engine do detect , load numpy array to mat cost 0 ms, detect cost 1026 ms
{"num_face": 0}
engine do detect , load numpy array to mat cost 0 ms, detect cost 181 ms
{"num_face": 0}
engine do detect , load numpy array to mat cost 0 ms, detect cost 168 ms
{"num_face": 1, "faces": [{"score": 0.994, "box":{"x1":0.307, "y1":0.076, "x2":0.601, "y2":0.526},"landmark": {"x":[0.408, 0.542, 0.473, 0.387, 0.503], "y":[0.214, 0.249, 0.333, 0.385, 0.416]}} ]}
engine do detect , load numpy array to mat cost 0 ms, detect cost 140 ms
{"num_face": 0}
engine do detect , load numpy array to mat cost 0 ms, detect cost 132 ms
{"num_face": 1, "faces": [{"score": 0.989, "box":{"x1":0.415, "y1":0.179, "x2":0.693, "y2":0.600},"landmark": {"x":[0.526, 0.645, 0.616, 0.538, 0.646], "y":[0.337, 0.328, 0.408, 0.479, 0.471]}} ]}