yolov7 - pose 系列CUDA 前后处理

最新推荐文章于 2024-02-05 15:04:40 发布

Cindy_1224

最新推荐文章于 2024-02-05 15:04:40 发布

阅读量705

点赞数 2

分类专栏： CUDA 模型部署 文章标签： 算法 c++

本文链接：https://blog.csdn.net/cindywry/article/details/130425189

版权

模型部署同时被 2 个专栏收录

7 篇文章 2 订阅

订阅专栏

CUDA

1 篇文章 1 订阅

订阅专栏

yolov7-pose.cpp

#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <numeric>
#include <sstream>
#include <string>
#include <vector>

#include <dirent.h>
#include <opencv2/opencv.hpp>

#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"
//#include "include/utils.hpp"
#include "utils.hpp"
#include "preprocess.h"
#include "postprocess.h"
// Largest supported input image, in pixels. The host/device staging buffers in
// main() are sized as this value times 3 bytes. The expression is parenthesized
// so the macro expands safely inside larger expressions (e.g. `n / MACRO`).
#define MAX_IMAGE_INPUT_SIZE_THRESH (5000 * 5000)
// Capacity, in boxes, of the decoded-detections buffer.
#define MAX_OBJECTS 2048
// Number of floats stored per decoded box in that buffer.
#define NUM_BOX_ELEMENT 17

using namespace std;

// Pair of row-major 2x3 affine matrices describing the letterbox mapping.
struct affineMatrix
{
    // Forward transform (filled by getd2i as the image -> destination mapping).
    float i2d[6];
    // Inverse of i2d (destination -> image).
    float d2i[6];
};

// One detection as consumed by the drawing code in main().
struct bbox
{
    // Box corners. Note the declaration order: x1, x2, y1, y2.
    float x1;
    float x2;
    float y1;
    float y2;
    // Keypoints stored as (x, y) pairs: 5 keypoints * 2 = 10 floats.
    float landmarks[10];
    // Confidence of the detection.
    float score;
};

// Evaluates `status` exactly once. On any non-zero result it prints the numeric
// code, the CUDA error string and the call site (file:line), then aborts.
// The local is given an unusual name so that a checked expression which itself
// mentions a variable called `ret` is not shadowed by the macro.
#define CHECK(status) \
    do\
    {\
        auto check_status_ = (status);\
        if (check_status_ != 0)\
        {\
            std::cerr << "Cuda failure: " << check_status_\
                      << " (" << cudaGetErrorString(static_cast<cudaError_t>(check_status_)) << ")"\
                      << " at " << __FILE__ << ":" << __LINE__ << std::endl;\
            abort();\
        }\
    } while (0)

#define DEVICE 0  // GPU id
// IoU threshold handed to the CUDA NMS step.
#define NMS_THRESH 0.45
// Confidence threshold handed to the CUDA decode step.
#define BBOX_CONF_THRESH 0.3

using namespace nvinfer1;

// Width/height of the network input; main() uses them as the preprocessing destination size.
static const int INPUT_W = 640;
static const int INPUT_H = 640;
static const int NUM_CLASSES = 1;  // number of classes
static const int CKPT_NUM = 5;  // number of keypoints per detection


const char* INPUT_BLOB_NAME = "images"; // name of the ONNX input binding
const char* OUTPUT_BLOB_NAME = "output"; // name of the ONNX output binding
// Logger instance passed to createInferRuntime() in main().
static Logger gLogger;


// Applies a row-major 2x3 affine matrix to the point (x, y) and stores the
// transformed point in *ox / *oy. With the inverse letterbox matrix this maps
// network-input coordinates back to original-image coordinates.
void affine_project(float *d2i,float x,float y,float *ox,float *oy)
{
    const float *row_x = d2i;      // coefficients producing the output x
    *ox = row_x[0]*x+row_x[1]*y+row_x[2];

    const float *row_y = d2i + 3;  // coefficients producing the output y
    *oy = row_y[0]*x+row_y[1]*y+row_y[2];
}


// One colour per keypoint index. main() turns row j into a cv::Scalar for the
// circle drawn at keypoint j, so the table needs at least CKPT_NUM rows.
// NOTE(review): the triples are presumably in OpenCV's B, G, R channel order — confirm.
const float color_list[5][3] =
{
    {255, 0, 0},
    {0, 255, 0},
    {0, 0, 255},
    {0, 255, 255},
    {255,255,0},
};

// Computes the letterbox affine transform and its inverse.
//
// afmt.i2d receives the matrix that uniformly scales an image of size `from`
// so it fits inside `to` and centres it there; afmt.d2i receives the inverse
// matrix (destination -> image).
//
// afmt : output, both matrices are overwritten.
// to   : destination (network input) size.
// from : source image size.
//
// If `from` has a non-positive dimension (e.g. an image that failed to load)
// the scale would be inf/NaN, so both matrices are set to the identity instead.
void getd2i(affineMatrix &afmt,cv::Size  to,cv::Size from)
{
    if (from.width <= 0 || from.height <= 0)
    {
        const float identity[6] = {1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f};
        memcpy(afmt.i2d, identity, sizeof(afmt.i2d));
        memcpy(afmt.d2i, identity, sizeof(afmt.d2i));
        return;
    }

    // Uniform scale: the tighter of the two axis ratios keeps the aspect ratio.
    float scale = std::min(1.0*to.width/from.width, 1.0*to.height/from.height);
    afmt.i2d[0]=scale;
    afmt.i2d[1]=0;
    afmt.i2d[2]=-scale*from.width*0.5+to.width*0.5;   // centre horizontally
    afmt.i2d[3]=0;
    afmt.i2d[4]=scale;
    afmt.i2d[5]=-scale*from.height*0.5+to.height*0.5; // centre vertically

    // i2d_mat is a view over afmt.i2d. The inverse is written into a matrix
    // owned by OpenCV and then copied out, so the memcpy below never has
    // identical source and destination (which is undefined behaviour).
    cv::Mat i2d_mat(2,3,CV_32F,afmt.i2d);
    cv::Mat d2i_mat;
    cv::invertAffineTransform(i2d_mat,d2i_mat);
    memcpy(afmt.d2i, d2i_mat.ptr<float>(0), sizeof(afmt.d2i));
}

int main(int argc, char** argv)
 {
    cudaSetDevice(DEVICE);
    char *trtModelStreamDet{nullptr};
    size_t size{0};
    const std::string engine_file_path {argv[1]};  
    std::ifstream file(engine_file_path, std::ios::binary);
    int batch_size = 1;
    if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStreamDet = new char[size];
        assert(trtModelStreamDet);
        file.read(trtModelStreamDet, size);
        file.close();
    }

   

    //det模型trt初始化
    IRuntime* runtime_det = createInferRuntime(gLogger);
    assert(runtime_det != nullptr);
    ICudaEngine* engine_det = runtime_det->deserializeCudaEngine(trtModelStreamDet, size);
    assert(engine_det != nullptr); 
    IExecutionContext* context_det = engine_det->createExecutionContext();
    assert(context_det != nullptr);
    delete[] trtModelStreamDet;

  
    float *buffers[2];
    const int inputIndex = engine_det->getBindingIndex(INPUT_BLOB_NAME);
    const int outputIndex = engine_det->getBindingIndex(OUTPUT_BLOB_NAME);
    assert(inputIndex == 0);
    assert(outputIndex == 1);
    // Create GPU buffers on device
   

    auto out_dims = engine_det->getBindingDimensions(1);
    auto output_size = 1;
    int OUTPUT_CANDIDATES = out_dims.d[1];

       for(int j=0;j<out_dims.nbDims;j++) {
        output_size *= out_dims.d[j];
        cout<< "out_dims.d["<< j << "] : "<< out_dims.d[j] <<endl;
    }

    cout<< "output candidates: "<< OUTPUT_CANDIDATES << endl;
    CHECK(cudaMalloc((void**)&buffers[inputIndex],  3 * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc((void**)&buffers[outputIndex], output_size * sizeof(float)));


     // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    uint8_t* img_host = nullptr;
    uint8_t* img_device = nullptr;
    float *affine_matrix_d2i_host = nullptr;
    float *affine_matrix_d2i_device = nullptr;
    float *decode_ptr_device = nullptr;
    float *decode_ptr_host = nullptr;
    decode_ptr_host = new float[1+MAX_OBJECTS*NUM_BOX_ELEMENT];
    // prepare input data cache in pinned memory 
    CHECK(cudaMallocHost((void**)&img_host, MAX_IMAGE_INPUT_SIZE_THRESH * 3));
    // prepare input data cache in device memory
    CHECK(cudaMalloc((void**)&img_device, MAX_IMAGE_INPUT_SIZE_THRESH * 3));
    CHECK(cudaMallocHost(&affine_matrix_d2i_host,sizeof(float)*6));
    CHECK(cudaMalloc(&affine_matrix_d2i_device,sizeof(float)*6));
    CHECK(cudaMalloc(&decode_ptr_device,sizeof(float)*(1+MAX_OBJECTS*NUM_BOX_ELEMENT)));

    static float* prob = new float[output_size];


    // std::string imgPath ="/mnt/Gpan/Mydata/pytorchPorject/Chinese_license_plate_detection_recognition/imgs";
    std::string input_image_path=argv[2];
     std::string imgPath=argv[2];
    std::vector<std::string> imagList;
    std::vector<std::string>fileType{"jpg","png"};
    readFileList(const_cast<char *>(imgPath.c_str()),imagList,fileType);
    double sumTime = 0;
    int index = 0;
    cv::Size to(INPUT_W, INPUT_H);
    for (auto &input_image_path:imagList) 
    {
        affineMatrix afmt;
        cv::Mat img = cv::imread(input_image_path);

        getd2i(afmt, to, cv::Size(img.cols, img.rows));
        double begin_time = cv::getTickCount();
        float *buffer_idx = (float*)buffers[inputIndex];
        size_t size_image = img.cols * img.rows * 3;
        size_t size_image_dst = INPUT_H * INPUT_W * 3;
        memcpy(affine_matrix_d2i_host,afmt.d2i,sizeof(afmt.d2i));
        memcpy(img_host, img.data, size_image);
       
        CHECK(cudaMemcpyAsync(img_device, 
                                img_host, 
                                size_image, 
                                cudaMemcpyHostToDevice, 
                                stream));
                                
        CHECK(cudaMemcpyAsync(affine_matrix_d2i_device,
                            affine_matrix_d2i_host,
                            sizeof(afmt.d2i),
                            cudaMemcpyHostToDevice,stream));

        preprocess_kernel_img(img_device, 
                            img.cols, 
                            img.rows, 
                            buffer_idx, 
                            INPUT_W, 
                            INPUT_H,
                            affine_matrix_d2i_device, 
                            stream); //前处理 ，相当于letter_box
                            
        double time_pre = cv::getTickCount();
        double time_pre_=(time_pre-begin_time)/cv::getTickFrequency()*1000;
        std::cout<<"preprocessing time is "<<time_pre_<<" ms"<<std::endl;
      
        // doInference_cu(*context_det,stream, (void**)buffers,prob,1,output_size);
        (*context_det).enqueueV2((void**)buffers, stream, nullptr);
        float *predict = (float *)buffers[outputIndex];
        double time_infer = cv::getTickCount();
        time_infer =(time_infer-begin_time)/cv::getTickFrequency()*1000;
        std::cout<<"time_infer is "<<time_infer<<" ms"<<std::endl;

        CHECK(cudaMemsetAsync(decode_ptr_device, 
                                0, 
                                sizeof(int), 
                                stream));
                                
        decode_kernel_invoker(predict, 
                            OUTPUT_CANDIDATES, 
                            NUM_CLASSES, 
                            CKPT_NUM, 
                            BBOX_CONF_THRESH, 
                            affine_matrix_d2i_device,
                            decode_ptr_device,
                            MAX_OBJECTS,
                            stream);  //cuda 后处理


        nms_kernel_invoker(decode_ptr_device, NMS_THRESH, MAX_OBJECTS, stream);//cuda nms
        
        CHECK(cudaMemcpyAsync(decode_ptr_host, decode_ptr_device, sizeof(float)*(1+MAX_OBJECTS*NUM_BOX_ELEMENT), cudaMemcpyDeviceToHost, stream));
        

        cudaStreamSynchronize(stream);
        double end_time = cv::getTickCount();
        std::vector<bbox> boxes;
        //实际保留的bbox 个数 
        int boxes_count= 0;
        
        //decode_ptr_host 第0位表示bboxes 的总的个数
        int count = std::min((int)*decode_ptr_host, MAX_OBJECTS);
        //遍历所有的count 数量
        for (int i = 0; i < count; i++)
        {
           int basic_pos = 1 + i * NUM_BOX_ELEMENT;
           //检查标志位是否保留该bbox
           int keep_flag = decode_ptr_host[basic_pos + 6];
           if (keep_flag == 1)
           {
             boxes_count += 1;
             bbox box;
             box.x1 =  decode_ptr_host[basic_pos + 0];
             box.y1 =  decode_ptr_host[basic_pos + 1];
             box.x2 =  decode_ptr_host[basic_pos + 2];
             box.y2 =  decode_ptr_host[basic_pos + 3];
             box.score = decode_ptr_host[basic_pos + 4];
             //关键点的起点位置
             int landmark_pos = basic_pos + 7;
             //关键点的个数为5 个,（x1,y1）（x2,y2）（x3,y3）...
             for (int id = 0; id < CKPT_NUM; id += 1)
             {
                box.landmarks[2 * id] = decode_ptr_host[landmark_pos + 2 * id];  //xi
                box.landmarks[2 * id + 1 ] = decode_ptr_host[landmark_pos + 2 * id + 1]; //yi
             }
             boxes.push_back(box);
           }
        }
   
        double time_post = cv::getTickCount();
        time_post =(time_post - begin_time)/cv::getTickFrequency()*1000;
        std::cout<<"time_post is " << time_post <<" ms" << std::endl;
        std::cout<<input_image_path <<" " << "boxex count: " << boxes_count << endl;
        //画出关键点 和方框
        for (int i = 0; i<boxes_count; i++)
        {
            cv::Rect roi_area(boxes[i].x1,boxes[i].y1,boxes[i].x2-boxes[i].x1,boxes[i].y2-boxes[i].y1);
            cv::rectangle(img, roi_area, cv::Scalar(0,255,0), 2);
            for (int j= 0; j < CKPT_NUM; j++)
            {
                cv::Scalar color = cv::Scalar(color_list[j][0], color_list[j][1], color_list[j][2]);
                cv::circle(img,cv::Point(boxes[i].landmarks[2*j], boxes[i].landmarks[2*j+1]), 2, color, -1);
            }
        }
          
        auto time_gap = (end_time-begin_time)/cv::getTickFrequency()*1000;
        std::cout<<"  time_gap: "<<time_gap<<"ms ";
        if (index)
        {
            sumTime += time_gap;
        }
        std::cout<<std::endl;
        index += 1;

        int pos = input_image_path.find_last_of("/");
        std::string image_name = input_image_path.substr(pos+1);
        cv::imwrite(image_name, img);
    }
    // destroy the engine
    std::cout<<"averageTime:"<<(sumTime/(imagList.size() - 1 ))<<"ms"<<std::endl;
    context_det->destroy();
    engine_det->destroy();
    runtime_det->destroy();
 
    cudaStreamDestroy(stream);
    CHECK(cudaFree(affine_matrix_d2i_device));
    CHECK(cudaFreeHost(affine_matrix_d2i_host));
    CHECK(cudaFree(img_device));
    CHECK(cudaFreeHost(img_host));
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
    CHECK(cudaFree(decode_ptr_device));
    delete [] decode_ptr_host;
    return 0;
}

preprocess.cpp

#include "preprocess.h"
 
// Warp-affine resampling kernel: one thread produces one destination pixel.
//
// src            : source image, 3 interleaved uint8 channels per pixel
// src_line_size  : number of bytes per source row
// src_width/height : source size in pixels
// dst            : output floats, written as three consecutive planes of
//                  dst_width * dst_height values each
// const_value_st : fill value for samples that fall outside the source
// d2i            : 2x3 matrix mapping destination pixel coordinates to source coordinates
// edge           : number of destination pixels; threads at or beyond it exit
//
// Launch layout: 1-D grid of 1-D blocks covering at least `edge` threads.
__global__ void warpaffine_kernel( 
    uint8_t* src, int src_line_size, int src_width, 
    int src_height, float* dst, int dst_width, 
    int dst_height, uint8_t const_value_st,
    float *d2i, int edge) {
    const int pixel = blockDim.x * blockIdx.x + threadIdx.x;
    if (pixel >= edge) {
        return;
    }

    // Destination pixel handled by this thread.
    const int dx = pixel % dst_width;
    const int dy = pixel / dst_width;

    // Corresponding sampling position in the source image.
    const float src_x = d2i[0] * dx + d2i[1] * dy + d2i[2] + 0.5f;
    const float src_y = d2i[3] * dx + d2i[4] * dy + d2i[5] + 0.5f;

    // Source-order channel values; they stay at the fill value when the
    // sampling position is out of range.
    float ch0 = const_value_st;
    float ch1 = const_value_st;
    float ch2 = const_value_st;

    const bool outside = src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height;
    if (!outside) {
        const int y_low = floorf(src_y);
        const int x_low = floorf(src_x);
        const int y_high = y_low + 1;
        const int x_high = x_low + 1;

        // Bilinear weights of the four neighbouring pixels.
        const float ly = src_y - y_low;
        const float lx = src_x - x_low;
        const float hy = 1 - ly;
        const float hx = 1 - lx;
        const float w_tl = hy * hx, w_tr = hy * lx, w_bl = ly * hx, w_br = ly * lx;

        // Neighbours that lie outside the image read from the fill colour.
        uint8_t fill[] = {const_value_st, const_value_st, const_value_st};
        const bool top_in    = y_low >= 0;
        const bool bottom_in = y_high < src_height;
        const bool left_in   = x_low >= 0;
        const bool right_in  = x_high < src_width;

        uint8_t* tl = (top_in && left_in)     ? src + y_low * src_line_size + x_low * 3   : fill;
        uint8_t* tr = (top_in && right_in)    ? src + y_low * src_line_size + x_high * 3  : fill;
        uint8_t* bl = (bottom_in && left_in)  ? src + y_high * src_line_size + x_low * 3  : fill;
        uint8_t* br = (bottom_in && right_in) ? src + y_high * src_line_size + x_high * 3 : fill;

        ch0 = w_tl * tl[0] + w_tr * tr[0] + w_bl * bl[0] + w_br * br[0];
        ch1 = w_tl * tl[1] + w_tr * tr[1] + w_bl * bl[1] + w_br * br[1];
        ch2 = w_tl * tl[2] + w_tr * tr[2] + w_bl * bl[2] + w_br * br[2];
    }

    // Interleaved -> planar, with the first and third channels exchanged
    // (BGR source to RGB planes) and values scaled by 1/255.
    const int area = dst_width * dst_height;
    float* plane0 = dst + dy * dst_width + dx;
    float* plane1 = plane0 + area;
    float* plane2 = plane1 + area;
    *plane0 = ch2 / 255.0f;
    *plane1 = ch1 / 255.0f;
    *plane2 = ch0 / 255.0f;
}

// Launches warpaffine_kernel on `stream` to resample the device image `src`
// (src_width x src_height, 3 interleaved bytes per pixel, tightly packed rows)
// into the planar float buffer `dst` (dst_width x dst_height) using the 2x3
// matrix `d2i` held in device memory. Samples outside the source are filled
// with 128.
//
// The launch is asynchronous and this function does not check for launch
// errors; callers can query cudaGetLastError() afterwards.
// Does nothing when any dimension is non-positive, because a zero-block launch
// is an invalid configuration.
void preprocess_kernel_img(
    uint8_t* src, int src_width, int src_height,
    float* dst, int dst_width, int dst_height,
    float*d2i,cudaStream_t stream) {
    if (src_width <= 0 || src_height <= 0 || dst_width <= 0 || dst_height <= 0) {
        return;
    }

    const int jobs = dst_height * dst_width;   // one thread per destination pixel
    const int threads = 256;
    // Integer ceil-division: the float-based form loses precision once
    // jobs exceeds 2^24 and could leave the last pixels unprocessed.
    const int blocks = (jobs + threads - 1) / threads;
    warpaffine_kernel<<<blocks, threads, 0, stream>>>(
        src, src_width*3, src_width,
        src_height, dst, dst_width,
        dst_height, 128, d2i, jobs);

}

postprocess.cpp

#include "postprocess.h"

// Floats stored per decoded detection:
// left, top, right, bottom, confidence, class, keepflag, then 5 keypoints as (x, y) pairs.
const int NUM_BOX_ELEMENT = 17;

// Applies the row-major 2x3 affine `matrix` to the point (x, y) and writes the
// transformed point to *ox / *oy.
static __device__ void affine_project(float* matrix, float x, float y, float* ox, float* oy){
    const float* row_x = matrix;       // coefficients producing the output x
    *ox = row_x[0] * x + row_x[1] * y + row_x[2];

    const float* row_y = matrix + 3;   // coefficients producing the output y
    *oy = row_y[0] * x + row_y[1] * y + row_y[2];
}

// Decodes raw network output into a compact list of detections.
// One thread handles one candidate.
//
// predict : network output; candidate i starts at
//           predict + (5 + num_classes + ckpt * 3) * i and is laid out as
//             [0..3]  box centre x, centre y, width, height
//             [4]     objectness
//             [5 ..]  num_classes class scores
//             then    ckpt keypoints, each as (x, y, confidence)
// num_bboxes           : number of candidates
// confidence_threshold : candidates whose objectness, or objectness * best
//                        class score, is below it are dropped
// invert_affine_matrix : 2x3 matrix mapping network-input coordinates back to
//                        image coordinates
// parray  : output. parray[0] is a running count of accepted candidates, kept
//           as a float and incremented atomically (the caller zeroes it before
//           the launch); accepted candidate k is written at
//           parray + 1 + k * NUM_BOX_ELEMENT as
//           left, top, right, bottom, confidence, label, keepflag(=1), keypoint (x, y) pairs.
// max_objects : capacity of parray in records; candidates beyond it still bump
//               the counter but are not stored.
//
// A record has room for (NUM_BOX_ELEMENT - 7) / 2 keypoints. Keypoints beyond
// that are not stored; if ckpt is smaller, the remaining slots are written as 0.
//
// Launch layout: 1-D grid of 1-D blocks covering at least num_bboxes threads.
static __global__ void decode_kernel(float* predict, int num_bboxes, int num_classes,int ckpt, float confidence_threshold, float* invert_affine_matrix, float* parray, int max_objects){  

    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= num_bboxes) return;

    float* pitem = predict + (5 + num_classes + ckpt * 3) * position;

    // Cheap early exit on the objectness score.
    float objectness = pitem[4];
    if(objectness < confidence_threshold)
        return;

    // Pick the class with the highest score; label 0 is the starting choice.
    float* class_confidence = pitem + 5;
    float confidence = *class_confidence++;
    int label  = 0;
    for(int i = 1; i < num_classes; ++i, ++class_confidence){
        if(*class_confidence > confidence){
            confidence = *class_confidence;
            label = i;
        }
    }

    confidence *= objectness;
    if(confidence < confidence_threshold)
        return;

    // Reserve an output slot. atomicAdd returns the previous value of the
    // counter, which makes the slot index unique per accepted candidate.
    int index = static_cast<int>(atomicAdd(parray, 1.0f));
    if(index >= max_objects)
        return;

    float cx         = pitem[0];
    float cy         = pitem[1];
    float width      = pitem[2];
    float height     = pitem[3];

    float left   = cx - width * 0.5f;
    float top    = cy - height * 0.5f;
    float right  = cx + width * 0.5f;
    float bottom = cy + height * 0.5f;

    // Map the box corners back to image coordinates.
    affine_project(invert_affine_matrix, left, top, &left,  &top);
    affine_project(invert_affine_matrix, right, bottom, &right, &bottom);

    float* pout_item = parray + 1 + index * NUM_BOX_ELEMENT;
    *pout_item++ = left;
    *pout_item++ = top;
    *pout_item++ = right;
    *pout_item++ = bottom;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1; // 1 = keep, 0 = ignore

    // Keypoints: read (x, y) of each (x, y, confidence) triple, map it back to
    // image coordinates and store the pair. Only keypoints that exist in the
    // input (k < ckpt) are read, so a smaller ckpt never reads past the item.
    const int keypoint_slots = (NUM_BOX_ELEMENT - 7) / 2;
    float* landmarks = pitem + 5 + num_classes;
    for(int k = 0; k < keypoint_slots; ++k){
        float kx = 0.0f;
        float ky = 0.0f;
        if(k < ckpt){
            affine_project(invert_affine_matrix, landmarks[3 * k], landmarks[3 * k + 1], &kx, &ky);
        }
        *pout_item++ = kx;
        *pout_item++ = ky;
    }
}

// Intersection-over-union of two axis-aligned boxes given as
// (left, top, right, bottom). Returns 0 when the boxes do not overlap.
static __device__ float box_iou(
    float aleft, float atop, float aright, float abottom, 
    float bleft, float btop, float bright, float bbottom
){
    // Overlap rectangle, clamped to zero extent when the boxes are disjoint.
    const float inter_left   = max(aleft, bleft);
    const float inter_top    = max(atop, btop);
    const float inter_right  = min(aright, bright);
    const float inter_bottom = min(abottom, bbottom);

    const float inter_area = max(inter_right - inter_left, 0.0f) * max(inter_bottom - inter_top, 0.0f);
    if(inter_area == 0.0f){
        return 0.0f;
    }

    const float area_a = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
    const float area_b = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
    const float union_area = area_a + area_b - inter_area;
    return inter_area / union_area;
}

// Non-maximum suppression over the decoded detections; one thread per box.
//
// bboxes : bboxes[0] holds the number of decoded boxes; record k starts at
//          bboxes + 1 + k * NUM_BOX_ELEMENT and is laid out as
//          left, top, right, bottom, confidence, class, keepflag, ...
// max_objects : upper bound on the number of records considered
// threshold   : IoU above which an equal-or-better box of the same class
//               suppresses this thread's box
//
// A thread only ever writes the keep flag of its own box.
static __global__ void nms_kernel(float* bboxes, int max_objects, float threshold){

    const int self = (blockDim.x * blockIdx.x + threadIdx.x);
    // Several boxes may cover the same object; only `total` records are valid.
    const int total = min((int)*bboxes, max_objects);
    if (self >= total) 
        return;

    float* mine = bboxes + 1 + self * NUM_BOX_ELEMENT;

    // Compare this box against every other valid box.
    for(int other = 0; other < total; ++other){
        if(other == self)
            continue;

        float* rival = bboxes + 1 + other * NUM_BOX_ELEMENT;

        // Boxes of a different class never suppress each other.
        if(mine[5] != rival[5])
            continue;

        // Only a box whose confidence is at least as high can suppress this one.
        if(!(rival[4] >= mine[4]))
            continue;

        // Tie-break: with equal confidence, only a rival at a higher index may
        // suppress this box. Without this, two overlapping boxes with the same
        // score would suppress each other and both would disappear.
        if(rival[4] == mine[4] && other < self)
            continue;

        const float overlap = box_iou(
            mine[0], mine[1], mine[2], mine[3],
            rival[0], rival[1], rival[2], rival[3]
        );

        if(overlap > threshold){
            // Drop this thread's box.
            mine[6] = 0;  // 1=keep, 0=ignore
            return;
        }
    }
} 

// Launches decode_kernel on `stream` with one thread per candidate
// (256 threads per block). All pointer arguments are passed to the kernel
// unchanged; see decode_kernel for their meaning.
//
// The launch is asynchronous and this function does not check for launch
// errors; callers can query cudaGetLastError() afterwards.
// Does nothing when num_bboxes is not positive, because a zero-block launch is
// an invalid configuration.
void decode_kernel_invoker(float* predict, int num_bboxes, int num_classes,int ckpt, float confidence_threshold, float* invert_affine_matrix, float* parray, int max_objects, cudaStream_t stream)
{
    if (num_bboxes <= 0) {
        return;
    }

    const int block = 256;
    // Integer ceil-division keeps the grid size exact for any num_bboxes.
    const int grid = (num_bboxes + block - 1) / block;

    decode_kernel<<<grid, block, 0, stream>>>(predict, num_bboxes, num_classes,ckpt, confidence_threshold, invert_affine_matrix, parray, max_objects);
}

// Launches nms_kernel on `stream` with enough threads to cover max_objects
// records; the kernel itself exits for threads beyond the actual box count
// stored in parray[0].
//
// The launch is asynchronous and this function does not check for launch
// errors; callers can query cudaGetLastError() afterwards.
// Does nothing when max_objects is not positive: that would produce a block
// size of 0 and a division by zero when computing the grid.
void nms_kernel_invoker(float* parray, float nms_threshold, int max_objects, cudaStream_t stream){

    if (max_objects <= 0) {
        return;
    }

    const int block = max_objects<256? max_objects:256;
    // Integer ceil-division keeps the grid size exact.
    const int grid = (max_objects + block - 1) / block;
    nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold);
}