在Tensorrt、openvino的YOLOv5预处理与后处理

最新推荐文章于 2024-06-14 19:05:55 发布

嗷嗷哦润橘_

最新推荐文章于 2024-06-14 19:05:55 发布

阅读量1.8k

点赞数 45

文章标签： openvino YOLO 人工智能

本文链接：https://blog.csdn.net/qq_64955200/article/details/136042077

版权

内容目录

YOLOv5预处理

Tensorrtx cpp版本

preprocess.cu文件函数preprocess_kernel_img

// Letterbox-preprocesses one image by launching warpaffine_kernel on `stream`.
//
// The source is scaled by a single factor (the smaller of the width and height
// ratios, so the aspect ratio is kept) and centred inside the destination; the
// inverse of that transform is handed to the kernel, which samples the source
// for every destination pixel and pads uncovered pixels with 128.
//
// Parameters:
//   src        - source pixels; passed straight to the kernel, so it must be
//                readable from device code. A row is src_width * 3 bytes.
//   src_width  - source width in pixels.
//   src_height - source height in pixels.
//   dst        - destination buffer written by the kernel as three planes of
//                dst_width * dst_height floats each.
//   dst_width  - destination width in pixels.
//   dst_height - destination height in pixels.
//   stream     - CUDA stream the kernel is enqueued on; this function does not
//                synchronize, so the result is only ready once the stream is.
//
// The call is a no-op when any dimension is not positive.
void preprocess_kernel_img(
    uint8_t* src, int src_width, int src_height,
    float* dst, int dst_width, int dst_height,
    cudaStream_t stream) {
    // An empty source would divide by zero below and an empty destination
    // would request a zero-block launch, so bail out early in both cases.
    if (src_width <= 0 || src_height <= 0 || dst_width <= 0 || dst_height <= 0) {
        return;
    }

    AffineMatrix s2d, d2s;
    float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);

    // Source -> destination: uniform scale plus a translation that centres the
    // scaled image inside the destination.
    s2d.value[0] = scale;
    s2d.value[1] = 0;
    s2d.value[2] = -scale * src_width  * 0.5  + dst_width * 0.5;
    s2d.value[3] = 0;
    s2d.value[4] = scale;
    s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;

    // Both Mats are headers over the AffineMatrix storage (no copy).
    cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
    cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
    cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);

    // m2x3_d2s was built on top of d2s.value, so the inverse is normally in
    // place already. Copy only if OpenCV handed back a different buffer;
    // memcpy with identical source and destination is undefined behaviour.
    if (m2x3_d2s.ptr<float>(0) != d2s.value) {
        memcpy(d2s.value, m2x3_d2s.ptr<float>(0), sizeof(d2s.value));
    }

    // One thread per destination pixel; integer ceil-div avoids the float
    // rounding of ceil(jobs / (float)threads) for large images.
    int jobs = dst_height * dst_width;
    int threads = 256;
    int blocks = (jobs + threads - 1) / threads;
    warpaffine_kernel<<<blocks, threads, 0, stream>>>(
        src, src_width*3, src_width,
        src_height, dst, dst_width,
        dst_height, 128, d2s, jobs);

}

在这个函数里，取scale系数为目标图像（变换后的图）与原始图像（变换前的图）的宽、高作比例计算，取较小值。
计算s2d，src_img到dst_img的仿射变换矩阵。
这里详细记录一下s2d得来的出处。是opencv库cv.WarpAffine函数cv2.warpAffine(src, M, dsize, flags, borderMode, borderValue)中的M变换矩阵。src输入图像、M变换矩阵、dsize为输出图像尺寸、flags插值方法、borderMode边界像素模式、borderValue像素边界颜色。
其中M矩阵 M = [[x,y,z],[x1,y1,z1]]： z,z1是左上角起点；y=0 y1=1.5 y轴方向拉1.5倍；x=1.5 x1=0 x轴方向1.5倍。
那么在这里理解一下s2d。也就是src_img到dst_img的变换：先把宽高按scale等比例缩放，再平移(-scale * src_width * 0.5 + dst_width * 0.5, -scale * src_height * 0.5 + dst_height * 0.5)，使缩放后的图像在dst_img中居中；这个平移量就是缩放后图像的左上角在dst_img中的位置。
cv::invertAffineTransform函数为取逆，得到d2s，dst_img到src_img的仿射变换矩阵。

d2s用于warpaffine_kernel的计算，jobs指有dst_img所有的像素点进行并行计算的工作个数。

这里到了函数warpaffine_kernel。

// Warp-affine resampling kernel: for every destination pixel it samples the
// source with bilinear interpolation, swaps channels 0 and 2 (BGR -> RGB),
// divides by 255 and stores the result in planar (channel-major) layout.
//
// Launch layout: 1-D grid of 1-D blocks, one thread per destination pixel.
// Threads whose flat index is >= edge exit immediately, so the grid may be
// rounded up. No shared memory is used.
//
// Parameters:
//   src            - source pixels, 3 interleaved bytes per pixel.
//   src_line_size  - number of bytes between the starts of two source rows.
//   src_width      - source width in pixels.
//   src_height     - source height in pixels.
//   dst            - output, three consecutive planes of
//                    dst_width * dst_height floats.
//   dst_width      - destination width in pixels.
//   dst_height     - destination height in pixels.
//   const_value_st - fill value for samples that fall outside the source.
//   d2s            - 2x3 destination -> source affine matrix, row-major.
//   edge           - number of destination pixels to process.
__global__ void warpaffine_kernel(
    uint8_t* src, int src_line_size, int src_width,
    int src_height, float* dst, int dst_width,
    int dst_height, uint8_t const_value_st,
    AffineMatrix d2s, int edge) {
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge) return;

    float m_x1 = d2s.value[0];
    float m_y1 = d2s.value[1];
    float m_z1 = d2s.value[2];
    float m_x2 = d2s.value[3];
    float m_y2 = d2s.value[4];
    float m_z2 = d2s.value[5];

    // Flat index -> (column, row) in the destination image.
    int dx = position % dst_width;
    int dy = position / dst_width;
    // Map the destination pixel back into source coordinates.
    // NOTE(review): the +0.5f shifts the sample point by half a source pixel;
    // confirm this offset is intended before changing it.
    float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
    float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
    float c0, c1, c2;

    if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
        // out of range: none of the four neighbours lies inside the source
        c0 = const_value_st;
        c1 = const_value_st;
        c2 = const_value_st;
    } else {
        int y_low = floorf(src_y);
        int x_low = floorf(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;

        uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
        float ly = src_y - y_low;
        float lx = src_x - x_low;
        float hy = 1 - ly;
        float hx = 1 - lx;
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
        // Every neighbour starts as the fill colour and is redirected to a
        // real source pixel only when that pixel is inside the image.
        uint8_t* v1 = const_value;
        uint8_t* v2 = const_value;
        uint8_t* v3 = const_value;
        uint8_t* v4 = const_value;

        // Top row of the 2x2 neighbourhood: v1 = (x_low, y_low), v2 = (x_high, y_low).
        if (y_low >= 0) {
            if (x_low >= 0)
                v1 = src + y_low * src_line_size + x_low * 3;

            if (x_high < src_width)
                v2 = src + y_low * src_line_size + x_high * 3;
        }

        // Bottom row: v3 = (x_low, y_high), v4 = (x_high, y_high).
        if (y_high < src_height) {
            if (x_low >= 0)
                v3 = src + y_high * src_line_size + x_low * 3;

            if (x_high < src_width)
                v4 = src + y_high * src_line_size + x_high * 3;
        }

        // Bilinear interpolation, channel by channel.
        c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
        c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
        c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
    }

    //bgr to rgb
    float t = c2;
    c2 = c0;
    c0 = t;

    //normalization
    c0 = c0 / 255.0f;
    c1 = c1 / 255.0f;
    c2 = c2 / 255.0f;

    //rgbrgbrgb to rrrgggbbb (interleaved -> planar, i.e. HWC -> CHW)
    // pdst_c0 is this pixel's slot in the first plane; each following plane
    // holds the same slot exactly `area` floats further on.
    int area = dst_width * dst_height;
    float* pdst_c0 = dst + dy * dst_width + dx;
    float* pdst_c1 = pdst_c0 + area;
    float* pdst_c2 = pdst_c1 + area;
    *pdst_c0 = c0;
    *pdst_c1 = c1;
    *pdst_c2 = c2;
}

这片就是cuda代码了。position就是这个线程所处的位置。我的理解是这里按dst_img的像素个数启动了同样多（向上取整到线程块大小）的线程用于计算，一个线程所在的位置代表在dst_img的位置，位置之间的关系是position=dy*dst_width+dx。【不懂，我猜的，咱也不熟。】
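然后，利用d2s变换矩阵计算得到dst_img上每个像素点对应到src_img的像素近似点，这么说是因为这里取了float，也就是在src_img上并不真实存在的点。用于计算dst_img对应点的像素。【这里的d2s变换计算我没搞清楚公式长什么样…】按代码，公式就是把d2s这个2x3矩阵作用在(dx, dy, 1)上：$src_x = m_{x1}\cdot dx + m_{y1}\cdot dy + m_{z1} + 0.5$，$src_y = m_{x2}\cdot dx + m_{y2}\cdot dy + m_{z2} + 0.5$。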
接着往下看，对于src_x，src_y判断是否出界，出界bgr三色为[128,128,128]。即为padding的部分颜色。
若没有出界，取y_low<=src_y<=y_high且y_high-y_low=1，向上和向下取整。由此得到lx，ly，hx，hy，为到向上和向下取整的距离。
v1，v2，v3，v4为取线程所在的位置。和position一样取。初始化是为了padding这种情况。
c0，c1，c2是bgr三色，在做双线性插值计算。为双线性插值推导而得的公式。

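公式（与代码中的w1~w4、v1~v4对应）：

$$c = w_1 v_1 + w_2 v_2 + w_3 v_3 + w_4 v_4$$

其中 $w_1 = h_y h_x$，$w_2 = h_y l_x$，$w_3 = l_y h_x$，$w_4 = l_y l_x$；$l_x = src_x - x_{low}$，$l_y = src_y - y_{low}$，$h_x = 1 - l_x$，$h_y = 1 - l_y$；$v_1, v_2, v_3, v_4$ 分别是 $(x_{low}, y_{low})$、$(x_{high}, y_{low})$、$(x_{low}, y_{high})$、$(x_{high}, y_{high})$ 处的像素值。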

往下看，剩下就是bgr转换为rgb，归一化的过程。
rgbrgbrgb to rrrgggbbb 为预处理NHWC转换为NCHW的过程！在底层存储放好了这个顺序。
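但是【我这里不懂为什么pdst_c0和pdst_c1，pdst_c2写得不一样！】其实三者是同一种写法：pdst_c0 = dst + dy * dst_width + dx 是该像素在第一个通道平面（R）里的位置；area = dst_width * dst_height 是一个通道平面的大小，所以 pdst_c1 = pdst_c0 + area、pdst_c2 = pdst_c1 + area 就是同一个像素在第二、第三个通道平面（G、B）里的位置。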

Tensorrtx py版本

【I dont understand Yolov5_trt.py 和yolov5_trt_cuda_python.py区别在什么？Pycuda vs cuda+python】

def preprocess_image(self, raw_bgr_image):
        """
        description: Letterbox a BGR image to (self.input_h, self.input_w):
                     convert to RGB, resize keeping the aspect ratio, pad the
                     remaining border with (128,128,128), scale to [0,1] and
                     lay the result out as a contiguous NCHW float32 array.
        param:
            raw_bgr_image: image array whose shape unpacks as (h, w, c),
                           channels in BGR order
        return:
            image:  the processed image, shape (1, c, input_h, input_w)
            image_raw: the input image, returned untouched
            h: original height
            w: original width
        """
        src_h, src_w, _ = raw_bgr_image.shape
        rgb = cv2.cvtColor(raw_bgr_image, cv2.COLOR_BGR2RGB)

        # The smaller of the two ratios decides which side fills the network
        # input completely; the other side is padded symmetrically.
        ratio_w = self.input_w / src_w
        ratio_h = self.input_h / src_h
        if ratio_h > ratio_w:
            # Width-limited: pad above and below.
            new_w, new_h = self.input_w, int(ratio_w * src_h)
            pad_left = pad_right = 0
            pad_top = int((self.input_h - new_h) / 2)
            pad_bottom = self.input_h - new_h - pad_top
        else:
            # Height-limited: pad left and right.
            new_w, new_h = int(ratio_h * src_w), self.input_h
            pad_left = int((self.input_w - new_w) / 2)
            pad_right = self.input_w - new_w - pad_left
            pad_top = pad_bottom = 0

        resized = cv2.resize(rgb, (new_w, new_h))
        padded = cv2.copyMakeBorder(
            resized, pad_top, pad_bottom, pad_left, pad_right,
            cv2.BORDER_CONSTANT, None, (128, 128, 128)
        )

        # float32 in [0,1]
        blob = padded.astype(np.float32)
        blob /= 255.0
        # HWC -> CHW -> NCHW, then force C-order memory
        blob = np.expand_dims(np.transpose(blob, [2, 0, 1]), axis=0)
        blob = np.ascontiguousarray(blob)
        return blob, raw_bgr_image, src_h, src_w

（1）求 input_w/img_w 与 input_h/img_h（即代码里的r_w、r_h），取较小值作为scale
由此求得scale之后的等比例的input的宽高tw th。tx1 tx2 ty1 ty2 分别是左、右、上、下四条边padding的宽度。
（2）image=cv2.resize(image,(tw,th)) Resize the image with long side while maintaining ratio
（3）image = cv2.copyMakeBorder( image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)) Pad the short side with (128,128,128) 但是pytorch版本喜欢用114 114 114 我司也是
（4）Image = image.astype(np.float32) image/=255.0 换格式就是为了normalize Normalize to [0,1]
（5） image = np.transpose(image, [2, 0, 1]) HWC to CHW format
（6）image = np.expand_dims(image, axis=0) # CHW to NCHW format 一张张处理的，所以需要expand dims 为了后面infer是batch的格式infer 这里也会存下来对应的original_h original_w 用于一张张地后处理
（7）image = np.ascontiguousarray(image) convert the image to row-major order, also known as "C order": ascontiguousarray函数将一个内存不连续存储的数组转换为内存连续存储的数组，使得运行速度更快

Openvino py版本

    def letterbox(self, imgs, size=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True,ov_version = 2021):
        """Resize one image with its aspect ratio kept and pad it to `size`.

        Args:
            imgs: sequence of images; only the case of exactly one element is
                handled. NOTE(review): for any other length the function falls
                through and implicitly returns None - confirm callers never
                pass a batch.
            size: target size, unpacked as (width, height).
            color: border value used for all padding.
            auto: if true, the first padding pass only adds the remainder of
                the padding modulo 64; the second pass below then pads one
                axis up to `size`.
            scaleFill: only consulted when `auto` is false; stretches the
                image to exactly (w, h) with no padding.
            scaleup: if false, the scale ratio is capped at 1.0.
            ov_version: for values below 2022 the result is transposed to CHW
                and reshaped to (self.n, self.c, self.h, self.w); otherwise
                the padded HWC image is returned inside a one-element list.

        Returns:
            The reshaped array (ov_version < 2022) or `[img]` (otherwise).
        """
        # Resize image to a 32-pixel-multiple rectangle https://github.com/ultralytics/yolov3/issues/232
        # NOTE(review): the code below uses a modulus of 64, not 32.
        img_list = []  # NOTE(review): never used in the visible code
        if len(imgs) == 1:
            img = imgs[0]
            shape = img.shape[:2]  # current shape [height, width]
            w, h = size

            # Scale ratio (new / old)
            r = min(h / shape[0], w / shape[1])
            if not scaleup:  # only scale down, do not scale up (for better test mAP)
                r = min(r, 1.0)

            # Compute padding
            ratio = r, r  # width, height ratios
            new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
            dw, dh = w - new_unpad[0], h - new_unpad[1]  # wh padding
            if auto:  # minimum rectangle
                dw, dh = np.mod(dw, 64), np.mod(dh, 64)  # wh padding
            elif scaleFill:  # stretch
                dw, dh = 0.0, 0.0
                new_unpad = (w, h)
                ratio = w / shape[1], h / shape[0]  # width, height ratios

            dw /= 2  # divide padding into 2 sides
            dh /= 2

            if shape[::-1] != new_unpad:  # resize
                img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
            # When the total padding is odd, dh (or dw) ends in .5: the -0.1
            # makes one side round down and the +0.1 makes the other round up,
            # so the two sides add up to the full padding.
            top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
            left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
            img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border

            # Second pass: pad one axis that is still short of the target.
            # NOTE(review): only one axis is fixed (if/elif), and an odd
            # difference leaves the result one pixel short because both sides
            # get the floored half - confirm this cannot happen in practice.
            top2, bottom2, left2, right2 = 0, 0, 0, 0
            if img.shape[0] != h:
                top2 = (h - img.shape[0]) // 2
                bottom2 = top2
                img = cv2.copyMakeBorder(img, top2, bottom2, left2, right2, cv2.BORDER_CONSTANT,
                                         value=color)  # add border
            elif img.shape[1] != w:
                left2 = (w - img.shape[1]) // 2
                right2 = left2
                img = cv2.copyMakeBorder(img, top2, bottom2, left2, right2, cv2.BORDER_CONSTANT,
                                         value=color)  # add border
            if ov_version < 2022:
                # HWC to CHW. The transpose only reorders axes; it does not
                # swap the colour channels.
                img = img.transpose((2, 0, 1))
                img = img.reshape((self.n, self.c, self.h, self.w))
                return img
            else:
                return [img]

letterbox函数分析：
首先是scale系数计算，同上一样的scale。but可选not scale up only scale down。for better test mAP（尴尬）。
这里padding用(114,114,114)。
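来看padding大小的计算。可选auto：minimum rectangle，只保留padding对64取余的部分（np.mod(dw, 64)），即补成边长能被64整除的最小矩形；scaleFill（stretch）选项：不加padding，直接把图拉伸到目标尺寸(w, h)，不保持宽高比。加padding的时候搞个-0.1，+0.1俺是不理解，是为了减少rescale时浮点数带来的误差吗？——其实是为了处理总padding为奇数的情况：此时dh（或dw）除以2后是x.5，round(dh - 0.1)向下取整、round(dh + 0.1)向上取整，这样两边加起来正好等于总padding（例如总共5个像素：上2、下3）。
然后最后加padding的double check，是因为有可能会对不齐吗？想不出这种情况。——auto=True时第一次padding只补了对64取余的部分，得到的图可能比目标(w, h)小，这时第二次padding会把其中一个方向补到目标尺寸。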

主函数

        if OPENVINO_VERSION < 2022:
            imgs = self.letterbox(img0s, (self.w, self.h), ov_version=OPENVINO_VERSION)
            res = self.exec_net.infer(inputs={self.input_blob: imgs})
            det = torch.from_numpy(res['output_0'])
        else:
            imgs = self.letterbox(img0s, (self.w, self.h), ov_version=OPENVINO_VERSION)
            imgs = cv2.dnn.blobFromImages(imgs, 1 / 255.0, swapRB=True, crop=False)
            det = self.exec_net.infer([imgs])[self.net.outputs[0]]
            det = torch.from_numpy(det)

openvino2022版，在letterbox之后接函数cv2.dnn.blobFromImages这个函数。
函数cv2.dnn.blobFromImage(image, scalefactor, size, mean, swapRB, crop, ddepth)
作用：图像进行预处理，包括减均值，比例缩放，裁剪，交换通道等，返回一个4维的blob（NCHW；blob可以简单理解为一个N维的数组，用于神经网络的输入）。当同时指定size、mean、scalefactor、swapRB时，处理顺序是：先按size做resize（以及可选的crop），然后减去mean，再乘以scalefactor，最后按swapRB交换R、B通道。
但是经过我们的测试。这个函数可以这样代替，速度还快效果也不差，由于看不到这个函数内部实现，这个疑惑就放在这里。

    for i in range(90):
        img3 = cv2.imread(r'xxxx.jpg')
        img3 = cv2.resize(img3, (640, 384))

        img3_ = copy.deepcopy(img3)
        print('---------------------')
        t = time.time()
        cv2.cvtColor(img3, cv2.COLOR_BGRA2RGB, img3)  # bgr -> rgb
        img3 = img3.transpose(2, 0, 1)  # HWC -> CHW
        img3 = np.stack([img3])
        img3 = torch.from_numpy(img3).to('cuda:0')
        img3 = img3.half()
        img3 /= 255.0
        print(time.time() - t)

        t = time.time()
        img3_ = cv2.dnn.blobFromImages([img3_], 1 / 255.0, swapRB=True, crop=False)
        print(time.time() - t)
        img3_ = torch.from_numpy(img3_).to('cuda:0')

        print(img3.shape, img3_.shape)

YOLOv5后处理

Tensorrtx cpp版本

nms之前detect的个数限制为1000。nms是一张张处理的。
源码是infer之后进入nms过程。

void nms(std::vector<Yolo::Detection>& res, float *output, float conf_thresh, float nms_thresh = 0.5) {
    int det_size = sizeof(Yolo::Detection) / sizeof(float);
    std::map<float, std::vector<Yolo::Detection>> m;
    for (int i = 0; i < output[0] && i < Yolo::MAX_OUTPUT_BBOX_COUNT; i++) {
        if (output[1 + det_size * i + 4] <= conf_thresh) continue;
        Yolo::Detection det;
        memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
        if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector<Yolo::Detection>());
        m[det.class_id].push_back(det);
    }
    for (auto it = m.begin(); it != m.end(); it++) {
        //std::cout << it->second[0].class_id << " --- " << std::endl;
        auto& dets = it->second;
        std::sort(dets.begin(), dets.end(), cmp);
        for (size_t m = 0; m < dets.size(); ++m) {
            auto& item = dets[m];
            res.push_back(item);
            for (size_t n = m + 1; n < dets.size(); ++n) {
                if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
                    dets.erase(dets.begin() + n);
                    --n;
                }
            }
        }
    }
}

这里先直接用置信度和conf_thresh比较做过滤，这一步不区分类别；随后按class_id分组（std::map），nms只在同一类别内部进行。这里iou实现是用xywh格式来计算的。无clip

// Intersection-over-union of two boxes given as {center_x, center_y, width, height}.
//
// Returns 0 when the boxes do not overlap and also when the union area is not
// positive (e.g. two coincident zero-size boxes), instead of the NaN that a
// 0 / 0 division would produce.
float iou(float lbox[4], float rbox[4]) {
    float interBox[] = {
        (std::max)(lbox[0] - lbox[2] / 2.f , rbox[0] - rbox[2] / 2.f), //left
        (std::min)(lbox[0] + lbox[2] / 2.f , rbox[0] + rbox[2] / 2.f), //right
        (std::max)(lbox[1] - lbox[3] / 2.f , rbox[1] - rbox[3] / 2.f), //top
        (std::min)(lbox[1] + lbox[3] / 2.f , rbox[1] + rbox[3] / 2.f), //bottom
    };

    // Empty intersection: top below bottom or left right of right.
    if (interBox[2] > interBox[3] || interBox[0] > interBox[1])
        return 0.0f;

    float interBoxS = (interBox[1] - interBox[0])*(interBox[3] - interBox[2]);
    float unionS = lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS;
    // Written as a negated comparison so that a NaN union is rejected too.
    if (!(unionS > 0.0f))
        return 0.0f;
    return interBoxS / unionS;
}

nms后，到了后面get_rect画框和类别的时候才从xywh-->xyxy。

cv::Rect get_rect(cv::Mat& img, float bbox[4]) { #bbox xywh
    float l, r, t, b;
    float r_w = Yolo::INPUT_W / (img.cols * 1.0);
    float r_h = Yolo::INPUT_H / (img.rows * 1.0);
    if (r_h > r_w) {
        l = bbox[0] - bbox[2] / 2.f;
        r = bbox[0] + bbox[2] / 2.f;
        t = bbox[1] - bbox[3] / 2.f - (Yolo::INPUT_H - r_w * img.rows) / 2;
        b = bbox[1] + bbox[3] / 2.f - (Yolo::INPUT_H - r_w * img.rows) / 2;
        l = l / r_w;
        r = r / r_w;
        t = t / r_w;
        b = b / r_w;
    } else {
        l = bbox[0] - bbox[2] / 2.f - (Yolo::INPUT_W - r_h * img.cols) / 2;
        r = bbox[0] + bbox[2] / 2.f - (Yolo::INPUT_W - r_h * img.cols) / 2;
        t = bbox[1] - bbox[3] / 2.f;
        b = bbox[1] + bbox[3] / 2.f;
        l = l / r_h;
        r = r / r_h;
        t = t / r_h;
        b = b / r_h;
    }
    return cv::Rect(round(l), round(t), round(r - l), round(b - t));
}

haha 这里没有做clip越界检查处理。做了xywh->xyxy转换。一个一个框resize到原图范围。所以不做检查吧

Tensorrtx py版本

def post_process(self, output, origin_h, origin_w):
        """
        description: Turn the flat network output of one image into boxes,
                     scores and class ids by running non-maximum suppression.
        param:
            output:     A numpy likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...]
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: final boxes, one row per box [x1, y1, x2, y2]
            result_scores: final scores, one element per box
            result_classid: final class ids, one element per box
            (all three are empty arrays when nothing survives)
        """
        # The first element is the detection count; the rest are 6-float records.
        box_count = int(output[0])
        detections = np.reshape(output[1:], (-1, 6))[:box_count, :]

        kept = self.non_max_suppression(
            detections, origin_h, origin_w,
            conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD)

        if len(kept):
            return kept[:, :4], kept[:, 4], kept[:, 5]
        return np.array([]), np.array([]), np.array([])

传进post_process时已经是一张图的结果，bn=1。这里依然reshape为2维。

    def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
        """
        description: Removes detections with lower object confidence score than 'conf_thres' and performs
        Non-Maximum Suppression to further filter detections.
        param:
            prediction: detections, (x1, y1, x2, y2, conf, cls_id)
            origin_h: original image height
            origin_w: original image width
            conf_thres: a confidence threshold to filter detections
            nms_thres: a iou threshold to filter detections
        return:
            boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id)
        """
        # Get the boxes that score > CONF_THRESH
        boxes = prediction[prediction[:, 4] >= conf_thres]
        # Trandform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
        boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
        # clip the coordinates
        boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w -1)
        boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w -1)
        boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h -1)
        boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h -1)
        # Object confidence
        confs = boxes[:, 4]
        # Sort by the confs
        boxes = boxes[np.argsort(-confs)]
        # Perform non-maximum suppression
        keep_boxes = []
        while boxes.shape[0]:
            large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
            label_match = boxes[0, -1] == boxes[:, -1]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            keep_boxes += [boxes[0]]
            boxes = boxes[~invalid]
        boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
        return boxes

先对object confidence做个filter。
xywh->xyxy转换。这里已经把框框rescale到original的大小。
clip数值边界处理。
nms过程。

bbox_iou的inter_x1 inter_x2 inter_y1 inter_y2也做了clip。还有+1处理

    def bbox_iou(self, box1, box2, x1y1x2y2=True):
        """
        description: compute the IoU of two bounding boxes
        param:
            box1: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))
            box2: A box coordinate (can be (x1, y1, x2, y2) or (x, y, w, h))            
            x1y1x2y2: select the coordinate format
        return:
            iou: computed iou
        """
        if not x1y1x2y2:
            # Transform from center and width to exact coordinates
            b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
            b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
            b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
            b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
        else:
            # Get the coordinates of bounding boxes
            b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
            b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

        # Get the coordinates of the intersection rectangle
        inter_rect_x1 = np.maximum(b1_x1, b2_x1)
        inter_rect_y1 = np.maximum(b1_y1, b2_y1)
        inter_rect_x2 = np.minimum(b1_x2, b2_x2)
        inter_rect_y2 = np.minimum(b1_y2, b2_y2)
        # Intersection area
        inter_area = np.clip(inter_rect_x2 - inter_rect_x1 + 1, 0, None) * \
                     np.clip(inter_rect_y2 - inter_rect_y1 + 1, 0, None)
        # Union Area
        b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
        b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

        iou = inter_area / (b1_area + b2_area - inter_area + 1e-16)

        return iou

Openvino py版本

这里prediction是batch imgs。不是一张张

    def non_max_suppression(self, prediction, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False, labels=()):
        """Performs Non-Maximum Suppression (NMS) on inference results

        Args:
            prediction: 3-D tensor indexed as [image, candidate, 5 + nc]; per
                candidate, columns 0:4 are the box (center x, center y, width,
                height), column 4 is the object confidence and columns 5: are
                the per-class scores.
            conf_thres: threshold applied first to the object confidence and
                then to obj_conf * cls_conf.
            iou_thres: IoU threshold passed to torchvision.ops.nms.
            classes: optional class ids; when given, only detections of these
                classes are kept.
            agnostic: if true, boxes of all classes compete in one NMS pass.
            labels: optional per-image a-priori labels that are appended to
                the candidates with confidence 1.0.

        Returns:
             list with one tensor per image, each with shape: nx6
             (x1, y1, x2, y2, conf, cls). Images with no detections, and
             images not reached before the time limit, keep an empty (0, 6)
             tensor.
        """

        nc = prediction.shape[2] - 5  # number of classes
        xc = prediction[..., 4] > conf_thres  # candidates

        # Settings
        min_wh, max_wh = 2, 4096  # (pixels) minimum and maximum box width and height
        max_det = 300  # maximum number of detections per image
        max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
        time_limit = 10.0  # seconds to quit after
        redundant = True  # require redundant detections
        multi_label = nc > 1  # multiple labels per box (adds 0.5ms/img)
        merge = False  # use merge-NMS

        t = time.time()
        # Every slot starts as the same empty tensor; slots are replaced
        # (never mutated) further down, so the sharing is harmless.
        output = [torch.zeros((0, 6), device=prediction.device)] * prediction.shape[0]
        for xi, x in enumerate(prediction):  # image index, image inference
            # Apply constraints
            # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
            x = x[xc[xi]]  # confidence

            # Cat apriori labels if autolabelling
            if labels and len(labels[xi]):
                l = labels[xi]
                v = torch.zeros((len(l), nc + 5), device=x.device)
                v[:, :4] = l[:, 1:5]  # box
                v[:, 4] = 1.0  # conf
                v[range(len(l)), l[:, 0].long() + 5] = 1.0  # cls
                x = torch.cat((x, v), 0)

            # If none remain process next image
            if not x.shape[0]:
                continue

            # Compute conf
            x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

            # Box (center x, center y, width, height) to (x1, y1, x2, y2)
            box = self.xywh2xyxy(x[:, :4])

            # Detections matrix nx6 (xyxy, conf, cls)
            if multi_label:
                # One output row per (box, class) pair whose score passes the threshold.
                i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
                x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
            else:  # best class only
                conf, j = x[:, 5:].max(1, keepdim=True)
                x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]

            # Filter by class
            if classes is not None:
                x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

            # Apply finite constraint
            # if not torch.isfinite(x).all():
            #     x = x[torch.isfinite(x).all(1)]

            # Check shape
            n = x.shape[0]  # number of boxes
            if not n:  # no boxes
                continue
            elif n > max_nms:  # excess boxes
                x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence

            # Batched NMS
            # Shifting each box by class_id * max_wh keeps boxes of different
            # classes from overlapping, so a single nms call acts per class.
            c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
            boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
            i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
            if i.shape[0] > max_det:  # limit detections
                i = i[:max_det]
            if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
                # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
                iou = self.box_iou(boxes[i], boxes) > iou_thres  # iou matrix
                weights = iou * scores[None]  # box weights
                x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
                if redundant:
                    i = i[iou.sum(1) > 1]  # require redundancy

            output[xi] = x[i]
            # The limit covers the whole batch, not one image; the remaining
            # images keep their empty result.
            if (time.time() - t) > time_limit:
                print(f'WARNING: NMS time limit {time_limit}s exceeded')
                break  # time limit exceeded

        return output

直接先object confidence筛选一批detectbox，不是一张张地选，all in
也有检测时间、nms前后个数限制。
计算分类物体的置信度。conf = obj_conf * cls_conf又筛选一轮。
这里还有多标签和单标签处理。
nms。
紧接着rescale到original size。也有clip处理。

    def scale_coords(self, img1_shape, coords, img0_shape, ratio_pad=None):
        # Rescale coords (xyxy) from img1_shape to img0_shape
        if ratio_pad is None:  # calculate from img0_shape
            gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain  = old / new
            pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2  # wh padding
        else:
            gain = ratio_pad[0][0]
            pad = ratio_pad[1]

        coords[:, [0, 2]] -= pad[0]  # x padding
        coords[:, [1, 3]] -= pad[1]  # y padding
        coords[:, :4] /= gain
        self.clip_coords(coords, img0_shape)
        return coords

能看到这里的，栓Q！

希望对你有所帮助！

嗷嗷哦润橘_

关注

45
点赞
踩
51

收藏

觉得还不错? 一键收藏
2
评论
在Tensorrt、openvino的YOLOv5预处理与后处理

也就是src_img到dst_img的变换是以(-scale * src_width * 0.5 + dst_width * 0.5,-scale * src_height * 0.5 + dst_height * 0.5)为左上起点，宽高以scale倍数拉拉长拉拉宽。我的理解是这里取了dst_img像素个数多的线程存块用于计算，一个线程所在的位置代表在dst_img的位置，位置之间的关系是position=dy*dst_width+dx。由此得到lx，ly，hx，hy，为到向上和向下取整的距离。
复制链接

扫一扫