内容目录
YOLOv5预处理
Tensorrtx cpp版本
preprocess.cu文件函数preprocess_kernel_img
// Letterbox-warps a source image into the network input buffer by launching
// warpaffine_kernel on `stream` (asynchronous: this function does not wait
// for the kernel to finish).
//
// src        : source pixels; dereferenced by the kernel, so it must be
//              device-accessible. Rows are passed as src_width * 3 bytes.
// src_width  : source width in pixels
// src_height : source height in pixels
// dst        : destination float buffer written by the kernel
//              (3 planes of dst_width * dst_height floats)
// dst_width  : destination width in pixels
// dst_height : destination height in pixels
// stream     : CUDA stream the kernel is enqueued on
//
// Areas of the destination not covered by the scaled source are filled with
// the constant 128 passed to the kernel.
void preprocess_kernel_img(
    uint8_t* src, int src_width, int src_height,
    float* dst, int dst_width, int dst_height,
    cudaStream_t stream) {
    // An empty source or destination leaves nothing to map. Returning early
    // also avoids dividing by zero below and launching a grid with 0 blocks.
    if (src_width <= 0 || src_height <= 0 || dst_width <= 0 || dst_height <= 0) {
        return;
    }

    AffineMatrix s2d, d2s;

    // Uniform scale that fits the whole source inside the destination.
    float scale = std::min(dst_height / (float)src_height, dst_width / (float)src_width);

    // Source -> destination: scale, then translate so the scaled image is centred.
    s2d.value[0] = scale;
    s2d.value[1] = 0;
    s2d.value[2] = -scale * src_width * 0.5 + dst_width * 0.5;
    s2d.value[3] = 0;
    s2d.value[4] = scale;
    s2d.value[5] = -scale * src_height * 0.5 + dst_height * 0.5;

    // The kernel walks destination pixels, so it needs the inverse mapping.
    cv::Mat m2x3_s2d(2, 3, CV_32F, s2d.value);
    cv::Mat m2x3_d2s(2, 3, CV_32F, d2s.value);
    cv::invertAffineTransform(m2x3_s2d, m2x3_d2s);

    // m2x3_d2s was constructed on top of d2s.value. Copy only if OpenCV ended
    // up writing somewhere else, so memcpy is never asked to copy a buffer
    // onto itself.
    if (m2x3_d2s.ptr<float>(0) != d2s.value) {
        memcpy(d2s.value, m2x3_d2s.ptr<float>(0), sizeof(d2s.value));
    }

    // One thread per destination pixel; integer ceil-division for the grid size.
    int jobs = dst_height * dst_width;
    int threads = 256;
    int blocks = (jobs + threads - 1) / threads;

    warpaffine_kernel<<<blocks, threads, 0, stream>>>(
        src, src_width * 3, src_width,
        src_height, dst, dst_width,
        dst_height, 128, d2s, jobs);
}
在这个函数里,取scale系数为目标图像(变换后的图)与原始图像(变换前的图)的宽、高作比例计算,取较小值。
计算s2d,src_img到dst_img的仿射变换矩阵。
这里详细记录一下s2d得来的出处。是opencv库cv.WarpAffine函数cv2.warpAffine(src, M, dsize, flags, borderMode, borderValue)中的M变换矩阵。src输入图像、M变换矩阵、dsize为输出图像尺寸、flags插值方法、borderMode边界像素模式、borderValue像素边界颜色。
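其中 M 为 2×3 的仿射矩阵 M = [[a, b, tx], [c, d, ty]],对应 x' = a·x + b·y + tx,y' = c·x + d·y + ty:tx、ty 是平移量(决定左上角起点);a、d 分别是 x 轴、y 轴方向的缩放系数;b、c 控制旋转/错切。例如 a=1.5、b=0、c=0、d=1.5 表示 x 轴方向和 y 轴方向各拉伸 1.5 倍。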
那么在这里理解一下s2d:src_img到dst_img的变换是先把宽、高按scale等比例缩放,再平移到以(-scale * src_width * 0.5 + dst_width * 0.5, -scale * src_height * 0.5 + dst_height * 0.5)为左上角起点的位置,也就是让缩放后的图像在dst_img中居中。
cv::invertAffineTransform函数为取逆,得到d2s,dst_img到src_img的仿射变换矩阵。
d2s用于warpaffine_kernel的计算,jobs指有dst_img所有的像素点进行并行计算的工作个数。
这里到了函数warpaffine_kernel。
// Warps a packed 3-channel uint8 image into a planar float image using
// bilinear sampling, one thread per destination pixel.
//
// Launch layout: 1-D grid of 1-D blocks. Thread `position` handles the
// destination pixel (position % dst_width, position / dst_width); threads
// with position >= edge return immediately. No shared memory is used.
//
// src            : source pixels, 3 bytes per pixel
// src_line_size  : number of bytes in one source row
// src_width      : source width in pixels
// src_height     : source height in pixels
// dst            : output, three consecutive planes of dst_width * dst_height floats
// dst_width      : destination width in pixels
// dst_height     : destination height in pixels
// const_value_st : fill value used wherever the sample falls outside the source
// d2s            : 2x3 destination -> source affine matrix (row-major in d2s.value)
// edge           : number of destination pixels to process
__global__ void warpaffine_kernel(
    uint8_t* src, int src_line_size, int src_width,
    int src_height, float* dst, int dst_width,
    int dst_height, uint8_t const_value_st,
    AffineMatrix d2s, int edge) {
    int position = blockDim.x * blockIdx.x + threadIdx.x;
    if (position >= edge) return;

    float m_x1 = d2s.value[0];
    float m_y1 = d2s.value[1];
    float m_z1 = d2s.value[2];
    float m_x2 = d2s.value[3];
    float m_y2 = d2s.value[4];
    float m_z2 = d2s.value[5];

    // Column and row of this thread's destination pixel.
    int dx = position % dst_width;
    int dy = position / dst_width;

    // Map the destination pixel back into source coordinates.
    // NOTE(review): the extra +0.5f shifts the sample point by half a source
    // pixel; confirm this matches the intended pixel-centre convention.
    float src_x = m_x1 * dx + m_y1 * dy + m_z1 + 0.5f;
    float src_y = m_x2 * dx + m_y2 * dy + m_z2 + 0.5f;
    float c0, c1, c2;

    if (src_x <= -1 || src_x >= src_width || src_y <= -1 || src_y >= src_height) {
        // out of range: the whole 2x2 neighbourhood is outside the source
        c0 = const_value_st;
        c1 = const_value_st;
        c2 = const_value_st;
    } else {
        int y_low = floorf(src_y);
        int x_low = floorf(src_x);
        int y_high = y_low + 1;
        int x_high = x_low + 1;

        uint8_t const_value[] = {const_value_st, const_value_st, const_value_st};
        float ly = src_y - y_low;
        float lx = src_x - x_low;
        float hy = 1 - ly;
        float hx = 1 - lx;
        float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

        // Every corner starts out pointing at the fill colour ...
        uint8_t* v1 = const_value;
        uint8_t* v2 = const_value;
        uint8_t* v3 = const_value;
        uint8_t* v4 = const_value;

        // ... and is redirected to its source pixel only when that pixel
        // lies inside the image (v1/v2: upper row, v3/v4: lower row).
        if (y_low >= 0) {
            if (x_low >= 0)
                v1 = src + y_low * src_line_size + x_low * 3;

            if (x_high < src_width)
                v2 = src + y_low * src_line_size + x_high * 3;
        }

        if (y_high < src_height) {
            if (x_low >= 0)
                v3 = src + y_high * src_line_size + x_low * 3;

            if (x_high < src_width)
                v4 = src + y_high * src_line_size + x_high * 3;
        }

        // Bilinear interpolation, evaluated per channel.
        c0 = w1 * v1[0] + w2 * v2[0] + w3 * v3[0] + w4 * v4[0];
        c1 = w1 * v1[1] + w2 * v2[1] + w3 * v3[1] + w4 * v4[1];
        c2 = w1 * v1[2] + w2 * v2[2] + w3 * v3[2] + w4 * v4[2];
    }

    // bgr to rgb: swap the first and third channel
    float t = c2;
    c2 = c0;
    c0 = t;

    // normalization
    c0 = c0 / 255.0f;
    c1 = c1 / 255.0f;
    c2 = c2 / 255.0f;

    // rgbrgbrgb to rrrgggbbb: each channel goes to its own plane, and
    // consecutive planes are `area` floats apart.
    int area = dst_width * dst_height;
    float* pdst_c0 = dst + dy * dst_width + dx;
    float* pdst_c1 = pdst_c0 + area;
    float* pdst_c2 = pdst_c1 + area;
    *pdst_c0 = c0;
    *pdst_c1 = c1;
    *pdst_c2 = c2;
}
这段就是cuda代码了。position是当前线程的全局索引(blockDim.x * blockIdx.x + threadIdx.x)。这里启动的线程总数不少于dst_img的像素个数,每个线程负责dst_img上的一个像素,线程索引与像素位置的关系是position = dy * dst_width + dx;position >= edge的多余线程直接返回。
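然后,利用d2s变换矩阵计算得到dst_img上每个像素点对应到src_img的像素近似点,这么说是因为这里取了float,也就是在src_img上并不真实存在的点,用于计算dst_img对应点的像素。记 d2s = [[m_x1, m_y1, m_z1], [m_x2, m_y2, m_z2]],则公式为 src_x = m_x1·dx + m_y1·dy + m_z1 + 0.5,src_y = m_x2·dx + m_y2·dy + m_z2 + 0.5。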
接着往下看,对于src_x,src_y判断是否出界,出界bgr三色为[128,128,128]。即为padding的部分颜色。
若没有出界,取y_low = floor(src_y)、y_high = y_low + 1(x方向同理),即y_low <= src_y < y_high。由此得到lx、ly(src点到x_low、y_low的距离)以及hx = 1 - lx、hy = 1 - ly(到x_high、y_high的距离)。
v1、v2、v3、v4是指向src_img上四个相邻像素(左上、右上、左下、右下)的指针,地址按 y * src_line_size + x * 3 计算。它们先初始化为const_value,是为了在相邻点落到图像之外时使用padding的颜色。
c0,c1,c2是bgr三色,在做双线性插值计算。为双线性插值推导而得的公式。
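公式:$c = w_1 v_1 + w_2 v_2 + w_3 v_3 + w_4 v_4$,其中 $w_1 = h_y h_x,\ w_2 = h_y l_x,\ w_3 = l_y h_x,\ w_4 = l_y l_x$(对b、g、r三个通道分别计算)。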
往下看,剩下就是bgr转换为rgb,归一化的过程。
rgbrgbrgb to rrrgggbbb 为预处理NHWC转换为NCHW的过程!在底层存储放好了这个顺序。
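但是【我这里不懂为什么pdst_c0和pdst_c1,pdst_c2写得不一样!】其实三者指向的是同一个像素在三个通道平面上的位置:pdst_c0 = dst + dy * dst_width + dx 是第一个平面内的位置,pdst_c1、pdst_c2依次再偏移一个平面的大小area = dst_width * dst_height。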
Tensorrtx py版本
【I dont understand Yolov5_trt.py 和yolov5_trt_cuda_python.py区别在什么?Pycuda vs cuda+python】
def preprocess_image(self, raw_bgr_image):
    """
    description: Letterbox a BGR image to the network input size.
                 The image is converted to RGB, resized with its aspect
                 ratio preserved, padded with (128,128,128), scaled to
                 [0,1] and laid out as a contiguous NCHW float32 array.
    param:
        raw_bgr_image: image array in BGR channel order, shape (h, w, c)
    return:
        image:     the processed image, shape (1, c, input_h, input_w)
        image_raw: the original image (same object that was passed in)
        h:         original height
        w:         original width
    """
    image_raw = raw_bgr_image
    h, w, c = image_raw.shape
    rgb = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)

    # Scale factors that would make the width / height fit exactly.
    r_w = self.input_w / w
    r_h = self.input_h / h

    if r_h > r_w:
        # Width is the limiting side: fill it and pad top/bottom.
        tw, th = self.input_w, int(r_w * h)
        tx1 = tx2 = 0
        ty1 = int((self.input_h - th) / 2)
        ty2 = self.input_h - th - ty1
    else:
        # Height is the limiting side: fill it and pad left/right.
        tw, th = int(r_h * w), self.input_h
        tx1 = int((self.input_w - tw) / 2)
        tx2 = self.input_w - tw - tx1
        ty1 = ty2 = 0

    resized = cv2.resize(rgb, (tw, th))
    # Pad the short side with (128,128,128)
    padded = cv2.copyMakeBorder(
        resized, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)
    )

    blob = padded.astype(np.float32)
    blob /= 255.0  # normalize to [0,1]
    # HWC -> CHW, then add the batch axis to get NCHW
    blob = np.expand_dims(np.transpose(blob, [2, 0, 1]), axis=0)
    # Row-major ("C order") memory layout
    blob = np.ascontiguousarray(blob)
    return blob, image_raw, h, w
(1)求 input_w / img_w 与 input_h / img_h 两个比例,取较小值作为缩放系数scale
由此求得按scale等比例缩放之后的宽高tw、th。tx1、tx2、ty1、ty2分别是左、右、上、下四条边需要padding的像素数。
(2)image=cv2.resize(image,(tw,th)) Resize the image with long side while maintaining ratio
(3)image = cv2.copyMakeBorder( image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, None, (128, 128, 128)) Pad the short side with (128,128,128) 但是pytorch版本喜欢用114 114 114 我司也是
(4)Image = image.astype(np.float32) image/=255.0 换格式就是为了normalize Normalize to [0,1]
(5) image = np.transpose(image, [2, 0, 1]) HWC to CHW format
(6)image = np.expand_dims(image, axis=0) # CHW to NCHW format 一张张处理的,所以需要expand dims 为了后面infer是batch的格式infer 这里也会存下来对应的original_h original_w 用于一张张地后处理
(7)image = np.ascontiguousarray(image) convert the image to row-major order, also known as "C order": ascontiguousarray函数将一个内存不连续存储的数组转换为内存连续存储的数组,使得运行速度更快
Openvino py版本
def letterbox(self, imgs, size=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True,ov_version = 2021):
    """Resize and pad a single image to `size` while keeping its aspect ratio.

    See https://github.com/ultralytics/yolov3/issues/232 for the background
    of the "minimum rectangle" padding.

    Args:
        imgs: sequence of images; only a sequence holding exactly one image
            is processed.
        size: target (width, height).
        color: border colour used for padding.
        auto: keep only the remainder of the padding modulo 64 in the first
            padding pass.
        scaleFill: stretch to `size` without padding (only when auto is False).
        scaleup: allow enlarging the image; when False it is only shrunk.
        ov_version: OpenVINO version selector for the output layout.

    Returns:
        ov_version < 2022: the image transposed to CHW and reshaped to
            (self.n, self.c, self.h, self.w).
        otherwise: a one-element list holding the padded HWC image.
        None when `imgs` does not contain exactly one image.
    """
    if len(imgs) != 1:
        # Batches are not handled here; None is what callers got before too.
        return None

    img = imgs[0]
    shape = img.shape[:2]  # current shape [height, width]
    w, h = size

    # Scale ratio (new / old)
    r = min(h / shape[0], w / shape[1])
    if not scaleup:  # only scale down, do not scale up (for better test mAP)
        r = min(r, 1.0)

    # Compute padding
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
    dw, dh = w - new_unpad[0], h - new_unpad[1]  # wh padding
    if auto:  # minimum rectangle
        dw, dh = np.mod(dw, 64), np.mod(dh, 64)  # wh padding
    elif scaleFill:  # stretch
        dw, dh = 0.0, 0.0
        new_unpad = (w, h)

    dw /= 2  # divide padding into 2 sides
    dh /= 2

    if shape[::-1] != new_unpad:  # resize
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)

    # The -0.1 / +0.1 split an odd total: a half-pixel value rounds down on
    # one side and up on the other, so both sides still add up to the total.
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border

    # Second pass: pad whatever is still missing to reach exactly (h, w).
    # Both axes are handled, and an odd remainder puts the extra pixel on the
    # bottom/right so the final size always matches.
    missing_h = max(h - img.shape[0], 0)
    missing_w = max(w - img.shape[1], 0)
    if missing_h or missing_w:
        top2 = missing_h // 2
        bottom2 = missing_h - top2
        left2 = missing_w // 2
        right2 = missing_w - left2
        img = cv2.copyMakeBorder(img, top2, bottom2, left2, right2, cv2.BORDER_CONSTANT,
                                 value=color)  # add border

    if ov_version < 2022:
        img = img.transpose((2, 0, 1))  # HWC to CHW (channel order is not changed here)
        img = img.reshape((self.n, self.c, self.h, self.w))
        return img
    else:
        return [img]
letterbox函数分析:
首先是scale系数计算,同上一样的scale。but可选not scale up only scale down。for better test mAP(尴尬)。
这里padding用(114,114,114)。
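来看padding大小的计算。可选auto(minimum rectangle):padding只保留对64取余的部分(np.mod(dw, 64)、np.mod(dh, 64)),得到满足步长要求的最小矩形;scaleFill(stretch)选项:不加padding,直接把图像拉伸到目标大小。加padding时的-0.1、+0.1是为了把总padding拆成两侧的整数:当dh为x.5(即总padding为奇数)时,round(dh - 0.1)向下取整、round(dh + 0.1)向上取整,两侧相加正好等于总padding,避免round对.5的取整方式造成多一个或少一个像素。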
然后最后加padding的double check,是因为有可能会对不齐吗?想不出这种情况
主函数
# Letterboxing is the same call for every OpenVINO version; only the way the
# result is fed to the network differs.
imgs = self.letterbox(img0s, (self.w, self.h), ov_version=OPENVINO_VERSION)
if OPENVINO_VERSION < 2022:
    # Older API: inputs are passed by blob name, outputs are read by name.
    res = self.exec_net.infer(inputs={self.input_blob: imgs})
    det = torch.from_numpy(res['output_0'])
else:
    # Newer API: build the blob with OpenCV (scale by 1/255, swap R and B).
    imgs = cv2.dnn.blobFromImages(imgs, 1 / 255.0, swapRB=True, crop=False)
    det = self.exec_net.infer([imgs])[self.net.outputs[0]]
    det = torch.from_numpy(det)
openvino2022版,在letterbox之后接函数cv2.dnn.blobFromImages这个函数。
函数cv2.dnn.blobFromImage(image, scalefactor, size, mean, swapRB, crop, ddepth)
作用:对图像进行预处理,包括减均值、比例缩放、裁剪、交换通道等,返回一个4维的blob(NCHW;blob可以简单理解为一个N维的数组,用于神经网络的输入)。按OpenCV文档的描述,处理顺序为:先按size进行resize(以及可选的中心裁剪),再减去mean,然后乘以scalefactor,最后按swapRB交换R、B通道。
但是经过我们的测试。这个函数可以这样代替,速度还快效果也不差,由于看不到这个函数内部实现,这个疑惑就放在这里。
# Micro-benchmark: hand-written preprocessing vs cv2.dnn.blobFromImages.
# Both timed regions cover the same work (up to and including the copy to the
# GPU) and wait for the GPU before the clock is read, so the numbers compare.
src_img = cv2.imread(r'xxxx.jpg')
src_img = cv2.resize(src_img, (640, 384))

for i in range(90):
    img3 = src_img.copy()
    img3_ = src_img.copy()

    print('---------------------')

    # Path 1: manual colour swap + transpose, normalised on the GPU in fp16.
    t = time.perf_counter()
    cv2.cvtColor(img3, cv2.COLOR_BGR2RGB, img3)  # bgr -> rgb (3-channel input)
    img3 = img3.transpose(2, 0, 1)  # HWC -> CHW
    img3 = np.stack([img3])
    img3 = torch.from_numpy(img3).to('cuda:0')
    img3 = img3.half()
    img3 /= 255.0
    torch.cuda.synchronize()  # CUDA kernels are asynchronous; wait before timing
    print(time.perf_counter() - t)

    # Path 2: OpenCV builds the normalised float blob on the CPU.
    t = time.perf_counter()
    img3_ = cv2.dnn.blobFromImages([img3_], 1 / 255.0, swapRB=True, crop=False)
    img3_ = torch.from_numpy(img3_).to('cuda:0')
    torch.cuda.synchronize()
    print(time.perf_counter() - t)

    print(img3.shape, img3_.shape)
YOLOv5后处理
Tensorrtx cpp版本
nms之前detect的个数限制为1000。nms是一张张处理的。
源码是infer之后进入nms过程。
// Per-class greedy non-maximum suppression over the raw network output.
//
// res         : receives the surviving detections (appended, grouped by class)
// output      : output[0] holds the number of detections, followed by the
//               detections packed as consecutive Yolo::Detection records
// conf_thresh : records whose confidence (5th float of a record) is
//               <= conf_thresh are dropped before NMS
// nms_thresh  : a detection is suppressed when its IoU with an already kept,
//               higher-ranked detection of the same class exceeds this value
void nms(std::vector<Yolo::Detection>& res, float *output, float conf_thresh, float nms_thresh = 0.5) {
    int det_size = sizeof(Yolo::Detection) / sizeof(float);

    // Bucket the confident detections by class id; operator[] creates the
    // bucket on first use.
    std::map<float, std::vector<Yolo::Detection>> by_class;
    for (int i = 0; i < output[0] && i < Yolo::MAX_OUTPUT_BBOX_COUNT; i++) {
        if (output[1 + det_size * i + 4] <= conf_thresh) continue;
        Yolo::Detection det;
        memcpy(&det, &output[1 + det_size * i], det_size * sizeof(float));
        by_class[det.class_id].push_back(det);
    }

    for (auto& entry : by_class) {
        auto& dets = entry.second;
        std::sort(dets.begin(), dets.end(), cmp);

        // Mark suppressed boxes instead of erasing them, so the vector is
        // never shifted while it is being scanned.
        std::vector<bool> suppressed(dets.size(), false);
        for (size_t keep = 0; keep < dets.size(); ++keep) {
            if (suppressed[keep]) continue;
            res.push_back(dets[keep]);
            for (size_t other = keep + 1; other < dets.size(); ++other) {
                if (!suppressed[other] && iou(dets[keep].bbox, dets[other].bbox) > nms_thresh) {
                    suppressed[other] = true;
                }
            }
        }
    }
}
这里过滤时直接把每条检测记录的conf(记录中的第5个值)与conf_thresh比较,不再单独比较类别分数;随后按class_id分组,在每个类别内部分别做NMS。这里iou的实现是用xywh(中心点+宽高)格式来计算的,没有做clip。
// Intersection-over-union of two boxes given as (center_x, center_y, width, height).
//
// Returns 0 when the boxes do not overlap, and also when the union area is
// not positive (e.g. two zero-sized boxes), where the plain ratio would be
// 0/0 = NaN.
float iou(float lbox[4], float rbox[4]) {
    const float left   = (std::max)(lbox[0] - lbox[2] / 2.f, rbox[0] - rbox[2] / 2.f);
    const float right  = (std::min)(lbox[0] + lbox[2] / 2.f, rbox[0] + rbox[2] / 2.f);
    const float top    = (std::max)(lbox[1] - lbox[3] / 2.f, rbox[1] - rbox[3] / 2.f);
    const float bottom = (std::min)(lbox[1] + lbox[3] / 2.f, rbox[1] + rbox[3] / 2.f);

    if (top > bottom || left > right)
        return 0.0f;

    const float interBoxS = (right - left) * (bottom - top);
    const float unionS = lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS;

    // Written as a negated comparison so that a NaN union is rejected as well.
    if (!(unionS > 0.0f))
        return 0.0f;

    return interBoxS / unionS;
}
nms后,到了后面get_rect画框和类别的时候才从xywh-->xyxy。
// Maps a box from letterboxed network-input coordinates back onto `img`.
//
// img  : the original image; only its cols/rows are read
// bbox : (center_x, center_y, width, height) in network-input pixels
//
// Returns the box as cv::Rect(x, y, width, height) in `img` pixels. The
// result is not clipped to the image bounds.
cv::Rect get_rect(cv::Mat& img, float bbox[4]) {
    float r_w = Yolo::INPUT_W / (img.cols * 1.0);
    float r_h = Yolo::INPUT_H / (img.rows * 1.0);

    // The smaller ratio is the scale that was applied; the padding sits on
    // the other axis, split evenly on both sides.
    float ratio, pad_x, pad_y;
    if (r_h > r_w) {
        ratio = r_w;
        pad_x = 0.f;
        pad_y = (Yolo::INPUT_H - r_w * img.rows) / 2;
    } else {
        ratio = r_h;
        pad_x = (Yolo::INPUT_W - r_h * img.cols) / 2;
        pad_y = 0.f;
    }

    // xywh -> corner coordinates, remove the padding, then undo the scale.
    float l = bbox[0] - bbox[2] / 2.f - pad_x;
    float r = bbox[0] + bbox[2] / 2.f - pad_x;
    float t = bbox[1] - bbox[3] / 2.f - pad_y;
    float b = bbox[1] + bbox[3] / 2.f - pad_y;
    l = l / ratio;
    r = r / ratio;
    t = t / ratio;
    b = b / ratio;

    return cv::Rect(round(l), round(t), round(r - l), round(b - t));
}
haha 这里没有做clip越界检查处理。做了xywh->xyxy转换。一个一个框resize到原图范围。所以不做检查吧
Tensorrtx py版本
def post_process(self, output, origin_h, origin_w):
    """
    description: Turn the flat network output of one image into final detections.
    param:
        output:   A numpy likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...]
        origin_h: height of original image
        origin_w: width of original image
    return:
        result_boxes:   final boxes, each row is a box [x1, y1, x2, y2]
        result_scores:  final scores, one per box
        result_classid: final class ids, one per box
        (three empty arrays when nothing survives NMS)
    """
    # The first element is the detection count; the rest are 6-value records.
    count = int(output[0])
    detections = np.reshape(output[1:], (-1, 6))[:count, :]

    kept = self.non_max_suppression(
        detections, origin_h, origin_w, conf_thres=CONF_THRESH, nms_thres=IOU_THRESHOLD
    )
    if not len(kept):
        return np.array([]), np.array([]), np.array([])
    return kept[:, :4], kept[:, 4], kept[:, 5]
传进post_process时已经是一张图的结果,bn=1。这里依然reshape为2维。
def non_max_suppression(self, prediction, origin_h, origin_w, conf_thres=0.5, nms_thres=0.4):
    """
    description: Removes detections with lower object confidence score than 'conf_thres' and performs
    Non-Maximum Suppression to further filter detections.
    param:
        prediction: detections, one row per box: (center_x, center_y, w, h, conf, cls_id)
        origin_h: original image height
        origin_w: original image width
        conf_thres: a confidence threshold to filter detections
        nms_thres: a iou threshold to filter detections
    return:
        boxes: output after nms with the shape (x1, y1, x2, y2, conf, cls_id);
               an empty array when no detection is kept
    """
    # Get the boxes that score >= CONF_THRESH
    boxes = prediction[prediction[:, 4] >= conf_thres]
    # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
    boxes[:, :4] = self.xywh2xyxy(origin_h, origin_w, boxes[:, :4])
    # clip the coordinates
    boxes[:, 0] = np.clip(boxes[:, 0], 0, origin_w - 1)
    boxes[:, 2] = np.clip(boxes[:, 2], 0, origin_w - 1)
    boxes[:, 1] = np.clip(boxes[:, 1], 0, origin_h - 1)
    boxes[:, 3] = np.clip(boxes[:, 3], 0, origin_h - 1)
    # Object confidence
    confs = boxes[:, 4]
    # Sort by the confs
    boxes = boxes[np.argsort(-confs)]
    # Perform non-maximum suppression
    keep_boxes = []
    while boxes.shape[0]:
        large_overlap = self.bbox_iou(np.expand_dims(boxes[0, :4], 0), boxes[:, :4]) > nms_thres
        label_match = boxes[0, -1] == boxes[:, -1]
        # Indices of boxes with lower confidence scores, large IOUs and matching labels
        invalid = large_overlap & label_match
        # The box being kept must always leave the queue. Relying on it to
        # overlap itself fails when nms_thres >= 1 or its coordinates are
        # NaN, and the loop would then never terminate.
        invalid[0] = True
        keep_boxes += [boxes[0]]
        boxes = boxes[~invalid]
    boxes = np.stack(keep_boxes, 0) if len(keep_boxes) else np.array([])
    return boxes
先对object confidence做个filter。
xywh->xyxy转换。这里已经把框框rescale到original的大小。
clip数值边界处理。
nms过程。
bbox_iou的inter_x1 inter_x2 inter_y1 inter_y2也做了clip。还有+1处理
def bbox_iou(self, box1, box2, x1y1x2y2=True):
    """
    description: compute the IoU between box1 and every box in box2
                 (widths and heights are measured as x2 - x1 + 1, y2 - y1 + 1)
    param:
        box1: 2-D array of boxes, rows are (x1, y1, x2, y2) or (x, y, w, h)
        box2: 2-D array of boxes in the same format as box1
        x1y1x2y2: True for corner format, False for center/size format
    return:
        iou: computed iou, broadcast over the rows of box1 and box2
    """
    def corners(boxes):
        # Returns the (x1, y1, x2, y2) columns of `boxes`.
        if x1y1x2y2:
            return boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
        # Center/size format: step half the size out from the center.
        return (
            boxes[:, 0] - boxes[:, 2] / 2,
            boxes[:, 1] - boxes[:, 3] / 2,
            boxes[:, 0] + boxes[:, 2] / 2,
            boxes[:, 1] + boxes[:, 3] / 2,
        )

    a_x1, a_y1, a_x2, a_y2 = corners(box1)
    b_x1, b_y1, b_x2, b_y2 = corners(box2)

    # Intersection rectangle; negative extents are clamped to zero.
    overlap_w = np.clip(np.minimum(a_x2, b_x2) - np.maximum(a_x1, b_x1) + 1, 0, None)
    overlap_h = np.clip(np.minimum(a_y2, b_y2) - np.maximum(a_y1, b_y1) + 1, 0, None)
    inter_area = overlap_w * overlap_h

    area_a = (a_x2 - a_x1 + 1) * (a_y2 - a_y1 + 1)
    area_b = (b_x2 - b_x1 + 1) * (b_y2 - b_y1 + 1)

    # The small epsilon keeps the division defined for a zero union.
    return inter_area / (area_a + area_b - inter_area + 1e-16)
Openvino py版本
这里prediction是batch imgs。不是一张张
def non_max_suppression(self, prediction, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False, labels=()):
    """Run Non-Maximum Suppression (NMS) on raw inference results, image by image.

    Each prediction row is read as (cx, cy, w, h, obj_conf, class scores...).

    Returns:
         list with one tensor per image, each of shape nx6 (x1, y1, x2, y2, conf, cls)
    """
    num_classes = prediction.shape[2] - 5  # number of classes
    candidates = prediction[..., 4] > conf_thres  # objectness above threshold

    # Settings
    min_wh, max_wh = 2, 4096  # (pixels) minimum and maximum box width and height
    max_det = 300  # maximum number of detections per image
    max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
    time_limit = 10.0  # seconds to quit after
    redundant = True  # require redundant detections
    multi_label = num_classes > 1  # multiple labels per box (adds 0.5ms/img)
    merge = False  # use merge-NMS

    started = time.time()
    output = [torch.zeros((0, 6), device=prediction.device)] * prediction.shape[0]
    for xi, x in enumerate(prediction):  # image index, image inference
        x = x[candidates[xi]]  # keep confident rows only

        # Append a-priori labels as perfect-confidence rows when autolabelling.
        if labels and len(labels[xi]):
            lab = labels[xi]
            extra = torch.zeros((len(lab), num_classes + 5), device=x.device)
            extra[:, :4] = lab[:, 1:5]  # box
            extra[:, 4] = 1.0  # conf
            extra[range(len(lab)), lab[:, 0].long() + 5] = 1.0  # cls
            x = torch.cat((x, extra), 0)

        # Nothing left for this image.
        if not x.shape[0]:
            continue

        # conf = obj_conf * cls_conf
        x[:, 5:] *= x[:, 4:5]

        # Box (center x, center y, width, height) to (x1, y1, x2, y2)
        box = self.xywh2xyxy(x[:, :4])

        # Build the nx6 detections matrix (xyxy, conf, cls).
        if multi_label:
            # One row per (box, class) pair whose score passes the threshold.
            rows, cols = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
            x = torch.cat((box[rows], x[rows, cols + 5, None], cols[:, None].float()), 1)
        else:
            # Best class only.
            conf, best = x[:, 5:].max(1, keepdim=True)
            x = torch.cat((box, conf, best.float()), 1)[conf.view(-1) > conf_thres]

        # Optional class whitelist.
        if classes is not None:
            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

        n = x.shape[0]  # number of boxes
        if not n:  # no boxes
            continue
        if n > max_nms:  # excess boxes: keep the most confident ones
            x = x[x[:, 4].argsort(descending=True)[:max_nms]]

        # Batched NMS: shifting boxes by class * max_wh keeps different
        # classes from overlapping, so a single nms call treats them separately.
        class_offset = x[:, 5:6] * (0 if agnostic else max_wh)
        boxes, scores = x[:, :4] + class_offset, x[:, 4]
        keep = torchvision.ops.nms(boxes, scores, iou_thres)
        if keep.shape[0] > max_det:  # limit detections
            keep = keep[:max_det]

        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
            # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
            iou = self.box_iou(boxes[keep], boxes) > iou_thres  # iou matrix
            weights = iou * scores[None]  # box weights
            x[keep, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
            if redundant:
                keep = keep[iou.sum(1) > 1]  # require redundancy

        output[xi] = x[keep]
        if (time.time() - started) > time_limit:
            print(f'WARNING: NMS time limit {time_limit}s exceeded')
            break  # time limit exceeded

    return output
直接先object confidence筛选一批detectbox,不是一张张地选,all in
也有检测时间、nms前后个数限制。
计算分类物体的置信度。conf = obj_conf * cls_conf又筛选一轮。
这里还有多标签和单标签处理。
nms。
紧接着rescale到original size。也有clip处理。
def scale_coords(self, img1_shape, coords, img0_shape, ratio_pad=None):
    """Rescale xyxy boxes in place from img1_shape back to img0_shape.

    Args:
        img1_shape: (height, width) the boxes currently refer to.
        coords: box array/tensor; columns 0..3 are x1, y1, x2, y2. Modified in place.
        img0_shape: (height, width) of the target image.
        ratio_pad: optional ((gain, ...), (pad_x, pad_y)); derived from the two
            shapes when omitted.

    Returns:
        The same `coords` object, after self.clip_coords has been applied.
    """
    if ratio_pad is not None:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]
    else:
        # gain = old / new; the padding is the centred remainder on each axis.
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])
        pad_x = (img1_shape[1] - img0_shape[1] * gain) / 2
        pad_y = (img1_shape[0] - img0_shape[0] * gain) / 2
        pad = pad_x, pad_y

    # Remove the letterbox border, then undo the resize.
    coords[:, [0, 2]] -= pad[0]  # x padding
    coords[:, [1, 3]] -= pad[1]  # y padding
    coords[:, :4] /= gain

    self.clip_coords(coords, img0_shape)
    return coords
能看到这里的,栓Q!
希望对你有所帮助!