一、前言
- yolov5的输出为tensor(25200 * 85),
(1)其中85表示为5+80,即cx, cy, width, height, objness(框里是否含有目标), classification * 80类别置信度
(2)25200 = (20 * 20 + 40 * 40 + 80 * 80) * 3, 其中20 * 20, 40 * 40, 80 * 80分别是提取深、中、浅层的网格大小,浅层用来预测小目标,深层用来预测大目标;* 3是因为每个点有三种anchor
- 使用核函数对yolov5推理结果进行解码并恢复成框
- 研究过程,可以把pytorch的数据转换为numpy,通过tobytes写到文件,再使用C++进行读取,快速进行问题研究和排查,不需要tensorRT推理也可以进行后处理研究
- 对于类似GPU解码这种复杂实现,可以先实现CPU解码,再进行修改
二、后处理CPU解码
CPU解码的重点思想
- 避免多余计算,有些数学运算需要的时间远超过很多if语句,减少计算的次数就是性能的关键
- nms实现的优化,例如使用remove_flag且预先分配内存,reserve对输出分配内存
1. 解码返回的结果为预测框
返回类型vector,Box为预测框的结构体
// Axis-aligned detection box: two corners, a confidence score and a class label.
struct Box{
    float left;
    float top;
    float right;
    float bottom;
    float confidence;
    int label;

    // Defaulted constructor; the fields are given no explicit initial values here.
    Box() = default;

    // Builds a box from its corner coordinates, score and class index.
    Box(float l, float t, float r, float b, float score, int cls)
        : left(l), top(t), right(r), bottom(b), confidence(score), label(cls) {}
};
2. 避免多余计算,先对objness进行判断
// Load the prediction buffer through load_file and the input image through OpenCV.
auto data = load_file("predict.data");
auto image = cv::imread("input-image.jpg");
// View the loaded buffer as a flat float array and count its elements.
float* ptr = (float*)data.data();
int nelem = data.size() / sizeof(float);
// Each row holds 85 floats; the row count follows from the total element count.
int ncols = 85;
int nrows = nelem / ncols;
// Decode the rows into boxes on the CPU (default thresholds are used).
auto boxes = cpu_decode(ptr, nrows, ncols);
// Decodes a rows x cols prediction buffer into boxes on the CPU.
//   predict              - row-major floats, cols values per row; index 4 of a row is the objectness score
//   confidence_threshold - rows scoring below this value are skipped
//   nms_threshold        - IoU threshold used by the NMS loop further below
vector<Box> cpu_decode(float* predict, int rows, int cols, float confidence_threshold = 0.25f, float nms_threshold = 0.45f){
vector<Box> boxes;
// The first 5 values of a row are not class scores.
int num_classes = cols - 5;
for(int i = 0; i < rows; ++i){
float* pitem = predict + i * cols;
float objness = pitem[4];
// Cheap early rejection before any per-class work is done for this row.
if(objness < confidence_threshold)
continue;
3. 避免多余计算,再对置信度进行判断
// Class scores start right after the 5 leading values; pick the highest-scoring class.
float* pclass = pitem + 5;
int label = std::max_element(pclass, pclass + num_classes) - pclass;
float prob = pclass[label];
// Final score = best class score * objectness; skip the row when it is below the threshold.
float confidence = prob * objness;
if(confidence < confidence_threshold)
continue;
4. 含有目标且置信度大于阈值的存入boxes
// Convert the center/size representation of this row into corner coordinates.
float cx = pitem[0];
float cy = pitem[1];
float width = pitem[2];
float height = pitem[3];
// Use float literals so the arithmetic stays in float, matching the GPU kernel.
float left = cx - width * 0.5f;
float top = cy - height * 0.5f;
float right = cx + width * 0.5f;
float bottom = cy + height * 0.5f;
// Box::label is an int, so pass the class index as-is instead of casting it to float.
boxes.emplace_back(left, top, right, bottom, confidence, label);
5. nms的准备工作
- 对boxes按置信度从高到低排序,lambda表达式的参数使用引用,避免拷贝
std::sort(boxes.begin(), boxes.end(), [](Box& a, Box& b){return a.confidence > b.confidence;});
- 为避免从boxes中删除框时vector内元素移动造成时间浪费,采用remove_flags标记来记录是否删除,true表示已删除
// One flag per box, all initially false; the NMS loop sets a flag to true when its box is suppressed.
std::vector<bool> remove_flags(boxes.size());
- vector<>.reserve提前分配空间,避免多次使用push_back分配内存,性能降低
// Output list of kept boxes; it can never hold more than boxes.size() entries,
// so reserving that capacity up front avoids reallocation while it is filled.
std::vector<Box> box_result;
box_result.reserve(boxes.size());
5. iou计算表达式
// Intersection-over-union of two boxes; returns 0 when they do not overlap
// or when the union has zero area.
auto iou = [](const Box& a, const Box& b){
    // Corners of the overlap rectangle.
    const float ix0 = std::max(a.left, b.left);
    const float iy0 = std::max(a.top, b.top);
    const float ix1 = std::min(a.right, b.right);
    const float iy1 = std::min(a.bottom, b.bottom);

    // Negative extents mean no overlap, so clamp them to zero.
    const float inter = std::max(0.0f, ix1 - ix0) * std::max(0.0f, iy1 - iy0);

    const float area_a = std::max(0.0f, a.right - a.left) * std::max(0.0f, a.bottom - a.top);
    const float area_b = std::max(0.0f, b.right - b.left) * std::max(0.0f, b.bottom - b.top);
    const float uni = area_a + area_b - inter;

    return (inter == 0 || uni == 0) ? 0.0f : inter / uni;
};
6. nms实现
// Greedy NMS over the score-sorted boxes: every box that is still alive is kept,
// and it suppresses all later boxes of the same class that overlap it enough.
for(int i = 0; i < boxes.size(); ++i){
    if(remove_flags[i]) continue;   // already suppressed by an earlier box

    auto& kept = boxes[i];
    box_result.emplace_back(kept);

    for(int j = i + 1; j < boxes.size(); ++j){
        // Only boxes that are still alive and share the class are candidates.
        const bool candidate = !remove_flags[j] && kept.label == boxes[j].label;
        if(candidate && iou(kept, boxes[j]) >= nms_threshold)
            remove_flags[j] = true;
    }
}
6. 画检测框
// Draw each box with its confidence above the top-left corner, then save the annotated image.
for(auto& det : boxes){
    const cv::Point top_left(det.left, det.top);
    const cv::Point bottom_right(det.right, det.bottom);
    cv::rectangle(image, top_left, bottom_right, cv::Scalar(0, 255, 0), 2);

    // The caption is placed 7 pixels above the box's top edge.
    const cv::Point caption_origin(det.left, det.top - 7);
    cv::putText(image, cv::format("%.2f", det.confidence), caption_origin, 0, 0.8, cv::Scalar(0, 0, 255), 2, 16);
}
cv::imwrite("image-draw.jpg", image);
三、后处理GPU解码
GPU解码的重点思想
- cpu中可以使用vector表示数量不确定的数组,gpu解码使用[count,box1,box2,…]的方式,此方法需要有最大数量限制
- 通过atomicAdd实现数组元素的加入,并返回索引
- 避免不必要的计算
1. 创建stream
// Create the CUDA stream that the copies and kernel launches below are issued on.
cudaStream_t stream = nullptr;
checkRuntime(cudaStreamCreate(&stream));
2. 申请内存
- 定义最大目标框数量max_objects = 1000
- 每个框的元素有7个,NUM_BOX_ELEMENT = 7, 分别为框的left, top, right, bottom,confidence置信度,class类别,keepflag(是否保留框的标志位,true保留)
- 缓冲区布局:开头的sizeof(float)用于存放count,其后的max_objects * NUM_BOX_ELEMENT * sizeof(float)用于存放框
float* predict_device = nullptr;
float* output_device = nullptr;
float* output_host = nullptr;
int max_objects = 1000;
int NUM_BOX_ELEMENT = 7; // left, top, right, bottom, confidence, class, keepflag
// Output layout is [count, box0, box1, ...]: one float for the counter followed by
// max_objects boxes of NUM_BOX_ELEMENT floats each.
const size_t output_bytes = sizeof(float) + max_objects * NUM_BOX_ELEMENT * sizeof(float);
checkRuntime(cudaMalloc(&predict_device, rows * cols * sizeof(float)));
checkRuntime(cudaMalloc(&output_device, output_bytes));
checkRuntime(cudaMallocHost(&output_host, output_bytes));
// cudaMalloc does not clear the allocation, and decode_kernel increments the counter
// with atomicAdd, so the counter must start at 0. All-zero bytes are 0.0f, and issuing
// the memset on the same stream orders it before the kernels.
checkRuntime(cudaMemsetAsync(output_device, 0, sizeof(float), stream));
checkRuntime(cudaMemcpyAsync(predict_device, predict, rows * cols * sizeof(float), cudaMemcpyHostToDevice, stream));
3. gpu解码
调用cuda函数
// Run the GPU decode on the uploaded predictions, writing into output_device on `stream`.
// cols - 5 is passed where the class count is expected.
// NOTE(review): the nullptr argument appears to be the inverse affine matrix, which
// decode_kernel does not read (its affine_project calls are commented out) — confirm
// against decode_kernel_invoker's signature.
decode_kernel_invoker(
predict_device, rows, cols - 5, confidence_threshold,
nms_threshold, nullptr, output_device, max_objects, NUM_BOX_ELEMENT, stream
);
- 定义block和grid的大小
// Use at most 512 threads per block; fewer when there are fewer boxes than that.
auto block = num_bboxes > 512 ? 512 : num_bboxes;
// NOTE(review): block is 0 when num_bboxes is 0, which makes the division below divide by zero.
auto grid = (num_bboxes + block - 1) / block; // round up so every box gets a thread
4. 解码核函数
调用解码核函数
// Launch on `stream` with no dynamic shared memory; grid * block covers num_bboxes threads.
// NOTE(review): no cudaGetLastError() check follows the launch in this snippet.
decode_kernel<<<grid, block, 0, stream>>>(
predict, num_bboxes, num_classes, confidence_threshold,
invert_affine_matrix, parray, max_objects, NUM_BOX_ELEMENT
);
- 解码核函数的计算类似于cpu解码,避免不必要的计算
- 开启与总框数量相同的线程数,即25200个
- int index = atomicAdd(parray, 1); 用于计算count的大小:parray = [count, box1, box2, box3, …],atomicAdd执行count += 1,但返回的是相加之前的count,详细见CUDA笔记一(atomicAdd)
- 超过最大框数量max_objects的线程直接return掉,以此实现动态数组
if(index >= max_objects) return;
// Decodes one prediction row per thread and appends surviving boxes to parray.
//   Launch layout : 1D grid of 1D blocks; threads with a global index >= num_bboxes exit.
//   predict       : num_bboxes rows of (5 + num_classes) floats: cx, cy, w, h, objectness, class scores.
//   parray        : [count, box0, box1, ...]; count is advanced with atomicAdd, and each box
//                   occupies NUM_BOX_ELEMENT floats (left, top, right, bottom, confidence, class, keepflag).
//   max_objects   : rows whose reserved slot index is >= max_objects are dropped.
//   invert_affine_matrix is not read here (the affine projection is commented out).
static __global__ void decode_kernel(
    float* predict, int num_bboxes, int num_classes, float confidence_threshold,
    float* invert_affine_matrix, float* parray, int max_objects, int NUM_BOX_ELEMENT
){
    const int position = blockDim.x * blockIdx.x + threadIdx.x;
    if(position >= num_bboxes) return;

    const float* row = predict + (5 + num_classes) * position;

    // Reject on objectness first so the class scan is skipped for most rows.
    const float objectness = row[4];
    if(objectness < confidence_threshold) return;

    // Linear scan for the best class; the first maximum wins on ties.
    const float* scores = row + 5;
    float confidence = scores[0];
    int label = 0;
    for(int c = 1; c < num_classes; ++c){
        const float score = scores[c];
        if(score > confidence){
            confidence = score;
            label = c;
        }
    }

    confidence *= objectness;
    if(confidence < confidence_threshold) return;

    // atomicAdd returns the counter value before the increment, which is this box's slot.
    const int index = atomicAdd(parray, 1);
    if(index >= max_objects) return;

    const float cx = row[0];
    const float cy = row[1];
    const float width = row[2];
    const float height = row[3];
    const float left = cx - width * 0.5f;
    const float top = cy - height * 0.5f;
    const float right = cx + width * 0.5f;
    const float bottom = cy + height * 0.5f;
    // affine_project(invert_affine_matrix, left, top, &left, &top);
    // affine_project(invert_affine_matrix, right, bottom, &right, &bottom);

    // Slot 0 of parray is the counter, so boxes start at offset 1.
    float* out = parray + 1 + index * NUM_BOX_ELEMENT;
    out[0] = left;
    out[1] = top;
    out[2] = right;
    out[3] = bottom;
    out[4] = confidence;
    out[5] = label;
    out[6] = 1; // 1 = keep, 0 = ignore
}
5. nms核函数
调用nms核函数
- 启用最大框数量max_objects个线程
// Cover max_objects threads with at most 512 threads per block; threads whose index is
// past the actual box count return at the top of fast_nms_kernel.
block = max_objects > 512 ? 512 : max_objects;
grid = (max_objects + block - 1) / block;
fast_nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold, NUM_BOX_ELEMENT);
6. 实现nms核函数
- 每个框都要单独对其余所有框进行nms计算是否是检测框
- 由于并行计算且if过滤掉很多条件,速度很快
// Per-box NMS: each thread owns one box and clears its keep flag when a same-class box
// with a higher (or equal, tie-broken by index) confidence overlaps it by more than threshold.
//   Launch layout : 1D grid of 1D blocks; threads with an index >= the box count exit.
//   bboxes        : [count, box0, box1, ...], NUM_BOX_ELEMENT floats per box laid out as
//                   left, top, right, bottom, confidence, class, keepflag.
//   A thread only writes the keepflag (element 6) of its own box.
static __global__ void fast_nms_kernel(float* bboxes, int max_objects, float threshold, int NUM_BOX_ELEMENT){
    const int position = (blockDim.x * blockIdx.x + threadIdx.x);
    // The stored counter may exceed the capacity, so clamp it.
    const int count = min((int)*bboxes, max_objects);
    if(position >= count) return;

    float* self = bboxes + 1 + position * NUM_BOX_ELEMENT;
    for(int other = 0; other < count; ++other){
        float* rival = bboxes + 1 + other * NUM_BOX_ELEMENT;

        // Skip this box itself and boxes of a different class.
        if(other == position || self[5] != rival[5]) continue;

        // Only a box at least as confident as this one can suppress it.
        if(!(rival[4] >= self[4])) continue;

        // On equal confidence, the box with the lower index is not treated as a suppressor.
        if(rival[4] == self[4] && other < position) continue;

        const float overlap = box_iou(
            self[0], self[1], self[2], self[3],
            rival[0], rival[1], rival[2], rival[3]
        );
        if(overlap > threshold){
            self[6] = 0; // 1=keep, 0=ignore
            return;
        }
    }
}
7. 检测框结果存储
// Copy the counter plus all box slots back to the host buffer. The size expression
// mirrors the allocation (a float-sized counter followed by the boxes).
checkRuntime(cudaMemcpyAsync(output_host, output_device,
    sizeof(float) + max_objects * NUM_BOX_ELEMENT * sizeof(float),
    cudaMemcpyDeviceToHost, stream
));
// Wait for the copy (and the kernels queued before it on this stream) before reading output_host.
checkRuntime(cudaStreamSynchronize(stream));
// Gather the boxes whose keep flag is still set from the host copy of the output buffer.
// The counter in slot 0 may exceed the capacity, so it is clamped to max_objects.
int num_boxes = min((int)output_host[0], max_objects);
for(int i = 0; i < num_boxes; ++i){
    float* item = output_host + 1 + NUM_BOX_ELEMENT * i;
    int keep_flag = item[6];
    if(!keep_flag) continue;   // suppressed by NMS

    box_result.emplace_back(item[0], item[1], item[2], item[3], item[4], (int)item[5]);
}
// Release the stream, both device buffers and the host buffer obtained from cudaMallocHost,
// then hand back the kept boxes.
checkRuntime(cudaStreamDestroy(stream));
checkRuntime(cudaFree(predict_device));
checkRuntime(cudaFree(output_device));
checkRuntime(cudaFreeHost(output_host));
return box_result;