cuda nms

python cuda

CTX = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

  // Batched non-maximum suppression for the YOLO TensorRT plugin.
  //
  // Cub-style dual calling convention:
  //   * workspace == nullptr or workspace_size == 0: nothing executes; the
  //     function returns the number of scratch bytes required for `count`
  //     candidate detections.
  //   * otherwise: NMS runs on `stream` for each image in the batch and the
  //     function returns 0.
  //
  // inputs  : per-image arrays of `count` elements each —
  //           [0] scores (float), [1] boxes (float4), [2] classes (float),
  //           [3] points (float4).
  // outputs : same order, `detections_per_im` elements per image; trailing
  //           scores are zero-filled so consumers can detect the valid prefix.
  // NOTE(review): inputs[3] is addressed as float4 to match the float4 gather
  // into outputs[3] — confirm against the plugin's binding declarations.
  int YoloLayerPlugin::nms_fun(int batch_size, void **inputs, void *const* outputs, size_t count, int detections_per_im, float nms_thresh, void *workspace, size_t workspace_size, cudaStream_t stream) const {

        if (!workspace || !workspace_size) {
            // Workspace query: report required scratch space size, cub style.
            workspace_size  = get_size_aligned<bool>(count);  // flags
            workspace_size += get_size_aligned<int>(count);   // indices
            workspace_size += get_size_aligned<int>(count);   // indices_sorted
            workspace_size += get_size_aligned<float>(count); // scores
            workspace_size += get_size_aligned<float>(count); // scores_sorted

            // Dry-run the cub algorithms (nullptr temp storage) so they report
            // how many temp bytes they need; take the max since they run
            // sequentially and can share the same region.
            size_t temp_size_flag = 0;
            cub::DeviceSelect::Flagged((void *)nullptr, temp_size_flag,
                cub::CountingInputIterator<int>(count),
                (bool *)nullptr, (int *)nullptr, (int *)nullptr, count);
            size_t temp_size_sort = 0;
            cub::DeviceRadixSort::SortPairsDescending((void *)nullptr, temp_size_sort,
                (float *)nullptr, (float *)nullptr, (int *)nullptr, (int *)nullptr, count);
            workspace_size += std::max(temp_size_flag, temp_size_sort);

            return workspace_size;
        }

        auto on_stream = thrust::cuda::par.on(stream);

        // Carve the per-call scratch buffers out of the caller's workspace;
        // the remaining bytes are handed to cub as temp storage.
        auto flags = get_next_ptr<bool>(count, workspace, workspace_size);
        auto indices = get_next_ptr<int>(count, workspace, workspace_size);
        auto indices_sorted = get_next_ptr<int>(count, workspace, workspace_size);
        auto scores = get_next_ptr<float>(count, workspace, workspace_size);
        auto scores_sorted = get_next_ptr<float>(count, workspace, workspace_size);

        for (int batch = 0; batch < batch_size; batch++) {
            auto in_scores = static_cast<const float *>(inputs[0]) + batch * count;
            auto in_boxes = static_cast<const float4 *>(inputs[1]) + batch * count;
            auto in_classes = static_cast<const float *>(inputs[2]) + batch * count;
            // BUGFIX: points are gathered into a float4 output below, so the
            // input must be addressed as float4 as well (was `const float *`,
            // which mismatched the output element type and batch stride).
            auto in_points = static_cast<const float4 *>(inputs[3]) + batch * count;

            auto out_scores = static_cast<float *>(outputs[0]) + batch * detections_per_im;
            auto out_boxes = static_cast<float4 *>(outputs[1]) + batch * detections_per_im;
            auto out_classes = static_cast<float *>(outputs[2]) + batch * detections_per_im;
            auto out_points = static_cast<float4 *>(outputs[3]) + batch * detections_per_im;

            // Discard null scores: flag only candidates with score > 0.
            thrust::transform(on_stream, in_scores, in_scores + count, flags, thrust::placeholders::_1 > 0.0f);

            // indices_sorted doubles as temporary storage for the selection
            // count; it is fully overwritten later by the radix sort output.
            int *num_selected = reinterpret_cast<int *>(indices_sorted);
            cub::DeviceSelect::Flagged(workspace, workspace_size, cub::CountingInputIterator<int>(0), flags, indices, num_selected, count, stream);
            // The count is consumed on the host, so wait for the select to finish.
            cudaStreamSynchronize(stream);
            int num_detections = *thrust::device_pointer_cast(num_selected);

            // BUGFIX: guard the empty case — skip the sorts/kernel/copies and
            // just zero the output scores so consumers see an empty image.
            if (num_detections == 0) {
                thrust::fill_n(on_stream, out_scores, detections_per_im, 0);
                continue;
            }

            // Sort surviving scores and corresponding indices, descending.
            thrust::gather(on_stream, indices, indices + num_detections, in_scores, scores);
            cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size, scores, scores_sorted, indices, indices_sorted, num_detections, 0, sizeof(*scores) * 8, stream);

            // Launch actual NMS kernel - 1 block with each thread handling
            // num_per_thread detections.
            const int max_threads = 1024;
            int num_per_thread = (num_detections + max_threads - 1) / max_threads; // integer ceil-div
            nms_kernel<<<1, max_threads, 0, stream>>>(num_per_thread, nms_thresh, num_detections,
                indices_sorted, scores_sorted, in_classes, in_boxes);

            // Re-sort with updated scores.
            cub::DeviceRadixSort::SortPairsDescending(workspace, workspace_size,
                scores_sorted, scores, indices_sorted, indices, num_detections, 0, sizeof(*scores) * 8, stream);

            // Gather the top detections; pad trailing scores with zeros so the
            // consumer can tell how many entries are valid.
            num_detections = min(detections_per_im, num_detections);
            cudaMemcpyAsync(out_scores, scores, num_detections * sizeof *scores, cudaMemcpyDeviceToDevice, stream);
            if (num_detections < detections_per_im) {
                thrust::fill_n(on_stream, out_scores + num_detections, detections_per_im - num_detections, 0);
            }
            thrust::gather(on_stream, indices, indices + num_detections, in_boxes, out_boxes);
            thrust::gather(on_stream, indices, indices + num_detections, in_classes, out_classes);
            thrust::gather(on_stream, indices, indices + num_detections, in_points, out_points);
            // BUGFIX: removed leftover debug block that cudaMemcpyAsync'd into a
            // stack buffer and printf'd it with no synchronization — the copy had
            // not completed when printed, and the async copy targeted pageable
            // stack memory that could leave scope before completion.
        }

        return 0;
    }

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

AI算法网奇

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值