# 我需要在python端调用c++ cuda接口,使用pybind11 实现
# cuda 接口需要图像宽度,高度和numpy array的图像数据作为参数, 返回numpy array 类型, 定义接口如下:
py::array_t<unsigned int> faster_label(size_t image_width, size_t image_height, py::array_t<unsigned char>& input_image);
# 该接口定义在 binding.cu 中(这是一个 CUDA 源文件)
# binding.cu 完整定义如下:
#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <vector>

#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include "CCL.cuh"
#include "timer.h"
#include "utils.hpp"
namespace py = pybind11;
using namespace std;
// Abort the process with a diagnostic message when a CUDA runtime call fails.
// `result` is the status code returned by a CUDA runtime API call.
void checkCudaErrors(cudaError_t result) {
    if (result == cudaSuccess) {
        return;
    }
    cerr << "CUDA Runtime Error: " << cudaGetErrorString(result) << endl;
    exit(EXIT_FAILURE);
}
// Run GPU connected-component labeling on a grayscale image.
//
// Parameters:
//   image_width  - number of columns in the input image
//   image_height - number of rows in the input image
//   input_image  - numpy array of unsigned char pixels; must hold at least
//                  image_width * image_height elements (row-major assumed)
// Returns: an (image_height, image_width) numpy array of unsigned int labels.
// Throws: std::runtime_error if the input buffer is smaller than the
//         requested dimensions.
py::array_t<unsigned int> faster_label(size_t image_width, size_t image_height, py::array_t<unsigned char>& input_image) {
    auto buf = input_image.request();
    const size_t numCols = image_width;
    const size_t numRows = image_height;
    const size_t numPixels = numRows * numCols;
    // Guard against dimensions that do not match the supplied array; the
    // original code would read past the end of the buffer.
    if (static_cast<size_t>(buf.size) < numPixels) {
        throw std::runtime_error("input_image has fewer elements than image_width * image_height");
    }
    unsigned char* image_data = static_cast<unsigned char*>(buf.ptr);

    // Allocate GPU buffers. Check every CUDA call's return code directly
    // instead of relying on a later cudaGetLastError().
    unsigned int* device_labels = nullptr;
    checkCudaErrors(cudaMallocManaged(&device_labels, numPixels * sizeof(unsigned int)));
    unsigned char* gpu_image = nullptr;
    checkCudaErrors(cudaMallocManaged(&gpu_image, numPixels * sizeof(unsigned char)));
    checkCudaErrors(cudaMemcpy(gpu_image, image_data, numPixels * sizeof(unsigned char), cudaMemcpyHostToDevice));

    // Time only the labeling kernel, not the host-side copies.
    GpuTimer timer;
    timer.Start();
    connectedComponentLabeling(device_labels, gpu_image, numCols, numRows);
    checkCudaErrors(cudaGetLastError());       // launch-configuration errors
    checkCudaErrors(cudaDeviceSynchronize());  // asynchronous execution errors
    timer.Stop();
    cout << "GPU code ran in: " << timer.Elapsed() << "ms" << endl;

    // Copy the labels straight into the output numpy buffer. The original
    // code staged them through `new unsigned int[numPixels]`, which was
    // copied and then leaked.
    py::array_t<unsigned int> out({image_height, image_width});
    auto out_buf = out.request();
    checkCudaErrors(cudaMemcpy(out_buf.ptr, device_labels, numPixels * sizeof(unsigned int), cudaMemcpyDeviceToHost));

    // device_labels is managed memory and was synchronized above, so the
    // host-side component count can read it directly.
    unsigned int components = util::countComponents(device_labels, numPixels);
    cout << "Number of components: " << components << endl;

    // Free memory
    checkCudaErrors(cudaFree(gpu_image));
    checkCudaErrors(cudaFree(device_labels));
    return out;
}
// Free an unsigned-int array that was allocated with new[]. Safe to call
// with nullptr (delete[] on a null pointer is a no-op).
void release_mem(const unsigned int* ptr) {
    delete[] ptr;
}
PYBIND11_MODULE(faster_ccl, m) {
    m.doc() = "Example bindings for CUDA CCL";
    // Bug fix: the keyword names must follow the C++ parameter order,
    // faster_label(image_width, image_height, input_image). The original
    // listed py::arg("image_data") first, so a Python keyword call bound
    // the image array name to the width parameter.
    m.def("faster_label", &faster_label,
          "Perform connected component labeling on an image.",
          py::arg("image_width"), py::arg("image_height"), py::arg("input_image"));
    m.def("release_mem", &release_mem,
          "Release memory allocated by faster_label.");
}
# 我们使用pybind11 定义了接口以及参数
# 接下来需要定义setup.py 文件定义如何编译cuda扩展
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

# Compile the pybind11/CUDA sources into one extension module. The extension
# name must match the PYBIND11_MODULE declaration ("faster_ccl") or the
# generated module will fail to import.
ext_modules = [
    CUDAExtension('faster_ccl', ['binding.cu', 'CCL.cu']),
]

setup(
    # Distribution names may not contain spaces (PEP 508); use dashes.
    name='cuda-ccl-python-extension',
    version='0.1',
    author='404NotFound',
    description='A Python extension using PyBind11',
    ext_modules=ext_modules,
    # BuildExtension supplies the nvcc flags needed to compile .cu sources.
    cmdclass={'build_ext': BuildExtension},
    zip_safe=False,
)
![使用 python setup.py build_ext 进行编译](https://i-blog.csdnimg.cn/direct/4d85b9b97a1a4e06b756d966ba2d8247.png#pic_center)
编译成功后,Windows 上会生成 .pyd 文件(Linux 上为 .so),将其放在 Python 路径下直接 import 调用即可
import cv2
from skimage import measure
import faster_ccl
import time

# Load the test image as single-channel grayscale. cv2.imread returns None
# (rather than raising) when the file is missing or unreadable, so fail
# loudly instead of crashing later on image.shape.
image = cv2.imread('2.png', cv2.IMREAD_GRAYSCALE)
if image is None:
    raise FileNotFoundError("could not read '2.png'")
image_h, image_w = image.shape[:2]

# GPU connected-component labeling; returns an (h, w) unsigned-int label map.
label = faster_ccl.faster_label(image_w, image_h, image)

# Time only the CPU-side region extraction from the precomputed labels.
start = time.time()
regions = measure.regionprops(label)
for region in regions:
    (min_row, min_col, max_row, max_col) = region.bbox
end = time.time()
cost = end - start
print("cost -------->", cost)
setuptools编译cuda扩展
于 2024-08-09 15:19:21 首次发布