学习记录 --用CUDA完成泛洪算法

本文链接：https://blog.csdn.net/qq_43422837/article/details/105900130

算法思路

传统的泛红算法主要有两种方法，递归或者队列扫描，这两种方法在总体思路大致都是从种子点开始，逐步扫描周围的符合要求的点（比如和种子点颜色距离小于一个规定的值），当出现符合要求的点后将此点染色，同时继续在此点周围扫描下一个符合要求的点。
此cuda泛洪算法也借鉴了传统的cpu算法，从种子点开始“生长”，但与cpu算法不同的是，使用cuda时的“扫描”下一个符合要求的点时，是并行处理的。这就意味着，当生长的区域的接近圆形块状区域时，使用cuda的扫描将比cpu快很多倍，可以理解为cpu扫描一次只能判断一个点，cuda一次可以判断当前已染色区域轮廓外的一圈点。

具体实现

首先确定cuda所使用的block与thread：因为存在扫描的循环何时停止问题，各线程之间需要一个共同的变量来判断while循环是否进行下去，而各线程的处理进度肯定各不相同，所以此时需要一个线程间的同步：__syncthread() 。所以，我选择在一个block里装满线程，正好也可以使用block内的共享内存来完成while循环的判断。
借鉴cpu使用队列来存储需扫描的点的思路，我直接创建了一个与原始图片同样大小的数组state[]（通过x和y计算offset，同样是一维数组），数组初始化全0，表示所有点未被访问过。在扫描的过程中，当某个点被染色后，就讲此点所对应的state数组中的值改为1，表示已访问过，且此后的扫描将不再访问此点。
定义一个共享内存need_continue，作为主while循环的判定条件，定义一个共享内存more，表示此次循环有扫描到新的符合条件的点。在每一次的主while循环的开始，先将more置0，同步一下。此后若有线程扫描到新的点，便将more置1。此轮主循环的最后，若more是1，则主while循环继续，否则，就退出循环，结束计算。

具体代码

CUDA主函数

void floodFillwithGPU( uchar * mat_data, int rows, int cols, int startX, int startY,int randR,int randG,int randB) {
    uchar* gpu_mat_data;
    uchar* gpu_mat_state;
    cudaError_t cudaStatus;

    //申请空间
    cudaStatus = cudaMalloc((void**)&gpu_mat_data, rows * cols * 3);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto err;
    }
    cudaStatus = cudaMalloc((void**)&gpu_mat_state, rows * cols  );
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto err;
    }
    cudaStatus = cudaMemcpy(gpu_mat_data,mat_data, 
        rows * cols * 3,
        cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto err;
    }

    kernel << < 1, threadsPerBlock >> > (gpu_mat_data, gpu_mat_state, startX, startY, rows, cols, randR, randG, randB);
    
    cudaStatus = cudaMemcpy(mat_data, gpu_mat_data,
        rows * cols * 3,
        cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto err;
    }

err:
    cudaFree(gpu_mat_data);
    cudaFree(gpu_mat_state);
}

核函数

__global__ void kernel(unsigned char* mat_p, unsigned char* state_p, int startX, int startY , int rows, int cols, int randR, int randG, int randB) {
    __shared__ int R,G,B;
    __shared__ int need_continue;
    __shared__ int more;

    int size = rows * cols;
    int cnt = 0;
    int i;
    int offset = threadIdx.x;
    int root_offset = startX + startY * cols;

    while (offset < size) {
        //初始化，获得root颜色
        if(offset == root_offset) {
            R = mat_p[root_offset * 3 + 0];
            G = mat_p[root_offset * 3 + 1];
            B = mat_p[root_offset * 3 + 2];
            deal(offset , mat_p, state_p, R, G, B,randR,randG,randB);
            //more = true;
            state_p[offset] = 1;
        }
        else {
            state_p[offset] = 0;
        }
        offset += blockDim.x;
    }
    if (threadIdx.x == 0) {
        need_continue = 1;
        more = 0;
    }

    __syncthreads();

    while (need_continue) {
        offset = threadIdx.x;
        while (offset < size) {
            // cols 为每行元素个数
            if (state_p[offset] == 1) {
                if (!more) {
                    atomicAdd(&more, 1);
                }
                state_p[offset ] = 2;
                if (offset % cols == 0) {//left side
                    if (offset / cols == 0) { //top side
                        deal(offset + 1, mat_p, state_p, R, G, B, randR, randG, randB);
                        deal(offset + cols, mat_p, state_p, R, G, B, randR, randG, randB);
                    }
                    else if (offset / cols == rows -1 ) { //bottom
                        deal(offset + 1, mat_p, state_p, R, G, B, randR, randG, randB);
                        deal(offset - cols, mat_p, state_p, R, G, B, randR, randG, randB);
                    }
                    else {
                        deal(offset + 1, mat_p, state_p, R, G, B, randR, randG, randB);
                        deal(offset + cols, mat_p, state_p, R, G, B, randR, randG, randB);
                        deal(offset - cols, mat_p, state_p, R, G, B, randR, randG, randB);
                    }
                }
                else if (offset % cols == cols - 1) { //right side
                    if (offset / cols == 0) { //top side
                        deal(offset - 1, mat_p, state_p, R, G, B, randR, randG, randB);
                        deal(offset + cols, mat_p, state_p, R, G, B, randR, randG, randB);
                    }
                    else if (offset / cols == rows - 1) { //bottom
                        deal(offset - 1, mat_p, state_p, R, G, B, randR, randG, randB);
                        deal(offset - cols, mat_p, state_p, R, G, B, randR, randG, randB);
                    }
                    else {
                        deal(offset - 1, mat_p, state_p, R, G, B, randR, randG, randB);
                        deal(offset + cols, mat_p, state_p, R, G, B, randR, randG, randB);
                        deal(offset - cols, mat_p, state_p, R, G, B, randR, randG, randB);
                    }
                }
                else if (offset / cols == 0) { //just top
                    deal(offset - 1, mat_p, state_p, R, G, B, randR, randG, randB);
                    deal(offset + 1, mat_p, state_p, R, G, B, randR, randG, randB);
                    deal(offset + cols, mat_p, state_p, R, G, B, randR, randG, randB);
                }
                else if (offset / cols == rows - 1) { //just top
                    deal(offset - 1, mat_p, state_p, R, G, B, randR, randG, randB);
                    deal(offset + 1, mat_p, state_p, R, G, B, randR, randG, randB);
                    deal(offset - cols, mat_p, state_p, R, G, B, randR, randG, randB);
                }
                else {
                    deal(offset - cols, mat_p, state_p, R, G, B, randR, randG, randB);
                    deal(offset + cols, mat_p, state_p, R, G, B, randR, randG, randB);
                    deal(offset - 1, mat_p, state_p, R, G, B, randR, randG, randB);
                    deal(offset + 1, mat_p, state_p, R, G, B, randR, randG, randB);
                }
            }
            offset += blockDim.x;
        }
        __syncthreads();

        if (threadIdx.x == 0) {
            if (more) {
                need_continue = true;
                more = 0;
            }
            else {
                need_continue = false;
            }
        }
        __syncthreads();
    }
}

上色函数

__device__ void deal( int new_offset, unsigned char* mat_p, unsigned char* state_p,int R,int G,int B, int randR, int randG, int randB) {
    if (state_p[new_offset] == 0) {
        int r = mat_p[new_offset * 3 + 0];
        int g = mat_p[new_offset * 3 + 1];
        int b = mat_p[new_offset * 3 + 2];
        if( fabsf(r-R)<10 && fabsf(g-G)<10 && fabsf(b-B)<10 ){
        //if(r==R && g==G && b==B){
            state_p[ new_offset ] = 1;
            mat_p[new_offset * 3 + 0] = randB;
            mat_p[new_offset * 3 + 1] = randG;
            mat_p[new_offset * 3 + 2] = randR;
        }
    }
}