CUDA算法：统计直方图（以及优化加速）

最新推荐文章于 2025-02-18 23:07:42 发布

DevinTT

最新推荐文章于 2025-02-18 23:07:42 发布

阅读量4.2k

点赞数 8

分类专栏： cuda 高性能文章标签： cuda 高性能

本文链接：https://blog.csdn.net/DevinTT/article/details/89359696

版权

cuda 同时被 2 个专栏收录

4 篇文章

订阅专栏

高性能

4 篇文章

订阅专栏

直方图是用来统计像素灰度值分布的一种方法，同理也可以统计RGB色素的分布。调整图像的对比度则需要直方图分布概率，才能进行灰度变换。

对于CUDA来说，可以用大量线程并行读取多个像素点的值并进行统计，但多个线程对同一个直方图分组（bin）的累加必须互斥进行，否则会互相覆盖。这里需要使用CUDA的“原子操作”来保证这一点。

核函数：

/*
 * Accumulates the histogram of an 8-bit image, one thread per pixel.
 *
 * Launch layout: 2D grid of 2D blocks; x indexes image columns and y indexes
 * image rows. Threads that fall outside the image do nothing.
 *
 * cudaMat   - image descriptor (row/col dimensions and the pixel buffer `mat`,
 *             read as mat[x + y * col]); must be dereferenceable on the device.
 * histogram - counters indexed by the pixel value; this kernel only adds to
 *             them, it never clears them.
 */
__global__ void cudaMatHistogram(CudaImg8Mat *cudaMat, unsigned int *histogram){
    const unsigned int x = threadIdx.x + blockDim.x * blockIdx.x;
    const unsigned int y = threadIdx.y + blockDim.y * blockIdx.y;
    const unsigned int width = cudaMat->col;

    if (x < width && y < cudaMat->row) {
        const unsigned char gray = cudaMat->mat[x + y * width];
        // Many threads may hit the same bin at once, so the increment must be atomic.
        atomicAdd(&histogram[gray], 1);
    }
}

这里包含了一个“原子操作”：

// Prototype of the int overload, quoted for illustration. The kernels in this
// article pass an `unsigned int *` histogram, so the call they resolve to is
// presumably the unsigned int overload of atomicAdd (see the CUDA atomic
// function reference).
int atomicAdd(int* address, int val);

指的是：多个thread对同一个地址的变量进行加操作时，每个thread的“读取-相加-写回”作为一个不可分割的整体执行，各thread的加法被串行化（先后顺序不确定，但不会互相覆盖），函数返回值是相加之前的旧值。

如果直接使用 histogram[cudaMat->mat[col+row*cudaMat->col]]++，多个threads会同时对同一地址进行“读取-相加-写回”，产生数据竞争，导致部分计数丢失、结果出错。

主函数调用：

#include <cstdio>
#include <cstdlib>

#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>

/*
 * Descriptor of an 8-bit single-channel image shared between host and device.
 * Pixels are addressed as mat[x + y * col], i.e. rows are stored one after
 * another with no padding between them.
 */
typedef struct CudaImg8Mat{
    unsigned int row;     // number of image rows (height)
    unsigned int col;     // number of image columns (width)
    unsigned char* mat;   // pixel buffer holding row * col bytes
} CudaImg8Mat;            // the typedef needs a declarator, otherwise it is ignored


// Aborts the program with a diagnostic when a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Loads a grayscale image, computes its 256-bin histogram on the GPU with
 * cudaMatHistogram and prints the bins on one line.
 *
 * Returns 0 on success, 1 if the image cannot be loaded; any CUDA failure
 * terminates the process through checkCuda.
 */
int main(int argc,char **argv) {
    (void)argc;
    (void)argv;

    // Open the image. (OpenCV 4 spells this flag cv::IMREAD_GRAYSCALE.)
    cv::Mat img = cv::imread("../img/lena.jpg", CV_LOAD_IMAGE_GRAYSCALE);
    if (img.empty()) {
        fprintf(stderr, "Failed to load image ../img/lena.jpg\n");
        return 1;
    }
    // The flat copy below requires the rows to be stored back to back.
    if (!img.isContinuous()) img = img.clone();

    const int NUM_BINS = 256;
    CudaImg8Mat *cudaMatA = NULL;
    unsigned int *histogram = NULL;
    checkCuda(cudaMallocManaged(&cudaMatA, sizeof(CudaImg8Mat)), "cudaMallocManaged(cudaMatA)");
    checkCuda(cudaMallocManaged(&histogram, NUM_BINS * sizeof(unsigned int)), "cudaMallocManaged(histogram)");
    // The kernel only accumulates, so every bin has to start from zero.
    checkCuda(cudaMemset(histogram, 0, NUM_BINS * sizeof(unsigned int)), "cudaMemset(histogram)");

    cudaMatA->row = img.rows;
    cudaMatA->col = img.cols;
    cudaMatA->mat = NULL;

    const size_t imageBytes = (size_t)cudaMatA->col * cudaMatA->row * sizeof(unsigned char);
    checkCuda(cudaMallocManaged(&cudaMatA->mat, imageBytes), "cudaMallocManaged(mat)");
    checkCuda(cudaMemcpy(cudaMatA->mat, img.data, imageBytes, cudaMemcpyHostToDevice), "cudaMemcpy(mat)");

    // One thread per pixel; ceil-division so partial tiles at the borders are covered.
    const int BLOCK_SIZE = 32;
    dim3 DimGrid((cudaMatA->col+BLOCK_SIZE-1)/BLOCK_SIZE, (cudaMatA->row+BLOCK_SIZE-1)/BLOCK_SIZE);
    dim3 DimBlock(BLOCK_SIZE, BLOCK_SIZE);

    // Launch the histogram kernel.
    cudaMatHistogram<<<DimGrid,DimBlock>>>(cudaMatA, histogram);
    checkCuda(cudaGetLastError(), "cudaMatHistogram launch");
    // The host must not read managed memory until the kernel has finished.
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    // Print the histogram; each bin is cleared after printing, as in the original listing.
    for(int i=0; i<NUM_BINS; i++) {
        printf("%u ", histogram[i]);
        histogram[i]=0;
    }
    printf("\n");

    // Release the pixel buffer before the descriptor that holds its pointer.
    checkCuda(cudaFree(cudaMatA->mat), "cudaFree(mat)");
    checkCuda(cudaFree(histogram), "cudaFree(histogram)");
    checkCuda(cudaFree(cudaMatA), "cudaFree(cudaMatA)");
    return 0;
}

优化（使用共享内存）：

统计直方图是CUDA共享内存的经典使用之一。应用场景：多个threads同时操作一个内存地址。使用共享内存的作用是减少访问全局内存（板卡显存）的次数，省略不必要的访存延时。

/*
 * Accumulates the histogram of an 8-bit image using a per-block partial
 * histogram in shared memory, then merges each block's result into the
 * global histogram.
 *
 * Launch layout: 2D grid of 2D blocks (blockDim.z is expected to be 1);
 * x indexes image columns, y indexes image rows, one thread per pixel.
 * Any block shape works: bins are distributed over the block's threads with
 * a strided loop, so the block needs neither 256 threads nor a width that
 * divides 256.
 * Shared memory: 256 * sizeof(unsigned int) bytes, statically allocated.
 *
 * cudaMat   - image descriptor (row/col dimensions and the pixel buffer `mat`,
 *             read as mat[x + y * col]); must be dereferenceable on the device.
 * histogram - 256 global counters; this kernel only adds to them, the caller
 *             is responsible for clearing them beforehand.
 */
__global__ void cudaMatHistogramAccShared(CudaImg8Mat *cudaMat, unsigned int *histogram){
    const unsigned int NUM_BINS = 256;
    __shared__ unsigned int s[256];

    // Linear thread index inside the block and the block's thread count.
    const unsigned int tid = threadIdx.x + threadIdx.y * blockDim.x;
    const unsigned int threadsPerBlock = blockDim.x * blockDim.y;

    // Phase 1: clear the partial histogram. Every thread takes part, including
    // those outside the image, so no bin is left uninitialized.
    for (unsigned int bin = tid; bin < NUM_BINS; bin += threadsPerBlock)
        s[bin] = 0;
    __syncthreads();

    // Phase 2: count this thread's pixel. Out-of-image threads skip the work
    // but must NOT return: they still have to reach the barriers and help
    // with the flush below.
    const unsigned int col = blockIdx.x*blockDim.x + threadIdx.x;
    const unsigned int row = blockIdx.y*blockDim.y + threadIdx.y;
    if (col < cudaMat->col && row < cudaMat->row) {
        const size_t pixel = (size_t)row * cudaMat->col + col;
        atomicAdd(&s[cudaMat->mat[pixel]], 1u);
    }
    // All shared-memory increments must be complete before any bin is read.
    __syncthreads();

    // Phase 3: merge the partial histogram into global memory, one atomic per
    // non-empty bin per block.
    for (unsigned int bin = tid; bin < NUM_BINS; bin += threadsPerBlock) {
        const unsigned int count = s[bin];
        if (count != 0)
            atomicAdd(&histogram[bin], count);
    }
}

这里的思路是将图像按照block的大小切块，分别在block里面统计小图片的直方图，然后再将所有的直方图相加，放回显存中。

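值得注意的是，使用共享内存时，在“写入”共享内存的阶段与后续“读取”共享内存的阶段之间必须加入 __syncthreads(); 它只同步同一个block内的所有threads（不同block之间不会同步），并且block内的每个thread都必须执行到它，因此不能让部分thread在它之前提前 return。另外这里还是不能避免使用“原子操作” atomicAdd，这是算法决定的，但是这里“原子操作”的对象是共享内存（片上存储，在许多GPU架构上与L1缓存共用同一块硬件，但它本身不是缓存），访存延时极短，远远小于显存的访存延时，从而获得加速效果。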