矩阵的转置就是沿主对角线交换元素的位置:设A为m×n阶矩阵,定义A的转置为这样一个n×m阶矩阵B,满足 b(i,j)=a(j,i)(即B的第i行第j列元素是A的第j行第i列元素),记A'=B。
由于该算法数据之间没有依赖关系,很适合使用cuda进行并行加速运算。
核函数:
// Out-of-place matrix transpose: writes B(x, y) = A(y, x), one thread per
// source element. Expects a 2-D grid/block layout that covers the source
// matrix cudaMatA (row x col); extra threads exit on the bounds guard.
// Thread (0,0) additionally publishes the swapped dimensions on cudaMatB.
__global__ void cudaMatTrans(CudaImg8Mat *cudaMatA, CudaImg8Mat *cudaMatB){
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;  // source column
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;  // source row
    const unsigned int srcRows = cudaMatA->row;
    const unsigned int srcCols = cudaMatA->col;

    // Threads that fall outside the source matrix have nothing to do.
    if (x >= srcCols || y >= srcRows)
        return;

    // A is row-major srcRows x srcCols; B is row-major srcCols x srcRows,
    // so element (y, x) of B lives at y + x * srcRows.
    cudaMatB->mat[y + x * srcRows] = cudaMatA->mat[x + y * srcCols];

    // Exactly one thread records the transposed shape on the destination.
    if (x == 0 && y == 0) {
        cudaMatB->row = srcCols;
        cudaMatB->col = srcRows;
    }
}
主函数展开(这里使用了lena的图像进行矩阵转置):
#include <cstdio>

#include <cuda_runtime.h>

#include <opencv2/opencv.hpp>
// 8-bit single-channel image matrix, stored row-major in `mat`
// (row * col bytes). Intended to live in CUDA unified memory so both
// host and device code can dereference it.
typedef struct CudaImg8Mat{
    unsigned int row;    // number of rows (image height)
    unsigned int col;    // number of columns (image width)
    unsigned char* mat;  // pixel data, element (r, c) at mat[c + r * col]
} CudaImg8Mat;           // fix: the typedef was missing its alias name
// Loads lena as 8-bit grayscale, transposes it on the GPU with
// cudaMatTrans, and displays the result.
int main(int argc, char **argv) {
    // cv::IMREAD_GRAYSCALE replaces the removed CV_LOAD_IMAGE_GRAYSCALE macro.
    cv::Mat img = cv::imread("../img/lena.jpg", cv::IMREAD_GRAYSCALE);
    if (img.empty()) {
        fprintf(stderr, "failed to load ../img/lena.jpg\n");
        return 1;
    }

    CudaImg8Mat *cudaMatA = nullptr, *cudaMatB = nullptr;
    // Fix: was sizeof(CudaMat) — an undefined type name.
    cudaMallocManaged(&cudaMatA, sizeof(CudaImg8Mat));
    cudaMallocManaged(&cudaMatB, sizeof(CudaImg8Mat));

    cudaMatA->row = (unsigned int)img.rows;
    cudaMatA->col = (unsigned int)img.cols;
    // A transpose swaps the dimensions (was MUTL*img.rows/cols with MUTL undefined).
    cudaMatB->row = (unsigned int)img.cols;
    cudaMatB->col = (unsigned int)img.rows;

    const size_t numBytes = (size_t)cudaMatA->row * cudaMatA->col * sizeof(unsigned char);
    cudaMallocManaged(&cudaMatA->mat, numBytes);
    cudaMallocManaged(&cudaMatB->mat, numBytes);
    cudaMemcpy(cudaMatA->mat, img.data, numBytes, cudaMemcpyHostToDevice);

    const int BLOCK_SIZE = 32;
    // The kernel bounds-checks against the SOURCE matrix, so the grid must
    // cover cudaMatA (the old code sized it from cudaMatB's dimensions).
    dim3 dimGrid((cudaMatA->col + BLOCK_SIZE - 1) / BLOCK_SIZE,
                 (cudaMatA->row + BLOCK_SIZE - 1) / BLOCK_SIZE);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    cudaMatTrans<<<dimGrid, dimBlock>>>(cudaMatA, cudaMatB);
    cudaError_t err = cudaGetLastError();  // catch launch-configuration errors
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();     // catch asynchronous execution errors
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Unified memory: after the sync the host may read cudaMatB directly.
    cv::Mat img2((int)cudaMatB->row, (int)cudaMatB->col, CV_8UC1);
    cudaMemcpy(img2.data, cudaMatB->mat, numBytes, cudaMemcpyDeviceToHost);
    cv::imshow("lenaTransposed", img2);
    cv::waitKey(0);

    cudaFree(cudaMatA->mat);
    cudaFree(cudaMatB->mat);
    cudaFree(cudaMatA);
    cudaFree(cudaMatB);
    return 0;
}
优化(使用共享内存,并修正发散分支中的 __syncthreads 问题):
// In-place transpose that stages the matrix in dynamic shared memory.
// Launch contract: dynamic shared size must be >= row * col * sizeof(int),
// i.e. the WHOLE matrix (the shared buffer is indexed by global coordinates).
// NOTE(review): because every block gets its own shared copy, the two-phase
// read-then-write scheme is only race-free when a SINGLE block covers the
// whole matrix; with multiple blocks, one block's global writes race with
// another block's global reads — TODO: restructure as a tiled transpose.
__global__ void cudaMatTrans(CudaMat *cudaMat){
    unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
    // Cache the dimensions up front: thread (0,0) rewrites them at the end.
    unsigned int nCols = cudaMat->col;
    unsigned int nRows = cudaMat->row;
    // FIX: the original returned early here, so boundary blocks reached
    // __syncthreads() with only part of their threads — undefined behavior.
    // Keep every thread alive through the barrier and predicate the work.
    bool inside = (col < nCols) && (row < nRows);

    extern __shared__ int s[];
    // Phase 1: copy the element into the shared staging buffer.
    if (inside)
        s[col + row * nCols] = cudaMat->mat[col + row * nCols];
    __syncthreads();  // all global reads of this block complete before writes
    // Phase 2: write it back to the transposed location.
    if (inside)
        cudaMat->mat[row + col * nRows] = s[col + row * nCols];

    // One thread swaps the recorded dimensions.
    if (col == 0 && row == 0) {
        cudaMat->row = nCols;
        cudaMat->col = nRows;
    }
}
但是缺点也很明显:每个线程块的共享内存默认上限为48KB,而该核函数要把整幅图像放进共享内存,且 s 声明为 int 数组,每个像素占4字节——一张256×256的8位图像就需要 256×256×4 = 256KB,远超上限,所以能处理的图像非常小,即使能加速也不太实用。