CUDA算法:矩阵转置(及其加速优化)

矩阵的转置就是把元素沿主对角线交换位置:设A是一个m×n阶矩阵,定义A的转置为这样一个n×m阶矩阵B,满足 b(j,i)=a(i,j)(即B的第j行第i列元素等于A的第i行第j列元素),记作 A'=B。

由于该算法数据之间没有依赖关系,很适合使用cuda进行并行加速运算。

核函数:

// Out-of-place matrix transpose: B = A'.
// One thread per element of A; expects a 2D launch whose x dimension covers
// A's columns and y dimension covers A's rows. cudaMatB->mat must be able to
// hold cudaMatA->row * cudaMatA->col bytes.
__global__ void cudaMatTrans(CudaImg8Mat *cudaMatA, CudaImg8Mat *cudaMatB){
    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
    const unsigned int w = cudaMatA->col;
    const unsigned int h = cudaMatA->row;

    if (x < w && y < h) {
        // element (y, x) of A becomes element (x, y) of B
        cudaMatB->mat[x * h + y] = cudaMatA->mat[y * w + x];

        // exactly one thread records the transposed dimensions
        if (x == 0 && y == 0) {
            cudaMatB->row = w;
            cudaMatB->col = h;
        }
    }
}

主函数展开(这里使用了lena的图像进行矩阵转置):

#include <cstdio>
#include <cstring>

#include <cuda_runtime.h>

#include <opencv2/opencv.hpp>

// Descriptor for an 8-bit single-channel matrix stored in row-major order.
// `mat` points to row*col bytes (allocated by the caller, e.g. with
// cudaMallocManaged so both host and device can access it).
// NOTE: the original wrote `typedef struct CudaImg8Mat { ... };` — a typedef
// with no declared name, which declares nothing and is a warning in C++.
typedef struct CudaImg8Mat {
    unsigned int row;    // number of rows
    unsigned int col;    // number of columns
    unsigned char* mat;  // row-major pixel data, row*col bytes
} CudaImg8Mat;

// Loads a grayscale image, transposes it on the GPU, and shows the result.
int main(int argc, char **argv) {
    // Load the source image as an 8-bit single-channel matrix.
    // (CV_LOAD_IMAGE_GRAYSCALE is the removed OpenCV 2 name; the OpenCV 3+
    //  constant is cv::IMREAD_GRAYSCALE.)
    cv::Mat img = cv::imread("../img/lena.jpg", cv::IMREAD_GRAYSCALE);
    if (img.empty()) {
        std::fprintf(stderr, "failed to load ../img/lena.jpg\n");
        return 1;
    }

    // Descriptors live in unified memory so both host and device can use them.
    // (The original passed sizeof(CudaMat) — a type that does not exist.)
    CudaImg8Mat *cudaMatA = nullptr;
    CudaImg8Mat *cudaMatB = nullptr;
    cudaMallocManaged(&cudaMatA, sizeof(CudaImg8Mat));
    cudaMallocManaged(&cudaMatB, sizeof(CudaImg8Mat));

    cudaMatA->row = (unsigned int)img.rows;
    cudaMatA->col = (unsigned int)img.cols;
    // A transpose swaps the dimensions — B is col x row of A.
    // (The original scaled them by an undefined macro MUTL.)
    cudaMatB->row = cudaMatA->col;
    cudaMatB->col = cudaMatA->row;

    // Same element count either way; one byte per pixel.
    const size_t numBytes =
        (size_t)cudaMatA->row * cudaMatA->col * sizeof(unsigned char);
    cudaMallocManaged(&cudaMatA->mat, numBytes);
    cudaMallocManaged(&cudaMatB->mat, numBytes);
    // Unified memory is host-accessible: a plain memcpy suffices.
    std::memcpy(cudaMatA->mat, img.data, numBytes);

    // 2D launch: x covers A's columns, y covers A's rows, matching the
    // kernel's bounds check against cudaMatA.
    const int BLOCK_SIZE = 32;
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid((cudaMatA->col + BLOCK_SIZE - 1) / BLOCK_SIZE,
                 (cudaMatA->row + BLOCK_SIZE - 1) / BLOCK_SIZE);

    cudaMatTrans<<<dimGrid, dimBlock>>>(cudaMatA, cudaMatB);
    cudaError_t err = cudaGetLastError();             // bad launch config
    if (err == cudaSuccess) {
        err = cudaDeviceSynchronize();                // async execution errors
    }
    if (err != cudaSuccess) {
        std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Copy the transposed pixels into an OpenCV image and display it.
    cv::Mat img2((int)cudaMatB->row, (int)cudaMatB->col, CV_8UC1);
    std::memcpy(img2.data, cudaMatB->mat, numBytes);
    cv::imshow("lenaEQU", img2);
    cv::waitKey(0);

    // Release device allocations before exit.
    cudaFree(cudaMatA->mat);
    cudaFree(cudaMatB->mat);
    cudaFree(cudaMatA);
    cudaFree(cudaMatB);
    return 0;
}

 

优化(使用共享内存):

// In-place tiled transpose for SQUARE matrices (requires row == col).
// Launch with blockDim = (32, 32) and a grid of ceil(n/32) x ceil(n/32).
// Each block at (bx, by) with by <= bx stages the tile pair (bx,by)/(by,bx)
// in shared memory and writes each back transposed to the other's position;
// diagonal blocks transpose themselves. This fixes the original, which
// (a) indexed shared memory with GLOBAL coordinates — far beyond the
//     per-block shared-memory limit for any realistic image,
// (b) called __syncthreads() after a divergent early return (UB/deadlock),
// (c) raced: blocks read elements other blocks were concurrently overwriting.
__global__ void cudaMatTrans(CudaMat *cudaMat){
    const int TILE = 32;
    // +1 column of padding removes shared-memory bank conflicts on the
    // transposed (column-wise) reads below.
    __shared__ unsigned char tileA[TILE][TILE + 1];
    __shared__ unsigned char tileB[TILE][TILE + 1];

    // Upper-triangular blocks exit as a whole block, so no thread that
    // reaches __syncthreads() is ever missing a partner.
    if (blockIdx.y > blockIdx.x) return;

    const unsigned int n = cudaMat->col;   // assumes square: row == col

    // Global coordinates of this thread's element in tile (bx,by) ...
    const unsigned int xA = blockIdx.x * TILE + threadIdx.x;
    const unsigned int yA = blockIdx.y * TILE + threadIdx.y;
    // ... and in the mirror tile (by,bx).
    const unsigned int xB = blockIdx.y * TILE + threadIdx.x;
    const unsigned int yB = blockIdx.x * TILE + threadIdx.y;

    // Stage both tiles (coalesced row-wise loads).
    if (xA < n && yA < n) tileA[threadIdx.y][threadIdx.x] = cudaMat->mat[yA * n + xA];
    if (xB < n && yB < n) tileB[threadIdx.y][threadIdx.x] = cudaMat->mat[yB * n + xB];
    __syncthreads();  // all loads done before any tile is overwritten

    // Write each tile back, transposed, to the other tile's location
    // (coalesced row-wise stores; the transpose happens in shared memory).
    if (xB < n && yB < n)
        cudaMat->mat[yB * n + xB] = tileA[threadIdx.x][threadIdx.y];
    if (blockIdx.x != blockIdx.y && xA < n && yA < n)
        cudaMat->mat[yA * n + xA] = tileB[threadIdx.x][threadIdx.y];

    // Swap the recorded dimensions (a no-op for the square case this kernel
    // requires, kept for parity with the original interface).
    if (blockIdx.x == 0 && blockIdx.y == 0 &&
        threadIdx.x == 0 && threadIdx.y == 0) {
        unsigned int t = cudaMat->row;
        cudaMat->row = cudaMat->col;
        cudaMat->col = t;
    }
}

但是这种写法的缺点很明显:它试图把整幅图像放进共享内存,而每个线程块的共享内存通常只有48KB左右,一张256×256的8位图像就已经是64KB,根本放不下;而且共享内存是按线程块私有的,用全局坐标去索引它本身就是错误的。正确的做法是分块(tiling):每个线程块只把自己负责的小块(例如32×32,仅1KB)读入共享内存,在共享内存中完成转置后再写回,这样既正确又能利用共享内存实现合并访存加速。

  • 3
    点赞
  • 12
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值