矩阵的转置就是沿主对角线交换元素的位置:设A为m×n阶矩阵,定义A的转置为这样一个n×m阶矩阵B,满足 b(i,j)=a(j,i)(即B的第i行第j列元素是A的第j行第i列元素),记A'=B。
由于该算法数据之间没有依赖关系,很适合使用cuda进行并行加速运算。
核函数:
// Out-of-place matrix transpose: writes B(x, y) = A(y, x), one thread per
// source element. Expects a 2-D grid/block layout that covers the source
// matrix cudaMatA (row x col); extra threads exit on the bounds guard.
// Thread (0,0) additionally publishes the swapped dimensions on cudaMatB.
__global__ void cudaMatTrans(CudaImg8Mat *cudaMatA, CudaImg8Mat *cudaMatB){
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;  // source column
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;  // source row
    const unsigned int srcRows = cudaMatA->row;
    const unsigned int srcCols = cudaMatA->col;

    // Threads that fall outside the source matrix have nothing to do.
    if (x >= srcCols || y >= srcRows)
        return;

    // A is row-major srcRows x srcCols; B is row-major srcCols x srcRows,
    // so element (y, x) of B lives at y + x * srcRows.
    cudaMatB->mat[y + x * srcRows] = cudaMatA->mat[x + y * srcCols];

    // Exactly one thread records the transposed shape on the destination.
    if (x == 0 && y == 0) {
        cudaMatB->row = srcCols;
        cudaMatB->col = srcRows;
    }
}
主函数展开(这里使用了lena的图像进行矩阵转置):
#include <cstdio>

#include <cuda_runtime.h>

#include <opencv2/opencv.hpp>
// 8-bit single-channel image matrix, stored row-major in `mat`
// (row * col bytes). Intended to live in CUDA unified memory so both
// host and device code can dereference it.
typedef struct CudaImg8Mat{
    unsigned int row;    // number of rows (image height)
    unsigned int col;    // number of columns (image width)
    unsigned char* mat;  // pixel data, element (r, c) at mat[c + r * col]
} CudaImg8Mat;           // fix: the typedef was missing its alias name
// Loads lena as 8-bit grayscale, transposes it on the GPU with
// cudaMatTrans, and displays the result.
int main(int argc, char **argv) {
    // cv::IMREAD_GRAYSCALE replaces the removed CV_LOAD_IMAGE_GRAYSCALE macro.
    cv::Mat img = cv::imread("../img/lena.jpg", cv::IMREAD_GRAYSCALE);
    if (img.empty()) {
        fprintf(stderr, "failed to load ../img/lena.jpg\n");
        return 1;
    }

    CudaImg8Mat *cudaMatA = nullptr, *cudaMatB = nullptr;
    // Fix: was sizeof(CudaMat) — an undefined type name.
    cudaMallocManaged(&cudaMatA, sizeof(CudaImg8Mat));
    cudaMallocManaged(&cudaMatB, sizeof(CudaImg8Mat));

    cudaMatA->row = (unsigned int)img.rows;
    cudaMatA->col = (unsigned int)img.cols;
    // A transpose swaps the dimensions (was MUTL*img.rows/cols with MUTL undefined).
    cudaMatB->row = (unsigned int)img.cols;
    cudaMatB->col = (unsigned int)img.rows;

    const size_t numBytes = (size_t)cudaMatA->row * cudaMatA->col * sizeof(unsigned char);
    cudaMallocManaged(&cudaMatA->mat, numBytes);
    cudaMallocManaged(&cudaMatB->mat, numBytes);
    cudaMemcpy(cudaMatA->mat, img.data, numBytes, cudaMemcpyHostToDevice);

    const int BLOCK_SIZE = 32;
    // The kernel bounds-checks against the SOURCE matrix, so the grid must
    // cover cudaMatA (the old code sized it from cudaMatB's dimensions).
    dim3 dimGrid((cudaMatA->col + BLOCK_SIZE - 1) / BLOCK_SIZE,
                 (cudaMatA->row + BLOCK_SIZE - 1) / BLOCK_SIZE);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    cudaMatTrans<<<dimGrid, dimBlock>>>(cudaMatA, cudaMatB);
    cudaError_t err = cudaGetLastError();  // catch launch-configuration errors
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();     // catch asynchronous execution errors
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Unified memory: after the sync the host may read cudaMatB directly.
    cv::Mat img2((int)cudaMatB->row, (int)cudaMatB->col, CV_8UC1);
    cudaMemcpy(img2.data, cudaMatB->mat, numBytes, cudaMemcpyDeviceToHost);
    cv::imshow("lenaTransposed", img2);
    cv::waitKey(0);

    cudaFree(cudaMatA->mat);
    cudaFree(cudaMatB->mat);
    cudaFree(cudaMatA);
    cudaFree(cudaMatB);
    return 0;
}
优化(使用共享内存,并修正发散分支中的 __syncthreads 问题):
// In-place transpose that stages the matrix in dynamic shared memory.
// Launch contract: dynamic shared size must be >= row * col * sizeof(int),
// i.e. the WHOLE matrix (the shared buffer is indexed by global coordinates).
// NOTE(review): because every block gets its own shared copy, the two-phase
// read-then-write scheme is only race-free when a SINGLE block covers the
// whole matrix; with multiple blocks, one block's global writes race with
// another block's global reads — TODO: restructure as a tiled transpose.
__global__ void cudaMatTrans(CudaMat *cudaMat){
    unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
    // Cache the dimensions up front: thread (0,0) rewrites them at the end.
    unsigned int nCols = cudaMat->col;
    unsigned int nRows = cudaMat->row;
    // FIX: the original returned early here, so boundary blocks reached
    // __syncthreads() with only part of their threads — undefined behavior.
    // Keep every thread alive through the barrier and predicate the work.
    bool inside = (col < nCols) && (row < nRows);

    extern __shared__ int s[];
    // Phase 1: copy the element into the shared staging buffer.
    if (inside)
        s[col + row * nCols] = cudaMat->mat[col + row * nCols];
    __syncthreads();  // all global reads of this block complete before writes
    // Phase 2: write it back to the transposed location.
    if (inside)
        cudaMat->mat[row + col * nRows] = s[col + row * nCols];

    // One thread swaps the recorded dimensions.
    if (col == 0 && row == 0) {
        cudaMat->row = nCols;
        cudaMat->col = nRows;
    }
}
但是缺点也很明显:每个线程块的共享内存默认上限为48KB,而该核函数要把整幅图像放进共享内存,且 s 声明为 int 数组,每个像素占4字节——一张256×256的8位图像就需要 256×256×4 = 256KB,远超上限,所以能处理的图像非常小,即使能加速也不太实用。