// Learning GPU acceleration: GPU matrix routines written with reference to CUDA samples and online blog posts.
// Matrix transpose
// Tiled matrix-transpose kernel (pattern of the classic CUDA SDK "transpose" sample).
// Reads a width x height row-major matrix from indata and writes its transpose
// (height x width, row-major) to outdata.
// Launch expectations: blockDim = (BLOCK_DIM, BLOCK_DIM),
// gridDim = (ceil(width/BLOCK_DIM), ceil(height/BLOCK_DIM)).
__global__ void transposeGPU(double *outdata, double *indata, int width, int height)
{
// Shared-memory tile. The "+ 1" pads each row by one element so that the
// column-wise (transposed) reads below hit different banks — avoids
// shared-memory bank conflicts.
__shared__ double block[BLOCK_DIM][BLOCK_DIM + 1];
// Global coordinates of the input element this thread loads.
unsigned int xIndex = blockIdx.x*BLOCK_DIM + threadIdx.x;
unsigned int yIndex = blockIdx.y*BLOCK_DIM + threadIdx.y;
// Guard partial tiles when width/height are not multiples of BLOCK_DIM.
if ((xIndex < width) && (yIndex < height))
{
unsigned int index_in = yIndex*width + xIndex;
// threadIdx.x varies fastest along the row -> coalesced global load.
block[threadIdx.y][threadIdx.x] = indata[index_in];
}
// Every thread must finish writing its tile element before any thread reads
// the tile transposed. Barrier is outside the guard so all threads reach it.
__syncthreads();
// Swap the BLOCK indices (not the thread indices): this block writes the
// mirror tile of the output, again with threadIdx.x contiguous so the global
// store stays coalesced.
xIndex = blockIdx.y*BLOCK_DIM + threadIdx.x;
yIndex = blockIdx.x*BLOCK_DIM + threadIdx.y;
// Output matrix is height x width, so the bounds are swapped here.
if ((xIndex < height) && (yIndex < width))
{
unsigned int index_out = yIndex*height + xIndex;
// Transposed read from the tile; the +1 padding above keeps this conflict-free.
outdata[index_out] = block[threadIdx.x][threadIdx.y];
}
}
// Host wrapper: transposes a width x height row-major matrix on the GPU.
// Allocates device buffers, copies the input up, launches transposeGPU with a
// BLOCK_DIM x BLOCK_DIM block layout, and copies the height x width result
// back into outdata. Synchronous: the blocking cudaMemcpy at the end waits for
// the kernel before returning.
void transpose(double *outdata, double *indata, int width, int height)
{
    double *d_a, *d_b;
    size_t bytes = sizeof(double) * width * height;
    cudaMalloc((void**)&d_a, bytes);
    cudaMalloc((void**)&d_b, bytes);
    cudaMemcpy((void*)d_a, (void*)indata, bytes, cudaMemcpyHostToDevice);
    dim3 Threads(BLOCK_DIM, BLOCK_DIM);
    // Integer ceiling division: covers edge tiles when width/height are not
    // multiples of BLOCK_DIM, without the float round-trip of ceil().
    int b_x = (width + BLOCK_DIM - 1) / BLOCK_DIM;
    int b_y = (height + BLOCK_DIM - 1) / BLOCK_DIM;
    dim3 Blocks(b_x, b_y);
    transposeGPU<<<Blocks, Threads>>>(d_b, d_a, width, height);
    // Blocking device-to-host copy; implicitly synchronizes with the kernel.
    cudaMemcpy((void*)outdata, (void*)d_b, bytes, cudaMemcpyDeviceToHost);
    // Fix: the original leaked both device buffers on every call.
    cudaFree(d_a);
    cudaFree(d_b);
}
// Matrix multiplication
template<int BLOCK_SIZE> __global__ void MatrixMulGPU(double *c, const double *a, const double *b, unsigned int WA, unsigned int WB)
{
// Block index
in