Cuda编程——使用shared memory优化矩阵乘法

本文介绍了如何通过使用CUDA的shared memory来优化矩阵乘法算法。相比于全局内存,共享内存访问速度更快,从而显著提高并行计算的性能,尤其是在大数据量场景中。
摘要由CSDN通过智能技术生成

在上一篇文章:第一个Cuda程序,矩阵相乘代码,我们设计了一种并行的矩阵乘法程序,效果和使用CPU计算的一样,但时间有了很大的降低,然而,这只是最基本的一种方法,事实上我们完全可以让程序变得更快!

仔细看看,会发现我们使用的是global memory,而shared memory的访问速度要远远快于global memory,所以我们将使用shared memory优化矩阵乘法,让程序更快!

#include <stdio.h>#include<stdlib.h>#include <cuda.h>#include <cuda_runtime.h>#include <device_launch_parameters.h>#include <device_functions.h>
# define BLOCK_SIZE 8# define M 6# define N 8# define K 6

// Unified-memory buffers, accessible from both host and device without
// explicit cudaMemcpy:
//   a      : M x N input matrix (row-major)
//   b      : N x K input matrix (row-major)
//   c_gpu  : M x K product computed by the GPU kernel
//   c_cpu  : M x K product computed by the CPU reference (for comparison)
__managed__ float a[M * N];__managed__ float b[N*K];__managed__ float c_gpu[M * K];__managed__ float c_cpu[M * K];


// Tiled matrix multiply: C (m x k) = A (m x n) * B (n x k), all row-major.
//
// Expected launch: 2D blocks of exactly (BLOCK_SIZE, BLOCK_SIZE) threads,
// grid sized so grid.x covers the k columns and grid.y covers the m rows.
// Uses 2 * BLOCK_SIZE^2 floats of static shared memory per block.
//
// Fixes vs. the original:
//  - honors the m/n/k parameters instead of the hard-coded M/N/K macros;
//  - ceil-divides the tile count so sizes not divisible by BLOCK_SIZE work;
//  - writes c once after the full dot product instead of storing a partial
//    sum to global memory on every tile iteration.
__global__ void gpu_matrix(float* a, float* b, float* c, const int m, const int n, const int k)
{
    __shared__ float sub_a[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float sub_b[BLOCK_SIZE][BLOCK_SIZE];

    int x = threadIdx.x + blockIdx.x * blockDim.x;  // output column in [0, k)
    int y = threadIdx.y + blockIdx.y * blockDim.y;  // output row    in [0, m)
    float temp = 0.0f;

    // Ceil-divide: include the final partial tile when n % BLOCK_SIZE != 0
    // (the original's n / BLOCK_SIZE silently dropped it).
    int steps = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;

    for (int step = 0; step < steps; step++) {
        // Stage one tile of A and one tile of B into shared memory,
        // zero-padding out-of-range elements so the inner product is
        // unaffected at the edges.
        int a_col = step * BLOCK_SIZE + threadIdx.x;
        sub_a[threadIdx.y][threadIdx.x] =
            (a_col < n && y < m) ? a[y * n + a_col] : 0.0f;

        int b_row = step * BLOCK_SIZE + threadIdx.y;
        sub_b[threadIdx.y][threadIdx.x] =
            (b_row < n && x < k) ? b[b_row * k + x] : 0.0f;

        __syncthreads();  // tiles fully loaded before any thread reads them

        for (int i = 0; i < BLOCK_SIZE; i++) {
            temp += sub_a[threadIdx.y][i] * sub_b[i][threadIdx.x];
        }

        __syncthreads();  // everyone done reading before the next overwrite
    }

    // Single global store of the completed dot product.
    if (x < k && y < m) {
        c[y * k + x] = temp;
    }
}
// CPU reference implementation: C (m x k) = A (m x n) * B (n x k),
// all matrices row-major. Used to validate the GPU result.
//
// Fix vs. the original: the body ignored the m/n/k parameters and used the
// global M/N/K macros, so the function only worked for one fixed size; it
// now honors its parameters (call sites passing M, N, K are unaffected).
void cpu_matrix(float* a, float* b, float* c, const int m, const int n, const int k)
{
    for (int y = 0; y < m; y++) {
        for (int x = 0; x < k; x++) {
            float temp = 0.0f;  // accumulate the dot product of row y and column x
            for (int step = 0; step < n; step++) {
                temp += a[y * n + step] * b[step * k + x];
            }
            c[y * k + x] = temp;
        }
    }
}
// Driver: fills A (M x N) and B (N x K) with ramp values, multiplies them on
// both the GPU (tiled shared-memory kernel) and the CPU (reference), prints
// both results, and verifies they agree.
int main()
{
    int x, y;

    // Initialize A and B: element (y, x) = x + y.
    for (y = 0; y < M; y++) {
        for (x = 0; x < N; x++) {
            a[y * N + x] = (float)(x + y);
        }
    }
    for (y = 0; y < N; y++) {
        for (x = 0; x < K; x++) {
            b[y * K + x] = (float)(x + y);
        }
    }

    printf("-----------------两个矩阵-#--------------\n");
    for (y = 0; y < M; y++) {
        for (x = 0; x < N; x++) {
            printf("%f ", a[y * N + x]);
        }
        printf("\n");
    }
    for (y = 0; y < N; y++) {
        for (x = 0; x < K; x++) {
            printf("%f  ", b[y * K + x]);
        }
        printf("\n");
    }

    // Grid layout: the kernel's x index walks columns (up to K) and its y
    // index walks rows (up to M), so grid.x must be the COLUMN block count.
    // (The original passed dimGrid(grid_rows, grid_cols), which only worked
    // because M == K in this example.)
    unsigned int grid_rows = (M + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (K + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    gpu_matrix<<<dimGrid, dimBlock>>>(a, b, c_gpu, M, N, K);

    // Kernel launches fail silently; check for launch-config errors, then
    // wait for completion before the host touches the managed result buffer.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    cpu_matrix(a, b, c_cpu, M, N, K);

    printf("---------------cpu计算结果---------#------------------\n");
    for (y = 0; y < M; y++) {
        for (x = 0; x < K; x++) {
            printf("%f  ", c_cpu[y * K + x]);
        }
        printf("\n");
    }

    printf("------------------gpu计算结果------#------------------\n");
    for (y = 0; y < M; y++) {
        for (x = 0; x < K; x++) {
            printf("%f  ", c_gpu[y * K + x]);
        }
        printf("\n");
    }

    // Verify GPU against the CPU reference (small tolerance: float results
    // across host/device are not guaranteed bit-identical).
    int errors = 0;
    for (y = 0; y < M; y++) {
        for (x = 0; x < K; x++) {
            if (fabsf(c_gpu[y * K + x] - c_cpu[y * K + x]) > 1e-3f) {
                errors++;
            }
        }
    }
    printf(errors == 0 ? "Result: PASS\n" : "Result: FAIL (%d mismatches)\n", errors);

    return errors == 0 ? 0 : 1;
}
-----------------两个矩阵-#--------------0.000000 1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.0000001.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.0000002.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.000000 9.0000003.000000 4.000000 5.000000 6.000000 7.000000 8.000000 9.000000 10.0000004.000000 5.000000 6.000000 7.000000 8.000000 9.000000 10.000000 11.0000005.000000 6.000000 7.000000 8.000000 9.000000 10.000000 11.000000 12.0000000.000000  1.000000  2.000000  3.000000  4.000000  5.0000001.000000  2.000000  3.000000  4.000000  5.000000  6.0000002.000000  3.000000  4.000000  5.000000  6.000000  7.0000003.000000  4.000000  5.000000  6.000000  7.000000  8.0000004.000000  5.000000  6.000000  7.000000  8.000000  9.0000005.000000  6.000000  7.000000  8.000000  9.000000  10.0000006.000000  7.000000  8.000000  9.000000  10.000000  11.0000007.000000  8.000000  9.000000  10.000000  11.000000  12.000000---------------cpu计算结果---------#------------------140.000000  168.000000  196.000000  224.000000  252.000000  280.000000168.000000  204.000000  240.000000  276.000000  312.000000  348.000000196.000000  240.000000  284.000000  328.000000  372.000000  416.000000224.000000  276.000000  328.000000  380.000000  432.000000  484.000000252.000000  312.000000  372.000000  432.000000  492.000000  552.000000280.000000  348.000000  416.000000  484.000000  552.000000  620.000000------------------gpu计算结果------#------------------140.000000  168.000000  196.000000  224.000000  252.000000  280.000000168.000000  204.000000  240.000000  276.000000  312.000000  348.000000196.000000  240.000000  284.000000  328.000000  372.000000  416.000000224.000000  276.000000  328.000000  380.000000  432.000000  484.000000252.000000  312.000000  372.000000  432.000000  492.000000  552.000000280.000000  348.000000  416.000000  484.000000  552.000000  620.000000

到了这里,我们能够使得矩阵乘法变得相当快(与仅使用CPU计算相比),这在实际应用中非常重要,尤其是数据计算量非常大的情况。

也许到了这里,这两个程序你并没有完全了解,但,不要担心,先把这些代码运行一下,体会使用GPU计算的魅力,为以后的学习打下基础。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值