第一个Cuda程序,矩阵相乘代码

本文展示了如何使用CUDA在GPU上执行矩阵乘法:将M×N矩阵与N×K矩阵相乘得到M×K的结果,并将GPU的计算结果与CPU版本的计算结果进行对比,两者完全一致。
摘要由CSDN通过智能技术生成

有两个矩阵,维数分别为MxN,  NxK

得到相乘后的矩阵,维数为MxK

这里参数为

  M = 4;  N = 5;  K = 4;
#include <stdio.h>
#include <stdlib.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
# define BLOCK_SIZE 16
/*
 * Naive dense matrix product on the GPU: c = a * b.
 *
 * All three matrices are indexed as flat row-major arrays:
 *   a is M x N, b is N x K, c is M x K.
 *
 * Launch layout: a 2D grid of 2D blocks in which each thread produces one
 * element of c. The x dimension of the launch indexes the K columns of c and
 * the y dimension indexes its M rows, so the grid must span at least K threads
 * in x and M threads in y. Threads that fall outside the matrix do nothing.
 *
 * a and b are only read; c is only written. Because a and b are declared
 * __restrict__, c must not overlap either input.
 * No shared memory is used.
 */
__global__ void gpu_matrix(const float* __restrict__ a, const float* __restrict__ b, float* c, const int M, const int N, const int K)
{
    const int x = threadIdx.x + blockIdx.x * blockDim.x;  // column of c
    const int y = threadIdx.y + blockIdx.y * blockDim.y;  // row of c

    // The grid rarely divides the matrix evenly; drop the surplus threads.
    if (x >= K || y >= M)
    {
        return;
    }

    // Flat offsets are computed in size_t so that y * N, step * K and y * K
    // cannot overflow a 32-bit int on large matrices.
    const size_t a_row = (size_t)y * (size_t)N;

    float sum = 0.0f;
    for (int step = 0; step < N; ++step)
    {
        sum += a[a_row + step] * b[(size_t)step * (size_t)K + x];
    }
    c[(size_t)y * (size_t)K + x] = sum;
}
/*
 * Host-side reference matrix product: c = a * b.
 *
 * a is read as an M x N row-major array, b as N x K, and the M x K result is
 * written row-major into c. Each output element is accumulated in a float,
 * walking the shared dimension from 0 to N-1.
 */
void cpu_matrix(float*a, float*b, float*c,const int M, const int N, const int K)
{
    for (int row = 0; row < M; ++row)
    {
        // Start of the current row in the left operand and in the output.
        const float* lhs_row = a + row * N;
        float* out_row = c + row * K;

        for (int col = 0; col < K; ++col)
        {
            float acc = 0;
            for (int k = 0; k < N; ++k)
            {
                acc += lhs_row[k] * b[k * K + col];
            }
            out_row[col] = acc;
        }
    }
}




/* Print the CUDA error text for `status` and terminate unless it is cudaSuccess. */
static void check_cuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Multiplies an M x N matrix by an N x K matrix on the GPU (gpu_matrix) and on
 * the CPU (cpu_matrix), then prints both inputs and both results so they can
 * be compared by eye. Every CUDA runtime call is checked; any failure aborts
 * the program with a message on stderr.
 */
int main()
{
    int x, y;

    // Problem size: (M x N) * (N x K) -> (M x K).
    const int M = 4;
    const int N = 5;
    const int K = 4;

    // Host buffers: h_a is (M,N), h_b is (N,K), h_c holds the GPU result (M,K),
    // h_cc holds the CPU reference result (M,K).
    float* h_a, * h_b, * h_c, * h_cc;
    check_cuda(cudaMallocHost((void**)&h_a, sizeof(float) * M * N), "cudaMallocHost h_a");
    check_cuda(cudaMallocHost((void**)&h_b, sizeof(float) * N * K), "cudaMallocHost h_b");
    check_cuda(cudaMallocHost((void**)&h_c, sizeof(float) * M * K), "cudaMallocHost h_c");
    check_cuda(cudaMallocHost((void**)&h_cc, sizeof(float) * M * K), "cudaMallocHost h_cc");

    // Fill both inputs with element = row index + column index.
    for (y = 0; y < M; y++)
    {
        for (x = 0; x < N; x++)
        {
            h_a[y * N + x] = (float)(x + y);
        }
    }
    for (y = 0; y < N; y++)
    {
        for (x = 0; x < K; x++)
        {
            h_b[y * K + x] = (float)(x + y);
        }
    }

    // Print the two input matrices.
    printf("-----------------两个矩阵-#--------------\n");
    for (y = 0; y < M; y++)
    {
        for (x = 0; x < N; x++)
        {
            printf("%f ", h_a[y * N + x]);
        }
        printf("\n");
    }
    for (y = 0; y < N; y++)
    {
        for (x = 0; x < K; x++)
        {
            printf("%f  ", h_b[y * K + x]);
        }
        printf("\n");
    }

    // Device buffers and host-to-device upload of the inputs.
    float* d_a, * d_b, * d_c;
    check_cuda(cudaMalloc((void**)&d_a, sizeof(float) * M * N), "cudaMalloc d_a");
    check_cuda(cudaMalloc((void**)&d_b, sizeof(float) * N * K), "cudaMalloc d_b");
    check_cuda(cudaMalloc((void**)&d_c, sizeof(float) * M * K), "cudaMalloc d_c");
    check_cuda(cudaMemcpy(d_a, h_a, sizeof(float) * M * N, cudaMemcpyHostToDevice), "copy h_a -> d_a");
    check_cuda(cudaMemcpy(d_b, h_b, sizeof(float) * N * K, cudaMemcpyHostToDevice), "copy h_b -> d_b");

    // gpu_matrix maps the x launch dimension to columns (K) and the y launch
    // dimension to rows (M), so the grid must be (columns, rows). Passing
    // (rows, columns) only happens to work while ceil(M/BLOCK_SIZE) equals
    // ceil(K/BLOCK_SIZE).
    unsigned int grid_rows = (M + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (K + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    gpu_matrix<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, M, N, K);
    check_cuda(cudaGetLastError(), "gpu_matrix launch");          // bad launch configuration
    check_cuda(cudaDeviceSynchronize(), "gpu_matrix execution");  // faults raised while the kernel ran

    check_cuda(cudaMemcpy(h_c, d_c, sizeof(float) * M * K, cudaMemcpyDeviceToHost), "copy d_c -> h_c");

    // CPU reference result.
    cpu_matrix(h_a, h_b, h_cc, M, N, K);

    // Print the CPU result.
    printf("---------------cpu计算结果---------#------------------\n");
    for (y = 0; y < M; y++)
    {
        for (x = 0; x < K; x++)
        {
            printf("%f  ", h_cc[y * K + x]);
        }
        printf("\n");
    }

    // Print the GPU result.
    printf("------------------gpu计算结果------#------------------\n");
    for (y = 0; y < M; y++)
    {
        for (x = 0; x < K; x++)
        {
            printf("%f  ", h_c[y * K + x]);
        }
        printf("\n");
    }

    check_cuda(cudaFreeHost(h_a), "cudaFreeHost h_a");
    check_cuda(cudaFreeHost(h_b), "cudaFreeHost h_b");
    check_cuda(cudaFreeHost(h_c), "cudaFreeHost h_c");
    check_cuda(cudaFreeHost(h_cc), "cudaFreeHost h_cc");
    check_cuda(cudaFree(d_a), "cudaFree d_a");
    check_cuda(cudaFree(d_b), "cudaFree d_b");
    check_cuda(cudaFree(d_c), "cudaFree d_c");

    return 0;
}

​​​​​​​

结果:

-----------------两个矩阵-#--------------
0.000000 1.000000 2.000000 3.000000 4.000000
1.000000 2.000000 3.000000 4.000000 5.000000
2.000000 3.000000 4.000000 5.000000 6.000000
3.000000 4.000000 5.000000 6.000000 7.000000
0.000000  1.000000  2.000000  3.000000
1.000000  2.000000  3.000000  4.000000
2.000000  3.000000  4.000000  5.000000
3.000000  4.000000  5.000000  6.000000
4.000000  5.000000  6.000000  7.000000
---------------cpu计算结果---------#------------------
30.000000  40.000000  50.000000  60.000000
40.000000  55.000000  70.000000  85.000000
50.000000  70.000000  90.000000  110.000000
60.000000  85.000000  110.000000  135.000000
------------------gpu计算结果------#------------------
30.000000  40.000000  50.000000  60.000000
40.000000  55.000000  70.000000  85.000000
50.000000  70.000000  90.000000  110.000000
60.000000  85.000000  110.000000  135.000000

结果很符合预期

  • 6
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值