给定两个矩阵 A 和 B,维数分别为 M×N 和 N×K,
计算它们的乘积 C = A·B,结果矩阵的维数为 M×K。
本例中使用的参数为:
M = 4;
N = 5;
K = 4;
#include <stdio.h>
#include<stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
# define BLOCK_SIZE 16
// Naive matrix-multiply kernel: c (M x K) = a (M x N) * b (N x K).
// All three matrices are indexed as flat row-major arrays.
// Launch layout: 2D grid of 2D blocks; the x dimension spans output columns
// (0..K-1) and the y dimension spans output rows (0..M-1). Each thread
// computes at most one output element; threads outside the M x K range exit
// without touching memory. No shared memory is used.
__global__ void gpu_matrix(float* a, float* b, float* c, const int M, const int N, const int K)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the grid tail: the launch may contain more threads than outputs.
    if (col >= K || row >= M)
    {
        return;
    }

    // Dot product of row `row` of a with column `col` of b.
    const float* lhs_row = a + row * N;
    float acc = 0;
    for (int k = 0; k < N; ++k)
    {
        acc += lhs_row[k] * b[k * K + col];
    }
    c[row * K + col] = acc;
}
// Host reference implementation of c (M x K) = a (M x N) * b (N x K).
// All matrices are flat row-major arrays; every element of c is overwritten.
void cpu_matrix(float*a, float*b, float*c,const int M, const int N, const int K)
{
    for (int row = 0; row < M; ++row)
    {
        const float* lhs_row = a + row * N;   // start of row `row` in a
        float* out_row = c + row * K;         // start of row `row` in c
        for (int col = 0; col < K; ++col)
        {
            // Accumulate the dot product in the same order as the GPU kernel.
            float acc = 0;
            for (int k = 0; k < N; ++k)
            {
                acc += lhs_row[k] * b[k * K + col];
            }
            out_row[col] = acc;
        }
    }
}
// Abort with a diagnostic when a CUDA runtime call fails, so that an early
// error is reported where it happens instead of surfacing later.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t cuda_err_ = (call);                                      \
        if (cuda_err_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(cuda_err_));                          \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)
#endif

// Demo driver: builds A (M x N) and B (N x K) with element value row + col,
// multiplies them on the GPU (gpu_matrix) and on the CPU (cpu_matrix), and
// prints the inputs followed by both results.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main()
{
    int x, y;
    // Problem dimensions.
    const int M = 4;
    const int N = 5;
    const int K = 4;

    const size_t bytes_a = sizeof(float) * M * N;
    const size_t bytes_b = sizeof(float) * N * K;
    const size_t bytes_c = sizeof(float) * M * K;

    // Pinned host buffers: h_a (M,N), h_b (N,K), h_c (M,K) receives the GPU
    // result, h_cc (M,K) receives the CPU result.
    float* h_a, * h_b, * h_c, * h_cc;
    CUDA_CHECK(cudaMallocHost((void**)&h_a, bytes_a));
    CUDA_CHECK(cudaMallocHost((void**)&h_b, bytes_b));
    CUDA_CHECK(cudaMallocHost((void**)&h_c, bytes_c));
    CUDA_CHECK(cudaMallocHost((void**)&h_cc, bytes_c));

    // Initialize the input matrices.
    for (y = 0; y < M; y++)
    {
        for (x = 0; x < N; x++)
        {
            h_a[y * N + x] = static_cast<float>(x + y);
        }
    }
    for (y = 0; y < N; y++)
    {
        for (x = 0; x < K; x++)
        {
            h_b[y * K + x] = static_cast<float>(x + y);
        }
    }

    printf("-----------------两个矩阵-#--------------\n");
    for (y = 0; y < M; y++)
    {
        for (x = 0; x < N; x++)
        {
            printf("%f ", h_a[y * N + x]);
        }
        printf("\n");
    }
    for (y = 0; y < N; y++)
    {
        for (x = 0; x < K; x++)
        {
            printf("%f ", h_b[y * K + x]);
        }
        printf("\n");
    }

    // Device buffers and host-to-device transfers.
    float* d_a, * d_b, * d_c;
    CUDA_CHECK(cudaMalloc((void**)&d_a, bytes_a));
    CUDA_CHECK(cudaMalloc((void**)&d_b, bytes_b));
    CUDA_CHECK(cudaMalloc((void**)&d_c, bytes_c));
    CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes_a, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes_b, cudaMemcpyHostToDevice));

    // The kernel maps x to output columns (K) and y to output rows (M), so
    // grid.x must be derived from K and grid.y from M. Passing them the other
    // way round only works by accident when M == K.
    unsigned int grid_rows = (M + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (K + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    gpu_matrix<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, M, N, K);
    CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran
    CUDA_CHECK(cudaMemcpy(h_c, d_c, bytes_c, cudaMemcpyDeviceToHost));

    // CPU reference result.
    cpu_matrix(h_a, h_b, h_cc, M, N, K);

    // Print the CPU result.
    printf("---------------cpu计算结果---------#------------------\n");
    for (y = 0; y < M; y++)
    {
        for (x = 0; x < K; x++)
        {
            printf("%f ", h_cc[y * K + x]);
        }
        printf("\n");
    }
    printf("------------------gpu计算结果------#------------------\n");
    // Print the GPU result.
    for (y = 0; y < M; y++)
    {
        for (x = 0; x < K; x++)
        {
            printf("%f ", h_c[y * K + x]);
        }
        printf("\n");
    }

    CUDA_CHECK(cudaFreeHost(h_a));
    CUDA_CHECK(cudaFreeHost(h_b));
    CUDA_CHECK(cudaFreeHost(h_c));
    CUDA_CHECK(cudaFreeHost(h_cc));
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    return 0;
}
结果:
-----------------两个矩阵-#--------------
0.000000 1.000000 2.000000 3.000000 4.000000
1.000000 2.000000 3.000000 4.000000 5.000000
2.000000 3.000000 4.000000 5.000000 6.000000
3.000000 4.000000 5.000000 6.000000 7.000000
0.000000 1.000000 2.000000 3.000000
1.000000 2.000000 3.000000 4.000000
2.000000 3.000000 4.000000 5.000000
3.000000 4.000000 5.000000 6.000000
4.000000 5.000000 6.000000 7.000000
---------------cpu计算结果---------#------------------
30.000000 40.000000 50.000000 60.000000
40.000000 55.000000 70.000000 85.000000
50.000000 70.000000 90.000000 110.000000
60.000000 85.000000 110.000000 135.000000
------------------gpu计算结果------#------------------
30.000000 40.000000 50.000000 60.000000
40.000000 55.000000 70.000000 85.000000
50.000000 70.000000 90.000000 110.000000
60.000000 85.000000 110.000000 135.000000
GPU 与 CPU 的计算结果完全一致,符合预期。