#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <time.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#define M 4
#define N 4
// Element-wise sum of two M x N integer matrices: c[r][k] = a[r][k] + b[r][k].
//
// Parameters:
//   a, b - input matrices, read inside the kernel (must be device-accessible).
//   c    - output matrix, written inside the kernel (must be device-accessible).
//
// Launch layout: any 1D or 2D grid/block shape is valid. Each thread starts at
// its global (x, y) position and strides by the total thread count along each
// axis, so every element is processed exactly once regardless of grid size.
// The x axis is mapped to the column index (the contiguous dimension of the
// row-major arrays) so that threads adjacent in x touch adjacent ints.
// No shared memory is used.
__global__ void add(int a[M][N], int b[M][N], int c[M][N])
{
    // Total number of threads along each axis of the launch.
    const int colStride = blockDim.x * gridDim.x;
    const int rowStride = blockDim.y * gridDim.y;

    for (int row = threadIdx.y + blockIdx.y * blockDim.y; row < M; row += rowStride)
    {
        for (int col = threadIdx.x + blockIdx.x * blockDim.x; col < N; col += colStride)
        {
            c[row][col] = a[row][col] + b[row][col];
        }
    }
}
// Terminates the program with a diagnostic when a CUDA runtime call fails.
//   status - return code of the CUDA call being checked.
//   what   - short description of the operation, used in the error message.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        std::fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
    }
}

// Fills two M x N host matrices with 2 and 3, adds them on the GPU with the
// `add` kernel, prints the resulting matrix and the time measured by clock()
// around the copy-in / launch / copy-out sequence.
// Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
int main()
{
    const size_t bytes = sizeof(int) * M * N;

    // Host-side matrices.
    int (*a_h)[N] = new int[M][N];
    int (*b_h)[N] = new int[M][N];
    int (*c_h)[N] = new int[M][N];

    // Device-side matrices.
    int (*a_d)[N] = nullptr;
    int (*b_d)[N] = nullptr;
    int (*c_d)[N] = nullptr;
    checkCuda(cudaMalloc((void**)&a_d, bytes), "cudaMalloc(a_d)");
    checkCuda(cudaMalloc((void**)&b_d, bytes), "cudaMalloc(b_d)");
    checkCuda(cudaMalloc((void**)&c_d, bytes), "cudaMalloc(c_d)");

    // Initialise the inputs: every element of a is 2, every element of b is 3.
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < N; j++)
        {
            a_h[i][j] = 2;
            b_h[i][j] = 3;
        }
    }

    clock_t start = clock();

    checkCuda(cudaMemcpy(a_d, a_h, bytes, cudaMemcpyHostToDevice), "copy a to device");
    checkCuda(cudaMemcpy(b_d, b_h, bytes, cudaMemcpyHostToDevice), "copy b to device");

    // Launch enough 4x4 blocks to provide at least M threads along x and
    // N threads along y (ceil-division); for M = N = 4 this is a single block.
    const dim3 block(4, 4);
    const dim3 grid((M + block.x - 1) / block.x, (N + block.y - 1) / block.y);
    add<<<grid, block>>>(a_d, b_d, c_d);
    // A kernel launch returns no status itself; launch errors are read here.
    checkCuda(cudaGetLastError(), "add kernel launch");

    // Blocking copy: it also waits for the kernel, so the timed region below
    // includes the kernel's execution.
    checkCuda(cudaMemcpy(c_h, c_d, bytes, cudaMemcpyDeviceToHost), "copy c to host");

    clock_t end = clock();

    // Print the result matrix, one row per line.
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < N; j++)
        {
            std::printf("%d ", c_h[i][j]);
        }
        std::printf("\n");
    }

    float timel = (float)(end - start) / CLOCKS_PER_SEC;
    std::printf("执行时间为:%f\n", timel);

    // Memory obtained with new[] must be released with delete[], not free().
    delete[] a_h;
    delete[] b_h;
    delete[] c_h;

    checkCuda(cudaFree(a_d), "cudaFree(a_d)");
    checkCuda(cudaFree(b_d), "cudaFree(b_d)");
    checkCuda(cudaFree(c_d), "cudaFree(c_d)");
    return 0;
}