// First contact with CUDA: this program computes a 2x2 matrix addition in parallel.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include "assert.h"
//cudaError_t addWithCuda(int c[][2], int a[][2], int b[][2], unsigned int size);
/*
1. The CPU and GPU run the program together: the CPU (the HOST) handles
   scheduling, while the GPU (the DEVICE) does the computation.
2. A grid contains multiple blocks; a block contains multiple threads.
3. Example: in "AAAABBBBCCCCDDDDEEEE", each of A..E is one block, the whole
   string is one grid, and each block holds four threads. Pointing at the
   first 'B' means thread 0 of block 1, so:
       i = blockDim.x * blockIdx.x + threadIdx.x
   where:
       blockDim.x  = threads per block, 4 in this example (for a 1-D block,
                     how many threads are in a row)
       blockIdx.x  = index of the current block within the grid, 1 here
       threadIdx.x = index of the current thread within its block, 0 here
   so i = 4 * 1 + 0.
4. A block can be 1-, 2-, or 3-dimensional, chosen by the dim3 you declare:
   dim3 blockSize(5) is 1-D — effectively blockSize(5, 1, 1), since
   unspecified dimensions default to 1; dim3 blockSize(5, 4) is 2-D, etc.
*/
// Element-wise addition of two 2x2 integer matrices: c = a + b.
// Expected launch: a grid/block shape covering at least 2x2 threads;
// each thread computes one element (i, j). Threads outside the 2x2
// range do nothing (bounds guard below). The printf calls are for
// learning/debugging only — device printf is serialized and slow.
__global__ void addKernel(int a[][2], int b[][2], int c[][2])
{
	printf("我是:threadIdx.x=%d threadIdx.y=%d threadIdx.z=%d || blockIdx.x=%d blockIdx.y=%d blockIdx.z=%d\n"
		, threadIdx.x, threadIdx.y, threadIdx.z
		, blockIdx.x, blockIdx.y, blockIdx.z);
	printf("blockDim.x=%d blockDim.y=%d blockDim.z=%d\n", blockDim.x, blockDim.y, blockDim.z);
	printf("gridDim.x=%d gridDim.y=%d gridDim.z=%d\n", gridDim.x, gridDim.y, gridDim.z);
	// NOTE: the original code had every thread unconditionally write 40 into
	// all four cells of c here. That raced with the guarded sum writes below
	// (another thread's "40" could land after this thread's sum), producing
	// nondeterministic results — removed.
	int i = threadIdx.x + blockIdx.x * blockDim.x;
	int j = threadIdx.y + blockIdx.y * blockDim.y;
	// Bounds guard: the grid may be launched larger than the 2x2 problem.
	if (i < 2 && j < 2) {
		c[i][j] = a[i][j] + b[i][j];
	}
}
// Report-and-exit error check for CUDA runtime calls. A non-success status
// from any earlier call is "sticky" and makes later calls fail mysteriously,
// so every call is checked at the point it is made.
#define CUDA_CHECK(call)                                                     \
	do {                                                                     \
		cudaError_t err_ = (call);                                           \
		if (err_ != cudaSuccess) {                                           \
			fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
					cudaGetErrorString(err_));                               \
			return 1;                                                        \
		}                                                                    \
	} while (0)

// Adds two 2x2 integer matrices on the GPU and prints the 2x2 result.
// Returns 0 on success, 1 on any CUDA runtime failure.
int main()
{
	// Host-side 2x2 matrices.
	int(*a)[2] = new int[2][2];
	int(*b)[2] = new int[2][2];
	int(*c)[2] = new int[2][2];
	a[0][0] = 1; a[0][1] = 2; a[1][0] = 3; a[1][1] = 4;
	b[0][0] = 1; b[0][1] = 2; b[1][0] = 3; b[1][1] = 4;

	const size_t bytes = sizeof(int) * 4;  // 2x2 ints

	// Device-side matrices.
	int(*device_a)[2], (*device_b)[2], (*device_c)[2];
	CUDA_CHECK(cudaMalloc((void **)&device_a, bytes));
	CUDA_CHECK(cudaMalloc((void **)&device_b, bytes));
	CUDA_CHECK(cudaMalloc((void **)&device_c, bytes));

	// Copy the inputs host -> device.
	CUDA_CHECK(cudaMemcpy(device_a, a, bytes, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(device_b, b, bytes, cudaMemcpyHostToDevice));

	// Launch configuration: one block of 2x2 threads — one thread per element.
	// dim3 is CUDA's 3-component integer vector type; unspecified components
	// default to 1.
	dim3 blocks(1, 1, 1);   // blocks in the grid
	dim3 threads(2, 2, 1);  // threads per block
	addKernel <<< blocks, threads >>> (device_a, device_b, device_c);
	CUDA_CHECK(cudaGetLastError());       // catches launch-configuration errors
	CUDA_CHECK(cudaDeviceSynchronize());  // catches asynchronous execution errors

	// Copy the result device -> host (blocking, so the kernel is finished).
	CUDA_CHECK(cudaMemcpy(c, device_c, bytes, cudaMemcpyDeviceToHost));

	for (int i = 0; i < 2; i++)
	{
		for (int j = 0; j < 2; j++)
		{
			printf("%10d", c[i][j]);
		}
		printf("\n");
	}

	// Release device and host memory (the original leaked all six buffers).
	cudaFree(device_a);
	cudaFree(device_b);
	cudaFree(device_c);
	delete[] a;
	delete[] b;
	delete[] c;
	return 0;
}