CUDA 编程demo
2、函数声明
Add the specifier __global__ to the function, which tells the CUDA C++ compiler that this is a function that runs on the GPU and can be called from CPU code.
2、内存管理
To compute on the GPU, I need to allocate memory accessible by the GPU. Unified Memory in CUDA makes this easy by providing a single memory space accessible by all GPUs and CPUs in your system. To allocate data in unified memory, call cudaMallocManaged()
, which returns a pointer that you can access from host (CPU) code or device (GPU) code. To free the data, just pass the pointer to cudaFree()
.
3、数据同步
I need the CPU to wait until the kernel is done before it accesses the results (because CUDA kernel launches don’t block the calling CPU thread). To do this I just call cudaDeviceSynchronize()
before doing the final error checking on the CPU.
4、host调用
int numBlocks = (N + blockSize - 1) / blockSize;
add<<<numBlocks, blockSize>>>(N, x, y);
numBlocks : block number per grid
blockSize : thread number per block
5、GPU 线程索引定位
threadIdx : 线程index,指在block中的idx
blockIdx : block index
gridDim : grid的维度,指grid中block的排列,单位 块
blockDim : 块的维度,指块中的thread的排列,单位 线程
所以:
int index=blockIdx.x*blockDim.x+threadIdx.x;
int stride=gridDim.x*blockDim.x;
index:起始位置
stride:步长,等于整个 grid 中的线程总数(gridDim.x*blockDim.x),每个线程以此为步长循环处理元素(即 grid-stride loop)
6、参考代码:
// Angle brackets for system/CUDA headers; quotes are reserved for
// project-local files.
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>

#include <iostream>
#include <math.h>
// Prints basic properties of CUDA device 0: name, SM count, and the
// per-SM / per-block thread and shared-memory limits.
// Fix: the original ignored the return code of cudaGetDeviceProperties,
// so with no usable device it printed an uninitialized devProp.
void helloCuda()
{
    std::cout << "hello cuda" << std::endl;
    int dev = 0;
    cudaDeviceProp devProp;
    // On failure devProp is not filled in — bail out instead of printing it.
    cudaError_t err = cudaGetDeviceProperties(&devProp, dev);
    if (err != cudaSuccess)
    {
        std::cerr << "cudaGetDeviceProperties failed: "
                  << cudaGetErrorString(err) << std::endl;
        return;
    }
    std::cout << devProp.name << std::endl;                        // device name
    std::cout << devProp.multiProcessorCount << std::endl;         // number of SMs
    std::cout << devProp.maxThreadsPerMultiProcessor << std::endl; // max resident threads per SM
    std::cout << devProp.sharedMemPerMultiprocessor << std::endl;  // shared memory per SM (bytes)
    std::cout << devProp.maxThreadsPerBlock << std::endl;          // max threads per block
    std::cout << devProp.sharedMemPerBlock << std::endl;           // shared memory per block (bytes)
}
// Element-wise in-place addition: a[i] += b[i] for every i in [0, N).
// Uses a grid-stride loop over a 1-D launch, so any grid/block
// configuration covers all N elements (remainder included).
__global__ void
addKernel(int N,float *a,float *b)
{
    // Debug helpers (enable to inspect the launch geometry):
    // printf("gridDim: %d %d %d\n", gridDim.x, gridDim.y, gridDim.z);
    // printf("blockDim: %d %d %d\n", blockDim.x, blockDim.y, blockDim.z);
    // printf("blockIdx: %d %d %d\n", blockIdx.x, blockIdx.y, blockIdx.z);

    // This thread's starting position within the flattened grid.
    const int firstElem = blockDim.x * blockIdx.x + threadIdx.x;
    // Total number of threads in the grid — the hop size per iteration.
    const int gridSpan = blockDim.x * gridDim.x;

    for (int i = firstElem; i < N; i += gridSpan)
    {
        a[i] += b[i];
    }
}
// Demo driver: allocates two unified-memory vectors, sums them on the GPU,
// and reports the maximum deviation from the expected result.
// Fixes vs. original: the kernel-launch token was written "<< <...>>>",
// which splits the required "<<<" token and does not parse; float arrays
// were initialized from double literals; no CUDA call was error-checked.
int main(int argc,char**argv)
{
    helloCuda();

    const int N = 1 << 20;  // 2^20 elements
    float *x = nullptr, *y = nullptr;

    // Unified memory: one pointer usable from both host and device code.
    if (cudaMallocManaged(&x, N * sizeof(float)) != cudaSuccess ||
        cudaMallocManaged(&y, N * sizeof(float)) != cudaSuccess)
    {
        std::cerr << "cudaMallocManaged failed" << std::endl;
        return 1;
    }

    // Initialize on the host; float literals avoid double->float narrowing.
    for (int i = 0; i < N; i++)
    {
        x[i] = 4.3f;
        y[i] = 3.7f;
    }

    // Ceil-divide so the grid covers all N elements; the kernel's
    // grid-stride loop absorbs any remainder.
    const int blockSize = 256;
    const int numBlocks = (N + blockSize - 1) / blockSize;
    addKernel<<<numBlocks, blockSize>>>(N, x, y);

    // Launch-configuration errors surface via cudaGetLastError();
    // in-kernel faults surface at the synchronize.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        std::cerr << "kernel failed: " << cudaGetErrorString(err) << std::endl;
        cudaFree(x);
        cudaFree(y);
        return 1;
    }

    // Every element should now be 4.3f + 3.7f == 8.0f.
    double maxError = 0.0;
    for (int i = 0; i < N; i++)
        maxError = fmax(maxError, fabs(x[i] - 8.0));
    std::cout << "maxError=" << maxError << std::endl;

    cudaFree(x);
    cudaFree(y);
    cudaProfilerStop();
    cudaDeviceReset();
    return 0;
}
7、参考资料:
An Even Easier Introduction to CUDA