1. Allocate GPU (device) memory
/* 1. Set up sizes and allocate the device (GPU) buffer. */
int N = 32;                          /* number of elements */
size_t nbytes = N * sizeof(FLOAT);   /* buffer size in bytes; size_t matches sizeof and avoids int overflow for large N */
FLOAT *dx = NULL, *hx = NULL;        /* dx: device buffer, hx: host buffer */
/* allocate GPU mem — real code should check the returned cudaError_t */
cudaMalloc((void **)&dx, nbytes);
2. Allocate pinned (page-locked) CPU host memory with cudaMallocHost — host-device copies from pinned memory are faster than from pageable malloc'd memory
/* 2. Allocate pinned (page-locked) host memory; the CUDA runtime can
 * transfer to/from pinned memory faster than from pageable malloc'd memory. */
cudaMallocHost((void **)&hx, nbytes);
3. Allocate Unified Memory — a single allocation accessible from either the CPU or the GPU
/* 3. Allocate Unified Memory, accessible from both CPU and GPU.
 * NOTE(review): x and y are not declared anywhere in this file — this
 * assumes `float *x, *y;` exists elsewhere; verify against the full source. */
cudaMallocManaged(&x, N*sizeof(float));
cudaMallocManaged(&y, N*sizeof(float));
4. Allocate pageable CPU memory with malloc (alternative to the pinned allocation in step 2)
/* 4. Allocate pageable host memory — an alternative to the cudaMallocHost
 * call in step 2. If both are executed, this reassignment leaks the pinned
 * allocation; use one or the other. */
hx = (FLOAT *) malloc(nbytes);
5. Copy input data to the GPU
/* 5. Copy input data from host (hx) to device (dx). */
cudaMemcpy(dx, hx, nbytes, cudaMemcpyHostToDevice);
6. Launch the kernel on the GPU
/* 6. Launch the kernel with a ceil-divide grid so every element is covered
 * (for N = 32 and blockSize = 512 this is a single block). */
int blockSize = 512;
int numBlock = (N + blockSize - 1) / blockSize;
/* NOTE(review): this launch passes the managed pointers x/y from step 3,
 * not the dx buffer the data was copied into at step 5 — confirm which
 * example this fragment belongs to. The original trailing comment
 * "123msm117ms" looked like stray timing notes and was removed. */
add<<<numBlock, blockSize>>>(N, x, y);
//
7. Wait for the GPU to finish
/* 7. Block the host until all previously launched GPU work has completed.
 * This also surfaces any asynchronous kernel execution errors. */
cudaDeviceSynchronize();
8. Copy results back from the GPU
/* 8. Copy the result buffer back from device to host.
 * NOTE(review): the original copied sizeof(FLOAT) bytes from `dy` into
 * `as`, but neither `dy` nor `as` is declared or allocated anywhere in
 * this file (it looks like a fragment of a different reduction example).
 * Made it consistent with the dx/hx buffers used by the rest of this
 * example, copying the full buffer. */
cudaMemcpy(hx, dx, nbytes, cudaMemcpyDeviceToHost);
9. Free memory
/* 9. Release memory.
 * The original freed `dy`, which is never allocated in this file; the
 * device buffer allocated in step 1 is `dx`. */
cudaFree(dx);
/* hx was allocated with cudaMallocHost (pinned memory) in step 2, which
 * must be released with cudaFreeHost, not free(). If the plain-malloc
 * path from step 4 was used instead, pair it with free(hx). */
cudaFreeHost(hx);
10. Kernel function
// Element-wise vector addition: y[i] = x[i] + y[i] for every i in [0, n).
// Written as a grid-stride loop, so the result is correct for any
// grid/block configuration, including a single-block debug launch.
__global__ void add(int n, float *x, float *y)
{
    int step = blockDim.x * gridDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += step)
        y[i] += x[i];
}