cuda Toolkit 10.2 + VS2015 C++ cuda
GPU运算步骤
包含头文件
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
1. GPU 内存申请
- cudaError_t cudaStatus = cudaMalloc(void **p, size_t s)
2. 内存拷贝 host memory -> Gpu Buffer
-
cudaStatus = cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) //cudaMemcpyHostToDevice
enum __device_builtin__ cudaMemcpyKind { cudaMemcpyHostToHost = 0, /**< Host -> Host */ cudaMemcpyHostToDevice = 1, /**< Host -> Device */ cudaMemcpyDeviceToHost = 2, /**< Device -> Host */ cudaMemcpyDeviceToDevice = 3, /**< Device -> Device */ cudaMemcpyDefault = 4 /**< Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing */ };
3. 调用 kernel 方法
-
// Launch configuration is <<<blocks per grid, threads per block>>>:
// a single block of `size` threads, so blockIdx.x is 0 for every thread and
// threadIdx.x runs over 0 .. size-1 (one thread per array element).
// The three arguments are the pointers handed to the kernel as c, a and b.
addKernel <<<1, size >>>(dev_c, dev_a, dev_b);
-
/**
 * Element-wise integer vector addition: c[i] = a[i] + b[i].
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element.
 * The kernel receives no element count and performs no bounds check, so the
 * launch must supply exactly as many threads (gridDim.x * blockDim.x) as the
 * arrays have elements.
 *
 * @param c  output array; element i receives a[i] + b[i]
 * @param a  first input array
 * @param b  second input array
 */
__global__ void addKernel(int *c, const int *a, const int *b)
{
    // Flat global index. The block number has to be scaled by the block
    // size: a plain blockIdx.x + threadIdx.x is only correct for a
    // single-block launch, and makes threads of neighbouring blocks collide
    // on the same element as soon as the grid has more than one block.
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Device-side printf output is serialized and slow; debugging aid only.
    printf("blockIdx.x %d threadIdx.x %d \n", blockIdx.x, threadIdx.x);
    printf("i == %d \n", i);

    c[i] = a[i] + b[i];
}
4. 内存拷贝 Gpu Buffer->Host memory
- cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind); // cudaMemcpyDeviceToHost
5. 设备重置
- cudaStatus = cudaDeviceReset();