// ===== 设备信息 (Device info) =====
// Query and print basic properties of every CUDA device on this system.
// Units: divides by 1000-based factors to match the printed GB/KB labels.
void getDeviceInfo(){
    int count = 0;
    cudaDeviceProp prop;
    // Check the return code: a sticky earlier error would make every later call fail.
    if (cudaGetDeviceCount(&count) != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed!\n");
        return;
    }
    for (int i = 0; i < count; i++) {
        if (cudaGetDeviceProperties(&prop, i) != cudaSuccess) {
            fprintf(stderr, "cudaGetDeviceProperties failed for device %d!\n", i);
            continue;
        }
        printf("显卡名称: %s \n", prop.name);
        printf("显卡版本号: %d.%d \n", prop.major, prop.minor);
        // totalGlobalMem and sharedMemPerBlock are size_t: %zu is the correct
        // specifier (%lld with size_t is undefined behavior on LP64 platforms).
        printf("全局总内存: %zu GB\n", prop.totalGlobalMem / (1000ULL * 1000 * 1000));
        printf("线程块最大共享内存: %zu KB \n", prop.sharedMemPerBlock / 1000);
        printf("线程块(Block)最大线程数量: %d \n", prop.maxThreadsPerBlock);
        printf("多维线程块中每一维最大线程数量: %d %d %d \n",
               prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf("一个Grid中每一维最大Block数量: %d %d %d \n",
               prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
        printf("最大并行处理器数量数量: %d \n",
               prop.multiProcessorCount);
    }
}
// ===== 申请空间 (Allocate device memory) =====
// Allocate arr_size elements on the device.
// NOTE(review): the memcpy fragments below size this same buffer as
// arr_size * sizeof(arr_type); this allocation only matches if
// arr_type == int — confirm, otherwise the byte counts disagree.
int *data_arr_on_dev;
cudaError_t cudaStatus;
// sizeof(*data_arr_on_dev) keeps the element size tied to the pointer's
// type, so the two cannot drift apart if the element type changes.
cudaStatus = cudaMalloc((void**)&data_arr_on_dev, arr_size * sizeof(*data_arr_on_dev));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto err;  // shared cleanup label defined elsewhere in the file
}
// ===== 内存复制 (Memory copy) =====
// 主机向设备 (Host to device)
// Copy arr_size elements from host memory to the device buffer.
// cudaMemcpy is blocking: it returns only after the transfer completes.
cudaStatus = cudaMemcpy(data_arr_on_dev, data_arr_on_host,
arr_size * sizeof(arr_type),
cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto err;
}
// 设备向主机 (Device to host)
// Copy arr_size elements back from the device buffer to host memory.
// Blocking as well, so it also surfaces errors from earlier async kernel work.
cudaStatus = cudaMemcpy(data_arr_on_host, data_arr_on_dev,
arr_size * sizeof(arr_type),
cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto err;
}
// ===== 核函数调用 (Kernel launch) =====
// Launch blocksPerGrid blocks of threadsPerBlock threads each.
// Standard <<<...>>> spelling (the spaced "<< <" form is a nonstandard
// workaround some editors/compilers tolerate; avoid it).
kernel<<<blocksPerGrid, threadsPerBlock>>>(param_a, param_b);
// A launch returns no error code directly: bad configurations surface only
// through cudaGetLastError(), so check it immediately after every launch.
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    goto err;
}
// ===== 核函数定义 (Kernel definition) =====
// Demonstration kernel showing the built-in index/dimension variables,
// block-scoped shared memory, and block-wide synchronization.
// (The original body listed the builtins as bare expressions without
// semicolons, which does not compile; this version is valid C++.)
__global__ void kernel( int param_a, int param_b){
    __shared__ int shared_int;   // shared memory, visible only within this block

    int tid  = threadIdx.x;      // thread index inside the block
    int bdim = blockDim.x;       // number of threads per block
    int bid  = blockIdx.x;       // block index inside the grid
    int gdim = gridDim.x;        // number of blocks in the grid

    // One thread writes shared memory; everyone must pass the barrier before
    // reading. __syncthreads() is outside the branch so all threads reach it.
    if (tid == 0) shared_int = param_a + param_b;
    __syncthreads();             // block-wide barrier + shared-memory fence

    // Silence unused-variable warnings in this demonstration-only kernel.
    (void)shared_int; (void)bdim; (void)bid; (void)gdim;
}