复习
CUDA的抽象层次:内存结构,线程结构,障碍同步。
线程管理
CUDA线程管理是通过一个两层的线程层次结构完成的:一次核函数启动对应一个网格(grid),网格由若干线程块(block)组成,每个线程块又包含若干线程(thread)。通过以下代码可以看到这个结构是如何运行的:
#include<cuda_runtime.h>
#include<stdio.h>
#include<iostream>
using namespace std;
// Debug kernel: each thread prints its thread index, its block index and the
// launch dimensions using device-side printf. Takes no arguments.
// NOTE(review): the "threadDim" label is fed blockDim.{x,y,z} and the
// "blockDim" label is fed gridDim.{x,y,z}. The format string is kept as-is so
// the printed output does not change.
__global__ void checkIndex(void){
    printf("threadIdx: %d %d %d, blockIdx: %d %d %d,threadDim: %d %d %d, blockDim: %d %d %d\n",
           threadIdx.x, threadIdx.y, threadIdx.z,   // position of this thread inside its block
           blockIdx.x,  blockIdx.y,  blockIdx.z,    // position of this block inside the grid
           blockDim.x,  blockDim.y,  blockDim.z,    // threads per block (printed as "threadDim")
           gridDim.x,   gridDim.y,   gridDim.z);    // blocks per grid (printed as "blockDim")
}
// Launches checkIndex on a 1-D grid of 4 blocks with 8 threads each, waits for
// it to finish, then resets the device.
// Returns 0 on success, 1 if the launch, the kernel execution or the reset
// reports a CUDA error (the error string is written to stderr).
int main(){
    checkIndex<<<4,8>>>();

    // A kernel launch has no return value; configuration errors must be
    // fetched explicitly.
    cudaError_t status = cudaGetLastError();
    if(status != cudaSuccess){
        fprintf(stderr,"checkIndex launch failed: %s\n",cudaGetErrorString(status));
        return 1;
    }

    // The launch is asynchronous: block until the kernel has finished so that
    // execution errors are reported here and the device printf buffer is flushed.
    status = cudaDeviceSynchronize();
    if(status != cudaSuccess){
        fprintf(stderr,"checkIndex execution failed: %s\n",cudaGetErrorString(status));
        return 1;
    }

    status = cudaDeviceReset();
    if(status != cudaSuccess){
        fprintf(stderr,"cudaDeviceReset failed: %s\n",cudaGetErrorString(status));
        return 1;
    }
    return 0;
}
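输出如下(注意:代码中格式串的标签有误——"threadDim"后面打印的实际是 blockDim,即每个线程块的线程数 8 1 1;"blockDim"后面打印的实际是 gridDim,即网格中的线程块数 4 1 1。另外,各线程块的输出先后顺序并不固定):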
threadIdx: 0 0 0, blockIdx: 1 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 1 0 0, blockIdx: 1 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 2 0 0, blockIdx: 1 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 3 0 0, blockIdx: 1 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 4 0 0, blockIdx: 1 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 5 0 0, blockIdx: 1 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 6 0 0, blockIdx: 1 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 7 0 0, blockIdx: 1 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 0 0 0, blockIdx: 0 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 1 0 0, blockIdx: 0 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 2 0 0, blockIdx: 0 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 3 0 0, blockIdx: 0 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 4 0 0, blockIdx: 0 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 5 0 0, blockIdx: 0 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 6 0 0, blockIdx: 0 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 7 0 0, blockIdx: 0 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 0 0 0, blockIdx: 2 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 1 0 0, blockIdx: 2 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 2 0 0, blockIdx: 2 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 3 0 0, blockIdx: 2 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 4 0 0, blockIdx: 2 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 5 0 0, blockIdx: 2 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 6 0 0, blockIdx: 2 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 7 0 0, blockIdx: 2 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 0 0 0, blockIdx: 3 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 1 0 0, blockIdx: 3 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 2 0 0, blockIdx: 3 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 3 0 0, blockIdx: 3 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 4 0 0, blockIdx: 3 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 5 0 0, blockIdx: 3 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 6 0 0, blockIdx: 3 0 0,threadDim: 8 1 1, blockDim: 4 1 1
threadIdx: 7 0 0, blockIdx: 3 0 0,threadDim: 8 1 1, blockDim: 4 1 1
编写核函数
限定符 | 执行 | 调用 | 备注 |
---|---|---|---|
`__global__` | 在设备端执行 | 可以在主机端调用;在计算能力3.5及以上的设备上也可以从设备端调用(动态并行) | 返回类型必须为void |
`__device__` | 在设备端执行 | 只能在设备端调用 | |
`__host__` | 在主机端执行 | 只能在主机端调用 | 可以省略 |
CUDA核函数限制:
- 只能访问设备内存
- 必须拥有void返回类型
- 不支持可变数量的参数
- 不支持静态变量
- 表现出异步行为(核函数启动后控制权立即返回主机端,不等待核函数执行完成)