blocks和threads
// Element-wise integer vector addition kernel (runs on the GPU).
//
// Each thread handles exactly one element: it reads a[k] and b[k] and
// stores their sum in c[k], where k is the thread's global index.
//
// There is no bounds check, so the launch must supply exactly as many
// threads (gridDim.x * blockDim.x) as there are elements in a, b and c.
__global__ void add(int *a, int *b, int *c)
{
    // Global index = offset of this thread inside its block (threadIdx.x)
    // plus the number of threads in all preceding blocks
    // (blockIdx.x blocks of blockDim.x threads each).
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    // One addition per thread.
    const int sum = a[tid] + b[tid];
    c[tid] = sum;
}
// Number of elements processed by the vector-add example (1024 * 1024).
#define N (1024*1024)
// Threads launched per block; also the divisor used to derive the block count.
#define THREADS_PER_BLOCK 256
// Host driver for the add() kernel.
//
// Allocates host and device buffers for N ints, fills the inputs with known
// values, runs add() with one thread per element, copies the result back and
// verifies it on the host.
//
// Returns 0 when every step succeeded and the result is correct, 1 otherwise.
int main(void) {
    // add() has no bounds check, so the grid must cover exactly N elements.
    if (N % THREADS_PER_BLOCK != 0) {
        return 1;
    }

    const size_t size = (size_t)N * sizeof(int);

    // Host copies of a, b, c.
    int *a = (int *)malloc(size);
    int *b = (int *)malloc(size);
    int *c = (int *)malloc(size);

    // Device copies of a, b, c. Kept NULL until allocated so the cleanup
    // below is safe on every path (cudaFree(NULL) is a no-op).
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;

    bool ok = (a != NULL) && (b != NULL) && (c != NULL);

    // Set up deterministic input values; the largest sum is about 3 * N,
    // which fits comfortably in an int.
    if (ok) {
        for (int i = 0; i < N; ++i) {
            a[i] = i;
            b[i] = 2 * i;
        }
    }

    // Alloc space for device copies of a, b, c.
    ok = ok && cudaMalloc((void **)&d_a, size) == cudaSuccess;
    ok = ok && cudaMalloc((void **)&d_b, size) == cudaSuccess;
    ok = ok && cudaMalloc((void **)&d_c, size) == cudaSuccess;

    // Copy inputs to device.
    ok = ok && cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice) == cudaSuccess;
    ok = ok && cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice) == cudaSuccess;

    // Launch add() kernel on GPU: N / THREADS_PER_BLOCK blocks of
    // THREADS_PER_BLOCK threads, i.e. exactly one thread per element.
    if (ok) {
        add<<<N/THREADS_PER_BLOCK,THREADS_PER_BLOCK>>>(d_a, d_b, d_c);
        // A kernel launch returns nothing; configuration errors are
        // reported through cudaGetLastError().
        ok = cudaGetLastError() == cudaSuccess;
    }

    // Copy result back to host. This blocking copy also waits for the kernel
    // and reports any error raised while it was executing.
    ok = ok && cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost) == cudaSuccess;

    // Check the GPU result against the same sum computed on the host.
    if (ok) {
        for (int i = 0; i < N; ++i) {
            if (c[i] != a[i] + b[i]) {
                ok = false;
                break;
            }
        }
    }

    // Cleanup (free(NULL) and cudaFree(NULL) are both harmless).
    free(a); free(b); free(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);

    return ok ? 0 : 1;
}
这个例子中,待相加的元素个数为 N = 1024×1024 = 1048576。因为每个 block 中有 256 个 thread,所以所需 block 的数量为 N / 256 = 4096。
线程层次结构(维数)
thread和block都支持三维的,类型使用dim3来表示
比如下面示例,用二维threads
// Element-wise addition of two N x N float matrices: C = A + B.
// Launched with a 2-D thread layout; each thread writes one element of C.
// This version has no bounds check, so the launch must not create threads
// whose (i, j) falls outside the N x N range.
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
    // blockIdx and threadIdx are 2-dimensional here; combine them into the
    // 2-D matrix subscripts handled by this thread.
    const int j = threadIdx.y + blockIdx.y * blockDim.y;
    const int i = threadIdx.x + blockIdx.x * blockDim.x;

    const float sum = A[i][j] + B[i][j];
    C[i][j] = sum;
}
// Host-side launch sketch for MatAdd. The "..." lines stand for setup and
// teardown code that this excerpt leaves out.
int main()
{ ...
// Kernel invocation with one block of N * N * 1 threads
// (grid size 1, block shape N x N), i.e. one thread per matrix element.
// NOTE(review): a single block is capped by the device's threads-per-block
// limit (1024 on current GPUs per the CUDA docs), so this form presumably
// only works for small N -- confirm the intended value of N.
dim3 threadsPerBlock(N, N);
MatAdd<<<1, threadsPerBlock>>>(A, B, C);
...
}
二维block 16x16=256个线程是一个非常普遍的选择。
二维线程索引计算
https://blog.csdn.net/sdauzxl/article/details/49997857?fps=1&locationNum=12
二维线程网格模型下计算当前线程索引:
idx = (blockIdx.x * blockDim.x) + threadIdx.x;
idy = (blockIdx.y * blockDim.y) + threadIdx.y;
tid = idx + idy * blockDim.x * gridDim.x;
解释:
idx 得到的是当前线程在整个线程网格中X维度的索引
idy 得到的是当前线程在整个线程网格中Y维度的索引
blockDim.x * gridDim.x计算得到线程网格中一行(x维度)的线程数量
idy * blockDim.x * gridDim.x计算得到线程网格中前idy行(x维度)的线程总数
tid为得到的当前线程的全局一维索引(这里不用threadIdx命名,以免与CUDA内置变量threadIdx混淆)
数据边界
block 覆盖的线程范围可能超出数据边界,比如我们分配的线程数大于实际要计算的矩阵中的元素数量。
这时 C[i][j] = A[i][j] + B[i][j];
的下标就会越界(越界访问显存属于未定义行为,不一定会报错)。我们可以加限制条件
if (i < N && j < N)
这样多余的线程就什么都不用做了。
// Element-wise addition of two N x N float matrices: C = A + B.
// Launched with a 2-D thread layout; each thread handles at most one
// element, and threads that fall outside the matrix do nothing.
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
    // blockIdx and threadIdx are 2-dimensional here; combine them into the
    // 2-D matrix subscripts handled by this thread.
    const int j = threadIdx.y + blockIdx.y * blockDim.y;
    const int i = threadIdx.x + blockIdx.x * blockDim.x;

    // Skip threads of a block that extends past the input data boundary.
    if (i < N && j < N) {
        const float sum = A[i][j] + B[i][j];
        C[i][j] = sum;
    }
}
// Host-side launch sketch for the bounds-checked MatAdd. The "..." lines
// stand for setup and teardown code that this excerpt leaves out.
int main()
{ ...
// Kernel invocation with 16 x 16 = 256 threads per block.
// The grid size is rounded up so the blocks cover all N x N elements even
// when N is not a multiple of 16; the surplus threads in the edge blocks are
// filtered out by the (i < N && j < N) check inside MatAdd.
dim3 threadsPerBlock(16, 16);
dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
               (N + threadsPerBlock.y - 1) / threadsPerBlock.y);
MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
...
}