cuda之thread,block,grid详解

本文将通过一个程序帮助了解线程块的分配,以及线程束,线程全局标号等


[cpp]  view plain  copy
  1. #include<cuda_runtime.h>  
  2. #include<conio.h>  
  3. #include<stdio.h>  
  4. #include<stdlib.h>  
  5. #include<device_launch_parameters.h>  
  6.   
  7. #define ARRAY_SIZE 128  
  8. #define ARRAY_SIZE_IN_BYTES (sizeof(unsigned int)*(ARRAY_SIZE))  
  9.   
  10. __global__ void what_is_my_id(unsigned int *const block,  
  11.     unsigned int *const thread,  
  12.     unsigned int *const warp,  
  13.     unsigned int *const calc_thread)  
  14. {  
  15.     const unsigned int thread_idx = blockIdx.x*blockDim.x + threadIdx.x;  
  16.     block[thread_idx] = blockIdx.x;  
  17.     thread[thread_idx] = threadIdx.x;//内部线程的索引  
  18.     warp[thread_idx] = threadIdx.x / warpSize;  
  19.     calc_thread[thread_idx] = thread_idx;  
  20. }  
  21.   
  22. int main()  
  23. {  
  24.     /* 本地开辟4个数组存放我们要计算的内容 */  
  25.     unsigned int cpu_block[ARRAY_SIZE];  
  26.     unsigned int cpu_thread[ARRAY_SIZE];  
  27.     unsigned int cpu_warp[ARRAY_SIZE];  
  28.     unsigned int cpu_calc_thread[ARRAY_SIZE];  
  29.   
  30.     //设计线程数为2*64=128个线程  
  31.     const unsigned int num_blocks = 2;  
  32.     const unsigned int num_threads = 64;  
  33.   
  34.     /* 在GPU上分配同样大小的4个数组 */  
  35.     unsigned int * gpu_block;  
  36.     unsigned int * gpu_thread;  
  37.     unsigned int * gpu_warp;  
  38.     unsigned int * gpu_calc_thread;  
  39.   
  40.     cudaMalloc((void**)&gpu_block, ARRAY_SIZE_IN_BYTES);  
  41.     cudaMalloc((void**)&gpu_thread, ARRAY_SIZE_IN_BYTES);  
  42.     cudaMalloc((void**)&gpu_warp, ARRAY_SIZE_IN_BYTES);  
  43.     cudaMalloc((void**)&gpu_calc_thread, ARRAY_SIZE_IN_BYTES);  
  44.   
  45.     //执行内核函数  
  46.     what_is_my_id << <num_blocks, num_threads >> >(gpu_block, gpu_thread, gpu_warp, gpu_calc_thread);  
  47.   
  48.     //将GPU运算完的结果复制回本地  
  49.     cudaMemcpy(cpu_block, gpu_block, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost);  
  50.     cudaMemcpy(cpu_thread, gpu_thread, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost);  
  51.     cudaMemcpy(cpu_warp, gpu_warp, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost);  
  52.     cudaMemcpy(cpu_calc_thread, gpu_calc_thread, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost);  
  53.   
  54.     cudaFree(gpu_block);  
  55.     cudaFree(gpu_thread);  
  56.     cudaFree(gpu_warp);  
  57.     cudaFree(gpu_calc_thread);  
  58.   
  59.     //输出  
  60.     for (unsigned int i = 0; i < ARRAY_SIZE; i++)  
  61.     {  
  62.         printf("总线程数%3u-Blocks:%2u-Warp%2u-内部线程数%3u\n",  
  63.             cpu_calc_thread[i], cpu_block[i], cpu_warp[i], cpu_thread[i]);  
  64.     }  
  65.   
  66.     return 0;  
  67. }  

可以看到总线程标号为0~127,共有2个线程块,每个线程块包含64个线程,
  每个线程块内部线程的索引为0~63。一个线程块包含2个线程束(warp),
  1个warp包括32个线程。



[cpp]  view plain  copy
  1. #include<cuda_runtime.h>  
  2. #include<stdio.h>  
  3. #include<device_launch_parameters.h>  
  4.   
  5. #define ARRAY_SIZE_X 32  
  6. #define ARRAY_SIZE_Y 16  
  7. #define ARRAY_SIZE_IN_BYTES (sizeof(unsigned int)*(ARRAY_SIZE_X)*(ARRAY_SIZE_Y))  
  8.   
  9. __global__ void what_is_my_id_2d_A(unsigned int *const block_x,unsigned int *const block_y,  
  10.     unsigned int *const thread,unsigned int *const calc_thread,  
  11.     unsigned int *const x_thread, unsigned int *const y_thread,  
  12.     unsigned int *const gird_dimx, unsigned int *const block_dimx,  
  13.     unsigned int *const gird_dimy, unsigned int *const block_dimy)  
  14. {  
  15.     const unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;  
  16.     const unsigned int idy = blockIdx.y*blockDim.y + threadIdx.y;  
  17.     const unsigned int thread_idx =((gridDim.x*blockDim.x)*idy) + idx;  
  18.     block_x[thread_idx] = blockIdx.x;//X维度线程块索引  
  19.     block_y[thread_idx] = blockIdx.y;//Y维度线程块索引  
  20.     thread[thread_idx] = threadIdx.x;//1个线程块内部X维度线程索引  
  21.     calc_thread[thread_idx] = thread_idx;//总索引  
  22.     x_thread[thread_idx] = idx;  
  23.     y_thread[thread_idx] = idy;  
  24.     gird_dimx[thread_idx] = gridDim.x;//线程网格X维度上线程块数量  
  25.     block_dimx[thread_idx] = blockDim.x;//线程网格Y维度上线程数量  
  26.     gird_dimy[thread_idx] = gridDim.y;  
  27.     block_dimy[thread_idx] = blockDim.y;  
  28. }  
  29. int main()  
  30. {  
  31.     /* 本地开辟4个数组存放我们要计算的内容 */  
  32.     unsigned int cpu_block_x[ARRAY_SIZE_Y][ARRAY_SIZE_X];  
  33.     unsigned int cpu_block_y[ARRAY_SIZE_Y][ARRAY_SIZE_X];  
  34.     unsigned int cpu_thread[ARRAY_SIZE_Y][ARRAY_SIZE_X];  
  35.     unsigned int cpu_calc_thread[ARRAY_SIZE_Y][ARRAY_SIZE_X];  
  36.     unsigned int cpu_xthread[ARRAY_SIZE_Y][ARRAY_SIZE_X];     
  37.     unsigned int cpu_ythread[ARRAY_SIZE_Y][ARRAY_SIZE_X];  
  38.     unsigned int cpu_grid_dimx[ARRAY_SIZE_Y][ARRAY_SIZE_X];  
  39.     unsigned int cpu_block_dimx[ARRAY_SIZE_Y][ARRAY_SIZE_X];  
  40.     unsigned int cpu_grid_dimy[ARRAY_SIZE_Y][ARRAY_SIZE_X];  
  41.     unsigned int cpu_block_dimy[ARRAY_SIZE_Y][ARRAY_SIZE_X];  
  42.   
  43.     //设计线程数为2*64=128个线程  
  44.     const dim3 threads_rect(32, 4);  
  45.     const dim3 blocks_rect(1, 4);  
  46.   
  47.     const dim3 threads_square(16, 8);  
  48.     const dim3 blocks_square(2,2);  
  49.   
  50.     /* 在GPU上分配同样大小的4个数组 */  
  51.     unsigned int * gpu_block_x;  
  52.     unsigned int * gpu_block_y;  
  53.     unsigned int * gpu_thread;  
  54.     unsigned int * gpu_warp;  
  55.     unsigned int * gpu_calc_thread;  
  56.     unsigned int * gpu_xthread;  
  57.     unsigned int * gpu_ythread;  
  58.     unsigned int * gpu_grid_dimx;  
  59.     unsigned int * gpu_block_dimx;  
  60.     unsigned int * gpu_grid_dimy;  
  61.     unsigned int * gpu_block_dimy;  
  62.   
  63.   
  64.     cudaMalloc((void**)&gpu_block_x, ARRAY_SIZE_IN_BYTES);  
  65.     cudaMalloc((void**)&gpu_block_y, ARRAY_SIZE_IN_BYTES);  
  66.     cudaMalloc((void**)&gpu_thread, ARRAY_SIZE_IN_BYTES);  
  67.     cudaMalloc((void**)&gpu_calc_thread, ARRAY_SIZE_IN_BYTES);  
  68.     cudaMalloc((void**)&gpu_xthread, ARRAY_SIZE_IN_BYTES);  
  69.     cudaMalloc((void**)&gpu_ythread, ARRAY_SIZE_IN_BYTES);  
  70.     cudaMalloc((void**)&gpu_grid_dimx, ARRAY_SIZE_IN_BYTES);  
  71.     cudaMalloc((void**)&gpu_block_dimx, ARRAY_SIZE_IN_BYTES);  
  72.     cudaMalloc((void**)&gpu_grid_dimy, ARRAY_SIZE_IN_BYTES);  
  73.     cudaMalloc((void**)&gpu_block_dimy, ARRAY_SIZE_IN_BYTES);  
  74.   
  75.           //执行条纹式布局左边的图  
  76. //   what_is_my_id_2d_A << <blocks_rect, threads_rect >> >   ( gpu_block_x, gpu_block_y, gpu_thread,gpu_calc_thread,gpu_xthread,gpu_ythread,gpu_grid_dimx,gpu_block_dimx,gpu_grid_dimy,gpu_block_dimy);  
  77.   
  78.          //执行方块式布局右边的图  
  79.      what_is_my_id_2d_A << <blocks_square, threads_square >> >(gpu_block_x, gpu_block_y, gpu_thread,gpu_calc_thread, gpu_xthread,gpu_ythread,gpu_grid_dimx,gpu_block_dimx,gpu_grid_dimy, gpu_block_dimy);  
  80.           
  81.   
  82.         //将GPU运算完的结果复制回本地  
  83.         cudaMemcpy(cpu_block_x, gpu_block_x, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost);  
  84.         cudaMemcpy(cpu_block_y, gpu_block_y, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost);  
  85.         cudaMemcpy(cpu_thread, gpu_thread, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost);  
  86.         cudaMemcpy(cpu_calc_thread, gpu_calc_thread, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost);  
  87.         cudaMemcpy(cpu_xthread, gpu_xthread, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost);  
  88.         cudaMemcpy(cpu_ythread, gpu_ythread, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost);  
  89.         cudaMemcpy(cpu_grid_dimx, gpu_grid_dimx, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost);  
  90.         cudaMemcpy(cpu_block_dimx, gpu_block_dimx, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost);  
  91.         cudaMemcpy(cpu_grid_dimy, gpu_grid_dimy, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost);  
  92.         cudaMemcpy(cpu_block_dimy, gpu_block_dimy, ARRAY_SIZE_IN_BYTES, cudaMemcpyDeviceToHost);  
  93.   
  94. //      printf("\nKernel &d\n", kernel);  
  95.   
  96.         for (int y = 0; y < ARRAY_SIZE_Y; y++)  
  97.         {  
  98.             for (int x = 0; x < ARRAY_SIZE_X; x++)  
  99.             {  
  100.                 printf("总%3u X维度block索引:%1u Y维度block索引:%1u TID:%2u YTID:%2u XTID:%2uGridX维度上block数量%1u BDX:%1u GridY维度上block数量%1u blockY维度线程数量%1u\n",  
  101.                     cpu_calc_thread[y][x], cpu_block_x[y][x], cpu_block_y[y][x], cpu_thread[y][x], cpu_ythread[y][x], cpu_xthread[y][x],  
  102.                     cpu_grid_dimx[y][x], cpu_block_dimx[y][x],cpu_grid_dimy[y][x], cpu_block_dimy[y][x]);  
  103.             }  
  104.   
  105.         }  
  106. //  }  
  107.     cudaFree(gpu_block_x);  
  108.     cudaFree(gpu_block_y);  
  109.     cudaFree(gpu_thread);  
  110.     cudaFree(gpu_calc_thread);  
  111.     cudaFree(gpu_xthread);  
  112.     cudaFree(gpu_ythread);  
  113.     cudaFree(gpu_grid_dimx);  
  114.     cudaFree(gpu_block_dimx);  
  115.     cudaFree(gpu_grid_dimy);  
  116.     cudaFree(gpu_block_dimy);  
  117.     return 0;  
  118. }  


  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值