0_Simple__simpleCooperativeGroups

协作组(Cooperative Groups),CUDA 9.0 引入的新特性,用于在线程块内外灵活地划分与同步线程组。

▶ 源代码:

复制代码
 1 #include <stdio.h>
 2 #include "cuda_runtime.h"
 3 #include "device_launch_parameters.h"
 4 #include <cooperative_groups.h>
 5 
 6 #define THREAD_PER_BLOCK 64
 7 
 8 using namespace cooperative_groups;                     // NOTE: pulls the cooperative groups API into scope
 9 
10 // Device-side sum reduction over group g. Shared memory *x must hold at least g.size() ints. NOTE: the halving loop assumes g.size() is a power of two.
11 __device__ int sumReduction(thread_group g, int *x, int val)
12 {
13     int lane = g.thread_rank(); // thread's rank within the cooperative group (its lane ID)
14 
15     for (int i = g.size() / 2; i > 0; i /= 2)
16     {
17         x[lane] = val;          // first iteration: publish the input; later iterations: publish the previous partial sum
18         g.sync();               // group barrier: all writes to x must land before anyone reads
19         if (lane < i)
20             val += x[lane + i]; // fold the upper half into this thread's local partial sum
21         g.sync();               // barrier again before the next iteration overwrites x
22     }
23     if (g.thread_rank() == 0)   // only thread 0 holds the complete sum after the loop
24         return val;
25     else
26         return -1;              // sentinel for all other threads; callers must ignore their return value
27 }
28 
29 // Demo kernel: reduce over the whole block, then over four 16-thread tiles.
30 // Launch with one block of THREAD_PER_BLOCK threads and THREAD_PER_BLOCK * sizeof(int) dynamic shared memory.
31 __global__ void cgkernel()
30 {
31     extern __shared__ int workspace[];
32 
33     thread_block group = this_thread_block();           // wrap all threads of this block into one cooperative group
34     int groupSize = group.size();                       // group size (number of threads)
35     int input = group.thread_rank();                    // thread's rank in the group, used as the reduction input
36     int output = sumReduction(group, workspace, input); // reduce, using the dynamic shared memory as workspace
37     int expectedOutput = (groupSize - 1)*groupSize / 2; // expected result: 0 + 1 + 2 +...+ 63 = 2016
38 
39     if (group.thread_rank() == 0)                       // thread 0 reports the result and announces the 4-tile phase
40     {
41         printf("\n\tSum of thread 0 ~ %d in group is %d (expected %d)\n", group.size() - 1, output, expectedOutput);
42         printf("\n\tNow creating %d groups, each of size 16 threads:\n", group.size() / 16);
43     }
44     group.sync();                                               // block-wide barrier before reusing workspace
45 
46     thread_block_tile<16> group16 = tiled_partition<16>(group);// split into tiles of 16 threads (tile size must be a power of two, at most 32)
47 
48     int offset = group.thread_rank() - group16.thread_rank();  // per-tile offset into the shared workspace
49                                                                // yields {0,..., 0 (x16), 16,..., 16 (x16), 32,..., 32 (x16), 48,..., 48 (x16)}
50     input = group16.thread_rank();                             // thread's rank within its tile, used as the new input
51     output = sumReduction(group16, workspace + offset, input); // reduce per tile; note the workspace offset
52     expectedOutput = 15 * 16 / 2;                              // expected result: 0 + 1 + 2 +...+ 15 = 120
53 
54     if (group16.thread_rank() == 0)                            // each tile's thread 0 reports its result
55         printf("\n\tSum of all ranks 0..15 in group16 is %d (expected %d)\n", output, expectedOutput);
56     // open question from the author: how to obtain the tile's own group index?
57     return;
58 }
59 
60 // Host entry point: launches the demo kernel and reports any CUDA error.
61 int main()
62 {
63     printf("\n\tStart with %d threads.\n", THREAD_PER_BLOCK);
64 
65     // one block of 64 threads; dynamic shared memory holds one int per thread
66     cgkernel << <1, THREAD_PER_BLOCK, THREAD_PER_BLOCK * sizeof(int) >> > ();
67     cudaError_t err = cudaGetLastError();               // catch launch-configuration errors
68     if (err == cudaSuccess)
69         err = cudaDeviceSynchronize();                  // wait for the kernel; catch asynchronous execution errors
70     if (err != cudaSuccess)
71         printf("\n\tCUDA error: %s\n", cudaGetErrorString(err));
72 
73     printf("\n\tFinish.\n");
74     getchar();
75     return 0;
76 }
复制代码

 

▶ 输出结果:

复制代码
    Start with 64 threads.

    Sum of thread 0 ~ 63 in group is 2016 (expected 2016)

    Now creating 4 groups, each of size 16 threads:

    Sum of all ranks 0..15 in group16 is 120 (expected 120)

    Sum of all ranks 0..15 in group16 is 120 (expected 120)

    Sum of all ranks 0..15 in group16 is 120 (expected 120)

    Sum of all ranks 0..15 in group16 is 120 (expected 120)

    Finish.
复制代码

 

▶ 涨姿势:

● 相关定义

复制代码
 1 // cooperative_groups_helper.h (excerpt quoted from the CUDA toolkit headers for reference)
 2 # if !defined(_CG_QUALIFIER)
 3 #  define _CG_QUALIFIER __forceinline__ __device__
 4 # endif
 5 
 6 # define die() assert(0);
 7 
 8 // cooperative_groups.h
 9 class thread_block : public thread_group
10 {
11     friend _CG_QUALIFIER thread_block this_thread_block();
12     friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
13     friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);
14 
15     _CG_QUALIFIER thread_block() : thread_group(__internal::ThreadBlock){}
16 
17     _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const
18     {
19         const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0); // bit trick: true iff tilesz is a power of two
20 
21         if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) // reject size 0, sizes above a warp, and non-powers-of-two
22         {
23             die();
24             return (thread_block());
25         }
26 
27         unsigned int mask;
28         unsigned int base_offset = thread_rank() & (~(tilesz - 1)); // rank rounded down to the start of this thread's tile
29         unsigned int masklength = min(size() - base_offset, tilesz); // tile may be truncated at the end of the block
30 
31         mask = (unsigned int)(-1) >> (32 - masklength); // low masklength bits set
32         mask <<= (__internal::laneid() & ~(tilesz - 1)); // shift the bits to this tile's lane range
33         thread_group tile = thread_group(__internal::CoalescedTile);
34         tile._data.coalesced.mask = mask;
35         tile._data.coalesced.size = __popc(mask); // tile size = number of set bits in the mask
36         return (tile);
37     }
38 
39     public:
40     _CG_QUALIFIER void sync() const { __internal::cta::sync(); }
41 
42     _CG_QUALIFIER unsigned int size() const { return (__internal::cta::size()); }
43 
44     _CG_QUALIFIER unsigned int thread_rank() const { return (__internal::cta::thread_rank()); }
45 
46     _CG_QUALIFIER dim3 group_index() const { return (__internal::cta::group_index()); }
47 
48     _CG_QUALIFIER dim3 thread_index() const { return (__internal::cta::thread_index()); }
49 };
50 
51 _CG_QUALIFIER thread_block this_thread_block()
52 {
53     return (thread_block());
54 }
55 
56 template <unsigned int Size>
57 class thread_block_tile;
58 template <> class thread_block_tile<32> : public __thread_block_tile_base<32> { };
59 template <> class thread_block_tile<16> : public __thread_block_tile_base<16> { };
60 template <> class thread_block_tile<8> : public __thread_block_tile_base<8> { };
61 template <> class thread_block_tile<4> : public __thread_block_tile_base<4> { };
62 template <> class thread_block_tile<2> : public __thread_block_tile_base<2> { };
63 template <> class thread_block_tile<1> : public __thread_block_tile_base<1> { };
复制代码

● 用到的线程协作相关函数

复制代码
1 thread_block threadBlockGroup = this_thread_block();    // wrap the current thread block as one cooperative group
2 
3 thread_block_tile<16> tiledPartition16 = tiled_partition<16>(threadBlockGroup); // partition the group into 16-thread tiles
4     
5 int in = tiledPartition16.thread_rank();                // thread's rank within its tile
6 
7 tiledPartition16.sync();                            // tile-wide barrier
复制代码
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值