在CC3.0以上,支持了shuffle指令,允许thread直接读其他thread的寄存器值,只要两个thread在 同一个warp中,这种比通过shared Memory进行thread间的通讯效果更好,latency更低,同时也不消耗额外的内存资源来执行数据交换。
这里介绍warp中的一个概念lane,一个lane就是一个warp中的一个thread,每个lane在同一个warp中由lane索引唯一确定,因此其范围为[0,31]。在一个一维的block中,可以通过下面两个公式计算索引:
laneID = threadIdx.x % 32
warpID = threadIdx.x / 32
例如,在同一个block中的thread1和33拥有相同的lane索引1。
Variants of the Warp Shuffle Instruction
有两种设置shuffle的指令:一种针对整型变量,另一种针对浮点型变量。每种设置都包含四种shuffle指令变量。为了交换整型变量,使用如下函数:
参考书籍:《cuda专家手册|GPU编程权威》
1. __shfl_xor
首先介绍__shfl_xor,因为最先用到它。
__shfl_xor(var,laneMask):Copy from a lane based on bitwise XOR of own lane ID
意思就是把当前线程的lane id与laneMask做异或运算,其结果作为源线程号,把该线程号的var值取出来。
演示图:
举例:
tid =0
laneMask =16
tid xor laneMask(00000 xor 10000)=10000=16
所以取到的值为16号线程的var
那我们看下完整测试代码:
// Butterfly exchange demo: each of the 32 lanes loads one element of B,
// swaps it with the lane whose ID differs by XOR 8, and stores it to A.
// Expected launch: <<<1, 32>>> (exactly one warp); requires SM30+.
__global__ void test_shfl_xor(int A[], int B[])
{
    int tid = threadIdx.x;
    int best = B[tid];
#if CUDA_VERSION >= 9000
    // Legacy mask-less __shfl_xor was removed for Volta+; the _sync form
    // takes an explicit participation mask (full warp here).
    best = __shfl_xor_sync(0xffffffffu, best, 8);
#else
    best = __shfl_xor(best, 8);
#endif
    A[tid] = best;
}
// Host driver: fills B with 32 random values, runs the shuffle kernel on one
// warp, and prints input and output so the lane exchange can be inspected.
int main()
{
    int *A, *Ad, *B, *Bd;
    const int n = 32;                 // one full warp of data
    const int size = n * sizeof(int);

    // Allocate host buffers and fill the input with random values in [0, 100].
    A = (int*)malloc(size);
    B = (int*)malloc(size);
    for (int i = 0; i < n; i++)
    {
        B[i] = rand() % 101;
        std::cout << B[i] << std::endl;
    }
    std::cout << "----------------------------" << std::endl;

    // Allocate device buffers and upload the input.
    cudaMalloc((void**)&Ad, size);
    cudaMalloc((void**)&Bd, size);
    cudaMemcpy(Bd, B, size, cudaMemcpyHostToDevice);

    // Launch one block of 32 threads (a single warp).
    // NOTE: the original called test__shfl_xor, which is not defined in this
    // program — the kernel above is named test_shfl_xor.
    test_shfl_xor<<<1, 32>>>(Ad, Bd);
    cudaError_t err = cudaGetLastError();   // catch bad launch configuration
    if (err != cudaSuccess)
        std::cout << "launch failed: " << cudaGetErrorString(err) << std::endl;

    // Blocking copy: also synchronizes with the kernel before reading results.
    cudaMemcpy(A, Ad, size, cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; i++)
    {
        std::cout << A[i] << std::endl;
    }

    // Release host and device memory.
    free(A);
    cudaFree(Ad);
    free(B);
    cudaFree(Bd);
    return 0;
}
运行结果:
41 85 72 38 80 69 65 68 96 22 49 67 51 61 63 87 66 24 80 83 71 60 64 52 90 60 49 31 23 99 94 11
----------------------------
66 24 80 83 71 60 64 52 90 60 49 31 23 99 94 11 41 85 72 38 80 69 65 68 96 22 49 67 51 61 63 87
--------------------------------
再高阶的用法,求取线程束中最小值:
template <typename T, unsigned int GROUP_SIZE, unsigned int STEP>
struct subgroup_min_impl {
    // One butterfly step of a subgroup-min reduction: exchange with the lane
    // STEP/2 away (XOR pattern), keep the smaller value, then recurse with the
    // halved step. Recursion terminates at the STEP == 1 specialization.
    static __device__ T call(T x, uint32_t mask) {
#if CUDA_VERSION >= 9000
        const T partner = __shfl_xor_sync(mask, x, STEP / 2, GROUP_SIZE);
#else
        const T partner = __shfl_xor(x, STEP / 2, GROUP_SIZE);
#endif
        return subgroup_min_impl<T, GROUP_SIZE, STEP / 2>::call(min(x, partner), mask);
    }
};
template <typename T, unsigned int GROUP_SIZE>
struct subgroup_min_impl<T, GROUP_SIZE, 1u> {
    // Base case: a step of 1 means the reduction is complete, so the value is
    // returned unchanged (the mask is no longer needed).
    static __device__ T call(T value, uint32_t /*mask*/) {
        return value;
    }
};
template <unsigned int GROUP_SIZE, typename T>
__device__ inline T subgroup_min(T x, uint32_t mask) {
    // Entry point: start the recursive butterfly reduction with the step equal
    // to the group size; each level halves it until it reaches 1.
    return subgroup_min_impl<T, GROUP_SIZE, GROUP_SIZE>::call(x, mask);
}
// Warp-minimum demo: every lane loads one element of B, a butterfly reduction
// leaves the minimum of all 32 values in each lane, and the result goes to A.
// Expected launch: <<<1, 32>>> (exactly one warp).
__global__ void test__shfl_xor(int A[], int B[])
{
    const int lane = threadIdx.x;
    int v = B[lane];
    v = subgroup_min<32>(v, 0xffffffffu);   // full-warp participation mask
    A[lane] = v;
}
// Host driver for the warp-min demo: fills B with 32 random values, runs the
// reduction kernel on one warp, and prints input and output.
int main()
{
    int *A, *Ad, *B, *Bd;
    const int n = 32;                 // one full warp of data
    const int size = n * sizeof(int);

    // Allocate host buffers and fill the input with random values in [0, 100].
    A = (int*)malloc(size);
    B = (int*)malloc(size);
    for (int i = 0; i < n; i++)
    {
        B[i] = rand() % 101;
        std::cout << B[i] << std::endl;
    }
    std::cout << "----------------------------" << std::endl;

    // Allocate device buffers and upload the input.
    cudaMalloc((void**)&Ad, size);
    cudaMalloc((void**)&Bd, size);
    cudaMemcpy(Bd, B, size, cudaMemcpyHostToDevice);

    // Launch one block of 32 threads (a single warp).
    // NOTE: the original called test_shfl_xor, but the kernel defined in this
    // program is test__shfl_xor (double underscore).
    test__shfl_xor<<<1, 32>>>(Ad, Bd);
    cudaError_t err = cudaGetLastError();   // catch bad launch configuration
    if (err != cudaSuccess)
        std::cout << "launch failed: " << cudaGetErrorString(err) << std::endl;

    // Blocking copy: also synchronizes with the kernel before reading results.
    cudaMemcpy(A, Ad, size, cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; i++)
    {
        std::cout << A[i] << std::endl;
    }
    // (The original printed a "max error" that was never computed; removed.)

    // Release host and device memory.
    free(A);
    free(B);
    cudaFree(Ad);
    cudaFree(Bd);
    return 0;
}
运行结果
41 85 72 38 80 69 65 68 96 22 49 67 51 61 63 87 66 24 80 83 71 60 64 52 90 60 49 31 23 99 94 11
----------------------------
11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
__shfl_xor介绍完毕
--------------------------------------
2.__shfl()
Direct copy from indexed lane:复制lane id数据
__shfl(int var,int srclane,int width =32)
这个就是比较简单,咱们直接上代码:
// Broadcast demo: every lane reads one element of B, then all lanes copy the
// value held by lane 3, so A ends up with 32 copies of B[3].
// Expected launch: <<<1, 32>>> (exactly one warp); requires SM30+.
__global__ void test_shfl(int A[], int B[])
{
    int tid = threadIdx.x;
    int best = B[tid];
#if CUDA_VERSION >= 9000
    // Legacy mask-less __shfl was removed for Volta+; use the _sync form
    // with an explicit full-warp participation mask.
    best = __shfl_sync(0xffffffffu, best, 3);
#else
    best = __shfl(best, 3);
#endif
    A[tid] = best;
}
// Host driver for the broadcast demo: fills B with 32 random values, runs the
// __shfl kernel on one warp, and prints input and output.
int main()
{
    int *A, *Ad, *B, *Bd;
    const int n = 32;                 // one full warp of data
    const int size = n * sizeof(int);

    // Allocate host buffers and fill the input with random values in [0, 100].
    A = (int*)malloc(size);
    B = (int*)malloc(size);
    for (int i = 0; i < n; i++)
    {
        B[i] = rand() % 101;
        std::cout << B[i] << std::endl;
    }
    std::cout << "----------------------------" << std::endl;

    // Allocate device buffers and upload the input.
    cudaMalloc((void**)&Ad, size);
    cudaMalloc((void**)&Bd, size);
    cudaMemcpy(Bd, B, size, cudaMemcpyHostToDevice);

    // Launch one block of 32 threads (a single warp).
    test_shfl<<<1, 32>>>(Ad, Bd);
    cudaError_t err = cudaGetLastError();   // catch bad launch configuration
    if (err != cudaSuccess)
        std::cout << "launch failed: " << cudaGetErrorString(err) << std::endl;

    // Blocking copy: also synchronizes with the kernel before reading results.
    cudaMemcpy(A, Ad, size, cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; i++)
    {
        std::cout << A[i] << std::endl;
    }
    // (The original printed a "max error" that was never computed; removed.)

    // Release host and device memory.
    free(A);
    free(B);
    cudaFree(Ad);
    cudaFree(Bd);
    return 0;
}
按以上代码逻辑,取得数据全是第3号线程的数:
运行结果:
41 85 72 38 80 69 65 68 96 22 49 67 51 61 63 87 66 24 80 83 71 60 64 52 90 60 49 31 23 99 94 11
----------------------------
38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38 38
--------------------------------------------------------------------------------------------------------------------------------
3.__shfl_up()
__shfl_up(int var,unsigned int delta,int width =32):Copy from a lane with lower ID relative to caller
把线程号为tid-delta的线程的var复制给tid线程的var;如果tid-delta<0,var保持原来的值。
见代码:
// Shift-up demo: each lane receives the value from the lane 3 below it
// (lane i gets lane i-3's value); lanes 0-2 keep their own value.
// Expected launch: <<<1, 32>>> (exactly one warp); requires SM30+.
__global__ void test_shfl_up(int A[], int B[])
{
    int tid = threadIdx.x;
    int best = B[tid];
#if CUDA_VERSION >= 9000
    // Legacy mask-less __shfl_up was removed for Volta+; use the _sync form
    // with an explicit full-warp participation mask.
    best = __shfl_up_sync(0xffffffffu, best, 3);
#else
    best = __shfl_up(best, 3);
#endif
    A[tid] = best;
}
运行结果:
41 85 72 38 80 69 65 68 96 22 49 67 51 61 63 87 66 24 80 83 71 60 64 52 90 60 49 31 23 99 94 11
----------------------------
41 85 72 41 85 72 38 80 69 65 68 96 22 49 67 51 61 63 87 66 24 80 83 71 60 64 52 90 60 49 31 23
--------------------------------------------------------------------------------------
4.__shfl_down
__shfl_down(int var,unsigned int delta,int width =32)
把线程号为tid+delta的线程的var复制给tid线程的var;如果tid+delta>31,var保持原来的值。
测试代码:
// Shift-down demo: each lane receives the value from the lane 3 above it
// (lane i gets lane i+3's value); lanes 29-31 keep their own value.
// Expected launch: <<<1, 32>>> (exactly one warp); requires SM30+.
__global__ void test_shfl_down(int A[], int B[])
{
    int tid = threadIdx.x;
    int best = B[tid];
#if CUDA_VERSION >= 9000
    // Legacy mask-less __shfl_down was removed for Volta+; use the _sync form
    // with an explicit full-warp participation mask.
    best = __shfl_down_sync(0xffffffffu, best, 3);
#else
    best = __shfl_down(best, 3);
#endif
    A[tid] = best;
}
运行结果:
41 85 72 38 80 69 65 68 96 22 49 67 51 61 63 87 66 24 80 83 71 60 64 52 90 60 49 31 23 99 94 11
----------------------------
38 80 69 65 68 96 22 49 67 51 61 63 87 66 24 80 83 71 60 64 52 90 60 49 31 23 99 94 11 99 94 11