实验一、
T __shfl_sync(unsigned mask, T var, int srcLane, int width=warpSize);
mask 是参与的线程掩码,如 0xffffffff;var 是待广播的值;srcLane 是提供该值的源线程的 laneId;返回值是从 srcLane 读到的 var
__shfl_sync(0xffffffff, value, 2)
Broadcast of a single value across a warp
Synchronize all threads in warp, and get “value” from lane 2
#include <stdio.h>
// Broadcasts `arg` from lane 2 to every lane of the warp and prints the lane
// ID of any lane whose received value differs from `arg`.
//
// Launch layout: only threadIdx.x is used, and the shuffle names all 32 lanes
// in its mask (0xffffffff), so each warp is expected to be fully populated.
// Prints nothing when the broadcast succeeds.
__global__ void bcast(int arg)
{
    // Lane index of this thread inside its warp (low 5 bits of threadIdx.x).
    int laneId = threadIdx.x & 0x1f;

    // Only lane 2 holds the real value. Every other lane starts with ~arg,
    // which can never equal arg, so the check below passes only if the
    // shuffle actually delivered lane 2's value. This also avoids handing an
    // indeterminate variable to the intrinsic.
    int value = (laneId == 2) ? arg : ~arg;

    // Every lane named in the mask reads `value` from lane 2.
    value = __shfl_sync(0xffffffff, value, 2);

    if (value != arg)
    {
        // One line per failing lane so consecutive lane numbers cannot run
        // together into an ambiguous digit string.
        printf("%d\n", laneId);
    }
}
// Launches bcast on one block of 32 threads with the value 1234 and waits for
// it to finish.
// Returns 0 on success, 1 if the launch or the kernel execution reports a
// CUDA error.
int main() {
    bcast<<< 1, 32 >>>(1234);

    // A kernel launch has no return value; launch-configuration errors are
    // only visible through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Blocks until the kernel has finished and reports errors raised
        // while it was running.
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}
没有输出,说明 laneId = 2 的线程已将 1234
成功广播到 warp 内的所有线程
实验二、
能实现前缀和(每 8 个线程为一组,各组独立计算),
组内第 n 个 thread(n = laneId & 7)的结果 = 该组前 n + 1 个 thread 的初始值之和
#include <stdio.h>
// Inclusive prefix sum, computed independently inside each group of 8
// consecutive lanes of a warp.
//
// Each lane starts with 31 - laneId. After the loop it holds the sum of the
// starting values of every lane from the beginning of its 8-lane group up to
// and including itself. Each thread prints its own result.
// The shuffle mask names all 32 lanes, so the warp is expected to be full.
__global__ void scan4() {
    const int lane = threadIdx.x % 32;   // position inside the warp
    const int laneInGroup = lane % 8;    // position inside the 8-lane group
    int sum = 31 - lane;

    // log2(8) == 3 rounds, pulling from 1, 2 and then 4 lanes below.
    for (int offset = 1; offset < 8; offset <<= 1) {
        // All lanes execute the shuffle; width 8 keeps it inside the group.
        const int fromBelow = __shfl_up_sync(0xffffffff, sum, offset, 8);
        // Lanes closer than `offset` to the group start have no source lane
        // for this round and must leave their sum unchanged.
        sum += (laneInGroup >= offset) ? fromBelow : 0;
    }

    printf("Thread %d final value = %d\n", threadIdx.x, sum);
}
// Launches scan4 on one block of 32 threads and waits for it to finish.
// Returns 0 on success, 1 if the launch or the kernel execution reports a
// CUDA error.
int main() {
    scan4<<< 1, 32 >>>();

    // A kernel launch has no return value; launch-configuration errors are
    // only visible through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Blocks until the kernel has finished and reports errors raised
        // while it was running.
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}
实验三、
Reduction across a warp
归约
#include <stdio.h>
// Sums one value per lane across the whole warp with an XOR (butterfly)
// exchange pattern.
//
// Each lane starts with 31 - laneId. After the five exchange rounds every
// lane holds the same total, which each thread then prints.
// The shuffle mask names all 32 lanes, so the warp is expected to be full.
__global__ void warpReduce() {
    const int lane = threadIdx.x % 32;   // position inside the warp
    int total = 31 - lane;

    // Exchange with the lane whose ID differs in one bit, halving the
    // distance each round: 16, 8, 4, 2, 1.
    int stride = 16;
    while (stride > 0) {
        const int partner = __shfl_xor_sync(0xffffffff, total, stride, 32);
        total = total + partner;
        stride >>= 1;
    }

    printf("Thread %d final value = %d\n", threadIdx.x, total);
}
// Launches warpReduce on one block of 32 threads and waits for it to finish.
// Returns 0 on success, 1 if the launch or the kernel execution reports a
// CUDA error.
int main() {
    warpReduce<<< 1, 32 >>>();

    // A kernel launch has no return value; launch-configuration errors are
    // only visible through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Blocks until the kernel has finished and reports errors raised
        // while it was running.
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}