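// __shfl_up_sync(mask, var, delta): each lane reads `var` from the lane whose
// ID is `delta` lower than its own, so values move "up" toward higher lane IDs.
// Lanes whose ID is below `delta` have no source lane and keep their own value.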
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#define warpSize 32
// Warp-shuffle demo: every lane publishes a[lane] and receives the value
// published by the lane 4 positions below it, then stores the result in b[lane].
//
// a: input values, indexed by lane (0..31), so at least 32 elements are read.
// b: output values, indexed by lane (0..31), so at least 32 elements are written.
//
// The shuffle uses the full mask 0xffffffff, i.e. it names all 32 lanes of the
// warp as participants. Lanes 0..3 have no source lane 4 below them and
// therefore get their own value back.
__global__ void scan4(float* a, float* b) {
    // Position of this thread inside its warp (low 5 bits of threadIdx.x).
    const unsigned int lane = threadIdx.x % 32u;

    // Load this lane's element and exchange it with the lane at (lane - 4).
    const float shifted = __shfl_up_sync(0xffffffffu, a[lane], 4);

    b[lane] = shifted;
}
// Print a labelled vector of floats to stdout on a single line.
//
// desc:    label printed before the values (followed by " =").
// ptr_vec: pointer to the first of n floats to print; only read, never written.
// n:       number of elements to print; n == 0 prints just the label and a newline.
//
// The pointer parameters are const-qualified so that string literals and
// read-only buffers can be passed without a deprecated/ill-formed conversion.
void printVector(const char* desc, const float* ptr_vec, unsigned int n) {
    printf("%s =\n", desc);
    // Unsigned index matches the type of n and avoids a signed/unsigned comparison.
    for (unsigned int i = 0; i < n; ++i) {
        printf(" %5.2f ", ptr_vec[i]);
    }
    printf("\n");
}
// Terminate the program with a readable message if a CUDA runtime call failed.
// `what` names the operation so the failing step is identifiable in the output.
static void checkCudaOrExit(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver for the scan4 shuffle demo: fills two host vectors with
// 100, 101, ..., copies them to the device, runs scan4 on a single warp,
// copies the result back and prints the vectors before and after.
// Returns 0 on success; exits with EXIT_FAILURE if an allocation or any
// CUDA runtime call fails.
int main() {
    const unsigned int n = warpSize;          // one element per lane of a single warp
    const size_t bytes = n * sizeof(float);

    float* a_h = (float*)malloc(bytes);
    float* b_h = (float*)malloc(bytes);
    if (a_h == NULL || b_h == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);
        free(a_h);                            // free(NULL) is a no-op
        free(b_h);
        return EXIT_FAILURE;
    }

    // Both host vectors start with the same recognisable ramp so the effect
    // of the shuffle is easy to see in the printed output.
    for (unsigned int i = 0; i < n; ++i) {
        a_h[i] = 100.0f + (float)i;
        b_h[i] = 100.0f + (float)i;
    }
    printVector("a_h", a_h, n);
    printVector("b_h", b_h, n);

    float* a_d = NULL;
    float* b_d = NULL;
    checkCudaOrExit(cudaMalloc((void**)&a_d, bytes), "cudaMalloc(a_d)");
    checkCudaOrExit(cudaMalloc((void**)&b_d, bytes), "cudaMalloc(b_d)");
    checkCudaOrExit(cudaMemcpy(a_d, a_h, bytes, cudaMemcpyHostToDevice), "copy a_h -> a_d");
    checkCudaOrExit(cudaMemcpy(b_d, b_h, bytes, cudaMemcpyHostToDevice), "copy b_h -> b_d");

    // One block of exactly one warp: the kernel's shuffle mask names all 32 lanes.
    scan4<<< 1, n >>>(a_d, b_d);
    // A kernel launch returns no status itself; launch-configuration errors are
    // reported by cudaGetLastError(), execution errors by the following sync.
    checkCudaOrExit(cudaGetLastError(), "scan4 launch");
    checkCudaOrExit(cudaDeviceSynchronize(), "scan4 execution");

    checkCudaOrExit(cudaMemcpy(b_h, b_d, bytes, cudaMemcpyDeviceToHost), "copy b_d -> b_h");
    printVector("b_d", b_h, n);

    checkCudaOrExit(cudaFree(a_d), "cudaFree(a_d)");
    checkCudaOrExit(cudaFree(b_d), "cudaFree(b_d)");
    // The host buffers were previously leaked; release them before returning.
    free(a_h);
    free(b_h);
    return 0;
}