Block同步
WAIT_FOR_THE_FINAL_BLOCK
自定义
// WAIT_FOR_THE_FINAL_BLOCK: lets only the last-arriving thread block of the grid
// continue past this point; every thread of every other block returns from the
// enclosing function.
//
// Requirements at the expansion site:
//   - `d_sync_buffer` and `sync_buffer_id` must be in scope; the counter
//     d_sync_buffer[sync_buffer_id] is expected to be 0 before the first block
//     arrives (presumably a zero-initialised int array in global memory --
//     confirm against the allocation site).
//   - The enclosing function must return void (the macro executes a bare
//     `return;`).
//   - All threads of the block must reach the macro, because it contains
//     __syncthreads().
//
// The block that continues resets the counter to 0 so the same slot can be
// reused by a later launch.
#define WAIT_FOR_THE_FINAL_BLOCK \
do { \
    /* Order this thread's earlier global writes before the arrival count. */ \
    __threadfence(); \
    /* Every thread of the block must have fenced before the block is */ \
    /* counted as arrived, otherwise the final block may read stale data. */ \
    __syncthreads(); \
    __shared__ int value; \
    /* Exactly one thread per block (x == y == z == 0) takes a ticket. */ \
    if (threadIdx.x + threadIdx.y + threadIdx.z == 0) \
        value = 1 + atomicAdd(d_sync_buffer + sync_buffer_id, 1); \
    /* Publish the ticket to the whole block. */ \
    __syncthreads(); \
    /* Not the last block to arrive: the whole block leaves the function. */ \
    if (value < gridDim.z * gridDim.y * gridDim.x) return; \
    /* Last block: reset the counter for the next use of this slot. */ \
    if (threadIdx.x + threadIdx.y + threadIdx.z == 0) d_sync_buffer[sync_buffer_id] = 0; \
} while (false)
__threadfence()
__threadfence()
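是内存栅栏(memory fence),而不是同步屏障:它不会等待其他线程,只保证调用线程在栅栏之前对全局/共享内存的写入,对设备上的其他线程而言先于栅栏之后的写入可见。需要与原子计数器等标志配合使用,才能实现 CUDA 块间的通信。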
thread同步
块内线程同步:等待同一线程块内的所有线程都执行到该屏障处(而不是等待线程结束),并保证屏障之前对共享内存和全局内存的写入对块内所有线程可见。块内所有线程都必须执行到该调用,不能放在只有部分线程进入的分支中。
__syncthreads();