目录
共享内存和线程同步计算
功能:对于长度为 10 的数组,使用 10 个线程(每个线程对应一个元素),先将数据载入共享内存并进行线程同步,再由各线程计算当前元素之前所有元素的平均值。
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
__global__ void gpu_shared_memory(float *d_a)
{
// Defining local variables which are private to each thread
int i, index = threadIdx.x;
float average, sum = 0.0f;
//Define shared memory
__shared__ float sh_arr[10];
sh_arr[index] = d_a[index];
__syncthreads()