CUDA学习（二）：并行扫描Scan

薪哥，很潇洒

已于 2023-11-29 21:20:28 修改

阅读量2.8k

点赞数 8

文章标签：学习 c++

于 2023-11-29 21:18:52 首次发布

本文链接：https://blog.csdn.net/qq_44161734/article/details/134620943

版权

本文记录CUDA并行优化学习过程，聚焦并行扫描算法。介绍了Scan算法的naive实现，包括数据初始化、CPU与GPU计算计时等。还阐述了Hillis Steele和Blelloch两种并行扫描算法，以及针对任意长度数据的扫描策略，最后提及后续将介绍矩阵乘法并行优化。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

CUDA学习（二）：并行扫描Scan

该系列文章主要用于记录自己学习cuda并行优化的整个学习过程，方便本人回顾复习，如果能给有所帮助，不甚荣幸，诸君共勉，振兴中华！

前言

扫描（Scan）通常也称之为前缀和，并行扫描算法在很多算法中都有起到非常重要的基础模块作用。并且扫描也包括了闭扫描和开扫描两种方式，其中闭扫描和开扫描的形式分别如下图所示：

在这里插入图片描述
这种扫描、规约类的算法在计算过程中往往非常依赖其他元素，因此在并行的过程中往往存在着较大的困难。本文主要介绍的是基于Hillis Steele算法以及Blelloch算法的并行扫描算法，并且在这些算法的基础上解决任意长度的并行扫描问题。

一、Scan算法的naive实现

1.1、初始化数据

同reduce算法，scan算法的主函数首先定义输入数据大小以及初始化输入数据，并将主机端数据拷贝到设备端并在设备端进行初始化。

    const int N = (1<<10);
    // 初始化主机上的数据
    float * h_in = (float *)malloc(N * sizeof(float));
    initialData(h_in, N);

    // 初始化设备上的数据
    float * d_in;
    cudaMalloc(&d_in, N * sizeof(float));
    cudaMemcpy(d_in, h_in, N * sizeof(float), cudaMemcpyHostToDevice);

1.2、CPU计算并计时

cpu上针对scan的串行实现包括开扫描和闭扫描两种方式，开扫描需要将数组的第一个元素赋值为0，而闭扫描的第一个元素则为数组的第一个元素。

开扫描的串行实现如下所示：

	// 保存cpu计算结果
    float * seq_result = (float *)malloc(N * sizeof(float));
    double cpu_start = cpuSecond();
    // 开扫描
    seq_result[0] = 0.0;
    for (int i = 1; i < N; i++)
    {
        seq_result[i] = h_in[i] + seq_result[i-1];
    }
    double cpu_time = (cpuSecond() - cpu_start) * 1000.0;

闭扫描的实现如下所示：

	float * seq_result = (float *)malloc(sizeof(float) * N);
    // 闭扫描
    double cpu_start = cpuSecond();
    seq_result[0] = h_in[0];
    for (int i = 1; i < N; i++)
    {
        seq_result[i] = h_in[i] + seq_result[i-1];
    }
    double cpu_time = (cpuSecond() - cpu_start) * 1000.0;

1.3、GPU计算并计时

GPU计算在主函数中主要通过event事件进行计时，并将设备上运算的结果拷贝到主机端。

	float * d_out;
    cudaMalloc(&d_out, sizeof(float) * N);

    float gpu_time = 0.0;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // 核函数计算
    scan2<<<ceil(N / BLOCK_SIZE), BLOCK_SIZE>>>(d_in, d_out);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&gpu_time, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    float * h_out = (float *)malloc(sizeof(float) * N);
    cudaMemcpy(h_out, d_out, N * sizeof(float), cudaMemcpyDeviceToHost);

1.4、检查并输出

在主函数中最终需要将cpu与gpu上的运算结果进行比较，并输出结果以及加速比

	checkResult(seq_result, h_out, N);
	printf("计算%dKB数据，串行计算时间: %f ms\n", N / (1 << 10) , cpu_time);
    printf("单块Hillis Steele 扫描(两次同步)，CUDA并行计算时间为: %fms\n并行计算对比串行计算加速比为: %.4f\n\n", 
            gpu_time , (cpu_time / gpu_time));
    
    free(h_in);
    free(h_out);
    free(seq_result);

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;

1.5、scan算法的naive实现

scan算法在gpu上并行实现只需要使得每个线程计算对应的前缀和即可：

//  让每个线程去遍历数组中该元素的数组前缀和
__global__ void scan0(float * g_in, float * g_out)
{   
    // 当前线程号
    float tmp = 0.0;
    int idx = blockIdx.x * blockDim.x  + threadIdx.x;
    for (int i = 0; i < idx; i++)
    {
        tmp += g_in[i];
    }
    g_out[idx] = tmp;
}

二、Hillis Steele扫描算法

scan算法的naive实现计算前缀和并不能达到很好的并行效果，为此，针对闭扫描提出了Hillis Steele算法可以实现更好的并行效果。

具体Hillis Steele算法实现：步长s从1开始，每轮迭代步长翻倍，步长不大于参与迭代的元素索引。

如下图所示：
在这里插入图片描述

2.1、单块Hillis Steele算法并行实现

针对单块Hillis Steele算法，首先将数据加载到共享内存，考虑到共享内存中的元素累加时会出现不同线程在读写时处理的是同一个元素，但是线程的执行顺序并不一致，从而出现资源的竞争冲突，因此采用双同步的方式避免资源冲突。

单块扫描的Hillis Steele算法实现如下所示：

	// 单块开扫描的Hillis Steele算法实现
__global__ void scan1(float * g_in, float * g_out)
{
    //（1）将全局内存的元素加载到共享内存
    __shared__ float s_data[BLOCK_SIZE];
    int tid = threadIdx.x;
    if (tid < BLOCK_SIZE)
        s_data[tid] = g_in[tid];
    __syncthreads();

    // (2) 在共享内存中使用Hillis Steele算法实现前缀和
    // 步长s从1开始，每轮迭代步长翻倍，每个数向前加上距离为步长的数
    // 步长不大于参与迭代的元素索引
    for (int s = 1; s <= tid; s *= 2)
    {
        // 两次同步，由于不同线程中指令的执行次序可能与代码中并不相同，导致计算错误
        float tmp = s_data[tid - s];
        __syncthreads();
        s_data[tid] += tmp;
    }

    if (tid < BLOCK_SIZE)
        g_out[tid] = s_data[tid];
}

2.2、基于double buffer的Hillis Steele算法并行实现

考虑到需要避免出现资源冲突，而双同步往往会造成时间上性能的下降。提出double buffer的策略来避免资源冲突，本质上是一种空间换时间的方法。

通过构建写缓冲区和读缓冲区，从读缓冲区中读取数据并将最后的运算结果放入到写缓冲区，从而避免了资源冲突。

基于双缓冲区的Hillis Steele算法代码如下所示：

__global__ void scan2(float * g_in, float * g_out)
{
    // 使用double buffer双缓冲空间避免访存冲突的竞争问题
    // （1）将全局内存加载到共享内存
    __shared__ float s_data[BLOCK_SIZE * 2];
    int tid  = threadIdx.x;
    int pread = 0;
    int pwrite = 1;
    if (tid < BLOCK_SIZE)
        s_data[pread * BLOCK_SIZE + tid] = g_in[tid];
    __syncthreads();

    // (2) 在共享内存中使用double buffer避免竞争问题
    for (int s = 1; s < BLOCK_SIZE; s *= 2)
    {
        if (tid >= s)
            s_data[pwrite * BLOCK_SIZE + tid] = s_data[pread * BLOCK_SIZE + tid] + s_data[pread * BLOCK_SIZE + tid - s];
        else
            s_data[pwrite * BLOCK_SIZE + tid] = s_data[pread * BLOCK_SIZE + tid];
        pread = 1 - pread;
        pwrite = 1 - pwrite;
        __syncthreads();
    }

    g_out[tid] = s_data[pread * BLOCK_SIZE + tid];
}

2.3、任意长度的Hillis Steele算法并行实现

上述Hillis Steele算法实现都是针对单块数据实现的，但是单块block上的线程数最多只有1024个，导致可以扫描的数据长度非常受限。

针对任意长度的数据进行扫描往往会采用scan-then-fan的策略，本质上是一种分而治之的思想

首先将数据划分为数据块
再在每个块上独立进行扫描，并将每个块的总和存储到辅助数组中
对辅助数组进行扫描，并将辅助数组中的元素分散加到原本数据块中

分为三个部分：scan核函数， fan核函数以及scan函数
scan函数代码如下所示：

// 将数据块按照线程块进行划分
// 每个线程块扫描结束后将前缀和存放入前缀和数组中
// 再对前缀和数组进行扫描
// 将扫描后的前缀和数组中前缀再分散到对应的数据块上
void scan5(float * d_in, float ** d_out, int length)
{
    // (1) 对数据块中的所有元素进行第一遍扫描
    int GridSize = ceil((double)length / BLOCK_SIZE);
    float * firstScan;
    cudaMalloc(&firstScan, sizeof(float) * length);

    float * firstScanBlock;
    cudaMalloc(&firstScanBlock, sizeof(float) * GridSize);
    scan<<<GridSize, BLOCK_SIZE>>>(d_in, firstScan, firstScanBlock, length);
    cudaDeviceSynchronize();

    printf("%d\n", GridSize);
    if (GridSize == 1)
    {
        *d_out = firstScan;
        cudaFree(firstScanBlock);
        return;
    }

    // (2) 对第一次扫描的数据做第二次扫描
    float * secondScan;
    scan5(firstScanBlock, &secondScan, GridSize);

    // (3) 第二次扫描后利用第一次扫描和第二次扫描的结果做发散
    float * fanResult;
    cudaMalloc(&fanResult, sizeof(float) * length);
    fan<<<GridSize, BLOCK_SIZE>>>(firstScan, secondScan, fanResult, length);
    cudaDeviceSynchronize();

    *d_out = fanResult;

    cudaFree(firstScan);
    cudaFree(secondScan);
    cudaFree(firstScanBlock);
    return;
}

scan核函数的代码如下所示：

__global__ void scan(float * g_in, float * g_out, float * g_out_block, int length)
{
    // (1) 加载数据到共享内存中
    int tid = threadIdx.x;
    int idx = blockDim.x * blockIdx.x + tid;
    __shared__ float s_data[2 * BLOCK_SIZE];
    int Pread = 0, Pwrite = 1;
    if (idx < length)
    {
        s_data[Pread * BLOCK_SIZE + tid] = g_in[idx];
    }
    __syncthreads();

    int s = 1;
    for (; s < BLOCK_SIZE; s *= 2)
    {
        if (tid >= s)
        {
            s_data[Pwrite * BLOCK_SIZE + tid] = s_data[Pread * BLOCK_SIZE + tid] + s_data[Pread * BLOCK_SIZE + tid - s];
        }
        else
        {
            s_data[Pwrite * BLOCK_SIZE + tid] = s_data[Pread * BLOCK_SIZE + tid];
        }
        __syncthreads();
        Pread = 1 - Pread;
        Pwrite = 1 - Pwrite;
    }

    if (idx < length)
    {
        // 加载回数据
        if (tid == BLOCK_SIZE - 1)
            g_out_block[blockIdx.x] = s_data[Pread * BLOCK_SIZE + tid];

        g_out[idx] = s_data[Pread * BLOCK_SIZE + tid];
    }

}

fan核函数的代码如下所示：

// 将并行扫描后的前缀和数组分散到对应的数据块上
__global__ void fan(float * g_in, float * g_in_block, float * g_out, int length)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < length)
    {
        int blockNum = (int)(idx / BLOCK_SIZE);
        if (blockNum != 0)
        {
            g_out[idx] = g_in[idx] + g_in_block[blockNum - 1];
        }
        else
        {
            g_out[idx] = g_in[idx];
        }
    }
}

三、Blelloch扫描算法

针对Blelloch算法，相较于Hillis Steele算法，Blelloch算法的work complexity上表现更好，但是Hillis Steele算法的step complexity是更好的。

该算法主要包括了两个部分，分别是reduce和downsweep，如下图所示：
在这里插入图片描述
上半部分是在做reduce，而下半部分则是再做down sweep，reduce在规约并行优化中已经介绍过了，而down sweep可以理解为反向的规约操作与元素替换的融合。下图中的绿色线便是元素相加操作，红色线是将in2元素替换out1元素。

在这里插入图片描述

3.1、单块Blelloch并行扫描算法

使用单线程块加载多数据块至共享内存中，采用相邻配对的方式进行并行规约，代码如下所示：

// Blelloch 算法
__global__ void scan3(float * g_in, float * g_out)
{
    //（1）加载数据到共享内存中
    int tid = threadIdx.x;
    // 申请共享内存
    __shared__ float s_data[BLOCK_SIZE * 2];
    // 单线程块加载多数据块
    if (tid * 2 < N) 
        s_data[2 * tid] = g_in[2 * tid];
    if ((tid * 2 + 1) < N)
        s_data[2 * tid + 1] = g_in[2 * tid + 1];

    // (2) 并行扫描第一阶段：reduce规约
    for (int s = 1; s <= BLOCK_SIZE; s *= 2)
    {
        // 对每一轮迭代进行同步
        __syncthreads();
        int index = 2 * (tid + 1) * s - 1;
        if( index < BLOCK_SIZE * 2)
        {
            s_data[index] = s_data[index] + s_data[index - s];
        }
    }

    // (3) 开扫描，将最后一个元素记录为0
    if (tid == 0)
    {
        s_data[BLOCK_SIZE * 2 - 1] = 0.0;
    }

    // (4) 并行扫描第二阶段：down sweep
    for (int s = BLOCK_SIZE; s > 0; s >>= 1)
    {
        __syncthreads();
        int index = 2 * (tid + 1) * s - 1;
        if (index < BLOCK_SIZE * 2)
        {
            int tmp = s_data[index];
            s_data[index] = s_data[index] + s_data[index - s];
            s_data[index - s] = tmp;
        }
    }
    __syncthreads();

    // （4）并行扫描结束将共享内存结果存回全局内存
    if (tid * 2 < BLOCK_SIZE * 2)
        g_out[tid * 2] = s_data[tid * 2];
    if ((tid * 2 + 1) < BLOCK_SIZE * 2)
        g_out[tid * 2 + 1] = s_data[tid * 2 + 1];

}

3.2、消除bank conflict的Blelloch并行扫描算法

在并行规约中，我们已经提到相邻配对规约会产生共享内存的bank conflict问题，之前按针对bank conflict，我们使用交错配对的方式进行bank conflict的消除，但是在Blelloch算法中并不能通过这种方式消除bank conflict，因此提出一种更加经典的消除bank conflict的方法——padding。

通过填充的方式消除bank conflict，本质上也是一种空间换时间的思想，为每32个数据后填充一个数据，使得共享内存中对第i个地址的访问变成了对（i+i/32）个元素的访问，下图是以8为例，实际中warp大小为32个。

在这里插入图片描述
具体代码如下所示：

// Blelloch算法，padding解决bank conflict的问题
// padding即在每32个数据后填充一个元素，对共享内存中第i个地址的访问，就变成了对第（i+i/32）个元素的访问
__global__ void scan4(float * g_in, float * g_out)
{
    // (1) 加载数据到共享内存
    int tid = threadIdx.x;
    __shared__ float s_data[BLOCK_SIZE * 2 + (BLOCK_SIZE)];
    if ((2 * tid) < 2 * BLOCK_SIZE)
        s_data[2 * tid + Padding(2 * tid)] = g_in[2 * tid];
    if ((2 * tid + 1) < 2 * BLOCK_SIZE)
        s_data[(2 * tid + 1) + Padding(2 * tid + 1)] = g_in[2 * tid + 1];

    // (2) 第一阶段：规约
    for (int s = 1; s <= BLOCK_SIZE; s *= 2)
    {
        __syncthreads();
        int index = 2 * (tid + 1) * s - 1;

        if (index < N)
        {
            s_data[index + Padding(index)] += s_data[index - s + Padding(index - s)];
        }
    }

    // (3) 开扫描对最后一个元素赋值为0
    if (tid == 0)
        s_data[2 * BLOCK_SIZE - 1 + Padding(2 * BLOCK_SIZE - 1)] = 0.0;

    // (4) 第二阶段： down sweep， 规约的镜像
    for (int s = BLOCK_SIZE; s > 0; s /= 2)
    {
        __syncthreads();
        int index = 2 * (tid + 1) * s - 1;
        if (index < (2 * BLOCK_SIZE))
        {
            int tmp = s_data[index + Padding(index)];
            s_data[index + Padding(index)] += s_data[index - s + Padding(index - s)];
            s_data[index - s + Padding(index - s)] = tmp;
        }
    }
    __syncthreads();

    // (4) 将结果写回
    if ((2 * tid) < N)
        g_out[2 * tid] = s_data[(2 * tid) + Padding(2 * tid)];
    if ((2 * tid + 1) < N)
        g_out[(2 * tid + 1)] = s_data[(2 * tid + 1) + Padding(2 * tid + 1)];
}

3.3、任意长度的Blelloch并行扫描算法

针对任意长度的数据，和任意长度的Hillis_Steele算法一样，使用scan-then-fan的方式进行并行扫描。

// 消除bank conflict的Blelloch扫描算法
__global__ void scan(float * g_in, float * g_out, float * g_out_block, int length)
{
    // (1) 单线程块加载多个数据块
    int tid = threadIdx.x;
    __shared__ float s_data[2 * BLOCK_SIZE + Padding(2 * BLOCK_SIZE)];
    int Block_Offset = blockIdx.x * BLOCK_SIZE * 2;
    if ((Block_Offset + 2 * tid) < length)
        s_data[2 * tid + Padding(2 * tid)] = g_in[Block_Offset + 2 * tid];
    if ((Block_Offset + 2 * tid + 1) < length)
        s_data[ 2 * tid + 1 + Padding(2 * tid + 1)] = g_in[Block_Offset + 2 * tid + 1];
    __syncthreads();

    // (2) 第一阶段做规约
    for (int s = 1; s <= BLOCK_SIZE; s *= 2)
    {
        int index = 2 * s * (tid + 1) - 1;
        if (index < (2 * BLOCK_SIZE))
            s_data[index + Padding(index)] += s_data[index - s + Padding(index - s)];
        __syncthreads();
    }

    // 第一阶段结束对最后一个元素赋值为0
    if (tid == 0)
        s_data[(2 * BLOCK_SIZE - 1) + Padding(2 * BLOCK_SIZE - 1)] = 0;
    __syncthreads();

    // (3) 第二阶段做down-sweep
    for (int s = BLOCK_SIZE; s >= 1; s /= 2)
    {
        // 两个元素，后者等于两数之和，前者等于后者
        int index = 2 * s * (tid + 1) - 1;
        if (index < (2 * BLOCK_SIZE))
        {
            float tmp = s_data[index + Padding(index)];
            s_data[index + Padding(index)] += s_data[index - s + Padding(index - s)];
            s_data[index - s + Padding(index - s)] = tmp;
        }
        __syncthreads();
    }
    __syncthreads();

    // (4) 将扫描的结果写回全局内存
    if ((Block_Offset + 2 * tid) < length)
    {
        g_out[Block_Offset + 2 * tid] = s_data[2 * tid + Padding(2 * tid)];
        if (2 * tid == (2 * BLOCK_SIZE - 1))
            g_out_block[blockIdx.x] = s_data[2 * tid + Padding(2 * tid)] + g_in[2 * tid + Block_Offset];
    } 
    if ((Block_Offset + 2 * tid + 1) < length)
    {
        g_out[Block_Offset + 2 * tid + 1] = s_data[(2 * tid + 1) + Padding(2 * tid + 1)];
        if ((2 * tid + 1) == (2 * BLOCK_SIZE - 1))
            g_out_block[blockIdx.x] = s_data[2 * tid + 1 + Padding(2 * tid + 1)] + g_in[2 * tid + 1 + Block_Offset];
    }
}

// 将第二次扫描的结果重新加回到第一次扫描的结果
__global__ void fan(float * firstScan, float * secondScan, float * fanResult, int length)
{
    int index = blockIdx.x * (BLOCK_SIZE * 2) + threadIdx.x * 2;
    int block_index = (int)(index / (BLOCK_SIZE * 2));
    if (index < length)
    {
        fanResult[index] = firstScan[index] + secondScan[block_index];
    }
    index += 1;
    block_index = (int)(index / (BLOCK_SIZE * 2));
    if (index < length)
        fanResult[index] = firstScan[index] + secondScan[block_index];
}

// 针对任意长度数据的扫描
void scan6(float * g_in, float ** g_out, int length)
{

    //(1)先做一遍扫描
    float * firstScan; // 存储第一次扫描后的每个块内前缀和
    cudaMalloc((void **)&firstScan, length * sizeof(float));

    int Grid_Size = ceil((double)length / (BLOCK_SIZE * 2)); // 注意只需要用一半的block
    float *firstScanBlock;                     // 辅助数组：存储第一次扫描后每个扫描块内部总和
    cudaMalloc((void **)&firstScanBlock, Grid_Size * sizeof(float));

    scan<<<Grid_Size, BLOCK_SIZE>>>(g_in, firstScan, firstScanBlock, length);
    cudaDeviceSynchronize();

    // 如果剩下的只需要一个block
    // 那么d_first_scan_result就是最终的前缀和了，如有递归，此时结束
    if (Grid_Size == 1)
    {
        *g_out = firstScan;
        cudaFree(firstScanBlock);
        return;
    }

    // 第二次扫描
    float * secondScan;

    scan6(firstScanBlock, &secondScan, Grid_Size);

    // 将第二次扫描后的结果分散到第一次扫描
    float * fanResult;
    cudaMalloc((void **)&fanResult, sizeof(float) * length);

    fan<<<Grid_Size, BLOCK_SIZE>>>(firstScan, secondScan, fanResult, length);
    cudaDeviceSynchronize();
    
    *g_out = fanResult;
    cudaFree(firstScan);
    cudaFree(firstScanBlock);
    cudaFree(secondScan);
}