Streams
As mentioned earlier, CUDA organizes threads into a hierarchy of grid, block, and thread, and the threads execute in parallel. The flow of our code so far has been: copy data in, compute, copy the results out. When the data is very large, the copies alone can take a long time; is that time simply wasted? Could we transfer data in parallel, or start computing as soon as part of the data has arrived? We can: CUDA provides streams. Operations in different streams can run concurrently, while operations within a single stream execute in order, which makes it convenient to write asynchronous code.
```cpp
#include <iostream>
#include <cuda_runtime.h>

#define SPLIT 4

__global__ void vector_add(float *a, float *b, float *c, int width) {
    // Global index of the thread at (threadIdx.x, threadIdx.y) inside the
    // block at row blockIdx.x, column blockIdx.y of the grid.
    int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x * blockDim.y
              + threadIdx.x * blockDim.y + threadIdx.y;
    if (index >= width) return;
    c[index] = a[index] + b[index];
}

void single_stream(float *a, float *b, float *c, int width) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);  // 0 is the default stream
    cudaEventSynchronize(start);

    float *d_a, *d_b, *d_c;
    cudaMalloc((void **)&d_a, width * sizeof(float));
    cudaMalloc((void **)&d_b, width * sizeof(float));
    cudaMalloc((void **)&d_c, width * sizeof(float));
    cudaMemcpy(d_a, a, width * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, width * sizeof(float), cudaMemcpyHostToDevice);

    dim3 dimGrid(1, width >> 10);  // one block per 1024 elements
    dim3 dimBlock(256, 4);         // 256 * 4 = 1024 threads per block
    vector_add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, width);
    cudaMemcpy(c, d_c, width * sizeof(float), cudaMemcpyDeviceToHost);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float time;
    cudaEventElapsedTime(&time, start, stop);
    std::cout << "Single stream time: " << time << "ms" << std::endl;

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
}

void multi_stream(float *a, float *b, float *c, int width) {
    cudaStream_t stream1, stream2, stream3, stream4;
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);
    cudaStreamCreate(&stream3);
    cudaStreamCreate(&stream4);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    cudaEventSynchronize(start);

    float *d_a, *d_b, *d_c_1, *d_c_2, *d_c_3, *d_c_4;
    cudaMalloc((void **)&d_a, width * sizeof(float));
    cudaMalloc((void **)&d_b, width * sizeof(float));
    cudaMalloc((void **)&d_c_1, width / SPLIT * sizeof(float));
    cudaMalloc((void **)&d_c_2, width / SPLIT * sizeof(float));
    cudaMalloc((void **)&d_c_3, width / SPLIT * sizeof(float));
    cudaMalloc((void **)&d_c_4, width / SPLIT * sizeof(float));

    dim3 dimGrid(1, 1);
    dim3 dimBlock(width / SPLIT, 1);

    // Note: the copies must be asynchronous (and the host buffers pinned),
    // otherwise work in different streams cannot overlap.
    cudaMemcpyAsync(d_a, a, width / SPLIT * sizeof(float), cudaMemcpyHostToDevice, stream1);
    cudaMemcpyAsync(d_b, b, width / SPLIT * sizeof(float), cudaMemcpyHostToDevice, stream1);
    vector_add<<<dimGrid, dimBlock, 0, stream1>>>(d_a, d_b, d_c_1, width / SPLIT);
    cudaMemcpyAsync(c, d_c_1, width / SPLIT * sizeof(float), cudaMemcpyDeviceToHost, stream1);

    cudaMemcpyAsync(d_a + width / SPLIT, a + width / SPLIT, width / SPLIT * sizeof(float), cudaMemcpyHostToDevice, stream2);
    cudaMemcpyAsync(d_b + width / SPLIT, b + width / SPLIT, width / SPLIT * sizeof(float), cudaMemcpyHostToDevice, stream2);
    vector_add<<<dimGrid, dimBlock, 0, stream2>>>(d_a + width / SPLIT, d_b + width / SPLIT, d_c_2, width / SPLIT);
    cudaMemcpyAsync(c + width / SPLIT, d_c_2, width / SPLIT * sizeof(float), cudaMemcpyDeviceToHost, stream2);

    cudaMemcpyAsync(d_a + 2 * width / SPLIT, a + 2 * width / SPLIT, width / SPLIT * sizeof(float), cudaMemcpyHostToDevice, stream3);
    cudaMemcpyAsync(d_b + 2 * width / SPLIT, b + 2 * width / SPLIT, width / SPLIT * sizeof(float), cudaMemcpyHostToDevice, stream3);
    vector_add<<<dimGrid, dimBlock, 0, stream3>>>(d_a + 2 * width / SPLIT, d_b + 2 * width / SPLIT, d_c_3, width / SPLIT);
    cudaMemcpyAsync(c + 2 * width / SPLIT, d_c_3, width / SPLIT * sizeof(float), cudaMemcpyDeviceToHost, stream3);

    cudaMemcpyAsync(d_a + 3 * width / SPLIT, a + 3 * width / SPLIT, width / SPLIT * sizeof(float), cudaMemcpyHostToDevice, stream4);
    cudaMemcpyAsync(d_b + 3 * width / SPLIT, b + 3 * width / SPLIT, width / SPLIT * sizeof(float), cudaMemcpyHostToDevice, stream4);
    vector_add<<<dimGrid, dimBlock, 0, stream4>>>(d_a + 3 * width / SPLIT, d_b + 3 * width / SPLIT, d_c_4, width / SPLIT);
    cudaMemcpyAsync(c + 3 * width / SPLIT, d_c_4, width / SPLIT * sizeof(float), cudaMemcpyDeviceToHost, stream4);

    cudaStreamSynchronize(stream1);
    cudaStreamSynchronize(stream2);
    cudaStreamSynchronize(stream3);
    cudaStreamSynchronize(stream4);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float time;
    cudaEventElapsedTime(&time, start, stop);
    std::cout << "Multi stream time: " << time << "ms" << std::endl;

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    cudaStreamDestroy(stream3);
    cudaStreamDestroy(stream4);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c_1);
    cudaFree(d_c_2);
    cudaFree(d_c_3);
    cudaFree(d_c_4);
}

bool if_right(float *c, int width) {
    for (int i = 0; i < width; i++) {
        if (c[i] != 3.0f) {
            std::cout << "Error at: " << i << std::endl;
            return false;
        }
    }
    return true;
}

int main() {
    int width = 1 << 12;
    // cudaMemcpyAsync only overlaps with other work when the host buffers are
    // pinned (page-locked); allocate them with cudaMallocHost instead of new.
    float *a, *b, *c;
    cudaMallocHost((void **)&a, width * sizeof(float));
    cudaMallocHost((void **)&b, width * sizeof(float));
    cudaMallocHost((void **)&c, width * sizeof(float));
    for (int i = 0; i < width; i++) {
        a[i] = 1.0f;
        b[i] = 2.0f;
    }
    single_stream(a, b, c, width);
    if_right(c, width);
    multi_stream(a, b, c, width);
    if_right(c, width);
    cudaFreeHost(a);
    cudaFreeHost(b);
    cudaFreeHost(c);
    return 0;
}
```
Shared Memory
As discussed earlier, GPU memory is organized into several levels, ordered here from fastest to slowest access and from smallest to largest capacity:
- register
- shared memory
- global memory
Registers are private to a single thread. Shared memory is owned by a block and can be shared by all threads within that block. Global memory is visible globally; it is what we usually call video memory, and it is backed by off-chip DRAM.
DRAM cannot read a single byte at a time; by the nature of its implementation it always reads an entire row at once.
Using shared memory well can therefore effectively reduce memory access latency and improve overall efficiency.
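For orientation, here is a minimal sketch of where each of the three memory spaces appears inside a kernel; the kernel itself (its name, the tile size, the doubling operation) is a hypothetical illustration, not one of the examples that follow:

```cpp
// Hypothetical kernel illustrating the three memory spaces. Launch with
// blocks of at most 256 threads, e.g. memorySpaces<<<16, 256>>>(in, out).
__global__ void memorySpaces(const float *in, float *out) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;  // local variable: lives in a register
    __shared__ float tile[256];           // shared memory: one copy per block
    tile[threadIdx.x] = in[gid];          // global memory read, staged into shared memory
    __syncthreads();                      // make the writes visible block-wide
    out[gid] = tile[threadIdx.x] * 2.0f;  // result written back to global memory
}
```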
Take summing the elements of a vector as an example. The simplest implementation performs $D$ sequential memory reads for a vector of length $D$, so the time complexity is $O(n)$:
```cpp
// Sequential sum: launch with a single thread (vectorSum<<<1, 1>>>), since
// every thread would otherwise redo the entire sum.
__global__ void vectorSum(float *input, float *result, int N) {
    float temp_sum = 0.0f;
    for (int i = 0; i < N; ++i) {
        temp_sum += input[i];
    }
    result[0] = temp_sum;
}
```
With a binary approach, we first add elements pairwise to get half as many partial sums, then sum those in the same way, and so on. The total number of memory accesses grows to about $2D$ ($D + D/2 + D/4 + \dots$), but because each level is read in parallel, the time complexity drops to $O(\log n)$. Computing a single scalar from a vector this way is called a reduction, and this pairwise scheme is known as a neighbored (adjacent) reduction:
```cpp
// Neighbored (adjacent) reduction, in place on global memory. Assumes a
// single block with blockDim.x >= N and N a power of two.
__global__ void bisectionVectorSum(float *input, float *result, int N) {
    int tid = threadIdx.x;
    int step = 1;
    while (step < N) {
        // Only threads whose index is a multiple of 2*step do work; the
        // active threads thin out, causing heavy warp divergence.
        if (tid < N && tid % (2 * step) == 0) {
            input[tid] += input[tid + step];
        }
        __syncthreads();  // every thread of the block must reach this barrier
        step *= 2;
    }
    if (tid == 0) {
        *result = input[0];
    }
}
```
In this neighbored reduction the active threads access memory with an ever larger stride, which causes memory access conflicts (and the scattered active threads also cause warp divergence). CUDA code commonly uses another reduction scheme, called interleaved reduction:
```cpp
// Interleaved reduction. Assumes a single block with blockDim.x >= N and
// N a power of two.
__global__ void bisectVectorSum(float *input, float *result, int N) {
    int tid = threadIdx.x;
    for (int step = N / 2; step > 0; step /= 2) {
        // The active threads form the contiguous front [0, step), so
        // neighbouring threads access neighbouring addresses.
        if (tid < step) {
            input[tid] += input[tid + step];
        }
        __syncthreads();
    }
    if (tid == 0) {
        *result = input[0];
    }
}
```
Both versions above work on global memory directly. By using shared memory well, we can cut the global memory accesses down to $D$ while keeping the time cost at $O(\log n)$:
```cpp
// Shared memory reduction. Assumes a single block of N threads, with
// N a power of two and N <= 1024 (the size of the shared buffer).
__global__ void shareMemoryVectorSum(float *input, float *result, int N) {
    int tid = threadIdx.x;
    __shared__ float temp[1024];
    temp[tid] = input[tid];  // each thread reads global memory exactly once
    __syncthreads();
    for (int step = N / 2; step > 0; step /= 2) {
        if (tid < step) {
            temp[tid] += temp[tid + step];
        }
        __syncthreads();
    }
    if (tid == 0) {
        *result = temp[0];
    }
}
```
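None of the reduction kernels above is shown with a launch; here is a minimal host-side driver sketch for shareMemoryVectorSum, assuming N = 1024 so that a single block of exactly N threads matches the kernel's expectations (the buffer names and the value of N are illustrative assumptions):

```cpp
#include <iostream>

int main() {
    const int N = 1024;  // must be a power of two and <= 1024 for this kernel
    float *d_input, *d_result;
    cudaMalloc((void **)&d_input, N * sizeof(float));
    cudaMalloc((void **)&d_result, sizeof(float));

    float h_input[N];
    for (int i = 0; i < N; ++i) h_input[i] = 1.0f;
    cudaMemcpy(d_input, h_input, N * sizeof(float), cudaMemcpyHostToDevice);

    // One block of exactly N threads, as the kernel assumes.
    shareMemoryVectorSum<<<1, N>>>(d_input, d_result, N);

    float h_result;
    cudaMemcpy(&h_result, d_result, sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "sum = " << h_result << std::endl;  // expect 1024

    cudaFree(d_input);
    cudaFree(d_result);
    return 0;
}
```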
Shared memory is organized into 32 banks, and the bank width depends on the device's compute capability. On compute capability 3.x devices a bank can be 8 bytes (64 bits) wide, so one access can serve 32 * 8 = 256 bytes across the banks at once (this is the width of a single access, not the total size of shared memory).
Note that when accessing shared memory, each bank can serve only one address per access; if several threads of a warp hit different addresses in the same bank, a bank conflict occurs and the accesses are serialized, reducing throughput. This is another reason the neighbored reduction above loses to the interleaved one: in the interleaved version the active threads access consecutive addresses, which map to different banks, so no conflicts arise.
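To make the bank mapping concrete, here is a small hypothetical sketch, assuming 4-byte-wide banks so that word i of shared memory maps to bank i % 32; launch it as a single warp, e.g. bankPatterns<<<1, 32>>>(d_out):

```cpp
// Hypothetical kernel contrasting a conflict-free and a conflicting pattern.
__global__ void bankPatterns(float *out) {
    __shared__ float data[32 * 32];
    int tid = threadIdx.x;  // assume a single warp of 32 threads
    for (int i = 0; i < 32; ++i)
        data[tid * 32 + i] = (float)tid;  // initialize the whole tile
    __syncthreads();

    // Conflict-free: thread i reads word i, and words 0..31 sit in 32
    // different banks, so the warp is served in one transaction.
    float a = data[tid];

    // 32-way bank conflict: thread i reads word i * 32; all of these words
    // map to bank 0, so the warp's 32 reads are serialized.
    float b = data[tid * 32];

    out[tid] = a + b;
}
```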
In practice there are further cases, for example a broadcast is triggered when multiple threads access the same word; see the official CUDA programming guide for the details.