Streams
As mentioned earlier, CUDA organizes threads into a hierarchy of grid, block, and thread, and the threads execute in parallel. The flow of our code so far has been: copy data in, compute, copy the results out. When the data is very large, the copies alone can take a long time; is that time simply wasted? Could we transfer data in parallel, or start computing as soon as part of the data has arrived? We can: CUDA provides streams. Operations in different streams can run concurrently, while operations within a single stream execute in order, which makes it convenient to write asynchronous code.
```cpp
#include <iostream>
#include <cuda_runtime.h>

#define SPLIT 4

__global__ void vector_add(float *a, float *b, float *c, int width) {
    // Global index of the thread at (threadIdx.x, threadIdx.y) inside the
    // block at row blockIdx.x, column blockIdx.y of the grid.
    int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x * blockDim.y
              + threadIdx.x * blockDim.y + threadIdx.y;
    if (index >= width) return;
    c[index] = a[index] + b[index];
}

void single_stream(float *a, float *b, float *c, int width) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);  // 0 is the default stream
    cudaEventSynchronize(start);

    float *d_a, *d_b, *d_c;
    cudaMalloc((void **)&d_a, width * sizeof(float));
    cudaMalloc((void **)&d_b, width * sizeof(float));
    cudaMalloc((void **)&d_c, width * sizeof(float));
    cudaMemcpy(d_a, a, width * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, width * sizeof(float), cudaMemcpyHostToDevice);

    dim3 dimGrid(1, width >> 10);  // one block per 1024 elements
    dim3 dimBlock(256, 4);         // 256 * 4 = 1024 threads per block
    vector_add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, width);
    cudaMemcpy(c, d_c, width * sizeof(float), cudaMemcpyDeviceToHost);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float time;
    cudaEventElapsedTime(&time, start, stop);
    std::cout << "Single stream time: " << time << "ms" << std::endl;

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
}

void multi_stream(float *a, float *b, float *c, int width) {
    cudaStream_t stream1, stream2, stream3, stream4;
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);
    cudaStreamCreate(&stream3);
    cudaStreamCreate(&stream4);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    cudaEventSynchronize(start);

    float *d_a, *d_b, *d_c_1, *d_c_2, *d_c_3, *d_c_4;
    cudaMalloc((void **)&d_a, width * sizeof(float));
    cudaMalloc((void **)&d_b, width * sizeof(float));
    cudaMalloc((void **)&d_c_1, width / SPLIT * sizeof(float));
    cudaMalloc((void **)&d_c_2, width / SPLIT * sizeof(float));
    cudaMalloc((void **)&d_c_3, width / SPLIT * sizeof(float));
    cudaMalloc((void **)&d_c_4, width / SPLIT * sizeof(float));

    dim3 dimGrid(1, 1);
    dim3 dimBlock(width / SPLIT, 1);

    // Note: the copies must be asynchronous (and the host buffers pinned),
    // otherwise work in different streams cannot overlap.
    cudaMemcpyAsync(d_a, a, width / SPLIT * sizeof(float), cudaMemcpyHostToDevice, stream1);
    cudaMemcpyAsync(d_b, b, width / SPLIT * sizeof(float), cudaMemcpyHostToDevice, stream1);
    vector_add<<<dimGrid, dimBlock, 0, stream1>>>(d_a, d_b, d_c_1, width / SPLIT);
    cudaMemcpyAsync(c, d_c_1, width / SPLIT * sizeof(float), cudaMemcpyDeviceToHost, stream1);

    cudaMemcpyAsync(d_a + width / SPLIT, a + width / SPLIT, width / SPLIT * sizeof(float), cudaMemcpyHostToDevice, stream2);
    cudaMemcpyAsync(d_b + width / SPLIT, b + width / SPLIT, width / SPLIT * sizeof(float), cudaMemcpyHostToDevice, stream2);
    vector_add<<<dimGrid, dimBlock, 0, stream2>>>(d_a + width / SPLIT, d_b + width / SPLIT, d_c_2, width / SPLIT);
    cudaMemcpyAsync(c + width / SPLIT, d_c_2, width / SPLIT * sizeof(float), cudaMemcpyDeviceToHost, stream2);

    cudaMemcpyAsync(d_a + 2 * width / SPLIT, a + 2 * width / SPLIT, width / SPLIT * sizeof(float), cudaMemcpyHostToDevice, stream3);
    cudaMemcpyAsync(d_b + 2 * width / SPLIT, b + 2 * width / SPLIT, width / SPLIT * sizeof(float), cudaMemcpyHostToDevice, stream3);
    vector_add<<<dimGrid, dimBlock, 0, stream3>>>(d_a + 2 * width / SPLIT, d_b + 2 * width / SPLIT, d_c_3, width / SPLIT);
    cudaMemcpyAsync(c + 2 * width / SPLIT, d_c_3, width / SPLIT * sizeof(float), cudaMemcpyDeviceToHost, stream3);

    cudaMemcpyAsync(d_a + 3 * width / SPLIT, a + 3 * width / SPLIT, width / SPLIT * sizeof(float), cudaMemcpyHostToDevice, stream4);
    cudaMemcpyAsync(d_b + 3 * width / SPLIT, b + 3 * width / SPLIT, width / SPLIT * sizeof(float), cudaMemcpyHostToDevice, stream4);
    vector_add<<<dimGrid, dimBlock, 0, stream4>>>(d_a + 3 * width / SPLIT, d_b + 3 * width / SPLIT, d_c_4, width / SPLIT);
    cudaMemcpyAsync(c + 3 * width / SPLIT, d_c_4, width / SPLIT * sizeof(float), cudaMemcpyDeviceToHost, stream4);

    cudaStreamSynchronize(stream1);
    cudaStreamSynchronize(stream2);
    cudaStreamSynchronize(stream3);
    cudaStreamSynchronize(stream4);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float time;
    cudaEventElapsedTime(&time, start, stop);
    std::cout << "Multi stream time: " << time << "ms" << std::endl;

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    cudaStreamDestroy(stream3);
    cudaStreamDestroy(stream4);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c_1);
    cudaFree(d_c_2);
    cudaFree(d_c_3);
    cudaFree(d_c_4);
}

bool if_right(float *c, int width) {
    for (int i = 0; i < width; i++) {
        if (c[i] != 3.0f) {
            std::cout << "Error at: " << i << std::endl;
            return false;
        }
    }
    return true;
}

int main() {
    int width = 1 << 12;
    // cudaMemcpyAsync only overlaps with other work when the host buffers are
    // pinned (page-locked); allocate them with cudaMallocHost instead of new.
    float *a, *b, *c;
    cudaMallocHost((void **)&a, width * sizeof(float));
    cudaMallocHost((void **)&b, width * sizeof(float));
    cudaMallocHost((void **)&c, width * sizeof(float));
    for (int i = 0; i < width; i++) {
        a[i] = 1.0f;
        b[i] = 2.0f;
    }
    single_stream(a, b, c, width);
    if_right(c, width);
    multi_stream(a, b, c, width);
    if_right(c, width);
    cudaFreeHost(a);
    cudaFreeHost(b);
    cudaFreeHost(c);
    return 0;
}
```
Shared Memory
As discussed earlier, GPU memory is organized into several levels, ordered here from fastest to slowest access and from smallest to largest capacity:
- register
- shared memory
- global memory
Registers are private to a single thread. Shared memory is owned by a block and can be shared by all threads within that block. Global memory is visible globally; it is what we usually call video memory, and it is backed by off-chip DRAM.
DRAM cannot read a single byte at a time; by the nature of its implementation it always reads an entire row at once.
Using shared memory well can therefore effectively reduce memory access latency and improve overall efficiency.
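For orientation, here is a minimal sketch of where each of the three memory spaces appears inside a kernel; the kernel itself (its name, the tile size, the doubling operation) is a hypothetical illustration, not one of the examples that follow:

```cpp
// Hypothetical kernel illustrating the three memory spaces. Launch with
// blocks of at most 256 threads, e.g. memorySpaces<<<16, 256>>>(in, out).
__global__ void memorySpaces(const float *in, float *out) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;  // local variable: lives in a register
    __shared__ float tile[256];           // shared memory: one copy per block
    tile[threadIdx.x] = in[gid];          // global memory read, staged into shared memory
    __syncthreads();                      // make the writes visible block-wide
    out[gid] = tile[threadIdx.x] * 2.0f;  // result written back to global memory
}
```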
Take summing the elements of a vector as an example. The simplest implementation performs $D$ sequential memory reads for a vector of length $D$, so the time complexity is $O(n)$:
```cpp
// Sequential sum: launch with a single thread (vectorSum<<<1, 1>>>), since
// every thread would otherwise redo the entire sum.
__global__ void vectorSum(float *input, float *result, int N) {
    float temp_sum = 0.0f;
    for (int i = 0; i < N; ++i) {
        temp_sum += input[i];
    }
    result[0] = temp_sum;
}
```
With a binary approach, we first add elements pairwise to get half as many partial sums, then sum those in the same way, and so on. The total number of memory accesses grows to about $2D$ ($D + D/2 + D/4 + \dots$), but because each level is read in parallel, the time complexity drops to $O(\log n)$. Computing a single scalar from a vector this way is called a reduction, and this pairwise scheme is known as a neighbored (adjacent) reduction:
```cpp
// Neighbored (adjacent) reduction, in place on global memory. Assumes a
// single block with blockDim.x >= N and N a power of two.
__global__ void bisectionVectorSum(float *input, float *result, int N) {
    int tid = threadIdx.x;
    int step = 1;
    while (step < N) {
        // Only threads whose index is a multiple of 2*step do work; the
        // active threads thin out, causing heavy warp divergence.
        if (tid < N && tid % (2 * step) == 0) {
            input[tid] += input[tid + step];
        }
        __syncthreads();  // every thread of the block must reach this barrier
        step *= 2;
    }
    if (tid == 0) {
        *result = input[0];
    }
}
```
In this neighbored reduction the active threads access memory with an ever larger stride, which causes memory access conflicts (and the scattered active threads also cause warp divergence). CUDA code commonly uses another reduction scheme, called interleaved reduction:
```cpp
// Interleaved reduction. Assumes a single block with blockDim.x >= N and
// N a power of two.
__global__ void bisectVectorSum(float *input, float *result, int N) {
    int tid = threadIdx.x;
    for (int step = N / 2; step > 0; step /= 2) {
        // The active threads form the contiguous front [0, step), so
        // neighbouring threads access neighbouring addresses.
        if (tid < step) {
            input[tid] += input[tid + step];
        }
        __syncthreads();
    }
    if (tid == 0) {
        *result = input[0];
    }
}
```
Both versions above work on global memory directly. By using shared memory well, we can cut the global memory accesses down to $D$ while keeping the time cost at $O(\log n)$:
```cpp
// Shared memory reduction. Assumes a single block of N threads, with
// N a power of two and N <= 1024 (the size of the shared buffer).
__global__ void shareMemoryVectorSum(float *input, float *result, int N) {
    int tid = threadIdx.x;
    __shared__ float temp[1024];
    temp[tid] = input[tid];  // each thread reads global memory exactly once
    __syncthreads();
    for (int step = N / 2; step > 0; step /= 2) {
        if (tid < step) {
            temp[tid] += temp[tid + step];
        }
        __syncthreads();
    }
    if (tid == 0) {
        *result = temp[0];
    }
}
```
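None of the reduction kernels above is shown with a launch; here is a minimal host-side driver sketch for shareMemoryVectorSum, assuming N = 1024 so that a single block of exactly N threads matches the kernel's expectations (the buffer names and the value of N are illustrative assumptions):

```cpp
#include <iostream>

int main() {
    const int N = 1024;  // must be a power of two and <= 1024 for this kernel
    float *d_input, *d_result;
    cudaMalloc((void **)&d_input, N * sizeof(float));
    cudaMalloc((void **)&d_result, sizeof(float));

    float h_input[N];
    for (int i = 0; i < N; ++i) h_input[i] = 1.0f;
    cudaMemcpy(d_input, h_input, N * sizeof(float), cudaMemcpyHostToDevice);

    // One block of exactly N threads, as the kernel assumes.
    shareMemoryVectorSum<<<1, N>>>(d_input, d_result, N);

    float h_result;
    cudaMemcpy(&h_result, d_result, sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "sum = " << h_result << std::endl;  // expect 1024

    cudaFree(d_input);
    cudaFree(d_result);
    return 0;
}
```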
Shared memory is organized into 32 banks, and the bank width depends on the device's compute capability. On compute capability 3.x devices a bank can be 8 bytes (64 bits) wide, so one access can serve 32 * 8 = 256 bytes across the banks at once (this is the width of a single access, not the total size of shared memory).
Note that when accessing shared memory, each bank can serve only one address per access; if several threads of a warp hit different addresses in the same bank, a bank conflict occurs and the accesses are serialized, reducing throughput. This is another reason the neighbored reduction above loses to the interleaved one: in the interleaved version the active threads access consecutive addresses, which map to different banks, so no conflicts arise.
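To make the bank mapping concrete, here is a small hypothetical sketch, assuming 4-byte-wide banks so that word i of shared memory maps to bank i % 32; launch it as a single warp, e.g. bankPatterns<<<1, 32>>>(d_out):

```cpp
// Hypothetical kernel contrasting a conflict-free and a conflicting pattern.
__global__ void bankPatterns(float *out) {
    __shared__ float data[32 * 32];
    int tid = threadIdx.x;  // assume a single warp of 32 threads
    for (int i = 0; i < 32; ++i)
        data[tid * 32 + i] = (float)tid;  // initialize the whole tile
    __syncthreads();

    // Conflict-free: thread i reads word i, and words 0..31 sit in 32
    // different banks, so the warp is served in one transaction.
    float a = data[tid];

    // 32-way bank conflict: thread i reads word i * 32; all of these words
    // map to bank 0, so the warp's 32 reads are serialized.
    float b = data[tid * 32];

    out[tid] = a + b;
}
```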
In practice there are further cases, for example a broadcast is triggered when multiple threads access the same word; see the official CUDA programming guide for the details.