/* Introduction to CUDA streams: copy data to the GPU in chunks and overlap
   the transfers with kernel execution. */
#include <stdio.h>
#include <stdlib.h> // rand() is used below; declare it explicitly
// Parenthesize object-like macros so arithmetic expansion (e.g. N/256, M*sizeof(int))
// groups as intended regardless of surrounding operators.
#define N (1024*1024) // size of each chunk transferred from CPU to GPU
#define M (N*10)      // total amount of data held on the CPU
/*测试设备是否支持边执行核函数边复制数据*/
/* Returns true when the currently selected device reports the
   deviceOverlap capability, i.e. it can execute a kernel while
   simultaneously performing a host<->device copy. */
bool support_overlap(){
    int dev;
    cudaGetDevice(&dev);
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, dev);
    return prop.deviceOverlap != 0;
}
/* Element-wise sum of one N-element chunk: c[i] = a[i] + b[i].
   Expects a 1-D launch; threads whose global index falls past N exit early. */
__global__ void add(int* a,int* b,int* c){
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= N) return;
    c[idx] = a[idx] + b[idx];
}
int main(){
    cudaEvent_t start, stop;
    float elapsedTime;
    cudaStream_t stream;                // stream used to overlap copies with kernel work
    int *a, *b, *c, *d_a, *d_b, *d_c;

    /* BUG FIX: the original wrote `!support_overlap` (the function POINTER,
       never null), so the capability check was dead code. Call the function. */
    if (!support_overlap()) {
        printf("Sorry,the device cannot support overlap.\n");
        return 0;
    }

    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    cudaStreamCreate(&stream);          // initialize the stream

    /* Device buffers hold one N-element chunk at a time. */
    cudaMalloc((void**)&d_a, N * sizeof(int));
    cudaMalloc((void**)&d_b, N * sizeof(int));
    cudaMalloc((void**)&d_c, N * sizeof(int));

    /* Allocate page-locked (pinned) host memory — required for truly
       asynchronous cudaMemcpyAsync transfers. */
    cudaHostAlloc((void**)&a, M * sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc((void**)&b, M * sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc((void**)&c, M * sizeof(int), cudaHostAllocDefault);

    /* Fill host memory with random values. */
    for (int i = 0; i < M; i++) {
        a[i] = rand();
        b[i] = rand();
    }

    /* Split the input into N-element chunks; for each chunk enqueue
       "copy to GPU" -> "compute" -> "copy back to CPU" on the stream. */
    for (int i = 0; i < M; i += N) {
        // Pinned host memory is copied to the device asynchronously.
        cudaMemcpyAsync(d_a, a + i, N * sizeof(int), cudaMemcpyHostToDevice, stream);
        cudaMemcpyAsync(d_b, b + i, N * sizeof(int), cudaMemcpyHostToDevice, stream);
        add<<<N / 256, 256, 0, stream>>>(d_a, d_b, d_c);
        /* BUG FIX: the original copied only sizeof(int) (4 bytes) back,
           discarding almost all kernel results. Copy the whole chunk. */
        cudaMemcpyAsync(c + i, d_c, N * sizeof(int), cudaMemcpyDeviceToHost, stream);
    }
    cudaStreamSynchronize(stream);      // wait until all queued work has finished

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf("%f", elapsedTime);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFreeHost(a);
    cudaFreeHost(b);
    /* BUG FIX: the original freed `b` twice (double free) and leaked `c`. */
    cudaFreeHost(c);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaStreamDestroy(stream);
    return 0;
}
// cuda — using streams
// (Blog-post footer; latest recommended article published 2024-02-01 21:51:38.)