定义流:
cudaStream_t stream;
创建流:
cudaStreamCreate(cudaStream_t *s)
销毁流:
cudaStreamDestroy(cudaStream_t s)
流的同步:显式同步
//同步单个流,等待该命令都完成
cudaStreamSynchronize(cudaStream_t stream);
//同步所有流,等待整个设备上的流都完成
cudaDeviceSynchronize();
//通过某个事件:让该流等待某个事件完成后,再执行流上后续的命令
cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
//查询一个流任务是否完成
cudaStreamQuery(cudaStream_t stream);
流的使用
在使用GPU完成任务时,有2个地方可以使用到流
以__global__定义的kernel函数可以交给流去执行
kernel<<<gridDim, blockDim, shared_memory_size, stream>>>(parameters);
GPU和CPU之间数据传输
cudaMemcpyAsync(dst, src, copy_size, copy_direction, stream);
程序运行截图如下:
代码如下:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
using namespace std;
// Element-wise addition kernel: each thread sums one pair of inputs.
// The output slot is selected by the thread's index within its block,
// so a launch with N threads covers elements [0, N).
// NOTE(review): no bounds guard — callers must launch no more threads
// than there are elements (here every launch uses exactly 1 thread).
__global__ void addKernel(int *c, int *a, int *b)
{
    const int idx = threadIdx.x;
    c[idx] = a[idx] + b[idx];
}
// Demonstrates per-kernel CUDA streams: 100 single-thread kernels, each
// enqueued on its own stream so the device may execute them concurrently.
//
// Fixes vs. the original version:
//  - the streams were created but never passed to the kernel launches
//    (4th <<<>>> argument), so everything ran on the default stream;
//  - the streams were never destroyed (resource leak);
//  - no CUDA call was error-checked and launch errors were never queried.
void testStream() {
    const int kCount = 100;                    // number of elements / streams
    const size_t kBytes = sizeof(int) * kCount;

    // Report a failed CUDA runtime call together with its name.
    auto check = [](cudaError_t e, const char *what) {
        if (e != cudaSuccess) {
            cout << what << " failed: " << cudaGetErrorString(e) << endl;
        }
    };

    int p_data_a[kCount];
    int p_data_b[kCount];
    int p_data_c[kCount];
    for (int i = 0; i < kCount; i++) {
        p_data_a[i] = i;
        p_data_b[i] = 10 + i;
        p_data_c[i] = 0;  // host result buffer starts zeroed
    }

    cout << "begin" << endl;
    for (int i = 0; i < kCount; i++) {
        cout << p_data_c[i] << " ";
    }
    cout << endl;

    int *dev_a = nullptr;
    int *dev_b = nullptr;
    int *dev_c = nullptr;
    check(cudaMalloc(&dev_a, kBytes), "cudaMalloc dev_a");
    check(cudaMalloc(&dev_b, kBytes), "cudaMalloc dev_b");
    check(cudaMalloc(&dev_c, kBytes), "cudaMalloc dev_c");

    // Copy the operands to device memory.
    check(cudaMemcpy(dev_a, p_data_a, kBytes, cudaMemcpyHostToDevice), "cudaMemcpy a");
    check(cudaMemcpy(dev_b, p_data_b, kBytes, cudaMemcpyHostToDevice), "cudaMemcpy b");

    cudaStream_t streams[kCount];
    for (int i = 0; i < kCount; ++i)
        check(cudaStreamCreate(&streams[i]), "cudaStreamCreate");

    // Launch one single-thread kernel per element; passing streams[i] as
    // the 4th launch parameter is what actually enqueues the kernel on
    // that stream instead of the default stream.
    for (int i = 0; i < kCount; ++i)
        addKernel<<<1, 1, 0, streams[i]>>>(dev_c + i, dev_a + i, dev_b + i);
    check(cudaGetLastError(), "addKernel launch");

    // Wait for every stream on the device before reading the results.
    check(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    check(cudaMemcpy(p_data_c, dev_c, kBytes, cudaMemcpyDeviceToHost), "cudaMemcpy c");

    cout << "end:" << endl;
    for (int i = 0; i < kCount; i++) {
        cout << p_data_c[i] << " ";
    }
    cout << endl;
    cout << "over" << endl;

    // Release the streams (leaked in the original) and device memory.
    for (int i = 0; i < kCount; ++i)
        check(cudaStreamDestroy(streams[i]), "cudaStreamDestroy");
    check(cudaFree(dev_a), "cudaFree dev_a");
    check(cudaFree(dev_b), "cudaFree dev_b");
    check(cudaFree(dev_c), "cudaFree dev_c");
}
// Entry point: run the stream demo, then block on a key press so the
// console window stays open when launched from an IDE.
int main() {
    testStream();
    (void)getchar();
    return 0;
}