异步传输函数cudaMemcpyAsync()需要与锁页内存分配函数cudaHostAlloc()(或cudaMallocHost())搭配使用:只有当传递给cudaMemcpyAsync()的主机端指针指向锁页内存(即页面被固定、不会被操作系统换出的内存)时,拷贝才能真正异步执行;如果传入的是普通可分页内存,拷贝可能退化为同步传输。cudaHostAlloc()申请的正是锁页内存。
// Demonstrates stream-ordered transfers: pinned host buffers are copied to the
// device with cudaMemcpyAsync / cudaMemcpy2DAsync on stream[0], async_kernel is
// launched on the same stream, and the result is copied back asynchronously.
//
// Names used here but defined outside this snippet: num_elements_, dev_result,
// Result, cudaStatus and async_kernel.
//
// Error handling: the first failing CUDA call is logged to stderr and recorded
// in cudaStatus; every later step is skipped so that no null or unallocated
// buffer is touched after a failure.
//
// NOTE(review): dev_a / host_a hold exactly element_count (1000) doubles while
// the kernel is launched with num_elements_ blocks and the result copy moves
// num_elements_ doubles -- confirm that num_elements_ <= 1000.

// Runs one CUDA runtime call unless an earlier step already failed.
#define ASYNC_DEMO_STEP(call)                                                 \
    do {                                                                      \
        if (cudaStatus == cudaSuccess) {                                      \
            cudaStatus = (call);                                              \
            if (cudaStatus != cudaSuccess) {                                  \
                fprintf(stderr, "%s failed: %s\n", #call,                     \
                        cudaGetErrorString(cudaStatus));                      \
            }                                                                 \
        }                                                                     \
    } while (0)

const size_t element_count = 1000;  // number of doubles in the 1-D array
const size_t chunk_count = 200;     // doubles moved by each asynchronous copy
size_t element_size = element_count * sizeof(double);  // bytes occupied by the 1-D array
size_t pitch = 0;  // row stride in bytes, filled in by cudaMallocPitch
int width = 3;     // 2-D array: elements per row
int height = 3;    // 2-D array: number of rows

// NOTE(review): blocks/grid are computed but the launch below uses
// <<<num_elements_, 1>>>. They are kept in case later code uses them; do not
// switch the launch configuration without checking how async_kernel indexes.
dim3 blocks(256);
dim3 grid((num_elements_ + blocks.x - 1) / blocks.x);

cudaStatus = cudaSuccess;

// Create the streams. Only stream[0] is used in this snippet.
cudaStream_t stream[2] = { 0, 0 };
for (int i = 0; i < 2; i++)
{
    ASYNC_DEMO_STEP(cudaStreamCreate(&stream[i]));
}

// 1-D array: device buffer plus page-locked (pinned) host buffer.
double* dev_a = NULL;
double* host_a = NULL;
ASYNC_DEMO_STEP(cudaMalloc(&dev_a, element_size));
ASYNC_DEMO_STEP(cudaHostAlloc(&host_a, element_size, cudaHostAllocDefault));
if (cudaStatus == cudaSuccess)
{
    // Fill exactly the number of elements that were allocated.
    for (size_t i = 0; i < element_count; i++)
    {
        host_a[i] = static_cast<double>(i);
    }
}

// 2-D array: pitched device buffer plus densely packed pinned host buffer.
double* dev_anb = NULL;
double* host_anb = NULL;
ASYNC_DEMO_STEP(cudaMallocPitch(&dev_anb, &pitch, width * sizeof(double), height));
ASYNC_DEMO_STEP(cudaHostAlloc(&host_anb, height * width * sizeof(double), cudaHostAllocDefault));
if (cudaStatus == cudaSuccess)
{
    // Placeholder contents so the upload below does not read uninitialized memory.
    for (int i = 0; i < height * width; i++)
    {
        host_anb[i] = static_cast<double>(i);
    }
}

// 1-D array: upload in chunks. Each chunk goes to the matching offset of the
// device buffer, and the last chunk is clamped so no copy runs past the end.
for (size_t offset = 0; offset < element_count; offset += chunk_count)
{
    const size_t remaining = element_count - offset;
    const size_t count = (remaining < chunk_count) ? remaining : chunk_count;
    ASYNC_DEMO_STEP(cudaMemcpyAsync(dev_a + offset, host_a + offset, count * sizeof(double), cudaMemcpyHostToDevice, stream[0]));
}

// 2-D array: the whole width x height matrix is uploaded once. The host rows
// are tightly packed (source pitch = width * sizeof(double)); the device rows
// use the pitch returned by cudaMallocPitch.
ASYNC_DEMO_STEP(cudaMemcpy2DAsync(dev_anb, pitch, host_anb, width * sizeof(double), width * sizeof(double), height, cudaMemcpyHostToDevice, stream[0]));

// Launch the kernel on the same stream so it runs after the uploads.
if (cudaStatus == cudaSuccess)
{
    async_kernel << < num_elements_, 1, 0, stream[0] >> > (dev_a, dev_anb, dev_result, width, height, pitch);
    // The original author notes that a failure here is usually caused by
    // mismatched buffer sizes / allocations.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "async_kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    }
}

// Wait for the device so that errors raised while the kernel runs are reported here.
if (cudaStatus == cudaSuccess)
{
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching async_kernel!\n", cudaStatus);
    }
}

// Copy the result back, then wait for stream[0] so Result is complete before
// the host reads it.
// NOTE(review): Result presumably needs to be pinned host memory for this copy
// to be truly asynchronous -- confirm how it is allocated.
ASYNC_DEMO_STEP(cudaMemcpyAsync(Result, dev_result, num_elements_ * sizeof(double), cudaMemcpyDeviceToHost, stream[0]));
ASYNC_DEMO_STEP(cudaStreamSynchronize(stream[0]));

#undef ASYNC_DEMO_STEP