一 流的定义
1。定义
流的深层理解:
- 比如说我们现在的任务是要打包1万个商品,并把商品搬到货车上。把线程看成worker工人,在没有流管理的时候,他们就是一起打包,并且等所有人都打包完成后,再所有人一起搬运货物。但是如果有流管理,我们可以把工人分成几部分人,一部分工人负责打包,另一部分负责搬运,这时候打包和搬运工作是可以同时进行的,这就是流并行
- 至于使用流是否可以提升效率,那还得看商品数和工人数,还有打包的难度
- 如果工人很多,打包很简单,那么不用流,这个工作也可以很快完成。但是,如果打包难度有差异,有些需要打包很久,有些人打包很快,那在等待其他工人完成打包工作的时候就会有闲置的工人,cuda运算效率这个时候就不高
2 判断自己电脑是否可以使用cuda流
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "curand.h"
#include "curand_kernel.h"
#include <stdio.h>
#include <iostream>
#include <time.h>
using namespace std;
int main()
{
cudaDeviceProp mprop;
cudaGetDeviceProperties(&mprop, 0);
if (!mprop.deviceOverlap)
{
cout << "not support\n" << endl;
}
else
{
cout << " support\n" << endl;
}
return 0;
}
3 流的定义,创建和销毁
两种方式
- 方式1
- 方式2
4 流的同步
- 什么时候要用到同步:流1个某个任务要用到流2的结果
- 时间例如,cudaEvent_t记时事件
5 流的使用
- 在kernel函数里面用的时候,如果不用共享内存,就用0代替即可
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "curand.h"
#include "curand_kernel.h"
#include <stdio.h>
#include <iostream>
#include <time.h>
using namespace std;
__global__ void addKernel(int* c, int* a, int* b)
{
int i = threadIdx.x;
c[i] = a[i] + b[i];
}
// 流并行
void myTestCalcStream(void)
{
int pDataA[100] = { 0 };
int pDataB[100] = { 0 };
int pDataC[100] = { 0 };
for (int i = 0; i < 100; i++) {
pDataA[i] = i;
pDataB[i] = 10 + i;
pDataC[i] = 0;
}
// 申请A、B、C的内存
int* pDevDataA = nullptr, * pDevDataB = nullptr, * pDevDataC = nullptr;
cudaMalloc(&pDevDataA, sizeof(int) * 100);
cudaMalloc(&pDevDataB, sizeof(int) * 100);
cudaMalloc(&pDevDataC, sizeof(int) * 100);
// 内存拷贝
cudaMemcpy(pDevDataA, pDataA, sizeof(int) * 100, cudaMemcpyHostToDevice);
cudaMemcpy(pDevDataB, pDataB, sizeof(int) * 100, cudaMemcpyHostToDevice);
cudaStream_t streams[100];
for (int i = 0; i < 100; ++i)
cudaStreamCreate(streams + i);
{
// 调用核函数并计时
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
for (int i = 0; i < 100; ++i)
addKernel << <1, 1, 0, streams[i] >> > (pDevDataC + i, pDevDataA + i, pDevDataB + i);
/*for (int i = 50; i < 100; ++i)
cudaStreamSynchronize(streams[i]);*/
cudaDeviceSynchronize();
/*cudaThreadSynchronize();*/
// 输出核函数调用时长
cudaEventRecord(stop, 0);
cudaEventSynchronize(start);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Kernel time(ms) : %f\n", elapsedTime);
}
{
// 调用核函数并计时
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
for (int i = 0; i < 100; ++i)
addKernel << <1, 1, 0 >> > (pDevDataC + i, pDevDataA + i, pDevDataB + i);
// 输出核函数调用时长
cudaEventRecord(stop, 0);
cudaEventSynchronize(start);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Kernel time(ms) : %f\n", elapsedTime);
}
cudaMemcpy(pDataC, pDevDataC, sizeof(int) * 100, cudaMemcpyDeviceToHost);
for (int i = 0; i < 100; ++i)
cudaStreamDestroy(streams[i]);
cudaFree(pDevDataA);
cudaFree(pDevDataB);
cudaFree(pDevDataC);
}
int main() {
myTestCalcStream();
return 0;
}
二 和cuda流配套使用的锁页式内存
流实现矩阵转置
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include "gputimer.h"
const int N = 512; // matrix size is NxN
const int K = 32; // tile size is KxK
// Utility functions: compare, print, and fill matrices
#define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)
template<typename T>
void check(T err, const char* const func, const char* const file, const int line)
{
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error at: %s : %d\n", file, line);
fprintf(stderr, "%s %s\n", cudaGetErrorString(err), func);;
exit(1);
}
}
int compare_matrices(float* gpu, float* ref)
{
int result = 0;
for (int j = 0; j < N; j++)
for (int i = 0; i < N; i++)
if (ref[i + j * N] != gpu[i + j * N])
{
// printf("reference(%d,%d) = %f but test(%d,%d) = %f\n",
// i,j,ref[i+j*N],i,j,test[i+j*N]);
result = 1;
}
return result;
}
void print_matrix(float* mat)
{
for (int j = 0; j < N; j++)
{
for (int i = 0; i < N; i++) { printf("%4.4g ", mat[i + j * N]); }
printf("\n");
}
}
// fill a matrix with sequential numbers in the range 0..N-1
void fill_matrix(float* mat)
{
for (int j = 0; j < N * N; j++)
mat[j] = (float)j;
}
void transpose_CPU(float in[], float out[])
{
for (int j = 0; j < N; j++)
for (int i = 0; i < N; i++)
out[j + i * N] = in[i + j * N]; // out(j,i) = in(i,j)
}
// to be launched on a single thread
__global__ void
transpose_serial(float in[], float out[])
{
for (int j = 0; j < N; j++)
for (int i = 0; i < N; i++)
out[j + i * N] = in[i + j * N]; // out(j,i) = in(i,j)
}
// to be launched with one thread per row of output matrix
__global__ void
transpose_parallel_per_row(float in[], float out[])
{
int i = threadIdx.x;
for (int j = 0; j < N; j++)
out[j + i * N] = in[i + j * N]; // out(j,i) = in(i,j)
}
// to be launched with one thread per element, in KxK threadblocks
// thread (x,y) in grid writes element (i,j) of output matrix
__global__ void
transpose_parallel_per_element(float in[], float out[])
{
int i = blockIdx.x * K + threadIdx.x;
int j = blockIdx.y * K + threadIdx.y;
out[j + i * N] = in[i + j * N]; // out(j,i) = in(i,j)
}
// to be launched with one thread per element, in (tilesize)x(tilesize) threadblocks
// thread blocks read & write tiles, in coalesced fashion
// adjacent threads read adjacent input elements, write adjacent output elmts
__global__ void
transpose_parallel_per_element_tiled(float in[], float out[])
{
// (i,j) locations of the tile corners for input & output matrices:
int in_corner_i = blockIdx.x * K, in_corner_j = blockIdx.y * K;
int out_corner_i = blockIdx.y * K, out_corner_j = blockIdx.x * K;
int x = threadIdx.x, y = threadIdx.y;
__shared__ float tile[K][K];
// coalesced read from global mem, TRANSPOSED write into shared mem:
tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y) * N];
__syncthreads();
// read from shared mem, coalesced write to global mem:
out[(out_corner_i + x) + (out_corner_j + y) * N] = tile[x][y];
}
// to be launched with one thread per element, in (tilesize)x(tilesize) threadblocks
// thread blocks read & write tiles, in coalesced fashion
// adjacent threads read adjacent input elements, write adjacent output elmts
__global__ void
transpose_parallel_per_element_tiled16(float in[], float out[])
{
// (i,j) locations of the tile corners for input & output matrices:
int in_corner_i = blockIdx.x * 16, in_corner_j = blockIdx.y * 16;
int out_corner_i = blockIdx.y * 16, out_corner_j = blockIdx.x * 16;
int x = threadIdx.x, y = threadIdx.y;
__shared__ float tile[16][16];
// coalesced read from global mem, TRANSPOSED write into shared mem:
tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y) * N];
__syncthreads();
// read from shared mem, coalesced write to global mem:
out[(out_corner_i + x) + (out_corner_j + y) * N] = tile[x][y];
}
// to be launched with one thread per element, in KxK threadblocks
// thread blocks read & write tiles, in coalesced fashion
// shared memory array padded to avoid bank conflicts
__global__ void
transpose_parallel_per_element_tiled_padded(float in[], float out[])
{
// (i,j) locations of the tile corners for input & output matrices:
int in_corner_i = blockIdx.x * K, in_corner_j = blockIdx.y * K;
int out_corner_i = blockIdx.y * K, out_corner_j = blockIdx.x * K;
int x = threadIdx.x, y = threadIdx.y;
__shared__ float tile[K][K + 1];
// coalesced read from global mem, TRANSPOSED write into shared mem:
tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y) * N];
__syncthreads();
// read from shared mem, coalesced write to global mem:
out[(out_corner_i + x) + (out_corner_j + y) * N] = tile[x][y];
}
// to be launched with one thread per element, in KxK threadblocks
// thread blocks read & write tiles, in coalesced fashion
// shared memory array padded to avoid bank conflicts
__global__ void
transpose_parallel_per_element_tiled_padded16(float in[], float out[])
{
// (i,j) locations of the tile corners for input & output matrices:
int in_corner_i = blockIdx.x * 16, in_corner_j = blockIdx.y * 16;
int out_corner_i = blockIdx.y * 16, out_corner_j = blockIdx.x * 16;
int x = threadIdx.x, y = threadIdx.y;
__shared__ float tile[16][16 + 1];
// coalesced read from global mem, TRANSPOSED write into shared mem:
tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y) * N];
__syncthreads();
// read from shared mem, coalesced write to global mem:
out[(out_corner_i + x) + (out_corner_j + y) * N] = tile[x][y];
}
__global__ void transpose_parallel_per_stream(float in[], float out[], int stream_id) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = stream_id;
out[j + i * N] = in[i + j * N]; // out(j,i) = in(i,j)
}
void transpose_with_stream(float* d_in, float* d_out, float* out, float* gold, int numbytes, int stream_num) {
GpuTimer timer;
cudaStream_t streams[N];
for (int i = 0; i < N; ++i)
cudaStreamCreate(streams + i);
timer.Start();
// 用流去进行单个矩阵转置
//一个流处理一行数据
for (int i = 0; i < stream_num; ++i)
transpose_parallel_per_stream << <N / 256, 256, 0, streams[i] >> > (d_in, d_out, i);
//这是 CUDA 的执行配置语法,用于配置内核函数的网格(grid)和块(block)。
//N / 256 表示网格中的块数(gridDim),即有2个块。
//256 表示每个块中的线程数(blockDim),即每个块有256个线程。是一维的
//0 是动态共享内存的大小,这里为0表示不使用动态共享内存。
//streams[i] 指定了内核函数在哪个 CUDA 流(stream)中执行,这里使用的是第 i 个流
cudaDeviceSynchronize();
timer.Stop();
cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
printf("transpose_with_streams: %g ms.\nVerifying transpose...%s\n",
timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");
for (int i = 0; i < N; ++i)
cudaStreamDestroy(streams[i]);
}
void run_streams(int stream_num, bool use_hostmalloc) {
// 当use_hostmalloc == 1, 调用锁页式内存,0时调用普通内存
int numbytes = N * N * sizeof(float);
float* in = (float*)malloc(numbytes);
float** out = new float* [stream_num]; //100个矩阵。所以用了二级指针
float* gold = (float*)malloc(numbytes);
fill_matrix(in);
transpose_CPU(in, gold);
//存储的是100个矩阵的GPU地址
float** d_in = new float* [stream_num], ** d_out = new float* [stream_num];
for (int i = 0; i < stream_num; i++)
{
if (use_hostmalloc) //使用锁页式内存
cudaHostAlloc((void**)&out[i], numbytes, cudaHostAllocDefault); //初始化锁页式内存
else
out[i] = (float*)malloc(numbytes);
cudaMalloc((void**)&(d_in[i]), numbytes);
cudaMalloc((void**)&(d_out[i]), numbytes);
}
dim3 blocks16x16(N / 16, N / 16); // blocks per grid
dim3 threads16x16(16, 16); // threads per block
GpuTimer timer;
cudaStream_t streams[N];
for (int i = 0; i < N; ++i)
cudaStreamCreate(streams + i);
timer.Start();
for (int i = 0; i < stream_num; ++i) {
cudaMemcpyAsync(d_in[i], in, numbytes, cudaMemcpyHostToDevice, streams[i]); //锁页式内存数据拷贝 cudaMemcpyAsync
transpose_parallel_per_element_tiled16 << <blocks16x16, threads16x16, 0, streams[i] >> > (d_in[i], d_out[i]);
cudaMemcpyAsync(out[i], d_out[i], numbytes, cudaMemcpyDeviceToHost, streams[i]);
}
cudaDeviceSynchronize();
timer.Stop();
for (int i = 0; i < stream_num; i++) {
//printf(" %s ", compare_matrices(out[i], gold) ? "Failed" : "Success");
}
printf("transpose_with_streams: %g ms.\nVerifying transpose...\n",
timer.Elapsed());
//用了锁页式内存,需要用cudaFreeHost销毁内存
for (int i = 0; i < stream_num; i++) {
cudaStreamDestroy(streams[i]);
if (use_hostmalloc) cudaFreeHost(out[i]);
cudaFree(d_out[i]);
cudaFree(d_in[i]);
}
//不用流
//timer.Start();
//for (int i = 0; i < stream_num; ++i) {
// cudaMemcpy(d_in[i], in, numbytes, cudaMemcpyHostToDevice);
// transpose_parallel_per_element_tiled16 << <blocks16x16, threads16x16, 0 >> > (d_in[i], d_out[i]);
// cudaMemcpy(out[i], d_out[i], numbytes, cudaMemcpyDeviceToHost);
//}
//cudaDeviceSynchronize();
//timer.Stop();
//for (int i = 0; i < stream_num; i++) {
// //printf(" %s ", compare_matrices(out[i], gold) ? "Failed" : "Success");
//}
//printf("transpose_with_nostreams: %g ms.\nVerifying transpose...\n",
// timer.Elapsed());
}
int main(int argc, char** argv)
{
int numbytes = N * N * sizeof(float);
float* in = (float*)malloc(numbytes);
float* out = (float*)malloc(numbytes);
float* gold = (float*)malloc(numbytes);
fill_matrix(in);
transpose_CPU(in, gold);
float* d_in, * d_out;
cudaMalloc(&d_in, numbytes);
cudaMalloc(&d_out, numbytes);
cudaMemcpy(d_in, in, numbytes, cudaMemcpyHostToDevice);
GpuTimer timer;
/*
* Now time each kernel and verify that it produces the correct result.
*
* To be really careful about benchmarking purposes, we should run every kernel once
* to "warm" the system and avoid any compilation or code-caching effects, then run
* every kernel 10 or 100 times and average the timings to smooth out any variance.
* But this makes for messy code and our goal is teaching, not detailed benchmarking.
*/
printf("-------------- 1. 只使用一个线程来进行转置运算,并CPU转置结果进行比较---------------\n");
timer.Start();
transpose_serial << <1, 1 >> > (d_in, d_out);
timer.Stop();
cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
printf("transpose_serial: %g ms.\nVerifying transpose...%s\n",
timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");
printf("-------------- 2. n个线程来进行转置运算,每个线程处理一行/一列数据的转置---------------\n");
timer.Start();
transpose_parallel_per_row << <1, N >> > (d_in, d_out);
timer.Stop();
cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
printf("transpose_parallel_per_row: %g ms.\nVerifying transpose...%s\n",
timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");
printf("-------------- 3. 用网格块来进行转置运算,N*N个元素,总共也是设置了这么多线程---------------\n");
dim3 blocks(N / K, N / K); // blocks per grid
dim3 threads(K, K); // threads per block
timer.Start();
transpose_parallel_per_element << <blocks, threads >> > (d_in, d_out);
timer.Stop();
cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
printf("transpose_parallel_per_element: %g ms.\nVerifying transpose...%s\n",
timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");
printf("-------------- 4. 用网格块来进行转置运算,N*N个元素,总共也是设置了这么多线程,但是和3的区别是用了共享内存---------------\n");
timer.Start();
transpose_parallel_per_element_tiled << <blocks, threads >> > (d_in, d_out);
timer.Stop();
cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
printf("transpose_parallel_per_element_tiled %dx%d: %g ms.\nVerifying ...%s\n",
K, K, timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");
printf("-------------- 5. 用网格块来进行转置运算,N*N个元素,总共也是设置了这么多线程,但是和4的区别是修改了块的大小---------------\n");
dim3 blocks16x16(N / 16, N / 16); // blocks per grid
dim3 threads16x16(16, 16); // threads per block
timer.Start();
transpose_parallel_per_element_tiled16 << <blocks16x16, threads16x16 >> > (d_in, d_out);
timer.Stop();
cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
printf("transpose_parallel_per_element_tiled 16x16: %g ms.\nVerifying ...%s\n",
timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");
printf("-------------- 6. 用网格块来进行转置运算,N*N个元素,总共也是设置了这么多线程,bank冲突---------------\n");
timer.Start();
transpose_parallel_per_element_tiled_padded16 << <blocks16x16, threads16x16 >> > (d_in, d_out);
timer.Stop();
cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
printf("transpose_parallel_per_element_tiled_padded 16x16: %g ms.\nVerifying...%s\n",
timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");
printf("-------------- 7. 用cuda流实现转置---------------\n");
// 使用cuda流去处理N次
transpose_with_stream(d_in, d_out, out, gold, numbytes, N);
printf("-------------- 8.使用锁页式内存或者普通内存进行处理实现转置---------------\n");
// 使用锁页式内存或者普通内存进行处理,对100个一样的矩阵同时进行转置
printf("\n 使用锁页式内存:\n");
run_streams(100, 1);
printf("\n 使用普通内存:\n");
run_streams(100, 0);
cudaFree(d_in);
cudaFree(d_out);
}
- 数据拷贝是有速度限制的,也就是线程之间会存在等待
- 但是数据计算没有速度限制
- 做同一个事情的时候,不推荐使用流,比如下面计算一个矩阵的转置,用流反而慢了
- 上面案例为什么1616个block处理的时间要快于3232个block:1个流处理器处理32个线程可以把整个延时隐藏起来,如果一个线程块有32个线程,可以把整个延时隐藏起来。 但是如果,设置了3232个block,但是实际上只有8个流处理器,1个流处理器是32个线程,那就需要跑4个循环; 1616个block则只需要跑,但是前提是GPU上的空间是有剩余的
流实现矩阵加法
// 矩阵对应位置相加
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "curand.h"
#include "curand_kernel.h"
#include <stdio.h>
#include <iostream>
using namespace std;
#include <stdio.h>
#include <math.h>
// 定义检测函数,检测cuda线程是否崩溃
static void HandleError(cudaError_t err, const char* file, int line)
{
if (err != cudaSuccess)
{
printf("%s in %s at line %d\n", cudaGetErrorString(err),
file, line);
exit(EXIT_FAILURE);
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
#define N (1024*1024)
#define FULL_DATA_SIZE N*20
__global__ void kernel(int* a, int* b, int* c)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int offset = gridDim.x * blockDim.x;
if (idx < N)
{
float as = a[idx];
float bs = b[idx];
c[idx] = (as + bs) / 2;
}
}
int main()
{
cudaDeviceProp prop;
int devID;
HANDLE_ERROR(cudaGetDevice(&devID));
HANDLE_ERROR(cudaGetDeviceProperties(&prop, devID));
// 判断设备是够支持流
if (!prop.deviceOverlap)
{
printf("No device will handle overlaps. so no speed up from stream.\n");
return 0;
}
// 事件计时
cudaEvent_t start, stop;
float elapsedTime;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start, 0));
cudaStream_t stream0;
cudaStream_t stream1;
HANDLE_ERROR(cudaStreamCreate(&stream0));
HANDLE_ERROR(cudaStreamCreate(&stream1));
int* host_a, * host_b, * host_c;
int* dev_a0, * dev_b0, * dev_c0;
int* dev_a1, * dev_b1, * dev_c1;
HANDLE_ERROR(cudaMalloc((void**)&dev_a0, N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_b0, N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_c0, N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_a1, N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_b1, N * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&dev_c1, N * sizeof(int)));
HANDLE_ERROR(cudaHostAlloc((void**)&host_a, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault));
HANDLE_ERROR(cudaHostAlloc((void**)&host_b, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault));
HANDLE_ERROR(cudaHostAlloc((void**)&host_c, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault));
for (int i = 0; i < FULL_DATA_SIZE; i++)
{
host_a[i] = rand();
host_b[i] = rand();
}
// 最上面一段代码为不使用流,下面两段代码为使用两个流
for (int i = 0; i < FULL_DATA_SIZE; i += 2 * N)
{
/* HANDLE_ERROR(cudaMemcpy(dev_a0, host_a + i, N * sizeof(int), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(dev_a1, host_a + i + N, N * sizeof(int), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(dev_b0, host_b + i, N * sizeof(int), cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(dev_a1, host_a + i + N, N * sizeof(int), cudaMemcpyHostToDevice));
kernel << <N / 256, 256, 0 >> > (dev_a0, dev_b0, dev_c0);
kernel << <N / 256, 256, 0 >> > (dev_a1, dev_b1, dev_c1);
HANDLE_ERROR(cudaMemcpy(host_c + i, dev_c0, N * sizeof(int), cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaMemcpy(host_c + i + N, dev_c1, N * sizeof(int), cudaMemcpyDeviceToHost));*/
// 下面两段代码虽然语句的顺序不一样,但是效果是相同的,
//-------------------------------------------------------------------------------------------------------
HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i, N * sizeof(int), cudaMemcpyHostToDevice, stream0));
HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N, N * sizeof(int), cudaMemcpyHostToDevice, stream1));
HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i, N * sizeof(int), cudaMemcpyHostToDevice, stream0));
HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N, N * sizeof(int), cudaMemcpyHostToDevice, stream1));
kernel << <N / 256, 256, 0, stream0 >> > (dev_a0, dev_b0, dev_c0);
kernel << <N / 256, 256, 0, stream1 >> > (dev_a1, dev_b1, dev_c1);
HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0, N * sizeof(int), cudaMemcpyDeviceToHost, stream0));
HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1, N * sizeof(int), cudaMemcpyDeviceToHost, stream1));
//-----------------------------------------------------------------------------------------------------
/*HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a+i, N*sizeof(int), cudaMemcpyHostToDevice, stream0));
HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b+i, N*sizeof(int), cudaMemcpyHostToDevice, stream0));
kernel<<<N/256, 256, 0, stream0>>>(dev_a0, dev_b0, dev_c0);
HANDLE_ERROR(cudaMemcpyAsync(host_c+i, dev_c0, N*sizeof(int), cudaMemcpyDeviceToHost, stream0));
HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a+i+N, N*sizeof(int), cudaMemcpyHostToDevice, stream1));
HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a+i+N, N*sizeof(int), cudaMemcpyHostToDevice, stream1));
kernel<<<N/256, 256, 0, stream1>>>(dev_a1, dev_b1, dev_c1);
HANDLE_ERROR(cudaMemcpyAsync(host_c+i+N, dev_c1, N*sizeof(int), cudaMemcpyDeviceToHost, stream1));*/
}
// 等待流执行完成
HANDLE_ERROR(cudaStreamSynchronize(stream0));
HANDLE_ERROR(cudaStreamSynchronize(stream1));
HANDLE_ERROR(cudaEventRecord(stop, 0));
HANDLE_ERROR(cudaEventSynchronize(stop));
HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
printf("Time taken: %3.1f ms\n", elapsedTime);
// 释放锁页式内存
HANDLE_ERROR(cudaFreeHost(host_a));
HANDLE_ERROR(cudaFreeHost(host_b));
HANDLE_ERROR(cudaFreeHost(host_c));
// 释放显存
HANDLE_ERROR(cudaFree(dev_a0));
HANDLE_ERROR(cudaFree(dev_b0));
HANDLE_ERROR(cudaFree(dev_c0));
HANDLE_ERROR(cudaFree(dev_a1));
HANDLE_ERROR(cudaFree(dev_b1));
HANDLE_ERROR(cudaFree(dev_c1));
// 销毁流
HANDLE_ERROR(cudaStreamDestroy(stream0));
HANDLE_ERROR(cudaStreamDestroy(stream1));
return 0;
}