cuda学习笔记6--cuda流和锁页式内存的调用

猪猪虾的业余生活

于 2024-08-15 19:38:16 发布

阅读量392

点赞数 3

分类专栏： cuda 文章标签：学习笔记

本文链接：https://blog.csdn.net/weixin_43528481/article/details/140107541

版权

cuda 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

一流的定义

1。定义

在这里插入图片描述

流的深层理解：

比如说我们现在的任务是要打包1万个商品，并把商品搬到货车上。把线程看成worker工人，在没有流管理的时候，他们就是一起打包，并且等所有人都打包完成后，再所有人一起搬运货物。但是如果有流管理，我们可以把工人分成几部分人，一部分工人负责打包，另一部分负责搬运，这时候打包和搬运工作是可以同时进行的，这就是流并行
至于使用流是否可以提升效率，那还得看商品数和工人数，还有打包的难度
如果工人很多，打包很简单，那么不用流，这个工作也可以很快完成。但是，如果打包难度有差异，有些需要打包很久，有些人打包很快，那在等待其他工人完成打包工作的时候就会有闲置的工人，cuda运算效率这个时候就不高

2 判断自己电脑是否可以使用cuda流

在这里插入图片描述


#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "curand.h"
#include "curand_kernel.h"
#include <stdio.h>
#include <iostream>
#include <time.h>

using namespace std;

int main()
{
	cudaDeviceProp mprop;
	cudaGetDeviceProperties(&mprop, 0);
	if (!mprop.deviceOverlap)
	{
		cout << "not support\n" << endl;
	}
	else
	{
		cout << " support\n" << endl;
	}
	return 0;
}

3 流的定义，创建和销毁

两种方式

方式1
方式2

4 流的同步

什么时候要用到同步：流1个某个任务要用到流2的结果
时间例如，cudaEvent_t记时事件

5 流的使用

在kernel函数里面用的时候，如果不用共享内存，就用0代替即可

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "curand.h"
#include "curand_kernel.h"
#include <stdio.h>
#include <iostream>
#include <time.h>
using namespace std;


__global__ void addKernel(int* c, int* a, int* b)
{
	int i = threadIdx.x;
	c[i] = a[i] + b[i];
}

// 流并行
void myTestCalcStream(void)
{
	int pDataA[100] = { 0 };
	int pDataB[100] = { 0 };
	int pDataC[100] = { 0 };
	for (int i = 0; i < 100; i++) {
		pDataA[i] = i;
		pDataB[i] = 10 + i;
		pDataC[i] = 0;
	}

	// 申请A、B、C的内存
	int* pDevDataA = nullptr, * pDevDataB = nullptr, * pDevDataC = nullptr;
	cudaMalloc(&pDevDataA, sizeof(int) * 100);
	cudaMalloc(&pDevDataB, sizeof(int) * 100);
	cudaMalloc(&pDevDataC, sizeof(int) * 100);

	// 内存拷贝
	cudaMemcpy(pDevDataA, pDataA, sizeof(int) * 100, cudaMemcpyHostToDevice);
	cudaMemcpy(pDevDataB, pDataB, sizeof(int) * 100, cudaMemcpyHostToDevice);

	cudaStream_t streams[100];
	for (int i = 0; i < 100; ++i)
		cudaStreamCreate(streams + i);
	{
		// 调用核函数并计时
		cudaEvent_t start, stop;
		cudaEventCreate(&start);
		cudaEventCreate(&stop);
		cudaEventRecord(start, 0);
		for (int i = 0; i < 100; ++i)
			addKernel << <1, 1, 0, streams[i] >> > (pDevDataC + i, pDevDataA + i, pDevDataB + i);
		/*for (int i = 50; i < 100; ++i)
			cudaStreamSynchronize(streams[i]);*/
		cudaDeviceSynchronize();
		/*cudaThreadSynchronize();*/
		// 输出核函数调用时长
		cudaEventRecord(stop, 0);
		cudaEventSynchronize(start);
		cudaEventSynchronize(stop);
		float elapsedTime;
		cudaEventElapsedTime(&elapsedTime, start, stop);
		printf("Kernel time(ms) : %f\n", elapsedTime);
	}

	{
		// 调用核函数并计时
		cudaEvent_t start, stop;
		cudaEventCreate(&start);
		cudaEventCreate(&stop);
		cudaEventRecord(start, 0);
		for (int i = 0; i < 100; ++i)
			addKernel << <1, 1, 0 >> > (pDevDataC + i, pDevDataA + i, pDevDataB + i);

		// 输出核函数调用时长
		cudaEventRecord(stop, 0);
		cudaEventSynchronize(start);
		cudaEventSynchronize(stop);
		float elapsedTime;
		cudaEventElapsedTime(&elapsedTime, start, stop);
		printf("Kernel time(ms) : %f\n", elapsedTime);
	}


	cudaMemcpy(pDataC, pDevDataC, sizeof(int) * 100, cudaMemcpyDeviceToHost);


	for (int i = 0; i < 100; ++i)
		cudaStreamDestroy(streams[i]);

	cudaFree(pDevDataA);
	cudaFree(pDevDataB);
	cudaFree(pDevDataC);
}

int main() {
	myTestCalcStream();
	return 0;
}

二和cuda流配套使用的锁页式内存

在这里插入图片描述

流实现矩阵转置

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include "gputimer.h"

const int N = 512;		// matrix size is NxN
const int K = 32;				// tile size is KxK

// Utility functions: compare, print, and fill matrices
#define checkCudaErrors(val) check( (val), #val, __FILE__, __LINE__)

template<typename T>
void check(T err, const char* const func, const char* const file, const int line)
{
	if (err != cudaSuccess) {
		fprintf(stderr, "CUDA error at: %s : %d\n", file, line);
		fprintf(stderr, "%s %s\n", cudaGetErrorString(err), func);;
		exit(1);
	}
}

int compare_matrices(float* gpu, float* ref)
{
	int result = 0;

	for (int j = 0; j < N; j++)
		for (int i = 0; i < N; i++)
			if (ref[i + j * N] != gpu[i + j * N])
			{
				// printf("reference(%d,%d) = %f but test(%d,%d) = %f\n",
				// i,j,ref[i+j*N],i,j,test[i+j*N]);
				result = 1;
			}
	return result;
}

void print_matrix(float* mat)
{
	for (int j = 0; j < N; j++)
	{
		for (int i = 0; i < N; i++) { printf("%4.4g ", mat[i + j * N]); }
		printf("\n");
	}
}

// fill a matrix with sequential numbers in the range 0..N-1
void fill_matrix(float* mat)
{
	for (int j = 0; j < N * N; j++)
		mat[j] = (float)j;
}



void transpose_CPU(float in[], float out[])
{
	for (int j = 0; j < N; j++)
		for (int i = 0; i < N; i++)
			out[j + i * N] = in[i + j * N]; // out(j,i) = in(i,j)
}

// to be launched on a single thread
__global__ void 
transpose_serial(float in[], float out[])
{
	for (int j = 0; j < N; j++)
		for (int i = 0; i < N; i++)
			out[j + i * N] = in[i + j * N]; // out(j,i) = in(i,j)
}

// to be launched with one thread per row of output matrix
__global__ void
transpose_parallel_per_row(float in[], float out[])
{
	int i = threadIdx.x;

	for (int j = 0; j < N; j++)
		out[j + i * N] = in[i + j * N]; // out(j,i) = in(i,j)
}

// to be launched with one thread per element, in KxK threadblocks
// thread (x,y) in grid writes element (i,j) of output matrix 
__global__ void
transpose_parallel_per_element(float in[], float out[])
{
	int i = blockIdx.x * K + threadIdx.x;
	int j = blockIdx.y * K + threadIdx.y;

	out[j + i * N] = in[i + j * N]; // out(j,i) = in(i,j)
}

// to be launched with one thread per element, in (tilesize)x(tilesize) threadblocks
// thread blocks read & write tiles, in coalesced fashion
// adjacent threads read adjacent input elements, write adjacent output elmts
__global__ void
transpose_parallel_per_element_tiled(float in[], float out[])
{
	// (i,j) locations of the tile corners for input & output matrices:
	int in_corner_i = blockIdx.x * K, in_corner_j = blockIdx.y * K;
	int out_corner_i = blockIdx.y * K, out_corner_j = blockIdx.x * K;

	int x = threadIdx.x, y = threadIdx.y;

	__shared__ float tile[K][K];

	// coalesced read from global mem, TRANSPOSED write into shared mem:
	tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y) * N];
	__syncthreads();
	// read from shared mem, coalesced write to global mem:
	out[(out_corner_i + x) + (out_corner_j + y) * N] = tile[x][y];
}

// to be launched with one thread per element, in (tilesize)x(tilesize) threadblocks
// thread blocks read & write tiles, in coalesced fashion
// adjacent threads read adjacent input elements, write adjacent output elmts
__global__ void
transpose_parallel_per_element_tiled16(float in[], float out[])
{
	// (i,j) locations of the tile corners for input & output matrices:
	int in_corner_i = blockIdx.x * 16, in_corner_j = blockIdx.y * 16;
	int out_corner_i = blockIdx.y * 16, out_corner_j = blockIdx.x * 16;

	int x = threadIdx.x, y = threadIdx.y;

	__shared__ float tile[16][16];

	// coalesced read from global mem, TRANSPOSED write into shared mem:
	tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y) * N];
	__syncthreads();
	// read from shared mem, coalesced write to global mem:
	out[(out_corner_i + x) + (out_corner_j + y) * N] = tile[x][y];
}

// to be launched with one thread per element, in KxK threadblocks
// thread blocks read & write tiles, in coalesced fashion
// shared memory array padded to avoid bank conflicts
__global__ void
transpose_parallel_per_element_tiled_padded(float in[], float out[])
{
	// (i,j) locations of the tile corners for input & output matrices:
	int in_corner_i = blockIdx.x * K, in_corner_j = blockIdx.y * K;
	int out_corner_i = blockIdx.y * K, out_corner_j = blockIdx.x * K;

	int x = threadIdx.x, y = threadIdx.y;

	__shared__ float tile[K][K + 1];

	// coalesced read from global mem, TRANSPOSED write into shared mem:
	tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y) * N];
	__syncthreads();
	// read from shared mem, coalesced write to global mem:
	out[(out_corner_i + x) + (out_corner_j + y) * N] = tile[x][y];
}

// to be launched with one thread per element, in KxK threadblocks
// thread blocks read & write tiles, in coalesced fashion
// shared memory array padded to avoid bank conflicts
__global__ void
transpose_parallel_per_element_tiled_padded16(float in[], float out[])
{
	// (i,j) locations of the tile corners for input & output matrices:
	int in_corner_i = blockIdx.x * 16, in_corner_j = blockIdx.y * 16;
	int out_corner_i = blockIdx.y * 16, out_corner_j = blockIdx.x * 16;

	int x = threadIdx.x, y = threadIdx.y;

	__shared__ float tile[16][16 + 1];

	// coalesced read from global mem, TRANSPOSED write into shared mem:
	tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y) * N];
	__syncthreads();
	// read from shared mem, coalesced write to global mem:
	out[(out_corner_i + x) + (out_corner_j + y) * N] = tile[x][y];
}

__global__ void transpose_parallel_per_stream(float in[], float out[], int stream_id) {
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	int j = stream_id;

	out[j + i * N] = in[i + j * N]; // out(j,i) = in(i,j)
}

void transpose_with_stream(float* d_in, float* d_out, float* out, float* gold, int numbytes, int stream_num) {
	GpuTimer timer;
	cudaStream_t streams[N];
	for (int i = 0; i < N; ++i)
		cudaStreamCreate(streams + i);

	timer.Start();
	// 用流去进行单个矩阵转置
	//一个流处理一行数据
	for (int i = 0; i < stream_num; ++i)
		transpose_parallel_per_stream << <N / 256, 256, 0, streams[i] >> > (d_in, d_out, i);
		//这是 CUDA 的执行配置语法，用于配置内核函数的网格（grid）和块（block）。
		//N / 256 表示网格中的块数（gridDim），即有2个块。
		//256 表示每个块中的线程数（blockDim），即每个块有256个线程。是一维的
		//0 是动态共享内存的大小，这里为0表示不使用动态共享内存。
		//streams[i] 指定了内核函数在哪个 CUDA 流（stream）中执行，这里使用的是第 i 个流
	cudaDeviceSynchronize();
	timer.Stop();
	cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
	printf("transpose_with_streams: %g ms.\nVerifying transpose...%s\n",
		timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");


	for (int i = 0; i < N; ++i)
		cudaStreamDestroy(streams[i]);
}

void run_streams(int stream_num, bool use_hostmalloc) {
	// 当use_hostmalloc == 1， 调用锁页式内存，0时调用普通内存

	int numbytes = N * N * sizeof(float);

	float* in = (float*)malloc(numbytes);
	float** out = new float* [stream_num];  //100个矩阵。所以用了二级指针

	float* gold = (float*)malloc(numbytes);

	fill_matrix(in);
	transpose_CPU(in, gold);

	//存储的是100个矩阵的GPU地址
	float** d_in = new float* [stream_num], ** d_out = new float* [stream_num];

	for (int i = 0; i < stream_num; i++)
	{
		if (use_hostmalloc)	 //使用锁页式内存	
			cudaHostAlloc((void**)&out[i], numbytes, cudaHostAllocDefault);  //初始化锁页式内存	
		else 
			out[i] = (float*)malloc(numbytes);

		cudaMalloc((void**)&(d_in[i]), numbytes);
		cudaMalloc((void**)&(d_out[i]), numbytes);
	}

	dim3 blocks16x16(N / 16, N / 16); // blocks per grid
	dim3 threads16x16(16, 16);	 // threads per block

	GpuTimer timer;
	cudaStream_t streams[N];
	for (int i = 0; i < N; ++i)
		cudaStreamCreate(streams + i);

	timer.Start();
	for (int i = 0; i < stream_num; ++i) {
		cudaMemcpyAsync(d_in[i], in, numbytes, cudaMemcpyHostToDevice, streams[i]); //锁页式内存数据拷贝 cudaMemcpyAsync
		transpose_parallel_per_element_tiled16 << <blocks16x16, threads16x16, 0, streams[i] >> > (d_in[i], d_out[i]);
		cudaMemcpyAsync(out[i], d_out[i], numbytes, cudaMemcpyDeviceToHost, streams[i]);
	}
	cudaDeviceSynchronize();
	timer.Stop();
	for (int i = 0; i < stream_num; i++) {
		//printf(" %s ", compare_matrices(out[i], gold) ? "Failed" : "Success");
	}

	printf("transpose_with_streams: %g ms.\nVerifying transpose...\n",
		timer.Elapsed());


	//用了锁页式内存,需要用cudaFreeHost销毁内存
	for (int i = 0; i < stream_num; i++) {
		cudaStreamDestroy(streams[i]);
		if (use_hostmalloc)	cudaFreeHost(out[i]);
		cudaFree(d_out[i]);
		cudaFree(d_in[i]);
	}


	//不用流
	//timer.Start();
	//for (int i = 0; i < stream_num; ++i) {
	//	cudaMemcpy(d_in[i], in, numbytes, cudaMemcpyHostToDevice);
	//	transpose_parallel_per_element_tiled16 << <blocks16x16, threads16x16, 0 >> > (d_in[i], d_out[i]);
	//	cudaMemcpy(out[i], d_out[i], numbytes, cudaMemcpyDeviceToHost);
	//}
	//cudaDeviceSynchronize();
	//timer.Stop();
	//for (int i = 0; i < stream_num; i++) {
	//	//printf(" %s ", compare_matrices(out[i], gold) ? "Failed" : "Success");
	//}

	//printf("transpose_with_nostreams: %g ms.\nVerifying transpose...\n",
	//	timer.Elapsed());




}

int main(int argc, char** argv)
{
	int numbytes = N * N * sizeof(float);

	float* in = (float*)malloc(numbytes);
	float* out = (float*)malloc(numbytes);
	float* gold = (float*)malloc(numbytes);

	fill_matrix(in);
	transpose_CPU(in, gold);

	float* d_in, * d_out;

	cudaMalloc(&d_in, numbytes);
	cudaMalloc(&d_out, numbytes);
	cudaMemcpy(d_in, in, numbytes, cudaMemcpyHostToDevice);

	GpuTimer timer;

	/*
	 * Now time each kernel and verify that it produces the correct result.
	 *
	 * To be really careful about benchmarking purposes, we should run every kernel once
	 * to "warm" the system and avoid any compilation or code-caching effects, then run
	 * every kernel 10 or 100 times and average the timings to smooth out any variance.
	 * But this makes for messy code and our goal is teaching, not detailed benchmarking.
	 */

	
	printf("-------------- 1. 只使用一个线程来进行转置运算，并CPU转置结果进行比较---------------\n");
	timer.Start();
	transpose_serial << <1, 1 >> > (d_in, d_out);
	timer.Stop();
	cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
	printf("transpose_serial: %g ms.\nVerifying transpose...%s\n",
		timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");

	printf("-------------- 2. n个线程来进行转置运算，每个线程处理一行/一列数据的转置---------------\n");
	timer.Start();
	transpose_parallel_per_row << <1, N >> > (d_in, d_out);
	timer.Stop();
	cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
	printf("transpose_parallel_per_row: %g ms.\nVerifying transpose...%s\n",
		timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");


	printf("-------------- 3. 用网格块来进行转置运算，N*N个元素，总共也是设置了这么多线程---------------\n");
	dim3 blocks(N / K, N / K); // blocks per grid
	dim3 threads(K, K);	// threads per block
	timer.Start();
	transpose_parallel_per_element << <blocks, threads >> > (d_in, d_out);
	timer.Stop();
	cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
	printf("transpose_parallel_per_element: %g ms.\nVerifying transpose...%s\n",
		timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");

	printf("-------------- 4. 用网格块来进行转置运算，N*N个元素，总共也是设置了这么多线程，但是和3的区别是用了共享内存---------------\n");
	timer.Start();
	transpose_parallel_per_element_tiled << <blocks, threads >> > (d_in, d_out);
	timer.Stop();
	cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
	printf("transpose_parallel_per_element_tiled %dx%d: %g ms.\nVerifying ...%s\n",
		K, K, timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");


	printf("-------------- 5. 用网格块来进行转置运算，N*N个元素，总共也是设置了这么多线程，但是和4的区别是修改了块的大小---------------\n");
	dim3 blocks16x16(N / 16, N / 16); // blocks per grid
	dim3 threads16x16(16, 16);	 // threads per block

	timer.Start();
	transpose_parallel_per_element_tiled16 << <blocks16x16, threads16x16 >> > (d_in, d_out);
	timer.Stop();
	cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
	printf("transpose_parallel_per_element_tiled 16x16: %g ms.\nVerifying ...%s\n",
		timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");

	printf("-------------- 6. 用网格块来进行转置运算，N*N个元素，总共也是设置了这么多线程，bank冲突---------------\n");
	timer.Start();
	transpose_parallel_per_element_tiled_padded16 << <blocks16x16, threads16x16 >> > (d_in, d_out);
	timer.Stop();
	cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost);
	printf("transpose_parallel_per_element_tiled_padded 16x16: %g ms.\nVerifying...%s\n",
		timer.Elapsed(), compare_matrices(out, gold) ? "Failed" : "Success");


	printf("-------------- 7. 用cuda流实现转置---------------\n");
	// 使用cuda流去处理N次
	transpose_with_stream(d_in, d_out, out, gold, numbytes, N);

	printf("-------------- 8.使用锁页式内存或者普通内存进行处理实现转置---------------\n");
	// 使用锁页式内存或者普通内存进行处理，对100个一样的矩阵同时进行转置
	printf("\n 使用锁页式内存:\n");
	run_streams(100, 1);

	printf("\n 使用普通内存:\n");
	run_streams(100, 0);

	cudaFree(d_in);
	cudaFree(d_out);
}

在这里插入图片描述

数据拷贝是有速度限制的，也就是线程之间会存在等待
但是数据计算没有速度限制
做同一个事情的时候，不推荐使用流，比如下面计算一个矩阵的转置，用流反而慢了
上面案例为什么1616个block处理的时间要快于3232个block:1个流处理器处理32个线程可以把整个延时隐藏起来，如果一个线程块有32个线程，可以把整个延时隐藏起来。但是如果，设置了3232个block，但是实际上只有8个流处理器,1个流处理器是32个线程，那就需要跑4个循环； 1616个block则只需要跑，但是前提是GPU上的空间是有剩余的

流实现矩阵加法

// 矩阵对应位置相加
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "curand.h"
#include "curand_kernel.h"
#include <stdio.h>
#include <iostream>

using namespace std;


#include <stdio.h>
#include <math.h>

// 定义检测函数，检测cuda线程是否崩溃
static void HandleError(cudaError_t err, const char* file, int line)
{
	if (err != cudaSuccess)
	{
		printf("%s in %s at line %d\n", cudaGetErrorString(err),
			file, line);
		exit(EXIT_FAILURE);
	}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
#define N (1024*1024)
#define FULL_DATA_SIZE N*20

__global__ void kernel(int* a, int* b, int* c)
{
	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	int offset = gridDim.x * blockDim.x;

	if (idx < N)
	{
		float as = a[idx];
		float bs = b[idx];
		c[idx] = (as + bs) / 2;
	}
}

int main()
{
	cudaDeviceProp prop;
	int devID;

	HANDLE_ERROR(cudaGetDevice(&devID));
	HANDLE_ERROR(cudaGetDeviceProperties(&prop, devID));
	// 判断设备是够支持流
	if (!prop.deviceOverlap)
	{
		printf("No device will handle overlaps. so no speed up from stream.\n");
		return 0;
	}

	// 事件计时
	cudaEvent_t start, stop;
	float elapsedTime;
	HANDLE_ERROR(cudaEventCreate(&start));
	HANDLE_ERROR(cudaEventCreate(&stop));
	HANDLE_ERROR(cudaEventRecord(start, 0));

	cudaStream_t stream0;
	cudaStream_t stream1;
	HANDLE_ERROR(cudaStreamCreate(&stream0));
	HANDLE_ERROR(cudaStreamCreate(&stream1));

	int* host_a, * host_b, * host_c;
	int* dev_a0, * dev_b0, * dev_c0;
	int* dev_a1, * dev_b1, * dev_c1;

	HANDLE_ERROR(cudaMalloc((void**)&dev_a0, N * sizeof(int)));
	HANDLE_ERROR(cudaMalloc((void**)&dev_b0, N * sizeof(int)));
	HANDLE_ERROR(cudaMalloc((void**)&dev_c0, N * sizeof(int)));

	HANDLE_ERROR(cudaMalloc((void**)&dev_a1, N * sizeof(int)));
	HANDLE_ERROR(cudaMalloc((void**)&dev_b1, N * sizeof(int)));
	HANDLE_ERROR(cudaMalloc((void**)&dev_c1, N * sizeof(int)));

	HANDLE_ERROR(cudaHostAlloc((void**)&host_a, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault));
	HANDLE_ERROR(cudaHostAlloc((void**)&host_b, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault));
	HANDLE_ERROR(cudaHostAlloc((void**)&host_c, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault));


	for (int i = 0; i < FULL_DATA_SIZE; i++)
	{
		host_a[i] = rand();
		host_b[i] = rand();
	}


	// 最上面一段代码为不使用流，下面两段代码为使用两个流
	for (int i = 0; i < FULL_DATA_SIZE; i += 2 * N)
	{
	/*	HANDLE_ERROR(cudaMemcpy(dev_a0, host_a + i, N * sizeof(int), cudaMemcpyHostToDevice));
		HANDLE_ERROR(cudaMemcpy(dev_a1, host_a + i + N, N * sizeof(int), cudaMemcpyHostToDevice));

		HANDLE_ERROR(cudaMemcpy(dev_b0, host_b + i, N * sizeof(int), cudaMemcpyHostToDevice));
		HANDLE_ERROR(cudaMemcpy(dev_a1, host_a + i + N, N * sizeof(int), cudaMemcpyHostToDevice));

		kernel << <N / 256, 256, 0 >> > (dev_a0, dev_b0, dev_c0);
		kernel << <N / 256, 256, 0 >> > (dev_a1, dev_b1, dev_c1);

		HANDLE_ERROR(cudaMemcpy(host_c + i, dev_c0, N * sizeof(int), cudaMemcpyDeviceToHost));
		HANDLE_ERROR(cudaMemcpy(host_c + i + N, dev_c1, N * sizeof(int), cudaMemcpyDeviceToHost));*/

		// 下面两段代码虽然语句的顺序不一样，但是效果是相同的，
		//-------------------------------------------------------------------------------------------------------
		HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i, N * sizeof(int), cudaMemcpyHostToDevice, stream0));
		HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N, N * sizeof(int), cudaMemcpyHostToDevice, stream1));

		HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i, N * sizeof(int), cudaMemcpyHostToDevice, stream0));
		HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N, N * sizeof(int), cudaMemcpyHostToDevice, stream1));

		kernel << <N / 256, 256, 0, stream0 >> > (dev_a0, dev_b0, dev_c0);
		kernel << <N / 256, 256, 0, stream1 >> > (dev_a1, dev_b1, dev_c1);

		HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0, N * sizeof(int), cudaMemcpyDeviceToHost, stream0));
		HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1, N * sizeof(int), cudaMemcpyDeviceToHost, stream1));

		//-----------------------------------------------------------------------------------------------------
		/*HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a+i, N*sizeof(int), cudaMemcpyHostToDevice, stream0));
		HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b+i, N*sizeof(int), cudaMemcpyHostToDevice, stream0));
		kernel<<<N/256, 256, 0, stream0>>>(dev_a0, dev_b0, dev_c0);
		HANDLE_ERROR(cudaMemcpyAsync(host_c+i, dev_c0, N*sizeof(int), cudaMemcpyDeviceToHost, stream0));

		HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a+i+N, N*sizeof(int), cudaMemcpyHostToDevice, stream1));
		HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a+i+N, N*sizeof(int), cudaMemcpyHostToDevice, stream1));
		kernel<<<N/256, 256, 0, stream1>>>(dev_a1, dev_b1, dev_c1);
		HANDLE_ERROR(cudaMemcpyAsync(host_c+i+N, dev_c1, N*sizeof(int), cudaMemcpyDeviceToHost, stream1));*/
	}

	// 等待流执行完成
	HANDLE_ERROR(cudaStreamSynchronize(stream0));
	HANDLE_ERROR(cudaStreamSynchronize(stream1));

	HANDLE_ERROR(cudaEventRecord(stop, 0));
	HANDLE_ERROR(cudaEventSynchronize(stop));
	HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));

	printf("Time taken: %3.1f ms\n", elapsedTime);

	// 释放锁页式内存
	HANDLE_ERROR(cudaFreeHost(host_a));
	HANDLE_ERROR(cudaFreeHost(host_b));
	HANDLE_ERROR(cudaFreeHost(host_c));
	// 释放显存
	HANDLE_ERROR(cudaFree(dev_a0));
	HANDLE_ERROR(cudaFree(dev_b0));
	HANDLE_ERROR(cudaFree(dev_c0));
	HANDLE_ERROR(cudaFree(dev_a1));
	HANDLE_ERROR(cudaFree(dev_b1));
	HANDLE_ERROR(cudaFree(dev_c1));
	// 销毁流
	HANDLE_ERROR(cudaStreamDestroy(stream0));
	HANDLE_ERROR(cudaStreamDestroy(stream1));
	return 0;
}