一定要遵循以下的顺序:
创建流:cudaStreamCreate(),
主机和设备数据拷贝:cudaMemcpyAsync(),
流的同步:cudaStreamSynchronize(),
流的销毁:cudaStreamDestroy()。
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cuda.h>
// Element count used as the upper bound in the kernel's index guard (idx < N).
#define N (1024*1024)
// Defined as 20 * N; presumably the total data set size processed in
// N-sized pieces — its use is outside this view, confirm against the host code.
#define FULL_DATA_SIZE (N*20)
__global__ void kernel(int*a,int *b,int*c)
{
int idx=threadIdx.x+blockIdx.x*blockDim.x;
if(idx<N){
int idx1 = (idx + 1) % 256;
int idx2 = (idx + 2) % 256;
float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
float bs = (b[idx] + b[idx1] +