文件名为main.cu
#include<stdio.h>
#include<iostream>
// 两个向量加法kernel,grid和block均为一维
__global__ void add(float* x, float * y, float* z, int n)
{
// 获取全局索引
int index = threadIdx.x + blockIdx.x * blockDim.x;
// 步长
int stride = blockDim.x * gridDim.x;
for (int i = index; i < n; i += stride)
{
z[i] = x[i] + y[i];
}
}
int main()
{
int N = 1 << 10;
int nBytes = N * sizeof(float);
// 申请host内存
float *x, *y, *z;
x = (float*)malloc(nBytes);
y = (float*)malloc(nBytes);
z = (float*)malloc(nBytes);
// 初始化数据
for (int i = 0; i < N; ++i)
{
x[i] = 10.0;
y[i] = 20.0;
}
// 申请device内存
float *d_x, *d_y, *d_z;
cudaMalloc((void**)&d_x, nBytes);
cudaMalloc((void**)&d_y, nBytes);
cudaMalloc((void**)&d_z, nBytes);
// 将host数据拷贝到device
cuda