#include<stdlib.h>
#include<stdio.h>
#include<string.h>
#include<math.h>
#include<cutil.h>
#define N 4
/***** Purpose: copy two vectors from the host to the device,
add them on the device, then copy the result back to the host. */
// Device code.
//
// Element-wise vector addition: C[i] = A[i] + B[i] for every i in [0, N).
//
// Parameters:
//   A, B - input vectors in device memory, each holding at least N floats.
//   C    - output vector in device memory, holding at least N floats.
//
// Launch layout: 1-D grid of 1-D blocks whose total thread count is >= N.
// Each thread handles at most one element.
__global__ void VecAdd(float* A, float* B, float* C)
{
    // Global index across the whole grid. Using threadIdx.x alone would make
    // every block recompute the same leading elements and leave the rest of
    // the vector untouched as soon as more than one block is launched.
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Skip surplus threads in the tail of the grid.
    if (i < N)
    {
        C[i] = A[i] + B[i];
    }
}
// Host code.

// Aborts the program with a diagnostic if a CUDA runtime call failed.
//   status - value returned by the CUDA runtime call.
//   what   - short description of the operation, used in the error message.
static void checkCudaStatus(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Entry point: builds two N-element input vectors on the host, adds them on
// the device with VecAdd, prints the N results, then waits for one character
// on stdin before exiting.
// Returns 0 on success; exits with EXIT_FAILURE if an allocation or a CUDA
// call fails.
int main()
{
    char c;
    int i;
    size_t size = N * sizeof(float);

    // Allocate the vectors in host memory.
    float* h_A = (float*)malloc(size);
    float* h_B = (float*)malloc(size);
    float* h_C = (float*)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)size);
        free(h_A);
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // Give the inputs defined values; copying freshly malloc'ed buffers to
    // the device would make the printed sums indeterminate.
    for (i = 0; i < N; i++)
    {
        h_A[i] = (float)i;
        h_B[i] = 2.0f * (float)i;
    }

    // Allocate the vectors in device memory.
    float* d_A = NULL;
    float* d_B = NULL;
    float* d_C = NULL;
    checkCudaStatus(cudaMalloc((void**)&d_A, size), "cudaMalloc(d_A)");
    checkCudaStatus(cudaMalloc((void**)&d_B, size), "cudaMalloc(d_B)");
    checkCudaStatus(cudaMalloc((void**)&d_C, size), "cudaMalloc(d_C)");

    // Copy the input vectors from host memory to device memory.
    checkCudaStatus(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "copy of A to device");
    checkCudaStatus(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "copy of B to device");

    // Launch the kernel with enough blocks to cover all N elements (ceil-div).
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C);
    // A kernel launch returns no status itself; query it explicitly.
    checkCudaStatus(cudaGetLastError(), "VecAdd launch");

    // Copy the result from device memory back to host memory.
    checkCudaStatus(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "copy of C to host");

    // Release device memory.
    checkCudaStatus(cudaFree(d_A), "cudaFree(d_A)");
    checkCudaStatus(cudaFree(d_B), "cudaFree(d_B)");
    checkCudaStatus(cudaFree(d_C), "cudaFree(d_C)");

    // Print the summed vector.
    for (i = 0; i < N; i++)
    {
        printf("%5.2f", h_C[i]);
    }

    // Wait for one character of input so the output stays visible.
    scanf("%c", &c);

    // Release host memory.
    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}