#include<math.h>#include<stdio.h>#include<stdlib.h>constdouble EPSILON =1.0e-15;constdouble a =1.23;constdouble b =2.34;constdouble c =3.57;/*
同样长度的一维数组之和
*/voidadd(constdouble*x,constdouble*y,double*z,constint N);voidcheck(constdouble*z,constint N);voidadd(constdouble*x,constdouble*y,double*z,constint N){for(int n =0; n < N;++n){
z[n]= x[n]+ y[n];}}voidcheck(constdouble*z,constint N){bool has_error =false;for(int n =0; n < N;++n){if(fabs(z[n]- c)> EPSILON)
has_error =true;}printf("%s\n", has_error ?"Has error!":"No error!");}intmain(){constint N =10000;constint M =sizeof(double)* N;double*x =(double*)malloc(M);double*y =(double*)malloc(M);double*z =(double*)malloc(M);for(int n =0; n < N;++n){
x[n]= a;
y[n]= b;}add(x, y, z, N);check(z, N);free(x);free(y);free(z);return0;}
// The CUDA version of the program above (这是CUDA版):
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// Aborts with file/line and the CUDA error string when a runtime call fails,
// so that a failure is reported where it happens instead of surfacing later
// as a wrong result.
#define CUDA_CHECK(call)                                                   \
    do                                                                     \
    {                                                                      \
        const cudaError_t err_ = (call);                                   \
        if (err_ != cudaSuccess)                                           \
        {                                                                  \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(err_));                   \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Largest absolute deviation from `c` that check() still accepts. Kept at the
// same tight value as the host-only version so real errors are not masked.
const double EPSILON = 1.0e-15;
// `a` and `b` fill the two input arrays; `c` is the value check() expects
// in every element of the result.
const double a = 1.23;
const double b = 2.34;
const double c = 3.57;

/*
 * Element-wise sum of two one-dimensional arrays of the same length,
 * computed on the GPU.
 */
void __global__ add(const double *x, const double *y, double *z, const int N);
void check(const double *z, const int N);

/*
 * Kernel: each thread computes at most one element, z[n] = x[n] + y[n].
 *
 * Expects a one-dimensional launch with at least N threads in total.
 * Threads whose global index is >= N do nothing, so N does not have to be
 * a multiple of the block size.
 *
 * x, y : device input arrays holding at least N elements each
 * z    : device output array holding at least N elements
 * N    : number of elements to process
 */
void __global__ add(const double *x, const double *y, double *z, const int N)
{
    const int n = blockDim.x * blockIdx.x + threadIdx.x;
    if (n < N)
    {
        z[n] = x[n] + y[n];
    }
}

/*
 * Compares every one of the N elements of the host array z against the
 * constant `c` and prints "Has error!" if any of them differs by more than
 * EPSILON, otherwise "No error!".
 */
void check(const double *z, const int N)
{
    bool has_error = false;
    for (int n = 0; n < N; ++n)
    {
        if (fabs(z[n] - c) > EPSILON)
        {
            // Debugging aid: printf("%f\n", z[n]);
            has_error = true;
        }
    }
    printf("%s\n", has_error ? "Has error!" : "No error!");
}

/*
 * Fills two host arrays with `a` and `b`, copies them to the device, adds
 * them there, copies the result back and verifies it on the host.
 * Returns 0 on success; exits with EXIT_FAILURE on allocation or CUDA errors.
 */
int main()
{
    const int N = 100000;
    // Byte count is kept in size_t so it cannot overflow int for larger N.
    const size_t M = sizeof(double) * N;

    double *h_x = (double *)malloc(M);
    double *h_y = (double *)malloc(M);
    double *h_z = (double *)malloc(M);
    if (h_x == NULL || h_y == NULL || h_z == NULL)
    {
        fprintf(stderr, "Failed to allocate host memory\n");
        // free(NULL) is a no-op, so releasing all three is always safe.
        free(h_x);
        free(h_y);
        free(h_z);
        return EXIT_FAILURE;
    }

    for (int n = 0; n < N; ++n)
    {
        h_x[n] = a;
        h_y[n] = b;
    }

    double *d_x, *d_y, *d_z;
    CUDA_CHECK(cudaMalloc((void **)&d_x, M));
    CUDA_CHECK(cudaMalloc((void **)&d_y, M));
    CUDA_CHECK(cudaMalloc((void **)&d_z, M));
    CUDA_CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_y, h_y, M, cudaMemcpyHostToDevice));

    const int block_size = 160;
    // Round up so the tail is covered when N is not a multiple of block_size.
    const int grid_size = (N + block_size - 1) / block_size;
    printf("%d\t%d\n", block_size, grid_size);

    // Answer to the original note ("why does it fail once block_size and
    // grid_size get larger?"): the launch used to be written as
    // add<<<block_size, grid_size>>>, i.e. with the two arguments swapped.
    // The first execution-configuration argument is the number of blocks and
    // the second is the number of threads per block, so the value intended as
    // the grid size was used as the block size. That only works while it stays
    // within the per-block thread limit (1024 on current NVIDIA GPUs);
    // beyond it the launch fails and z is never written.
    add<<<grid_size, block_size>>>(d_x, d_y, d_z, N);
    // A kernel launch returns nothing; an invalid configuration is only
    // visible through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // This blocking copy also waits for the kernel to finish and reports any
    // error raised while it was running.
    CUDA_CHECK(cudaMemcpy(h_z, d_z, M, cudaMemcpyDeviceToHost));
    check(h_z, N);

    free(h_x);
    free(h_y);
    free(h_z);
    CUDA_CHECK(cudaFree(d_x));
    CUDA_CHECK(cudaFree(d_y));
    CUDA_CHECK(cudaFree(d_z));
    return 0;
}