使用CUDA实现两个数组的相加,熟悉CUDA相关函数的使用。
#include <cuda_runtime.h>
#include <stdio.h>

// Number of threads per block used by CUDA_Add when launching addKernel.
static const int kThreadsPerBlock = 256;

/**
 * Element-wise integer addition on the device: c[i] = a[i] + b[i] for i in [0, n).
 *
 * Launch layout: 1-D grid of 1-D blocks with gridDim.x * blockDim.x >= n.
 * Threads whose global index is >= n do nothing, so the grid does not have
 * to divide n evenly. No shared memory is used.
 *
 * @param c  output array (device pointer), at least n elements
 * @param a  first input array (device pointer), at least n elements
 * @param b  second input array (device pointer), at least n elements
 * @param n  number of elements to process; values <= 0 make the kernel a no-op
 */
__global__ void addKernel( int* c, const int* a, const int* b, int n )
{
    // Compute the index in size_t so blockIdx.x * blockDim.x cannot wrap for large n.
    const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    if( n > 0 && i < (size_t)n )
    {
        c[i] = a[i] + b[i];
    }
}

/**
 * Reports a failed CUDA call on stderr.
 *
 * Prints `message` followed by the runtime's description of `status` and a
 * newline when `status` is not cudaSuccess.
 *
 * @return true when `status` signals an error, false otherwise
 */
static bool reportIfFailed( cudaError_t status, const char* message )
{
    if( status == cudaSuccess )
    {
        return false;
    }
    fprintf( stderr, "%s (%s)\n", message, cudaGetErrorString( status ) );
    return true;
}

/**
 * Adds two host arrays on GPU device 0 and writes the sums back to the host.
 *
 * Steps: select device 0, allocate three device buffers, copy `a` and `b` to
 * the device, launch addKernel, wait for it to finish, copy the result into
 * `out`, and release the device buffers (also on every error path).
 *
 * @param a     first input array in host memory, `size` elements
 * @param b     second input array in host memory, `size` elements
 * @param out   output array in host memory, `size` elements
 * @param size  number of elements; 0 succeeds without touching the arrays,
 *              a negative value yields cudaErrorInvalidValue
 * @return cudaSuccess, or the error code of the first CUDA call that failed
 */
cudaError_t CUDA_Add( const int* a, const int* b, int* out, int size )
{
    // Start from null so the cudaFree calls below are harmless no-ops for
    // buffers that were never allocated.
    int* dev_a = NULL;
    int* dev_b = NULL;
    int* dev_c = NULL;

    // A negative count would wrap to a huge byte size in the allocations below.
    if( size < 0 )
    {
        reportIfFailed( cudaErrorInvalidValue, "调用CUDA_Add()函数时数组长度不能为负数!" );
        return cudaErrorInvalidValue;
    }

    // 1. Select the device.
    cudaError_t cudaStatus = cudaSetDevice( 0 );
    if( reportIfFailed( cudaStatus, "调用cudaSetDevice()函数失败!" ) )
    {
        return cudaStatus;
    }

    // Nothing to compute; a launch with zero blocks would be rejected.
    if( size == 0 )
    {
        return cudaSuccess;
    }

    const size_t bytes = (size_t)size * sizeof( int );

    // Single-pass loop: `break` jumps to the shared cleanup code after it.
    do
    {
        // 2. Allocate device memory.
        cudaStatus = cudaMalloc( (void**)&dev_a, bytes );
        if( reportIfFailed( cudaStatus, "调用cudaMalloc()函数初始化显卡中a数组时失败!" ) )
        {
            break;
        }

        cudaStatus = cudaMalloc( (void**)&dev_b, bytes );
        if( reportIfFailed( cudaStatus, "调用cudaMalloc()函数初始化显卡中b数组时失败!" ) )
        {
            break;
        }

        cudaStatus = cudaMalloc( (void**)&dev_c, bytes );
        if( reportIfFailed( cudaStatus, "调用cudaMalloc()函数初始化显卡中c数组时失败!" ) )
        {
            break;
        }

        // 3. Copy the host inputs into device memory.
        cudaStatus = cudaMemcpy( dev_a, a, bytes, cudaMemcpyHostToDevice );
        if( reportIfFailed( cudaStatus, "调用cudaMemcpy()函数初始化宿主程序数据a数组到显卡时失败!" ) )
        {
            break;
        }

        cudaStatus = cudaMemcpy( dev_b, b, bytes, cudaMemcpyHostToDevice );
        if( reportIfFailed( cudaStatus, "调用cudaMemcpy()函数初始化宿主程序数据b数组到显卡时失败!" ) )
        {
            break;
        }

        // 4. Launch the kernel with enough blocks to cover every element.
        //    Written as (size - 1) / T + 1 so the ceil-division cannot overflow int.
        const int blocks = ( size - 1 ) / kThreadsPerBlock + 1;
        addKernel<<<blocks, kThreadsPerBlock>>>( dev_c, dev_a, dev_b, size );

        // 5. A launch returns no status itself; query it explicitly.
        cudaStatus = cudaGetLastError( );
        if( reportIfFailed( cudaStatus, "显卡执行程序时失败!" ) )
        {
            break;
        }

        // 6. Wait for the kernel to finish; execution errors surface here.
        cudaStatus = cudaDeviceSynchronize( );
        if( reportIfFailed( cudaStatus, "在与内核同步的过程中发生问题!" ) )
        {
            break;
        }

        // 7. Copy the result back to the host.
        cudaStatus = cudaMemcpy( out, dev_c, bytes, cudaMemcpyDeviceToHost );
        if( reportIfFailed( cudaStatus, "在将结果数据从显卡复制到宿主程序中失败!" ) )
        {
            break;
        }
    } while( false );

    // Release device memory on success and on every failure path.
    cudaFree( dev_c );
    cudaFree( dev_a );
    cudaFree( dev_b );
    return cudaStatus;
}

/**
 * Demo entry point: lists the CUDA devices, adds two 5-element arrays on the
 * GPU through CUDA_Add and prints the result.
 *
 * @return 0 on success, 1 when any CUDA call fails
 */
int main( int argc, char** argv )
{
    (void)argc;
    (void)argv;

    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Query the number of CUDA devices.
    int count = 0;
    cudaError_t cudaStatus = cudaGetDeviceCount( &count );
    if( reportIfFailed( cudaStatus, "调用cudaGetDeviceCount()函数失败!" ) )
    {
        return 1;
    }
    printf( "CUDA数==%d\n", count );

    for( int i = 0; i < count; i++ )
    {
        // Query and print the properties of each device.
        cudaDeviceProp prop;
        cudaStatus = cudaGetDeviceProperties( &prop, i );
        if( reportIfFailed( cudaStatus, "调用cudaGetDeviceProperties()函数失败!" ) )
        {
            continue;
        }
        printf( "compute capability: %d.%d\n", prop.major, prop.minor );
        // sharedMemPerBlock is a size_t, so it needs %zu rather than %ld.
        printf( "Shared mem per mp: %zu\n", prop.sharedMemPerBlock );
    }

    // CUDA_Add reports its own failure; do not print c as if it were a result.
    cudaStatus = CUDA_Add( a, b, c, arraySize );
    if( cudaStatus != cudaSuccess )
    {
        cudaDeviceReset( );
        return 1;
    }

    printf( "运算结果是:\nc数组[%d, %d, %d, %d, %d]\n", c[0], c[1], c[2], c[3], c[4] );

    cudaStatus = cudaDeviceReset( );
    if( reportIfFailed( cudaStatus, "调用cudaDeviceReset()函数失败!" ) )
    {
        return 1;
    }
    return 0;
}
编译运行指令:
# nvcc first_cuda.cu -o first_cuda
# ./first_cuda
运行结果:
CUDA数==1
compute capability: 3.5
Shared mem per mp: 49152
运算结果是:
c数组[11, 22, 33, 44, 55]