首先笔者的GPU显卡是Nvidia的GTX1060 6g,安装好显卡驱动和CUDA软件包之后就可以写并行程序了,编译可执行文件的命令为
nvcc -o helloWorld helloWorld.cu -lcurand
其中-o helloWorld表示生成可执行文件helloWorld,helloWorld.cu是编写的程序文件,-lcurand表示动态链接库libcurand.so,其中需要保证链接库路径已经放到系统路径搜索目录里面
随机数生成
随机数可以在主机CPU端生成,也可以在GPU设备端生成,如果涉及大量的随机数运算,需要将随机数放到GPU设备端计算,为了避免传输带宽导致的性能低下,一般在GPU设备端直接生成随机数.使用CUDA软件包的一个随机数生成库——CuRand库实现这个功能,为此在程序文件中需要导入头文件,并确保链接动态库libcurand.so
#include<curand_kernel.h>
在GPU设备端生成随机数之后,将数据传输到CPU主机端,就可以将其打印出来查看随机数的具体内容.需要注意,GPU设备端的数据指针虽然在代码里和CPU主机端的数据指针没有区别,但实际上在主机端是不可以对其进行任何读写操作的,而编译器并不会检查这个错误,所以编写涉及这方面的程序时需要特别注意.因此我们查看随机数内容时,只能在CPU主机端创建一个同样大小的数组,将GPU设备端的数据拷贝到CPU主机端之后,才能进行打印查看的操作.以下是代码
#include<stdio.h>
#include<stdlib.h>
#include<curand_kernel.h>
//#include"cuda_helper.h"
#include"helper_cuda.h"
#include"cuda.h"
#include"book.h"
// Check a cuRAND API call: on failure report the status code, reset the
// device, and terminate -- the original macro kept executing after an error,
// which only cascades into more failures. The do { } while (0) wrapper makes
// the macro expand to a single statement, safe inside unbraced if/else.
#define CURAND_CALL(x) do {                                                    \
    const curandStatus_t curand_status_ = (x);                                 \
    if (curand_status_ != CURAND_STATUS_SUCCESS) {                             \
        printf("\nCuRand Error:(err_num = %d) \n", curand_status_);            \
        cudaDeviceReset();                                                     \
        exit(EXIT_FAILURE);                                                    \
    }                                                                          \
} while (0)
// Dump a host-side float array to stdout, eight values per row, each value
// prefixed with its index. The pointer must refer to host-accessible memory.
__host__ void print_array(const float * __restrict__ const data, const int num_elem) {
    for (int idx = 0; idx < num_elem; ++idx) {
        // Start a new row every eighth element.
        if (idx % 8 == 0) {
            printf("\n");
        }
        printf("%2d: %f ", idx, data[idx]);
    }
    printf("\n");
}
// Generate num_elem uniformly distributed random floats on the GPU with
// cuRAND, copy them back to the host, and print them.
__host__ int main(int argc, char *argv[]) {
    const int num_elem = 20000;
    const size_t size_in_bytes = (num_elem * sizeof(float));
    curandGenerator_t rand_generator_device;
    // curandSetPseudoRandomGeneratorSeed takes an unsigned long long.
    const unsigned long long seed = 987654321ULL;
    const curandRngType_t generator_type = CURAND_RNG_PSEUDO_DEFAULT;

    // Allocate memory on the device.
    // BUG FIX: the original used cudaMallocHost here, which allocates pinned
    // *host* memory, yet the pointer is used as a device buffer (written by
    // curandGenerateUniform, copied with cudaMemcpyDeviceToHost, released
    // with cudaFree). cudaMalloc makes allocation, usage, and free agree.
    float *device_ptr;
    HANDLE_ERROR(cudaMalloc((void **)&device_ptr, size_in_bytes));

    // Allocate pinned memory on the host to receive the device copy.
    float *host_ptr;
    HANDLE_ERROR(cudaMallocHost((void **)&host_ptr, size_in_bytes));

    // Print the library version number.
    int version;
    CURAND_CALL(curandGetVersion(&version));
    printf("\nUsing CuRand Version: %d and generator: CURAND_RNG_PSEUDO_DEFAULT", version);

    // Register the generator, then seed it so runs are reproducible.
    CURAND_CALL(curandCreateGenerator(&rand_generator_device, generator_type));
    CURAND_CALL(curandSetPseudoRandomGeneratorSeed(rand_generator_device, seed));

    // Create the set of random numbers directly on the device.
    CURAND_CALL(curandGenerateUniform(rand_generator_device, device_ptr, num_elem));

    // Copy the device-generated data to the host; a device pointer must not
    // be dereferenced on the host, so this copy is required before printing.
    HANDLE_ERROR(cudaMemcpy(host_ptr, device_ptr, size_in_bytes, cudaMemcpyDeviceToHost));
    printf("\n\nRandom numbers from GPU");
    print_array(host_ptr, num_elem);

    // Free device and host resources.
    CURAND_CALL(curandDestroyGenerator(rand_generator_device));
    HANDLE_ERROR(cudaFree(device_ptr));
    HANDLE_ERROR(cudaFreeHost(host_ptr));
    cudaDeviceReset();
    return 0;
}
在以上代码中可能会涉及到一个自定义的头文件cuda_helper.h,但笔者没有找到该文件的出处;不过在CUDA软件包的samples目录中可以找到一个helper_cuda.h,就试着用这个文件来代替,但它可能并不能完全代替.随机数打印出来如下
数组求和
对一个一维数组进行求和,在主机端和设备端都创建数组的内存空间,将数据从主机端拷贝到设备端,然后对数组进行并行归约计算,并行归约时采用共享内存保存中间结果,最后再从共享内存的中间结果归约到最终结果。这里有两种方式,第一种是将共享内存上的中间结果传回主机端计算数组总和,第二种是直接在设备端计算数组总和,但是在设备端计算总和时,是多线程对同一个数据进行操作,为了避免计算错误,采用内存锁机制,使多线程按顺序依次对一个数据进行操作。为了估计运算性能,使用CUDA自带的计时函数统计运算时间。
#include<iostream>
#include"book.h"
#include"lock.h"
// Smaller of two values; used below to cap the grid size.
#define imin(a,b) (a<b?a:b)
// Number of elements to sum: 1,024,000,000 floats (~4 GB of device memory).
const int N = 1000000 * 1024;
// Threads per block; must be a power of two for the tree reduction in sum().
const int threadsPerBlock = 1024;
// Enough blocks to cover N, capped at 1024; the striding while-loop in the
// kernel picks up any elements beyond blocksPerGrid * threadsPerBlock.
const int blocksPerGrid =
imin( 1024, (N+threadsPerBlock-1) / threadsPerBlock );
// Grid-wide sum of the N-element device array a[] into *c.
// Expectations: 1-D launch with blockDim.x == threadsPerBlock, which must be
// a power of two (the halving reduction below relies on it). *c must be
// zero-initialized by the caller, because each block ADDS its partial sum.
__global__ void sum( Lock lock, float *a, float *c ) {
// One partial sum per thread of this block.
__shared__ float cache[threadsPerBlock];
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int cacheIndex = threadIdx.x;
// Each thread strides over the array (step = total number of threads in the
// grid) and accumulates a private partial sum in a register.
float temp = 0;
while (tid < N) {
temp += a[tid];
tid += blockDim.x * gridDim.x;
}
// set the cache values
cache[cacheIndex] = temp;
// synchronize threads in this block
__syncthreads();
// for reductions, threadsPerBlock must be a power of 2
// because of the following code
int i = blockDim.x/2;
while (i != 0) {
if (cacheIndex < i)
cache[cacheIndex] += cache[cacheIndex + i];
// Barrier sits OUTSIDE the divergent if, so every thread reaches it.
__syncthreads();
i /= 2;
}
// Thread 0 publishes this block's result. The spin-lock serializes the
// read-modify-write of *c across blocks (a plain += is not atomic).
if (cacheIndex == 0){
lock.lock();
*c += cache[0];
lock.unlock();
}
}
// Sum an N-element array of 0.0001f values on the GPU using the lock-based
// reduction kernel, timing the device work with CUDA events.
int main( void ) {
    float *a, c = 0;
    float *dev_a, *dev_c;
    Lock lock;

    // Events used to time the GPU work.
    cudaEvent_t start, stop;
    float elapsedTime;
    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );

    // Allocate memory on the CPU side; ~4 GB, so check the result instead of
    // dereferencing a NULL pointer in the fill loop below.
    a = (float*)malloc( N*sizeof(float) );
    if (a == NULL) {
        printf( "host malloc of %zu bytes failed\n", (size_t)N * sizeof(float) );
        return 1;
    }

    // Allocate the memory on the GPU.
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a, N*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_c, sizeof(float) ) );

    // BUG FIX: the kernel accumulates with "*c += cache[0]", so dev_c must
    // start at zero -- the original never initialized it, making the result
    // depend on whatever garbage the allocation happened to contain.
    HANDLE_ERROR( cudaMemset( dev_c, 0, sizeof(float) ) );

    // Fill in the host memory with data.
    for (int i = 0; i < N; i++) {
        a[i] = .0001;
    }

    // Copy the array 'a' to the GPU.
    HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
                              cudaMemcpyHostToDevice ) );

    // Record the start time, launch, and catch launch-configuration errors
    // (a kernel launch itself returns no status).
    HANDLE_ERROR( cudaEventRecord( start, 0 ) );
    sum<<<blocksPerGrid,threadsPerBlock>>>( lock, dev_a, dev_c );
    HANDLE_ERROR( cudaGetLastError() );

    // Copy the result back; this blocking copy also synchronizes with the
    // kernel, so the stop event below covers kernel + this 4-byte copy.
    HANDLE_ERROR( cudaMemcpy( &c, dev_c,
                              sizeof(float),
                              cudaMemcpyDeviceToHost ) );

    // Record the stop time and report the elapsed milliseconds.
    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime, start, stop ) );
    printf( "Time to generate: %3.1f ms\n", elapsedTime );
    HANDLE_ERROR( cudaEventDestroy( start ) );
    HANDLE_ERROR( cudaEventDestroy( stop ) );

    printf( "Sum of the value array:%lf\n",c );

    // Free memory on the GPU side, then the CPU side.
    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaFree( dev_c ) );
    free( a );
    return 0;
}
其中涉及到内存锁的功能,需要用到头文件lock.h
#ifndef __LOCK_H__
#define __LOCK_H__
// Spin-lock backed by a single int flag in device global memory:
// 0 = unlocked, 1 = locked. The host side allocates and clears the flag;
// the device side acquires/releases it with atomics. Used by the sum kernel
// to serialize the cross-block "*c +=" update.
//
// NOTE(review): there is no copy constructor, so any copy (e.g. the
// pass-by-value kernel argument) shares the same mutex allocation while the
// destructor frees it -- confirm object lifetimes before reusing this type
// outside the pattern shown in this post.
struct Lock {
// Device pointer to the lock flag.
int *mutex;
// Allocate the flag on the device and clear it to the unlocked state.
Lock( void ) {
HANDLE_ERROR( cudaMalloc( (void**)&mutex,
sizeof(int) ) );
HANDLE_ERROR( cudaMemset( mutex, 0, sizeof(int) ) );
}
// Release the device allocation.
~Lock( void ) {
cudaFree( mutex );
}
// Busy-wait until atomicCAS swaps the flag 0 -> 1, i.e. we own the lock.
__device__ void lock( void ) {
while( atomicCAS( mutex, 0, 1 ) != 0 );
}
// Release the lock by atomically writing 0 back.
__device__ void unlock( void ) {
atomicExch( mutex, 0 );
}
};
#endif