CPU实现
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<time.h>
/* Element-wise vector addition on the host: C[i] = A[i] + B[i] for i in [0, N).
 * Also echoes each (sum, addend, addend) triple to stdout as it goes. */
void sumArraysOnHost(float *A, float *B, float *C, const int N){
    int i = 0;
    while (i < N) {
        float sum = A[i] + B[i];
        C[i] = sum;
        printf("%f \t %f \t %f \n", C[i], A[i], B[i]);
        ++i;
    }
}
/* Fill ip[0..size-1] with pseudo-random floats in [0.0, 25.5].
 *
 * Bug fix: the original called srand(time(...)) on EVERY invocation.
 * Two back-to-back calls within the same second therefore reused the
 * same seed and produced identical arrays (h_A == h_B in main).
 * Seed the PRNG exactly once per process instead. */
void initialData(float *ip, int size){
    static int seeded = 0;   /* first-call flag: seed only once */
    if (!seeded) {
        srand((unsigned int) time(NULL));
        seeded = 1;
    }
    for (int i = 0; i < size; i++) {
        /* low byte of rand() -> 0..255, scaled down to 0.0..25.5 */
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}
/* Driver: allocate three host vectors, fill A and B with random data,
 * compute C = A + B on the CPU, then release the buffers.
 * Returns 0 on success, 1 if any host allocation fails.
 *
 * Bug fix: the original dereferenced the malloc results without
 * checking for NULL. */
int main(int argc, char **argv){
    int nElem = 1024;                       /* number of float elements */
    size_t nBytes = nElem * sizeof(float);

    float *h_A = (float *)malloc(nBytes);
    float *h_B = (float *)malloc(nBytes);
    float *h_C = (float *)malloc(nBytes);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "host memory allocation failed\n");
        free(h_A);   /* free(NULL) is a no-op, so this is safe */
        free(h_B);
        free(h_C);
        return 1;
    }

    initialData(h_A, nElem);
    initialData(h_B, nElem);
    sumArraysOnHost(h_A, h_B, h_C, nElem);

    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
编译
nvcc -Xcompiler -std=c99 sumArraysOnHost.c -o sum
-Xcompiler
用于指定命令行选项是指向C编译器还是预处理器
std=c99
C程序按照C99标准编写的
执行
./sum
GPU实现
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* Check the cudaError_t returned by a CUDA runtime call; on failure,
 * print the failing file/line plus the CUDA error string and exit.
 *
 * Fix: wrapped in do { } while (0) so the macro behaves as a single
 * statement — the original bare { } block breaks when used as
 * `if (cond) CHECK(call); else ...` (dangling semicolon). */
#define CHECK(call)                                                         \
do {                                                                        \
    const cudaError_t error = (call);                                       \
    if (error != cudaSuccess) {                                             \
        printf("Error: %s:%d, ", __FILE__, __LINE__);                       \
        printf("code:%d, reason:%s\n", error, cudaGetErrorString(error));   \
        exit(1);                                                            \
    }                                                                       \
} while (0)
/* Compare the host-computed and GPU-computed result arrays element-wise.
 * Prints a diagnostic for the first element whose absolute difference
 * exceeds epsilon, otherwise prints "Arrays match.".
 *
 * Bug fix: the original used integer abs(), which truncates the double
 * difference toward zero — any mismatch smaller than 1.0 was silently
 * accepted. Use floating-point fabs() instead. */
void checkResult(float *hostRef, float *gpuRef, const int N){
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++) {
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon) {
            match = 0;
            printf("Arrays do not match:\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("Arrays match.\n\n");
}
/* Fill ip[0..size-1] with pseudo-random floats in [0.0, 25.5].
 *
 * Bug fix: the original reseeded the PRNG with time() on every call,
 * so main's two calls (same second) produced h_A identical to h_B and
 * the "result check" degenerated to comparing 2*A with 2*A.
 * Seed exactly once per process instead. */
void initialData(float *ip, int size){
    static int seeded = 0;   /* seed the PRNG only on the first call */
    if (!seeded) {
        srand((unsigned) time(NULL));
        seeded = 1;
    }
    for (int i = 0; i < size; i++) {
        /* rand() & 0xFF -> 0..255, scaled to 0.0..25.5 */
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}
/* Reference CPU implementation of the vector add:
 * C[i] = A[i] + B[i] for every i in [0, N). */
void sumArraysOnHost(float *A, float *B, float *C, const int N){
    const float *pa = A;
    const float *pb = B;
    float *pc = C;
    const float *end = C + N;
    while (pc < end) {
        *pc++ = *pa++ + *pb++;
    }
}
/* Vector-add kernel: one thread per element, C[i] = A[i] + B[i].
 * Expects a 1-D grid whose total thread count equals the array length
 * exactly (there is no N parameter to bounds-check against, so the
 * launch configuration in main must cover the data with no tail).
 *
 * Bug fix: the original computed i from threadIdx.x alone, so with any
 * grid of more than one block every block rewrote elements
 * 0..blockDim.x-1 instead of its own slice. Use the global index. */
__global__ void sumArraysOnGPU(float *A, float *B, float *C){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    C[i] = A[i] + B[i];
}
/* Host driver: allocate host and device buffers, run the vector add on
 * the GPU with one thread per element, validate against a CPU reference,
 * then release all resources.
 *
 * Fix: the CHECK macro defined above was never used — every CUDA runtime
 * call is now error-checked, and kernel launch-configuration errors are
 * surfaced with cudaGetLastError() (launches themselves return nothing). */
int main(int argc, char **argv){
    printf("%s Starting... \n", argv[0]);

    /* set up device */
    int dev = 0;
    CHECK(cudaSetDevice(dev));

    /* set up data size of vectors */
    int nElem = 32;
    printf("Vector size %d\n", nElem);

    /* malloc host memory */
    size_t nBytes = nElem * sizeof(float);
    float *h_A = (float *)malloc(nBytes);
    float *h_B = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef = (float *)malloc(nBytes);

    /* initialize data at host side */
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    /* malloc device global memory */
    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float **)&d_A, nBytes));
    CHECK(cudaMalloc((float **)&d_B, nBytes));
    CHECK(cudaMalloc((float **)&d_C, nBytes));

    /* transfer input vectors from host to device */
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));

    /* invoke kernel at host side: one block of nElem threads */
    dim3 block(nElem);
    dim3 grid(nElem / block.x);
    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C);
    CHECK(cudaGetLastError());   /* catch bad launch configurations */
    printf("Execution configuration <<<%d, %d>>>\n", grid.x, block.x);

    /* copy kernel result back to host side
     * (blocking cudaMemcpy also synchronizes with the kernel) */
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    /* add vector at host side for result checks */
    sumArraysOnHost(h_A, h_B, hostRef, nElem);

    /* check device results */
    checkResult(hostRef, gpuRef, nElem);

    /* free device global memory */
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));

    /* free host memory */
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);
    return 0;
}
编译
nvcc sumArraysOnGPU.cu -o addvector
执行
./addvector
代码详解
内存管理
cudaMalloc(void** devPtr, size_t size)
- 用于执行GPU内存分配
- 该函数负责向设备分配一定字节的线性内存,并以devPtr的形式返回指向所分配内存的指针
cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind)
- 函数负责主机和设备之间的数据传输
- 函数从src指向的源存储区复制一定数量的字节到dst指向的目标存储区,复制方向由kind指定
- 这个函数以同步方式执行,函数返回以及传输操作完成之前主机应用程序是阻塞的.
- 除内核启动外,每个CUDA调用都会返回一个错误枚举类型
cudaError_t
- kind种类:
  - cudaMemcpyHostToHost
  - cudaMemcpyHostToDevice
  - cudaMemcpyDeviceToHost
  - cudaMemcpyDeviceToDevice