#include <cuda_runtime.h>
#include <stdio.h>
#include "freshman.h"
// Host reference: element-wise sum of two flat float arrays.
//   a_h, b_h : input arrays, each holding at least `size` elements
//   res_h    : output array, receives a_h[k] + b_h[k] for k in [0, size)
//   size     : number of elements to add; nothing is written when size <= 0
void SumMatrixOnHost(float* a_h, float* b_h, float* res_h, const int &size){
    int k = 0;
    while(k < size){
        const float lhs = a_h[k];
        const float rhs = b_h[k];
        res_h[k] = lhs + rhs;
        ++k;
    }
}
// Host reference: element-wise sum of two matrices stored as ny consecutive
// rows of nx floats each.
//   a_h, b_h : input matrices
//   res_h    : output matrix, same layout as the inputs
//   nx       : number of elements per row
//   ny       : number of rows
void SumMatrixOnHost(float* a_h, float* b_h, float* res_h, const int nx, const int ny){
    // rowBase is the flat offset of the first element of the current row.
    size_t rowBase = 0;
    for(int row = 0; row < ny; ++row){
        for(int col = 0; col < nx; ++col){
            const size_t k = rowBase + col;
            res_h[k] = a_h[k] + b_h[k];
        }
        rowBase += nx;
    }
}
// Kernel: element-wise sum res_d = a_d + b_d of two matrices stored as ny rows
// of nx floats each.
//
// Each thread handles the single element at column ix / row iy, where (ix, iy)
// are its global 2D thread coordinates. Threads whose coordinates fall outside
// the matrix do nothing, so the launch grid only has to cover at least nx
// threads in x and ny threads in y; elements not covered by any thread are
// left unwritten.
//
//   a_d, b_d : input matrices
//   res_d    : output matrix, same layout as the inputs
//   nx       : elements per row
//   ny       : number of rows
__global__ void SumMatrixOnGpu(float* a_d, float* b_d, float* res_d, int nx, int ny){
    // A non-positive dimension means there is nothing to do. Checking it here
    // keeps the unsigned comparisons below from treating a negative value as
    // a huge extent.
    if(nx <= 0 || ny <= 0){
        return;
    }
    const unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
    if(ix < (unsigned int)nx && iy < (unsigned int)ny){
        // Compute the flat index in size_t: iy * nx in 32-bit arithmetic wraps
        // around once the matrix holds 2^32 or more elements.
        const size_t idx = (size_t)iy * (size_t)nx + (size_t)ix;
        res_d[idx] = a_d[idx] + b_d[idx];
    }
}
// Demo driver: adds two nx-by-ny float matrices once on the host and once on
// the GPU, prints the wall-clock time of each, and compares the two results.
// Returns 0 on completion, 1 if host memory cannot be allocated. Any failing
// CUDA call terminates the process through CHECK.
int main(){
    //set up device
    int dev = 0;
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop, dev));
    printf("Using Device : %d: %s \n", dev, deviceprop.name);
    CHECK(cudaSetDevice(dev));

    //set up data size of matrix
    int nx = 1 << 14;
    int ny = 1 << 14;
    int nxy = nx * ny;
    // Byte count kept in size_t: it is already 1 GiB here and would not fit
    // in an int for a matrix only twice as large.
    size_t nBytes = (size_t)nxy * sizeof(float);
    printf("Matrix size : nx %d ny %d \n", nx, ny);

    //malloc host memory
    float *a_h, *b_h, *res_h, *res_gpu;
    a_h = (float*)malloc(nBytes);
    b_h = (float*)malloc(nBytes);
    res_h = (float*)malloc(nBytes);
    res_gpu = (float*)malloc(nBytes);
    // Four buffers of this size can easily exceed available RAM; fail cleanly
    // instead of dereferencing a null pointer below.
    if(a_h == NULL || b_h == NULL || res_h == NULL || res_gpu == NULL){
        fprintf(stderr, "ERROR: host malloc of %zu bytes failed\n", nBytes);
        free(a_h);
        free(b_h);
        free(res_h);
        free(res_gpu);
        return 1;
    }

    //initialize data at host device
    initialData(a_h, nxy);
    initialData(b_h, nxy);
    // Trailing newline added so the progress message is flushed on a
    // line-buffered stdout and does not run into the next message.
    printf("initialize data complete! %f, %f\n", a_h[0], b_h[0]);

    //set 0 at host device for res
    memset(res_gpu, 0, nBytes);
    memset(res_h, 0, nBytes);
    printf("set 0 at host device complete!\n");

    //add matrix at host device
    double istart = cpuSecond();
    SumMatrixOnHost(a_h, b_h, res_h, nx, ny);
    double iElaps = cpuSecond() - istart;
    printf("CPU Exection time : %f \n", iElaps);

    //malloc device memory
    float *a_d, *b_d, *c_d;
    CHECK(cudaMalloc((void**)&(a_d), nBytes));
    CHECK(cudaMalloc((void**)&(b_d), nBytes));
    CHECK(cudaMalloc((void**)&(c_d), nBytes));

    //transfer data from host to device
    CHECK(cudaMemcpy(a_d, a_h, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(b_d, b_h, nBytes, cudaMemcpyHostToDevice));

    //invoke kernel from host
    int dimx = 32;
    int dimy = 32;
    dim3 block(dimx, dimy);
    // Ceiling division so the grid covers the whole matrix even when nx/ny
    // are not multiples of the block dimensions.
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
    istart = cpuSecond();
    SumMatrixOnGpu<<<grid, block>>>(a_d, b_d, c_d, nx, ny);
    // A kernel launch returns no status itself: launch-configuration errors
    // are reported by cudaGetLastError(), execution errors by the sync.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    iElaps = cpuSecond() - istart;
    printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n",
           grid.x, grid.y, block.x, block.y, iElaps);
    CHECK(cudaMemcpy(res_gpu, c_d, nBytes, cudaMemcpyDeviceToHost));

    //check if cpures and gpures is same
    checkResult(res_h, res_gpu, nxy);

    //free device memory
    CHECK(cudaFree(a_d));
    CHECK(cudaFree(b_d));
    CHECK(cudaFree(c_d));

    //free host memory
    free(a_h);
    free(b_h);
    free(res_gpu);
    free(res_h);

    //reset device
    CHECK(cudaDeviceReset());
    return 0;
}
//#include <cuda_runtime.h>
//#include <stdio.h>
#include <math.h>
#include <time.h>
#ifndef FRESHMAN_H
#define FRESHMAN_H
// CHECK(call): evaluates a CUDA runtime call once; if it does not return
// cudaSuccess, prints the source file and line of the CHECK together with the
// numeric error code and its cudaGetErrorString() text, then terminates the
// process with exit(1).
// The body is wrapped in do{...}while(0) so that `CHECK(x);` is a single
// statement and stays correct inside an unbraced if/else. The argument is
// parenthesised so any expression can be passed safely.
#define CHECK(call)\
do{\
    const cudaError_t error=(call);\
    if(error!=cudaSuccess)\
    {\
        printf("ERROR: %s:%d,",__FILE__,__LINE__);\
        printf("code:%d,reason:%s\n",error,cudaGetErrorString(error));\
        exit(1);\
    }\
}while(0)
#ifdef _WIN32
# include <windows.h>
#else
# include <sys/time.h>
#endif
#ifdef _WIN32
// Windows stand-in for POSIX gettimeofday(), built on GetLocalTime().
//   tp  : receives the current time; tv_sec comes from mktime() applied to the
//         local calendar time, tv_usec from the millisecond field
//   tzp : unused
// Always returns 0. Sub-second resolution is limited to milliseconds because
// that is all SYSTEMTIME carries.
int gettimeofday(struct timeval *tp, void *tzp)
{
    SYSTEMTIME now;
    GetLocalTime(&now);

    // Convert SYSTEMTIME into struct tm conventions:
    // years counted from 1900, months counted from 0.
    struct tm parts;
    parts.tm_year  = now.wYear - 1900;
    parts.tm_mon   = now.wMonth - 1;
    parts.tm_mday  = now.wDay;
    parts.tm_hour  = now.wHour;
    parts.tm_min   = now.wMinute;
    parts.tm_sec   = now.wSecond;
    parts.tm_isdst = -1;   // negative: let mktime() decide whether DST applies

    const time_t seconds = mktime(&parts);
    tp->tv_sec  = seconds;
    tp->tv_usec = now.wMilliseconds * 1000;
    return 0;
}
#endif
// Returns the current time reported by gettimeofday() as a double number of
// seconds, with the microsecond field folded in as the fractional part.
// Callers take the difference of two readings to measure elapsed time.
double cpuSecond()
{
    struct timeval now;
    gettimeofday(&now, NULL);
    const double wholeSeconds = (double)now.tv_sec;
    const double fraction = (double)now.tv_usec * 1e-6;
    return wholeSeconds + fraction;
}
// Fills ip[0..size) with pseudo-random floats in [0, 65.535]: the low 16 bits
// of rand() divided by 1000.
//   ip   : destination array with room for at least `size` floats
//   size : number of elements to fill; nothing is written when size <= 0
// The generator is seeded from the clock only on the first call. Reseeding on
// every call restarts the same rand() sequence whenever two calls fall within
// the same second, which made back-to-back buffers come out identical.
void initialData(float* ip,int size)
{
    static int seeded = 0;
    if(!seeded)
    {
        srand((unsigned)time(NULL));
        seeded = 1;
    }
    for(int i=0;i<size;i++)
    {
        ip[i]=(float)(rand()&0xffff)/1000.0f;
    }
}
// Fills ip[0..size) with pseudo-random integers in [0, 255]: the low 8 bits
// of rand().
//   ip   : destination array with room for at least `size` ints
//   size : number of elements to fill; nothing is written when size <= 0
// The generator is seeded from the clock only on the first call. Reseeding on
// every call restarts the same rand() sequence whenever two calls fall within
// the same second, which made back-to-back buffers come out identical.
void initialData_int(int* ip, int size)
{
    static int seeded = 0;
    if (!seeded)
    {
        srand((unsigned)time(NULL));
        seeded = 1;
    }
    for (int i = 0; i<size; i++)
    {
        ip[i] = int(rand()&0xff);
    }
}
// Prints a matrix stored as ny consecutive rows of nx floats to stdout.
//   C  : pointer to the first element
//   nx : elements per row
//   ny : number of rows
// Output is a "Matrix<ny,nx>:" header followed directly by the rows, one per
// line, each value formatted with "%6f " (the first row shares the header's
// line because the header has no newline).
void printMatrix(float * C,const int nx,const int ny)
{
    float *row=C;   // start of the row currently being printed
    printf("Matrix<%d,%d>:",ny,nx);
    for(int i=0;i<ny;i++)
    {
        for(int j=0;j<nx;j++)
        {
            // Read through the row cursor, not C: indexing C here printed
            // the first row ny times.
            printf("%6f ",row[j]);
        }
        row+=nx;
        printf("\n");
    }
}
// Selects CUDA device `devNum` for subsequent runtime calls and prints its
// index and name. Both runtime calls go through CHECK, so a failure
// terminates the process.
void initDevice(int devNum)
{
    cudaDeviceProp props;
    const int dev = devNum;

    // Query first so the device name can be reported before it is selected.
    CHECK(cudaGetDeviceProperties(&props, dev));
    printf("Using device %d: %s\n", dev, props.name);

    CHECK(cudaSetDevice(dev));
}
// Compares two float arrays element by element and reports the outcome on
// stdout.
//   hostRef : reference values
//   gpuRef  : values to verify
//   N       : number of elements to compare
// Prints the first index whose absolute difference exceeds 1e-8 together with
// both values and returns immediately; prints a success message if no element
// differs by more than that.
void checkResult(float * hostRef,float * gpuRef,const int N)
{
    double epsilon=1.0E-8;
    for(int i=0;i<N;i++)
    {
        // fabsf, not abs: an unqualified abs() can resolve to the integer
        // overload depending on which headers are visible, which truncates
        // any difference below 1.0 to zero and hides the mismatch.
        if(fabsf(hostRef[i]-gpuRef[i])>epsilon)
        {
            printf("Results don\'t match!\n");
            printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n",hostRef[i],i,gpuRef[i],i);
            return;
        }
    }
    printf("Check result success!\n");
}
#endif//FRESHMAN_H
CMakeLists.txt文件的编写
# Build script for the CUDA_Freshman example: one executable, `freshman`,
# built from the sumMatrix2D source.

# Refuse to configure with a CMake older than 3.10.
cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
# Enable the C++, C and CUDA languages for this project.
Project(CUDA_Freshman CXX C CUDA)
# NOTE(review): the disabled line below names a target CUDA_Freshman, while the
# only target defined in this script is `freshman` — confirm before re-enabling.
#set_target_properties(CUDA_Freshman PROPERTIES CUDA_ARCHITECTURES "52;60;61")
# Flags handed to the CUDA compiler. This replaces any CMAKE_CUDA_FLAGS value
# supplied from outside (command line / environment).
# NOTE(review): per the nvcc documentation -G generates device debug info and
# turns off device-code optimisation, so kernel timings from this build are
# debug timings despite -O3 — confirm this is intended.
# NOTE(review): compute_35 is no longer accepted by recent CUDA toolkits —
# verify against the toolkit version in use.
set(CMAKE_CUDA_FLAGS "-arch=compute_35 -g -G -O3")
# Add ./include to the header search path of all targets defined below.
include_directories("./include")
# NOTE(review): the source path has no file extension, so this relies on CMake
# guessing it (deprecated by policy CMP0115 in newer CMake) — confirm the real
# file name and spell the extension out.
add_executable(freshman "./src/sumMatrix2D")