CUDA计算二维矩阵的加法

最新推荐文章于 2024-07-25 22:07:31 发布

Rainbow Sea

最新推荐文章于 2024-07-25 22:07:31 发布

阅读量175

点赞数

文章标签：矩阵、算法、线性代数

本文链接：https://blog.csdn.net/qq_46463876/article/details/133942975

版权

#include <cuda_runtime.h>
#include <stdio.h>
#include "freshman.h"

// Host reference: element-wise sum of two flat float arrays.
//   a_h, b_h : input arrays, read at indices [0, size)
//   res_h    : output array, res_h[k] = a_h[k] + b_h[k] for k in [0, size)
//   size     : number of elements to process; nothing is written when size <= 0
void SumMatrixOnHost(float* a_h, float* b_h, float* res_h, const int &size){
    int k = 0;
    while(k < size){
        res_h[k] = a_h[k] + b_h[k];
        ++k;
    }
}

// Host reference: adds two matrices stored contiguously row after row.
//   a_h, b_h : inputs, nx * ny floats each
//   res_h    : output, nx * ny floats
//   nx       : elements per row
//   ny       : number of rows
void SumMatrixOnHost(float* a_h, float* b_h, float* res_h, const int nx, const int ny){
    float* lhs = a_h;
    float* rhs = b_h;
    float* out = res_h;
    // Walk one row per outer iteration; the three cursors advance by a full
    // row (nx elements) after each row has been summed.
    for(int row = 0; row < ny; ++row, lhs += nx, rhs += nx, out += nx)
    {
        for(int col = 0; col < nx; ++col)
        {
            out[col] = lhs[col] + rhs[col];
        }
    }
}

// Element-wise matrix addition kernel: res_d[iy * nx + ix] = a_d[...] + b_d[...].
//
// Each thread handles the single element at column ix / row iy, derived from
// its 2D block and thread indices. Threads whose (ix, iy) fall outside the
// nx-by-ny matrix do nothing, so the grid may over-cover the matrix.
//
//   a_d, b_d : inputs, at least nx * ny floats, rows stored contiguously
//   res_d    : output, at least nx * ny floats
//   nx       : elements per row
//   ny       : number of rows
__global__ void SumMatrixOnGpu(float* a_d, float* b_d, float* res_d, int nx, int ny){
    // A non-positive dimension means there is nothing to do. Checking it up
    // front also keeps the unsigned comparisons below from treating a negative
    // value as a huge bound.
    if(nx <= 0 || ny <= 0){
        return;
    }

    const unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;

    if(ix < (unsigned int)nx && iy < (unsigned int)ny){
        // Form the flat index in size_t: iy * nx wraps in 32-bit arithmetic
        // once the matrix reaches 2^32 elements.
        const size_t idx = (size_t)iy * (size_t)nx + ix;
        res_d[idx] = a_d[idx] + b_d[idx];
    }
}

// Adds two nx-by-ny float matrices on the host and on the GPU, prints the
// elapsed time of each, and compares the two results with checkResult().
// Returns 0 on success, 1 if a host allocation fails; any failing CUDA call
// terminates the process through CHECK.
int main(){
    // Select device 0 and report its name.
    int dev = 0;
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop, dev));
    printf("Using Device : %d: %s \n", dev, deviceprop.name);
    CHECK(cudaSetDevice(dev));

    // Matrix dimensions.
    int nx = 1 << 14;
    int ny = 1 << 14;

    // The element count stays int because initialData() and checkResult()
    // take int counts; the byte count is widened to size_t before multiplying
    // so it does not overflow int.
    int nxy = nx * ny;
    size_t nBytes = (size_t)nxy * sizeof(float);
    printf("Matrix size : nx %d ny %d \n", nx, ny);

    // Host buffers: two inputs, the CPU result, and a copy of the GPU result.
    float *a_h, *b_h, *res_h, *res_gpu;
    a_h = (float*)malloc(nBytes);
    b_h = (float*)malloc(nBytes);
    res_h = (float*)malloc(nBytes);
    res_gpu = (float*)malloc(nBytes);
    if(a_h == NULL || b_h == NULL || res_h == NULL || res_gpu == NULL){
        printf("ERROR: host allocation of %llu bytes failed\n", (unsigned long long)nBytes);
        // free(NULL) is a no-op, so releasing all four is safe.
        free(a_h);
        free(b_h);
        free(res_h);
        free(res_gpu);
        return 1;
    }

    // Fill the inputs on the host.
    initialData(a_h, nxy);
    initialData(b_h, nxy);
    // Trailing newline added so this message does not run into the next one.
    printf("initialize data complete! %f, %f\n", a_h[0], b_h[0]);

    // Zero both result buffers.
    memset(res_gpu, 0, nBytes);
    memset(res_h, 0, nBytes);
    printf("set 0 at host device  complete!\n");

    // CPU reference, timed.
    double istart = cpuSecond();
    SumMatrixOnHost(a_h, b_h, res_h, nx, ny);
    double iElaps = cpuSecond() - istart;
    printf("CPU Exection time : %f \n", iElaps);

    // Device buffers.
    float* a_d, *b_d, *c_d;
    CHECK(cudaMalloc((void**)&(a_d), nBytes));
    CHECK(cudaMalloc((void**)&(b_d), nBytes));
    CHECK(cudaMalloc((void**)&(c_d), nBytes));

    // Copy the inputs to the device.
    CHECK(cudaMemcpy(a_d, a_h, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(b_d, b_h, nBytes, cudaMemcpyHostToDevice));

    // Launch configuration: 32x32 threads per block, with the grid rounded
    // up so it covers the whole matrix.
    int dimx = 32;
    int dimy = 32;
    dim3 block(dimx, dimy);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    istart = cpuSecond();
    SumMatrixOnGpu<<<grid, block>>>(a_d, b_d, c_d, nx, ny);
    // A launch does not return a status itself: query it explicitly, then
    // synchronize so errors raised while the kernel runs are reported too.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    iElaps = cpuSecond() - istart;
    printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n",
        grid.x, grid.y, block.x, block.y, iElaps);

    CHECK(cudaMemcpy(res_gpu, c_d, nBytes, cudaMemcpyDeviceToHost));

    // Compare the CPU and GPU results.
    checkResult(res_h, res_gpu ,nxy);

    // Release device memory.
    CHECK(cudaFree(a_d));
    CHECK(cudaFree(b_d));
    CHECK(cudaFree(c_d));

    // Release host memory.
    free(a_h);
    free(b_h);
    free(res_gpu);
    free(res_h);

    // Reset the device.
    CHECK(cudaDeviceReset());

    return 0;
}


//#include <cuda_runtime.h>
//#include <stdio.h>
#include <time.h>

#ifndef FRESHMAN_H
#define FRESHMAN_H
/*
 * CHECK(call): evaluates the CUDA runtime expression `call` exactly once.
 * If the result is not cudaSuccess, prints the file, line, numeric code and
 * cudaGetErrorString() text, then terminates the process with exit(1).
 *
 * The argument is parenthesized and the temporary has a macro-private name,
 * so an expression that itself mentions a variable called `error` is not
 * captured by the macro's own local. The bare-brace form is kept so existing
 * call sites, with or without a trailing semicolon, keep compiling.
 */
#define CHECK(call)\
{\
  const cudaError_t check_error_=(call);\
  if(check_error_!=cudaSuccess)\
  {\
      printf("ERROR: %s:%d,",__FILE__,__LINE__);\
      printf("code:%d,reason:%s\n",(int)check_error_,cudaGetErrorString(check_error_));\
      exit(1);\
  }\
}


#ifdef _WIN32
#	include <windows.h>
#else
#	include <sys/time.h>
#endif
#ifdef _WIN32
/*
 * Windows stand-in for gettimeofday(): fills *tp from the current local time.
 *   tp  : receives whole seconds (via mktime) and microseconds
 *   tzp : accepted for signature compatibility; never read
 * Always returns 0.
 */
int gettimeofday(struct timeval *tp, void *tzp)
{
  SYSTEMTIME now;
  struct tm parts;

  GetLocalTime(&now);

  /* Translate the SYSTEMTIME fields into struct tm conventions. */
  parts.tm_sec   = now.wSecond;
  parts.tm_min   = now.wMinute;
  parts.tm_hour  = now.wHour;
  parts.tm_mday  = now.wDay;
  parts.tm_mon   = now.wMonth - 1;
  parts.tm_year  = now.wYear - 1900;
  parts.tm_isdst = -1;

  const time_t seconds = mktime(&parts);
  tp->tv_sec = seconds;
  /* SYSTEMTIME carries milliseconds; timeval wants microseconds. */
  tp->tv_usec = now.wMilliseconds * 1000;
  return 0;
}
#endif
// Returns the current gettimeofday() reading as seconds in a double
// (whole seconds plus the microsecond part scaled by 1e-6).
double cpuSecond()
{
  struct timeval now;
  gettimeofday(&now,NULL);
  const double wholeSeconds=(double)now.tv_sec;
  const double fraction=(double)now.tv_usec*1e-6;
  return wholeSeconds+fraction;
}
// Fills ip[0 .. size) with pseudo-random floats: (rand() & 0xffff) / 1000.0f.
//   ip   : destination array with room for `size` floats
//   size : number of elements to write; nothing is written when size <= 0
//
// The generator is seeded from the clock only on the first call. Re-seeding on
// every call with a one-second-resolution time made two calls issued within
// the same second produce identical data.
void initialData(float* ip,int size)
{
  static int seeded=0;
  if(!seeded)
  {
    srand((unsigned)time(NULL));
    seeded=1;
  }
  for(int i=0;i<size;i++)
  {
    ip[i]=(float)(rand()&0xffff)/1000.0f;
  }
}
// Fills ip[0 .. size) with pseudo-random integers: rand() & 0xff.
//   ip   : destination array with room for `size` ints
//   size : number of elements to write; nothing is written when size <= 0
//
// The generator is seeded from the clock only on the first call. Re-seeding on
// every call with a one-second-resolution time made two calls issued within
// the same second produce identical data.
void initialData_int(int* ip, int size)
{
	static int seeded = 0;
	if (!seeded)
	{
		srand((unsigned)time(NULL));
		seeded = 1;
	}
	for (int i = 0; i<size; i++)
	{
		ip[i] = int(rand()&0xff);
	}
}
// Prints a matrix stored contiguously row after row, one text line per row.
//   C  : matrix data, nx * ny floats
//   nx : elements per row
//   ny : number of rows
// The "Matrix<ny,nx>:" header is printed without a newline, so the first row
// follows it on the same line.
void printMatrix(float * C,const int nx,const int ny)
{
  float *ic=C;
  printf("Matrix<%d,%d>:",ny,nx);
  for(int i=0;i<ny;i++)
  {
    for(int j=0;j<nx;j++)
    {
      // Read through the row cursor; indexing C directly printed the first
      // row on every line.
      printf("%6f ",ic[j]);
    }
    ic+=nx; // advance the cursor to the start of the next row
    printf("\n");
  }
}

// Queries the properties of device `devNum`, prints its name, and makes it
// the current device. A failing CUDA call terminates the process via CHECK.
void initDevice(int devNum)
{
  cudaDeviceProp props;
  CHECK(cudaGetDeviceProperties(&props,devNum));
  printf("Using device %d: %s\n",devNum,props.name);
  CHECK(cudaSetDevice(devNum));
}
// Compares two float arrays element by element and prints the outcome.
//   hostRef : reference values, N floats
//   gpuRef  : values under test, N floats
//   N       : number of elements to compare
// Prints the first pair whose absolute difference exceeds 1.0E-8 and returns;
// prints a success message when no such pair exists.
void checkResult(float * hostRef,float * gpuRef,const int N)
{
  double epsilon=1.0E-8;
  for(int i=0;i<N;i++)
  {
    // Take the magnitude explicitly: an unqualified abs() can resolve to the
    // integer overload when only C headers are in scope, which would truncate
    // any difference below 1 to 0 and hide real mismatches.
    float diff=hostRef[i]-gpuRef[i];
    if(diff<0.0f)
    {
      diff=-diff;
    }
    if(diff>epsilon)
    {
      printf("Results don\'t match!\n");
      printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n",hostRef[i],i,gpuRef[i],i);
      return;
    }
  }
  printf("Check result success!\n");
}
#endif//FRESHMAN_H

CMakeLists.txt文件的编写

# Build script for the sumMatrix2D example.
# Refuse to configure with a CMake older than 3.10.
cmake_minimum_required(VERSION 3.10 FATAL_ERROR)

# Declare the project and enable the C++, C and CUDA languages.
Project(CUDA_Freshman CXX C CUDA)

#set_target_properties(CUDA_Freshman PROPERTIES CUDA_ARCHITECTURES "52;60;61")

# Flags handed to the CUDA compiler.
# NOTE(review): -G (device debug) and -O3 are requested together, and
# compute_35 may not be accepted by recent CUDA toolkits - confirm against the
# toolkit in use.
set(CMAKE_CUDA_FLAGS "-arch=compute_35 -g -G -O3")

# Add ./include to the header search path (presumably where freshman.h lives).
include_directories("./include")

# Build the `freshman` executable from src/sumMatrix2D.
# NOTE(review): the source is listed without a file extension - confirm CMake
# resolves it to the intended .cu file.
add_executable(freshman "./src/sumMatrix2D")