国科大cuda编程作业：加速计算基础 —— CUDA C/C++

最新推荐文章于 2024-06-03 12:06:00 发布

wjsay

最新推荐文章于 2024-06-03 12:06:00 发布

阅读量6.3k

点赞数 11

分类专栏： C/C++

本文链接：https://blog.csdn.net/baisedeqingting/article/details/103703653

版权

C/C++ 专栏收录该内容

4 篇文章 1 订阅

订阅专栏

使用 CUDA C/C++ 加速应用程序

被加速的C/C++应用程序的异步流和可视化分析
利用基本的 CUDA 内存管理技术来优化加速应用程序
被加速的C/C++应用程序的异步流和可视化分析

完成第三步中的GPU task便可拿到该课程的证书，完成作业。

我的代码思路是：

将bodyForce函数改为核函数，在GPU上运行。因为多个epoch必须按序执行，所以无法使用并发的cuda流，默认的串行流行为可以完成任务。
将bodyForce执行结束后的for循环改为核函数。
其他技巧就是第二个学习文档提到的块数和线程数设置

为了将博客升到6级，我得多写点博文了。
以下是在速度和正确性上都通过测试的cuda代码。

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"
#include "check.h"

#define SOFTENING 1e-9f

/*
 * Each body contains x, y, and z coordinate positions,
 * as well as velocities in the x, y, and z directions.
 */

typedef struct {
  /* position */
  float x;
  float y;
  float z;
  /* velocity */
  float vx;
  float vy;
  float vz;
} Body;

/*
 * Do not modify this function. A constraint of this exercise is
 * that it remain a host function.
 */

/*
 * Fill data[0] .. data[n-1] with pseudo-random floats.
 *
 * data: destination array, written in index order.
 * n:    number of floats to write (not a number of bodies).
 *
 * Values come from rand(); this function does not seed the generator.
 */
void randomizeBodies(float *data, int n) {
  for (int i = 0; i < n; i++) {
    // rand() / RAND_MAX lies in [0, 1]; scale and shift it into [-1, 1].
    data[i] = 2.0f * (rand() / (float)RAND_MAX) - 1.0f;
  }
}

/*
 * Kernel: accumulate the pull of all n bodies on each body and apply it to
 * that body's velocity over one time step `dt`.
 *
 * Each thread starts at its global thread index and advances by the total
 * number of launched threads, so any 1D launch configuration covers all n
 * bodies. Only the velocity fields (vx, vy, vz) are written; positions are
 * read but never modified here.
 */
__global__
void bodyForce(Body *p, float dt, int n) {
  const int firstBody = threadIdx.x + blockIdx.x * blockDim.x;
  const int threadCount = blockDim.x * gridDim.x;

  for (int self = firstBody; self < n; self += threadCount) {
    float Fx = 0.0f;
    float Fy = 0.0f;
    float Fz = 0.0f;

    for (int other = 0; other < n; other++) {
      // Displacement from body `self` to body `other`.
      const float dx = p[other].x - p[self].x;
      const float dy = p[other].y - p[self].y;
      const float dz = p[other].z - p[self].z;

      // SOFTENING keeps the squared distance non-zero when other == self.
      const float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
      const float invDist = rsqrtf(distSqr);
      const float invDist3 = invDist * invDist * invDist;

      Fx += dx * invDist3;
      Fy += dy * invDist3;
      Fz += dz * invDist3;
    }

    p[self].vx += dt*Fx;
    p[self].vy += dt*Fy;
    p[self].vz += dt*Fz;
  }
}
/*
 * Kernel: advance every body's position by its current velocity over one
 * time step `dt`.
 *
 * Each thread starts at its global thread index and advances by the total
 * number of launched threads, so any 1D launch configuration covers all n
 * bodies. Velocities are read only.
 */
__global__ void add(Body*p, float dt,int n) {
  const int firstBody = threadIdx.x + blockIdx.x * blockDim.x;
  const int threadCount = blockDim.x * gridDim.x;

  for (int b = firstBody; b < n; b += threadCount) {
    Body *body = &p[b];
    body->x += body->vx*dt;
    body->y += body->vy*dt;
    body->z += body->vz*dt;
  }
}

/*
 * Abort with a readable message when a CUDA runtime call reports failure.
 * `what` names the call or launch being checked.
 */
static void nbodyCheckCuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

/*
 * Entry point: runs the n-body simulation on the GPU and reports throughput.
 *
 * argv[1] (optional): exponent e; the body count becomes 2 << e.
 * argv[2] (optional): assessment salt, forwarded to checkPerformance.
 *
 * Each iteration launches bodyForce then add on the default stream, so the
 * two kernels already run in order. The host synchronizes once, after both
 * launches, so the timer covers the full step and the managed buffer is
 * up to date before the host reads it.
 */
int main(const int argc, const char** argv) {
  int deviceId;
  int numberOfSMs;

  nbodyCheckCuda(cudaGetDevice(&deviceId), "cudaGetDevice");
  nbodyCheckCuda(
      cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId),
      "cudaDeviceGetAttribute");

  /*
   * Do not change the value for `nBodies` here. If you would like to modify it,
   * pass values into the command line.
   */

  int nBodies = 2<<11;
  int salt = 0;
  if (argc > 1) nBodies = 2<<atoi(argv[1]);

  /*
   * This salt is for assessment reasons. Tampering with it will result in automatic failure.
   */

  if (argc > 2) salt = atoi(argv[2]);

  const float dt = 0.01f; // time step
  const int nIters = 10;  // simulation iterations

  // size_t avoids truncating the byte count for large body counts.
  const size_t bytes = (size_t)nBodies * sizeof(Body);
  float *buf;

  nbodyCheckCuda(cudaMallocManaged(&buf, bytes), "cudaMallocManaged");
  //cudaMemPrefetchAsync(buf, bytes, deviceId);

  Body *p = (Body*)buf;

  /*
   * As a constraint of this exercise, `randomizeBodies` must remain a host function.
   */

  randomizeBodies(buf, 6 * nBodies); // Init pos / vel data

  size_t threadsPerBlock = 256;
  size_t numberOfBlocks = 32 * numberOfSMs;

  double totalTime = 0.0;

  /*
   * This simulation will run for 10 cycles of time, calculating gravitational
   * interaction amongst bodies, and adjusting their positions to reflect.
   */

  /*******************************************************************/
  // Do not modify these 2 lines of code.
  for (int iter = 0; iter < nIters; iter++) {
    StartTimer();
  /*******************************************************************/

    /*
     * Both kernels go to the default stream, which runs them in launch order,
     * so no host-side synchronization is needed between them.
     */
    bodyForce<<< numberOfBlocks, threadsPerBlock >>>(p, dt, nBodies); // compute interbody forces
    nbodyCheckCuda(cudaGetLastError(), "bodyForce launch");

    add<<< numberOfBlocks, threadsPerBlock >>>(p, dt, nBodies); // integrate positions
    nbodyCheckCuda(cudaGetLastError(), "add launch");

    // Wait for the whole step so the timer below includes the integration kernel.
    nbodyCheckCuda(cudaDeviceSynchronize(), "simulation step");

  // Do not modify the code in this section.
    const double tElapsed = GetTimer() / 1000.0;
    totalTime += tElapsed;
  }

  double avgTime = totalTime / (double)(nIters);
  float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;

#ifdef ASSESS
  checkPerformance(buf, billionsOfOpsPerSecond, salt);
#else
  checkAccuracy(buf, nBodies);
  printf("%d Bodies: average %0.3f Billion Interactions / second\n", nBodies, billionsOfOpsPerSecond);
  salt += 1;
#endif
  /*******************************************************************/

  /*
   * Feel free to modify code below.
   */

  nbodyCheckCuda(cudaFree(buf), "cudaFree");
  return 0;
}

wjsay

关注

11
点赞
踩
32

收藏

觉得还不错? 一键收藏
27
评论
国科大cuda编程作业：加速计算基础 —— CUDA C/C++

使用 CUDA C/C++ 加速应用程序被加速的C/C++应用程序的异步流和可视化分利用基本的 CUDA 内存管理技术来优化加速应用程序被加速的C/C++应用程序的异步流和可视化分析完成第三步中的GPU task便可拿到该课程的证书，完成作业。我的代码思路是：将bodyForce函数改为核函数，在GPU上运行。因为多个epoch必须按序执行，所以无法使用并发的cuda流，默...
复制链接

扫一扫