【nvidia英伟达认证】加速计算基础 —— CUDA C/C++（附通过代码）

失舵之舟-

已于 2024-11-20 15:43:12 修改

阅读量1.8k

点赞数 6

文章标签： c语言 c++ 开发语言英伟达认证 cuda nvidia

于 2024-06-13 17:20:58 首次发布

本文链接：https://blog.csdn.net/qq_50771882/article/details/139659208

版权

nvidia英伟达认证

1 介绍
2 完成任务1和任务2
3 完成任务3
- 3.1 代码

1 介绍

这是我们学校高性能计算的课程任务，最后会得到这样一个证书。
在这里插入图片描述
要完成这个认证，一共有三个任务，前面两个任务就自己慢慢学慢慢看就行，第三个任务有个小测试。

2 完成任务1和任务2

图片看不清楚的话，双击就会放大。
点击start开始
在这里插入图片描述

打开之后就是jupyter notebook

接下来一个个运行，自己认真学习即可，做完就点stop task，任务二也是这样。

3 完成任务3

前面都是一步步运行
在这里插入图片描述
一直到最后的练习：加速和优化N体模拟器

3.1 代码

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"
#include "files.h"

#define SOFTENING 1e-9f

/*
 * Each body contains x, y, and z coordinate positions,
 * as well as velocities in the x, y, and z directions.
 */

typedef struct { float x, y, z, vx, vy, vz; } Body;

/*
 * Calculate the gravitational impact of all bodies in the system
 * on all others.
 */

__global__ void bodyForce(Body *p, float dt, int n) {
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;
  for (int i = index; i < n; i+=stride) {
    float Fx = 0.0f; float Fy = 0.0f; float Fz = 0.0f;

    for (int j = 0; j < n; j++) {
      float dx = p[j].x - p[i].x;
      float dy = p[j].y - p[i].y;
      float dz = p[j].z - p[i].z;
      float distSqr = dx*dx + dy*dy + dz*dz + SOFTENING;
      float invDist = rsqrtf(distSqr);
      float invDist3 = invDist * invDist * invDist;

      Fx += dx * invDist3; Fy += dy * invDist3; Fz += dz * invDist3;
    }

    p[i].vx += dt*Fx; p[i].vy += dt*Fy; p[i].vz += dt*Fz;
  }
}


int main(const int argc, const char** argv) {
  
  int deviceId;
  int numberOfSMs;

  cudaGetDevice(&deviceId);
  cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
  
  
    
  // The assessment will test against both 2<11 and 2<15.
  // Feel free to pass the command line argument 15 when you gernate ./nbody report files
  int nBodies = 2<<11;
  if (argc > 1) nBodies = 2<<atoi(argv[1]);

  // The assessment will pass hidden initialized values to check for correctness.
  // You should not make changes to these files, or else the assessment will not work.
  const char * initialized_values;
  const char * solution_values;

  if (nBodies == 2<<11) {
    initialized_values = "files/initialized_4096";
    solution_values = "files/solution_4096";
  } else { // nBodies == 2<<15
    initialized_values = "files/initialized_65536";
    solution_values = "files/solution_65536";
  }

  if (argc > 2) initialized_values = argv[2];
  if (argc > 3) solution_values = argv[3];

  const float dt = 0.01f; // Time step
  const int nIters = 10;  // Simulation iterations

  int bytes = nBodies * sizeof(Body);
  float *buf;

 
  cudaMallocManaged(&buf, bytes);
  Body *p = (Body*)buf;

  cudaMemPrefetchAsync(p, bytes, deviceId);
  
  read_values_from_file(initialized_values, buf, bytes);

  double totalTime = 0.0;

  /*
   * This simulation will run for 10 cycles of time, calculating gravitational
   * interaction amongst bodies, and adjusting their positions to reflect.
   */

  for (int iter = 0; iter < nIters; iter++) {
    StartTimer();

  /*
   * You will likely wish to refactor the work being done in `bodyForce`,
   * and potentially the work to integrate the positions.
   */
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    
    int blocknum = nBodies/512;
    bodyForce<<<blocknum,512,0,stream>>>(p, dt, nBodies); // compute interbody forces
    cudaStreamDestroy(stream);
  /*
   * This position integration cannot occur until this round of `bodyForce` has completed.
   * Also, the next round of `bodyForce` cannot begin until the integration is complete.
   */
    cudaDeviceSynchronize();
    //cudaMemPrefetchAsync(p, bytes, cudaCpuDeviceId);
    for (int i = 0 ; i < nBodies; i++) { // integrate position
      p[i].x += p[i].vx*dt;
      p[i].y += p[i].vy*dt;
      p[i].z += p[i].vz*dt;
    }

    const double tElapsed = GetTimer() / 1000.0;
    totalTime += tElapsed;
  }

  double avgTime = totalTime / (double)(nIters);
  float billionsOfOpsPerSecond = 1e-9 * nBodies * nBodies / avgTime;
  write_values_to_file(solution_values, buf, bytes);

  // You will likely enjoy watching this value grow as you accelerate the application,
  // but beware that a failure to correctly synchronize the device might result in
  // unrealistically high values.
  printf("%0.3f Billion Interactions / second", billionsOfOpsPerSecond);

  free(buf);
}
test_xzr