https://developer.nvidia.com/blog/even-easier-introduction-cuda/
首先是一个简单的c++数组加法
#include <iostream>
#include <math.h>
// function to add the elements of two arrays
// Element-wise sum on the host: y[i] <- x[i] + y[i] for the first n entries.
void add(int n, float* x, float* y)
{
    for (int idx = 0; idx != n; ++idx)
        y[idx] += x[idx];
}
// Host-only driver: fill two 1M-element arrays, sum them serially on the
// CPU, and report the largest deviation from the expected value 3.0f.
int main(void)
{
    const int N = 1 << 20; // 1M elements

    float* x = new float[N];
    float* y = new float[N];

    // Seed the inputs so that every element of x + y should equal 3.0f.
    for (int i = 0; i < N; ++i) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    // Serial CPU pass over all 1M elements.
    add(N, x, y);

    // Verify: every y[i] should now be exactly 3.0f.
    float maxError = 0.0f;
    for (int i = 0; i < N; ++i)
        maxError = fmax(maxError, fabs(y[i] - 3.0f));
    std::cout << "Max error: " << maxError << std::endl;

    // Release the host buffers.
    delete[] x;
    delete[] y;
    return 0;
}
CUDA文件的文件扩展名为.cu。因此,请将此代码保存在名为add.cu的文件中,然后使用CUDA C++编译器nvcc对其进行编译。
> nvcc add.cu -o add_cuda
> ./add_cuda
Max error: 0.000000
使用CUDA Toolkit中的nvprof 分析运行时长
$ nvprof ./add_cuda
==3355== NVPROF is profiling process 3355, command: ./add_cuda
Max error: 0
==3355== Profiling application: ./add_cuda
==3355== Profiling result:
Time(%) Time Calls Avg Min Max Name
100.00% 463.25ms 1 463.25ms 463.25ms 463.25ms add(int, float*, float*)
...
上面的操作还是没有并行的。<<<1, 1>>>告诉GPU要启动多少并行线程:第二个参数指定每个thread block中启动多少个thread。CUDA GPU使用大小为32的倍数的线程块运行内核。当然,只改变这个参数没有意义——如果仅这么修改,就是n个线程各自重复执行了全部n次加法,而不是并行分工。我们还需要修改kernel代码,通过CUDA提供的内置变量获取当前线程的index,比如threadIdx.x和blockDim.x
#include <iostream>
#include <math.h>
// function to add the elements of two arrays
// CUDA kernel: element-wise sum y[i] <- x[i] + y[i] on the GPU.
// __global__ functions are known as kernels; code running on the GPU is
// device code, while code running on the CPU is host code.
// Each thread starts at its index within the block (threadIdx.x) and
// advances by the block width (blockDim.x), so a single block of any
// size covers all n elements with no overlap.
// Note: using blockDim.x as the stride here (rather than in a product)
// is what distinguishes this from the global-index formula, where
// blockDim.x participates in a multiplication.
__global__
void add(int n, float* x, float* y)
{
    int first = threadIdx.x; // this thread's position within the block
    int step  = blockDim.x;  // total threads in the block
    for (int i = first; i < n; i += step)
        y[i] += x[i];
}
// Host driver: allocate Unified Memory, launch the add kernel with one
// block of one thread, synchronize, then verify the result on the host.
// Returns 0 on success, 1 on any CUDA runtime failure.
int main(void)
{
    int N = 1 << 20; // 1M elements

    // Allocate Unified Memory -- accessible from CPU or GPU.
    float* x;
    float* y;
    cudaError_t err = cudaMallocManaged(&x, N * sizeof(float));
    if (err != cudaSuccess) {
        std::cerr << "cudaMallocManaged(x) failed: "
                  << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    err = cudaMallocManaged(&y, N * sizeof(float));
    if (err != cudaSuccess) {
        std::cerr << "cudaMallocManaged(y) failed: "
                  << cudaGetErrorString(err) << std::endl;
        cudaFree(x);
        return 1;
    }

    // Initialize x and y arrays on the host.
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    // Run kernel on 1M elements on the GPU: one block, one thread.
    add<<<1, 1>>>(N, x, y);

    // A kernel launch does not return an error directly; bad launch
    // configurations surface via cudaGetLastError().
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "kernel launch failed: "
                  << cudaGetErrorString(err) << std::endl;
        cudaFree(x);
        cudaFree(y);
        return 1;
    }

    // Wait for the GPU to finish before accessing results on the host;
    // this also surfaces asynchronous execution errors.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cerr << "cudaDeviceSynchronize failed: "
                  << cudaGetErrorString(err) << std::endl;
        cudaFree(x);
        cudaFree(y);
        return 1;
    }

    // Check for errors (all values should be 3.0f).
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = fmax(maxError, fabs(y[i] - 3.0f));
    std::cout << "Max error: " << maxError << std::endl;

    // Free Unified Memory.
    cudaFree(x);
    cudaFree(y);
    return 0;
}
GPU有很多parallel processors,然后这些parallel processors 都被放在Streaming Multiprocessors上,而每个SM都可以运行多个thread blocks。为了充分利用所有这些线程,我应该使用多个thread block启动kernel。并行的thread block又组成了一个grid。因为我有N个元素要处理,每个block有256个线程,用N除以块大小就可以得到block的个数(在N不是块大小的倍数的情况下要小心四舍五入)。
看到Idx和Dim的差异了吧。
再进一步考虑grid。CUDA提供gridDim.x,它包含grid中的block数量,以及blockIdx.x,它包含grid中当前thread block的index。图1演示了使用blockDim.x,gridDim.x, threadIdx.x在CUDA中对数组(一维)进行索引的方法。其思想是,每个thread先计算其所在block开始处的偏移量(blockIdx.x * blockDim.x),再加上自己在block内的线程索引(threadIdx.x),就得到全局索引:blockIdx.x * blockDim.x + threadIdx.x
#include <iostream>
#include <math.h>
// function to add the elements of two arrays
// CUDA kernel: element-wise sum y[i] <- x[i] + y[i] on the GPU.
// __global__ functions are known as kernels; code running on the GPU is
// device code, while code running on the CPU is host code.
// Grid-stride loop: each thread begins at its flat global index
// (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total
// number of threads in the grid (blockDim.x * gridDim.x), so any launch
// configuration — even a single block — covers all n elements.
__global__
void add(int n, float* x, float* y)
{
    int start = blockIdx.x * blockDim.x + threadIdx.x;
    int step  = blockDim.x * gridDim.x;
    for (int i = start; i < n; i += step)
        y[i] += x[i];
}
// Host driver: allocate Unified Memory, launch the add kernel across
// enough 256-thread blocks to cover all N elements, synchronize, then
// verify the result on the host.
// Returns 0 on success, 1 on any CUDA runtime failure.
int main(void)
{
    int N = 1 << 20; // 1M elements

    // Allocate Unified Memory -- accessible from CPU or GPU.
    float* x;
    float* y;
    cudaError_t err = cudaMallocManaged(&x, N * sizeof(float));
    if (err != cudaSuccess) {
        std::cerr << "cudaMallocManaged(x) failed: "
                  << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    err = cudaMallocManaged(&y, N * sizeof(float));
    if (err != cudaSuccess) {
        std::cerr << "cudaMallocManaged(y) failed: "
                  << cudaGetErrorString(err) << std::endl;
        cudaFree(x);
        return 1;
    }

    // Initialize x and y arrays on the host.
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    // Run kernel on 1M elements on the GPU. Ceil-divide so the grid
    // still covers N when it is not a multiple of the block size.
    int blockSize = 256;
    int numBlocks = (N + blockSize - 1) / blockSize;
    add<<<numBlocks, blockSize>>>(N, x, y);

    // A kernel launch does not return an error directly; bad launch
    // configurations surface via cudaGetLastError().
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "kernel launch failed: "
                  << cudaGetErrorString(err) << std::endl;
        cudaFree(x);
        cudaFree(y);
        return 1;
    }

    // Wait for the GPU to finish before accessing results on the host;
    // this also surfaces asynchronous execution errors.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cerr << "cudaDeviceSynchronize failed: "
                  << cudaGetErrorString(err) << std::endl;
        cudaFree(x);
        cudaFree(y);
        return 1;
    }

    // Check for errors (all values should be 3.0f).
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = fmax(maxError, fabs(y[i] - 3.0f));
    std::cout << "Max error: " << maxError << std::endl;

    // Free Unified Memory.
    cudaFree(x);
    cudaFree(y);
    return 0;
}