https://developer.nvidia.com/blog/even-easier-introduction-cuda/
首先是一个简单的c++数组加法
#include <iostream>
#include <math.h>
// function to add the elements of two arrays
// Element-wise sum on the host: y[i] <- x[i] + y[i] for the first n entries.
void add(int n, float* x, float* y)
{
    for (int idx = 0; idx != n; ++idx)
        y[idx] += x[idx];
}
// Host-only driver: fill two 1M-element arrays, sum them serially on the
// CPU, and report the largest deviation from the expected value 3.0f.
int main(void)
{
    const int N = 1 << 20; // 1M elements

    float* x = new float[N];
    float* y = new float[N];

    // Seed the inputs so that every element of x + y should equal 3.0f.
    for (int i = 0; i < N; ++i) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    // Serial CPU pass over all 1M elements.
    add(N, x, y);

    // Verify: every y[i] should now be exactly 3.0f.
    float maxError = 0.0f;
    for (int i = 0; i < N; ++i)
        maxError = fmax(maxError, fabs(y[i] - 3.0f));
    std::cout << "Max error: " << maxError << std::endl;

    // Release the host buffers.
    delete[] x;
    delete[] y;
    return 0;
}
CUDA文件的文件扩展名为.cu。因此,请将此代码保存在名为add.cu的文件中,然后使用CUDA C++编译器nvcc对其进行编译。
> nvcc add.cu -o add_cuda
> ./add_cuda
Max error: 0.000000
使用CUDA Toolkit中的nvprof 分析运行时长
$ nvprof ./add_cuda
==3355== NVPROF is profiling process 3355, command: ./add_cuda
Max error: 0
==3355== Profiling application: ./add_cuda
==3355== Profiling result:
Time(%) Time Calls Avg Min Max Name
100.00% 463.25ms 1 463.25ms 463.25ms 463.25ms add(int, float*, float*)
...
上面的操作还是没有并行的。<<<1, 1>>>告诉GPU要启动多少并行线程:第二个参数指定每个thread block中启动多少个thread。CUDA GPU使用大小为32的倍数的线程块运行内核。当然,只改变这个参数没有意义——如果仅这么修改,就是n个线程各自重复执行了全部n次加法,而不是并行分工。我们还需要修改kernel代码,通过CUDA提供的内置变量获取当前线程的index,比如threadIdx.x和blockDim.x
#include <iostream>
#include <math.h>
// function to add the elements of two arrays
// CUDA kernel: element-wise sum y[i] <- x[i] + y[i] on the GPU.
// __global__ functions are known as kernels; code running on the GPU is
// device code, while code running on the CPU is host code.
// Each thread starts at its index within the block (threadIdx.x) and
// advances by the block width (blockDim.x), so a single block of any
// size covers all n elements with no overlap.
// Note: using blockDim.x as the stride here (rather than in a product)
// is what distinguishes this from the global-index formula, where
// blockDim.x participates in a multiplication.
__global__
void add(int n, float* x, float* y)
{
    int first = threadIdx.x; // this thread's position within the block
    int step  = blockDim.x;  // total threads in the block
    for (int i = first; i < n; i += step)
        y[i] += x[i];
}
// Host driver: allocate Unified Memory, launch the add kernel with one
// block of one thread, synchronize, then verify the result on the host.
// Returns 0 on success, 1 on any CUDA runtime failure.
int main(void)
{
    int N = 1 << 20; // 1M elements

    // Allocate Unified Memory -- accessible from CPU or GPU.
    float* x;
    float* y;
    cudaError_t err = cudaMallocManaged(&x, N * sizeof(float));
    if (err != cudaSuccess) {
        std::cerr << "cudaMallocManaged(x) failed: "
                  << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    err = cudaMallocManaged(&y, N * sizeof(float));
    if (err != cudaSuccess) {
        std::cerr << "cudaMallocManaged(y) failed: "
                  << cudaGetErrorString(err) << std::endl;
        cudaFree(x);
        return 1;
    }

    // Initialize x and y arrays on the host.
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    // Run kernel on 1M elements on the GPU: one block, one thread.
    add<<<1, 1>>>(N, x, y);

    // A kernel launch does not return an error directly; bad launch
    // configurations surface via cudaGetLastError().
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "kernel launch failed: "
                  << cudaGetErrorString(err) << std::endl;
        cudaFree(x);
        cudaFree(y);
        return 1;
    }

    // Wait for the GPU to finish before accessing results on the host;
    // this also surfaces asynchronous execution errors.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cerr << "cudaDeviceSynchronize failed: "
                  << cudaGetErrorString(err) << std::endl;
        cudaFree(x);
        cudaFree(y);
        return 1;
    }

    // Check for errors (all values should be 3.0f).
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = fmax(maxError, fabs(y[i] - 3.0f));
    std::cout << "Max error: " << maxError << std::endl;

    // Free Unified Memory.
    cudaFree(x);
    cudaFree(y);
    return 0;
}
GPU有很多parallel processors,然后这些parallel processors 都被放在Streaming Multiprocessors上,而每个SM都可以运行多个thread blocks。为了充分利用所有这些线程,我应该使用多个thread block启动kernel。并行的thread block又组成了一个grid。因为我有N个元素要处理,每个block有256个线程,用N除以块大小就可以得到block的个数(在N不是块大小的倍数的情况下要小心四舍五入)。
看到Idx和Dim的差异了吧。
再进一步考虑grid。CUDA提供gridDim.x,它包含grid中的block数量,以及blockIdx.x,它包含grid中当前thread block的index。图1演示了使用blockDim.x,gridDim.x, threadIdx.x在CUDA中对数组(一维)进行索引的方法。其思想是,每个thread先计算其所在block开始处的偏移量(blockIdx.x * blockDim.x),再加上自己在block内的线程索引(threadIdx.x),就得到全局索引:blockIdx.x * blockDim.x + threadIdx.x
#include <iostream>
#include <math.h>
// function to add the elements of two arrays
// CUDA kernel: element-wise sum y[i] <- x[i] + y[i] on the GPU.
// __global__ functions are known as kernels; code running on the GPU is
// device code, while code running on the CPU is host code.
// Grid-stride loop: each thread begins at its flat global index
// (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total
// number of threads in the grid (blockDim.x * gridDim.x), so any launch
// configuration — even a single block — covers all n elements.
__global__
void add(int n, float* x, float* y)
{
    int start = blockIdx.x * blockDim.x + threadIdx.x;
    int step  = blockDim.x * gridDim.x;
    for (int i = start; i < n; i += step)
        y[i] += x[i];
}
// Host driver: allocate Unified Memory, launch the add kernel across
// enough 256-thread blocks to cover all N elements, synchronize, then
// verify the result on the host.
// Returns 0 on success, 1 on any CUDA runtime failure.
int main(void)
{
    int N = 1 << 20; // 1M elements

    // Allocate Unified Memory -- accessible from CPU or GPU.
    float* x;
    float* y;
    cudaError_t err = cudaMallocManaged(&x, N * sizeof(float));
    if (err != cudaSuccess) {
        std::cerr << "cudaMallocManaged(x) failed: "
                  << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    err = cudaMallocManaged(&y, N * sizeof(float));
    if (err != cudaSuccess) {
        std::cerr << "cudaMallocManaged(y) failed: "
                  << cudaGetErrorString(err) << std::endl;
        cudaFree(x);
        return 1;
    }

    // Initialize x and y arrays on the host.
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    // Run kernel on 1M elements on the GPU. Ceil-divide so the grid
    // still covers N when it is not a multiple of the block size.
    int blockSize = 256;
    int numBlocks = (N + blockSize - 1) / blockSize;
    add<<<numBlocks, blockSize>>>(N, x, y);

    // A kernel launch does not return an error directly; bad launch
    // configurations surface via cudaGetLastError().
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "kernel launch failed: "
                  << cudaGetErrorString(err) << std::endl;
        cudaFree(x);
        cudaFree(y);
        return 1;
    }

    // Wait for the GPU to finish before accessing results on the host;
    // this also surfaces asynchronous execution errors.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cerr << "cudaDeviceSynchronize failed: "
                  << cudaGetErrorString(err) << std::endl;
        cudaFree(x);
        cudaFree(y);
        return 1;
    }

    // Check for errors (all values should be 3.0f).
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = fmax(maxError, fabs(y[i] - 3.0f));
    std::cout << "Max error: " << maxError << std::endl;

    // Free Unified Memory.
    cudaFree(x);
    cudaFree(y);
    return 0;
}