cuda并行计算流程
CPU上运行代码的例子:
#include<iostream>
#include<math.h>
//定义函数add对两个浮点数据求和
// Element-wise in-place sum: y[k] += x[k] for the first n entries.
// x and y must each point to at least n valid floats.
void add(int n, float *x, float *y)
{
    int k = 0;
    while (k < n) {
        y[k] += x[k];
        ++k;
    }
}
// Driver: fills two 1M-element vectors (x = 1.0f, y = 2.0f), sums them
// on the CPU via add(), then reports the largest deviation from the
// expected per-element result 3.0f.
int main(void)
{
    const int count = 1 << 20;
    // Allocate heap storage for both operands.
    float *x = new float[count];
    float *y = new float[count];
    // Initialize the inputs.
    for (int idx = 0; idx < count; ++idx) {
        x[idx] = 1.0f;
        y[idx] = 2.0f;
    }
    add(count, x, y);
    // Every slot should now hold 3.0f; track the worst mismatch.
    float maxError = 0.0f;
    for (int idx = 0; idx < count; ++idx) {
        maxError = fmax(maxError, fabs(y[idx] - 3.0f));
    }
    std::cout << "Max Error: " << maxError << std::endl;
    // Release the heap buffers.
    delete[] x;
    delete[] y;
}
CUDA代码
#include <iostream>
#include <math.h>
// Kernel function to add the elements of two arrays
// Kernel: adds the elements of x into y, one after another.
// Note: there is no thread indexing here — every launched thread would
// walk the whole array, so this version is only meant to be launched
// as <<<1, 1>>> (a single thread doing all the work serially).
__global__
void add(int n, float *x, float *y)
{
    int k = 0;
    while (k < n) {
        y[k] = x[k] + y[k];
        ++k;
    }
}
// Host driver: allocates unified memory, launches the serial add kernel
// on a single thread (<<<1, 1>>>), then verifies on the host that every
// element equals 3.0f. Returns 0 on success, 1 on any CUDA failure.
int main(void)
{
    int N = 1<<20;
    float *x, *y;
    // Allocate Unified Memory – accessible from CPU or GPU.
    // CUDA API calls return cudaError_t; ignoring it hides failures,
    // so every call below is checked explicitly.
    cudaError_t err = cudaMallocManaged(&x, N*sizeof(float));
    if (err != cudaSuccess) {
        std::cerr << "cudaMallocManaged(x) failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    err = cudaMallocManaged(&y, N*sizeof(float));
    if (err != cudaSuccess) {
        std::cerr << "cudaMallocManaged(y) failed: " << cudaGetErrorString(err) << std::endl;
        cudaFree(x);
        return 1;
    }
    // Initialize x and y arrays on the host.
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }
    // Run kernel on 1M elements on the GPU (one block, one thread).
    add<<<1, 1>>>(N, x, y);
    // Kernel launches do not return errors directly; launch-configuration
    // problems are only visible through cudaGetLastError().
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "kernel launch failed: " << cudaGetErrorString(err) << std::endl;
        cudaFree(x);
        cudaFree(y);
        return 1;
    }
    // Wait for GPU to finish before accessing y on the host; this also
    // surfaces asynchronous execution errors from inside the kernel.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cerr << "kernel execution failed: " << cudaGetErrorString(err) << std::endl;
        cudaFree(x);
        cudaFree(y);
        return 1;
    }
    // Check for errors (all values should be 3.0f).
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = fmax(maxError, fabs(y[i]-3.0f));
    std::cout << "Max error: " << maxError << std::endl;
    // Free memory.
    cudaFree(x);
    cudaFree(y);
    return 0;
}
使用 nvprof 探测程序的执行过程(性能剖析输出如下):
==11602== NVPROF is profiling process 11602, command: a
Max error: 0
==11602== Profiling application: a
==11602== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 100.00% 157.35ms 1 157.35ms 157.35ms 157.35ms add(int, float*, float*)
API calls: 71.00% 392.91ms 2 196.45ms 78.803us 392.83ms cudaMallocManaged
28.43% 157.36ms 1 157.36ms 157.36ms 157.36ms cudaDeviceSynchronize
0.24% 1.3030ms 188 6.9300us 285ns 341.40us cuDeviceGetAttribute
0.20% 1.1251ms 2 562.56us 553.30us 571.82us cudaFree
0.08% 443.21us 2 221.61us 215.69us 227.53us cuDeviceTotalMem
0.02% 135.73us 1 135.73us 135.73us 135.73us cudaLaunch
0.02% 113.82us 2 56.907us 51.766us 62.049us cuDeviceGetName
0.00% 14.493us 3 4.8310us 342ns 13.254us cudaSetupArgument
0.00% 2.4870us 3 829ns 299ns 1.7410us cuDeviceGetCount
0.00% 2.3770us 1 2.3770us 2.3770us 2.3770us cudaConfigureCall
0.00% 1.9400us 4 485ns 278ns 993ns cuDeviceGet
==11602== Unified Memory profiling result:
Device "GeForce GTX 1080 (0)"
Count Avg Size Min Size Max Size Total Size Total Time Name
48 170.67KB 4.0000KB 0.9961MB 8.000000MB 729.6640us Host To Device
24 170.67KB 4.0000KB 0.9961MB 4.000000MB 334.6240us Device To Host
12 - - - - 3.444032ms Gpu page fault groups
Total CPU Page faults: 36
常见变量
gridDim.x:grid 中包含的线程块的数量(上图的 grid 包含 4096 个线程块)
blockIdx.x:当前线程块在 grid 中的索引(上图线程块索引范围为 0~4095)
blockDim.x:线程块中包含的线程数量(上图每个线程块包含 256 个线程)
threadIdx.x:当前线程在所在线程块中的索引(上图中橙色线程在块内的索引为 3)
线程索引:index = blockIdx.x * blockDim.x + threadIdx.x
- 线程块大小:int blockSize = 256;
- N 表示需要处理的元素个数,所需线程块数向上取整:int numBlocks = (N + blockSize - 1) / blockSize;
- 每个线程块包含 blockSize 个线程,至少需要 numBlocks 个线程块才能覆盖全部 N 个元素
add<<<numBlocks, blockSize>>>(N, x, y);
上面的 CUDA 代码中线程块大小为 1、线程块数量也为 1(即 <<<1, 1>>>);下面将线程块大小设置为 256,并按元素个数计算所需的线程块数量。
代码:
#include <iostream>
#include <math.h>
// Kernel function to add the elements of two arrays
// Grid-stride kernel: each thread begins at its own global index and
// advances by the total number of launched threads, so any grid/block
// configuration covers all n elements exactly once.
__global__
void add(int n, float *x, float *y)
{
    int start = blockIdx.x * blockDim.x + threadIdx.x;
    int step = blockDim.x * gridDim.x;
    for (int k = start; k < n; k += step)
        y[k] = x[k] + y[k];
}
// Host driver: allocates unified memory, launches the grid-stride add
// kernel with 256 threads per block (enough blocks to cover N via
// ceiling division), then verifies on the host that every element
// equals 3.0f. Returns 0 on success, 1 on any CUDA failure.
int main(void)
{
    int N = 1<<20;
    float *x, *y;
    // Allocate Unified Memory – accessible from CPU or GPU.
    // Every CUDA API call is checked; unchecked calls fail silently.
    cudaError_t err = cudaMallocManaged(&x, N*sizeof(float));
    if (err != cudaSuccess) {
        std::cerr << "cudaMallocManaged(x) failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    err = cudaMallocManaged(&y, N*sizeof(float));
    if (err != cudaSuccess) {
        std::cerr << "cudaMallocManaged(y) failed: " << cudaGetErrorString(err) << std::endl;
        cudaFree(x);
        return 1;
    }
    // Initialize x and y arrays on the host.
    for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }
    // Run kernel on 1M elements on the GPU.
    int blockSize = 256;
    // Ceiling division: one extra block when N is not a multiple of blockSize.
    int numBlocks = (N + blockSize - 1) / blockSize;
    add<<<numBlocks, blockSize>>>(N, x, y);
    // Launch-configuration errors only surface via cudaGetLastError().
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "kernel launch failed: " << cudaGetErrorString(err) << std::endl;
        cudaFree(x);
        cudaFree(y);
        return 1;
    }
    // Wait for GPU to finish before accessing y on the host; this also
    // surfaces asynchronous execution errors from inside the kernel.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cerr << "kernel execution failed: " << cudaGetErrorString(err) << std::endl;
        cudaFree(x);
        cudaFree(y);
        return 1;
    }
    // Check for errors (all values should be 3.0f).
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
        maxError = fmax(maxError, fabs(y[i]-3.0f));
    std::cout << "Max error: " << maxError << std::endl;
    // Free memory.
    cudaFree(x);
    cudaFree(y);
    return 0;
}
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 100.00% 2.7540ms 1 2.7540ms 2.7540ms 2.7540ms add(int, float*, float*)
CUDA工具
- nvprof:性能剖析工具,用于探测 CUDA 程序运行时的统计信息(内核耗时、API 调用、内存传输等)
- nvcc:cuda程序编译器