cuda学习(1)

4 篇文章 0 订阅

cuda并行计算流程

（流程示意图，原图由 Raphaël 2.1.2 绘制）CPU memory → GPU memory：需要算点东西，把数据发送给 GPU；GPU 执行代码后，把算好的结果返回给 CPU memory。

CPU上运行代码的例子:

#include<iostream>
#include<math.h>
//定义函数add对两个浮点数据求和
// Element-wise sum over the first n entries: y[i] <- x[i] + y[i].
// x is read-only; y is both an input and the output buffer.
void add(int n,float *x,float *y)
{
    for (int idx = 0; idx < n; ++idx)
        y[idx] += x[idx];
}
int main(void)
{
    const int N = 1 << 20;  // 1M elements
    // Allocate the two operand arrays on the heap.
    float *x = new float[N];
    float *y = new float[N];
    // Initialize: x is all 1s, y is all 2s, so x + y should be all 3s.
    for (int i = 0; i < N; ++i) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }
    // Sum in place: y becomes x + y.
    add(N, x, y);
    // Verify: largest absolute deviation from the expected value 3.0f.
    float maxError = 0.0f;
    for (int i = 0; i < N; ++i) {
        maxError = fmax(maxError, fabs(y[i] - 3.0f));
    }
    std::cout<<"Max Error: "<<maxError<<std::endl;
    // Release the heap buffers.
    delete[] x;
    delete[] y;
}

CUDA代码

#include <iostream>
#include <math.h>
// Kernel function to add the elements of two arrays: y[i] <- x[i] + y[i].
// NOTE(review): the loop is NOT distributed across threads — every launched
// thread walks all n elements. This is only correct for the <<<1, 1>>>
// launch used below (a deliberately naive single-thread baseline); with more
// threads each element would be written redundantly by every thread.
__global__
void add(int n, float *x, float *y)
{
  for (int i = 0; i < n; i++)
    y[i] = x[i] + y[i];
}

int main(void)
{
  int N = 1<<20;  // 1M elements
  float *x, *y;

  // Allocate Unified Memory – accessible from CPU or GPU.
  // Every CUDA API call returns cudaError_t; unchecked errors are sticky and
  // make later calls fail mysteriously, so check and bail out immediately.
  cudaError_t err = cudaMallocManaged(&x, N*sizeof(float));
  if (err != cudaSuccess) {
    std::cerr << "cudaMallocManaged(x): " << cudaGetErrorString(err) << std::endl;
    return 1;
  }
  err = cudaMallocManaged(&y, N*sizeof(float));
  if (err != cudaSuccess) {
    std::cerr << "cudaMallocManaged(y): " << cudaGetErrorString(err) << std::endl;
    cudaFree(x);
    return 1;
  }

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU.
  // <<<1, 1>>> launches a single thread on purpose: naive baseline for the
  // article; the kernel's internal loop then covers all N elements serially.
  add<<<1, 1>>>(N, x, y);

  // Kernel launches return no status directly: a bad launch configuration is
  // reported by cudaGetLastError(), in-kernel faults by the next sync below.
  err = cudaGetLastError();
  if (err != cudaSuccess) {
    std::cerr << "kernel launch: " << cudaGetErrorString(err) << std::endl;
    cudaFree(x);
    cudaFree(y);
    return 1;
  }

  // Wait for GPU to finish before accessing on host
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    std::cerr << "cudaDeviceSynchronize: " << cudaGetErrorString(err) << std::endl;
    cudaFree(x);
    cudaFree(y);
    return 1;
  }

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  cudaFree(x);
  cudaFree(y);

  return 0;
}

探测执行过程nvprof

==11602== NVPROF is profiling process 11602, command: a
Max error: 0
==11602== Profiling application: a
==11602== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  157.35ms         1  157.35ms  157.35ms  157.35ms  add(int, float*, float*)
      API calls:   71.00%  392.91ms         2  196.45ms  78.803us  392.83ms  cudaMallocManaged
                   28.43%  157.36ms         1  157.36ms  157.36ms  157.36ms  cudaDeviceSynchronize
                    0.24%  1.3030ms       188  6.9300us     285ns  341.40us  cuDeviceGetAttribute
                    0.20%  1.1251ms         2  562.56us  553.30us  571.82us  cudaFree
                    0.08%  443.21us         2  221.61us  215.69us  227.53us  cuDeviceTotalMem
                    0.02%  135.73us         1  135.73us  135.73us  135.73us  cudaLaunch
                    0.02%  113.82us         2  56.907us  51.766us  62.049us  cuDeviceGetName
                    0.00%  14.493us         3  4.8310us     342ns  13.254us  cudaSetupArgument
                    0.00%  2.4870us         3     829ns     299ns  1.7410us  cuDeviceGetCount
                    0.00%  2.3770us         1  2.3770us  2.3770us  2.3770us  cudaConfigureCall
                    0.00%  1.9400us         4     485ns     278ns     993ns  cuDeviceGet

==11602== Unified Memory profiling result:
Device "GeForce GTX 1080 (0)"
   Count  Avg Size  Min Size  Max Size  Total Size  Total Time  Name
      48  170.67KB  4.0000KB  0.9961MB  8.000000MB  729.6640us  Host To Device
      24  170.67KB  4.0000KB  0.9961MB  4.000000MB  334.6240us  Device To Host
      12         -         -         -           -  3.444032ms  Gpu page fault groups
Total CPU Page faults: 36

线程

常见变量

  • gridDim.x:grid中包含的线程块的数量(上图包含4096个线程块)
  • blockIdx.x:当前线程块在grid中的索引(上图线程块的索引从0到4095)
  • blockDim.x:线程块中线程数量(上图线程块中的线程数为256)
  • threadIdx.x:线程块中的线程的索引(上图中橙色的线程索引为3)

线程索引:index = blockIdx.x * blockDim.x + threadIdx.x

  • 线程块大小
    int blockSize = 256;
  • N表示需要处理的元素个数
    int numBlocks = (N + blockSize - 1) / blockSize;
  • 每个线程块包含blockSize个线程,覆盖N个元素至少需要numBlocks个线程块(向上取整)
    add<<<numBlocks, blockSize>>>(N, x, y);

上面的cuda代码中只启动了1个线程块,且块内只有1个线程;下面改为每个线程块包含256个线程,并启动足够多的线程块覆盖全部元素。

代码:

#include <iostream>
#include <math.h>
// Grid-stride kernel: y[i] <- x[i] + y[i] for i in [0, n).
// Each thread starts at its flat global index and advances by the total
// number of launched threads, so any grid/block configuration (even a
// single thread) covers all n elements exactly once.
__global__
void add(int n, float *x, float *y)
{
    int tid  = blockIdx.x * blockDim.x + threadIdx.x;  // global thread index
    int step = blockDim.x * gridDim.x;                 // total threads launched
    for (int i = tid; i < n; i += step)
        y[i] = x[i] + y[i];
}

int main(void)
{
  int N = 1<<20;  // 1M elements
  float *x, *y;

  // Allocate Unified Memory – accessible from CPU or GPU.
  // Every CUDA API call returns cudaError_t; unchecked errors are sticky and
  // make later calls fail mysteriously, so check and bail out immediately.
  cudaError_t err = cudaMallocManaged(&x, N*sizeof(float));
  if (err != cudaSuccess) {
    std::cerr << "cudaMallocManaged(x): " << cudaGetErrorString(err) << std::endl;
    return 1;
  }
  err = cudaMallocManaged(&y, N*sizeof(float));
  if (err != cudaSuccess) {
    std::cerr << "cudaMallocManaged(y): " << cudaGetErrorString(err) << std::endl;
    cudaFree(x);
    return 1;
  }

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU.
  // Ceiling division: enough 256-thread blocks to cover all N elements.
  int blockSize = 256;
  int numBlocks = (N-1+blockSize)/blockSize;
  add<<<numBlocks, blockSize>>>(N, x, y);

  // Kernel launches return no status directly: a bad launch configuration is
  // reported by cudaGetLastError(), in-kernel faults by the next sync below.
  err = cudaGetLastError();
  if (err != cudaSuccess) {
    std::cerr << "kernel launch: " << cudaGetErrorString(err) << std::endl;
    cudaFree(x);
    cudaFree(y);
    return 1;
  }

  // Wait for GPU to finish before accessing on host
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    std::cerr << "cudaDeviceSynchronize: " << cudaGetErrorString(err) << std::endl;
    cudaFree(x);
    cudaFree(y);
    return 1;
  }

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  cudaFree(x);
  cudaFree(y);

  return 0;
}
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  2.7540ms         1  2.7540ms  2.7540ms  2.7540ms  add(int, float*, float*)

CUDA工具

  • nvprof:探测cu程序运行统计信息
  • nvcc:cuda程序编译器

参考

An Even Easier Introduction to CUDA

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值