cuda学习(1)

4 篇文章 0 订阅

cuda并行计算流程

（流程示意图，原图由 Raphaël 2.1.2 绘制）CPU memory → GPU memory：需要算点东西，把数据发送给 GPU；GPU 执行代码后，把算好的结果返回给 CPU memory。

CPU上运行代码的例子:

#include<iostream>
#include<math.h>
//定义函数add对两个浮点数据求和
// Element-wise sum over the first n entries: y[i] <- x[i] + y[i].
// x is read-only; y is both an input and the output buffer.
void add(int n,float *x,float *y)
{
    for (int idx = 0; idx < n; ++idx)
        y[idx] += x[idx];
}
int main(void)
{
    const int N = 1 << 20;  // 1M elements
    // Allocate the two operand arrays on the heap.
    float *x = new float[N];
    float *y = new float[N];
    // Initialize: x is all 1s, y is all 2s, so x + y should be all 3s.
    for (int i = 0; i < N; ++i) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }
    // Sum in place: y becomes x + y.
    add(N, x, y);
    // Verify: largest absolute deviation from the expected value 3.0f.
    float maxError = 0.0f;
    for (int i = 0; i < N; ++i) {
        maxError = fmax(maxError, fabs(y[i] - 3.0f));
    }
    std::cout<<"Max Error: "<<maxError<<std::endl;
    // Release the heap buffers.
    delete[] x;
    delete[] y;
}

CUDA代码

#include <iostream>
#include <math.h>
// Kernel function to add the elements of two arrays: y[i] <- x[i] + y[i].
// NOTE(review): the loop is NOT distributed across threads — every launched
// thread walks all n elements. This is only correct for the <<<1, 1>>>
// launch used below (a deliberately naive single-thread baseline); with more
// threads each element would be written redundantly by every thread.
__global__
void add(int n, float *x, float *y)
{
  for (int i = 0; i < n; i++)
    y[i] = x[i] + y[i];
}

int main(void)
{
  int N = 1<<20;  // 1M elements
  float *x, *y;

  // Allocate Unified Memory – accessible from CPU or GPU.
  // Every CUDA API call returns cudaError_t; unchecked errors are sticky and
  // make later calls fail mysteriously, so check and bail out immediately.
  cudaError_t err = cudaMallocManaged(&x, N*sizeof(float));
  if (err != cudaSuccess) {
    std::cerr << "cudaMallocManaged(x): " << cudaGetErrorString(err) << std::endl;
    return 1;
  }
  err = cudaMallocManaged(&y, N*sizeof(float));
  if (err != cudaSuccess) {
    std::cerr << "cudaMallocManaged(y): " << cudaGetErrorString(err) << std::endl;
    cudaFree(x);
    return 1;
  }

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU.
  // <<<1, 1>>> launches a single thread on purpose: naive baseline for the
  // article; the kernel's internal loop then covers all N elements serially.
  add<<<1, 1>>>(N, x, y);

  // Kernel launches return no status directly: a bad launch configuration is
  // reported by cudaGetLastError(), in-kernel faults by the next sync below.
  err = cudaGetLastError();
  if (err != cudaSuccess) {
    std::cerr << "kernel launch: " << cudaGetErrorString(err) << std::endl;
    cudaFree(x);
    cudaFree(y);
    return 1;
  }

  // Wait for GPU to finish before accessing on host
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    std::cerr << "cudaDeviceSynchronize: " << cudaGetErrorString(err) << std::endl;
    cudaFree(x);
    cudaFree(y);
    return 1;
  }

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  cudaFree(x);
  cudaFree(y);

  return 0;
}

探测执行过程nvprof

==11602== NVPROF is profiling process 11602, command: a
Max error: 0
==11602== Profiling application: a
==11602== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  157.35ms         1  157.35ms  157.35ms  157.35ms  add(int, float*, float*)
      API calls:   71.00%  392.91ms         2  196.45ms  78.803us  392.83ms  cudaMallocManaged
                   28.43%  157.36ms         1  157.36ms  157.36ms  157.36ms  cudaDeviceSynchronize
                    0.24%  1.3030ms       188  6.9300us     285ns  341.40us  cuDeviceGetAttribute
                    0.20%  1.1251ms         2  562.56us  553.30us  571.82us  cudaFree
                    0.08%  443.21us         2  221.61us  215.69us  227.53us  cuDeviceTotalMem
                    0.02%  135.73us         1  135.73us  135.73us  135.73us  cudaLaunch
                    0.02%  113.82us         2  56.907us  51.766us  62.049us  cuDeviceGetName
                    0.00%  14.493us         3  4.8310us     342ns  13.254us  cudaSetupArgument
                    0.00%  2.4870us         3     829ns     299ns  1.7410us  cuDeviceGetCount
                    0.00%  2.3770us         1  2.3770us  2.3770us  2.3770us  cudaConfigureCall
                    0.00%  1.9400us         4     485ns     278ns     993ns  cuDeviceGet

==11602== Unified Memory profiling result:
Device "GeForce GTX 1080 (0)"
   Count  Avg Size  Min Size  Max Size  Total Size  Total Time  Name
      48  170.67KB  4.0000KB  0.9961MB  8.000000MB  729.6640us  Host To Device
      24  170.67KB  4.0000KB  0.9961MB  4.000000MB  334.6240us  Device To Host
      12         -         -         -           -  3.444032ms  Gpu page fault groups
Total CPU Page faults: 36

线程

常见变量

  • gridDim.x:grid中包含的线程块的数量(上图包含4096个线程块)
  • blockIdx.x:当前线程块在grid中的索引(上图线程块的索引从0到4095)
  • blockDim.x:线程块中线程数量(上图线程块中的线程数为256)
  • threadIdx.x:线程块中的线程的索引(上图中橙色的线程索引为3)

线程索引:index = blockIdx.x * blockDim.x + threadIdx.x

  • 线程块大小
    int blockSize = 256;
  • N表示需要处理的元素个数
    int numBlocks = (N + blockSize - 1) / blockSize;
  • 每个线程块包含blockSize个线程,覆盖N个元素至少需要numBlocks个线程块(向上取整)
    add<<<numBlocks, blockSize>>>(N, x, y);

上面的cuda代码中只启动了1个线程块,且块内只有1个线程;下面改为每个线程块包含256个线程,并启动足够多的线程块覆盖全部元素。

代码:

#include <iostream>
#include <math.h>
// Grid-stride kernel: y[i] <- x[i] + y[i] for i in [0, n).
// Each thread starts at its flat global index and advances by the total
// number of launched threads, so any grid/block configuration (even a
// single thread) covers all n elements exactly once.
__global__
void add(int n, float *x, float *y)
{
    int tid  = blockIdx.x * blockDim.x + threadIdx.x;  // global thread index
    int step = blockDim.x * gridDim.x;                 // total threads launched
    for (int i = tid; i < n; i += step)
        y[i] = x[i] + y[i];
}

int main(void)
{
  int N = 1<<20;  // 1M elements
  float *x, *y;

  // Allocate Unified Memory – accessible from CPU or GPU.
  // Every CUDA API call returns cudaError_t; unchecked errors are sticky and
  // make later calls fail mysteriously, so check and bail out immediately.
  cudaError_t err = cudaMallocManaged(&x, N*sizeof(float));
  if (err != cudaSuccess) {
    std::cerr << "cudaMallocManaged(x): " << cudaGetErrorString(err) << std::endl;
    return 1;
  }
  err = cudaMallocManaged(&y, N*sizeof(float));
  if (err != cudaSuccess) {
    std::cerr << "cudaMallocManaged(y): " << cudaGetErrorString(err) << std::endl;
    cudaFree(x);
    return 1;
  }

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU.
  // Ceiling division: enough 256-thread blocks to cover all N elements.
  int blockSize = 256;
  int numBlocks = (N-1+blockSize)/blockSize;
  add<<<numBlocks, blockSize>>>(N, x, y);

  // Kernel launches return no status directly: a bad launch configuration is
  // reported by cudaGetLastError(), in-kernel faults by the next sync below.
  err = cudaGetLastError();
  if (err != cudaSuccess) {
    std::cerr << "kernel launch: " << cudaGetErrorString(err) << std::endl;
    cudaFree(x);
    cudaFree(y);
    return 1;
  }

  // Wait for GPU to finish before accessing on host
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    std::cerr << "cudaDeviceSynchronize: " << cudaGetErrorString(err) << std::endl;
    cudaFree(x);
    cudaFree(y);
    return 1;
  }

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  cudaFree(x);
  cudaFree(y);

  return 0;
}
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  2.7540ms         1  2.7540ms  2.7540ms  2.7540ms  add(int, float*, float*)

CUDA工具

  • nvprof:探测cu程序运行统计信息
  • nvcc:cuda程序编译器

参考

An Even Easier Introduction to CUDA

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值