CUDA入门学习笔记（一）

最新推荐文章于 2022-12-05 13:59:46 发布

流浪机器人

最新推荐文章于 2022-12-05 13:59:46 发布

阅读量230

点赞数

分类专栏： CUDA学习

本文链接：https://blog.csdn.net/qq_26623879/article/details/103313830

版权

CUDA学习专栏收录该内容

1 篇文章 0 订阅

订阅专栏

CUDA高性能并行计算

在这里插入图片描述
Code

运行第一个 CUDA 程序

学习几个概念

核函数：核函数是一种特殊的函数，启动（launch）核函数与常规函数调用很像，格式：
kernelName<<<Dg, Db>>>(args)
Dg:网格中的线程块数，Db:线程块中的线程数目，args:传入参数

函数标识符

  __global__  是标志着核函数的标识符（核函数从主机端调用并在设备端执行）
  __host__ 函数从主机端调用在主机执行
  __device__ 函数从设备端调用并在设备端执行

CUDA 运行时 API 可以将输入数据传输到设备端和将结果传回到主机端


cudaMalloc()    函数可以分配设备端内存
cudaMemcpy()    将数据传入或者传出设备
cudaFree()      释放掉设备中不再使用的内存
__syncthreads() 可以在一个线程块中进行线程同步
cudaDeviceSynchronize() 在主机端调用，阻塞主机直到设备上已提交的工作全部完成，从而等到一个网格中的所有线程执行结束
atomicAdd()     可以防止多线程并发访问一个变量时造成冲突
size_t：        代表内存大小的专用变量类型
cudaError_t     错误处理的专用变量类型

将源程序.cpp转为核函数.cu

dist_v1 中 main.cpp

#include <math.h> //Include standard math library containing sqrt.
#define N 64 // Specify a constant value for array length.

// Map an integer index i in 0,1,...,n-1 to an evenly spaced float in [0, 1].
//
// i: index of the sample point.
// n: total number of sample points.
// Returns i / (n - 1). A grid with fewer than two points has no spacing to
// divide by (n == 1 would evaluate 0/0 and yield NaN), so 0.0f is returned
// for n <= 1.
float scale(int i, int n)
{
  if (n <= 1)
  {
    return 0.0f;
  }
  return ((float)i) / (n - 1);
}

// Compute the distance between 2 points on a line.
//
// x1, x2: coordinates of the two points.
// Returns |x2 - x1|. The absolute value is taken directly instead of
// sqrt((x2 - x1)^2): squaring overflows to infinity for very large
// differences and underflows to zero for very small ones, and the
// square/sqrt round trip is needless work in one dimension.
float distance(float x1, float x2)
{
  return fabsf(x2 - x1);
}

int main()
{
  // Reference point: every distance is measured from this coordinate.
  const float ref = 0.5f;

  // Result buffer with N entries, zero-initialized; each entry is
  // overwritten by the loop below.
  float out[N] = { 0.0f };

  // For every index: map it to a coordinate via scale(), then store that
  // coordinate's distance from the reference point.
  for (int idx = 0; idx != N; ++idx)
  {
    out[idx] = distance(scale(idx, N), ref);
  }

  return 0;
}

修改为Kernel.cu
将.cpp中的 for 循环修改为对 __global__ 核函数的并行调用，核函数内部再调用 __device__ 函数
distanceKernel<<<N/TPB,TPB>>>(d_out,ref,N)

#include <stdio.h>
#define N 64
#define TPB 32

// Map an integer index i in 0,1,...,n-1 to an evenly spaced float in [0, 1].
//
// i: index of the sample point.
// n: total number of sample points.
// Returns i / (n - 1). n == 1 would evaluate 0/0 and produce NaN, so a grid
// with fewer than two points returns 0.0f instead.
//
// Compiled for both host and device: device callers are unaffected, and
// host code can reuse the same helper for a CPU reference check.
__host__ __device__ float scale(int i,int n)
{
  if (n <= 1)
  {
    return 0.0f;
  }
  return ((float)i)/(n-1) ;
}

// Distance between two points on a line.
//
// x1, x2: coordinates of the two points.
// Returns |x2 - x1|. fabsf replaces sqrt((x2-x1)*(x2-x1)): it is a single
// single-precision operation, and it cannot overflow to infinity or
// underflow to zero the way the intermediate square can.
//
// Compiled for both host and device: device callers are unaffected, and
// host code can reuse the same helper for a CPU reference check.
__host__ __device__ float distance(float x1,float x2)
{
  return fabsf(x2-x1);
}

// Kernel: each thread handles one sample point. It scales its global index
// to a coordinate, computes that coordinate's distance from `ref`, stores
// the result, and prints it.
//
// Launch layout: 1D grid of 1D blocks. The thread with global index
//   i = blockIdx.x*blockDim.x + threadIdx.x
// handles element i. Threads whose index is >= len do nothing, so the launch
// may contain more threads than elements.
//
// d_out: output array; must hold at least `len` floats.
// ref:   reference coordinate the distances are measured from.
// len:   number of sample points (and of elements written to d_out).
__global__ void distanceKernel(float *d_out,float ref, int len)
{
  const int i = blockIdx.x*blockDim.x + threadIdx.x;

  // Guard the tail: without it, a launch with more threads than elements
  // would write past the end of d_out.
  if (i >= len)
  {
    return;
  }

  const float x = scale(i,len);
  const float dist = distance(x,ref);
  d_out[i] = dist;

  // Device-side printf: useful for this demonstration, slow in real kernels.
  printf("i = %2d: dist from %f to %f is %f.\n",i,ref,x,dist);
}


// Report a failed CUDA runtime call on stderr.
// status: return code of the call; what: short label for the message.
// Returns true when the call succeeded.
static bool cudaOk(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
  {
    return true;
  }
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Allocate the output array on the device, launch distanceKernel over N
// points, wait for it to finish, and free the array.
// Returns 0 on success and 1 if any CUDA call reported an error.
int main()
{
  const float ref = 0.5f;
  // Declare a pointer for an array of floats
  float *d_out = 0;

  // Allocate device memory to store the output array
  if (!cudaOk(cudaMalloc(&d_out,N*sizeof(float)), "cudaMalloc"))
  {
    return 1;
  }

  // Launch kernel to compute and store distance values.
  // N/TPB blocks of TPB threads cover all N elements only when N is a
  // multiple of TPB (true for N = 64, TPB = 32).
  distanceKernel<<<N/TPB,TPB>>>(d_out,ref,N);

  // A kernel launch returns no status itself: configuration errors are
  // reported by cudaGetLastError().
  bool ok = cudaOk(cudaGetLastError(), "distanceKernel launch");

  // Wait for the kernel to finish. This surfaces errors raised while it ran
  // and flushes the device-side printf buffer, which is not flushed
  // automatically at program exit.
  ok = cudaOk(cudaDeviceSynchronize(), "distanceKernel execution") && ok;

  // Free the memory
  ok = cudaOk(cudaFree(d_out), "cudaFree") && ok;

  return ok ? 0 : 1;
}