CUDA高性能并行计算
运行第一个 CUDA 程序
学习几个概念
核函数 :核函数是一种特殊的函数,加载核函数与常规函数调用很像,格式:
kernelName<<<Dg, Db>>>(args)
Dg:网格中的线程块数 ,Db:线程块中的线程数目,args:传入参数
函数标识符
__global__ 是标志核函数的标识符(核函数从主机端调用,在设备端执行)
__host__ 函数从主机端调用在主机执行
__device__ 函数从设备端调用并在设备端执行
CUDA 运行时 API 可以将输入数据传输到设备端和将结果传回到主机端
cudaMalloc() 函数可以分配设备端内存
cudaMemcpy() 将数据传入或者传出设备
cudaFree() 释放掉设备中不再使用的内存
__syncthreads() 可以在一个线程块中进行线程同步
cudaDeviceSynchronize() 函数会阻塞主机端,直到设备上此前提交的所有任务(包括核函数)全部完成,从而有效地同步一个网格中的所有线程
atomicAdd() 可以防止多线程并发访问一个变量时造成冲突
size_t: 代表内存大小的专用变量类型
cudaError_t: 错误处理的专用变量类型
将源程序.cpp转为核函数.cu
dist_v1 中 main.cpp
#include <math.h> //Include standard math library containing sqrt.
#define N 64 // Specify a constant value for array length.
// A scaling function to convert integers 0,1,...,n-1
// to evenly spaced floats ranging from 0 to 1.
//
// i: index of the sample point.
// n: total number of sample points.
//
// Returns i / (n - 1) as a float, so index 0 maps to 0 and index n-1
// maps to 1. When n == 1 there is only a single point and the spacing
// (n - 1) is zero; that point is placed at 0.0f instead of dividing by
// zero (which would yield NaN or infinity).
float scale(int i, int n)
{
    if (n == 1)
    {
        return 0.0f;
    }
    return ((float)i) / (n - 1);
}
// Compute the distance between 2 points on a line.
//
// x1, x2: coordinates of the two points.
//
// Returns |x2 - x1|. The absolute value is taken directly rather than
// as sqrt of the squared difference: squaring in float underflows to 0
// for very small differences and overflows to infinity for very large
// ones, and the sqrt adds needless rounding and cost.
float distance(float x1, float x2)
{
    return fabsf(x2 - x1);
}
// Entry point: fills an N-element array with the distance of each
// scaled sample coordinate from a fixed reference value.
int main()
{
    // Reference point from which every distance is measured.
    const float ref = 0.5f;

    // Result buffer, zero-initialized; every entry is overwritten below.
    float out[N] = { 0.0f };

    // Walk the indices 0..N-1: turn each index into a coordinate with
    // scale(), then store that coordinate's distance from ref.
    int idx = 0;
    while (idx < N)
    {
        out[idx] = distance(scale(idx, N), ref);
        ++idx;
    }

    return 0;
}
修改为Kernel.cu
将 .cpp 中的 for 循环改为对 __global__ 核函数(及其内部调用的 __device__ 函数)的并行调用:
distanceKernel<<<N/TPB,TPB>>>(d_out,ref,N)
#include <stdio.h>
#define N 64
#define TPB 32
// Device-side scaling helper: converts an integer index i in
// 0,1,...,n-1 to a float coordinate i / (n - 1).
//
// i: index of the sample point.
// n: total number of sample points.
//
// When n == 1 the spacing (n - 1) is zero; the single point is placed
// at 0.0f instead of dividing by zero (which would yield NaN or
// infinity).
__device__ float scale(int i,int n)
{
    if (n == 1)
    {
        return 0.0f;
    }
    return ((float)i) / (n - 1);
}
// Device-side helper: distance between two points on a line.
//
// x1, x2: coordinates of the two points.
//
// Returns |x2 - x1|. Taking the absolute value directly avoids the
// float underflow/overflow of squaring the difference and the extra
// cost of a square root.
__device__ float distance(float x1,float x2)
{
    return fabsf(x2 - x1);
}
// Kernel: each thread handles one array element. It converts its global
// index to a coordinate with scale(), stores that coordinate's distance
// from ref in d_out, and prints the result.
//
// d_out: device pointer to the output array; must hold at least len floats.
// ref:   reference value from which distances are measured.
// len:   number of elements to compute.
//
// Indexing uses only the x dimension of the grid and block
// (blockIdx.x * blockDim.x + threadIdx.x). Threads whose index is
// >= len do nothing, so the launch may contain more threads than
// elements without writing past the end of d_out.
__global__ void distanceKernel(float *d_out,float ref, int len)
{
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    // Guard the tail: surplus threads must not touch d_out.
    if (i >= len)
    {
        return;
    }
    const float x = scale(i,len);
    d_out[i] = distance(x,ref);
    // Device-side printf: for inspecting results only.
    printf("i = %2d: dist from %f to %f is %f.\n",i,ref,x,d_out[i]);
}
// Entry point: allocates a device array of N floats, launches
// distanceKernel over it, waits for the kernel to finish, and frees the
// array. Every CUDA runtime call is checked; on failure a message is
// written to stderr and the process exits with a non-zero status.
int main()
{
    const float ref = 0.5f;

    // Declare a pointer for an array of floats
    float *d_out = 0;

    // Allocate device memory to store the output array
    cudaError_t err = cudaMalloc(&d_out, N*sizeof(float));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Launch kernel to compute and store distance values.
    // N/TPB is an integer division, so the grid covers all N elements
    // only when N is a multiple of TPB (true for N = 64, TPB = 32).
    distanceKernel<<<N/TPB,TPB>>>(d_out,ref,N);

    // A launch does not return an error itself: query the launch status,
    // then wait for the kernel so that execution errors are reported and
    // the device-side printf output is flushed before the program exits.
    err = cudaGetLastError();
    if (err == cudaSuccess)
    {
        err = cudaDeviceSynchronize();
    }
    if (err != cudaSuccess)
    {
        fprintf(stderr, "distanceKernel failed: %s\n", cudaGetErrorString(err));
    }

    // Free the memory (also on the failure path, to avoid leaking it)
    const cudaError_t freeErr = cudaFree(d_out);
    if (freeErr != cudaSuccess)
    {
        fprintf(stderr, "cudaFree failed: %s\n", cudaGetErrorString(freeErr));
        if (err == cudaSuccess)
        {
            err = freeErr;
        }
    }

    return (err == cudaSuccess) ? 0 : 1;
}