http://blog.csdn.net/lwfcgz/article/details/44056607
简介
下面一个例子介绍了向量加法的OpenCL版,相当于学习C语言中的“Hello World”,本篇教程中的代码以及其余相关教程都可以通过OLCF github下载
vecAdd.c
- #include <stdio.h>
- #include <stdlib.h>
- #include <math.h>
- #include <CL/opencl.h>
- // OpenCL kernel. Each work item takes care of one element of c
- // The kernel is kept as a plain C string and handed to
- // clCreateProgramWithSource / clBuildProgram at run time; the
- // cl_khr_fp64 pragma enables double-precision support on the device.
- // The bounds check (id < n) discards the padding work items created
- // when the global size is rounded up to a multiple of the local size.
- const char *kernelSource = "\n" \
- "#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n" \
- "__kernel void vecAdd( __global double *a, \n" \
- " __global double *b, \n" \
- " __global double *c, \n" \
- " const unsigned int n) \n" \
- "{ \n" \
- " //Get our global thread ID \n" \
- " int id = get_global_id(0); \n" \
- " \n" \
- " //Make sure we do not go out of bounds \n" \
- " if (id < n) \n" \
- " c[id] = a[id] + b[id]; \n" \
- "} \n" \
- "\n" ;
- int main( int argc, char* argv[] )
- {
- // Length of vectors
- unsigned int n = 100000;
- // Host input vectors
- double *h_a;
- double *h_b;
- // Host output vector
- double *h_c;
- // Device input buffers
- cl_mem d_a;
- cl_mem d_b;
- // Device output buffer
- cl_mem d_c;
- cl_platform_id cpPlatform; // OpenCL platform
- cl_device_id device_id; // device ID
- cl_context context; // context
- cl_command_queue queue; // command queue
- cl_program program; // program
- cl_kernel kernel; // kernel
- // Size, in bytes, of each vector
- size_t bytes = n*sizeof(double);
- // Allocate memory for each vector on host
- h_a = (double*)malloc(bytes);
- h_b = (double*)malloc(bytes);
- h_c = (double*)malloc(bytes);
- // Initialize vectors on host
- int i;
- for( i = 0; i < n; i++ )
- {
- h_a[i] = sinf(i)*sinf(i);
- h_b[i] = cosf(i)*cosf(i);
- }
- size_t globalSize, localSize;
- cl_int err;
- // Number of work items in each local work group
- localSize = 64;
- // Number of total work items - localSize must be devisor
- globalSize = ceil(n/(float)localSize)*localSize;
- // Bind to platform
- err = clGetPlatformIDs(1, &cpPlatform, NULL);
- // Get ID for the device
- err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
- // Create a context
- context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
- // Create a command queue
- queue = clCreateCommandQueue(context, device_id, 0, &err);
- // Create the compute program from the source buffer
- program = clCreateProgramWithSource(context, 1,
- (const char **) & kernelSource, NULL, &err);
- // Build the program executable
- clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
- // Create the compute kernel in the program we wish to run
- kernel = clCreateKernel(program, "vecAdd", &err);
- // Create the input and output arrays in device memory for our calculation
- d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
- d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
- d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
- // Write our data set into the input array in device memory
- err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
- bytes, h_a, 0, NULL, NULL);
- err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
- bytes, h_b, 0, NULL, NULL);
- // Set the arguments to our compute kernel
- err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
- err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
- err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
- err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
- // Execute the kernel over the entire range of the data set
- err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
- 0, NULL, NULL);
- // Wait for the command queue to get serviced before reading back results
- clFinish(queue);
- // Read the results from the device
- clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
- bytes, h_c, 0, NULL, NULL );
- //Sum up vector c and print result divided by n, this should equal 1 within error
- double sum = 0;
- for(i=0; i<n; i++)
- sum += h_c[i];
- printf("final result: %f\n", sum/n);
- // release OpenCL resources
- clReleaseMemObject(d_a);
- clReleaseMemObject(d_b);
- clReleaseMemObject(d_c);
- clReleaseProgram(program);
- clReleaseKernel(kernel);
- clReleaseCommandQueue(queue);
- clReleaseContext(context);
- //release host memory
- free(h_a);
- free(h_b);
- free(h_c);
- return 0;
- }
代码分析
内核(kernel):
kernel是OpenCL代码的核心部分,整个内核必须通过C字符串的形式读入,最简单的办法是像代码一样定义一个长长的字符串,在真实的项目代码中通常都会从单独的文件中读入内核。
- // OpenCL kernel. Each work item takes care of one element of c
- const char *kernelSource = "\n" \
- "#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n" \
- "__kernel void vecAdd( __global double *a, \n" \
- " __global double *b, \n" \
- " __global double *c, \n" \
- " const unsigned int n) \n" \
- "{ \n" \
- " //Get our global thread ID \n" \
- " int id = get_global_id(0); \n" \
- " \n" \
- " //Make sure we do not go out of bounds \n" \
- " if (id < n) \n" \
- " c[id] = a[id] + b[id]; \n" \
- "} \n" \
- "\n" ;
下面是内核的函数声明:
__kernel void vecAdd( __global double *a,
                      __global double *b,
                      __global double *c,
                      const unsigned int n)
__kernel是一个定义OpenCL内核的关键字,__global则定义函数指针指向全局设备内存空间,否则可以使用一般的C语言函数声明语法。内核的返回值必须为空void
int id = get_global_id(0);
通过get_global_id函数可以获得当前工作单元(work item)的全局id,参数为0表示获取X维上的ID。
if (id < n)
    c[id] = a[id] + b[id];
工作组(work group)的个数必定是整数,由于工作组的大小不一定是需要的线程数的整数倍,因此通常使用的线程数比需要的线程数要多,在程序设计时可以将无用的线程简单丢弃掉。
内存(Memory)
- // Host input vectors
- double *h_a;
- double *h_b;
- // Host output vector
- double *h_c;
- // Device input buffers
- cl_mem d_a;
- cl_mem d_b;
- // Device output buffer
- cl_mem d_c;
主机CPU和GPU有不同的内存空间,因此需要分别定义,上面的代码中前半部分定义主机(host)CPU的内存指针,后半部分定义设备(device)内存的handle,分别用h_和d_前缀来区分。
线程映射(Thread Mapping)
- // Number of work items in each local work group
- localSize = 64;
- // Number of total work items - localSize must be a divisor
- globalSize = ceil(n/(float)localSize)*localSize;
为了将我们要解决的问题映射到底层硬件结构,必须定义局部尺寸(local size)和全局尺寸(global size)。局部尺寸定义了每个工作组中的工作单元数,在NVIDIA GPU上等价于每个线程块(thread block)中的线程数。全局尺寸定义了工作单元的总数目。全局尺寸必须是局部尺寸的整数倍。
OpenCL前期准备(setup)
- // Bind to platform
- err = clGetPlatformIDs(1, &cpPlatform, NULL);
- // Get ID for the device
- err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
- // Create a context
- context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
在使用OpenCL设备之前,必须先准备一个上下文(context),上下文对象用来管理命令队列(command queue)、内存(memory)、内核操作(Kernel activity),一个上下文对象一般可包含多个设备。
- // Create a command queue
- queue = clCreateCommandQueue(context, device_id, 0, &err);
编译内核(Compile Kernel)
- program = clCreateProgramWithSource(context, 1,
- (const char **) & kernelSource, NULL, &err);
- // Build the program executable
- clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
- // Create the compute kernel in the program we wish to run
- kernel = clCreateKernel(program, "vecAdd", &err);
准备数据(prepare data)
- // Create the input and output arrays in device memory for our calculation
- d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
- d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
- d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
- // Write our data set into the input array in device memory
- err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
- bytes, h_a, 0, NULL, NULL);
- err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
- bytes, h_b, 0, NULL, NULL);
- // Set the arguments to our compute kernel
- err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
- err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
- err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
- err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
在启动内核之前,我们必须创建主机和设备之间的缓存(buffer),并将主机数据(host data)和这些新创建的设备缓存相绑定,最后再设定内核参数。
启动内核(Launch Kernel)
- // Execute the kernel over the entire range of the data set
- err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
- 0, NULL, NULL);
- // Wait for the command queue to get serviced before reading back results
- clFinish(queue);
- // Read the results from the device
- clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
- bytes, h_c, 0, NULL, NULL );
编译(Compile)
- $ module load cudatoolkit
- $ cc -lOpenCL vecAdd.c -o vecAdd.out
运行(Running)
- $ aprun ./vecAdd.out
- final result: 1.000000
VecAdd.cc
C++绑定在OpenCL的开发中非常常用,它比标准C接口更为流畅,下面是一个使用这些绑定的例子。
- #define __CL_ENABLE_EXCEPTIONS
- #include "cl.hpp"
- #include <cstdio>
- #include <cstdlib>
- #include <iostream>
- #include <math.h>
- // OpenCL kernel. Each work item takes care of one element of c
- // Same kernel string as the C version: compiled at run time via
- // cl::Program; the cl_khr_fp64 pragma enables double support and the
- // (id < n) guard drops the padded-out work items.
- const char *kernelSource = "\n" \
- "#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n" \
- "__kernel void vecAdd( __global double *a, \n" \
- " __global double *b, \n" \
- " __global double *c, \n" \
- " const unsigned int n) \n" \
- "{ \n" \
- " //Get our global thread ID \n" \
- " int id = get_global_id(0); \n" \
- " \n" \
- " //Make sure we do not go out of bounds \n" \
- " if (id < n) \n" \
- " c[id] = a[id] + b[id]; \n" \
- "} \n" \
- "\n" ;
- int main(int argc, char *argv[])
- {
- // Length of vectors
- unsigned int n = 1000;
- // Host input vectors
- double *h_a;
- double *h_b;
- // Host output vector
- double *h_c;
- // Device input buffers
- cl::Buffer d_a;
- cl::Buffer d_b;
- // Device output buffer
- cl::Buffer d_c;
- // Size, in bytes, of each vector
- size_t bytes = n*sizeof(double);
- // Allocate memory for each vector on host
- h_a = new double[n];
- h_b = new double[n];
- h_c = new double[n];
- // Initialize vectors on host
- for(int i = 0; i < n; i++ )
- {
- h_a[i] = sinf(i)*sinf(i);
- h_b[i] = cosf(i)*cosf(i);
- }
- cl_int err = CL_SUCCESS;
- try {
- // Query platforms
- std::vector<cl::Platform> platforms;
- cl::Platform::get(&platforms);
- if (platforms.size() == 0) {
- std::cout << "Platform size 0\n";
- return -1;
- }
- // Get list of devices on default platform and create context
- cl_context_properties properties[] =
- { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
- cl::Context context(CL_DEVICE_TYPE_GPU, properties);
- std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
- // Create command queue for first device
- cl::CommandQueue queue(context, devices[0], 0, &err);
- // Create device memory buffers
- d_a = cl::Buffer(context, CL_MEM_READ_ONLY, bytes);
- d_b = cl::Buffer(context, CL_MEM_READ_ONLY, bytes);
- d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, bytes);
- // Bind memory buffers
- queue.enqueueWriteBuffer(d_a, CL_TRUE, 0, bytes, h_a);
- queue.enqueueWriteBuffer(d_b, CL_TRUE, 0, bytes, h_b);
- //Build kernel from source string
- cl::Program::Sources source(1,
- std::make_pair(kernelSource,strlen(kernelSource)));
- cl::Program program_ = cl::Program(context, source);
- program_.build(devices);
- // Create kernel object
- cl::Kernel kernel(program_, "vecAdd", &err);
- // Bind kernel arguments to kernel
- kernel.setArg(0, d_a);
- kernel.setArg(1, d_b);
- kernel.setArg(2, d_c);
- kernel.setArg(3, n);
- // Number of work items in each local work group
- cl::NDRange localSize(64);
- // Number of total work items - localSize must be devisor
- cl::NDRange globalSize((int)(ceil(n/(float)64)*64));
- // Enqueue kernel
- cl::Event event;
- queue.enqueueNDRangeKernel(
- kernel,
- cl::NullRange,
- globalSize,
- localSize,
- NULL,
- &event);
- // Block until kernel completion
- event.wait();
- // Read back d_c
- queue.enqueueReadBuffer(d_c, CL_TRUE, 0, bytes, h_c);
- }
- catch (cl::Error err) {
- std::cerr
- << "ERROR: "<<err.what()<<"("<<err.err()<<")"<<std::endl;
- }
- // Sum up vector c and print result divided by n, this should equal 1 within error
- double sum = 0;
- for(int i=0; i<n; i++)
- sum += h_c[i];
- std::cout<<"final result: "<<sum/n<<std::endl;
- // Release host memory
- delete(h_a);
- delete(h_b);
- delete(h_c);
- return 0;
- }
编译(Compile)
需要先下载cl.hpp
- $ module load cudatoolkit
- $ CC vecAdd.cc -lOpenCL -o vecAdd.out
运行(Running)
- $ aprun ./vecAdd.out
- final result: 1.000000