http://blog.csdn.net/lwfcgz/article/details/44056607
简介
下面一个例子介绍了向量加法的OpenCL版,相当于学习C语言中的“Hello World”,本篇教程中的代码以及其余相关教程都可以通过OLCF github下载
vecAdd.c
- #include <stdio.h>
- #include <stdlib.h>
- #include <math.h>
- #include <CL/opencl.h>
- // OpenCL kernel. Each work item takes care of one element of c
- // The kernel is kept as a plain C string and handed to
- // clCreateProgramWithSource / clBuildProgram at run time; the
- // cl_khr_fp64 pragma enables double-precision support on the device.
- // The bounds check (id < n) discards the padding work items created
- // when the global size is rounded up to a multiple of the local size.
- const char *kernelSource = "\n" \
- "#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n" \
- "__kernel void vecAdd( __global double *a, \n" \
- " __global double *b, \n" \
- " __global double *c, \n" \
- " const unsigned int n) \n" \
- "{ \n" \
- " //Get our global thread ID \n" \
- " int id = get_global_id(0); \n" \
- " \n" \
- " //Make sure we do not go out of bounds \n" \
- " if (id < n) \n" \
- " c[id] = a[id] + b[id]; \n" \
- "} \n" \
- "\n" ;
- int main( int argc, char* argv[] )
- {
- // Length of vectors
- unsigned int n = 100000;
- // Host input vectors
- double *h_a;
- double *h_b;
- // Host output vector
- double *h_c;
- // Device input buffers
- cl_mem d_a;
- cl_mem d_b;
- // Device output buffer
- cl_mem d_c;
- cl_platform_id cpPlatform; // OpenCL platform
- cl_device_id device_id; // device ID
- cl_context context; // context
- cl_command_queue queue; // command queue
- cl_program program; // program
- cl_kernel kernel; // kernel
- // Size, in bytes, of each vector
- size_t bytes = n*sizeof(double);
- // Allocate memory for each vector on host
- h_a = (double*)malloc(bytes);
- h_b = (double*)malloc(bytes);
- h_c = (double*)malloc(bytes);
- // Initialize vectors on host
- int i;
- for( i = 0; i < n; i++ )
- {
- h_a[i] = sinf(i)*sinf(i);
- h_b[i] = cosf(i)*cosf(i);
- }
- size_t globalSize, localSize;
- cl_int err;
- // Number of work items in each local work group
- localSize = 64;
- // Number of total work items - localSize must be devisor
- globalSize = ceil(n/(float)localSize)*localSize;
- // Bind to platform
- err = clGetPlatformIDs(1, &cpPlatform, NULL);
- // Get ID for the device
- err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
- // Create a context
- context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
- // Create a command queue
- queue = clCreateCommandQueue(context, device_id, 0, &err);
- // Create the compute program from the source buffer
- program = clCreateProgramWithSource(context, 1,
- (const char **) & kernelSource, NULL, &err);
- // Build the program executable
- clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
- // Create the compute kernel in the program we wish to run
- kernel = clCreateKernel(program, "vecAdd", &err);
- // Create the input and output arrays in device memory for our calculation
- d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
- d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
- d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
- // Write our data set into the input array in device memory
- err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
- bytes, h_a, 0, NULL, NULL);
- err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
- bytes, h_b, 0, NULL, NULL);
- // Set the arguments to our compute kernel
- err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
- err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
- err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
- err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
- // Execute the kernel over the entire range of the data set
- err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
- 0, NULL, NULL);
- // Wait for the command queue to get serviced before reading back results
- clFinish(queue);
- // Read the results from the device
- clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
- bytes, h_c, 0, NULL, NULL );
- //Sum up vector c and print result divided by n, this should equal 1 within error
- double sum = 0;
- for(i=0; i<n; i++)
- sum += h_c[i];
- printf("final result: %f\n", sum/n);
- // release OpenCL resources
- clReleaseMemObject(d_a);
- clReleaseMemObject(d_b);
- clReleaseMemObject(d_c);
- clReleaseProgram(program);
- clReleaseKernel(kernel);
- clReleaseCommandQueue(queue);
- clReleaseContext(context);
- //release host memory
- free(h_a);
- free(h_b);
- free(h_c);
- return 0;
- }
代码分析
内核(kernel):
kernel是OpenCL代码的核心部分,整个内核必须通过C字符串的形式读入,最简单的办法是像代码一样定义一个长长的字符串,在真实的项目代码中通常都会从单独的文件中读入内核。
- // OpenCL kernel. Each work item takes care of one element of c
- const char *kernelSource = "\n" \
- "#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n" \
- "__kernel void vecAdd( __global double *a, \n" \
- " __global double *b, \n" \
- " __global double *c, \n" \
- " const unsigned int n) \n" \
- "{ \n" \
- " //Get our global thread ID \n" \
- " int id = get_global_id(0); \n" \
- " \n" \
- " //Make sure we do not go out of bounds \n" \
- " if (id < n) \n" \
- " c[id] = a[id] + b[id]; \n" \
- "} \n" \
- "\n" ;
下面是内核的函数声明:
__kernel void vecAdd( __global double *a,
                      __global double *b,
                      __global double *c,
                      const unsigned int n)
__kernel是一个定义OpenCL内核的关键字,__global则定义函数指针指向全局设备内存空间,否则可以使用一般的C语言函数声明语法。内核的返回值必须为空void
int id = get_global_id(0);
通过get_global_id函数可以获得当前工作单元(work item)的全局id,参数为0表示获取X维上的ID。
if (id < n)
    c[id] = a[id] + b[id];
工作组(work group)的个数必定是整数,由于工作组的大小不一定是需要的线程数的整数倍,因此通常使用的线程数比需要的线程数要多,在程序设计时可以将无用的线程简单丢弃掉。
内存(Memory)
- // Host input vectors
- double *h_a;
- double *h_b;
- // Host output vector
- double *h_c;
- // Device input buffers
- cl_mem d_a;
- cl_mem d_b;
- // Device output buffer
- cl_mem d_c;
主机CPU和GPU有不同的内存空间,因此需要分别定义,上面的代码中前半部分定义主机(host)CPU的内存指针,后半部分定义设备(device)内存的handle,分别用h_和d_前缀来区分。
线程映射(Thread Mapping)
- // Number of work items in each local work group
- localSize = 64;
- // Number of total work items - localSize must be a divisor
- globalSize = ceil(n/(float)localSize)*localSize;
为了将我们要解决的问题映射到底层硬件结构,必须定义局部尺寸(local size)和全局尺寸(global size)。局部尺寸定义了每个工作组中的工作单元数,在NVIDIA GPU上等价于每个线程块(thread block)中的线程数。全局尺寸定义了工作单元的总数目。全局尺寸必须是局部尺寸的整数倍。
OpenCL前期准备(setup)
- // Bind to platform
- err = clGetPlatformIDs(1, &cpPlatform, NULL);
- // Get ID for the device
- err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
- // Create a context
- context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
在使用OpenCL设备之前,必须先准备一个上下文(context),上下文对象用来管理命令队列(command queue)、内存(memory)、内核操作(Kernel activity),一个上下文对象一般可包含多个设备。
- // Create a command queue
- queue = clCreateCommandQueue(context, device_id, 0, &err);
编译内核(Compile Kernel)
- program = clCreateProgramWithSource(context, 1,
- (const char **) & kernelSource, NULL, &err);
- // Build the program executable
- clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
- // Create the compute kernel in the program we wish to run
- kernel = clCreateKernel(program, "vecAdd", &err);
准备数据(prepare data)
- // Create the input and output arrays in device memory for our calculation
- d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
- d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
- d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
- // Write our data set into the input array in device memory
- err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
- bytes, h_a, 0, NULL, NULL);
- err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
- bytes, h_b, 0, NULL, NULL);
- // Set the arguments to our compute kernel
- err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
- err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
- err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
- err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
在启动内核之前,我们必须创建主机和设备之间的缓存(buffer),并将主机数据(host data)和这些新创建的设备缓存相绑定,最后再设定内核参数。
启动内核(Launch Kernel)
- // Execute the kernel over the entire range of the data set
- err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
- 0, NULL, NULL);
- // Wait for the command queue to get serviced before reading back results
- clFinish(queue);
- // Read the results from the device
- clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
- bytes, h_c, 0, NULL, NULL );
编译(Compile)
- $ module load cudatoolkit
- $ cc -lOpenCL vecAdd.c -o vecAdd.out
运行(Running)
- $ aprun ./vecAdd.out
- final result: 1.000000
VecAdd.cc
C++绑定在OpenCL的开发中非常常用,它比标准C接口更为流畅,下面是一个使用这些绑定的例子。
- #define __CL_ENABLE_EXCEPTIONS
- #include "cl.hpp"
- #include <cstdio>
- #include <cstdlib>
- #include <iostream>
- #include <math.h>
- // OpenCL kernel. Each work item takes care of one element of c
- // Same kernel string as the C version: compiled at run time via
- // cl::Program; the cl_khr_fp64 pragma enables double support and the
- // (id < n) guard drops the padded-out work items.
- const char *kernelSource = "\n" \
- "#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n" \
- "__kernel void vecAdd( __global double *a, \n" \
- " __global double *b, \n" \
- " __global double *c, \n" \
- " const unsigned int n) \n" \
- "{ \n" \
- " //Get our global thread ID \n" \
- " int id = get_global_id(0); \n" \
- " \n" \
- " //Make sure we do not go out of bounds \n" \
- " if (id < n) \n" \
- " c[id] = a[id] + b[id]; \n" \
- "} \n" \
- "\n" ;
- int main(int argc, char *argv[])
- {
- // Length of vectors
- unsigned int n = 1000;
- // Host input vectors
- double *h_a;
- double *h_b;
- // Host output vector
- double *h_c;
- // Device input buffers
- cl::Buffer d_a;
- cl::Buffer d_b;
- // Device output buffer
- cl::Buffer d_c;
- // Size, in bytes, of each vector
- size_t bytes = n*sizeof(double);
- // Allocate memory for each vector on host
- h_a = new double[n];
- h_b = new double[n];
- h_c = new double[n];
- // Initialize vectors on host
- for(int i = 0; i < n; i++ )
- {
- h_a[i] = sinf(i)*sinf(i);
- h_b[i] = cosf(i)*cosf(i);
- }
- cl_int err = CL_SUCCESS;
- try {
- // Query platforms
- std::vector<cl::Platform> platforms;
- cl::Platform::get(&platforms);
- if (platforms.size() == 0) {
- std::cout << "Platform size 0\n";
- return -1;
- }
- // Get list of devices on default platform and create context
- cl_context_properties properties[] =
- { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
- cl::Context context(CL_DEVICE_TYPE_GPU, properties);
- std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
- // Create command queue for first device
- cl::CommandQueue queue(context, devices[0], 0, &err);
- // Create device memory buffers
- d_a = cl::Buffer(context, CL_MEM_READ_ONLY, bytes);
- d_b = cl::Buffer(context, CL_MEM_READ_ONLY, bytes);
- d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, bytes);
- // Bind memory buffers
- queue.enqueueWriteBuffer(d_a, CL_TRUE, 0, bytes, h_a);
- queue.enqueueWriteBuffer(d_b, CL_TRUE, 0, bytes, h_b);
- //Build kernel from source string
- cl::Program::Sources source(1,
- std::make_pair(kernelSource,strlen(kernelSource)));
- cl::Program program_ = cl::Program(context, source);
- program_.build(devices);
- // Create kernel object
- cl::Kernel kernel(program_, "vecAdd", &err);
- // Bind kernel arguments to kernel
- kernel.setArg(0, d_a);
- kernel.setArg(1, d_b);
- kernel.setArg(2, d_c);
- kernel.setArg(3, n);
- // Number of work items in each local work group
- cl::NDRange localSize(64);
- // Number of total work items - localSize must be devisor
- cl::NDRange globalSize((int)(ceil(n/(float)64)*64));
- // Enqueue kernel
- cl::Event event;
- queue.enqueueNDRangeKernel(
- kernel,
- cl::NullRange,
- globalSize,
- localSize,
- NULL,
- &event);
- // Block until kernel completion
- event.wait();
- // Read back d_c
- queue.enqueueReadBuffer(d_c, CL_TRUE, 0, bytes, h_c);
- }
- catch (cl::Error err) {
- std::cerr
- << "ERROR: "<<err.what()<<"("<<err.err()<<")"<<std::endl;
- }
- // Sum up vector c and print result divided by n, this should equal 1 within error
- double sum = 0;
- for(int i=0; i<n; i++)
- sum += h_c[i];
- std::cout<<"final result: "<<sum/n<<std::endl;
- // Release host memory
- delete(h_a);
- delete(h_b);
- delete(h_c);
- return 0;
- }
编译(Compile)
需要先下载cl.hpp
- $ module load cudatoolkit
- $ CC vecAdd.cc -lOpenCL -o vecAdd.out
运行(Running)
- $ aprun ./vecAdd.out
- final result: 1.000000