简介
下面一个例子介绍了向量加法的OpenCL版,相当于学习C语言中的“Hello World”,本篇教程中的代码以及其余相关教程都可以通过OLCF github下载
vecAdd.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <CL/opencl.h>
// OpenCL kernel. Each work item takes care of one element of c.
// The kernel lives in a C string and is JIT-compiled at runtime by
// clBuildProgram; the cl_khr_fp64 pragma enables double-precision
// support (required because a, b, c are double buffers). Real projects
// usually load this source from a separate .cl file instead.
const char *kernelSource = "\n" \
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n" \
"__kernel void vecAdd( __global double *a, \n" \
" __global double *b, \n" \
" __global double *c, \n" \
" const unsigned int n) \n" \
"{ \n" \
" //Get our global thread ID \n" \
" int id = get_global_id(0); \n" \
" \n" \
" //Make sure we do not go out of bounds \n" \
" if (id < n) \n" \
" c[id] = a[id] + b[id]; \n" \
"} \n" \
"\n" ;
int main( int argc, char* argv[] )
{
// Length of vectors
unsigned int n = 100000;
// Host input vectors
double *h_a;
double *h_b;
// Host output vector
double *h_c;
// Device input buffers
cl_mem d_a;
cl_mem d_b;
// Device output buffer
cl_mem d_c;
cl_platform_id cpPlatform; // OpenCL platform
cl_device_id device_id; // device ID
cl_context context; // context
cl_command_queue queue; // command queue
cl_program program; // program
cl_kernel kernel; // kernel
// Size, in bytes, of each vector
size_t bytes = n*sizeof(double);
// Allocate memory for each vector on host
h_a = (double*)malloc(bytes);
h_b = (double*)malloc(bytes);
h_c = (double*)malloc(bytes);
// Initialize vectors on host
int i;
for( i = 0; i < n; i++ )
{
h_a[i] = sinf(i)*sinf(i);
h_b[i] = cosf(i)*cosf(i);
}
size_t globalSize, localSize;
cl_int err;
// Number of work items in each local work group
localSize = 64;
// Number of total work items - localSize must be devisor
globalSize = ceil(n/(float)localSize)*localSize;
// Bind to platform
err = clGetPlatformIDs(1, &cpPlatform, NULL);
// Get ID for the device
err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
// Create a context
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
// Create a command queue
queue = clCreateCommandQueue(context, device_id, 0, &err);
// Create the compute program from the source buffer
program = clCreateProgramWithSource(context, 1,
(const char **) & kernelSource, NULL, &err);
// Build the program executable
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
// Create the compute kernel in the program we wish to run
kernel = clCreateKernel(program, "vecAdd", &err);
// Create the input and output arrays in device memory for our calculation
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
// Write our data set into the input array in device memory
err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
bytes, h_a, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
bytes, h_b, 0, NULL, NULL);
// Set the arguments to our compute kernel
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
// Execute the kernel over the entire range of the data set
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
0, NULL, NULL);
// Wait for the command queue to get serviced before reading back results
clFinish(queue);
// Read the results from the device
clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
bytes, h_c, 0, NULL, NULL );
//Sum up vector c and print result divided by n, this should equal 1 within error
double sum = 0;
for(i=0; i<n; i++)
sum += h_c[i];
printf("final result: %f\n", sum/n);
// release OpenCL resources
clReleaseMemObject(d_a);
clReleaseMemObject(d_b);
clReleaseMemObject(d_c);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseContext(context);
//release host memory
free(h_a);
free(h_b);
free(h_c);
return 0;
}
代码分析
内核(kernel):
kernel是OpenCL代码的核心部分,整个内核必须通过C字符串的形式读入,最简单的办法是像代码一样定义一个长长的字符串,在真实的项目代码中通常都会从单独的文件中读入内核。
// OpenCL kernel. Each work item takes care of one element of c
const char *kernelSource = "\n" \
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n" \
"__kernel void vecAdd( __global double *a, \n" \
" __global double *b, \n" \
" __global double *c, \n" \
" const unsigned int n) \n" \
"{ \n" \
" //Get our global thread ID \n" \
" int id = get_global_id(0); \n" \
" \n" \
" //Make sure we do not go out of bounds \n" \
" if (id < n) \n" \
" c[id] = a[id] + b[id]; \n" \
"} \n" \
"\n" ;
下面是内核的函数声明:
__kernel void vecAdd( __global double *a,
                      __global double *b,
                      __global double *c,
                      const unsigned int n)
__kernel是一个定义OpenCL内核的关键字,__global则声明指针指向全局设备内存空间,除此之外可以使用一般的C语言函数声明语法。内核的返回值必须为空void。
int id = get_global_id(0);
通过get_global_id函数可以获得当前工作单元(work item)的全局id,参数为0表示获取X维上的ID。
if (id < n)
    c[id] = a[id] + b[id];
工作组(work group)的个数必定是整数,由于工作组的大小不一定是需要的线程数的整数倍,因此通常使用的线程数比需要的线程数要多,在程序设计时可以将无用的线程简单丢弃掉。
内存(Memory)
// Host input vectors
double *h_a;
double *h_b;
// Host output vector
double *h_c;
// Device input buffers
cl_mem d_a;
cl_mem d_b;
// Device output buffer
cl_mem d_c;
主机CPU和GPU有不同的内存空间,因此需要分别定义,上面的代码中前半部分定义主机(host)CPU的内存指针,后半部分定义设备(device)内存的handle,分别用h_和d_前缀来区分。
线程映射(Thread Mapping)
// Number of work items in each local work group
localSize = 64;
// Number of total work items - localSize must be a divisor
globalSize = ceil(n/(float)localSize)*localSize;
为了将我们要解决的问题映射到底层硬件结构,必须定义局部尺寸(local size)和全局尺寸(global size)。局部尺寸定义了每个工作组中的工作单元数,在NVIDIA GPU上等价于每个线程块(thread block)中的线程数。全局尺寸定义了工作单元的总数目。全局尺寸必须是局部尺寸的整数倍。
OpenCL前期准备(setup)
// Bind to platform
err = clGetPlatformIDs(1, &cpPlatform, NULL);
每个硬件厂商都会绑定一个不同的平台(platform),在这里clGetPlatformIDs会将cpPlatform设置成包含系统可用平台的变量。举个例子,如果一个系统包含AMD CPU以及NVIDIA GPU,并且安装了恰当的OpenCL驱动,那么两个OpenCL平台会被返回。
// Get ID for the device
err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
可以询问每一个平台都包含哪些设备,在这里我们通过使用CL_DEVICE_TYPE_GPU来查询GPU设备。
// Create a context
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
在使用OpenCL设备之前,必须先准备一个上下文(context),上下文对象用来管理命令队列(command queue)、内存(memory)、内核操作(kernel activity),一个上下文对象一般可包含多个设备。
// Create a command queue
queue = clCreateCommandQueue(context, device_id, 0, &err);
命令队列(command queue)用来流式地将命令从主机送到指定的设备,可以把数据传输和内核操作命令放到命令队列上,当条件适宜的时候命令就会被执行。
编译内核(Compile Kernel)
program = clCreateProgramWithSource(context, 1,
(const char **) & kernelSource, NULL, &err);
// Build the program executable
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
// Create the compute kernel in the program we wish to run
kernel = clCreateKernel(program, "vecAdd", &err);
为了保证OpenCL代码可以移植到许多不同的设备上,运行kernel的默认方式是JIT(Just-in-time, 实时编译)。首先创建一个program对象(包含一系列内核代码),然后再创建一系列的内核。
准备数据(prepare data)
// Create the input and output arrays in device memory for our calculation
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
// Write our data set into the input array in device memory
err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
bytes, h_a, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
bytes, h_b, 0, NULL, NULL);
// Set the arguments to our compute kernel
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
在启动内核之前,我们必须创建主机和设备之间的缓存(buffer),将主机数据(host data)与这些新创建的设备缓存相绑定,最后再设定内核参数。
启动内核(Launch Kernel)
// Execute the kernel over the entire range of the data set
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
0, NULL, NULL);
将结果拷贝回主机(Copy results to host)
// Wait for the command queue to get serviced before reading back results
clFinish(queue);
// Read the results from the device
clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
bytes, h_c, 0, NULL, NULL );
我们可以阻塞程序直到命令队列变为空,然后把结果拷贝回主机。
编译(Compile)
$ module load cudatoolkit
$ cc -lOpenCL vecAdd.c -o vecAdd.out
运行(Running)
$ aprun ./vecAdd.out
final result: 1.000000
VecAdd.cc
C++绑定在OpenCL的开发中非常常用,它比标准C接口更为流畅,下面是一个使用这些绑定的例子。
#define __CL_ENABLE_EXCEPTIONS
#include "cl.hpp"
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <math.h>
// OpenCL kernel. Each work item takes care of one element of c.
// Same kernel string as the C version: JIT-compiled at runtime via
// cl::Program::build; cl_khr_fp64 enables double-precision support.
const char *kernelSource = "\n" \
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n" \
"__kernel void vecAdd( __global double *a, \n" \
" __global double *b, \n" \
" __global double *c, \n" \
" const unsigned int n) \n" \
"{ \n" \
" //Get our global thread ID \n" \
" int id = get_global_id(0); \n" \
" \n" \
" //Make sure we do not go out of bounds \n" \
" if (id < n) \n" \
" c[id] = a[id] + b[id]; \n" \
"} \n" \
"\n" ;
int main(int argc, char *argv[])
{
// Length of vectors
unsigned int n = 1000;
// Host input vectors
double *h_a;
double *h_b;
// Host output vector
double *h_c;
// Device input buffers
cl::Buffer d_a;
cl::Buffer d_b;
// Device output buffer
cl::Buffer d_c;
// Size, in bytes, of each vector
size_t bytes = n*sizeof(double);
// Allocate memory for each vector on host
h_a = new double[n];
h_b = new double[n];
h_c = new double[n];
// Initialize vectors on host
for(int i = 0; i < n; i++ )
{
h_a[i] = sinf(i)*sinf(i);
h_b[i] = cosf(i)*cosf(i);
}
cl_int err = CL_SUCCESS;
try {
// Query platforms
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if (platforms.size() == 0) {
std::cout << "Platform size 0\n";
return -1;
}
// Get list of devices on default platform and create context
cl_context_properties properties[] =
{ CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
cl::Context context(CL_DEVICE_TYPE_GPU, properties);
std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
// Create command queue for first device
cl::CommandQueue queue(context, devices[0], 0, &err);
// Create device memory buffers
d_a = cl::Buffer(context, CL_MEM_READ_ONLY, bytes);
d_b = cl::Buffer(context, CL_MEM_READ_ONLY, bytes);
d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, bytes);
// Bind memory buffers
queue.enqueueWriteBuffer(d_a, CL_TRUE, 0, bytes, h_a);
queue.enqueueWriteBuffer(d_b, CL_TRUE, 0, bytes, h_b);
//Build kernel from source string
cl::Program::Sources source(1,
std::make_pair(kernelSource,strlen(kernelSource)));
cl::Program program_ = cl::Program(context, source);
program_.build(devices);
// Create kernel object
cl::Kernel kernel(program_, "vecAdd", &err);
// Bind kernel arguments to kernel
kernel.setArg(0, d_a);
kernel.setArg(1, d_b);
kernel.setArg(2, d_c);
kernel.setArg(3, n);
// Number of work items in each local work group
cl::NDRange localSize(64);
// Number of total work items - localSize must be devisor
cl::NDRange globalSize((int)(ceil(n/(float)64)*64));
// Enqueue kernel
cl::Event event;
queue.enqueueNDRangeKernel(
kernel,
cl::NullRange,
globalSize,
localSize,
NULL,
&event);
// Block until kernel completion
event.wait();
// Read back d_c
queue.enqueueReadBuffer(d_c, CL_TRUE, 0, bytes, h_c);
}
catch (cl::Error err) {
std::cerr
<< "ERROR: "<<err.what()<<"("<<err.err()<<")"<<std::endl;
}
// Sum up vector c and print result divided by n, this should equal 1 within error
double sum = 0;
for(int i=0; i<n; i++)
sum += h_c[i];
std::cout<<"final result: "<<sum/n<<std::endl;
// Release host memory
delete(h_a);
delete(h_b);
delete(h_c);
return 0;
}
编译(Compile)
需要先下载cl.hpp
$ module load cudatoolkit
$ CC vecAdd.cc -lOpenCL -o vecAdd.out
运行(Running)
$ aprun ./vecAdd.out
final result: 1.000000