openCL（一）——hello word

最新推荐文章于 2023-07-19 17:08:34 发布

一步一个脚印的屌丝

最新推荐文章于 2023-07-19 17:08:34 发布

阅读量2.5k

点赞数

分类专栏：并行计算文章标签： output null resources input command

本文链接：https://blog.csdn.net/liurong_cn/article/details/8046178

版权

并行计算专栏收录该内容

30 篇文章 2 订阅

订阅专栏

过了一个十一长假，算是舒服了一下，又要给自己充电了，自己最近吧openCL的东西了解了一下，贴出程序和大家分享。。。

对于openCL的安装和配置，网上已经很多了，我也有转载。

对于openCL在初始化配置比较繁琐，不过在AMD的DEMOL里给提供了，是一个初始化模板。有一个头文件和一个CPP文件

一头文件 Template.h。包含了函数申明和一些全局变量的声明。这里不做说明，将在cpp中介绍

#ifndef TEMPLATE_H_
#define TEMPLATE_H_




#include <CL/cl.h>
#include <string.h>
#include <cstdlib>
#include <iostream>
#include <string>
#include <fstream>


// GLOBALS 
#define SDK_SUCCESS 0
#define SDK_FAILURE 1

/*
 * Input data is stored here.
 */
cl_uint *input;

/*
 * Output data is stored here.
 */
cl_uint *output;

/*
 * Multiplier is stored in this variable 
 */
cl_uint multiplier;

/* problem size for 1D algorithm and width of problem size for 2D algorithm */
cl_uint width;

/* The memory buffer that is used as input/output for OpenCL kernel */
cl_mem   inputBuffer;
cl_mem	 outputBuffer;

cl_context          context;
cl_device_id        *devices;
cl_command_queue    commandQueue;

cl_program program;

/* This program uses only one kernel and this serves as a handle to it */
cl_kernel  kernel;


// FUNCTION DECLARATIONS 

/*
 * OpenCL related initialisations are done here.
 * Context, Device list, Command Queue are set up.
 * Calls are made to set up OpenCL memory buffers that this program uses
 * and to load the programs into memory and get kernel handles.
 */
int initializeCL(void);

/*
 * Convert char* to string
 */
std::string convertToString(const char * filename);

/*
 * This is called once the OpenCL context, memory etc. are set up,
 * the program is loaded into memory and the kernel handles are ready.
 * 
 * It sets the values for kernels' arguments and enqueues calls to the kernels
 * on to the command queue and waits till the calls have finished execution.
 *
 * It also gets kernel start and end time if profiling is enabled.
 * @return returns SDK_SUCCESS on success and SDK_FAILURE otherwise
 */
int runCLKernels(void);

/**
 * Releases OpenCL resources (Context, Memory etc.) 
 * @return returns SDK_SUCCESS on success and SDK_FAILURE otherwise
 */
int cleanupCL(void);

/**
 * Releases program's resources
 * @return returns SDK_SUCCESS on success and SDK_FAILURE otherwise 
 */
void cleanupHost(void);

/*
 * Prints no more than 256 elements of the given array.
 * Prints full array if length is less than 256.
 *
 * Prints Array name followed by elements.
 */
void print1DArray(
         const std::string arrayName, 
         const unsigned int * arrayData, 
         const unsigned int length);


#endif  /* #ifndef TEMPLATE_H_ */

二Template.cpp文件，包括了函数实现和main调用

首先在IopenCL中，大致的流程如下：

1.把openCl装置初始化

2.在openCL装置上配置三块记忆体，以存放数据的资料

3.将数据的复制到OPEVNCL装置上

4.编译要执行的程序

5.执行编译好的程序

6.将计算结果从openCL装置上复制到内存，显示或存入文件

下面将在程序中一步一步实现上面的流程

/* ============================================================

#include "Template.h"

/*
 * \brief Host Initialization 
 *        Allocate and initialize memory 
 *        on the host. Print input array. 
 */
int
initializeHost(void)//初始化需要处理的数据和将要存储的空间，即生成两个数组：input,output
{
    width               = 256;
    input               = NULL;
    output              = NULL;
    multiplier          = 2;

    /
    // Allocate and initialize memory used by host 
    /
    cl_uint sizeInBytes = width * sizeof(cl_uint);
    input = (cl_uint *) malloc(sizeInBytes);
    if(!input)
    {
        std::cout << "Error: Failed to allocate input memory on host\n";
        return SDK_FAILURE;
    }

    output = (cl_uint *) malloc(sizeInBytes);
    if(!output)
    {
        std::cout << "Error: Failed to allocate input memory on host\n";
        return SDK_FAILURE;
    }

    for(cl_uint i = 0; i < width; i++)
        input[i] = i;

    // print input array
    print1DArray(std::string("Input").c_str(), input, width);
    return SDK_SUCCESS;
}

/*
 * Converts the contents of a file into a string
 */
std::string
convertToString(const char *filename)//将kernel源码，即自己写的并行化的函数，转化成字符串
{
    size_t size;
    char*  str;
    std::string s;

    std::fstream f(filename, (std::fstream::in | std::fstream::binary));

    if(f.is_open())
    {
        size_t fileSize;
        f.seekg(0, std::fstream::end);
        size = fileSize = (size_t)f.tellg();
        f.seekg(0, std::fstream::beg);

        str = new char[size+1];
        if(!str)
        {
            f.close();
            std::cout << "Memory allocation failed";
            return NULL;
        }

        f.read(str, fileSize);
        f.close();
        str[size] = '\0';
    
        s = str;
        delete[] str;
        return s;
    }
    else
    {
        std::cout << "\nFile containg the kernel code(\".cl\") not found. Please copy the required file in the folder containg the executable.\n";
        exit(1);
    }
    return NULL;
}

/*
 * \brief OpenCL related initialization 
 *        Create Context, Device list, Command Queue
 *        Create OpenCL memory buffer objects
 *        Load CL file, compile, link CL source 
 *		  Build program and kernel objects
 */
int
initializeCL(void)//openCL装置初始化
{
    cl_int status = 0;
    size_t deviceListSize;

     
    // STEP 1 Getting Platform.获得系统山所有可支持openCL的装置，我的电脑室AMD显卡
     
    
    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */

    cl_uint numPlatforms;//平台数目
    cl_platform_id platform = NULL;//平台ID
    status = clGetPlatformIDs(0, NULL, &numPlatforms);//获取平台数目
    if(status != CL_SUCCESS)
    {
        std::cout << "Error: Getting Platforms. (clGetPlatformsIDs)\n";
        return SDK_FAILURE;
    }
    
    if(numPlatforms > 0)
    {
        cl_platform_id* platforms = new cl_platform_id[numPlatforms];
        status = clGetPlatformIDs(numPlatforms, platforms, NULL);//测试是否能获得平台信息
        if(status != CL_SUCCESS)
        {
            std::cout << "Error: Getting Platform Ids. (clGetPlatformsIDs)\n";
            return SDK_FAILURE;
        }
        for(unsigned int i=0; i < numPlatforms; ++i)//获得每个平台的信息
        {
            char pbuff[100];
            status = clGetPlatformInfo(
                        platforms[i],
                        CL_PLATFORM_VENDOR,
                        sizeof(pbuff),
                        pbuff,
                        NULL);
            if(status != CL_SUCCESS)
            {
                std::cout << "Error: Getting Platform Info.(clGetPlatformInfo)\n";
                return SDK_FAILURE;
            }
            platform = platforms[i];
            if(!strcmp(pbuff, "Advanced Micro Devices, Inc."))
            {
                break;
            }
        }
        delete platforms;
    }

    if(NULL == platform)
    {
        std::cout << "NULL platform found so Exiting Application." << std::endl;
        return SDK_FAILURE;
    }


     
    // STEP 2 Creating context using the platform selected
    //      Context created from type includes all available
    //      devices of the specified type from the selected platform 
	 //   从指定的装置类别中，建立一个openCL context.
     
    

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */
    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };
	/
	//函数参数介绍
	//第一个参数：cl_context_properties，包含三个数据的数组：property种类（这里                 //                 CL_CONTEXT_PLATFORM为使用平台ID，第二个是平台ID，）
	//第二个参数：指定要使用的装置的类别：
	  /*
	  CL_DEVICE_TYPE_CPU:使用CPU装置
	  CL_DEVICE_TYPE_GPU:使用显示晶片装置
	  CL_DEVICE_TYPE_ACCELERATOR:特定的OPENCL加速装置，例如CELL
	  CL_DEVICE_TYPE_DEFAULT:系统预定的openCL装置
	  CL_DEVICE_TYPE_ALL:所有系统统计中的openCL装置
	  */
	//其他参数不知道了。。。。
    context = clCreateContextFromType(cps, 
                                      CL_DEVICE_TYPE_CPU, 
                                      NULL, 
                                      NULL, 
                                      &status);
    if(status != CL_SUCCESS) 
    {  
        std::cout << "Error: Creating Context. (clCreateContextFromType)\n";
        return SDK_FAILURE; 
    }


     
    // STEP 3
    //      3.1 Query context for the device list size,
    //      3.2 Allocate that much memory using malloc or new
    //      3.3 Again query context info to get the array of device
    //              available in the created context
	//      3.1获得装置列表大小
	//      3.2开辟空间为列表大小
	//      3.3获得所有装置的列表
     
    
    // First, get the size of device list data获得装置的列表的大小
    status = clGetContextInfo(context, 
                              CL_CONTEXT_DEVICES, //表示要取得的装置的列表
                              0, 
                              NULL, 
                              &deviceListSize);
    if(status != CL_SUCCESS) 
    {  
        std::cout <<
            "Error: Getting Context Info \
            (device list size, clGetContextInfo)\n";
        return SDK_FAILURE;
    }

    devices = (cl_device_id *)malloc(deviceListSize);
    if(devices == 0)
    {
        std::cout << "Error: No devices found.\n";
        return SDK_FAILURE;
    }

    // Now, get the device list data获得装置的列表信息
    status = clGetContextInfo(
                 context, 
                 CL_CONTEXT_DEVICES, 
                 deviceListSize, 
                 devices, 
                 NULL);
    if(status != CL_SUCCESS) 
    { 
        std::cout <<
            "Error: Getting Context Info \
            (device list, clGetContextInfo)\n";
        return SDK_FAILURE;
    }

     
    // STEP 4 Creating command queue for a single device
    //      Each device in the context can have a 
    //      dedicated commandqueue object for itself
	//    建立Command Queue命令队列，接受对openCL的各种操作     
     
    
    commandQueue = clCreateCommandQueue(
                       context, //之前获得装置的列表信息
                       devices[0], //自己选择的装置
                       0, 
                       &status);
    if(status != CL_SUCCESS) 
    { 
        std::cout << "Creating Command Queue. (clCreateCommandQueue)\n";
        return SDK_FAILURE;
    }

    /
    // STEP 5 Creating cl_buffer objects from host buffer
    //          These buffer objects can be passed to the kernel
    //          as kernel arguments
	//        配置记忆体并复制资料到装置
    /
	//分别创建两个记忆体，输入和输出
    inputBuffer = clCreateBuffer(
                      context, 
                      CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                      sizeof(cl_uint) * width,
                      input, 
                      &status);
    if(status != CL_SUCCESS) 
    { 
        std::cout << "Error: clCreateBuffer (inputBuffer)\n";
        return SDK_FAILURE;
    }

    outputBuffer = clCreateBuffer(
                       context, 
                       CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                       sizeof(cl_uint) * width,
                       output, 
                       &status);
    if(status != CL_SUCCESS) 
    { 
        std::cout << "Error: clCreateBuffer (outputBuffer)\n";
        return SDK_FAILURE;
    }


    /
    // STEP 6. Building Kernel
    //      6.1 Load CL file, using basic file i/o
    //      6.2 Build CL program object
    //      6.3 Create CL kernel object
	//      6.1导入CL文件，即自己写的并行化的函数
	//      6.2建立程序
	//      6.3编译程序
    /
    const char * filename  = "Template_Kernels.cl";
    std::string  sourceStr = convertToString(filename);
    const char * source    = sourceStr.c_str();
    size_t sourceSize[]    = { strlen(source) };
	//直接将CL文件读到记忆体
    program = clCreateProgramWithSource(
                  context, 
                  1, 
                  &source,
                  sourceSize,
                  &status);
    if(status != CL_SUCCESS) 
    { 
      std::cout <<
               "Error: Loading Binary into cl_program \
               (clCreateProgramWithBinary)\n";
      return SDK_FAILURE;
    }

    // create a cl program executable for all the devices specified
    status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
    if(status != CL_SUCCESS) 
    { 
        std::cout << "Error: Building Program (clBuildProgram)\n";
        return SDK_FAILURE; 
    }

    // get a kernel object handle for a kernel with the given name
    kernel = clCreateKernel(program, "templateKernel", &status);
    if(status != CL_SUCCESS) 
    {  
        std::cout << "Error: Creating Kernel from program. (clCreateKernel)\n";
        return SDK_FAILURE;
    }

    return SDK_SUCCESS;
}


/*
 * \brief Run OpenCL program 
 *
 *        Bind host variables to kernel arguments 
 *        Run the CL kernel
 */
int 
runCLKernels(void)
{
    cl_int   status;
    cl_uint maxDims;
    cl_event events[2];
    size_t globalThreads[1];
    size_t localThreads[1];
    size_t maxWorkGroupSize;
    size_t maxWorkItemSizes[3];

     
    // STEP 7 Analyzing proper workgroup size for the kernel
    //          by querying device information
    //    7.1 Device Info CL_DEVICE_MAX_WORK_GROUP_SIZE
    //    7.2 Device Info CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
    //    7.3 Device Info CL_DEVICE_MAX_WORK_ITEM_SIZES
     
    
    
    /**
    * Query device capabilities. Maximum 
    * work item dimensions and the maximmum
    * work item sizes
    */ 
    status = clGetDeviceInfo(
        devices[0], 
        CL_DEVICE_MAX_WORK_GROUP_SIZE, 
        sizeof(size_t), 
        (void*)&maxWorkGroupSize, 
        NULL);
    if(status != CL_SUCCESS) 
    {  
        std::cout << "Error: Getting Device Info. (clGetDeviceInfo)\n";
        return SDK_FAILURE;
    }
    
    status = clGetDeviceInfo(
        devices[0], 
        CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, 
        sizeof(cl_uint), 
        (void*)&maxDims, 
        NULL);
    if(status != CL_SUCCESS) 
    {  
        std::cout << "Error: Getting Device Info. (clGetDeviceInfo)\n";
        return SDK_FAILURE;
    }

    status = clGetDeviceInfo(
        devices[0], 
        CL_DEVICE_MAX_WORK_ITEM_SIZES, 
        sizeof(size_t)*maxDims,
        (void*)maxWorkItemSizes,
        NULL);
    if(status != CL_SUCCESS) 
    {  
        std::cout << "Error: Getting Device Info. (clGetDeviceInfo)\n";
        return SDK_FAILURE;
    }
    
    globalThreads[0] = width;
    localThreads[0]  = 1;

    if(localThreads[0] > maxWorkGroupSize ||
        localThreads[0] > maxWorkItemSizes[0])
    {
        std::cout << "Unsupported: Device does not support requested number of work items.";
        return SDK_FAILURE;
    }

     
    // STEP 8 Set appropriate arguments to the kernel
    //    8.1 Kernel Arg outputBuffer ( cl_mem object)
    //    8.2 Kernel Arg inputBuffer (cl_mem object)
    //    8.3 Kernel Arg multiplier (cl_uint)
     
    
    // the output array to the kernel
    status = clSetKernelArg(
                    kernel, 
                    0, 
                    sizeof(cl_mem), 
                    (void *)&outputBuffer);
    if(status != CL_SUCCESS) 
    { 
        std::cout << "Error: Setting kernel argument. (output)\n";
        return SDK_FAILURE;
    }

    // the input array to the kernel
    status = clSetKernelArg(
                    kernel, 
                    1, 
                    sizeof(cl_mem), 
                    (void *)&inputBuffer);
    if(status != CL_SUCCESS) 
    { 
        std::cout << "Error: Setting kernel argument. (input)\n";
        return SDK_FAILURE;
    }

    // multiplier
    status = clSetKernelArg(
                    kernel, 
                    2, 
                    sizeof(cl_uint), 
                    (void *)&multiplier);
    if(status != CL_SUCCESS) 
    { 
        std::cout << "Error: Setting kernel argument. (multiplier)\n";
        return SDK_FAILURE;
    }

     
    // STEP 9 Enqueue a kernel run call.
    //          Wait till the event completes and release the event
     
    status = clEnqueueNDRangeKernel(
                 commandQueue,
                 kernel,
                 1,
                 NULL,
                 globalThreads,
                 localThreads,
                 0,
                 NULL,
                 &events[0]);
    if(status != CL_SUCCESS) 
    { 
        std::cout <<
            "Error: Enqueueing kernel onto command queue. \
            (clEnqueueNDRangeKernel)\n";
        return SDK_FAILURE;
    }


    // wait for the kernel call to finish execution
    status = clWaitForEvents(1, &events[0]);
    if(status != CL_SUCCESS) 
    { 
        std::cout <<
            "Error: Waiting for kernel run to finish. \
            (clWaitForEvents)\n";
        return SDK_FAILURE;
    }

    status = clReleaseEvent(events[0]);
    if(status != CL_SUCCESS) 
    { 
        std::cout <<
            "Error: Release event object. \
            (clReleaseEvent)\n";
        return SDK_FAILURE;
    }

     
    // STEP 10  Enqueue readBuffer to read the output back
    //  Wait for the event and release the event
     
    status = clEnqueueReadBuffer(
                commandQueue,
                outputBuffer,
                CL_TRUE,
                0,
                width * sizeof(cl_uint),
                output,
                0,
                NULL,
                &events[1]);
    
    if(status != CL_SUCCESS) 
    { 
        std::cout << 
            "Error: clEnqueueReadBuffer failed. \
             (clEnqueueReadBuffer)\n";
        return SDK_FAILURE;
    }
    
    // Wait for the read buffer to finish execution
    status = clWaitForEvents(1, &events[1]);
    if(status != CL_SUCCESS) 
    { 
        std::cout <<
            "Error: Waiting for read buffer call to finish. \
            (clWaitForEvents)\n";
        return SDK_FAILURE;
    }
    
    status = clReleaseEvent(events[1]);
    if(status != CL_SUCCESS) 
    { 
        std::cout <<
            "Error: Release event object. \
            (clReleaseEvent)\n";
        return SDK_FAILURE;
    }
    return SDK_SUCCESS;
}


/*
 * \brief Release OpenCL resources (Context, Memory etc.) 
 */
int  
cleanupCL(void)
{
    cl_int status;

     
    // STEP 11  CLean up the opencl resources used 
     
    
    status = clReleaseKernel(kernel);
    if(status != CL_SUCCESS)
    {
        std::cout << "Error: In clReleaseKernel \n";
        return SDK_FAILURE; 
    }
    status = clReleaseProgram(program);
    if(status != CL_SUCCESS)
    {
        std::cout << "Error: In clReleaseProgram\n";
        return SDK_FAILURE; 
    }
    status = clReleaseMemObject(inputBuffer);
    if(status != CL_SUCCESS)
    {
        std::cout << "Error: In clReleaseMemObject (inputBuffer)\n";
        return SDK_FAILURE; 
    }
    status = clReleaseMemObject(outputBuffer);
    if(status != CL_SUCCESS)
    {
        std::cout << "Error: In clReleaseMemObject (outputBuffer)\n";
        return SDK_FAILURE; 
    }
    status = clReleaseCommandQueue(commandQueue);
    if(status != CL_SUCCESS)
    {
        std::cout << "Error: In clReleaseCommandQueue\n";
        return SDK_FAILURE;
    }
    status = clReleaseContext(context);
    if(status != CL_SUCCESS)
    {
        std::cout << "Error: In clReleaseContext\n";
        return SDK_FAILURE;
    }
    return SDK_SUCCESS;
}


/* 
 * \brief Releases program's resources 
 */
void
cleanupHost(void)
{
    if(input != NULL)
    {
        free(input);
        input = NULL;
    }
    if(output != NULL)
    {
        free(output);
        output = NULL;
    }
    if(devices != NULL)
    {
        free(devices);
        devices = NULL;
    }
}


/*
 * \brief Print no more than 256 elements of the given array.
 *
 *        Print Array name followed by elements.
 */
void print1DArray(
         const std::string arrayName, 
         const unsigned int * arrayData, 
         const unsigned int length)
{
    cl_uint i;
    cl_uint numElementsToPrint = (256 < length) ? 256 : length;

    std::cout << std::endl;
    std::cout << arrayName << ":" << std::endl;
    for(i = 0; i < numElementsToPrint; ++i)
    {
        std::cout << arrayData[i] << " ";
    }
    std::cout << std::endl;

}

void verify()
{
    bool passed = true;
    for(unsigned int i = 0; i < width; ++i)
        if(input[i] * multiplier != output[i])
            passed = false;

    if(passed == true)
        std::cout << "Passed!\n" << std::endl;
    else
        std::cout << "Failed!\n" << std::endl;
}

int 
main(int argc, char * argv[])
{
    // Initialize Host application 
    if(initializeHost() != SDK_SUCCESS)
        return SDK_FAILURE;

    // Initialize OpenCL resources
    if(initializeCL() != SDK_SUCCESS)
        return SDK_FAILURE;

    // Run the CL program
    if(runCLKernels() != SDK_SUCCESS)
        return SDK_FAILURE;

    // Print output array
    print1DArray(std::string("Output"), output, width);

    // Verify output
    verify();

    // Releases OpenCL resources 
    if(cleanupCL()!= SDK_SUCCESS)
        return SDK_FAILURE;

    // Release host resources
    cleanupHost();

    return SDK_SUCCESS;
}