二进制版opencl程序

转载-原文地址:http://www.photoneray.com/opencl_04/

在前一篇介绍 program 等术语时,提到创建 program 对象有两种方式: clCreateProgramWithSourceclCreateProgramWithBinary。区别仅在于 opencl 程序在用户面前的展现形式,前者是源代码形式,后者是二进制形式。二进制形式的数据格式是不透明的,不同的实现可以有不同的标准。使用二进制形式的好处有二:一是由于二进制码已经经过编译(部分编译为中间件或全部编译为可执行文件),所以加载速度更快,需要的内存更少;二是可以保护 opencl 代码,保护知识产权。

下面我们就来看看如何利用二进制形式:


存储 opencl 程序为二进制

我们的第一个问题是:二进制版的 opencl 程序从哪里来?前文说过,所有的 cl 代码都要经过加载并创建 program 对象,然后由 program 对象在 device 上面编译并执行。难道还有其他方式编译 opencl 代码?答案是:NO!

意味着我们还是需要将代码送到 device 里面编译。你会说,这不是多此一举吗?看起来确实有点,不过一般的做法是在软件安装的时候就进行编译保存二进制形式,然后真正运行时才加载二进制。这样分成两个步骤的话,倒也说的过去。

省去前面那些与 platformdevicecontext的代码,我们直接进入创建 program 的地方。首先还是利用 clCreateProgramWithSource 函数读取源代码文件并用 clBuildProgram 函数编译。示例代码如下:

cl_int status;
cl_program program;

ifstream kernelFile("binary_kernel.ocl", ios::in);
if(!kernelFile.is_open())
	return;

ostringstream oss;
oss << kernelFile.rdbuf();

string srcStdStr = oss.str();
const char *srcStr = srcStdStr.c_str();

program = clCreateProgramWithSource(context, 
									1, 
									(const char **)&srcStr,
									NULL,
									NULL);

if(program ==NULL)
	return;

status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

if(status != CL_SUCCESS)
	return;

代码可能不完整,完整示例请看文末。

现在我们已经将 opencl 代码在 device 编译完成了。接下来要做的就是将编译好的二进制取出来存在磁盘上。使用的 API 就是 clGetProgramInfo:

cl_int clGetProgramInfo(cl_program program,
						cl_program_info param_name,
						size_t param_value_size,
						void *param_value,
						size_t *param_value_size_ret)

使用方法见如下代码片段(为使逻辑清晰,省略了错误检测,实际开发可不要省啊):

cl_uint numDevices = 0;

// 获取 program 绑定过的 device 数量
clGetProgramInfo(program,
				CL_PROGRAM_NUM_DEVICES,
				sizeof(cl_uint),
				&numDevices,
				NULL);

// 获取所有的 device ID
cl_device_id *devices = new cl_device_id[numDevices];
clGetProgramInfo(program,
				CL_PROGRAM_DEVICES,
				sizeof(cl_device_id) * numDevices,
				devices,
				NULL);

// 决定每个 program 二进制的大小
size_t *programBinarySizes = new size_t[numDevices];
clGetProgramInfo(program,
				CL_PROGRAM_BINARY_SIZES,
				sizeof(size_t) * numDevices,
				programBinarySizes,
				NULL);

unsigned char **programBinaries = new unsigned char *[numDevices];
for(cl_uint i = 0; i < numDevices; ++i)
	programBinaries[i] = new unsigned char[programBinarySizes[i]];

// 获取所有的 program 二进制
clGetProgramInfo(program,
				CL_PROGRAM_BINARIES,
				sizeof(unsigned char *) * numDevices,
				programBinaries,
				NULL);

// 存储 device 所需要的二进制
for(cl_uint i = 0; i < numDevices; ++i){
	// 只存储 device 需要的二进制,多个 device 需要存储多个二进制
	if(devices[i] == device){
		FILE *fp = fopen("kernel_binary_ocl.bin", "wb"); 
		fwrite(programBinaries[i], 1, programBinarySizes[i], fp);
		fclose(fp);
		break;
	}
}

// 清理
delete[] devices;
delete [] programBinarySizes;
for(cl_uint i = 0; i < numDevices; ++i)
	delete [] programBinaries[i];
delete[] programBinaries;

要注意的是,可能有很多个 device 都编译了 program,所以将二进制提取出来时,我们是遍历了所有编译了 programdevice


读取二进制版opencl程序

经过上面一系列的操作,我们的磁盘上应该存在一个二进制版的 opencl 程序了。里面的内容可能是可读的,也可能是不可读的。这个视不同厂商实现而不同。

相对于存储,读取看起来就清爽的多,无非是打开二进制文件,然后调用 clCreateProgramWithBinary函数。示例如下:

FILE *fp= fopen("kernel_binary_ocl.bin", "rb");

// 获取二进制的大小
size_t binarySize;
fseek(fp, 0, SEEK_END);
binarySize = ftell(fp);
rewind(fp);

// 加载二进制文件
unsigned char *programBinary = new unsigned char[binarySize];
fread(programBinary, 1, binarySize, fp);
fclose(fp);

cl_program program;
program = clCreateProgramWithBinary(context,
									1,
									&device,
									&binarySize,
									(const unsigned char**)&programBinary,
									NULL,
									NULL);

delete [] programBinary;

clBuildProgram(program 0, NULL, NULL, NULL, NULL);

这里要注意,即使加载是二进制版,我们在之后还是要对其进行 clBuildProgram。原因在于,我们无法保证所谓的二进制一定是可执行码。因为每个厂商的实现不一,有的可能就是最终执行码,而有的却是中间码。所以无论是从源代码还是二进制创建 program,之后都需要 clBuildProgram

这样兜了一圈,发现要使用二进制版还是用了一遍源代码方式,感觉代码复杂好多,有点多余。其实换个角度来看,我们完全可以写成两个程序,一个专门用来读取源代码并编译生成二进制,另一个才是读取二进制运行软件。前者开发人员使用,后者才是给用户使用的。只有这样才能体现二进制版的优势。


示例代码

OpenCLCompileToBin.cpp


/*  OpenCLCompileToBin.cpp
 *  (c) by keyring <keyrings@163.com>
 *  2016.01.13
 */

#include <iostream>

#if defined(__APPLE__) || defined(__MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif


#define KERNEL(...)#__VA_ARGS__

const char *kernelSourceCode = KERNEL(
                                   __kernel void hellocl(__global uint *buffer)
{
    size_t gidx = get_global_id(0);
    size_t gidy = get_global_id(1);
    size_t lidx = get_local_id(0);
    buffer[gidx + 4 * gidy] = (1 << gidx) | (0x10 << gidy);

}
                               );

int main(int argc, char const *argv[])
{
    printf("hello OpenCL\n");
    cl_int status = 0;
    size_t deviceListSize;

    // 得到并选择可用平台
    cl_uint numPlatforms;
    cl_platform_id platform = NULL;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);

    if (status != CL_SUCCESS) {
        printf("ERROR: Getting Platforms.(clGetPlatformIDs)\n");
        return EXIT_FAILURE;
    }

    if (numPlatforms > 0) {
        cl_platform_id *platforms = (cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id));
        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if (status != CL_SUCCESS) {
            printf("Error: Getting Platform Ids.(clGetPlatformIDs)\n");
            return -1;
        }

        for (unsigned int i = 0; i < numPlatforms; ++i) {
            char pbuff[100];
            status = clGetPlatformInfo(
                         platforms[i],
                         CL_PLATFORM_VENDOR,
                         sizeof(pbuff),
                         pbuff,
                         NULL);
            platform = platforms[i];
            if (!strcmp(pbuff, "Advanced Micro Devices, Inc.")) {
                break;
            }
        }

        delete platforms;
    }

    // 如果我们能找到相应平台,就使用它,否则返回NULL
    cl_context_properties cps[3] = {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platform,
        0
    };

    cl_context_properties *cprops = (NULL == platform) ? NULL : cps;


    // 生成 context
    cl_context context = clCreateContextFromType(
                             cprops,
                             CL_DEVICE_TYPE_GPU,
                             NULL,
                             NULL,
                             &status);
    if (status != CL_SUCCESS) {
        printf("Error: Creating Context.(clCreateContexFromType)\n");
        return EXIT_FAILURE;
    }

    // 寻找OpenCL设备

    // 首先得到设备列表的长度
    status = clGetContextInfo(context,
                              CL_CONTEXT_DEVICES,
                              0,
                              NULL,
                              &deviceListSize);
    if (status != CL_SUCCESS) {
        printf("Error: Getting Context Info device list size, clGetContextInfo)\n");
        return EXIT_FAILURE;
    }
    cl_device_id *devices = (cl_device_id *)malloc(deviceListSize);
    if (devices == 0) {
        printf("Error: No devices found.\n");
        return EXIT_FAILURE;
    }

    // 现在得到设备列表
    status = clGetContextInfo(context,
                              CL_CONTEXT_DEVICES,
                              deviceListSize,
                              devices,
                              NULL);
    if (status != CL_SUCCESS) {
        printf("Error: Getting Context Info (device list, clGetContextInfo)\n");
        return EXIT_FAILURE;
    }


    // 装载内核程序,编译CL program ,生成CL内核实例

    size_t sourceSize[] = {strlen(kernelSourceCode)};
    cl_program program = clCreateProgramWithSource(context,
                         1,
                         &kernelSourceCode,
                         sourceSize,
                         &status);
    if (status != CL_SUCCESS) {
        printf("Error: Loading Binary into cl_program (clCreateProgramWithBinary)\n");
        return EXIT_FAILURE;
    }

    // 为指定的设备编译CL program.
    status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
    if (status != CL_SUCCESS) {
        printf("Error: Building Program (clBuildingProgram)\n");
        return EXIT_FAILURE;
    }
    deviceListSize = 1;
    // 决定每个 program 二进制的大小
    size_t *programBinarySizes = new size_t[deviceListSize];
    clGetProgramInfo(program,
                    CL_PROGRAM_BINARY_SIZES,
                    sizeof(size_t) * deviceListSize,
                    programBinarySizes,
                    NULL);

    printf("%lu\n", deviceListSize);
    unsigned char **programBinaries = new unsigned char *[deviceListSize];
    for(cl_uint i = 0; i < deviceListSize; ++i)
        programBinaries[i] = new unsigned char[programBinarySizes[i]];

    // 获取所有的 program 二进制
    clGetProgramInfo(program,
                    CL_PROGRAM_BINARIES,
                    sizeof(unsigned char *) * deviceListSize,
                    programBinaries,
                    NULL);

    printf("ready write to file\n");
    // 写入文件
    FILE *fp = fopen("./kernel_binary_ocl.bin", "wb"); 
    fwrite(programBinaries[0], 1, programBinarySizes[0], fp);
    fclose(fp);


    // 资源回收

    status = clReleaseProgram(program);
    status = clReleaseContext(context);

    free(devices);
    delete [] programBinarySizes;
    for(cl_uint i = 0; i < deviceListSize; ++i)
        delete [] programBinaries[i];
    delete programBinaries;

    return 0;
}

OpenCLRunWithBin.cpp


/*  OpenCLRunWithBin.cpp
 *  (c) by keyring <keyrings@163.com>
 *  2016.01.13
 */

#include <iostream>

#if defined(__APPLE__) || defined(__MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif


// #define KERNEL(...)#__VA_ARGS__

// const char *kernelSourceCode = KERNEL(
//                                    __kernel void hellocl(__global uint *buffer)
// {
//     size_t gidx = get_global_id(0);
//     size_t gidy = get_global_id(1);
//     size_t lidx = get_local_id(0);
//     buffer[gidx + 4 * gidy] = (1 << gidx) | (0x10 << gidy);

// }
//                                );

int main(int argc, char const *argv[])
{
    printf("hello OpenCL\n");
    cl_int status = 0;
    size_t deviceListSize;

    // 得到并选择可用平台
    cl_uint numPlatforms;
    cl_platform_id platform = NULL;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);

    if (status != CL_SUCCESS) {
        printf("ERROR: Getting Platforms.(clGetPlatformIDs)\n");
        return EXIT_FAILURE;
    }

    if (numPlatforms > 0) {
        cl_platform_id *platforms = (cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id));
        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if (status != CL_SUCCESS) {
            printf("Error: Getting Platform Ids.(clGetPlatformIDs)\n");
            return -1;
        }

        for (unsigned int i = 0; i < numPlatforms; ++i) {
            char pbuff[100];
            status = clGetPlatformInfo(
                         platforms[i],
                         CL_PLATFORM_VENDOR,
                         sizeof(pbuff),
                         pbuff,
                         NULL);
            platform = platforms[i];
            if (!strcmp(pbuff, "Advanced Micro Devices, Inc.")) {
                break;
            }
        }

        delete platforms;
    }

    // 如果我们能找到相应平台,就使用它,否则返回NULL
    cl_context_properties cps[3] = {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platform,
        0
    };

    cl_context_properties *cprops = (NULL == platform) ? NULL : cps;


    // 生成 context
    cl_context context = clCreateContextFromType(
                             cprops,
                             CL_DEVICE_TYPE_GPU,
                             NULL,
                             NULL,
                             &status);
    if (status != CL_SUCCESS) {
        printf("Error: Creating Context.(clCreateContexFromType)\n");
        return EXIT_FAILURE;
    }

    // 寻找OpenCL设备

    // 首先得到设备列表的长度
    status = clGetContextInfo(context,
                              CL_CONTEXT_DEVICES,
                              0,
                              NULL,
                              &deviceListSize);
    if (status != CL_SUCCESS) {
        printf("Error: Getting Context Info device list size, clGetContextInfo)\n");
        return EXIT_FAILURE;
    }
    cl_device_id *devices = (cl_device_id *)malloc(deviceListSize);
    if (devices == 0) {
        printf("Error: No devices found.\n");
        return EXIT_FAILURE;
    }

    // 现在得到设备列表
    status = clGetContextInfo(context,
                              CL_CONTEXT_DEVICES,
                              deviceListSize,
                              devices,
                              NULL);
    if (status != CL_SUCCESS) {
        printf("Error: Getting Context Info (device list, clGetContextInfo)\n");
        return EXIT_FAILURE;
    }

    FILE *fp= fopen("./kernel_binary_ocl.bin", "rb");

    // 获取二进制的大小
    size_t binarySize;
    fseek(fp, 0, SEEK_END);
    binarySize = ftell(fp);
    rewind(fp);

    // 加载二进制文件
    unsigned char *programBinary = new unsigned char[binarySize];
    fread(programBinary, 1, binarySize, fp);
    fclose(fp);

    cl_program program;
    program = clCreateProgramWithBinary(context,
                                        1,
                                        &devices[0],
                                        &binarySize,
                                        (const unsigned char**)&programBinary,
                                        NULL,
                                        NULL);

    delete [] programBinary;

    // 装载内核程序,编译CL program ,生成CL内核实例

    // size_t sourceSize[] = {strlen(kernelSourceCode)};
    // cl_program program = clCreateProgramWithSource(context,
    //                      1,
    //                      &kernelSourceCode,
    //                      sourceSize,
    //                      &status);
    // if (status != CL_SUCCESS) {
    //     printf("Error: Loading Binary into cl_program (clCreateProgramWithBinary)\n");
    //     return EXIT_FAILURE;
    // }

    // 为指定的设备编译CL program.
    status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
    if (status != CL_SUCCESS) {
        printf("Error: Building Program (clBuildingProgram)\n");
        return EXIT_FAILURE;
    }

    // 得到指定名字的内核实例的句柄
    cl_kernel kernel = clCreateKernel(program, "hellocl", &status);
    if (status != CL_SUCCESS) {
        printf("Error: Creating Kernel from program.(clCreateKernel)\n");
        return EXIT_FAILURE;
    }

    // 创建 OpenCL buffer 对象
    unsigned int *outbuffer = new unsigned int [4 * 4];
    memset(outbuffer, 0, 4 * 4 * 4);
    cl_mem outputBuffer = clCreateBuffer(
        context, 
        CL_MEM_ALLOC_HOST_PTR, 
        4 * 4 * 4, 
        NULL, 
        &status);

    if (status != CL_SUCCESS) {
        printf("Error: Create Buffer, outputBuffer. (clCreateBuffer)\n");
        return EXIT_FAILURE;
    }


    //  为内核程序设置参数
    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&outputBuffer);
    if (status != CL_SUCCESS) {
        printf("Error: Setting kernel argument. (clSetKernelArg)\n");
        return EXIT_FAILURE;
    }

    // 创建一个OpenCL command queue
    cl_command_queue commandQueue = clCreateCommandQueue(context,
                                    devices[0],
                                    0,
                                    &status);
    if (status != CL_SUCCESS) {
        printf("Error: Create Command Queue. (clCreateCommandQueue)\n");
        return EXIT_FAILURE;
    }


    // 将一个kernel 放入 command queue
    size_t globalThreads[] = {4, 4};
    size_t localThreads[] = {2, 2};
    status = clEnqueueNDRangeKernel(commandQueue, kernel,
                                    2, NULL, globalThreads,
                                    localThreads, 0,
                                    NULL, NULL);
    if (status != CL_SUCCESS) {
        printf("Error: Enqueueing kernel\n");
        return EXIT_FAILURE;
    }

    // 确认 command queue 中所有命令都执行完毕
    status = clFinish(commandQueue);
    if (status != CL_SUCCESS) {
        printf("Error: Finish command queue\n");
        return EXIT_FAILURE;
    }

    // 将内存对象中的结果读回Host
    status = clEnqueueReadBuffer(commandQueue,
                                 outputBuffer, CL_TRUE, 0,
                                 4 * 4 * 4, outbuffer, 0, NULL, NULL);
    if (status != CL_SUCCESS) {
        printf("Error: Read buffer queue\n");
        return EXIT_FAILURE;
    }

    // Host端打印结果
    printf("out:\n");
    for (int i = 0; i < 16; ++i) {
        printf("%x ", outbuffer[i]);
        if ((i + 1) % 4 == 0)
            printf("\n");
    }

    // 资源回收
    status = clReleaseKernel(kernel);
    status = clReleaseProgram(program);
    status = clReleaseMemObject(outputBuffer);
    status = clReleaseCommandQueue(commandQueue);
    status = clReleaseContext(context);

    free(devices);
    delete outbuffer;

    system("pause");

    return 0;
}

先使用compile文件编译一个bin,然后使用run文件加载bin运行。

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值