数据并行化计算与任务并行化分解可以加快程序的运行速度。现在只讲数据并行。下一节讲任务并行。
如下基本算术例子,输入数组A和数组B,得到输出数组C,C的结果如图中output所示。
A数组如下:5行4列。
B数组如下:同样是5行4列。
实现的结果:
第1列蓝色 的值相加;
第2列绿色 的值相减;
第3列红色 的值相乘;
第4列青色 的值相除;
C++ 的示意代码如下:
float C[16];
int i;
for(i=0; i<5; i++)
{
C[i*4+0] = A[i*4+0] + B[i*4+0]; //task A
C[i*4+1] = A[i*4+1] - B[i*4+1];//task B
C[i*4+2] = A[i*4+2] * B[i*4+2];//task C
C[i*4+3] = A[i*4+3] / B[i*4+3];// task D
}
可以发现每一个for循环都由加减乘除4个任务组成,分别为task A、task B、task C和task D。按时间顺序从0时刻开始执行i=0到i=4的5个计算单元,运行完成时间假设为T。
对于每个程序块,A,B的数据来源都不同,图中的颜色对应task的颜色,由于数据之间并没有依赖关系,所以在程序设计时可以使i=0,1,2,3,4 五个程序块一起运行,将不同的数据给相同的处理函数同时运行,理想化得使运行时间缩减到T/5,如上图所示。这种办法对不同的数据使用相同的核函数,称为数据并行。
数据化并行使用的OpenCL的API函数是:clEnqueueNDRangeKernel()
以下是参考程序:
Template.cl
__kernel void dataParallel(__global float* A, __global float* B, __global float* C)
{
int base = 4*get_global_id(0);
C[base+0] = A[base+0] + B[base+0];
C[base+1] = A[base+1] - B[base+1];
C[base+2] = A[base+2] * B[base+2];
C[base+3] = A[base+3] / B[base+3];
printf("%d %f %f %f %f\n",base,C[base+0],C[base+1],C[base+2],C[base+3]);
}
util.c
#include <stdio.h>
#include <stdlib.h>
#include <tchar.h>
#include <memory.h>
#include <windows.h>
#include "CL\cl.h"
#include "CL\cl_ext.h"
#include "utils.h"
#include <assert.h>
//we want to use POSIX functions
#pragma warning( push )
#pragma warning( disable : 4996 )
void LogInfo(const char* str, ...)
{
if (str)
{
va_list args;
va_start(args, str);
vfprintf(stdout, str, args);
va_end(args);
}
}
void LogError(const char* str, ...)
{
if (str)
{
va_list args;
va_start(args, str);
vfprintf(stderr, str, args);
va_end(args);
}
}
// Upload the OpenCL C source code to output argument source
// The memory resource is implicitly allocated in the function
// and should be deallocated by the caller
int ReadSourceFromFile(const char* fileName, char** source, size_t* sourceSize)
{
int errorCode = CL_SUCCESS;
FILE* fp = NULL;
fopen_s(&fp, fileName, "rb");
if (fp == NULL)
{
LogError("Error: Couldn't find program source file '%s'.\n", fileName);
errorCode = CL_INVALID_VALUE;
}
else {
fseek(fp, 0, SEEK_END);
*sourceSize = ftell(fp);
fseek(fp, 0, SEEK_SET);
*source = new char[*sourceSize];
if (*source == NULL)
{
LogError("Error: Couldn't allocate %d bytes for program source from file '%s'.\n", *sourceSize, fileName);
errorCode = CL_OUT_OF_HOST_MEMORY;
}
else {
fread(*source, 1, *sourceSize, fp);
}
}
return errorCode;
}
#pragma warning( pop )
main.c
#include <stdio.h>
#include <stdlib.h>
#include <tchar.h>
#include <memory.h>
#include <vector>
#include "CL\cl.h"
#include "utils.h"
//for perf. counters
#include <Windows.h>
// Macros for OpenCL versions
#define OPENCL_VERSION_1_2 1.2f
#define OPENCL_VERSION_2_0 2.0f
/* This function helps to create informative messages in
* case when OpenCL errors occur. It returns a string
* representation for an OpenCL error code.
* (E.g. "CL_DEVICE_NOT_FOUND" instead of just -1.)
*/
const char* TranslateOpenCLError(cl_int errorCode)
{
switch(errorCode)
{
case CL_SUCCESS: return "CL_SUCCESS";
case CL_DEVICE_NOT_FOUND: return "CL_DEVICE_NOT_FOUND";
case CL_DEVICE_NOT_AVAILABLE: return "CL_DEVICE_NOT_AVAILABLE";
case CL_COMPILER_NOT_AVAILABLE: return "CL_COMPILER_NOT_AVAILABLE";
case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
case CL_OUT_OF_RESOURCES: return "CL_OUT_OF_RESOURCES";
case CL_OUT_OF_HOST_MEMORY: return "CL_OUT_OF_HOST_MEMORY";
case CL_PROFILING_INFO_NOT_AVAILABLE: return "CL_PROFILING_INFO_NOT_AVAILABLE";
case CL_MEM_COPY_OVERLAP: return "CL_MEM_COPY_OVERLAP";
case CL_IMAGE_FORMAT_MISMATCH: return "CL_IMAGE_FORMAT_MISMATCH";
case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
case CL_BUILD_PROGRAM_FAILURE: return "CL_BUILD_PROGRAM_FAILURE";
case CL_MAP_FAILURE: return "CL_MAP_FAILURE";
case CL_MISALIGNED_SUB_BUFFER_OFFSET: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; //-13
case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; //-14
case CL_COMPILE_PROGRAM_FAILURE: return "CL_COMPILE_PROGRAM_FAILURE"; //-15
case CL_LINKER_NOT_AVAILABLE: return "CL_LINKER_NOT_AVAILABLE"; //-16
case CL_LINK_PROGRAM_FAILURE: return "CL_LINK_PROGRAM_FAILURE"; //-17
case CL_DEVICE_PARTITION_FAILED: return "CL_DEVICE_PARTITION_FAILED"; //-18
case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; //-19
case CL_INVALID_VALUE: return "CL_INVALID_VALUE";
case CL_INVALID_DEVICE_TYPE: return "CL_INVALID_DEVICE_TYPE";
case CL_INVALID_PLATFORM: return "CL_INVALID_PLATFORM";
case CL_INVALID_DEVICE: return "CL_INVALID_DEVICE";
case CL_INVALID_CONTEXT: return "CL_INVALID_CONTEXT";
case CL_INVALID_QUEUE_PROPERTIES: return "CL_INVALID_QUEUE_PROPERTIES";
case CL_INVALID_COMMAND_QUEUE: return "CL_INVALID_COMMAND_QUEUE";
case CL_INVALID_HOST_PTR: return "CL_INVALID_HOST_PTR";
case CL_INVALID_MEM_OBJECT: return "CL_INVALID_MEM_OBJECT";
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
case CL_INVALID_IMAGE_SIZE: return "CL_INVALID_IMAGE_SIZE";
case CL_INVALID_SAMPLER: return "CL_INVALID_SAMPLER";
case CL_INVALID_BINARY: return "CL_INVALID_BINARY";
case CL_INVALID_BUILD_OPTIONS: return "CL_INVALID_BUILD_OPTIONS";
case CL_INVALID_PROGRAM: return "CL_INVALID_PROGRAM";
case CL_INVALID_PROGRAM_EXECUTABLE: return "CL_INVALID_PROGRAM_EXECUTABLE";
case CL_INVALID_KERNEL_NAME: return "CL_INVALID_KERNEL_NAME";
case CL_INVALID_KERNEL_DEFINITION: return "CL_INVALID_KERNEL_DEFINITION";
case CL_INVALID_KERNEL: return "CL_INVALID_KERNEL";
case CL_INVALID_ARG_INDEX: return "CL_INVALID_ARG_INDEX";
case CL_INVALID_ARG_VALUE: return "CL_INVALID_ARG_VALUE";
case CL_INVALID_ARG_SIZE: return "CL_INVALID_ARG_SIZE";
case CL_INVALID_KERNEL_ARGS: return "CL_INVALID_KERNEL_ARGS";
case CL_INVALID_WORK_DIMENSION: return "CL_INVALID_WORK_DIMENSION";
case CL_INVALID_WORK_GROUP_SIZE: return "CL_INVALID_WORK_GROUP_SIZE";
case CL_INVALID_WORK_ITEM_SIZE: return "CL_INVALID_WORK_ITEM_SIZE";
case CL_INVALID_GLOBAL_OFFSET: return "CL_INVALID_GLOBAL_OFFSET";
case CL_INVALID_EVENT_WAIT_LIST: return "CL_INVALID_EVENT_WAIT_LIST";
case CL_INVALID_EVENT: return "CL_INVALID_EVENT";
case CL_INVALID_OPERATION: return "CL_INVALID_OPERATION";
case CL_INVALID_GL_OBJECT: return "CL_INVALID_GL_OBJECT";
case CL_INVALID_BUFFER_SIZE: return "CL_INVALID_BUFFER_SIZE";
case CL_INVALID_MIP_LEVEL: return "CL_INVALID_MIP_LEVEL";
case CL_INVALID_GLOBAL_WORK_SIZE: return "CL_INVALID_GLOBAL_WORK_SIZE"; //-63
case CL_INVALID_PROPERTY: return "CL_INVALID_PROPERTY"; //-64
case CL_INVALID_IMAGE_DESCRIPTOR: return "CL_INVALID_IMAGE_DESCRIPTOR"; //-65
case CL_INVALID_COMPILER_OPTIONS: return "CL_INVALID_COMPILER_OPTIONS"; //-66
case CL_INVALID_LINKER_OPTIONS: return "CL_INVALID_LINKER_OPTIONS"; //-67
case CL_INVALID_DEVICE_PARTITION_COUNT: return "CL_INVALID_DEVICE_PARTITION_COUNT"; //-68
// case CL_INVALID_PIPE_SIZE: return "CL_INVALID_PIPE_SIZE"; //-69
// case CL_INVALID_DEVICE_QUEUE: return "CL_INVALID_DEVICE_QUEUE"; //-70
default:
return "UNKNOWN ERROR CODE";
}
}
/* Convenient container for all OpenCL specific objects used in the sample
*
* It consists of two parts:
* - regular OpenCL objects which are used in almost each normal OpenCL applications
* - several OpenCL objects that are specific for this particular sample
*
* You collect all these objects in one structure for utility purposes
* only, there is no OpenCL specific here: just to avoid global variables
* and make passing all these arguments in functions easier.
*/
struct ocl_args_d_t
{
ocl_args_d_t();
~ocl_args_d_t();
// Regular OpenCL objects:
cl_context context; // hold the context handler
cl_device_id device; // hold the selected device handler
cl_command_queue commandQueue; // hold the commands-queue handler
cl_program program; // hold the program handler
cl_kernel kernel; // hold the kernel handler
float platformVersion; // hold the OpenCL platform version (default 1.2)
float deviceVersion; // hold the OpenCL device version (default. 1.2)
float compilerVersion; // hold the device OpenCL C version (default. 1.2)
// Objects that are specific for algorithm implemented in this sample
cl_mem srcA; // hold first source buffer
cl_mem srcB; // hold second source buffer
cl_mem dstMem; // hold destination buffer
};
ocl_args_d_t::ocl_args_d_t():
context(NULL),
device(NULL),
commandQueue(NULL),
program(NULL),
kernel(NULL),
platformVersion(OPENCL_VERSION_1_2),
deviceVersion(OPENCL_VERSION_1_2),
compilerVersion(OPENCL_VERSION_1_2),
srcA(NULL),
srcB(NULL),
dstMem(NULL)
{
}
/*
* destructor - called only once
* Release all OpenCL objects
* This is a regular sequence of calls to deallocate all created OpenCL resources in bootstrapOpenCL.
*
* You may want to call these deallocation procedures in the middle of your application execution
* (not at the end) if you don't further need OpenCL runtime.
* You may want to do that in order to free some memory, for example,
* or recreate OpenCL objects with different parameters.
*
*/
ocl_args_d_t::~ocl_args_d_t()
{
cl_int err = CL_SUCCESS;
if (kernel)
{
err = clReleaseKernel(kernel);
if (CL_SUCCESS != err)
{
LogError("Error: clReleaseKernel returned '%s'.\n", TranslateOpenCLError(err));
}
}
if (program)
{
err = clReleaseProgram(program);
if (CL_SUCCESS != err)
{
LogError("Error: clReleaseProgram returned '%s'.\n", TranslateOpenCLError(err));
}
}
if (srcA)
{
err = clReleaseMemObject(srcA);
if (CL_SUCCESS != err)
{
LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(err));
}
}
if (srcB)
{
err = clReleaseMemObject(srcB);
if (CL_SUCCESS != err)
{
LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(err));
}
}
if (dstMem)
{
err = clReleaseMemObject(dstMem);
if (CL_SUCCESS != err)
{
LogError("Error: clReleaseMemObject returned '%s'.\n", TranslateOpenCLError(err));
}
}
if (commandQueue)
{
err = clReleaseCommandQueue(commandQueue);
if (CL_SUCCESS != err)
{
LogError("Error: clReleaseCommandQueue returned '%s'.\n", TranslateOpenCLError(err));
}
}
if (device)
{
err = clReleaseDevice(device);
if (CL_SUCCESS != err)
{
LogError("Error: clReleaseDevice returned '%s'.\n", TranslateOpenCLError(err));
}
}
if (context)
{
err = clReleaseContext(context);
if (CL_SUCCESS != err)
{
LogError("Error: clReleaseContext returned '%s'.\n", TranslateOpenCLError(err));
}
}
/*
* Note there is no procedure to deallocate platform
* because it was not created at the startup,
* but just queried from OpenCL runtime.
*/
}
/*
* Check whether an OpenCL platform is the required platform
* (based on the platform's name)
*/
bool CheckPreferredPlatformMatch(cl_platform_id platform, const char* preferredPlatform)
{
size_t stringLength = 0;
cl_int err = CL_SUCCESS;
bool match = false;
// In order to read the platform's name, we first read the platform's name string length (param_value is NULL).
// The value returned in stringLength
err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, 0, NULL, &stringLength);
if (CL_SUCCESS != err)
{
LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_NAME length returned '%s'.\n", TranslateOpenCLError(err));
return false;
}
// Now, that we know the platform's name string length, we can allocate enough space before read it
std::vector<char> platformName(stringLength);
// Read the platform's name string
// The read value returned in platformName
err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, stringLength, &platformName[0], NULL);
if (CL_SUCCESS != err)
{
LogError("Error: clGetplatform_ids() to get CL_PLATFORM_NAME returned %s.\n", TranslateOpenCLError(err));
return false;
}
// Now check if the platform's name is the required one
if (strstr(&platformName[0], preferredPlatform) != 0)
{
// The checked platform is the one we're looking for
match = true;
}
return match;
}
/*
* Find and return the preferred OpenCL platform
* In case that preferredPlatform is NULL, the ID of the first discovered platform will be returned
*/
cl_platform_id FindOpenCLPlatform(const char* preferredPlatform, cl_device_type deviceType)
{
cl_uint numPlatforms = 0;
cl_int err = CL_SUCCESS;
// Get (in numPlatforms) the number of OpenCL platforms available
// No platform ID will be return, since platforms is NULL
err = clGetPlatformIDs(0, NULL, &numPlatforms);
if (CL_SUCCESS != err)
{
LogError("Error: clGetplatform_ids() to get num platforms returned %s.\n", TranslateOpenCLError(err));
return NULL;
}
LogInfo("Number of available platforms: %u\n", numPlatforms);
if (0 == numPlatforms)
{
LogError("Error: No platforms found!\n");
return NULL;
}
std::vector<cl_platform_id> platforms(numPlatforms);
// Now, obtains a list of numPlatforms OpenCL platforms available
// The list of platforms available will be returned in platforms
err = clGetPlatformIDs(numPlatforms, &platforms[0], NULL);
if (CL_SUCCESS != err)
{
LogError("Error: clGetplatform_ids() to get platforms returned %s.\n", TranslateOpenCLError(err));
return NULL;
}
// Check if one of the available platform matches the preferred requirements
for (cl_uint i = 0; i < numPlatforms; i++)
{
bool match = true;
cl_uint numDevices = 0;
// If the preferredPlatform is not NULL then check if platforms[i] is the required one
// Otherwise, continue the check with platforms[i]
if ((NULL != preferredPlatform) && (strlen(preferredPlatform) > 0))
{
// In case we're looking for a specific platform
match = CheckPreferredPlatformMatch(platforms[i], preferredPlatform);
}
// match is true if the platform's name is the required one or don't care (NULL)
if (match)
{
// Obtains the number of deviceType devices available on platform
// When the function failed we expect numDevices to be zero.
// We ignore the function return value since a non-zero error code
// could happen if this platform doesn't support the specified device type.
err = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &numDevices);
if (CL_SUCCESS != err)
{
LogError("clGetDeviceIDs() returned %s.\n", TranslateOpenCLError(err));
}
if (0 != numDevices)
{
// There is at list one device that answer the requirements
return platforms[i];
}
}
}
return NULL;
}
/*
* This function read the OpenCL platdorm and device versions
* (using clGetxxxInfo API) and stores it in the ocl structure.
* Later it will enable us to support both OpenCL 1.2 and 2.0 platforms and devices
* in the same program.
*/
int GetPlatformAndDeviceVersion (cl_platform_id platformId, ocl_args_d_t *ocl)
{
cl_int err = CL_SUCCESS;
// Read the platform's version string length (param_value is NULL).
// The value returned in stringLength
size_t stringLength = 0;
err = clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, 0, NULL, &stringLength);
if (CL_SUCCESS != err)
{
LogError("Error: clGetPlatformInfo() to get CL_PLATFORM_VERSION length returned '%s'.\n", TranslateOpenCLError(err));
return err;
}
// Now, that we know the platform's version string length, we can allocate enough space before read it
std::vector<char> platformVersion(stringLength);
// Read the platform's version string
// The read value returned in platformVersion
err = clGetPlatformInfo(platformId, CL_PLATFORM_VERSION, stringLength, &platformVersion[0], NULL);
if (CL_SUCCESS != err)
{
LogError("Error: clGetplatform_ids() to get CL_PLATFORM_VERSION returned %s.\n", TranslateOpenCLError(err));
return err;
}
if (strstr(&platformVersion[0], "OpenCL 2.0") != NULL)
{
ocl->platformVersion = OPENCL_VERSION_2_0;
}
// Read the device's version string length (param_value is NULL).
err = clGetDeviceInfo(ocl->device, CL_DEVICE_VERSION, 0, NULL, &stringLength);
if (CL_SUCCESS != err)
{
LogError("Error: clGetDeviceInfo() to get CL_DEVICE_VERSION length returned '%s'.\n", TranslateOpenCLError(err));
return err;
}
// Now, that we know the device's version string length, we can allocate enough space before read it
std::vector<char> deviceVersion(stringLength);
// Read the device's version string
// The read value returned in deviceVersion
err = clGetDeviceInfo(ocl->device, CL_DEVICE_VERSION, stringLength, &deviceVersion[0], NULL);
if (CL_SUCCESS != err)
{
LogError("Error: clGetDeviceInfo() to get CL_DEVICE_VERSION returned %s.\n", TranslateOpenCLError(err));
return err;
}
if (strstr(&deviceVersion[0], "OpenCL 2.0") != NULL)
{
ocl->deviceVersion = OPENCL_VERSION_2_0;
}
// Read the device's OpenCL C version string length (param_value is NULL).
err = clGetDeviceInfo(ocl->device, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &stringLength);
if (CL_SUCCESS != err)
{
LogError("Error: clGetDeviceInfo() to get CL_DEVICE_OPENCL_C_VERSION length returned '%s'.\n", TranslateOpenCLError(err));
return err;
}
// Now, that we know the device's OpenCL C version string length, we can allocate enough space before read it
std::vector<char> compilerVersion(stringLength);
// Read the device's OpenCL C version string
// The read value returned in compilerVersion
err = clGetDeviceInfo(ocl->device, CL_DEVICE_OPENCL_C_VERSION, stringLength, &compilerVersion[0], NULL);
if (CL_SUCCESS != err)
{
LogError("Error: clGetDeviceInfo() to get CL_DEVICE_OPENCL_C_VERSION returned %s.\n", TranslateOpenCLError(err));
return err;
}
else if (strstr(&compilerVersion[0], "OpenCL C 2.0") != NULL)
{
ocl->compilerVersion = OPENCL_VERSION_2_0;
}
return err;
}
/*
* Generate random value for input buffers
*/
void generateInputA(float* inputArray, int arrayWidth, int arrayHeight)
{
for (int i = 0; i < arrayHeight; i++) {
for (int j = 0; j < arrayWidth; j++) {
inputArray[i * arrayWidth + j] = i * arrayWidth + j + 1;
}
}
}
/*
* Generate random value for input buffers
*/
void generateInputB(float* inputArray, int arrayWidth, int arrayHeight)
{
for (int i = 0; i < arrayHeight; i++) {
for (int j = 0; j < arrayWidth; j++) {
inputArray[i * arrayWidth + j] = j * arrayWidth + i + 1;
}
}
}
/*
* This function picks/creates necessary OpenCL objects which are needed.
* The objects are:
* OpenCL platform, device, context, and command queue.
*
* All these steps are needed to be performed once in a regular OpenCL application.
* This happens before actual compute kernels calls are performed.
*
* For convenience, in this application you store all those basic OpenCL objects in structure ocl_args_d_t,
* so this function populates fields of this structure, which is passed as parameter ocl.
* Please, consider reviewing the fields before going further.
* The structure definition is right in the beginning of this file.
*/
int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType)
{
// The following variable stores return codes for all OpenCL calls.
cl_int err = CL_SUCCESS;
// Query for all available OpenCL platforms on the system
// Here you enumerate all platforms and pick one which name has preferredPlatform as a sub-string
cl_platform_id platformId = FindOpenCLPlatform("Intel", deviceType);
if (NULL == platformId)
{
LogError("Error: Failed to find OpenCL platform.\n");
return CL_INVALID_VALUE;
}
// Create context with device of specified type.
// Required device type is passed as function argument deviceType.
// So you may use this function to create context for any CPU or GPU OpenCL device.
// The creation is synchronized (pfn_notify is NULL) and NULL user_data
cl_context_properties contextProperties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platformId, 0};
ocl->context = clCreateContextFromType(contextProperties, deviceType, NULL, NULL, &err);
if ((CL_SUCCESS != err) || (NULL == ocl->context))
{
LogError("Couldn't create a context, clCreateContextFromType() returned '%s'.\n", TranslateOpenCLError(err));
return err;
}
// Query for OpenCL device which was used for context creation
err = clGetContextInfo(ocl->context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &ocl->device, NULL);
if (CL_SUCCESS != err)
{
LogError("Error: clGetContextInfo() to get list of devices returned %s.\n", TranslateOpenCLError(err));
return err;
}
// Read the OpenCL platform's version and the device OpenCL and OpenCL C versions
GetPlatformAndDeviceVersion(platformId, ocl);
// Create command queue.
// OpenCL kernels are enqueued for execution to a particular device through special objects called command queues.
// Command queue guarantees some ordering between calls and other OpenCL commands.
// Here you create a simple in-order OpenCL command queue that doesn't allow execution of two kernels in parallel on a target device.
#ifdef CL_VERSION_2_0
if (OPENCL_VERSION_2_0 == ocl->deviceVersion)
{
const cl_command_queue_properties properties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
ocl->commandQueue = clCreateCommandQueueWithProperties(ocl->context, ocl->device, properties, &err);
}
else {
// default behavior: OpenCL 1.2
cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE;
ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, properties, &err);
}
#else
// default behavior: OpenCL 1.2
cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE;
ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device, properties, &err);
#endif
if (CL_SUCCESS != err)
{
LogError("Error: clCreateCommandQueue() returned %s.\n", TranslateOpenCLError(err));
return err;
}
return CL_SUCCESS;
}
/*
* Create and build OpenCL program from its source code
*/
int CreateAndBuildProgram(ocl_args_d_t *ocl)
{
cl_int err = CL_SUCCESS;
// Upload the OpenCL C source code from the input file to source
// The size of the C program is returned in sourceSize
char* source = NULL;
size_t src_size = 0;
err = ReadSourceFromFile("Template.cl", &source, &src_size);
if (CL_SUCCESS != err)
{
LogError("Error: ReadSourceFromFile returned %s.\n", TranslateOpenCLError(err));
goto Finish;
}
// And now after you obtained a regular C string call clCreateProgramWithSource to create OpenCL program object.
ocl->program = clCreateProgramWithSource(ocl->context, 1, (const char**)&source, &src_size, &err);
if (CL_SUCCESS != err)
{
LogError("Error: clCreateProgramWithSource returned %s.\n", TranslateOpenCLError(err));
goto Finish;
}
// Build the program
// During creation a program is not built. You need to explicitly call build function.
// Here you just use create-build sequence,
// but there are also other possibilities when program consist of several parts,
// some of which are libraries, and you may want to consider using clCompileProgram and clLinkProgram as
// alternatives.
err = clBuildProgram(ocl->program, 1, &ocl->device, "", NULL, NULL);
if (CL_SUCCESS != err)
{
LogError("Error: clBuildProgram() for source program returned %s.\n", TranslateOpenCLError(err));
// In case of error print the build log to the standard output
// First check the size of the log
// Then allocate the memory and obtain the log from the program
if (err == CL_BUILD_PROGRAM_FAILURE)
{
size_t log_size = 0;
clGetProgramBuildInfo(ocl->program, ocl->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
std::vector<char> build_log(log_size);
clGetProgramBuildInfo(ocl->program, ocl->device, CL_PROGRAM_BUILD_LOG, log_size, &build_log[0], NULL);
LogError("Error happened during the build of OpenCL program.\nBuild log:%s", &build_log[0]);
}
}
Finish:
if (source)
{
delete[] source;
source = NULL;
}
return err;
}
/*
* Create OpenCL buffers from host memory
* These buffers will be used later by the OpenCL kernel
*/
int CreateBufferArguments(ocl_args_d_t *ocl, float* inputA, float* inputB, float* outputC, int arrayWidth, int arrayHeight)
{
cl_int err = CL_SUCCESS;
// Create new OpenCL buffer objects
// As these buffer are used only for read by the kernel, you are recommended to create it with flag CL_MEM_READ_ONLY.
// Always set minimal read/write flags for buffers, it may lead to better performance because it allows runtime
// to better organize data copying.
// You use CL_MEM_COPY_HOST_PTR here, because the buffers should be populated with bytes at inputA and inputB.
ocl->srcA = clCreateBuffer(ocl->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(float) * arrayWidth * arrayHeight, inputA, &err);
if (CL_SUCCESS != err)
{
LogError("Error: clCreateBuffer for srcA returned %s\n", TranslateOpenCLError(err));
return err;
}
ocl->srcB = clCreateBuffer(ocl->context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(float) * arrayWidth * arrayHeight, inputB, &err);
if (CL_SUCCESS != err)
{
LogError("Error: clCreateBuffer for srcB returned %s\n", TranslateOpenCLError(err));
return err;
}
// If the output buffer is created directly on top of output buffer using CL_MEM_USE_HOST_PTR,
// then, depending on the OpenCL runtime implementation and hardware capabilities,
// it may save you not necessary data copying.
// As it is known that output buffer will be write only, you explicitly declare it using CL_MEM_WRITE_ONLY.
ocl->dstMem = clCreateBuffer(ocl->context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(float) * arrayWidth * arrayHeight, outputC, &err);
if (CL_SUCCESS != err)
{
LogError("Error: clCreateBuffer for dstMem returned %s\n", TranslateOpenCLError(err));
return err;
}
return CL_SUCCESS;
}
/*
* Set kernel arguments
*/
cl_uint SetKernelArguments(ocl_args_d_t *ocl)
{
cl_int err = CL_SUCCESS;
err = clSetKernelArg(ocl->kernel, 0, sizeof(cl_mem), (void *)&ocl->srcA);
if (CL_SUCCESS != err)
{
LogError("error: Failed to set argument srcA, returned %s\n", TranslateOpenCLError(err));
return err;
}
err = clSetKernelArg(ocl->kernel, 1, sizeof(cl_mem), (void *)&ocl->srcB);
if (CL_SUCCESS != err)
{
LogError("Error: Failed to set argument srcB, returned %s\n", TranslateOpenCLError(err));
return err;
}
err = clSetKernelArg(ocl->kernel, 2, sizeof(cl_mem), (void *)&ocl->dstMem);
if (CL_SUCCESS != err)
{
LogError("Error: Failed to set argument dstMem, returned %s\n", TranslateOpenCLError(err));
return err;
}
return err;
}
/*
* Execute the kernel
*/
cl_uint ExecuteAddKernel(ocl_args_d_t *ocl, cl_uint width, cl_uint height)
{
cl_int err = CL_SUCCESS;
// Define global iteration space for clEnqueueNDRangeKernel.
size_t globalWorkSize[2] = {width, height};
size_t global_item_size = height;
size_t local_item_size = 1;
// execute kernel
err = clEnqueueNDRangeKernel(ocl->commandQueue, ocl->kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
if (CL_SUCCESS != err)
{
LogError("Error: Failed to run kernel, return %s\n", TranslateOpenCLError(err));
return err;
}
// Wait until the queued kernel is completed by the device
err = clFinish(ocl->commandQueue);
if (CL_SUCCESS != err)
{
LogError("Error: clFinish return %s\n", TranslateOpenCLError(err));
return err;
}
return CL_SUCCESS;
}
/*
* "Read" the result buffer (mapping the buffer to the host memory address)
*/
bool ReadAndVerify(ocl_args_d_t *ocl, int width, int height, float *inputA, float *inputB)
{
cl_int err = CL_SUCCESS;
bool result = true;
// Enqueue a command to map the buffer object (ocl->dstMem) into the host address space and returns a pointer to it
// The map operation is blocking
float *resultPtr = (float *)clEnqueueMapBuffer(ocl->commandQueue, ocl->dstMem, true, CL_MAP_READ, 0, sizeof(cl_uint) * width * height, 0, NULL, NULL, &err);
if (CL_SUCCESS != err)
{
LogError("Error: clEnqueueMapBuffer returned %s\n", TranslateOpenCLError(err));
return false;
}
// Call clFinish to guarantee that output region is updated
err = clFinish(ocl->commandQueue);
if (CL_SUCCESS != err)
{
LogError("Error: clFinish returned %s\n", TranslateOpenCLError(err));
}
// We mapped dstMem to resultPtr, so resultPtr is ready and includes the kernel output !!!
// Verify the results
/*unsigned int size = width * height;
for (unsigned int k = 0; k < size; ++k)
{
if (resultPtr[k] != inputA[k] + inputB[k])
{
LogError("Verification failed at %d: (%d + %d = %d)\n", k, inputA[k], inputB[k], resultPtr[k]);
result = false;
}
}*/
printf("Calculation result:\n");
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
printf("%7.2f\t", resultPtr[i * width + j]);
}
printf("\n");
}
// Unmapped the output buffer before releasing it
err = clEnqueueUnmapMemObject(ocl->commandQueue, ocl->dstMem, resultPtr, 0, NULL, NULL);
if (CL_SUCCESS != err)
{
LogError("Error: clEnqueueUnmapMemObject returned %s\n", TranslateOpenCLError(err));
}
return result;
}
/*
* main execution routine
* Basically it consists of three parts:
* - generating the inputs
* - running OpenCL kernel
* - reading results of processing
*/
int _tmain(int argc, TCHAR* argv[])
{
cl_int err;
ocl_args_d_t ocl;
cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
LARGE_INTEGER perfFrequency;
LARGE_INTEGER performanceCountNDRangeStart;
LARGE_INTEGER performanceCountNDRangeStop;
int arrayWidth = 4;//列
int arrayHeight = 5;//行
//initialize Open CL objects (context, queue, etc.)
if (CL_SUCCESS != SetupOpenCL(&ocl, deviceType))
{
return -1;
}
// allocate working buffers.
// the buffer should be aligned with 4K page and size should fit 64-byte cached line
int i, j;
float *inputA;
float *inputB;
float *outputC;
inputA = new float[arrayWidth * arrayHeight];// (float *)malloc(arrayWidth * arrayHeight * sizeof(float));
inputB = new float[arrayWidth * arrayHeight];//(float *)malloc(arrayWidth * arrayHeight * sizeof(float));
outputC = new float[arrayWidth * arrayHeight];//(float *)malloc(arrayWidth * arrayHeight * sizeof(float));
if (NULL == inputA || NULL == inputB || NULL == outputC)
{
LogError("Error: _aligned_malloc failed to allocate buffers.\n");
return -1;
}
//random input
generateInputA(inputA, arrayWidth, arrayHeight);
generateInputB(inputB, arrayWidth, arrayHeight);
printf("\n");
printf("A array data:\n");
for (i = 0; i < arrayHeight; i++) {
for (int j = 0; j < arrayWidth; j++) {
printf("%.2f\t", inputA[i * arrayWidth + j]);
}
printf("\n");
}
printf("B array data:\n");
for (i = 0; i < arrayHeight; i++) {
for (int j = 0; j < arrayWidth; j++) {
printf("%.2f\t", inputB[i * arrayWidth + j]);
}
printf("\n");
}
// Create OpenCL buffers from host memory
// These buffers will be used later by the OpenCL kernel
if (CL_SUCCESS != CreateBufferArguments(&ocl, inputA, inputB, outputC, arrayWidth, arrayHeight))
{
return -1;
}
// Create and build the OpenCL program
if (CL_SUCCESS != CreateAndBuildProgram(&ocl))
{
return -1;
}
// Program consists of kernels.
// Each kernel can be called (enqueued) from the host part of OpenCL application.
// To call the kernel, you need to create it from existing program.
ocl.kernel = clCreateKernel(ocl.program, "dataParallel", &err);
if (CL_SUCCESS != err)
{
LogError("Error: clCreateKernel returned %s\n", TranslateOpenCLError(err));
return -1;
}
// Passing arguments into OpenCL kernel.
if (CL_SUCCESS != SetKernelArguments(&ocl))
{
return -1;
}
// Regularly you wish to use OpenCL in your application to achieve greater performance results
// that are hard to achieve in other ways.
// To understand those performance benefits you may want to measure time your application spent in OpenCL kernel execution.
// The recommended way to obtain this time is to measure interval between two moments:
// - just before clEnqueueNDRangeKernel is called, and
// - just after clFinish is called
// clFinish is necessary to measure entire time spending in the kernel, measuring just clEnqueueNDRangeKernel is not enough,
// because this call doesn't guarantees that kernel is finished.
// clEnqueueNDRangeKernel is just enqueue new command in OpenCL command queue and doesn't wait until it ends.
// clFinish waits until all commands in command queue are finished, that suits your need to measure time.
bool queueProfilingEnable = true;
if (queueProfilingEnable)
QueryPerformanceCounter(&performanceCountNDRangeStart);
// Execute (enqueue) the kernel
if (CL_SUCCESS != ExecuteAddKernel(&ocl, arrayWidth, arrayHeight))
{
return -1;
}
if (queueProfilingEnable)
QueryPerformanceCounter(&performanceCountNDRangeStop);
// The last part of this function: getting processed results back.
// use map-unmap sequence to update original memory area with output buffer.
ReadAndVerify(&ocl, arrayWidth, arrayHeight, inputA, inputB);
// retrieve performance counter frequency
if (queueProfilingEnable)
{
QueryPerformanceFrequency(&perfFrequency);
LogInfo("NDRange performance counter time %f ms.\n",
1000.0f*(float)(performanceCountNDRangeStop.QuadPart - performanceCountNDRangeStart.QuadPart) / (float)perfFrequency.QuadPart);
}
delete[]inputA;
delete[]inputB;
delete[]outputC;
/*free(inputB);
free(outputC);*/
return 0;
}
输出结果如下:
Number of available platforms: 1
A array data:
1.00 2.00 3.00 4.00
5.00 6.00 7.00 8.00
9.00 10.00 11.00 12.00
13.00 14.00 15.00 16.00
17.00 18.00 19.00 20.00
B array data:
1.00 5.00 9.00 13.00
2.00 6.00 10.00 14.00
3.00 7.00 11.00 15.00
4.00 8.00 12.00 16.00
5.00 9.00 13.00 17.00
*****************************************************
0 2.000000 -3.000000 27.000000 0.307692
12 17.000000 6.000000 180.000000 1.000000
16 22.000000 9.000000 247.000000 1.176471
8 12.000000 3.000000 121.000000 0.800000
4 7.000000 0.000000 70.000000 0.571429
*****************************************************
Calculation result:
2.00 -3.00 27.00 0.31
7.00 0.00 70.00 0.57
12.00 3.00 121.00 0.80
17.00 6.00 180.00 1.00
22.00 9.00 247.00 1.18
NDRange performance counter time 8.532231 ms.
从执行结果来看,实现了设计的并行。*****号中间的 LOG可以看出,共分5个工作项并行。工作项的global_id分别为:0,4,8,12,16.
留个疑问,上述的分组,有时会有冲突,待以后解决。
测试环境:win10 + vs 2015 + intelGPU + opencl