OpenCL学习任务紧迫,仅看文字资料速度太慢,还是跟着大家一起尝试不同的代码,边做边学。昨天把上次写的一维数组相加的代码做了修改,变成二维数组相加。初学者就是这么无聊,大家见谅!代码还是在freescale的i.MX6q上跑的,这个芯片的GPU中work group大小最大为1024,因此一维数组元素个数最大就是1024,二维中也只能width * height <= 1024。另外,此代码通过事件的方式,利用OpenCL的API对kernel运算进行了计时。借此可以慢慢了解一些事件相关的使用。
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>
#include <CL/cl.h>
using namespace std;
int main()
{
int width = 32, height = 32;
float *buf1 = 0;
float *buf2 = 0;
float *buf = 0;
buf1 = (float *)malloc(width * height * sizeof(float));
buf2 = (float *)malloc(width * height * sizeof(float));
buf = (float *)malloc(width * height * sizeof(float));
for(int i=0; i<32*32; i++)
{
buf1[i] = i*1.2;
buf2[i] = i*1.5;
}
cl_int ret;
//get platform id
cl_uint numplatforms;
cl_platform_id platform;
cl_platform_id *platforms;
ret = clGetPlatformIDs(0, NULL, &numplatforms);
if( ret != CL_SUCCESS )
{
printf("get platformID failed!");
return 0;
}
platforms = new cl_platform_id[numplatforms];
ret = clGetPlatformIDs(numplatforms, platforms, NULL);
platform = platforms[0];//here we just use the first one of the platforms
delete[] platforms;
//get device id
cl_uint numdevices;
cl_device_id device;
cl_device_id *devices;
ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numdevices);
if( ret != CL_SUCCESS )
{
printf("get deviceID failed!");
return 0;
}
devices = new cl_device_id[numdevices];
ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numdevices, devices, NULL);
device = devices[0];
delete[] devices;
//create context
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
//create commandqueue
cl_command_queue queue = clCreateCommandQueue( context, device, CL_QUEUE_PROFILING_ENABLE, &ret );
if( ret != CL_SUCCESS )
{
printf(" queue creation failure!\n");
return 0;
}
//create opencl memory objects
//and copy buf1 to clbuf1 implicitly
//while copy buf2 to clbuf2 explicitly
cl_mem clbuf1 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
width*height*sizeof(cl_float), buf1, NULL);
cl_mem clbuf2 = clCreateBuffer(context, CL_MEM_READ_ONLY, width*height*sizeof(cl_float), NULL, NULL);
ret = clEnqueueWriteBuffer(queue, clbuf2, 1, 0, width*height*sizeof(cl_float), buf2, 0, 0, NULL);// &writeEvt);
cl_mem clbuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, width*height*sizeof(cl_float), NULL, NULL);
const char* source = " __kernel void vecadd(__global const float *a, \
__global const float *b, __global const float *c)\
{\
int x = get_global_id(0);\
int y = get_global_id(1);\
int width = get_global_size(0);\
int height = get_global_size(1);\
c[x+y*width] = a[x+y*width]+b[x+y*width];\
}";
//create program object
cl_program program = clCreateProgramWithSource(context, 1, &source, NULL, &ret);
if( ret != CL_SUCCESS )
{
printf(" program creation failure!\n");
return 0;
}
//build program object
ret = clBuildProgram( program, 1, &device, NULL, NULL, NULL);
if( ret != CL_SUCCESS )
{
printf(" program build failure!\n");
return 0;
}
//create kernel
cl_kernel kernel = clCreateKernel(program, "vecadd", &ret);
if( ret != CL_SUCCESS )
{
printf("kernel creation failure!\n");
return 0;
}
//set kernel argument
cl_int num = width*height;
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&clbuf1);
clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&clbuf2);
clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&clbuf);
//create kernel, rang 2dim, work itmes width*height
size_t localx, localy;
if(width/8 >4)
localx = 16;
else if(width<8)
localx = width;
else localx = 8;
if(height/8 >4)
localy = 16;
else if(height<8)
localy = height;
else localy = 8;
size_t globalThreads[] = {width, height};
size_t localThreads[] = {localx, localy};
cl_event ev;
//execu kernel
ret = clEnqueueNDRangeKernel(queue, kernel, 2, 0, globalThreads,
localThreads, 0, NULL, &ev);
if( ret != CL_SUCCESS )
{
if(ret == CL_INVALID_KERNEL) printf("invalid kernel!\n");
printf("%d\n", ret);
printf("execution failure!\n");
return 0;
}
clFinish( queue );
//calc the execu time
cl_ulong startTime = 0, endTime = 0;
clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START,
sizeof(cl_ulong), &startTime, NULL);
clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &endTime, NULL);
cl_ulong time = endTime - startTime;
printf("simple kernel start time: %8.6f ms\n", startTime*1e-6);
printf("simple kernel end time: %8.6f ms\n", endTime*1e-6);
printf("simple kernel exec time: %8.6f ms\n", time*1e-6);
return 0;
}
本来想把同样的计时方式添加到上篇一维数组的计算当中,但不论怎么修改,程序虽然运行正常,计时却始终显示为0,得不到有效时间。有感兴趣的初学者朋友可以试试,ok的话给留个言。多谢。
刚说完问题就找到了,在创建命令队列的时候有一个参数需要使能。多亏一起学习的哥们帮忙,看来常用的API每一个参数的功能还是要好好看。
创建命令队列的函数原型为:
cl_command_queue clCreateCommandQueue ( cl_context context ,
cl_device_id device ,
cl_command_queue_properties properties,
cl_int *errcode_ret )
第一个参数为一个有效的上下文context。
第二个参数为与该上下文绑定的设备device。
第三个参数为要建立的队列的属性,有两个备选的属性,CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE和CL_QUEUE_PROFILING_ENABLE。前者决定该命令队列是否按顺序执行,还是乱序执行,如果设置了,则该命令队列将乱序执行,否则按顺序执行。第二个属性设置是否剖析命令。如果设置则剖析命令,否则不剖析。这个剖析命令的使能设置就影响了该命令执行计时的问题。 详情请查询opencl手册。
第四个参数返回错误码。
具体问题如下:
此篇中,二维数组命令队列建立语句为:
//create commandqueue
cl_command_queue queue = clCreateCommandQueue( context, device, CL_QUEUE_PROFILING_ENABLE, &ret );
上篇中,一维数组由于没想使用事件,命令队列的建立语句为:
//Command-queue
queue = clCreateCommandQueue( context, device, 0, &errNum );
其中第三个参数在一维数组的计算中没有设置剖析命令,因此无法获取命令执行时间。设置为CL_QUEUE_PROFILING_ENABLE即可计算运行时间。
对比发现,在freescale的i.MX6q上,两个含有1024个元素的一维数组相加耗时,是两个32*32的二维数组相加耗时的至少两倍。