opencl学习（三）

最新推荐文章于 2024-06-23 14:25:49 发布

CV码

最新推荐文章于 2024-06-23 14:25:49 发布

阅读量4k

点赞数 1

分类专栏： OpenCL 文章标签： OpenCL

本文链接：https://blog.csdn.net/eric41050808/article/details/10135781

版权

OpenCL 专栏收录该内容

11 篇文章 4 订阅

订阅专栏

opencl学习任务紧迫，仅看文字资料速度太慢，还是跟着大家一起通过尝试不同代码，边做边学。昨天把上次写的一维数组相加的代码做了修改，变成二维数组相加。初学者就是这么无聊，大家见谅！还是在freescale的i.MX6q上跑的，这个芯片的GPU中work group大小最大为1024，因此一维数组元素个数最大就是1024，二维中也只能width * height <= 1024。另外，此代码中通过事件的方式，利用opencl的API进行了kernel运算计时。借此可以慢慢了解一些事件相关的使用。

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>
#include <CL/cl.h>

using namespace std;

int main()
{
	int width = 32, height = 32;
	
	float *buf1 = 0;
	float *buf2 = 0;
	float *buf  = 0;
	
	buf1 = (float *)malloc(width * height * sizeof(float));
	buf2 = (float *)malloc(width * height * sizeof(float));
	buf  = (float *)malloc(width * height * sizeof(float));

	for(int i=0; i<32*32; i++)
	{
		buf1[i] = i*1.2;
		buf2[i] = i*1.5;
	}
	
	cl_int ret;
	//get platform id
	cl_uint numplatforms;
	cl_platform_id platform;
	cl_platform_id *platforms;
	
	ret = clGetPlatformIDs(0, NULL, &numplatforms);	
	if( ret != CL_SUCCESS )
	{
		printf("get platformID failed!");
		return 0;
	}
	platforms = new cl_platform_id[numplatforms];
	ret = clGetPlatformIDs(numplatforms, platforms, NULL);
	platform = platforms[0];//here we just use the first one of the platforms
	
	delete[] platforms;

	//get device id
	cl_uint numdevices;
	cl_device_id device;
	cl_device_id *devices;
	
	ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numdevices);
	if( ret != CL_SUCCESS )
	{
		printf("get deviceID failed!");
		return 0;
	}
	devices = new cl_device_id[numdevices];
	ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numdevices, devices, NULL);
	device = devices[0];
	delete[] devices;
	
	//create context
	cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);	
	
	//create commandqueue
	cl_command_queue queue = clCreateCommandQueue( context, device, CL_QUEUE_PROFILING_ENABLE, &ret );
	if( ret != CL_SUCCESS )
	{
		printf(" queue creation failure!\n");
		return 0;
	}

	//create opencl memory objects
	//and copy buf1 to clbuf1 implicitly
	//while copy buf2 to clbuf2 explicitly
	cl_mem clbuf1 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
									width*height*sizeof(cl_float), buf1, NULL);
	cl_mem clbuf2 = clCreateBuffer(context, CL_MEM_READ_ONLY, width*height*sizeof(cl_float), NULL, NULL);	
	
	ret = clEnqueueWriteBuffer(queue, clbuf2, 1, 0, width*height*sizeof(cl_float), buf2, 0, 0, NULL);// &writeEvt);

	cl_mem clbuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, width*height*sizeof(cl_float), NULL, NULL);
	
	const char* source = "	__kernel void vecadd(__global const float *a, \
							__global const float *b, __global const float *c)\
							{\
								int x = get_global_id(0);\
								int y = get_global_id(1);\
								int width = get_global_size(0);\
								int height = get_global_size(1);\
								c[x+y*width] = a[x+y*width]+b[x+y*width];\
							}";
	//create program object
	cl_program program = clCreateProgramWithSource(context,	1, &source, NULL, &ret);
	if( ret != CL_SUCCESS )
	{
		printf(" program creation failure!\n");
		return 0;
	}
	//build program object
	ret = clBuildProgram( program, 1, &device, NULL, NULL, NULL);
	if( ret != CL_SUCCESS )
	{
		printf(" program build failure!\n");
		return 0;
	}

	//create kernel
	cl_kernel kernel = clCreateKernel(program, "vecadd", &ret);
	if( ret != CL_SUCCESS )
	{
		printf("kernel creation failure!\n");
		return 0;
	}
	
	//set kernel argument
	cl_int num = width*height;
	clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&clbuf1);
	clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&clbuf2);
	clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&clbuf);

	//create kernel, rang 2dim, work itmes width*height
	size_t localx, localy;
	if(width/8 >4)
		localx = 16;
	else if(width<8)
		localx = width;
	else localx = 8;

	if(height/8 >4)
		localy = 16;
	else if(height<8)
		localy = height;
	else localy = 8;

	size_t globalThreads[] = {width, height};	
	size_t localThreads[] = {localx, localy};

	cl_event ev;
	//execu kernel
	ret = clEnqueueNDRangeKernel(queue, kernel, 2, 0, globalThreads, 
								localThreads, 0, NULL, &ev);
	if( ret != CL_SUCCESS )
	{
		if(ret == CL_INVALID_KERNEL) printf("invalid kernel!\n");
		printf("%d\n", ret);
		printf("execution failure!\n");
		return 0;
	}
	clFinish( queue );
	//calc the execu time
	cl_ulong startTime = 0, endTime = 0;
	clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, 
							sizeof(cl_ulong), &startTime, NULL);	
	clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, 
							sizeof(cl_ulong), &endTime, NULL);	
	cl_ulong time = endTime - startTime;
	printf("simple kernel start time: %8.6f ms\n", startTime*1e-6);
	printf("simple kernel end time: %8.6f ms\n", endTime*1e-6);
	printf("simple kernel exec time: %8.6f ms\n", time*1e-6);
	return 0;
}

本来想把同样的计时方式添加到上篇中一维数组的计算当中，但不论怎么修改，程序运行都正常，但计时始终显示为0，不能得到有效时间。有感兴趣的初学者朋友可以试试，ok的话给留个言。多谢。

刚说完问题就找到了，在创建命令队列的时候有一个参数需要使能。多亏一起学习的哥们帮忙，看来常用的API每一个参数的功能还是要好好看。

创建命令队列的函数原型为：

cl_command_queue clCreateCommandQueue ( cl_context context ,
cl_device_id device ,
cl_command_queue_properties properties,
cl_int *errcode_ret )

第一个参数为一个有效的上下文context。

第二个参数为与该上下文绑定的设备device。

第三个参数为要建立的队列的属性，有两个备选的属性，CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE和CL_QUEUE_PROFILING_ENABLE。前者决定该命令队列是否按顺序执行，还是乱序执行，如果设置了，则该命令队列将乱序执行，否则按顺序执行。第二个属性设置是否剖析命令。如果设置则剖析命令，否则不剖析。这个剖析命令的使能设置就影响了该命令执行计时的问题。详情请查询opencl手册。

第四个参数返回错误码。

具体问题如下：

此篇中，二维数组命令队列建立语句为：

//create commandqueue
cl_command_queue queue = clCreateCommandQueue( context, device, CL_QUEUE_PROFILING_ENABLE, &ret );

上篇中，一维数组由于没想使用事件，命令队列的建立语句为：

//Command-queue
queue = clCreateCommandQueue( context, device, 0, &errNum );

其中第三个参数在一维数组的计算中没有设置剖析命令，因此无法获取命令执行时间。设置为CL_QUEUE_PROFILING_ENABLE即可计算运行时间。

对比发现，在freescale的i.MX6q上，两个含有1024个元素的一维数组相加耗时，是两个32*32的二维数组相加耗时的至少两倍。