AMD-SDK的学习[2]--AtomicCounters

最新推荐文章于 2022-01-20 17:18:17 发布

元气少女缘结神

最新推荐文章于 2022-01-20 17:18:17 发布

阅读量449

点赞数

分类专栏： Improve Performance 文章标签： OpenCL linux

本文链接：https://blog.csdn.net/wd1603926823/article/details/78296915

版权

Improve Performance 专栏收录该内容

39 篇文章 8 订阅

订阅专栏

第二个例子：AtomicCounters：我依旧是改成了自己习惯看的样子

main：

#include <CL/cl.h>
#include <CL/cl_ext.h>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>
#include "a_needed_headers/SDKCommon.hpp"
#include "n_needed_headers/oclUtils.h"
using namespace std;
#define  GROUP_SIZE 256
int main()
{
	//set up OpenCL...
	cl_uint platformNum;
	cl_int status;
	status=clGetPlatformIDs(0,NULL,&platformNum);
	if(status!=CL_SUCCESS){
		printf("cannot get platforms number.\n");
		return -1;
	}
	cl_platform_id* platforms;
	platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformNum);
	status=clGetPlatformIDs(platformNum,platforms,NULL);
	if(status!=CL_SUCCESS){
		printf("cannot get platforms addresses.\n");
		return -1;
	}
	cl_platform_id platformInUse=platforms[0];
	cl_device_id device;
	status=clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_DEFAULT,1,&device,NULL);
	cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,&status);
	cl_command_queue_properties prop=0; //CL_QUEUE_PROFILING_ENABLE;
	cl_command_queue_properties *propers;
	propers=∝
	cl_command_queue commandQueue=clCreateCommandQueueWithProperties(context,device,propers, &status);
	std::ifstream srcFile("/home/jumper/OpenCL_projects/AMD-Sample-AtomicCounters/AtomicCounters_Kernels.cl");
	std::string srcProg(std::istreambuf_iterator<char>(srcFile),(std::istreambuf_iterator<char>()));
	const char * src = srcProg.c_str();
	size_t srclength = srcProg.length();
	cl_program program=clCreateProgramWithSource(context,1,&src,&srclength,&status);
	status=clBuildProgram(program,1,&device,NULL,NULL,&status);
	if (status != CL_SUCCESS)
	 {
		 cout<<"error:clBuildProgram()..."<<endl;
		 shrLogEx(LOGBOTH | ERRORMSG, status, STDERROR);
		 oclLogBuildInfo(program, oclGetFirstDev(context));
		 oclLogPtx(program, oclGetFirstDev(context), "oclproblem.ptx");
		 return(EXIT_FAILURE);
	 }


	//host data...
	// Make sure length is multiples of GROUP_SIZE
	cl_uint length=1024;
	length = (length / GROUP_SIZE);
	length = length ? length * GROUP_SIZE : GROUP_SIZE;
	cl_uint *input = (cl_uint*)malloc(length * sizeof(cl_uint));
	CHECK_ALLOCATION(input, "Allocation failed(input)");
	// Set the input data
	cl_uint value = 2;
	for(cl_uint i = 0; i < length; ++i)
	{
		input[i] = (cl_uint)(rand() % 5);
	}


	size_t globalWorkItems = length;
	size_t localWorkItems = GROUP_SIZE;
	//atomicCounters kernel process...
	cl_kernel counterKernel = clCreateKernel(program, "atomicCounters", &status);
	CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(counterKernel).");
	cl_mem inBuf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD, length * sizeof(cl_uint), NULL,&status);
	CHECK_OPENCL_ERROR(status, "clCreateBuffer failed.(inBuf)");
	// Set up data for input array
	cl_event writeEvt;
	status = clEnqueueWriteBuffer(commandQueue, inBuf,CL_FALSE,0,length * sizeof(cl_uint),input,0,NULL,&writeEvt);
	CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(inBuf) failed..");
	status = clFlush(commandQueue);
	CHECK_OPENCL_ERROR(status, "clFlush(commandQueue) failed.");
	status = clEnqueueWaitForEvents(commandQueue,1,&writeEvt);
	clReleaseEvent(writeEvt);
	CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(writeEvt) failed.");
	cl_mem counterOutBuf = clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(cl_uint),NULL,&status);
	CHECK_OPENCL_ERROR(status, "clCreateBuffer failed.(counterOutBuf).");
	cl_uint initValue=0;
	cl_event writeEvt1;
	status = clEnqueueWriteBuffer(commandQueue,counterOutBuf,CL_FALSE,0, sizeof(cl_uint),&initValue, 0, NULL,&writeEvt1);
	CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(counterOutBuf) failed.");
	// Initialize the counter value
	status = clFlush(commandQueue);
	CHECK_OPENCL_ERROR(status, "clFlush(commandQueue)failed.");
	// Wait for event and release event
	status = clEnqueueWaitForEvents(commandQueue,1,&writeEvt1);
	clReleaseEvent(writeEvt1);
	CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(writeEvt) failed.");
	// Set kernel arguments
	status = clSetKernelArg(counterKernel, 0, sizeof(cl_mem), &inBuf);
	CHECK_OPENCL_ERROR(status, "clSetKernelArg(inBuf) failed.");
	status = clSetKernelArg(counterKernel, 1, sizeof(cl_uint), &value);
	CHECK_OPENCL_ERROR(status, "clSetKernelArg(value) failed.");
	status = clSetKernelArg(counterKernel, 2, sizeof(cl_mem), &counterOutBuf);
	CHECK_OPENCL_ERROR(status, "clSetKernelArg(counterOutBuf) failed.");
	// Run Kernel
	cl_event ndrEvt;
	status = clEnqueueNDRangeKernel(commandQueue,counterKernel, 1, NULL,&globalWorkItems,&localWorkItems,0, NULL,&ndrEvt);
	CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel(counterKernel) failed.");
	status = clFlush(commandQueue);
	CHECK_OPENCL_ERROR(status, "clFlush(commandQueue) failed.");
	cl_int eventStatus = CL_QUEUED;
	while(eventStatus != CL_COMPLETE)
	{
		status = clGetEventInfo(ndrEvt, CL_EVENT_COMMAND_EXECUTION_STATUS,sizeof(cl_int),&eventStatus,NULL);
		CHECK_OPENCL_ERROR(status, "clGetEventInfo(ndrEvt) failed.");
	}
	// Get profiling information
	cl_ulong startTime1,endTime1;
	status = clGetEventProfilingInfo( ndrEvt, CL_PROFILING_COMMAND_START,sizeof(cl_ulong), &startTime1,NULL);
	CHECK_OPENCL_ERROR(status,"clGetEventProfilingInfo(CL_PROFILING_COMMAND_START) failed.");
	status = clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong), &endTime1, NULL);
	CHECK_OPENCL_ERROR(status,"clGetEventProfilingInfo(CL_PROFILING_COMMAND_END) failed.");
	double sec1 = 1e-9 * (endTime1 - startTime1);
	printf("counter kernel time: %f.\n",sec1);
	status = clReleaseEvent(ndrEvt);
	CHECK_OPENCL_ERROR(status, "clReleaseEvent(ndrEvt) failed.");
	// Get the occurrences of Value from atomicKernel
	cl_uint counterOut;
	cl_event readEvt;
	status = clEnqueueReadBuffer( commandQueue,counterOutBuf, CL_FALSE,0, sizeof(cl_uint), &counterOut,0, NULL,&readEvt);
	CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer(counterOutBuf) failed.");
	status = clFlush(commandQueue);
	CHECK_OPENCL_ERROR(status, "clFlush() failed.");
	// Wait for event and release event
	status = clEnqueueWaitForEvents(commandQueue,1,&readEvt);
	clReleaseEvent(readEvt);
	CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(readEvt) failed.");


	//atomicCounters kernel process...
	cl_kernel globalKernel = clCreateKernel(program, "globalAtomics", &status);
	CHECK_OPENCL_ERROR(status, "clCreateKernel(globalKernel) failed.");
	cl_mem globalOutBuf = clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(cl_uint),NULL,&status);
	CHECK_OPENCL_ERROR(status, "clCreateBuffer failed.(globalOutBuf).");
	cl_event writeEvt2;
	status = clEnqueueWriteBuffer(commandQueue,globalOutBuf, CL_FALSE, 0, sizeof(cl_uint),&initValue, 0, NULL, &writeEvt2);
	CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(globalOutBuf) failed.");
	status = clFlush(commandQueue);
	CHECK_OPENCL_ERROR(status, "clFlush() failed.");
	// Wait for event and release event
	status = clEnqueueWaitForEvents(commandQueue,1,&writeEvt2);
	clReleaseEvent(writeEvt2);
	CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(writeEvt) failed.");
	// Set kernel arguments
	status = clSetKernelArg(globalKernel, 0, sizeof(cl_mem), &inBuf);
	CHECK_OPENCL_ERROR(status, "clSetKernelArg(inBuf) failed.");
	status = clSetKernelArg(globalKernel, 1, sizeof(cl_uint), &value);
	CHECK_OPENCL_ERROR(status, "clSetKernelArg(value) failed.");
	status = clSetKernelArg(globalKernel, 2, sizeof(cl_mem), &globalOutBuf);
	CHECK_OPENCL_ERROR(status, "clSetKernelArg(globalOutBuf) failed.");
	// Run Kernel
	cl_event ndrEvt2;
	status = clEnqueueNDRangeKernel(commandQueue,globalKernel, 1,NULL,&globalWorkItems,&localWorkItems,0,NULL,&ndrEvt2);
	CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel(globalKernel) failed.");
	status = clFlush(commandQueue);
	CHECK_OPENCL_ERROR(status, "clFlush(commandQueue) failed.");
	cl_int eventStatus2 = CL_QUEUED;
	while(eventStatus2 != CL_COMPLETE)
	{
		status = clGetEventInfo(ndrEvt2,CL_EVENT_COMMAND_EXECUTION_STATUS,sizeof(cl_int),&eventStatus2,NULL);
		CHECK_OPENCL_ERROR(status, "clGetEventInfo(ndrEvt) failed.");
	}
	cl_ulong startTime2;
	cl_ulong endTime2;
	// Get profiling information
	status = clGetEventProfilingInfo(ndrEvt2,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime2,NULL);
	CHECK_OPENCL_ERROR(status,"clGetEventProfilingInfo(CL_PROFILING_COMMAND_START) failed.");
	status = clGetEventProfilingInfo(ndrEvt2,CL_PROFILING_COMMAND_END,sizeof(cl_ulong), &endTime2, NULL);
	CHECK_OPENCL_ERROR(status, "clGetEventProfilingInfo(CL_PROFILING_COMMAND_END) failed.");
	double sec2 = 1e-9 * (endTime2 - startTime2);
	printf("global kernel time: %f.\n",sec2);
	status = clReleaseEvent(ndrEvt2);
	CHECK_OPENCL_ERROR(status, "clReleaseEvent(ndrEvt) failed.");
	// Get the occurrences of Value from atomicKernel
	cl_uint globalOut;
	cl_event readEvt2;
	status = clEnqueueReadBuffer(commandQueue, globalOutBuf, CL_FALSE,0, sizeof(cl_uint), &globalOut, 0, NULL,&readEvt2);
	CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer(globalOutBuf) failed.");
	status = clFlush(commandQueue);
	CHECK_OPENCL_ERROR(status, "clFlush() failed.");
	// Wait for event and release event
	status = clEnqueueWaitForEvents(commandQueue,1,&readEvt2);
	clReleaseEvent(readEvt2);
	CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(readEvt) failed.");


	//cl_int status;
	status = clReleaseMemObject(inBuf);
	CHECK_OPENCL_ERROR(status, "clReleaseMemObject(inBuf) failed.");
	status = clReleaseMemObject(counterOutBuf);
	CHECK_OPENCL_ERROR(status, "clReleaseMemObject(counterOutBuf) failed.");
	status = clReleaseMemObject(globalOutBuf);
	CHECK_OPENCL_ERROR(status, "clReleaseMemObject(globalOutBuf) failed.");
	status = clReleaseKernel(counterKernel);
	CHECK_OPENCL_ERROR(status, "clReleaseKernel(counterKernel) failed.");
	status = clReleaseKernel(globalKernel);
	CHECK_OPENCL_ERROR(status, "clReleaseKernel(globalKernel) failed.");
	status = clReleaseProgram(program);
	CHECK_OPENCL_ERROR(status, "clReleaseProgram(program) failed.");
	status = clReleaseCommandQueue(commandQueue);
	CHECK_OPENCL_ERROR(status, "clReleaseCommandQueue(commandQueue) failed.");
	status = clReleaseContext(context);
	CHECK_OPENCL_ERROR(status, "clReleaseContext(context) failed.");
	free(input);

	return 0;
}

clCreateCommandQueueWithProperties()是那样子用的，但csdn好像在code里打印不出取地址符号。 cl部分：

/**
 * atomicCounters kernel.
 *
 * Each work-item looks at the element of `input` selected by its global id
 * and, when that element equals `value`, increments the atomic counter.
 * Relies on the cl_ext_atomic_counters_32 extension for counter32_t.
 */

#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable

__kernel void atomicCounters(volatile __global uint *input,
                             uint value,
                             counter32_t counter)
{
	const size_t gid = get_global_id(0);

	/* Nothing to do for elements that do not match. */
	if (input[gid] != value) {
		return;
	}

	atomic_inc(counter);
}

/**
 * globalAtomics kernel.
 *
 * Each work-item looks at the element of `input` selected by its global id
 * and, when that element equals `value`, atomically increments the first
 * element of the global buffer `counter`.
 */

#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
__kernel void globalAtomics(volatile __global uint *input,
                            uint value,
                            __global uint* counter)
{
	const size_t gid = get_global_id(0);

	/* Nothing to do for elements that do not match. */
	if (input[gid] != value) {
		return;
	}

	atomic_inc(&counter[0]);
}

补充：每一个 clReleaseEvent(evt) 之前都必须先调用 clWaitForEvents(1, &evt);（evt 为对应的事件变量）等待该事件完成，这一句之前漏掉了，要补上去！！！

但我的 OpenCL 实现不支持 counter32_t 类型以及 cl_ext_atomic_counters_32 扩展。我想想解决办法。。。

今天谷歌可以打开了，我用自己的谷歌打开了一会儿又打不开了！同事买了蓝灯的专业版他可以打开，我在他电脑上搜了下，果然这个扩展在2.0里。我1.2的版本没有：

For OpenCL 2 would be good to have:
*Atomic counters (cl_ext_atomic_counters_32) in core.. they provide an order of magnitude improvement vs global atomics at least on old D3D11 HW (Fermi, AMD 5xxx series) and are foundation of HW accelerated queues.
*Kernels can send interrupts to CPU and/or initiate host system calls.. that seems coming for a while I think even Fermi whitepaper suggested that but still no avaiable.. AMD SI support SEND_MSG in ISA as Lottes suggest in his blog so AMD should be able too..
*warp/wavefront vote functions: this functions are in NV HW since GTX 2xx (200 useful for example in currently most better dynamic mem allocator for GPUs see "Fast Dynamic Memory Allocator for Massively Parallel Architectures" they said:
"The used hardware must provide a voting function for an efficient implementation" thus seems and OpenCL port will need exposure of that..
*Dynamic parallelism: well that should be expected also now that GK110 is shipping and also seems SI could support some limited form of it as shown in a ADFS session..
*Named barriers: Well this is shipping in CUDA since Fermi days and can be used for warp specialization like in CUDADMA project that can bring better memory bandwith explotation in some apps and also as shown in HPP study can bring support for "true function composability" i.e. GPU functions that use barriers can call other GPU functions that use barriers without breaking expected usage see HPP paper by Gaster et al.
*Crossvendor MultiGPU like CUDA P2P functionality: i.e. memory from one GPU addressable by other GPU directly from kernel without previous copy (also present in cl_amd_bus_addressable in AMD OCL)
*Exposing some common intra warp/wavefront ops? (like existing NV shuffle.. makes sense more like median, min/max could be beneficial for platforms like Xeon Phi but not on GPUs)
*Expose some cross vendor multimedia extension ISAs? i.e. some common cl_amd_media_ops/cl_amd_media_ops2 and ptx SIMD instructions.. this can be good jointly with interop with video encoders and encoders for accelerated video processing and even NV uses in their fast raytracing kernels..
*Finalize to bring parity vs exisiting compute exposure in graphics APIs like OGL 4.3/D3D 11 compute shaders: like said atomic counters where one thing..
->other being new gather4 instuctions..
->DispatchComputeIndirect: i.e. ability to launch kernel with size of workgroup total size being fetched from GPU mem.. it's more efficient for variable work kernels that depend on work generated by a previous kernel.. in this case we avoid a CPU trip but note that could be done with new Dynamic Parallelism so perhaps doesn't need to be exposed..
->Promote into core MSAA and depth extensions
->MipMap support like in CUDA 5
->compressed tex formats support
->a cross vendor extension for bindless support (assuming will get broad support in coming years)
->cross vendor ext for sparse texture/buffer support..

所以这个例子运行不了，我就不继续探究了。

大神说，不要用这种老式的扩展了，而在buildprogram时使用：-legacy -Dcl_ext_atomic_counters_32 如https://community.amd.com/message/2786372 所说。但我编译没通过？然后报的错误是乱码？

元气少女缘结神

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
打赏
0
评论
AMD-SDK的学习[2]--AtomicCounters

第二个例子：AtomicCounters：我依旧是改成了自己习惯看的样子main：#include #include #include #include "a_needed_headers/SDKCommon.hpp"#include "n_needed_headers/oclUtils.h"using namespace std;#define GROUP_SIZE 256
复制链接

扫一扫