OpenCL Programming Template -- Juliet

最新推荐文章于 2024-08-20 16:04:23 发布

iJuliet

最新推荐文章于 2024-08-20 16:04:23 发布

阅读量5.4k

点赞数 1

分类专栏： C/C++ GPU/CUDA/OpenCL 文章标签： null float profiling integer struct delete

本文链接：https://blog.csdn.net/ijuliet/article/details/5266189

版权

GPU/CUDA/OpenCL 同时被 2 个专栏收录

12 篇文章

订阅专栏

C/C++

8 篇文章

订阅专栏

本文提供了一个OpenCL程序模板，用于测试不同设备上的浮点运算性能，包括加法、乘法等操作，并通过自定义函数获取运行时间。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

很久了，CSDN博客图片一直处于和谐态，灰常影响博友心情=_=!!!

这里(http://download.csdn.net/source/2030769)有OpenCL开发流程，Platforms/Devices/Context概念澄清的图解，需要的朋友可以下载看看，免积分。本篇我写OCL程序的一个模板，拿出来晒晒太阳，大家喜欢了可以拿去用。CUDA与OpenCL的裙带关系已然大白于天下。下一篇博客我会对比CUDA与OpenCL的相关函数，有兴趣的朋友可以做一个转换工具，于人于己也就方便多了。言归正传，代码说事儿：

#include <stdio.h>

#include <Cl/cl.h>

//InitTimer()、GetTimer()

#include <intrin.h>

#include <windows.h>

//we refuse to use oclUtils.h, including oclXXX() or shrXXX() because they are not OCL intrinsic functions.

//#include <oclUtils.h>

#include "FloatTest.h"

cl_context cxGPUContext = NULL;

cl_device_id device;

cl_command_queue cqCommandQueue = NULL;

cl_mem d_data = NULL;

cl_int ciErrNum = CL_SUCCESS;

cl_program cpProgram = NULL, cpProgram1 = NULL, cpProgram2=NULL, cpProgram3 = NULL; //Because the length of the lenght is limited, so a big .cl needs several cpPrograms.

size_t program_length = 0, program_length1=0, program_length2=0, program_length3 = 0;

cl_kernel kernel = NULL;

size_t max_item[4];

size_t localWorkSize;

size_t globalWorkSize;

const int testnum = 32;

double time[256] = {0.0};

unsigned long ops_cnt = 1 * 1024 * 1024 * 1024UL;

const float mem_size = sizeof(float) * 65536;

static int k = 0;

bool itemFlag = true;

typedef struct

{

cl_platform_id platform;

cl_device_id* devices;

cl_uint numDevices;

}PD;*/

PD* platforms;

cl_uint numPlatforms;

int main()

{

unsigned int i;

//Get counterFreq of your CPU, used in InitTimer()/GetTimer()

counterFreq = GetCPUSpeed();

//Get the number of the platforms

ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);

if(ciErrNum != CL_SUCCESS) return 1;

if( numPlatforms > 0)

{

platforms = new PD[numPlatforms];

//malloc for these platforms

cl_platform_id* platformsTem = new cl_platform_id[numPlatforms];

//get these platforms

ciErrNum = clGetPlatformIDs(numPlatforms, platformsTem, NULL);

CHECK(ciErrNum);

for(i=0; i<numPlatforms; i++)

platforms[i].platform = platformsTem[i];

delete[] platformsTem;

cl_uint maxPerPlatform = 0;

for(i = 0; i < numPlatforms; i++)

{

char pbuf[100];

//get detailed info about this platform,e.g., CL_PLATFORM_NAME, /version/vendor

ciErrNum = clGetPlatformInfo(platforms[i].platform, CL_PLATFORM_NAME, sizeof(pbuf), pbuf, NULL);

CHECK(ciErrNum);

//get the number of devices supported by this platform.(type could be CL_DEVICE_TYPE_ALL)

ciErrNum = clGetDeviceIDs(platforms[i].platform, CL_DEVICE_TYPE_GPU, 0, NULL, &(platforms[i].numDevices));

CHECK(ciErrNum);

if(platforms[i].numDevices > maxPerPlatform)

maxPerPlatform = platforms[i].numDevices;

//get these devices supported by this platform

platforms[i].devices = new cl_device_id[platforms[i].numDevices];

//get one device is simple

//clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);

ciErrNum = clGetDeviceIDs(platforms[i].platform, CL_DEVICE_TYPE_GPU, platforms[i].numDevices, platforms[i].devices, NULL);

CHECK(ciErrNum);

}//for

/* get platforms and its supporting devices through oclXXX()

{

//get platforms

char cBuffer[1024];

cl_platform_id cpPlatform = NULL;

oclGetPlatformID(&cpPlatform);

clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);

//get devices

clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount);

cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) );

clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL);

clGetDeviceInfo(cdDevices[i], CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL);

oclPrintDevInfo(LOGBOTH, cdDevices[i]);

//record into log

std::string sProfileString = "oclDeviceQuery,[Platform]Device Name = ";

sProfileString += ", Device = ";

sProfileString += cBuffer;

sProfileString += "/n";

shrLogEx(LOGBOTH | MASTER, 0, sProfileString.c_str());

}

if(maxPerPlatform < 0)

return 1;

double* dpTime = new double[ 10 * numPlatforms * maxPerPlatform]; //item <= 10

unsigned long item = 0;

int flag = FloatTest(dpTime, &item, &ciErrNum, platforms, numPlatforms);

if(flag != 0 ) return 1;

for(int j=0; j< 5; j++)

printf("%.3lf %.3lf GLOPS/n", dpTime[j] / 1000000000.0,dpTime[j+5]/ 1000000000.0);

delete[] dpTime;

for(i = 0; i < numPlatforms; i++)

delete[] platforms[i].devices;

delete[] platforms;

}

else

{

return 1;

}

@ double* dpTime, 返回数据的数组；

@ unsigned long* item, 测试项的个数，在每个平台下的每个设备上都会测这几个测试项，例如Add、Mul、Mad；

@ cl_int* pciErrNum，错误代号；

@ PD* platforms，自定义的PD类型的platforms[]；

@ cl_uint numPlatforms，用来说明平台个数，即platforms[]数组元素个数；

@ __int64 counterFreq_main，计时函数中需要用到的一个参数（注：下面的函数签名中并没有这个参数，计时函数直接使用头文件中定义的全局变量 counterFreq）；

int FloatTest(double* dpTime, unsigned long* item, cl_int* pciErrNum, PD* platforms, cl_uint numPlatforms)

{

cl_uint pi,di;

for(pi = 0; pi<numPlatforms; pi++)

{

cxGPUContext = clCreateContext(0, platforms[pi].numDevices, platforms[pi].devices, NULL, NULL, &*pciErrNum);

//-----------reate context---------NO clCreateContextFromType()，.x and later driver doesn't support it well

cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &*pciErrNum); //Jun: cxGPUContext = clCreateContext(0, pInfoDlg->m_iCLDeviceBMCount, pInfoDlg->m_idCLDeviceListBM, NULL, NULL, &ciErrNum);

//--------choose device----------

//Above, we use platform info to get devices info.

//Here, we could get devices info from context.We could get devices under certain platform(context <-> platform), or, we get all devices(just one context).

size_t nDeviceBytes;

*pciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);

ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id);

for(unsigned int i = 0; i < ciDeviceCount; ++i)

{

// get and print the device for this queue

device = oclGetDev(cxGPUContext, i);

oclPrintDevName(LOGBOTH, device);

}

//program setup, create the program, build the program, create d_data

int flag = subFloatTest(pciErrNum);

if(flag != 0) return 1; //one fail, then return.

for(di = 0; di<platforms[pi].numDevices; di++)

{

device = (platforms[pi].devices)[di];

//Attention: some global var may be changed during one iteration, so we assign it the initial value.

//If the value will not be changed, I suggest to declare it as "const"

ops_cnt = 1 * 1024 * 1024 * 1024UL;

runOnDevice(dpTime, item, pciErrNum);

}

//release

clReleaseProgram(cpProgram);

clReleaseProgram(cpProgram1);

clReleaseProgram(cpProgram2);

clReleaseProgram(cpProgram3);

clReleaseMemObject(d_data);

clReleaseContext(cxGPUContext);

}

/*
 * Per-platform setup: creates the four programs from the source strings in
 * FloatTest.h, builds them for the devices of cxGPUContext and creates the
 * d_data device buffer.
 *
 * pciErrNum - receives the OpenCL error code of the last call
 *
 * Returns 0 on success, 1 on the first failing OpenCL call. Objects created
 * before the failure are left in the globals for the caller to release.
 */
int subFloatTest(cl_int* pciErrNum)
{
    // Program setup: a .h file (char[]) replaces the .cl file.
    // The program_length* globals are 0; NOTE(review): per the OpenCL
    // specification a zero length means the string is null-terminated.
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&FloatTestCLsource, &program_length, pciErrNum);
    CHECK(*pciErrNum);

    cpProgram1 = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&kernelMul, &program_length1, pciErrNum);
    CHECK(*pciErrNum);

    cpProgram2 = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&kernelMAD, &program_length2, pciErrNum);
    CHECK(*pciErrNum);

    cpProgram3 = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&kernelSF, &program_length3, pciErrNum);
    CHECK(*pciErrNum);

    // Alternative when the kernels live in a .cl file. Kept as a comment: left
    // active it replaced (and leaked) the cpProgram created above and depends on
    // oclLoadProgSource() from oclUtils, which this template does not use.
    /*
    char* source_path = "FloatTest.cl";
    char *source = oclLoadProgSource(source_path, "", &program_length);
    cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
                    (const char **)&source, &program_length, pciErrNum);
    if(CL_SUCCESS != *pciErrNum ) return 1;
    */

    // Build the programs.
    *pciErrNum = clBuildProgram(cpProgram, 0, NULL, "", NULL, NULL);
    CHECK(*pciErrNum);

    *pciErrNum = clBuildProgram(cpProgram1, 0, NULL, "", NULL, NULL);
    CHECK(*pciErrNum);

    *pciErrNum = clBuildProgram(cpProgram2, 0, NULL, "", NULL, NULL);
    CHECK(*pciErrNum);

    *pciErrNum = clBuildProgram(cpProgram3, 0, NULL, "", NULL, NULL);
    CHECK(*pciErrNum);

    // Device buffer handed to every kernel; no host data is attached.
    d_data = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, (size_t)mem_size, NULL, pciErrNum);
    CHECK(*pciErrNum);

    // Variant that wraps existing host memory:
    //cl_mem h_A = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,mem_size_A, h_A_data, &ciErrNum);

    return 0;
}

// Times one kernel through ms() and, when a positive time was measured, appends
// its throughput (ops_cnt per second, multiplied by `scale`) at results[k++].
// Returns 0 on success, 1 when ms() failed; in that case *pciErrNum still holds
// the OpenCL error code stored by ms().
static int recordThroughput(double* results, const char* kernelName, cl_int* pciErrNum, int coef, int id, double scale)
{
    double elapsedMs = 0.0;

    // The return value of ms() is only a 0/1 status. It is not stored into
    // *pciErrNum, because that would replace the real OpenCL error code.
    if (ms(&elapsedMs, kernelName, pciErrNum, coef, id) != 0)
        return 1;

    if (elapsedMs > 0.0)
        results[k++] = ops_cnt / (elapsedMs / 1000.0) * scale;

    return 0;
}

/*
 * Runs the five float tests on the current global `device` and appends one
 * throughput value (operations per second) per test to AddIOPS[].
 *
 * AddIOPS   - result array, written at the global index k
 * item      - receives the value of k after the first call only
 * pciErrNum - receives the OpenCL error code of a failing call
 *
 * Returns 0 on success, 1 as soon as one test fails.
 */
int runOnDevice(double* AddIOPS, unsigned long* item, cl_int* pciErrNum)
{
    if (recordThroughput(AddIOPS, "floatAddTest", pciErrNum, 1024, 0, 1.0) != 0)
        return 1;

    if (recordThroughput(AddIOPS, "floatMulTest", pciErrNum, 1024, 1, 1.0) != 0)
        return 1;

    // The MAD result is counted twice per operation (multiply + add).
    if (recordThroughput(AddIOPS, "floatMADTest", pciErrNum, 1024, 2, 2.0) != 0)
        return 1;

    // The two remaining tests use a smaller operation count.
    ops_cnt = 128 * 1024 * 1024UL;

    if (recordThroughput(AddIOPS, "floatSFTest", pciErrNum, 16, 3, 1.0) != 0)
        return 1;

    if (recordThroughput(AddIOPS, "floatNativeSFTest", pciErrNum, 16, 3, 1.0) != 0)
        return 1;

    // Report the number of test items once, after the first device.
    if (itemFlag == true)
    {
        *item = k;
        itemFlag = false;
    }

    return 0;
}

int ms(double *final,const char* kernelName, cl_int *pciErrNum, int coef,int id)

{

// create a command-queue

cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &*pciErrNum);

if(CL_SUCCESS != *pciErrNum )

return 1;

//which kernel

switch(id)

{

case 0: kernel = clCreateKernel(cpProgram, kernelName, &*pciErrNum); break;

case 1: kernel = clCreateKernel(cpProgram1, kernelName, &*pciErrNum); break;

case 2: kernel = clCreateKernel(cpProgram2, kernelName, &*pciErrNum); break;

case 3: kernel = clCreateKernel(cpProgram3, kernelName, &*pciErrNum); break;

}

if(CL_SUCCESS != *pciErrNum )

return 1;

//setup execution parameter

clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_item), max_item, NULL);

localWorkSize = max_item[0] < 256 ? max_item[0] : 256;

globalWorkSize = ops_cnt / coef;

//set kernel arg

*pciErrNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&d_data);

if(CL_SUCCESS != *pciErrNum )

return 1;

//input data in device memory. clCreateBuffer + clEnqueueCopyBuffer()

d_A[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, workSize[i] * sizeof(float) * WA, NULL,NULL);

clEnqueueCopyBuffer(commandQueue[i], h_A, d_A[i], workOffset[i] * sizeof(float) * WA, 0, workSize[i] * sizeof(float) * WA, 0, NULL, NULL);

//or, we do like this: create + copy

d_B[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,

mem_size_B, h_B_data, NULL);

//Asynchronous write of data to GPU device

clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcA, 0, NULL, NULL);

//warmup so we don't time driver startup

*pciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, kernel, 1, 0,&globalWorkSize, &localWorkSize, 0, NULL, NULL);

if(CL_SUCCESS != *pciErrNum ) return 1;

//event

//cl_event kernel_completion;

//foreach testnum

for(int i = 0; i< testnum; i++)

{

// shrDeltaT(0); //us

InitTimer();

*pciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, kernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); //&kernel_completion

if(CL_SUCCESS != *pciErrNum ) return 1;

// clWaitForEvents(1, &kernel_completion);

*pciErrNum = clFinish(cqCommandQueue);

if(CL_SUCCESS != *pciErrNum ) return 1;

// time[i] = shrDeltaT(0) * 1000.0; //ms

time[i] = GetTimer(); //ms

}

//clReleaseEvent(kernel_completion); //we use clFinish() or event to synchronize CPU and GPU

// Read back results and check accumulated errors

//clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);

//release

clReleaseKernel(kernel);

clReleaseCommandQueue(cqCommandQueue);

//get ms

*final = GetTimeMin(testnum);

return 0;

}

// Returns the smallest of the first `len` entries of the global time[] array.
// time[0] is always read, so it is the result when len is 1 or less.
double GetTimeMin(int len)
{
    double smallest = time[0];
    int idx = 1;

    while (idx < len)
    {
        smallest = (time[idx] < smallest) ? time[idx] : smallest;
        ++idx;
    }

    return smallest;
}

__inline void InitTimer(void)

{

counterT0 = __rdtsc();

}

__inline double GetTimer(void)

{

return (__rdtsc() - counterT0) * 1000.0 / counterFreq;

}

/*
 * Estimates the time-stamp-counter frequency in ticks per second by counting
 * the ticks that elapse during one fifth of a second, measured with the
 * performance counter. Stores the result in the global counterFreq and also
 * returns it.
 */
unsigned __int64 GetCPUSpeed(void)
{
    LARGE_INTEGER qpcFreq, qpcNow, qpcStop;

    QueryPerformanceFrequency(&qpcFreq);

    // __rdtsc() replaces the hand-emitted RDTSC opcode (0x0F 0x31) and the
    // 32-bit-only register moves; the rest of this file already uses it.
    const unsigned __int64 start = __rdtsc();

    QueryPerformanceCounter(&qpcStop);
    qpcStop.QuadPart += qpcFreq.QuadPart / 5;   // stop 1/5 second from now

    // Busy-wait until the stop time. The "do" matters: without it the loop
    // condition never sees an updated counter and the wait never ends.
    do
    {
        QueryPerformanceCounter(&qpcNow);
    } while (qpcNow.QuadPart < qpcStop.QuadPart);

    const unsigned __int64 stop = __rdtsc();

    // Ticks in 1/5 s, times 5, gives ticks per second.
    counterFreq = (stop - start) * 5;

    return counterFreq;
}

//--------------------.h---------------

#ifndef INT32TEST_H
#define INT32TEST_H

#define CHECK(i) if((i)!=CL_SUCCESS) return 1;

const char *FloatTestCLsource = " /
__kernel void floatAddTest(__global float *dummy_buf) /

{ /

} /
";

typedef struct
{
cl_platform_id platform;
cl_device_id* devices;
cl_uint numDevices;
}PD;
__int64 counterT0, counterFreq;

int FloatTest(double* dpTime, unsigned long* item, cl_int* pciErrNum, PD* platforms, cl_uint numPlatforms);
int subFloatTest(cl_int* pciErrNum);
int runOnDevice(double* AddIOPS, unsigned long* item, cl_int* pciErrNum);
int ms(double *final,const char* kernelName, cl_int *pciErrNum, int coef, int id);
double GetTimeMin(int len);
__inline void InitTimer(void);
__inline double GetTimer(void);
unsigned __int64 GetCPUSpeed(void);

#endif

//some log functions

// NOTE(review): the calls below are stand-alone usage examples of the SDK
// helper functions (shrXXX / oclXXX); they are not part of the program above
// and the variables they use are not declared in this file.
shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");

shrSetLogFileName("log.txt");

// Escape sequences restored: "\n" and "\t" had been written as "/n" and "/t",
// which would be printed literally.
shrLog("Hello World!!!\n\n");

shrLog(" CL_PLATFORM_NAME: \t%s\n", cBuffer);

shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);

oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));

oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDotProduct.ptx");

//some checking functions

oclCheckError(ciErrNum, CL_SUCCESS);

shrCheckErrorEX (ciErrNum, CL_SUCCESS, pCleanup);

//some filling and result-checking functions

void* srcA = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);

shrFillArray((float*)srcA, 4 * iNumElements);

shrDiffArray((const float*)dst, (const float*)Golden, iNumElements);

shrCompareL2fe(reference, h_C, size_C, 1e-6f);

//print WINDOWS systemInfo:

#ifdef _WIN32

// Stand-alone example: query processor, OS version and local time through the
// Win32 API and write them to the log with shrLog().
SYSTEM_INFO stProcInfo; // processor info struct
OSVERSIONINFO stOSVerInfo; // Win OS info struct
SYSTEMTIME stLocalDateTime; // local date / time struct

// processor
SecureZeroMemory(&stProcInfo, sizeof(SYSTEM_INFO));
GetSystemInfo(&stProcInfo);

// OS: dwOSVersionInfoSize is set before the structure is passed to GetVersionEx()
SecureZeroMemory(&stOSVerInfo, sizeof(OSVERSIONINFO));
stOSVerInfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
GetVersionEx(&stOSVerInfo);

// date and time
GetLocalTime(&stLocalDateTime);

// write time and date to logs
// ("\n" restored: it had been written as "/n", which would be printed literally)
shrLog(" Local Time/Date = %i:%i:%i, %i/%i/%i\n",
       stLocalDateTime.wHour, stLocalDateTime.wMinute, stLocalDateTime.wSecond,
       stLocalDateTime.wMonth, stLocalDateTime.wDay, stLocalDateTime.wYear);

// write proc and OS info to logs
shrLog(" CPU Arch: %i\n CPU Level: %i\n # of CPU processors: %u\n Windows Build: %u\n Windows Ver: %u.%u\n\n\n",
       stProcInfo.wProcessorArchitecture, stProcInfo.wProcessorLevel, stProcInfo.dwNumberOfProcessors,
       stOSVerInfo.dwBuildNumber, stOSVerInfo.dwMajorVersion, stOSVerInfo.dwMinorVersion);

#endif

//oclMatrixMul use event to time:

// Returns the time between the CL_PROFILING_COMMAND_START and
// CL_PROFILING_COMMAND_END values of `event`, in seconds.
// The results of the two profiling queries are not checked.
double executionTime(cl_event &event)
{
    cl_ulong tBegin, tFinish;

    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &tFinish, NULL);
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &tBegin, NULL);

    // The profiling values are in nanoseconds; scale the difference to seconds.
    const cl_ulong elapsedNs = tFinish - tBegin;
    return (double)1.0e-9 * elapsedNs;
}