OpenCL程序完整例子

#include<iostream>

#include "common.h"


#define DATA_SIZE 8388608

using namespace std;


/*
 * Compare the `w` field of the first DATA_SIZE elements of two UserData arrays.
 *
 * to, from: arrays holding at least DATA_SIZE elements each.
 * Returns 1 when every pair of `w` values is equal, 0 at the first mismatch.
 * Only `w` is inspected; x, y and z are ignored.
 */
int valuesOK(UserData* to, UserData* from)
{
	int idx = 0;
	while (idx < DATA_SIZE) {
		if (to[idx].w != from[idx].w) {
			return 0;
		}
		++idx;
	}
	return 1;
}







int buffer_query(int argc, char* argv[])
{
	/* OpenCL 1.1 data structures */
	cl_platform_id* platforms;
	cl_program program;
	cl_device_id device;
	cl_context context;

	/* OpenCL 1.1 scalar data types */
	cl_uint numOfPlatforms;
	cl_int  error;

	/*
	Prepare an array of UserData via dynamic memory allocation
	*/
	UserData* ud_in = (UserData*)malloc(sizeof(UserData) * DATA_SIZE); // input to device
	UserData* ud_out = (UserData*)malloc(sizeof(UserData) * DATA_SIZE); // output from device
	for (int i = 0; i < DATA_SIZE; ++i) {
		(ud_in + i)->x = i;
		(ud_in + i)->y = i;
		(ud_in + i)->z = i;
		(ud_in + i)->w = 3 * i;
	}
	/*
	Get the number of platforms
	Remember that for each vendor's SDK installed on the computer,
	the number of available platform also increased.
	*/
	error = clGetPlatformIDs(0, NULL, &numOfPlatforms);
	if (error != CL_SUCCESS) {
		perror("Unable to find any OpenCL platforms");
		exit(1);
	}

	platforms = (cl_platform_id*)alloca(sizeof(cl_platform_id) * numOfPlatforms);
	printf("Number of OpenCL platforms found: %d\n", numOfPlatforms);

	error = clGetPlatformIDs(numOfPlatforms, platforms, NULL);
	if (error != CL_SUCCESS) {
		perror("Unable to find any OpenCL platforms");
		exit(1);
	}
	// Search for a CPU/GPU device through the installed platforms
	// Build a OpenCL program and do not run it.
	for (cl_uint i = 0; i < numOfPlatforms; i++) {
		// Get the GPU device
		error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 1, &device, NULL);
		if (error != CL_SUCCESS) {
			// Otherwise, get the CPU
			error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_CPU, 1, &device, NULL);
		}
		if (error != CL_SUCCESS) {
			perror("Can't locate any OpenCL compliant device");
			exit(1);
		}
		/* Create a context */
		context = clCreateContext(NULL, 1, &device, NULL, NULL, &error);
		if (error != CL_SUCCESS) {
			perror("Can't create a valid OpenCL context");
			exit(1);
		}

		/* Load the two source files into temporary datastores */
		const char *file_names[] = { "user_test.cl" };
		const int NUMBER_OF_FILES = 1;
		char* buffer[NUMBER_OF_FILES];
		size_t sizes[NUMBER_OF_FILES];
		loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes);

		/* Create the OpenCL program object */
		program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error);
		if (error != CL_SUCCESS) {
			perror("Can't create the OpenCL program object");
			exit(1);
		}
		/* Build OpenCL program object and dump the error message, if any */
		char *program_log;
		size_t log_size;
		error = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
		if (error != CL_SUCCESS) {
			// If there's an error whilst building the program, dump the log
			clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
			program_log = (char*)malloc(log_size + 1);
			program_log[log_size] = '\0';
			clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
				log_size + 1, program_log, NULL);
			printf("\n=== ERROR ===\n\n%s\n=============\n", program_log);
			free(program_log);
			exit(1);
		}

		/* Query the program as to how many kernels were detected */
		cl_uint numOfKernels;
		error = clCreateKernelsInProgram(program, 0, NULL, &numOfKernels);
		if (error != CL_SUCCESS) {
			perror("Unable to retrieve kernel count from program");
			exit(1);
		}
		cl_kernel* kernels = (cl_kernel*)alloca(sizeof(cl_kernel) * numOfKernels);
		error = clCreateKernelsInProgram(program, numOfKernels, kernels, NULL);
		

		for (cl_uint i = 0; i < numOfKernels; i++) 
		{
			char kernelName[32];
			cl_uint argCnt;
			clGetKernelInfo(kernels[i], CL_KERNEL_FUNCTION_NAME, sizeof(kernelName), kernelName, NULL);
			clGetKernelInfo(kernels[i], CL_KERNEL_NUM_ARGS, sizeof(argCnt), &argCnt, NULL);
			printf("Kernel name: %s with arity: %d\n", kernelName, argCnt);
			printf("About to create command queue and enqueue this kernel...\n");

			/* Create a command queue */ ///创建命令队列
			cl_command_queue cQ = clCreateCommandQueue(context, device, 0, &error);
			if (error != CL_SUCCESS) {
				perror("Unable to create command-queue");
				exit(1);
			}

			/* Create a OpenCL buffer object */ ///创建缓存对象
			cl_mem UDObj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,sizeof(UserData) * DATA_SIZE, ud_in, &error);

			if (error != CL_SUCCESS) {
				perror("Unable to create buffer object");
				exit(1);
			}

			/* Extract some info about the buffer object we created */
			displayBufferDetails(UDObj);

			/* Let OpenCL know that the kernel is suppose to receive an argument */
			// 让OpenCL知道内核应该接收一个参数
			error = clSetKernelArg(kernels[i], 0, sizeof(cl_mem), &UDObj);
			if (error != CL_SUCCESS) {
				perror("Unable to create buffer object");
				exit(1);
			}

			/* Enqueue the kernel to the command queue */
			// 将内核放入命令队列
			error = clEnqueueTask(cQ, kernels[i], 0, NULL, NULL);
			if (error != CL_SUCCESS) {
				perror("Unable to enqueue task to command-queue");
				exit(1);
			}
			printf("Task has been enqueued successfully!\n");

			/* Enqueue the read-back from device to host */
			error = clEnqueueReadBuffer(cQ, UDObj,
				CL_TRUE,                    // blocking read
				0,                          // write from the start
				sizeof(UserData) * DATA_SIZE, // how much to copy
				ud_out, 0, NULL, NULL);

			if (valuesOK(ud_in, ud_out)) {
				printf("Check passed!\n");
			}
			else printf("Check failed!\n");

			/* Release the command queue */
			clReleaseCommandQueue(cQ);
			clReleaseMemObject(UDObj);
		}

		/* Clean up */

		for (cl_uint i = 0; i < numOfKernels; i++) { clReleaseKernel(kernels[i]); }
		for (i = 0; i< NUMBER_OF_FILES; i++) { free(buffer[i]); }
		clReleaseProgram(program);
		clReleaseContext(context);
	}

	free(ud_in);
	free(ud_out);
}


/*
 * Program entry point. Prints a greeting, waits via the shell "pause"
 * command and returns 0. The OpenCL demo calls are currently disabled.
 */
int main(int argc, char* argv[])
{
	// Disabled demo entry points:
	//   PlatForminfo(argc, argv);
	//   BuildProgram(argc, argv);

	std::cout << "My Love Ye!" << std::endl;
	system("pause");

	return 0;
}

头文件common.h里代码:

#ifndef COMMON_H_
#define COMMON_H_

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

#include <string>

//#include <alloca.h>



#ifdef APPLE
#include<OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif // APPLE



/*
 * Host-side record of four ints, simulating the OpenCL vector data type int4.
 * Field order is x, y, z, w.
 */
typedef struct UserData {
	int x, y, z, w;
} UserData;




/*
 * Query one piece of platform information and print it as "<label>:<value>".
 *
 * id:             platform to query.
 * param_name:     which cl_platform_info item to fetch; the value is printed
 *                 with %s, so it must be a character-string item.
 * paramNameAsStr: label printed in front of the value.
 *
 * On failure a message is written to stderr and nothing is printed to stdout.
 */
void displayPlatformInfo(cl_platform_id id,cl_platform_info param_name,const char* paramNameAsStr)
{
	/* First call: ask only for the size of the value. */
	size_t paramSize = 0;
	cl_int error = clGetPlatformInfo(id, param_name, 0, NULL, &paramSize);
	if (error != CL_SUCCESS)
	{
		/* OpenCL does not set errno, so report its own status code instead of using perror(). */
		fprintf(stderr, "Unable to find any OpenCl platform Information! (OpenCL error %d)\n", (int)error);
		return;
	}

	/* One extra zeroed byte guarantees the buffer is a terminated string. */
	char* moreInfo = (char*)calloc(paramSize + 1, sizeof(char));
	if (moreInfo == NULL)
	{
		perror("Unable to allocate platform information buffer");
		return;
	}

	/* Second call: fetch the value itself. */
	error = clGetPlatformInfo(id, param_name, paramSize, moreInfo, NULL);
	if (error != CL_SUCCESS)
	{
		fprintf(stderr, "Unable to find any OpenCl platform Information! (OpenCL error %d)\n", (int)error);
		free(moreInfo);
		return;
	}

	printf("%s:%s\n", paramNameAsStr, moreInfo);
	free(moreInfo);
}


/*
 * Read a set of source files completely into freshly allocated buffers.
 *
 * files:  array of `length` file names.
 * length: number of files to load.
 * buffer: output array of `length` pointers; buffer[i] receives a malloc'ed,
 *         NUL-terminated copy of file i. The caller frees each entry.
 * sizes:  output array of `length` sizes; sizes[i] is the number of bytes
 *         stored in buffer[i], not counting the terminator.
 *
 * Any I/O or allocation failure prints a message and calls exit(1).
 */
void loadProgramSource(const char** files,size_t length,char **buffer,size_t*sizes)
{
	for (size_t i = 0; i < length; ++i)
	{
		/*
		 * Binary mode: in text mode the size reported by ftell() can exceed the
		 * number of bytes fread() delivers (line-ending translation), which
		 * would leave uninitialised bytes inside the reported source length.
		 */
		FILE *file = fopen(files[i], "rb");
		if (file == NULL)
		{
			perror("Couldn't read the program files");
			exit(1);
		}

		/* Determine the file length by seeking to the end. */
		long fileLen = -1;
		if (fseek(file, 0, SEEK_END) == 0)
		{
			fileLen = ftell(file);
		}
		if (fileLen < 0)
		{
			perror("Couldn't determine the program file size");
			fclose(file);
			exit(1);
		}
		rewind(file); /* back to the start for the actual read */

		buffer[i] = (char*)malloc((size_t)fileLen + 1);
		if (buffer[i] == NULL)
		{
			perror("Couldn't allocate memory for the program files");
			fclose(file);
			exit(1);
		}

		size_t bytesRead = fread(buffer[i], sizeof(char), (size_t)fileLen, file);
		if (bytesRead != (size_t)fileLen && ferror(file))
		{
			perror("Couldn't read the program files");
			fclose(file);
			exit(1);
		}

		/* Report and terminate at what was actually read. */
		sizes[i] = bytesRead;
		buffer[i][bytesRead] = '\0';
		fclose(file);
	}

	return;
}


/*
 * Print the size (in MB), object type and creation flags of an OpenCL
 * memory object to stdout.
 *
 * memobj: memory object to inspect.
 *
 * If any of the queries fails, a message is written to stderr and nothing is
 * printed to stdout.
 */
void displayBufferDetails(cl_mem memobj) {
	cl_mem_object_type objT = 0;
	cl_mem_flags flags = 0;
	size_t memSize = 0;

	if (clGetMemObjectInfo(memobj, CL_MEM_TYPE, sizeof(cl_mem_object_type), &objT, NULL) != CL_SUCCESS ||
		clGetMemObjectInfo(memobj, CL_MEM_FLAGS, sizeof(cl_mem_flags), &flags, NULL) != CL_SUCCESS ||
		clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof(size_t), &memSize, NULL) != CL_SUCCESS) {
		fprintf(stderr, "Unable to query OpenCL memory object details\n");
		return;
	}

	/* Default label so that an unlisted object type never reaches printf as NULL. */
	const char* str = "Unknown";
	switch (objT) {
	case CL_MEM_OBJECT_BUFFER: str = "Buffer or Sub-buffer"; break;
	case CL_MEM_OBJECT_IMAGE2D: str = "2D Image Object"; break;
	case CL_MEM_OBJECT_IMAGE3D: str = "3D Image Object"; break;
	default: break;
	}

	/* All six labels together are 77 characters, well within the buffer. */
	char flagStr[128] = { '\0' };
	if (flags & CL_MEM_READ_WRITE)     strcat(flagStr, "Read-Write|");
	if (flags & CL_MEM_WRITE_ONLY)     strcat(flagStr, "Write Only|");
	if (flags & CL_MEM_READ_ONLY)      strcat(flagStr, "Read Only|");
	if (flags & CL_MEM_COPY_HOST_PTR)  strcat(flagStr, "Copy from Host|");
	if (flags & CL_MEM_USE_HOST_PTR)   strcat(flagStr, "Use from Host|");
	if (flags & CL_MEM_ALLOC_HOST_PTR) strcat(flagStr, "Alloc from Host|");

	/* Explicit casts keep the arguments in step with the %llu / %llx specifiers on every platform. */
	printf("\tOpenCL Buffer's details =>\n\t size: %llu MB,\n\t object type is: %s,\n\t flags:0x%llx (%s) \n",
		(unsigned long long)(memSize >> 20), str, (unsigned long long)flags, flagStr);
}














#endif

user_test.cl里的代码:

/* Device-side record of four ints: x, y, z, w (in that order). */
typedef struct UserData {
    int x, y, z, w;
} UserData;


/*
 * For the element selected by this work-item's global id in dimension 0,
 * store x + y + z into w.
 */
__kernel void hello(__global UserData* data) {
    int gid = get_global_id(0);
    __global UserData* item = &data[gid];
    item->w = item->x + item->y + item->z;
}

OpenCL(Open Computing Language)是由 Khronos Group 制定的并行计算标准,用于跨平台加速图形处理、科学计算和通用计算任务。它允许开发者编写可在各种设备上运行的高性能并行程序,包括 GPU、CPU、FPGA 等。

一个简单的 OpenCL 并行计算例子是矩阵乘法。在 CPU 上,可以用循环遍历的方式来完成这个计算;而在 GPU 上,则可以利用其大量的并行核心。下面是一个简化的矩阵乘法示例:

```c++
// 硬件设备和上下文初始化
cl_command_queue queue;
cl_program program;
cl_kernel kernel;

// 定义两个矩阵数据和结果矩阵
float* A_data, *B_data, *C_data;

// OpenCL源代码
const char* source = R"CLC(
__kernel void matrix_multiply(__global float* A, __global float* B, __global float* C, const int N) {
    int gid = get_global_id(0); // 获取线程ID
    int row = gid / N;
    int col = gid % N;
    float sum = 0.0f;
    for(int i = 0; i < N; ++i) {
        sum += A[row*N + i] * B[i*N + col];
    }
    C[row*N + col] = sum;
}
)CLC";

// 加载并编译代码
program = clCreateProgramWithSource(context, 1, &source, NULL, &status);
clBuildProgram(program, device_count, devices, "", NULL, NULL);

// 创建并设置内核函数
kernel = clCreateKernel(program, "matrix_multiply", NULL);

// 分配内存并复制数据到设备
A_data = ...; // load A data to device memory
B_data = ...; // load B data to device memory
C_data = ...; // allocate space for C on the device

// 执行矩阵乘法
size_t global_work_size[] = {N * N};
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);

// 数据从设备复制回CPU
clEnqueueReadBuffer(queue, C_data, CL_TRUE, 0, N*N * sizeof(float), C_data_host, 0, NULL, NULL);
```

在这个例子中,每个线程都在 GPU 上独立计算矩阵的一个元素,实现了并行化。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

点云SLAM

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值