1、在 GPU 上编写 OpenCL 代码
#include <iostream>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#if defined(__APPLE__) || defined(__MACOSX)
#include <OpenCL/cl.hpp>
#else
#include <CL/cl.h>
#endif
using namespace std;
/* KERNEL(...) stringifies its arguments, so the OpenCL C source below can be
 * written inline and stored as an ordinary C string. */
#define KERNEL(...)#__VA_ARGS__
/*
 * Source text of the "hellocl" kernel.
 * Each work-item reads its 2-D global index (gidx, gidy) and stores
 * (1 << gidx) | (0x10 << gidy) at buffer[gidx + 4 * gidy]; the literal 4 is
 * the row width and matches the 4 x 4 global size the host code launches.
 * lidx is fetched but never used.
 */
const char *kernelSourceCode = KERNEL(__kernel void hellocl(__global uint *buffer)
{
size_t gidx = get_global_id(0);
size_t gidy = get_global_id(1);
size_t lidx = get_local_id(0);
buffer[gidx + 4 * gidy] = (1 << gidx) | (0x10 << gidy);
});
int main(int argc, char const *argv[])
{
printf("hello OpenCL\n");
cl_int status = 0;
size_t deviceListSize;
// 当前服务器上配置的仅有NVIDIA Tesla C2050 的GPU
cl_platform_id platform = NULL;
status = clGetPlatformIDs(1, &platform, NULL);
if (status != CL_SUCCESS) {
printf("ERROR: Getting Platforms.(clGetPlatformIDs)\n");
return EXIT_FAILURE;
}
// 如果我们能找到相应平台,就使用它,否则返回NULL
cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)platform,0};
cl_context_properties *cprops = (NULL == platform) ? NULL : cps;
// 生成 context
cl_context context = clCreateContextFromType(cprops,CL_DEVICE_TYPE_GPU,NULL,NULL,&status);
if (status != CL_SUCCESS) {
printf("Error: Creating Context.(clCreateContexFromType)\n");
return EXIT_FAILURE;
}
// 寻找OpenCL设备
// 首先得到设备列表的长度
status = clGetContextInfo(context,CL_CONTEXT_DEVICES,0,NULL,&deviceListSize);
if (status != CL_SUCCESS) {
printf("Error: Getting Context Info device list size, clGetContextInfo)\n");
return EXIT_FAILURE;
}
cl_device_id *devices = (cl_device_id *)malloc(deviceListSize);
if (devices == 0) {
printf("Error: No devices found.\n");
return EXIT_FAILURE;
}
// 现在得到设备列表
status = clGetContextInfo(context,CL_CONTEXT_DEVICES,deviceListSize,devices,NULL);
if (status != CL_SUCCESS) {
printf("Error: Getting Context Info (device list, clGetContextInfo)\n");
return EXIT_FAILURE;
}
// 装载内核程序,编译CL program ,生成CL内核实例
size_t sourceSize[] = {strlen(kernelSourceCode)};
cl_program program = clCreateProgramWithSource(context,1,&kernelSourceCode,sourceSize,&status);
if (status != CL_SUCCESS) {
printf("Error: Loading Binary into cl_program (clCreateProgramWithBinary)\n");
return EXIT_FAILURE;
}
// 为指定的设备编译CL program.
status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
if (status != CL_SUCCESS) {
printf("Error: Building Program (clBuildingProgram)\n");
return EXIT_FAILURE;
}
// 得到指定名字的内核实例的句柄
cl_kernel kernel = clCreateKernel(program, "hellocl", &status);
if (status != CL_SUCCESS) {
printf("Error: Creating Kernel from program.(clCreateKernel)\n");
return EXIT_FAILURE;
}
// 创建 OpenCL buffer 对象
unsigned int *outbuffer = new unsigned int [4 * 4];
memset(outbuffer, 0, 4 * 4 * 4);
cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, 4 * 4 * 4, NULL, &status);
if (status != CL_SUCCESS) {
printf("Error: Create Buffer, outputBuffer. (clCreateBuffer)\n");
return EXIT_FAILURE;
}
// 为内核程序设置参数
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&outputBuffer);
if (status != CL_SUCCESS) {
printf("Error: Setting kernel argument. (clSetKernelArg)\n");
return EXIT_FAILURE;
}
// 创建一个OpenCL command queue
cl_command_queue commandQueue = clCreateCommandQueue(context,devices[0],0,&status);
if (status != CL_SUCCESS) {
printf("Error: Create Command Queue. (clCreateCommandQueue)\n");
return EXIT_FAILURE;
}
// 将一个kernel 放入 command queue
size_t globalThreads[] = {4, 4};
size_t localThreads[] = {2, 2};
status = clEnqueueNDRangeKernel(commandQueue, kernel,2, NULL, globalThreads,localThreads, 0,NULL, NULL);
if (status != CL_SUCCESS) {
printf("Error: Enqueueing kernel\n");
return EXIT_FAILURE;
}
// 确认 command queue 中所有命令都执行完毕
status = clFinish(commandQueue);
if (status != CL_SUCCESS) {
printf("Error: Finish command queue\n");
return EXIT_FAILURE;
}
// 将内存对象中的结果读回Host
status = clEnqueueReadBuffer(commandQueue,outputBuffer, CL_TRUE, 0,4 * 4 * 4, outbuffer, 0, NULL, NULL);
if (status != CL_SUCCESS) {
printf("Error: Read buffer queue\n");
return EXIT_FAILURE;
}
// Host端打印结果
printf("out:\n");
for (int i = 0; i < 16; ++i) {
printf("%x ", outbuffer[i]);
if ((i + 1) % 4 == 0)
printf("\n");
}
// 资源回收
status = clReleaseKernel(kernel);
status = clReleaseProgram(program);
status = clReleaseMemObject(outputBuffer);
status = clReleaseCommandQueue(commandQueue);
status = clReleaseContext(context);
free(devices);
delete outbuffer;
return 0;
}
2、使用交叉编译工具链进行编译(CMakeLists.txt)
cmake_minimum_required(VERSION 3.5.1)

# Cross-compilation settings for the aarch64 Linaro toolchain.
# CMake detects and tests the compilers while processing project(), so the
# target system and the compilers must be defined before that call (or, better,
# in a separate file passed via -DCMAKE_TOOLCHAIN_FILE=...).
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR aarch64)
set(tools /gzy_mnt/gcc-linaro-7.4.1-2019.02-x86_64_aarch64-linux-gnu)
set(CMAKE_C_COMPILER ${tools}/bin/aarch64-linux-gnu-gcc)
set(CMAKE_CXX_COMPILER ${tools}/bin/aarch64-linux-gnu-g++)

project(cl_test)

SET(CMAKE_BUILD_TYPE "Release")
# SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3 -Wall")
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall")

# Place the executable next to the sources.
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/)

# The source includes <CL/cl.h>, which resolves against the parent of the CL/
# directory; the original CL/ entry is kept for backward compatibility.
# NOTE(review): SDK layout is assumed from the path names - confirm.
include_directories("/gzy_mnt/SDK/include")
include_directories("/gzy_mnt/SDK/include/CL/")
# file(GLOB CL_LIB_DIRS "/gzy_mnt/SDK/aarch64-linux-gnu-7.4.1/lib64/*")

add_executable(${PROJECT_NAME} cl-test.cpp)

# Link against the vendor-provided Mali driver and OpenCL libraries.
target_link_libraries (
${PROJECT_NAME}
# /gzy_mnt/SDK/aarch64-linux-gnu-7.4.1/lib64/libEGL.so.1.4.0
# /gzy_mnt/SDK/aarch64-linux-gnu-7.4.1/lib64/libGLESv1_CM.so.1.1.0
# /gzy_mnt/SDK/aarch64-linux-gnu-7.4.1/lib64/libGLESv2.so.2.1.0
/gzy_mnt/SDK/aarch64-linux-gnu-7.4.1/lib64/libmali.so.0
/gzy_mnt/SDK/aarch64-linux-gnu-7.4.1/lib64/libOpenCL.so.2
)