opencl_gpu缩放rgb888数据

国服卤鸡蛋

已于 2024-07-29 12:31:50 修改

阅读量199

点赞数 8

分类专栏： OpenCL 文章标签： linux c++

于 2024-07-28 23:30:54 首次发布

本文链接：https://blog.csdn.net/qq_44895902/article/details/140758387

版权

OpenCL 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

opencl_gpu缩放rgb888数据
主要学习下双三次插值算法(bicubic)缩放rgb数据。

算法原理参考博客: https://www.cnblogs.com/ycliu/articles/17132289.html

一.环境配置

这边使用英伟达cuda的 opencl环境库，windows电脑配置了英伟达显卡，如果是AMD显卡,可以去AMD官网下载配置opencl库：https://community.amd.com/t5/drivers-software/where-can-i-download-amd-opencl-sdk/td-p/114538
为什么要用 GPU 而不是 CPU 来缩放图像？CPU 缩放在 ffmpeg、opencv 等库中已经有很成熟的实现，但在嵌入式平台上 CPU 性能受限，所以这里选择使用 GPU 缩放。

ffmpeg

/*
 * Prototype of sws_getContext as quoted from FFmpeg.
 * It takes the source width/height/pixel format, the destination
 * width/height/pixel format, a flags value (the scaling algorithm is
 * selected through it), a source filter, a destination filter and a
 * parameter array, and returns a pointer to a struct SwsContext.
 */
struct SwsContext *sws_getContext(int srcW, int srcH, enum AVPixelFormat srcFormat,
                                  int dstW, int dstH, enum AVPixelFormat dstFormat,
                                  int flags, SwsFilter *srcFilter,
                                  SwsFilter *dstFilter, const double *param);
int flags: 这个参数选择算法定义如下，
#define SWS_FAST_BILINEAR     1			// fast bilinear interpolation
#define SWS_BILINEAR          2			// bilinear interpolation
#define SWS_BICUBIC           4			// bicubic interpolation

等会可以使用 ffmpeg 命令行验证所写的 gpu 缩放代码是否正确

# Reference output for comparison: scale the same raw 1920x1080 rgb24 frame to
# 960x540 (the size the OpenCL program below writes) using the bicubic scaler.
ffmpeg -f rawvideo -pix_fmt rgb24 -s 1920x1080 -i input.rgb -vf scale=960:540:flags=bicubic -f rawvideo -pix_fmt rgb24 output.rgb

二.代码

main.c

/*
 * Program entry point: runs the OpenCL RGB888 scaling job once.
 * The exit status is 0 when opencl_scale_rgb24() reports success (0)
 * and 1 otherwise, so scripts can detect a failed run.
 */
int main(int argc, char*argv[])
{
	(void)argc;		/* command-line arguments are not used */
	(void)argv;

	return opencl_scale_rgb24() == 0 ? 0 : 1;
}

opencl_scaled_yuv.c 需要修改下rgb888文件路径名，图像宽高

/*
 * 双三次插值算法 缩放RGB888数据
 */
int opencl_scale_rgb24()
{
	/*
	 * 1. 在平台创造一个上下文，选择opencl
	 */
	cl_uint numPlatforms;
	cl_platform_id *platformIds = (cl_platform_id *)malloc(sizeof(cl_platform_id));
	cl_context context = 0;
	int errNum = 0;

	errNum = clGetPlatformIDs(0, NULL, &numPlatforms);			//获取opencl 平台数目
	if (errNum != CL_SUCCESS || numPlatforms <= 0)
	{
		cout << "find any opencl platforms failed" << endl;
		return -1;
	}
	cout << "platform numbers: " << numPlatforms << endl;

	errNum = clGetPlatformIDs(numPlatforms, platformIds, NULL);	//创建所有的opencl平台
	if (errNum != CL_SUCCESS)
	{
		cout << "find any opencl flatforms failed" << endl;
		return -1;
	}

	cl_context_properties contextProperties[] = {
		CL_CONTEXT_PLATFORM,
		(cl_context_properties)platformIds[0],					//选择第一个opencl平台
		0
	};

	context = clCreateContextFromType(contextProperties,		//为gpu创建上下文
		CL_DEVICE_TYPE_GPU,
		NULL,
		NULL,
		&errNum);
	if (errNum != CL_SUCCESS)
	{
		cout << "create an opencl gpu failed" << endl;
		return -1;
	}

	/**********************************************************************/
	//打印平台信息
	size_t ext_size = 0;
	errNum = clGetPlatformInfo(platformIds[0], CL_PLATFORM_NAME, 0, NULL, &ext_size);
	if (errNum < 0) {
		cout << "get paltform information failed" << endl;
		return -1;
	}
	char *name = (char*)malloc(ext_size);
	clGetPlatformInfo(platformIds[0], CL_PLATFORM_NAME, ext_size, name, NULL);
	cout << "paltform name:" << name << endl;

	//供应商信息
	errNum = clGetPlatformInfo(platformIds[0], CL_PLATFORM_VENDOR, 0, NULL, &ext_size);
	if (errNum < 0) {
		cout << "Couldn't read CL_PLATFORM_VENDOR." << endl;
		return -1;
	}
	char *vendor = (char*)malloc(ext_size);
	clGetPlatformInfo(platformIds[0], CL_PLATFORM_VENDOR, ext_size, vendor, NULL);
	cout << "platform vendor:" << vendor << endl;

	//最高支持的OpenCL版本
	errNum = clGetPlatformInfo(platformIds[0], CL_PLATFORM_VERSION, 0, NULL, &ext_size);
	if (errNum < 0) {
		cout << "Couldn't read CL_PLATFORM_VERSION." << endl;
		return -1;
	}
	char *version = (char*)malloc(ext_size);
	clGetPlatformInfo(platformIds[0], CL_PLATFORM_VERSION, ext_size, version, NULL);
	cout << "platform version:" << version << endl;

	free(name);
	free(vendor);
	free(version);

	/**********************************************************************/
	/*
	 * 2. 选择设备，创建命令队列
	 */
	cl_device_id *devices;
	cl_device_id device = 0;
	cl_command_queue commandQueue = NULL;
	size_t deviceBufferSize = -1;

	errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);
	if (errNum != CL_SUCCESS)
	{
		cout << "failed to get context information" << endl;
		return -1;
	}

	devices = new cl_device_id[deviceBufferSize / sizeof(cl_device_id)];
	errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize, devices, NULL);
	if (errNum != CL_SUCCESS)
	{
		cout << "failed to get device id" << endl;
		return -1;
	}

	commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
	if (commandQueue == NULL)
	{
		cout << "failed to create commond queue for device 0" << endl;
		return -1;
	}
	device = devices[0];
	delete[]devices;

	/*
	 * 3.创建和构建程序对象源码
	 */
	cl_program program = 0;
	size_t szKernelLen;
	char *sourceCL = NULL;

	sourceCL = clLoadProgSource("D:/2024/visual_studio_app/vs_project/cuda_opencl/cuda_opencl/opencl_scale_rgb24.cl", "", &szKernelLen);
	if (sourceCL == NULL)
	{
		cout << "load sourceCL failed" << endl;
		return -1;
	}
	// 使用源代码创建程序对象
	program = clCreateProgramWithSource(context, 1, (const char **)&sourceCL, &szKernelLen, &errNum);
	CHECK_ERROR(errNum);

	// 编译内核源代码
	errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);		
	if (errNum != CL_SUCCESS) {
		// 输出编译错误信息
		char buildLog[16384];
		clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buildLog), buildLog, NULL);
		std::cerr << "Error in kernel: " << std::endl;
		std::cerr << buildLog;
		clReleaseProgram(program);
		return -1;
	}

	 /*
	  * 4.创建内核和内存对象
	  */
	cl_kernel kernel = clCreateKernel(program, "bicubic_interpolation2", &errNum);		//创建内核
	CHECK_ERROR(errNum);

	int inputWidth = 1920;
	int inputHeight = 1080;
	int outputWidth = inputWidth / 2;
	int outputHeight = inputHeight / 2;
	double scale_factor_w = (double)outputWidth / (double)inputWidth;
	double scale_factor_h = (double)outputHeight / (double)inputHeight;

	uint8_t* original_image = (uint8_t*)malloc(inputWidth * inputHeight * 3);
	uint8_t* scaled_image = (uint8_t*)malloc(outputWidth*outputHeight * 3);

	FILE *fd_src = fopen("G:\\share\\1920X1080.rgb", "rb+");
	if (fd_src == NULL) {
		std::cout << "fopen failed" << std::endl;
		return -1;
	}
	fread(original_image, inputWidth * inputHeight * 3, 1, fd_src);

	FILE *fd_dst = fopen("G:\\share\\opencl_960x540_xxb.rgb", "wb+");
	if (fd_dst == NULL) {
		std::cout << "fopen failed" << std::endl;
		return -1;
	}
	
	cl_mem cl_original_image = clCreateBuffer(context, CL_MEM_READ_ONLY , inputWidth * inputHeight * 3 * sizeof(uint8_t), NULL, &errNum);
	CHECK_ERROR(errNum);
	cl_mem cl_scaled_image = clCreateBuffer(context, CL_MEM_WRITE_ONLY , outputWidth * outputHeight * 3 * sizeof(uint8_t), NULL, &errNum);
	CHECK_ERROR(errNum);

	// 将原始图像复制到设备 
	errNum = clEnqueueWriteBuffer(commandQueue, cl_original_image, CL_TRUE, 0, inputWidth * inputHeight * 3 * sizeof(uint8_t), original_image, 0, NULL, NULL); 
	CHECK_ERROR(errNum);

	/*
	 * 5.执行内核
	 */
	size_t globalWorkSize[2] = { outputWidth,  outputHeight };

	errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_original_image);
	errNum = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_scaled_image);
	errNum = clSetKernelArg(kernel, 2, sizeof(int), &inputWidth);
	errNum = clSetKernelArg(kernel, 3, sizeof(int), &inputHeight);
	errNum = clSetKernelArg(kernel, 4, sizeof(int), &outputWidth);
	errNum = clSetKernelArg(kernel, 5, sizeof(int), &outputHeight);
	errNum = clSetKernelArg(kernel, 6, sizeof(double), &scale_factor_w);
	errNum = clSetKernelArg(kernel, 7, sizeof(double), &scale_factor_h);

	errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
	CHECK_ERROR(errNum); 
	
	errNum = clEnqueueReadBuffer(commandQueue, cl_scaled_image, CL_TRUE, 0, outputWidth*outputHeight * 3, scaled_image, 0, NULL, NULL);
	CHECK_ERROR(errNum);

	// 写数据，保存数据到本地
	fwrite(scaled_image, outputWidth*outputHeight * 3, 1, fd_dst);
	
	fclose(fd_src);
	fclose(fd_dst);
	free(original_image);
	free(scaled_image);

	clReleaseMemObject(cl_original_image);
	clReleaseMemObject(cl_scaled_image);
	clReleaseKernel(kernel);
	clReleaseProgram(program);
	clReleaseCommandQueue(commandQueue);
	clReleaseDevice(device);
	clReleaseContext(context);

	return 0;
}

/*
 * OpenCL error-check macro.
 * Evaluates err exactly once; when it differs from CL_SUCCESS, prints the
 * error code together with the source line of the check and terminates the
 * process with exit status 1.
 * Wrapped in do { } while (0) so that it behaves as a single statement and
 * can be used safely inside if/else without braces.
 */
#define CHECK_ERROR(err) \
    do { \
        int check_error_code_ = (int)(err); \
        if (check_error_code_ != CL_SUCCESS) { \
            printf("OpenCL error: %d %d\n", check_error_code_, __LINE__); \
            exit(1); \
        } \
    } while (0)

opencl_scaled_yuv.h

/*
 * Scales an RGB888 frame on the GPU using the bicubic OpenCL kernel.
 * Takes no arguments; the input/output file paths and the frame size are
 * set inside the implementation.
 * Returns 0 on success and -1 when a step fails.
 */
int opencl_scale_rgb24();

内核函数

/*
 * The code in this file computes in double precision. OpenCL 1.0/1.1
 * compilers only accept the double type after the cl_khr_fp64 extension
 * has been enabled; later versions accept the directive as well.
 */
#pragma OPENCL EXTENSION cl_khr_fp64 : enable

/* Number of byte-sized channels stored per pixel. */
#define NUM_CHANNELS 3
/* OpenCL C has no <stdint.h>; provide the 8-bit unsigned type used for pixel data. */
typedef unsigned char      uint8_t;

/*
 * Bicubic convolution kernel with A = -0.5.
 *
 * t: signed distance between the sampling position and a source sample.
 * Returns the weight of that sample:
 *   |t| <= 1      : (A+2)|t|^3 - (A+3)|t|^2 + 1
 *   1 < |t| <= 2  : A|t|^3 - 5A|t|^2 + 8A|t| - 4A
 *   otherwise     : 0
 *
 * The polynomials are evaluated in Horner form with plain multiplications
 * instead of pow(), which is needlessly expensive for exponents 2 and 3.
 */
double bicubic_weight(double t) 
{
	double A = -0.5;
	double abs_t = fabs(t);

	if (abs_t <= 1) 
	{
		return ((A + 2) * abs_t - (A + 3)) * abs_t * abs_t + 1;
	}
	if (abs_t <= 2) 
	{
		return ((A * abs_t - 5 * A) * abs_t + 8 * A) * abs_t - 4 * A;
	}
	return 0;
}

/*
 * Bicubic scaling kernel: each work-item computes one output pixel.
 *
 * original_image : source pixels, NUM_CHANNELS bytes per pixel, row-major,
 *                  width * height pixels (read only)
 * scaled_image   : destination pixels, same layout, new_width * new_height
 * width, height  : source size in pixels
 * new_width, new_height : destination size in pixels
 * scale_factor_w, scale_factor_h : destination size / source size per axis
 *
 * Work-item (x, y) maps to the source position (x / scale_factor_w,
 * y / scale_factor_h) and blends the 4x4 source neighbourhood around it.
 * Neighbours outside the image are replaced by the nearest edge pixel.
 */
__kernel void bicubic_interpolation2(__global const uint8_t* original_image, 
                                     __global uint8_t* scaled_image, 
									 int width, 
									 int height, 
									 int new_width, 
									 int new_height, 
									 double scale_factor_w,
									 double scale_factor_h) 
{
    int x = get_global_id(0);
	int y = get_global_id(1);

    /* Ignore work-items outside the output when the global size is padded. */
    if (x >= new_width || y >= new_height)
    {
        return;
    }

    double original_x = x / scale_factor_w;
    double original_y = y / scale_factor_h;

    /* Top-left sample of the 4x4 neighbourhood: one pixel before the floor. */
    int x1 = (int)(floor(original_x)) - 1;
    int y1 = (int)(floor(original_y)) - 1;

    /* Fractional offsets of the sampling position, each in [0, 1). */
    double dx = original_x - x1 - 1;
    double dy = original_y - y1 - 1;

    /*
     * Tap k sits at x1 + k (y1 + k), i.e. at distance (dx + 1 - k) from the
     * sampling position, so that is the argument of the weight function.
     * With these distances the four weights of an axis add up to 1.
     * Weights are computed once per axis instead of once per tap pair.
     */
    double weight_x[4];
    double weight_y[4];
    for (int k = 0; k < 4; ++k) 
	{
        weight_x[k] = bicubic_weight(dx + 1 - k);
        weight_y[k] = bicubic_weight(dy + 1 - k);
    }

    double interpolated_pixel[NUM_CHANNELS] = {0};

    for (int j = 0; j < 4; ++j) 
	{
        int py = min(max(y1 + j, 0), height - 1);

        for (int i = 0; i < 4; ++i) 
		{
            int px = min(max(x1 + i, 0), width - 1);
            double weight = weight_x[i] * weight_y[j];

            for (int c = 0; c < NUM_CHANNELS; ++c) 
			{
                interpolated_pixel[c] += weight * original_image[(py * width + px) * NUM_CHANNELS + c];
            }
        }
    }

    for (int c = 0; c < NUM_CHANNELS; ++c) 
	{
        /* Round to nearest (truncation would bias the result darker), then clamp to a byte. */
        int value = (int)(floor(interpolated_pixel[c] + 0.5));
        scaled_image[(y * new_width + x) * NUM_CHANNELS + c] = min(max(value, 0), 255);
    }
}