opencl_gpu缩放rgb888数据
主要学习下 双三次插值算法(bicubic)缩放rgb数据。
算法原理参考博客: https://www.cnblogs.com/ycliu/articles/17132289.html
一.环境配置
这边使用英伟达cuda的 opencl环境库,windows电脑配置了英伟达显卡,如果是AMD显卡,可以去AMD官网下载配置opencl库:https://community.amd.com/t5/drivers-software/where-can-i-download-amd-opencl-sdk/td-p/114538
为什么要用gpu缩放图像,而不使用cpu缩放图像?cpu缩放在ffmpeg、opencv等库中已经有很成熟的算法实现,但如果是在嵌入式平台上,使用cpu缩放性能受限,所以选择gpu缩放。
ffmpeg
/* Declaration of FFmpeg's sws_getContext, quoted here for reference.
 * The `flags` argument selects the scaling algorithm. */
struct SwsContext *sws_getContext(int srcW, int srcH, enum AVPixelFormat srcFormat,
int dstW, int dstH, enum AVPixelFormat dstFormat,
int flags, SwsFilter *srcFilter,
SwsFilter *dstFilter, const double *param);
int flags: 这个参数选择算法定义如下,
#define SWS_FAST_BILINEAR 1 // fast bilinear interpolation
#define SWS_BILINEAR 2 // bilinear interpolation
#define SWS_BICUBIC 4 // bicubic interpolation
等会可以使用ffmpeg命令行验证所写的gpu缩放代码是否正确(scale 参数需要与代码中的输出尺寸保持一致,例如代码输出 960x540 时应写成 scale=960:540):
ffmpeg -f rawvideo -pix_fmt rgb24 -s 1920x1080 -i input.rgb -vf scale=1280:720 -f rawvideo -pix_fmt rgb24 output.rgb
二.代码
main.c
/*
 * Program entry point: runs the OpenCL RGB888 scaling demo once.
 *
 * Returns 0 when opencl_scale_rgb24() reports success and 1 otherwise, so
 * that scripts invoking the program can detect a failed run.
 */
int main(int argc, char *argv[])
{
    /* Command-line arguments are not used by this demo. */
    (void)argc;
    (void)argv;

    return opencl_scale_rgb24() == 0 ? 0 : 1;
}
opencl_scaled_yuv.c 需要根据实际环境修改rgb888文件路径名、.cl内核文件路径以及图像宽高
/*
* 双三次插值算法 缩放RGB888数据
*/
int opencl_scale_rgb24()
{
/*
* 1. 在平台创造一个上下文,选择opencl
*/
cl_uint numPlatforms;
cl_platform_id *platformIds = (cl_platform_id *)malloc(sizeof(cl_platform_id));
cl_context context = 0;
int errNum = 0;
errNum = clGetPlatformIDs(0, NULL, &numPlatforms); //获取opencl 平台数目
if (errNum != CL_SUCCESS || numPlatforms <= 0)
{
cout << "find any opencl platforms failed" << endl;
return -1;
}
cout << "platform numbers: " << numPlatforms << endl;
errNum = clGetPlatformIDs(numPlatforms, platformIds, NULL); //创建所有的opencl平台
if (errNum != CL_SUCCESS)
{
cout << "find any opencl flatforms failed" << endl;
return -1;
}
cl_context_properties contextProperties[] = {
CL_CONTEXT_PLATFORM,
(cl_context_properties)platformIds[0], //选择第一个opencl平台
0
};
context = clCreateContextFromType(contextProperties, //为gpu创建上下文
CL_DEVICE_TYPE_GPU,
NULL,
NULL,
&errNum);
if (errNum != CL_SUCCESS)
{
cout << "create an opencl gpu failed" << endl;
return -1;
}
/**********************************************************************/
//打印平台信息
size_t ext_size = 0;
errNum = clGetPlatformInfo(platformIds[0], CL_PLATFORM_NAME, 0, NULL, &ext_size);
if (errNum < 0) {
cout << "get paltform information failed" << endl;
return -1;
}
char *name = (char*)malloc(ext_size);
clGetPlatformInfo(platformIds[0], CL_PLATFORM_NAME, ext_size, name, NULL);
cout << "paltform name:" << name << endl;
//供应商信息
errNum = clGetPlatformInfo(platformIds[0], CL_PLATFORM_VENDOR, 0, NULL, &ext_size);
if (errNum < 0) {
cout << "Couldn't read CL_PLATFORM_VENDOR." << endl;
return -1;
}
char *vendor = (char*)malloc(ext_size);
clGetPlatformInfo(platformIds[0], CL_PLATFORM_VENDOR, ext_size, vendor, NULL);
cout << "platform vendor:" << vendor << endl;
//最高支持的OpenCL版本
errNum = clGetPlatformInfo(platformIds[0], CL_PLATFORM_VERSION, 0, NULL, &ext_size);
if (errNum < 0) {
cout << "Couldn't read CL_PLATFORM_VERSION." << endl;
return -1;
}
char *version = (char*)malloc(ext_size);
clGetPlatformInfo(platformIds[0], CL_PLATFORM_VERSION, ext_size, version, NULL);
cout << "platform version:" << version << endl;
free(name);
free(vendor);
free(version);
/**********************************************************************/
/*
* 2. 选择设备,创建命令队列
*/
cl_device_id *devices;
cl_device_id device = 0;
cl_command_queue commandQueue = NULL;
size_t deviceBufferSize = -1;
errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);
if (errNum != CL_SUCCESS)
{
cout << "failed to get context information" << endl;
return -1;
}
devices = new cl_device_id[deviceBufferSize / sizeof(cl_device_id)];
errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize, devices, NULL);
if (errNum != CL_SUCCESS)
{
cout << "failed to get device id" << endl;
return -1;
}
commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
if (commandQueue == NULL)
{
cout << "failed to create commond queue for device 0" << endl;
return -1;
}
device = devices[0];
delete[]devices;
/*
* 3.创建和构建程序对象源码
*/
cl_program program = 0;
size_t szKernelLen;
char *sourceCL = NULL;
sourceCL = clLoadProgSource("D:/2024/visual_studio_app/vs_project/cuda_opencl/cuda_opencl/opencl_scale_rgb24.cl", "", &szKernelLen);
if (sourceCL == NULL)
{
cout << "load sourceCL failed" << endl;
return -1;
}
// 使用源代码创建程序对象
program = clCreateProgramWithSource(context, 1, (const char **)&sourceCL, &szKernelLen, &errNum);
CHECK_ERROR(errNum);
// 编译内核源代码
errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (errNum != CL_SUCCESS) {
// 输出编译错误信息
char buildLog[16384];
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buildLog), buildLog, NULL);
std::cerr << "Error in kernel: " << std::endl;
std::cerr << buildLog;
clReleaseProgram(program);
return -1;
}
/*
* 4.创建内核和内存对象
*/
cl_kernel kernel = clCreateKernel(program, "bicubic_interpolation2", &errNum); //创建内核
CHECK_ERROR(errNum);
int inputWidth = 1920;
int inputHeight = 1080;
int outputWidth = inputWidth / 2;
int outputHeight = inputHeight / 2;
double scale_factor_w = (double)outputWidth / (double)inputWidth;
double scale_factor_h = (double)outputHeight / (double)inputHeight;
uint8_t* original_image = (uint8_t*)malloc(inputWidth * inputHeight * 3);
uint8_t* scaled_image = (uint8_t*)malloc(outputWidth*outputHeight * 3);
FILE *fd_src = fopen("G:\\share\\1920X1080.rgb", "rb+");
if (fd_src == NULL) {
std::cout << "fopen failed" << std::endl;
return -1;
}
fread(original_image, inputWidth * inputHeight * 3, 1, fd_src);
FILE *fd_dst = fopen("G:\\share\\opencl_960x540_xxb.rgb", "wb+");
if (fd_dst == NULL) {
std::cout << "fopen failed" << std::endl;
return -1;
}
cl_mem cl_original_image = clCreateBuffer(context, CL_MEM_READ_ONLY , inputWidth * inputHeight * 3 * sizeof(uint8_t), NULL, &errNum);
CHECK_ERROR(errNum);
cl_mem cl_scaled_image = clCreateBuffer(context, CL_MEM_WRITE_ONLY , outputWidth * outputHeight * 3 * sizeof(uint8_t), NULL, &errNum);
CHECK_ERROR(errNum);
// 将原始图像复制到设备
errNum = clEnqueueWriteBuffer(commandQueue, cl_original_image, CL_TRUE, 0, inputWidth * inputHeight * 3 * sizeof(uint8_t), original_image, 0, NULL, NULL);
CHECK_ERROR(errNum);
/*
* 5.执行内核
*/
size_t globalWorkSize[2] = { outputWidth, outputHeight };
errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_original_image);
errNum = clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_scaled_image);
errNum = clSetKernelArg(kernel, 2, sizeof(int), &inputWidth);
errNum = clSetKernelArg(kernel, 3, sizeof(int), &inputHeight);
errNum = clSetKernelArg(kernel, 4, sizeof(int), &outputWidth);
errNum = clSetKernelArg(kernel, 5, sizeof(int), &outputHeight);
errNum = clSetKernelArg(kernel, 6, sizeof(double), &scale_factor_w);
errNum = clSetKernelArg(kernel, 7, sizeof(double), &scale_factor_h);
errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
CHECK_ERROR(errNum);
errNum = clEnqueueReadBuffer(commandQueue, cl_scaled_image, CL_TRUE, 0, outputWidth*outputHeight * 3, scaled_image, 0, NULL, NULL);
CHECK_ERROR(errNum);
// 写数据,保存数据到本地
fwrite(scaled_image, outputWidth*outputHeight * 3, 1, fd_dst);
fclose(fd_src);
fclose(fd_dst);
free(original_image);
free(scaled_image);
clReleaseMemObject(cl_original_image);
clReleaseMemObject(cl_scaled_image);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(commandQueue);
clReleaseDevice(device);
clReleaseContext(context);
return 0;
}
// OpenCL error-check macro: when `err` is not CL_SUCCESS, print the error code
// and the current source line, then terminate the process with exit(1).
// The do { } while (0) wrapper makes the expansion a single statement, so the
// macro is safe inside an unbraced if/else; `err` is evaluated exactly once.
#define CHECK_ERROR(err) \
do { \
    int check_error_code_ = (int)(err); \
    if (check_error_code_ != CL_SUCCESS) { \
        printf("OpenCL error: %d %d\n", check_error_code_, __LINE__); \
        exit(1); \
    } \
} while (0)
opencl_scaled_yuv.h
/* Scale RGB888 data with bicubic interpolation using OpenCL.
 * Returns 0 on success and -1 on the failures it reports itself. */
int opencl_scale_rgb24();
内核函数
/* Number of interleaved samples per pixel in the image buffers. */
#define NUM_CHANNELS 3
/* 8-bit unsigned sample type used for the image buffers. */
typedef unsigned char uint8_t;
/*
 * Bicubic kernel function: weight of a sample at signed distance t.
 *
 * Piecewise cubic with coefficient A = -0.5:
 *   |t| <= 1      : (A + 2)|t|^3 - (A + 3)|t|^2 + 1
 *   1 < |t| <= 2  : A|t|^3 - 5A|t|^2 + 8A|t| - 4A
 *   otherwise     : 0
 *
 * The powers are formed with plain multiplications instead of pow(), which is
 * far cheaper for small integer exponents, especially in double precision on
 * a GPU.
 */
double bicubic_weight(double t)
{
    const double A = -0.5;
    const double abs_t = fabs(t);
    const double abs_t2 = abs_t * abs_t;
    const double abs_t3 = abs_t2 * abs_t;

    if (abs_t <= 1)
    {
        return (A + 2) * abs_t3 - (A + 3) * abs_t2 + 1;
    }
    if (abs_t <= 2)
    {
        return A * abs_t3 - 5 * A * abs_t2 + 8 * A * abs_t - 4 * A;
    }
    /* Outside the kernel support (also reached when t is NaN). */
    return 0;
}
/*
 * Bicubic scaling kernel for interleaved 8-bit images with NUM_CHANNELS
 * samples per pixel. Each work-item computes one output pixel:
 * x = get_global_id(0), y = get_global_id(1).
 *
 * original_image : source pixels, width * height * NUM_CHANNELS bytes
 * scaled_image   : destination, new_width * new_height * NUM_CHANNELS bytes
 * width, height  : source dimensions in pixels
 * new_width, new_height : destination dimensions in pixels
 * scale_factor_w, scale_factor_h : destination size / source size per axis
 *
 * The output pixel is a weighted sum over a 4x4 neighbourhood of the source
 * position; taps that fall outside the image are clamped to the nearest edge
 * pixel.
 */
__kernel void bicubic_interpolation2(__global uint8_t* original_image,
__global uint8_t* scaled_image,
int width,
int height,
int new_width,
int new_height,
double scale_factor_w,
double scale_factor_h)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    /* Ignore work-items outside the output image (possible when the host
     * rounds the global work size up). */
    if (x >= new_width || y >= new_height)
    {
        return;
    }

    /* Position of this output pixel in source coordinates. */
    double original_x = x / scale_factor_w;
    double original_y = y / scale_factor_h;

    /* (x1, y1) is the first tap: one pixel before floor(position). */
    int x1 = (int)(floor(original_x)) - 1;
    int y1 = (int)(floor(original_y)) - 1;

    /* Fractional part of the source position, in [0, 1). */
    double dx = original_x - x1 - 1;
    double dy = original_y - y1 - 1;

    /* Per-axis weights and clamped tap coordinates, computed once per
     * work-item. Tap k sits at x1 + k = floor(original_x) - 1 + k, so its
     * signed distance to the sample position is dx + 1 - k (using dx - k
     * would shift the whole image by one pixel and break the weight sum). */
    double weight_x[4];
    double weight_y[4];
    int tap_x[4];
    int tap_y[4];
    for (int k = 0; k < 4; ++k)
    {
        weight_x[k] = bicubic_weight(dx + 1 - k);
        weight_y[k] = bicubic_weight(dy + 1 - k);
        tap_x[k] = min(max(x1 + k, 0), width - 1);
        tap_y[k] = min(max(y1 + k, 0), height - 1);
    }

    double interpolated_pixel[NUM_CHANNELS] = {0};
    for (int j = 0; j < 4; ++j)
    {
        for (int i = 0; i < 4; ++i)
        {
            double weight = weight_x[i] * weight_y[j];
            int base = (tap_y[j] * width + tap_x[i]) * NUM_CHANNELS;
            for (int c = 0; c < NUM_CHANNELS; ++c)
            {
                interpolated_pixel[c] += weight * original_image[base + c];
            }
        }
    }

    for (int c = 0; c < NUM_CHANNELS; ++c)
    {
        /* Round to nearest before clamping to [0, 255]; plain truncation
         * would turn e.g. 254.9999 (floating-point rounding of a constant
         * 255 area) into 254. */
        int value = (int)floor(interpolated_pixel[c] + 0.5);
        scaled_image[(y * new_width + x) * NUM_CHANNELS + c] = min(max(value, 0), 255);
    }
}
三.结果
四.完整代码工程
下载链接:https://download.csdn.net/download/qq_44895902/89593035