本文使用OpenCL的C++绑定,实现了在batch size>1、多路图像输入的情况下,利用GPU(OpenCL)完成图像格式转换(NV12转RGB),并将转换后的多张图像保存为本地JPG文件。
首先,文件读取和保存接口如下:
1.data_io.hpp
#ifndef DATA_IO_HPP_
#define DATA_IO_HPP_
#include <opencv2/opencv.hpp>
#include <opencv2/highgui.hpp>
#include <iostream>
#include <string>
/**
 * Write a packed 8-bit, 3-channel RGB image to an image file using OpenCV.
 *
 * @param rgbdata  Pointer to img_h * img_w * 3 bytes of pixel data in
 *                 R,G,B channel order. The buffer is only read.
 * @param img_h    Image height in pixels (number of rows).
 * @param img_w    Image width in pixels (number of columns).
 * @param filename Destination path handed to cv::imwrite.
 */
void write_jpg_from_rgbdata_by_opencv(unsigned char *rgbdata,int img_h,int img_w,const std::string &filename);
#endif // DATA_IO_HPP_
2.data_io.cpp
#include "../include/data_io.hpp"
#include <fstream>
using namespace std;
using namespace cv;
void write_jpg_from_rgbdata_by_opencv(unsigned char *rgbdata,int img_h,int img_w,const std::string &filename)
{
cv::Mat src_Image =cv::Mat(img_h, img_w, CV_8UC3, (void*)rgbdata);
cv::Mat dst_Image(img_h, img_w, CV_8UC3);
cv::cvtColor(src_Image, dst_Image,cv::COLOR_RGB2BGR);
cv::imwrite(filename, dst_Image);
}
3.multi_deal.hpp
#ifndef _MULTI_DEAL_
#define _MULTI_DEAL_
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_TARGET_OPENCL_VERSION 120
#include <CL/cl2.hpp>
#include <CL/cl.hpp>
#include <string>
#include <fstream>
#include <streambuf>
#include <vector>
#include <cstdio>
/*
 * Pair of raw pixel buffers belonging to one frame of the batch.
 * The struct itself does not initialize, allocate or release either pointer;
 * that is done by the code that fills them in.
 */
struct Rawimage_t {
/* NV12 source data; copied into the batch buffer by Multi_deal::encode_Mat_memory. */
unsigned char *nv12data;
/* RGB result data; assigned a malloc'ed buffer by Multi_deal::decode_Mat_memory. */
unsigned char *rgbdata;
};
namespace deal
{
/**
 * Batched NV12 -> RGB converter running on an OpenCL device.
 *
 * Typical use: construct, set batch_size / input_w / input_h, pack the
 * frames with encode_Mat_memory(), run convert_image_to_multi() on the packed
 * buffer, then split the result with decode_Mat_memory().
 */
class Multi_deal
{
    private:
        /* OpenCL Context used in this class */
        cl::Context m_context;

        /* OpenCL device to use */
        cl::Device m_device;

        /* OpenCL command queue to use */
        cl::CommandQueue m_queue;

        /* Program containing the kernels */
        cl::Program m_program;

        /* Kernels */
        cl::Kernel m_transformation_kernel;

        /**
         * Prepare the device for conversion by creating the kernel object
         * ("pre_process") from the already built program.
         */
        void prepare_device(void);

    public:
        /**
         * Create the OpenCL context, queue and program for the first device
         * of the given type.
         * @param type OpenCL device type, e.g. CL_DEVICE_TYPE_GPU.
         */
        Multi_deal(cl_device_type type);

        /**
         * Run the conversion kernel on a packed batch of NV12 frames.
         * @param input_image Packed input of batch_size * input_size bytes.
         * @return Newly allocated buffer holding batch_size * output_size
         *         bytes of RGB data, or nullptr when input_image is null.
         */
        u_char* convert_image_to_multi(unsigned char* input_image);

        /**
         * Read a whole file in binary mode into a newly allocated buffer.
         * @param filename Path of the raw file.
         * @param buffer   Receives the pointer to the loaded bytes.
         * @return 0 on success.
         */
        int read_Image_from_raw_by_iostream(const std::string filename, unsigned char **buffer);

        /**
         * Pack the nv12data of the first batch_size images into one
         * contiguous buffer and update input_size / output_size.
         * @return Newly allocated packed buffer.
         */
        u_char* encode_Mat_memory(std::vector<Rawimage_t> &imgs) ;

        /**
         * Split a packed RGB result buffer into one rgbdata buffer per image.
         * @param buf  Packed RGB data for batch_size frames.
         * @param imgs Images whose rgbdata pointers are filled in.
         */
        void decode_Mat_memory(u_char* buf,std::vector<Rawimage_t> &imgs) ;

        /* Frame dimensions in pixels. Zero until set by the caller. */
        size_t input_w = 0, input_h = 0;
        /* Not referenced by the implementation visible in this file. */
        size_t output_w = 0, output_h = 0;
        /* Bytes per NV12 input frame / per RGB output frame; assigned by encode_Mat_memory(). */
        size_t input_size = 0, output_size = 0;
        /* Number of frames processed per call. Zero until set by the caller. */
        size_t batch_size = 0;
};
}
#endif
4.multi_deal.cpp
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include "../include/multi_deal.hpp"
namespace deal
{
/**
 * Load an OpenCL C source file and build it for one device.
 *
 * @param context Context the program is created in.
 * @param device  Device the program is built for.
 * @param file    Path of the kernel source file.
 * @return The built program.
 * @throws std::runtime_error if the file cannot be opened or the build
 *         fails; the build log is included in the message.
 */
static cl::Program build_from_file(cl::Context &context, cl::Device &device, const char* const file)
{
    if (file == nullptr) {
        throw std::runtime_error("build_from_file: no kernel source path given");
    }

    std::ifstream source_stream(file);
    if (!source_stream) {
        // Previously an unreadable file led to reserve(-1) on the string.
        throw std::runtime_error(std::string("build_from_file: cannot open ") + file);
    }

    // Slurp the whole file in one go.
    std::ostringstream contents;
    contents << source_stream.rdbuf();
    const std::string source = contents.str();

    cl::Program program(context, source);
    const cl_int status = program.build({device});
    if (status != CL_SUCCESS) {
        // Surface the compiler output; otherwise kernel errors stay invisible.
        const std::string log = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device);
        throw std::runtime_error(std::string("build_from_file: building ") + file +
                                 " failed (error " + std::to_string(status) + "):\n" + log);
    }
    return program;
}
/**
 * Set up the OpenCL objects for the requested device type: a context, the
 * first device listed by that context, a profiling-enabled command queue and
 * the program built from "kernel/nv12_to_rgb.cl". The kernel object is then
 * created via prepare_device().
 */
Multi_deal::Multi_deal(cl_device_type type)
    : m_context(type)
    , m_device(m_context.getInfo<CL_CONTEXT_DEVICES>()[0])
    , m_queue(m_context, m_device, CL_QUEUE_PROFILING_ENABLE)
    , m_program(build_from_file(m_context, m_device, "kernel/nv12_to_rgb.cl"))
{
    prepare_device();
}
/**
 * Read an entire file in binary mode into a newly allocated buffer.
 *
 * @param filename Path of the raw file to load.
 * @param buffer   Out-parameter; on success receives a buffer obtained from
 *                 malloc() that the caller must release with free(). Set to
 *                 nullptr on failure.
 * @return 0 on success, -1 on any failure (bad argument, open/size/read
 *         error, allocation failure).
 */
int Multi_deal::read_Image_from_raw_by_iostream(const std::string filename, unsigned char **buffer)
{
    if (buffer == nullptr) {
        std::cerr << "read failed: no output buffer for " << filename << std::endl;
        return -1;
    }
    *buffer = nullptr;

    // Binary mode is required so that no byte translation takes place.
    std::ifstream fin(filename, std::ios::binary);
    if (!fin) {
        std::cerr << "open failed: " << filename << std::endl;
        return -1;
    }

    // Seek to the end, ask for the position to learn the size, then rewind.
    fin.seekg(0, std::ios::end);
    const std::streamoff length = fin.tellg();
    fin.seekg(0, std::ios::beg);
    if (length <= 0) {
        std::cerr << "read failed: empty or unreadable file " << filename << std::endl;
        return -1;
    }
    std::cout << "file length: " << length << std::endl;

    // malloc (not new[]) so that the free() used by the caller is the matching release.
    u_char *data = static_cast<u_char *>(std::malloc(static_cast<size_t>(length)));
    if (data == nullptr) {
        std::cerr << "read failed: out of memory for " << filename << std::endl;
        return -1;
    }

    // Copy the whole stream into the buffer and make sure nothing was cut short.
    fin.read(reinterpret_cast<char *>(data), static_cast<std::streamsize>(length));
    if (fin.gcount() != static_cast<std::streamsize>(length)) {
        std::cerr << "read failed: short read on " << filename << std::endl;
        std::free(data);
        return -1;
    }

    *buffer = data;
    return 0;
}
void Multi_deal::decode_Mat_memory(u_char* buf,std::vector<Rawimage_t> &imgs)
{
size_t mem_size = input_h*input_w*3;
for (int i = 0; i < batch_size; i++)
{
imgs[i].rgbdata=(u_char*)malloc(output_size);
memcpy(imgs.at(i).rgbdata,buf+(mem_size * i),output_size);
}
}
/**
 * Pack the NV12 data of the first batch_size images into one contiguous
 * buffer, frame after frame.
 *
 * Side effect: assigns input_size (w*h*3/2 bytes) and output_size (w*h*3
 * bytes) from the current input_w / input_h.
 *
 * @param imgs Source images; nv12data of each is read for input_size bytes.
 * @return Buffer of batch_size * input_size bytes allocated with new[].
 */
u_char* Multi_deal:: encode_Mat_memory(std::vector<Rawimage_t> &imgs)
{
    const size_t pixel_count = input_h * input_w;
    input_size  = sizeof(u_char) * pixel_count * 3 / 2;
    output_size = sizeof(u_char) * pixel_count * 3;

    u_char *packed = new u_char[batch_size * input_size];

    // Walk a write cursor through the packed buffer, one frame per step.
    u_char *cursor = packed;
    for (size_t frame = 0; frame < batch_size; ++frame, cursor += input_size)
    {
        memcpy(cursor, imgs.at(frame).nv12data, input_size);
    }
    return packed;
}
void Multi_deal::prepare_device(void)
{
/* create kernels */
this->m_transformation_kernel = cl::Kernel(this->m_program, "pre_process");
}
/**
 * Round globalSize up to the next multiple of groupSize.
 *
 * @param groupSize  Work-group size along one dimension.
 * @param globalSize Number of items to cover along that dimension.
 * @return The smallest multiple of groupSize that is >= globalSize. When
 *         groupSize is not positive, globalSize is returned unchanged
 *         (avoids a modulo by zero).
 */
size_t RoundUp(int groupSize, int globalSize)
{
    if (groupSize <= 0)
    {
        return static_cast<size_t>(globalSize);
    }
    const int remainder = globalSize % groupSize;
    if (remainder == 0)
    {
        return static_cast<size_t>(globalSize);
    }
    return static_cast<size_t>(globalSize + groupSize - remainder);
}
u_char * Multi_deal::convert_image_to_multi(unsigned char* input_image)
{
size_t width=input_w;
size_t height=input_h;
size_t wg;
/* Make sure the image pointer is valid */
if(input_image == NULL)
{
fprintf(stderr, "Image data needs to be provided\n");
return nullptr;
}
//read_image
cl::Buffer nv12_buffer(this->m_context, CL_MEM_READ_WRITE, input_size*batch_size);
int ernum=this->m_queue.enqueueWriteBuffer(nv12_buffer, true, 0,input_size*batch_size, input_image);
cl::Buffer image_buffer(this->m_context, CL_MEM_READ_WRITE,output_size*batch_size);
/* Set arguments */
this->m_transformation_kernel.setArg<cl::Buffer>(0, nv12_buffer);
this->m_transformation_kernel.setArg<cl::Buffer>(1, image_buffer);
this->m_transformation_kernel.setArg<cl_uint>(2, (cl_uint)(input_w));
this->m_transformation_kernel.setArg<cl_uint>(3, (cl_uint)(input_h));
this->m_transformation_kernel.setArg<cl_uint>(4, (cl_uint)(input_size));
cl::Event evt;
size_t localWorkSize[3] = {batch_size, 16,16};
cl::NDRange ND1(localWorkSize[0], RoundUp(localWorkSize[1],width),RoundUp(localWorkSize[2],height) );//THIS global
cl::NDRange ND2(1, localWorkSize[1],localWorkSize[2]);
ernum=this->m_queue.enqueueNDRangeKernel(this->m_transformation_kernel, 0,ND1, ND2,NULL,&evt);
evt.wait();
cl_ulong time_start= (cl_ulong)0;
cl_ulong time_end= (cl_ulong)0;
evt.getProfilingInfo(CL_PROFILING_COMMAND_START,&time_start);
evt.getProfilingInfo(CL_PROFILING_COMMAND_END,&time_end);
double total_time =(double)( time_end - time_start);
printf("\n color_conversion time in milliseconds = %f ms\n", (total_time / 1000000.0) );
u_char * out_image= (u_char*)malloc(output_size*batch_size);
this->m_queue.enqueueReadBuffer(image_buffer, true, 0,output_size*batch_size, out_image);
return out_image;
}
}
5.main.cpp
#include <cstdio>
#include <cstdlib>
#include <cstdlib>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cmath>
#include <iomanip>
#include <sstream>
#include "../include/multi_deal.hpp"
#include "../include/data_io.hpp"
// Paths of the input NV12 raw files; main() reads raw[i] for each frame of the batch.
const char raw[7][100]={"data/raw/4.raw","data/raw/5.raw","data/raw/6.raw","data/raw/7.raw",
"data/raw/8.raw","data/raw/9.raw","data/raw/10.raw"};
// Paths of the output image files; main() writes the converted frame i to jpg[i].
const char jpg[7][100]={"data/res/4.jpg","data/res/5.jpg","data/res/6.jpg","data/res/7.jpg",
"data/res/8.jpg","data/res/9.jpg","data/res/10.jpg"};
//
// Main function
//
int main(int argc, char** argv)
{
/* Create the encoder */
deal::Multi_deal dealer(CL_DEVICE_TYPE_GPU);
dealer.batch_size=7;
dealer.input_w=3840;
dealer.input_h=2176;
/* Read input image */
Rawimage_t image[dealer.batch_size];
std::vector<Rawimage_t> input_img;
for(int i=0;i<dealer.batch_size;++i)
{
if(dealer.read_Image_from_raw_by_iostream(raw[i], &image[i].nv12data))
{
fprintf(stderr, "Error Reading input file\naborting...\n");
return 0x2;
}
input_img.push_back(image[i]);
}
//组合输入buffer
u_char * p=dealer.encode_Mat_memory(input_img);
//转换执行
u_char *out=dealer.convert_image_to_multi(p);
//decodebuffer
dealer.decode_Mat_memory(out,input_img);
for(int i=0;i<dealer.batch_size;++i)
{
write_jpg_from_rgbdata_by_opencv(input_img.at(i).rgbdata,
dealer.input_h,dealer.input_w,jpg[i]);
free(input_img.at(i).nv12data);
free(input_img.at(i).rgbdata);
}
return 0;
}
6.nv12_to_rgb.cl
/*
 * Convert a batch of NV12 frames to packed 8-bit RGB.
 *
 * One work-item handles one pixel: global id 0 selects the frame, global
 * ids 1 and 2 are the pixel column and row.
 *
 * input_Mat  : frames stored back to back, `stride` bytes apart; each frame
 *              is a full-resolution Y plane followed by interleaved U,V
 *              samples shared by 2x2 pixel blocks.
 * output_Mat : frames stored back to back, 3 bytes (R,G,B) per pixel.
 * input_w/h  : frame size in pixels.
 * stride     : size of one input frame in bytes.
 */
__kernel void pre_process(__global unsigned char *input_Mat,
                          __global unsigned char *output_Mat,
                          int input_w, int input_h,int stride){
    const int batch_index = get_global_id(0);
    const int x = get_global_id(1);
    const int y = get_global_id(2);

    /* The host may round the global size up past the image; skip those items. */
    if (x >= input_w || y >= input_h) {
        return;
    }

    /* Derived pointers must stay in the __global address space. */
    __global const unsigned char *ybase = input_Mat + (batch_index * stride);
    __global const unsigned char *uvbase = ybase + input_w * input_h;

    const float Y = (float)ybase[x + y * input_w];
    const int uv_index = (y / 2) * input_w + (x / 2) * 2;
    const float U = (float)uvbase[uv_index] - 128.0f;
    const float V = (float)uvbase[uv_index + 1] - 128.0f;

    const int index = (batch_index * input_w * input_h) + (y * input_w + x);

    /* Float literals avoid needing fp64; saturation keeps results in 0..255. */
    output_Mat[index * 3]     = convert_uchar_sat(Y + 1.402f * V);                  /* R */
    output_Mat[index * 3 + 1] = convert_uchar_sat(Y - 0.34413f * U - 0.71414f * V); /* G */
    output_Mat[index * 3 + 2] = convert_uchar_sat(Y + 1.772f * U);                  /* B */
}