CUDA实现模型推理前处理，Resize和Padding

最新推荐文章于 2024-07-14 13:39:50 发布

ccWepo~

最新推荐文章于 2024-07-14 13:39:50 发布

阅读量261

点赞数 8

文章标签：图像处理 c++ 人工智能

本文链接：https://blog.csdn.net/qq_40645782/article/details/140269814

版权

CUDA实现模型推理前处理，Resize和Padding

介绍
- 1.线性插值Resize
- 2.Padding边缘填充
程序

介绍

1.线性插值Resize

这里参考了Opencv中双线性插值的写法，具体可以看cv::resize的函数
ps：这里试过手动实现线性/双线性插值写法，但是出来效果没有Opencv的好，差异点在于，Opencv的写法是做了一个像素值的放大计算，具体可以看看源代码，效果确实会好很多。

2.Padding边缘填充

主要是通过在Resize中对目标像素位置的判断来填充，所以前提是先根据推理模型的输入大小计算出left、right、top、bottom的值（这里推荐top和left是0，即只在右侧和下方填充，这样后处理就不用再做坐标偏移了）。

程序

下面是程序代码，几点要注意一下：

1.对于dim3的grid和block输入，这里都只用x一个维度（一维线程索引，每个线程处理目标图的一个像素）。
2.默认图片输入格式是RGB。
3.输入数据按{h,w,3}交错排列（HWC），输出数据排列格式是{3,h,w}（三个通道平面依次存放）。
PS：我是为了模型输入写的，数据最后我是做归一化的，但是这里为了通用调试使用没有改啊，需要就自行改一下吧。

// Maps one destination coordinate (already relative to the resized image,
// i.e. with the padding offset removed) to the two source taps used for
// linear interpolation along one axis, plus their fixed-point weights.
//
//   rel_pos  destination coordinate inside the resized region
//   scale    source length / resized length along this axis
//   src_len  number of source pixels along this axis (must be >= 1)
//   tap_lo / tap_hi  output: source indices of the two taps, both in
//                    [0, src_len - 1]
//   w_lo / w_hi      output: weights of the taps, w_lo + w_hi == 2048
__device__ __forceinline__ void resize_padding_linear_taps(
	int rel_pos, float scale, int src_len,
	int& tap_lo, int& tap_hi, int& w_lo, int& w_hi)
{
	// Pixel-centre alignment: centre of destination pixel -> source coordinate.
	float f = ((float)rel_pos + 0.5f) * scale - 0.5f;

	// floorf (not a plain int cast): a cast truncates toward zero, which
	// leaves a negative fraction for f in (-1, 0) and yields a weight > 2048
	// paired with a negative one.
	int s = (int)floorf(f);
	f -= (float)s;

	// Before the first pixel: replicate the first pixel.
	if (s < 0) {
		s = 0;
		f = 0.f;
	}
	// At or past the last pixel: replicate the last pixel.
	if (s >= src_len - 1) {
		s = src_len - 1;
		f = 0.f;
	}

	tap_lo = s;
	// Clamp the upper tap so the read stays in range at the far edge and
	// when src_len == 1; its weight is 0 in those cases.
	tap_hi = min(s + 1, src_len - 1);

	// 11-bit fixed-point weights; computing w_hi as the complement keeps the
	// sum at exactly 2048.
	w_lo = (int)((1.f - f) * 2048.f);
	w_hi = 2048 - w_lo;
}

// Bilinear resize of an interleaved 3-channel 8-bit image into a padded,
// planar 3-channel destination.
//
// Launch layout: 1-D grid and 1-D block (only blockIdx.x / blockDim.x /
// threadIdx.x are read). Each thread produces one destination pixel; the
// launch must provide at least dst_width * dst_height threads. Surplus
// threads exit without touching memory. No shared memory is used.
//
//   src         source image, rows of src_width pixels, 3 interleaved bytes
//               per pixel (row stride src_width * 3)
//   dst         destination, three consecutive planes of
//               dst_width * dst_height bytes each
//   dst_width, dst_height   size of the padded destination
//   src_width, src_height   size of the source image
//   resize_w, resize_h      size the source is scaled to before padding
//   pad_left, pad_right, pad_top, pad_bottom
//               border widths; destination pixels inside a border get `val`
//   val         fill value for border pixels (stored as unsigned char)
//   RGB2BGR     when true, destination plane k is read from source
//               channel 2 - k; otherwise from source channel k
__global__ void resize_padding_rgb_device_kernel_from_opencv(unsigned char* src,unsigned char* dst,
	int dst_width,int dst_height,int src_width,int src_height,
	int resize_w,int resize_h,
	int pad_left,int pad_right,int pad_top,int pad_bottom,
	int val=114,bool RGB2BGR=true)
{
	if (dst_width <= 0 || dst_height <= 0) {
		return;
	}

	// Flat index of the destination pixel handled by this thread.
	const int dex = blockIdx.x * blockDim.x + threadIdx.x;

	// Tail guard: the grid rarely divides the image evenly, and a thread past
	// the end would otherwise write outside the destination planes.
	if (dex >= dst_width * dst_height) {
		return;
	}

	// Destination pixel coordinates.
	const int dst_y = dex / dst_width;
	const int dst_x = dex % dst_width;

	const int stepSrc = src_width * 3;          // bytes per source row
	const int plane_size = dst_width * dst_height;
	unsigned char* const dst_pixel = dst + dst_y * dst_width + dst_x;

	// Padding pass: border pixels receive the constant fill value in all
	// three planes.
	if (dst_x < pad_left || dst_x >= dst_width - pad_right ||
		dst_y < pad_top || dst_y >= dst_height - pad_bottom) {
		for (int k = 0; k < 3; ++k) {
			dst_pixel[k * plane_size] = (unsigned char)val;
		}
		return;
	}

	// Nothing to sample from, or a degenerate scale: leave the pixel as is.
	if (src_width <= 0 || src_height <= 0 || resize_w <= 0 || resize_h <= 0) {
		return;
	}

	const float scalex = (float)src_width / (float)resize_w;
	const float scaley = (float)src_height / (float)resize_h;

	int y0, y1, wy0, wy1;
	resize_padding_linear_taps(dst_y - pad_top, scaley, src_height, y0, y1, wy0, wy1);

	int x0, x1, wx0, wx1;
	resize_padding_linear_taps(dst_x - pad_left, scalex, src_width, x0, x1, wx0, wx1);

	const unsigned char* const row0 = src + y0 * stepSrc;
	const unsigned char* const row1 = src + y1 * stepSrc;

	// Bilinear blend in fixed point. The four weight products sum to
	// 2048 * 2048 == 1 << 22, hence the final shift.
	for (int k = 0; k < 3; ++k) {
		const int src_k = RGB2BGR ? (2 - k) : k;
		const int acc =
			row0[3 * x0 + src_k] * wx0 * wy0 +
			row1[3 * x0 + src_k] * wx0 * wy1 +
			row0[3 * x1 + src_k] * wx1 * wy0 +
			row1[3 * x1 + src_k] * wx1 * wy1;
		dst_pixel[k * plane_size] = (unsigned char)(acc >> 22);
	}
}