预处理优化——cuda bilinear resize

最新推荐文章于 2023-07-09 21:40:57 发布

Gotta-C

最新推荐文章于 2023-07-09 21:40:57 发布

阅读量4.8k

点赞数 3

分类专栏：预处理优化

本文链接：https://blog.csdn.net/cgt19910923/article/details/86536184

版权

预处理优化专栏收录该内容

3 篇文章 0 订阅

订阅专栏

https://github.com/Keylost/BilinearImageResize

YOLOv3中处理一张1080P的图片，resize到输入416*416尺寸，调用内部接口做cpu resize，可能80%~90%的时间耗在图像解码、resize上，对比推理时间耗时严重。尝试用cuda做外部resize。

修改下工程用于Ubuntu16.04,1080ti显卡，提供个包其中需要cmakelist修改下opencv路径。

https://pan.baidu.com/s/10RC1Lvxt4FFg5bsbrtnX8w

resizeGPU.cu


#include "resizeGPU.cuh"
//#define _DEBUG

#define BLOCK_DIM 64
#define threadNum 1024
#define WARP_SIZE 32
#define elemsPerThread 1

int32_t* deviceDataResized; //отмасштабированное изображение в памяти GPU
int32_t* deviceData; //оригинальное изображение в памяти GPU
int32_t* hostOriginalImage;
int32_t* hostResizedImage;

void reAllocPinned(int w, int h, int w2, int h2, int32_t* dataSource)
{
	cudaMallocHost((void**)&hostOriginalImage, w*h* sizeof(int32_t)); // host pinned
	cudaMallocHost((void**)&hostResizedImage, w2*h2 * sizeof(int32_t)); // host pinned
	memcpy(hostOriginalImage, dataSource, w*h * sizeof(int32_t));

	return;
}

void freePinned()
{
	cudaFreeHost(hostOriginalImage);
	cudaFreeHost(hostResizedImage);

	return;
}

void initGPU(const int maxResolutionX, const int maxResolutionY)
{
	cudaMalloc((void**)&deviceDataResized, maxResolutionX*maxResolutionY * sizeof(int32_t));
	cudaMalloc((void**)&deviceData, maxResolutionX*maxResolutionY * sizeof(int32_t));

	return;
}

void deinitGPU()
{
	cudaFree(deviceData);
	cudaFree(deviceDataResized);

	return;
}

__global__ void SomeKernel(int32_t* originalImage, int32_t* resizedImage, int w, int h, int w2, int h2/*, float x_ratio, float y_ratio*/)
{
	__shared__ int32_t tile[1024];
	const float x_ratio = ((float)(w - 1)) / w2;
	const float y_ratio = ((float)(h - 1)) / h2;
	//const int blockbx = blockIdx.y * w2 + blockIdx.x*BLOCK_DIM;
	//unsigned int threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x;
	unsigned int threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x*elemsPerThread;
	//__shared__ float result[threadNum*elemsPerThread];
	unsigned int shift = 0;
	//int32_t a, b, c, d, x, y, index;
	while((threadId < w2*h2 && shift<elemsPerThread))
	{
		const int32_t i = threadId / w2;
		const int32_t j = threadId - (i*w2);
		//float x_diff, y_diff, blue, red, green;
		
		const int32_t x = (int)(x_ratio * j);
		const int32_t y = (int)(y_ratio * i);
		const float x_diff = (x_ratio * j) - x;
		const float y_diff = (y_ratio * i) - y;
		const int32_t index = (y*w + x);
		const int32_t a = originalImage[index];
		const int32_t b = originalImage[index + 1];
		const int32_t c = originalImage[index + w];
		const int32_t d = originalImage[index + w + 1];
		// blue element
		// Yb = Ab(1-w)(1-h) + Bb(w)(1-h) + Cb(h)(1-w) + Db(wh)
		const float blue = (a & 0xff)*(1 - x_diff)*(1 - y_diff) + (b & 0xff)*(x_diff)*(1 - y_diff) +
			(c & 0xff)*(y_diff)*(1 - x_diff) + (d & 0xff)*(x_diff*y_diff);

		// green element
		// Yg = Ag(1-w)(1-h) + Bg(w)(1-h) + Cg(h)(1-w) + Dg(wh)
		const float green = ((a >> 8) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 8) & 0xff)*(x_diff)*(1 - y_diff) +
			((c >> 8) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 8) & 0xff)*(x_diff*y_diff);

		// red element
		// Yr = Ar(1-w)(1-h) + Br(w)(1-h) + Cr(h)(1-w) + Dr(wh)
		const float red = ((a >> 16) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 16) & 0xff)*(x_diff)*(1 - y_diff) +
			((c >> 16) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 16) & 0xff)*(x_diff*y_diff);

		/*
		resizedImage[threadId] =
			0xff000000 |
			((((int32_t)red) << 16) & 0xff0000) |
			((((int32_t)green) << 8) & 0xff00) |
			((int32_t)blue);
		*/
		tile[threadIdx.x] =
			0xff000000 |
			((((int32_t)red) << 16) & 0xff0000) |
			((((int32_t)green) << 8) & 0xff00) |
			((int32_t)blue);

		threadId++;
		//threadId+= WARP_SIZE;
		shift++;
	}
	
	__syncthreads();
	threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x*elemsPerThread;
	resizedImage[threadId] = tile[threadIdx.x];
	/*
	shift--;
	threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x*elemsPerThread+ shift;

	while (shift >= 0)
	{
		resizedImage[threadId] = tile[shift];
		shift--;
		threadId--;
	}
	*/
}



int32_t* resizeBilinear_gpu(int w, int h, int w2, int h2)
{
#ifdef _DEBUG
	cudaError_t error; //store cuda error codes
#endif
	int length = w2 * h2;

	// Копирование исходных данных в GPU для обработки
	cudaMemcpy(deviceData, hostOriginalImage, w*h * sizeof(int32_t), cudaMemcpyHostToDevice);
	//cudaMemcpy2D(deviceData, w * sizeof(int32_t), hostOriginalImage, w * sizeof(int32_t), w * sizeof(int32_t), h, cudaMemcpyHostToDevice);
	//error = cudaMemcpyToSymbol(deviceData, pixels, w*h * sizeof(int32_t),0, cudaMemcpyHostToDevice);
#ifdef _DEBUG
	if (error != cudaSuccess)
	{
		printf("cudaMemcpy (pixels->deviceData), returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
		exit(EXIT_FAILURE);
	}
#endif

	dim3 threads = dim3(threadNum, 1,1); //block size 32,32,x
	dim3 blocks = dim3(w2*h2/ threadNum*elemsPerThread, 1,1);
	//printf("Blockdim.x %d\n", blocks.x);
	//printf("thrdim.x %d\n", threads.x);

	// Запуск ядра из (length / 256) блоков по 256 потоков,
	// предполагая, что length кратно 256
	SomeKernel << <blocks, threads >> >(deviceData, deviceDataResized, w, h, w2, h2/*, x_ratio, y_ratio*/);


	cudaDeviceSynchronize();
	// Считывание результата из GPU
	cudaMemcpy(hostResizedImage, deviceDataResized, length * sizeof(int32_t), cudaMemcpyDeviceToHost);

	return hostResizedImage;
}

converter.cpp

#include "converter.hpp"

int32_t* cvtMat2Int32(const cv::Mat& srcImage)
{
	int32_t *result = new int32_t[srcImage.cols*srcImage.rows];
	int offset = 0;

	for (int i = 0; i<srcImage.cols*srcImage.rows * 3; i += 3)
	{
		int32_t blue = srcImage.data[i];
		int32_t green = srcImage.data[i + 1];
		int32_t red = srcImage.data[i + 2];
		result[offset++] =
			0xff000000 |
			((((int32_t)red) << 16) & 0xff0000) |
			((((int32_t)green) << 8) & 0xff00) |
			((int32_t)blue);
	}

	return result;
}

void cvtInt322Mat(int32_t *pxArray, cv::Mat& outImage)
{
	int offset = 0;
	for (int i = 0; i<outImage.cols*outImage.rows * 3; i += 3)
	{
		int32_t a = pxArray[offset++];
		int32_t blue = a & 0xff;
		int32_t green = ((a >> 8) & 0xff);
		int32_t red = ((a >> 16) & 0xff);
		outImage.data[i] = blue;
		outImage.data[i + 1] = green;
		outImage.data[i + 2] = red;
	}
	return;
}

resizeCPU.cpp


#include "resizeCPU.hpp"

int* resizeBilinear_cpu(int32_t* pixels, int w, int h, int w2, int h2)
{
	int32_t* temp = new int32_t[w2*h2];
	int32_t a, b, c, d, x, y, index;
	float x_ratio = ((float)(w - 1)) / w2;
	float y_ratio = ((float)(h - 1)) / h2;
	float x_diff, y_diff, blue, red, green;
	int offset = 0;
	for (int i = 0; i<h2; i++)
	{
		for (int j = 0; j<w2; j++)
		{
			x = (int)(x_ratio * j);
			y = (int)(y_ratio * i);
			x_diff = (x_ratio * j) - x;
			y_diff = (y_ratio * i) - y;
			index = (y*w + x);
			a = pixels[index];
			b = pixels[index + 1];
			c = pixels[index + w];
			d = pixels[index + w + 1];

			// blue element
			// Yb = Ab(1-w)(1-h) + Bb(w)(1-h) + Cb(h)(1-w) + Db(wh)
			blue = (a & 0xff)*(1 - x_diff)*(1 - y_diff) + (b & 0xff)*(x_diff)*(1 - y_diff) +
				(c & 0xff)*(y_diff)*(1 - x_diff) + (d & 0xff)*(x_diff*y_diff);

			// green element
			// Yg = Ag(1-w)(1-h) + Bg(w)(1-h) + Cg(h)(1-w) + Dg(wh)
			green = ((a >> 8) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 8) & 0xff)*(x_diff)*(1 - y_diff) +
				((c >> 8) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 8) & 0xff)*(x_diff*y_diff);

			// red element
			// Yr = Ar(1-w)(1-h) + Br(w)(1-h) + Cr(h)(1-w) + Dr(wh)
			red = ((a >> 16) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 16) & 0xff)*(x_diff)*(1 - y_diff) +
				((c >> 16) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 16) & 0xff)*(x_diff*y_diff);

			temp[offset++] =
				0xff000000 |
				((((int32_t)red) << 16) & 0xff0000) |
				((((int32_t)green) << 8) & 0xff00) |
				((int32_t)blue);
		}
	}
	return temp;
}

对比下结果，在1080ti下，resize 1080P图片到416*416尺寸，cuda resize 1.6ms，cpu resize 3.8ms，darknet内部接口cpu resize 8.0ms。cpu resize相比darknet resize 接口主要是移位操作有提速，cuda resize处理时间减少很多，但是需要做数据类型Mat与Int32相互转换。

Gotta-C

关注

3
点赞
踩
11

收藏

觉得还不错? 一键收藏
4
评论
预处理优化——cuda bilinear resize

https://github.com/Keylost/BilinearImageResizeYOLOv3中处理一张1080P的图片，resize到输入416*416尺寸，调用内部接口做cpu resize，可能80%~90%的时间耗在图像解码、resize上，对比推理时间耗时严重。尝试用cuda做外部resize。修改下工程用于Ubuntu16.04,1080ti显卡，提供个包其中需要cm...
复制链接

扫一扫

专栏目录