https://github.com/Keylost/BilinearImageResize
YOLOv3中处理一张1080P的图片,resize到输入416*416尺寸,调用内部接口做cpu resize,可能80%~90%的时间耗在图像解码、resize上,对比推理时间耗时严重。尝试用cuda做外部resize。
修改下工程用于Ubuntu16.04,1080ti显卡,提供个包其中需要cmakelist修改下opencv路径。
https://pan.baidu.com/s/10RC1Lvxt4FFg5bsbrtnX8w
resizeGPU.cu
#include "resizeGPU.cuh"
//#define _DEBUG
#define BLOCK_DIM 64
#define threadNum 1024
#define WARP_SIZE 32
#define elemsPerThread 1
int32_t* deviceDataResized; //отмасштабированное изображение в памяти GPU
int32_t* deviceData; //оригинальное изображение в памяти GPU
int32_t* hostOriginalImage;
int32_t* hostResizedImage;
void reAllocPinned(int w, int h, int w2, int h2, int32_t* dataSource)
{
cudaMallocHost((void**)&hostOriginalImage, w*h* sizeof(int32_t)); // host pinned
cudaMallocHost((void**)&hostResizedImage, w2*h2 * sizeof(int32_t)); // host pinned
memcpy(hostOriginalImage, dataSource, w*h * sizeof(int32_t));
return;
}
void freePinned()
{
cudaFreeHost(hostOriginalImage);
cudaFreeHost(hostResizedImage);
return;
}
void initGPU(const int maxResolutionX, const int maxResolutionY)
{
cudaMalloc((void**)&deviceDataResized, maxResolutionX*maxResolutionY * sizeof(int32_t));
cudaMalloc((void**)&deviceData, maxResolutionX*maxResolutionY * sizeof(int32_t));
return;
}
void deinitGPU()
{
cudaFree(deviceData);
cudaFree(deviceDataResized);
return;
}
__global__ void SomeKernel(int32_t* originalImage, int32_t* resizedImage, int w, int h, int w2, int h2/*, float x_ratio, float y_ratio*/)
{
__shared__ int32_t tile[1024];
const float x_ratio = ((float)(w - 1)) / w2;
const float y_ratio = ((float)(h - 1)) / h2;
//const int blockbx = blockIdx.y * w2 + blockIdx.x*BLOCK_DIM;
//unsigned int threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x;
unsigned int threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x*elemsPerThread;
//__shared__ float result[threadNum*elemsPerThread];
unsigned int shift = 0;
//int32_t a, b, c, d, x, y, index;
while((threadId < w2*h2 && shift<elemsPerThread))
{
const int32_t i = threadId / w2;
const int32_t j = threadId - (i*w2);
//float x_diff, y_diff, blue, red, green;
const int32_t x = (int)(x_ratio * j);
const int32_t y = (int)(y_ratio * i);
const float x_diff = (x_ratio * j) - x;
const float y_diff = (y_ratio * i) - y;
const int32_t index = (y*w + x);
const int32_t a = originalImage[index];
const int32_t b = originalImage[index + 1];
const int32_t c = originalImage[index + w];
const int32_t d = originalImage[index + w + 1];
// blue element
// Yb = Ab(1-w)(1-h) + Bb(w)(1-h) + Cb(h)(1-w) + Db(wh)
const float blue = (a & 0xff)*(1 - x_diff)*(1 - y_diff) + (b & 0xff)*(x_diff)*(1 - y_diff) +
(c & 0xff)*(y_diff)*(1 - x_diff) + (d & 0xff)*(x_diff*y_diff);
// green element
// Yg = Ag(1-w)(1-h) + Bg(w)(1-h) + Cg(h)(1-w) + Dg(wh)
const float green = ((a >> 8) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 8) & 0xff)*(x_diff)*(1 - y_diff) +
((c >> 8) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 8) & 0xff)*(x_diff*y_diff);
// red element
// Yr = Ar(1-w)(1-h) + Br(w)(1-h) + Cr(h)(1-w) + Dr(wh)
const float red = ((a >> 16) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 16) & 0xff)*(x_diff)*(1 - y_diff) +
((c >> 16) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 16) & 0xff)*(x_diff*y_diff);
/*
resizedImage[threadId] =
0xff000000 |
((((int32_t)red) << 16) & 0xff0000) |
((((int32_t)green) << 8) & 0xff00) |
((int32_t)blue);
*/
tile[threadIdx.x] =
0xff000000 |
((((int32_t)red) << 16) & 0xff0000) |
((((int32_t)green) << 8) & 0xff00) |
((int32_t)blue);
threadId++;
//threadId+= WARP_SIZE;
shift++;
}
__syncthreads();
threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x*elemsPerThread;
resizedImage[threadId] = tile[threadIdx.x];
/*
shift--;
threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x*elemsPerThread+ shift;
while (shift >= 0)
{
resizedImage[threadId] = tile[shift];
shift--;
threadId--;
}
*/
}
int32_t* resizeBilinear_gpu(int w, int h, int w2, int h2)
{
#ifdef _DEBUG
cudaError_t error; //store cuda error codes
#endif
int length = w2 * h2;
// Копирование исходных данных в GPU для обработки
cudaMemcpy(deviceData, hostOriginalImage, w*h * sizeof(int32_t), cudaMemcpyHostToDevice);
//cudaMemcpy2D(deviceData, w * sizeof(int32_t), hostOriginalImage, w * sizeof(int32_t), w * sizeof(int32_t), h, cudaMemcpyHostToDevice);
//error = cudaMemcpyToSymbol(deviceData, pixels, w*h * sizeof(int32_t),0, cudaMemcpyHostToDevice);
#ifdef _DEBUG
if (error != cudaSuccess)
{
printf("cudaMemcpy (pixels->deviceData), returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
exit(EXIT_FAILURE);
}
#endif
dim3 threads = dim3(threadNum, 1,1); //block size 32,32,x
dim3 blocks = dim3(w2*h2/ threadNum*elemsPerThread, 1,1);
//printf("Blockdim.x %d\n", blocks.x);
//printf("thrdim.x %d\n", threads.x);
// Запуск ядра из (length / 256) блоков по 256 потоков,
// предполагая, что length кратно 256
SomeKernel << <blocks, threads >> >(deviceData, deviceDataResized, w, h, w2, h2/*, x_ratio, y_ratio*/);
cudaDeviceSynchronize();
// Считывание результата из GPU
cudaMemcpy(hostResizedImage, deviceDataResized, length * sizeof(int32_t), cudaMemcpyDeviceToHost);
return hostResizedImage;
}
converter.cpp
#include "converter.hpp"
int32_t* cvtMat2Int32(const cv::Mat& srcImage)
{
int32_t *result = new int32_t[srcImage.cols*srcImage.rows];
int offset = 0;
for (int i = 0; i<srcImage.cols*srcImage.rows * 3; i += 3)
{
int32_t blue = srcImage.data[i];
int32_t green = srcImage.data[i + 1];
int32_t red = srcImage.data[i + 2];
result[offset++] =
0xff000000 |
((((int32_t)red) << 16) & 0xff0000) |
((((int32_t)green) << 8) & 0xff00) |
((int32_t)blue);
}
return result;
}
void cvtInt322Mat(int32_t *pxArray, cv::Mat& outImage)
{
int offset = 0;
for (int i = 0; i<outImage.cols*outImage.rows * 3; i += 3)
{
int32_t a = pxArray[offset++];
int32_t blue = a & 0xff;
int32_t green = ((a >> 8) & 0xff);
int32_t red = ((a >> 16) & 0xff);
outImage.data[i] = blue;
outImage.data[i + 1] = green;
outImage.data[i + 2] = red;
}
return;
}
resizeCPU.cpp
#include "resizeCPU.hpp"
int* resizeBilinear_cpu(int32_t* pixels, int w, int h, int w2, int h2)
{
int32_t* temp = new int32_t[w2*h2];
int32_t a, b, c, d, x, y, index;
float x_ratio = ((float)(w - 1)) / w2;
float y_ratio = ((float)(h - 1)) / h2;
float x_diff, y_diff, blue, red, green;
int offset = 0;
for (int i = 0; i<h2; i++)
{
for (int j = 0; j<w2; j++)
{
x = (int)(x_ratio * j);
y = (int)(y_ratio * i);
x_diff = (x_ratio * j) - x;
y_diff = (y_ratio * i) - y;
index = (y*w + x);
a = pixels[index];
b = pixels[index + 1];
c = pixels[index + w];
d = pixels[index + w + 1];
// blue element
// Yb = Ab(1-w)(1-h) + Bb(w)(1-h) + Cb(h)(1-w) + Db(wh)
blue = (a & 0xff)*(1 - x_diff)*(1 - y_diff) + (b & 0xff)*(x_diff)*(1 - y_diff) +
(c & 0xff)*(y_diff)*(1 - x_diff) + (d & 0xff)*(x_diff*y_diff);
// green element
// Yg = Ag(1-w)(1-h) + Bg(w)(1-h) + Cg(h)(1-w) + Dg(wh)
green = ((a >> 8) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 8) & 0xff)*(x_diff)*(1 - y_diff) +
((c >> 8) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 8) & 0xff)*(x_diff*y_diff);
// red element
// Yr = Ar(1-w)(1-h) + Br(w)(1-h) + Cr(h)(1-w) + Dr(wh)
red = ((a >> 16) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 16) & 0xff)*(x_diff)*(1 - y_diff) +
((c >> 16) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 16) & 0xff)*(x_diff*y_diff);
temp[offset++] =
0xff000000 |
((((int32_t)red) << 16) & 0xff0000) |
((((int32_t)green) << 8) & 0xff00) |
((int32_t)blue);
}
}
return temp;
}
对比下结果,在1080ti下,resize 1080P图片到416*416尺寸,cuda resize 1.6ms,cpu resize 3.8ms,darknet内部接口cpu resize 8.0ms。cpu resize相比darknet resize 接口主要是移位操作有提速,cuda resize处理时间减少很多,但是需要做数据类型Mat与Int32相互转换。