1. Principle
1.1 The projection relationship between destination-image pixels and the source image
To construct the destination image with bilinear interpolation, each pixel of the destination image is first projected onto the source image, and its value is then computed from the four surrounding data points in the source image.
- Geometric alignment
[tip] Both the source image and the destination image take the top-left corner as their origin (0, 0). The projection of each destination pixel onto the source image is then computed with the following formula:
   src_x = dst_x * (src_width / dst_width)
   src_y = dst_y * (src_height / dst_height)
However, this formula has a problem: it leaves the center of the destination image misaligned with the center of the source image. In fact, a point at coordinate (h, w) is a pixel, i.e. a unit square of side 1, so its center actually sits at (h + 0.5, w + 0.5). Aligning pixel centers therefore means requiring src_x + 0.5 = (dst_x + 0.5) * (src_width / dst_width).
Rearranging gives:
   src_x = (dst_x + 0.5) * (src_width / dst_width) - 0.5
   src_y = (dst_y + 0.5) * (src_height / dst_height) - 0.5
- Computing the positions of the four neighboring points in the source image
The left (smaller) neighbor coordinate is the projected position rounded down to the nearest integer (floor).
The right (larger) neighbor coordinate is the left one plus 1; since coordinates are counted from 0, the right coordinate must satisfy right <= (source width or height - 1). A short sketch of this step follows below.
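A minimal Python sketch of the projection and neighbor lookup described above (the helper name project and its arguments are illustrative, not part of the original code):

import math

def project(dst_x, src_w, dst_w):
    fx = (dst_x + 0.5) * src_w / dst_w - 0.5   # center-aligned projection onto the source axis
    fx = min(max(fx, 0.0), src_w - 1)          # keep the projection inside the source image
    x0 = min(int(math.floor(fx)), src_w - 2)   # left neighbor, so that x1 = x0 + 1 <= src_w - 1
    x1 = x0 + 1
    return fx, x0, x1

print(project(0, src_w=4, dst_w=5))  # (0.0, 0, 1): the raw projection -0.1 is clamped to 0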
1.2 Computing the value at the projected point
As shown in the figure, the goal of bilinear interpolation is to compute the value at the green point given the known red data points.
Here Q11, Q12, Q21, Q22 are points in the source image (with Qij located at (xi, yj)), and P is the projection of a destination pixel onto the source image.
Using one-dimensional linear interpolation, the problem can be solved in two steps:
first perform two linear interpolations along the horizontal axis (to obtain the values of the two blue points R0 and R1), then perform one linear interpolation along the vertical axis (using those two values to obtain the value at P).
The concrete formulas are as follows:
1. Two linear interpolations along the horizontal axis:
   f(R0) = (x2 - x) / (x2 - x1) * f(Q11) + (x - x1) / (x2 - x1) * f(Q21),  where R0 = (x, y1)
   f(R1) = (x2 - x) / (x2 - x1) * f(Q12) + (x - x1) / (x2 - x1) * f(Q22),  where R1 = (x, y2)
2. One linear interpolation along the vertical axis:
   f(P) = (y2 - y) / (y2 - y1) * f(R0) + (y - y1) / (y2 - y1) * f(R1)
3. Combining the formulas above:
   f(x, y) = [ f(Q11) * (x2 - x) * (y2 - y) + f(Q21) * (x - x1) * (y2 - y)
             + f(Q12) * (x2 - x) * (y - y1) + f(Q22) * (x - x1) * (y - y1) ] / [ (x2 - x1) * (y2 - y1) ]
Since the four Q points are the immediate neighbors of P, both (x2 - x1) and (y2 - y1) equal 1, so the denominator is 1 and the expression simplifies to:
   f(x, y) = f(Q11) * (x2 - x) * (y2 - y) + f(Q21) * (x - x1) * (y2 - y)
           + f(Q12) * (x2 - x) * (y - y1) + f(Q22) * (x - x1) * (y - y1)
Letting u = x - x1 and v = y - y1, we get:
   f(x, y) = f(Q11) * (1 - u) * (1 - v) + f(Q21) * u * (1 - v)
           + f(Q12) * (1 - u) * v + f(Q22) * u * v
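As a quick numerical check of the simplified formula (the values below are made up for illustration): take f(Q11) = 10, f(Q21) = 20, f(Q12) = 30, f(Q22) = 40 with offsets u = 0.25 and v = 0.5.

Q11, Q21, Q12, Q22 = 10.0, 20.0, 30.0, 40.0
u, v = 0.25, 0.5
P = Q11 * (1 - u) * (1 - v) + Q21 * u * (1 - v) + Q12 * (1 - u) * v + Q22 * u * v
print(P)  # 22.5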
This completes the introduction to the principle of bilinear interpolation.
2. Python implementation
import numpy as np


def bilinear_interpolate(src, dst_size):
    height_src, width_src, channel_src = src.shape  # (h, w, ch)
    height_dst, width_dst = dst_size  # (h, w)
    # Center alignment: project the destination image's rows and columns onto the source image
    ws_p = np.array([(i + 0.5) / width_dst * width_src - 0.5 for i in range(width_dst)], dtype=np.float32)
    hs_p = np.array([(i + 0.5) / height_dst * height_src - 0.5 for i in range(height_dst)], dtype=np.float32)
    # Clamp the projected coordinates to the source image; experiments show this is
    # needed to match torch's output
    ws_p = np.clip(ws_p, 0, width_src - 1)
    hs_p = np.clip(hs_p, 0, height_src - 1)
    # Left neighbor of each projected point along the horizontal axis.
    # w_0 ranges over 0 ~ (width_src - 2) because w_1 = w_0 + 1
    ws_0 = np.clip(np.floor(ws_p), 0, width_src - 2).astype(np.int32)
    # Top neighbor of each projected point along the vertical axis.
    # h_0 ranges over 0 ~ (height_src - 2) because h_1 = h_0 + 1
    hs_0 = np.clip(np.floor(hs_p), 0, height_src - 2).astype(np.int32)
    # Compute the value of each destination pixel:
    # f(h, w) = f(h_0, w_0) * (1 - u) * (1 - v)
    #         + f(h_0, w_1) * (1 - u) * v
    #         + f(h_1, w_0) * u * (1 - v)
    #         + f(h_1, w_1) * u * v
    dst = np.zeros(shape=(height_dst, width_dst, channel_src), dtype=np.float32)
    us = hs_p - hs_0
    vs = ws_p - ws_0
    _1_us = 1 - us
    _1_vs = 1 - vs
    for h in range(height_dst):
        h_0, h_1 = hs_0[h], hs_0[h] + 1  # source-image coordinates
        for w in range(width_dst):
            w_0, w_1 = ws_0[w], ws_0[w] + 1  # source-image coordinates
            for c in range(channel_src):
                dst[h][w][c] = src[h_0][w_0][c] * _1_us[h] * _1_vs[w] \
                             + src[h_0][w_1][c] * _1_us[h] * vs[w] \
                             + src[h_1][w_0][c] * us[h] * _1_vs[w] \
                             + src[h_1][w_1][c] * us[h] * vs[w]
    return dst


if __name__ == '__main__':
    src = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]])
    src = np.expand_dims(src, axis=2)
    dst = bilinear_interpolate(src, dst_size=(5, 5))
    print(dst[:, :, 0])
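As a quick cross-check (an optional sketch, assuming PyTorch is available): with align_corners=False, torch.nn.functional.interpolate uses the same center-aligned projection, so its bilinear output should match the values printed above.

import numpy as np
import torch
import torch.nn.functional as F

src = np.expand_dims(np.arange(1, 17, dtype=np.float32).reshape(4, 4), axis=2)
x = torch.from_numpy(src).permute(2, 0, 1).unsqueeze(0)  # (1, ch, h, w)
ref = F.interpolate(x, size=(5, 5), mode='bilinear', align_corners=False)
print(ref[0, 0])  # compare with dst[:, :, 0]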
3. CUDA implementation
kernel.cu
#include "kernel.h"
#include "stdio.h"
#define threadNum 32
float* deviceDataResized;
float* deviceData;
float* hostOriginalImage;
float* hostResizedImage;
void reAllocPinned(unsigned int lengthSrc, unsigned int lengthResize, float* dataSource)
{
cudaMallocHost((void**)&hostOriginalImage, lengthSrc); // host pinned
cudaMallocHost((void**)&hostResizedImage, lengthResize); // host pinned
memcpy(hostOriginalImage, dataSource, lengthSrc*sizeof(float));
}
void freePinned()
{
cudaFreeHost(hostOriginalImage);
cudaFreeHost(hostResizedImage);
}
void initGPU(int w, int h, int c){
cudaMalloc((void**)&deviceDataResized, w*h*c*sizeof(float));
cudaMalloc((void**)&deviceData, w*h*c*sizeof(float));
}
void deinitGPU(){
cudaFree(deviceDataResized);
cudaFree(deviceData);
}
#define clip(x, a, b) x >= a ? (x <= b ? x : b) : a;
__global__ void resizeBilinear_kernel(float* src, float* out,int w, int h, int c, int w2, int h2)
{
int numberPixel = w2 * h2;
float x_ratio = float(w)/w2;
float y_ratio = float(h)/h2;
int threadId = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for(int i=threadId; i < numberPixel; i+=stride)
{
// resize后的坐标值
int dy = i / w2;
int dx = i % w2;
// printf("dx: %d | dy: %d \n", dx, dy);
// 投影到原图的坐标值
float fx = (dx + 0.5) * x_ratio - 0.5;
float fy = (dy + 0.5) * y_ratio - 0.5;
fx = clip(fx, 0, w-1);
fy = clip(fy, 0, h-1);
// w0 和 h0
int ix = floor(fx);
int iy = floor(fy);
ix = clip(ix, 0, w-2);
iy = clip(iy, 0, h-2);
float u = fy - iy;
float v = fx - ix;
// printf("v: %f", v);
float _1_us = (1.f - u); // 1 - u
float _1_vs = (1.f - v); // 1 - v
for (int k = 0; k < c; ++k)
{
out[dy*w2 + dx + k*numberPixel] = src[iy*w + ix + k*w*h] * _1_us * _1_vs + // w[0] h[0]
src[(iy+1)*w + ix + k*w*h] * u * _1_vs + // w[0] h[1]
src[iy*w + (ix+1) + k*w*h] * v * _1_us + // w[1] h[0]
src[(iy+1)*w + (ix+1) + k*w*h] * u * v; // w[1] h[1]
}
}
}
float* resizeBilinear_gpu(int w, int h, int c, int w2, int h2)
{
cudaError_t error; //store cuda error codes
int length = w2 * h2;
error = cudaMemcpy(deviceData, hostOriginalImage, w*h*c*sizeof(float), cudaMemcpyHostToDevice);
if (error != cudaSuccess)
{
printf("cudaMemcpy (pixels->deviceData), returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__);
exit(EXIT_FAILURE);
}
dim3 blockSize(threadNum, 1, 1); //block size 32,32,x
dim3 gridSize(32, 1, 1);
printf("gridDim.x %d\n", gridSize.x);
printf("BlockDim.x %d\n", blockSize.x);
resizeBilinear_kernel <<<gridSize, blockSize >>>(deviceData, deviceDataResized, w, h, c, w2, h2);
cudaDeviceSynchronize();
cudaMemcpy(hostResizedImage, deviceDataResized, length*c* sizeof(float), cudaMemcpyDeviceToHost);
return hostResizedImage;
}
cudaError_t resizeBilinear(float* origin, float* resized, int w, int h, int c, int w2, int h2)
{
// 定义kernel的执行配置
dim3 blockSize(32);
// dim3 gridSize((N + blockSize.x - 1) / blockSize.x);
dim3 gridSize(32);
// 执行kernel
printf("perform resize operation . \n");
resizeBilinear_kernel <<< gridSize, blockSize >>>(origin, resized, w, h, c, w2, h2);
return cudaPeekAtLastError();
}
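Note that the kernel above reads and writes the images in channel-planar (CHW) layout: channel k of pixel (y, x) in an image of width w and height h sits at flat index k*w*h + y*w + x. A minimal Python sketch of this index math, for illustration only:

def chw_index(k, y, x, w, h):
    # flat offset of channel k at pixel (y, x) in a CHW buffer
    return k * w * h + y * w + x

print(chw_index(0, 1, 2, w=4, h=4))  # 6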
kernel.h
#ifndef __KERNEL_H_
#define __KERNEL_H_
#include "NvInfer.h"
#include <cuda_runtime.h>
float* resizeBilinear_gpu(int w, int h, int c, int w2, int h2);
cudaError_t resizeBilinear(float* origin, float* resized, int w, int h, int c, int w2, int h2);
void reAllocPinned(unsigned int lengthSrc, unsigned int lengthResize, float* dataSource);
void freePinned();
void initGPU(int w, int h, int c);
void deinitGPU();
#endif
test.cpp (test code)
#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>
#include <stdlib.h>
#include <device_launch_parameters.h>
#include <math.h>
#include "stdio.h"
#include "kernel.h"

int main()
{
    std::cout << "device information about cuda : " << std::endl;
    std::cout << std::endl;
    /* ==================================================================================== */
    int dev = 0;
    cudaDeviceProp devProp;
    cudaGetDeviceProperties(&devProp, dev);
    std::cout << " Using GPU device " << dev << ": " << devProp.name << std::endl;
    std::cout << " Number of SMs: " << devProp.multiProcessorCount << std::endl;
    std::cout << " Shared memory per block: " << devProp.sharedMemPerBlock / 1024.0 << " KB" << std::endl;
    std::cout << " Max threads per block: " << devProp.maxThreadsPerBlock << std::endl;
    std::cout << " Max threads per SM: " << devProp.maxThreadsPerMultiProcessor << std::endl;
    std::cout << " Max warps per SM: " << devProp.maxThreadsPerMultiProcessor / 32 << std::endl;
    /* ===================================================================================== */
    std::cout << std::endl;
    /* ======================= resize ================ */
    float* data = nullptr;
    float* dataGpu = nullptr;
    nvinfer1::Dims3 src(1, 112, 112);  // (c, h, w)
    nvinfer1::Dims3 dst(1, 448, 448);  // (c, h, w)
    unsigned int lengthSrc = src.d[0] * src.d[1] * src.d[2] * sizeof(float);  // bytes
    unsigned int lengthDst = dst.d[0] * dst.d[1] * dst.d[2] * sizeof(float);  // bytes
    data = (float*)malloc(lengthSrc);
    for (int i = 0; i < 112*112; i++)
    {
        data[i] = i + 1.f;
    }
    for (int i = 0; i < 9; i++)
    {
        printf("%d | %f \n", i, data[i]);
    }
    reAllocPinned(lengthSrc, lengthDst, data);
    initGPU(dst.d[2], dst.d[1], dst.d[0]);
    dataGpu = resizeBilinear_gpu(src.d[2], src.d[1], src.d[0], dst.d[2], dst.d[1]);
    // print one row (row 446) of the resized image
    for (int i = 448*446; i < 448*447; i++)
    {
        if (i % 16 == 0)
            printf("\n");
        printf("%f ", dataGpu[i]);
    }
    printf("\n");
    deinitGPU();
    freePinned();
    free(data);
    return 0;
}
4. Usage in common libraries
4.1 OpenCV's resize function
cv2.resize(src, dsize[, dst[, fx[, fy[, interpolation]]]])
- dsize: output image size
- fx, fy: scale factors along the x and y axes
- interpolation: interpolation method
  - INTER_NEAREST: nearest-neighbor interpolation
  - INTER_LINEAR: bilinear interpolation
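A minimal usage sketch (assuming the cv2 Python bindings are installed): resize a 112x112 array to 448x448 with bilinear interpolation; note that dsize is given as (width, height).

import cv2
import numpy as np

img = np.arange(112 * 112, dtype=np.float32).reshape(112, 112)
out = cv2.resize(img, (448, 448), interpolation=cv2.INTER_LINEAR)  # dsize = (width, height)
print(out.shape)  # (448, 448)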
4.2 PyTorch's upsample function
torch.nn.functional.upsample(input, size=None, scale_factor=None,
mode='nearest', align_corners=None)
- size: output size
- scale_factor: multiplier for the spatial size
- mode: interpolation mode
  - nearest: nearest-neighbor interpolation
  - linear: linear interpolation
  - bilinear: bilinear interpolation
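A minimal usage sketch; note that in recent PyTorch versions torch.nn.functional.upsample is deprecated in favor of torch.nn.functional.interpolate, which takes the same arguments.

import torch
import torch.nn.functional as F

x = torch.arange(16, dtype=torch.float32).reshape(1, 1, 4, 4)  # (N, C, H, W)
y = F.interpolate(x, size=(8, 8), mode='bilinear', align_corners=False)
print(y.shape)  # torch.Size([1, 1, 8, 8])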