双线性插值法c语言,双线性插值算法ARM NEON优化

C语言版本双线性插值算法

/**
 * Bilinear interpolation between four corner samples.
 *
 * Corner layout implied by the weights:
 *   v11 at (x = 0, y = 0)    v12 at (x = 0, y = 1)
 *   v21 at (x = 1, y = 0)    v22 at (x = 1, y = 1)
 *
 * @param x    weight along the v1x -> v2x direction
 * @param y    weight along the vx1 -> vx2 direction
 * @param v11  sample weighted by (1 - x) * (1 - y)
 * @param v12  sample weighted by (1 - x) * y
 * @param v21  sample weighted by x * (1 - y)
 * @param v22  sample weighted by x * y
 * @return the interpolated value; weights outside [0, 1] extrapolate.
 *
 * Declared `static inline`: in C99/C11 a plain `inline` definition does not
 * emit an external symbol, so a call that is not inlined would fail to link.
 */
static inline double bilinear_interp(double x, double y, double v11, double v12,
                                     double v21, double v22)
{
    /* Interpolate along y on each side first, then blend the two along x. */
    const double side_x0 = v11 * (1 - y) + v12 * y;
    const double side_x1 = v21 * (1 - y) + v22 * y;

    return side_x0 * (1 - x) + side_x1 * x;
}

使用ARM NEON优化后的双线性插值版本

/* Widen four u16 lanes to u32 and convert them to f32. */
static inline float32x4_t bilinear_u16x4_to_f32(uint16x4_t v)
{
    return vcvtq_f32_u32(vmovl_u16(v));
}

/*
 * Bilinearly blend four lanes of the four neighbour vectors in f32 and
 * return the result rounded to nearest and saturated to u16.
 */
static inline uint16x4_t bilinear_blend4_f32(uint16x4_t v11, uint16x4_t v12,
                                             uint16x4_t v21, uint16x4_t v22,
                                             float32_t fx, float32_t fy)
{
    const float32_t one_fx = 1 - fx;
    const float32_t one_fy = 1 - fy;

    /* Interpolate along y on the x = 0 side (v11, v12). */
    const float32x4_t side_x0 =
        vaddq_f32(vmulq_n_f32(bilinear_u16x4_to_f32(v11), one_fy),
                  vmulq_n_f32(bilinear_u16x4_to_f32(v12), fy));

    /* Interpolate along y on the x = 1 side (v21, v22). */
    const float32x4_t side_x1 =
        vaddq_f32(vmulq_n_f32(bilinear_u16x4_to_f32(v21), one_fy),
                  vmulq_n_f32(bilinear_u16x4_to_f32(v22), fy));

    /* Blend the two sides along x. */
    const float32x4_t blended =
        vaddq_f32(vmulq_n_f32(side_x0, one_fx), vmulq_n_f32(side_x1, fx));

    /*
     * vcvtq_u32_f32 truncates toward zero, so add 0.5 first to round to
     * nearest. Without this, float rounding error can turn a value that
     * should be exactly N into N - 1, and results are biased low.
     */
    const uint32x4_t rounded =
        vcvtq_u32_f32(vaddq_f32(blended, vdupq_n_f32(0.5f)));

    return vqmovn_u32(rounded);
}

/**
 * Bilinear interpolation of eight 8-bit pixels at once using NEON,
 * computed in single-precision floating point.
 *
 * All eight lanes share the same weights. Per lane the result is
 *   (v11 * (1 - y) + v12 * y) * (1 - x) + (v21 * (1 - y) + v22 * y) * x
 * rounded to the nearest integer and saturated to 0..255.
 *
 * @param x    weight of the v21/v22 side (narrowed to float32_t)
 * @param y    weight of the v12/v22 side (narrowed to float32_t)
 * @param v11  eight samples weighted by (1 - x) * (1 - y)
 * @param v12  eight samples weighted by (1 - x) * y
 * @param v21  eight samples weighted by x * (1 - y)
 * @param v22  eight samples weighted by x * y
 * @return eight interpolated 8-bit samples.
 *
 * Declared `static inline`: in C99/C11 a plain `inline` definition does not
 * emit an external symbol, so a call that is not inlined would fail to link.
 */
static inline uint8x8_t bilinear_interp_NEON(double x, double y, uint8x8_t v11, uint8x8_t v12,
                                             uint8x8_t v21, uint8x8_t v22)
{
    const float32_t fx = (float32_t)x;
    const float32_t fy = (float32_t)y;

    /* Widen every input to 16 bits so it can be split into two 4-lane halves. */
    const uint16x8_t v11_16 = vmovl_u8(v11);
    const uint16x8_t v12_16 = vmovl_u8(v12);
    const uint16x8_t v21_16 = vmovl_u8(v21);
    const uint16x8_t v22_16 = vmovl_u8(v22);

    /* A float32x4_t holds four lanes, so process lanes 0-3 and 4-7 separately. */
    const uint16x4_t result_low =
        bilinear_blend4_f32(vget_low_u16(v11_16), vget_low_u16(v12_16),
                            vget_low_u16(v21_16), vget_low_u16(v22_16), fx, fy);
    const uint16x4_t result_high =
        bilinear_blend4_f32(vget_high_u16(v11_16), vget_high_u16(v12_16),
                            vget_high_u16(v21_16), vget_high_u16(v22_16), fx, fy);

    /* Rejoin the halves and narrow back to 8 bits with saturation. */
    return vqmovn_u16(vcombine_u16(result_low, result_high));
}

使用ARM NEON后,一次可以处理8个像素,成倍提高了运行的速度。实践中需要特别注意对边界的处理(行的开始和结尾处)。

到这里我们还不能满足。要追求更快!!!

注意,上面的代码虽然使用了ARM NEON,但其中的运算仍然是浮点运算。因此还可以继续采用浮点数定点化的方式进行优化:先把权重 x、y 乘以 4096(即 2^12)转换成整数,经过两次乘法后结果被放大了 2^24 倍,最后再右移 24 位还原。优化后的代码如下:

/* Fixed-point scale used for the interpolation weights: 1.0 == 4096 (2^12). */
enum { BILINEAR_Q12_ONE = 4096 };

/*
 * Bilinearly blend four lanes of the four neighbour vectors using Q12
 * integer weights and return the result scaled back to pixel range.
 */
static inline uint16x4_t bilinear_blend4_q12(uint16x4_t v11, uint16x4_t v12,
                                             uint16x4_t v21, uint16x4_t v22,
                                             uint32_t wx, uint32_t one_wx,
                                             uint32_t wy, uint32_t one_wy)
{
    /* Interpolate along y on the x = 0 side (v11, v12); scaled by 2^12. */
    const uint32x4_t side_x0 = vaddq_u32(vmulq_n_u32(vmovl_u16(v11), one_wy),
                                         vmulq_n_u32(vmovl_u16(v12), wy));

    /* Interpolate along y on the x = 1 side (v21, v22); scaled by 2^12. */
    const uint32x4_t side_x1 = vaddq_u32(vmulq_n_u32(vmovl_u16(v21), one_wy),
                                         vmulq_n_u32(vmovl_u16(v22), wy));

    /*
     * Blend along x; the sum is now scaled by 2^24. With 8-bit samples and
     * weights in [0, 4096] it is at most 255 * 2^24, which fits in 32 bits.
     */
    const uint32x4_t scaled = vaddq_u32(vmulq_n_u32(side_x0, one_wx),
                                        vmulq_n_u32(side_x1, wx));

    /*
     * Remove the 2^24 scale in two steps: a narrowing shift right by 16 bits
     * (truncating), then a rounding shift right by the remaining 8 bits.
     */
    const uint16x4_t narrowed = vshrn_n_u32(scaled, 16);

    return vrshr_n_u16(narrowed, 8);
}

/**
 * Bilinear interpolation of eight 8-bit pixels at once using NEON and
 * integer (Q12 fixed-point) arithmetic instead of floating point.
 *
 * All eight lanes share the same weights. The weights are converted to
 * integers by multiplying by 4096 and truncating; the complements are
 * 4096 minus those integers.
 *
 * @param x    weight of the v21/v22 side
 * @param y    weight of the v12/v22 side
 * @param v11  eight samples weighted by (1 - x) * (1 - y)
 * @param v12  eight samples weighted by (1 - x) * y
 * @param v21  eight samples weighted by x * (1 - y)
 * @param v22  eight samples weighted by x * y
 * @return eight interpolated 8-bit samples.
 *
 * NOTE: x and y must lie in [0, 1]. A negative value makes the
 * double-to-unsigned conversion undefined, and a value above 1 makes the
 * unsigned complement wrap around.
 *
 * Declared `static inline`: in C99/C11 a plain `inline` definition does not
 * emit an external symbol, so a call that is not inlined would fail to link.
 */
static inline uint8x8_t bilinear_interp_NEON_FixedPoint(double x, double y, uint8x8_t v11, uint8x8_t v12,
                                                        uint8x8_t v21, uint8x8_t v22)
{
    /* Quantise the weights to Q12 (truncating) and derive their complements. */
    const unsigned int int_x = x * BILINEAR_Q12_ONE;
    const unsigned int int_y = y * BILINEAR_Q12_ONE;
    const uint32_t wx = (uint32_t)int_x;
    const uint32_t wy = (uint32_t)int_y;
    const uint32_t one_wx = (uint32_t)(BILINEAR_Q12_ONE - int_x);
    const uint32_t one_wy = (uint32_t)(BILINEAR_Q12_ONE - int_y);

    /* Widen every input to 16 bits so it can be split into two 4-lane halves. */
    const uint16x8_t v11_16 = vmovl_u8(v11);
    const uint16x8_t v12_16 = vmovl_u8(v12);
    const uint16x8_t v21_16 = vmovl_u8(v21);
    const uint16x8_t v22_16 = vmovl_u8(v22);

    /* The 32-bit intermediates hold four lanes, so process each half in turn. */
    const uint16x4_t result_low =
        bilinear_blend4_q12(vget_low_u16(v11_16), vget_low_u16(v12_16),
                            vget_low_u16(v21_16), vget_low_u16(v22_16),
                            wx, one_wx, wy, one_wy);
    const uint16x4_t result_high =
        bilinear_blend4_q12(vget_high_u16(v11_16), vget_high_u16(v12_16),
                            vget_high_u16(v21_16), vget_high_u16(v22_16),
                            wx, one_wx, wy, one_wy);

    /* Rejoin the halves and narrow back to 8 bits with saturation. */
    return vqmovn_u16(vcombine_u16(result_low, result_high));
}

加入浮点数定点化的优化之后,运行速度还能再提升一倍左右。

  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值