关闭

双线性插值算法ARM NEON优化

标签: 双线性插值算法ARM-NEON优化
1268人阅读 评论(0) 收藏 举报
分类:

C语言版本双线性插值算法

/**
 * Bilinear interpolation of four corner samples.
 *
 * v11 and v12 are blended with weight y, v21 and v22 are blended with
 * weight y, and the two intermediate values are then blended with weight x.
 *
 * @param x    weight given to the (v21, v22) pair; (v11, v12) gets 1 - x
 * @param y    weight given to v12 and v22; v11 and v21 get 1 - y
 * @param v11  value returned when x == 0 and y == 0
 * @param v12  value returned when x == 0 and y == 1
 * @param v21  value returned when x == 1 and y == 0
 * @param v22  value returned when x == 1 and y == 1
 * @return the interpolated value
 *
 * Declared `static inline`: in C99/C11 a plain `inline` definition does not
 * provide an external definition, so every call the compiler decides not to
 * inline would otherwise end up as an undefined reference at link time.
 */
static inline double bilinear_interp(double x, double y, double v11, double v12,
                                     double v21, double v22)
{
    const double along_y_at_x0 = v11 * (1 - y) + v12 * y;
    const double along_y_at_x1 = v21 * (1 - y) + v22 * y;

    return along_y_at_x0 * (1 - x) + along_y_at_x1 * x;
}

使用ARM NEON优化后的双线性插值版本

/* Converts the four low u16 lanes of v to f32 (u16 -> u32 -> f32). */
static inline float32x4_t bilinear_neon_f32_low(uint16x8_t v)
{
    return vcvtq_f32_u32(vmovl_u16(vget_low_u16(v)));
}

/* Converts the four high u16 lanes of v to f32 (u16 -> u32 -> f32). */
static inline float32x4_t bilinear_neon_f32_high(uint16x8_t v)
{
    return vcvtq_f32_u32(vmovl_u16(vget_high_u16(v)));
}

/*
 * Blends four lanes of the four corners in single precision and narrows the
 * result to u16 lanes.
 */
static inline uint16x4_t bilinear_neon_f32_blend4(float32x4_t v11, float32x4_t v12,
                                                  float32x4_t v21, float32x4_t v22,
                                                  float32_t fx, float32_t fy)
{
    const float32_t one_fx = 1 - fx;
    const float32_t one_fy = 1 - fy;

    /* (v11 * (1 - y) + v12 * y) * (1 - x) */
    const float32x4_t side_x0 = vmulq_n_f32(
        vaddq_f32(vmulq_n_f32(v11, one_fy), vmulq_n_f32(v12, fy)), one_fx);

    /* (v21 * (1 - y) + v22 * y) * x */
    const float32x4_t side_x1 = vmulq_n_f32(
        vaddq_f32(vmulq_n_f32(v21, one_fy), vmulq_n_f32(v22, fy)), fx);

    /*
     * The float -> u32 conversion truncates toward zero (it does not round
     * to nearest); the narrowing to u16 saturates.
     */
    return vqmovn_u32(vcvtq_u32_f32(vaddq_f32(side_x0, side_x1)));
}

/**
 * Bilinear interpolation of eight 8-bit samples at once, using NEON
 * single-precision arithmetic.
 *
 * Each lane i of the result is
 *   (v11[i] * (1 - y) + v12[i] * y) * (1 - x) + (v21[i] * (1 - y) + v22[i] * y) * x
 * evaluated in float, truncated to an unsigned integer and saturated to 8 bits.
 * The same x and y weights are applied to all eight lanes.
 *
 * @param x    weight given to the (v21, v22) pair; converted to float32
 * @param y    weight given to v12 and v22; converted to float32
 * @param v11  eight samples of the first corner
 * @param v12  eight samples of the second corner
 * @param v21  eight samples of the third corner
 * @param v22  eight samples of the fourth corner
 * @return eight interpolated 8-bit samples
 *
 * Declared `static inline`: in C99/C11 a plain `inline` definition does not
 * provide an external definition, so calls that are not inlined would fail
 * to link.
 */
static inline uint8x8_t bilinear_interp_NEON(double x, double y, uint8x8_t v11, uint8x8_t v12,
                                             uint8x8_t v21, uint8x8_t v22)
{
    const float32_t fx = (float32_t)x;
    const float32_t fy = (float32_t)y;

    /* Widen every corner to 16 bits once; both halves are taken from these. */
    const uint16x8_t w11 = vmovl_u8(v11);
    const uint16x8_t w12 = vmovl_u8(v12);
    const uint16x8_t w21 = vmovl_u8(v21);
    const uint16x8_t w22 = vmovl_u8(v22);

    /* Lanes 0..3 */
    const uint16x4_t out_low = bilinear_neon_f32_blend4(
        bilinear_neon_f32_low(w11), bilinear_neon_f32_low(w12),
        bilinear_neon_f32_low(w21), bilinear_neon_f32_low(w22), fx, fy);

    /* Lanes 4..7 */
    const uint16x4_t out_high = bilinear_neon_f32_blend4(
        bilinear_neon_f32_high(w11), bilinear_neon_f32_high(w12),
        bilinear_neon_f32_high(w21), bilinear_neon_f32_high(w22), fx, fy);

    /* Saturating narrow back to eight 8-bit lanes. */
    return vqmovn_u16(vcombine_u16(out_low, out_high));
}

使用ARM NEON后,一次可以处理8个像素,成倍提高了运行的速度。实践中需要特别注意对边界的处理(行的开始和结尾处)。
到这里我们还不能满足。要追求更快!!!
注意上面的代码中虽然用了ARM NEON,但是在ARM 指令集的操作中用到了浮点操作。所以,还可以继续使用浮点数定点化的优化方式,优化后的代码如下:

/* Widens the four low u16 lanes of v to u32. */
static inline uint32x4_t bilinear_neon_u32_low(uint16x8_t v)
{
    return vmovl_u16(vget_low_u16(v));
}

/* Widens the four high u16 lanes of v to u32. */
static inline uint32x4_t bilinear_neon_u32_high(uint16x8_t v)
{
    return vmovl_u16(vget_high_u16(v));
}

/*
 * Blends four lanes of the four corners with 12-bit fixed-point weights and
 * scales the result back down to integer range, returned in u16 lanes.
 */
static inline uint16x4_t bilinear_neon_q12_blend4(uint32x4_t v11, uint32x4_t v12,
                                                  uint32x4_t v21, uint32x4_t v22,
                                                  uint32_t wx, uint32_t one_wx,
                                                  uint32_t wy, uint32_t one_wy)
{
    /* (v11 * (1 - y) + v12 * y) * (1 - x), carrying 24 fraction bits */
    const uint32x4_t side_x0 = vmulq_n_u32(
        vaddq_u32(vmulq_n_u32(v11, one_wy), vmulq_n_u32(v12, wy)), one_wx);

    /* (v21 * (1 - y) + v22 * y) * x, carrying 24 fraction bits */
    const uint32x4_t side_x1 = vmulq_n_u32(
        vaddq_u32(vmulq_n_u32(v21, one_wy), vmulq_n_u32(v22, wy)), wx);

    const uint32x4_t sum = vaddq_u32(side_x0, side_x1);

    /*
     * Drop the 24 fraction bits in two steps: a narrowing shift right by
     * 16 bits, then a rounding shift right by the remaining 8 bits.
     */
    return vrshr_n_u16(vshrn_n_u32(sum, 16), 8);
}

/**
 * Bilinear interpolation of eight 8-bit samples at once, using NEON integer
 * arithmetic with 12-bit fixed-point weights.
 *
 * x and y are scaled by 4096 and truncated to unsigned integers; the
 * complementary weights are 4096 minus those values. Each lane is computed as
 *   (v11 * (4096 - wy) + v12 * wy) * (4096 - wx) + (v21 * (4096 - wy) + v22 * wy) * wx
 * in 32 bits and then shifted right by 24 bits in total, rounding on the
 * final step. The same weights are applied to all eight lanes.
 *
 * NOTE(review): x and y are presumably expected to lie in [0, 1]. With the
 * scaled weights in [0, 4096] the largest intermediate is 255 * 4096 * 4096,
 * which still fits in 32 bits; a weight above 4096 makes the unsigned
 * `4096 - w` wrap around.
 *
 * @param x    weight given to the (v21, v22) pair
 * @param y    weight given to v12 and v22
 * @param v11  eight samples of the first corner
 * @param v12  eight samples of the second corner
 * @param v21  eight samples of the third corner
 * @param v22  eight samples of the fourth corner
 * @return eight interpolated 8-bit samples
 *
 * Declared `static inline`: in C99/C11 a plain `inline` definition does not
 * provide an external definition, so calls that are not inlined would fail
 * to link.
 */
static inline uint8x8_t bilinear_interp_NEON_FixedPoint(double x, double y, uint8x8_t v11, uint8x8_t v12,
                                                        uint8x8_t v21, uint8x8_t v22)
{
    /* 12-bit fixed-point weights: 4096 represents 1.0. */
    const uint32_t wx = (uint32_t)(x * 4096);
    const uint32_t wy = (uint32_t)(y * 4096);
    const uint32_t one_wx = 4096 - wx;
    const uint32_t one_wy = 4096 - wy;

    /* Widen every corner to 16 bits once; both halves are taken from these. */
    const uint16x8_t w11 = vmovl_u8(v11);
    const uint16x8_t w12 = vmovl_u8(v12);
    const uint16x8_t w21 = vmovl_u8(v21);
    const uint16x8_t w22 = vmovl_u8(v22);

    /* Lanes 0..3 */
    const uint16x4_t out_low = bilinear_neon_q12_blend4(
        bilinear_neon_u32_low(w11), bilinear_neon_u32_low(w12),
        bilinear_neon_u32_low(w21), bilinear_neon_u32_low(w22),
        wx, one_wx, wy, one_wy);

    /* Lanes 4..7 */
    const uint16x4_t out_high = bilinear_neon_q12_blend4(
        bilinear_neon_u32_high(w11), bilinear_neon_u32_high(w12),
        bilinear_neon_u32_high(w21), bilinear_neon_u32_high(w22),
        wx, one_wx, wy, one_wy);

    /* Saturating narrow back to eight 8-bit lanes. */
    return vqmovn_u16(vcombine_u16(out_low, out_high));
}

加入浮点数定点化优化之后,运行速度能进一步提升一倍左右。

0
0
查看评论
发表评论
* 以上用户言论只代表其个人观点,不代表CSDN网站的观点或立场

双线性插值算法ARM NEON优化

C语言版本双线性插值算法inline double bilinear_interp(double x, double y, double v11, double v12, ...
  • computerme
  • computerme
  • 2016-12-23 17:34
  • 1268

最近邻内插值与双线性插值

最近邻内插值与双线性插值的实现
  • zhangla1220
  • zhangla1220
  • 2014-11-11 17:36
  • 4373

数字图像缩放之最近邻插值与双线性插值处理效果对比

基本原理: 1、最近邻插值:变换后的目标图像某点像素值等于源图像中与变换前相应点最近的点的像素值。具体操作为,设水平方向和垂直方向缩放的比例分别为w和h,那么目标图像中的点des(x,y)对...
  • yi_tech_blog
  • yi_tech_blog
  • 2016-11-01 16:32
  • 1628

图像处理界双线性插值算法的优化

在图像处理中,双线性插值算法的使用频率相当高,比如在图像的缩放中,在所有的扭曲算法中,都可以利用该算法改进处理的视觉效果。首先,我们看看该算法的简介。      在数学上,双线性插值算法可以看成是两...
  • shangyaowei
  • shangyaowei
  • 2013-12-19 20:45
  • 438

图像处理界双线性插值算法的优化

在图像处理中,双线性插值算法的使用频率相当高,比如在图像的缩放中,在所有的扭曲算法中,都可以利用该算法改进处理的视觉效果。首先,我们看看该算法的简介。      在数学上,双线性插值算法可以看成是两...
  • shangyaowei
  • shangyaowei
  • 2013-12-19 20:48
  • 1388

图像处理界双线性插值算法的优化

在图像处理中,双线性插值算法的使用频率相当高,比如在图像的缩放中,在所有的扭曲算法中,都可以利用该算法改进处理的视觉效果。首先,我们看看该算法的简介。      在数学上,双线性插值算法可以看成是两...
  • shangyaowei
  • shangyaowei
  • 2013-12-19 20:45
  • 355

图像处理界双线性插值算法的优化

转自 http://www.cnblogs.com/Imageshop/archive/2011/11/12/2246808.html 在图像处理中,双线性插值算法的使用频率相当高,...
  • arau_sh
  • arau_sh
  • 2012-05-17 12:27
  • 730

C#二维数组双线性插值算法

  • 2014-03-16 11:59
  • 1KB
  • 下载

MFC实现的双线性插值算法

  • 2015-03-30 22:25
  • 882KB
  • 下载

双线性插值算法及实现.

  • 2009-10-31 11:23
  • 1.83MB
  • 下载
    个人资料
    • 访问:182622次
    • 积分:2760
    • 等级:
    • 排名:第15013名
    • 原创:86篇
    • 转载:6篇
    • 译文:0篇
    • 评论:73条
    文章分类
    最新评论