利用neon技术对矩阵旋转进行加速(2)

本文介绍了如何使用NEON技术实现矩阵的顺时针180度和270度旋转。通过逆序排列矩阵的行元素及行本身来实现180度旋转;270度旋转则通过对NEON寄存器中的向量进行特殊转置操作完成。

上次介绍的是顺时针旋转90度,最近用到了180度和270度,在这里记录一下。

 

1.利用neon技术将矩阵顺时针旋转180度:

顺时针旋转180度比顺时针旋转90度容易很多,如下图

A1 A2 A3 A4                          D4 D3 D2 D1

B1 B2 B3 B4     顺时针旋转180度      C4 C3 C2 C1

C1 C2 C3 C4     ------------->       B4 B3 B2 B1

D1 D2 D3 D4                          A4 A3 A2 A1

 

其实就是先把矩阵每一行内部的元素逆序排列,再把各行的上下顺序整体颠倒(第一行变成最后一行,依此类推),代码如下:

/* Scalar path: rotate the source sub-rectangle rows [row0,row1) x
 * columns [col0,col1) by 180 degrees into its final place in dstImg. */
static void rotate180_scalar_rect(unsigned char *dstImg, const unsigned char *srcImg,
                                  int width, int height,
                                  int row0, int row1, int col0, int col1)
{
    for (int r = row0; r < row1; r++) {
        const unsigned char *src_row = srcImg + r * width;
        unsigned char *dst_row = dstImg + (height - 1 - r) * width;
        for (int c = col0; c < col1; c++) {
            dst_row[width - 1 - c] = src_row[c];
        }
    }
}

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
/* NEON path: rotate the complete 8x8 tile whose top-left source element is
 * (row i, column j). All eight rows are loaded before anything is stored,
 * exactly as in the straight-line version of this kernel. */
static void rotate180_tile8(unsigned char *dstImg, const unsigned char *srcImg,
                            int width, int height, int i, int j)
{
    const unsigned char *src = srcImg + i * width + j;
    /* Source row i lands on destination row height-1-i; later source rows
     * walk upwards from there. */
    unsigned char *dst = dstImg + (height - i - 1) * width + (width - j - 8);
    uint8x8_t rows[8];

    for (int k = 0; k < 8; k++) {
        /* vrev64_u8 reverses the 8 bytes of the row segment. */
        rows[k] = vrev64_u8(vld1_u8(src + k * width));
    }
    for (int k = 0; k < 8; k++) {
        vst1_u8(dst - k * width, rows[k]);
    }
}
#endif

/*
 * Rotate a width x height 8-bit matrix by 180 degrees.
 *
 * dstImg : output buffer of width*height bytes (same row length as the input).
 * srcImg : input buffer of width*height bytes, row-major, row stride == width.
 * width  : number of columns.
 * height : number of rows.
 *
 * Element (r, c) of the source ends up at (height-1-r, width-1-c).
 * Complete 8x8 tiles are processed with NEON when it is available; the
 * right/bottom remainders (and everything on non-NEON targets) use a scalar
 * loop, so width and height no longer have to be multiples of 8.
 * The two buffers are expected not to overlap. Non-positive dimensions are
 * a no-op.
 */
void rotate180(unsigned char* dstImg,unsigned char* srcImg,int width,int height)
{
    if (width <= 0 || height <= 0) {
        return;
    }

    /* Extent of the region covered by complete 8x8 tiles (0 without NEON). */
    int tiled_w = 0;
    int tiled_h = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    tiled_w = width - width % 8;
    tiled_h = height - height % 8;
    for (int i = 0; i < tiled_h; i += 8) {
        for (int j = 0; j < tiled_w; j += 8) {
            rotate180_tile8(dstImg, srcImg, width, height, i, j);
        }
    }
#endif

    /* Columns to the right of the tiled region, for the tiled rows. */
    rotate180_scalar_rect(dstImg, srcImg, width, height, 0, tiled_h, tiled_w, width);
    /* Rows below the tiled region, full width. */
    rotate180_scalar_rect(dstImg, srcImg, width, height, tiled_h, height, 0, width);
}

 

2.利用neon技术将矩阵顺时针旋转270度:

这个和顺时针旋转90度非常像(顺时针旋转270度等价于逆时针旋转90度,输出矩阵为width行、height列),只是在对NEON寄存器中的向量进行转置时不太一样,这点需要注意

/* Scalar path: rotate the source sub-rectangle rows [row0,row1) x
 * columns [col0,col1) by 270 degrees clockwise into its final place. */
static void rotate270_scalar_rect(unsigned char *dstImg, const unsigned char *srcImg,
                                  int width, int height,
                                  int row0, int row1, int col0, int col1)
{
    for (int r = row0; r < row1; r++) {
        const unsigned char *src_row = srcImg + r * width;
        for (int c = col0; c < col1; c++) {
            /* Source (r, c) -> destination row width-1-c, column r;
             * destination rows are `height` bytes long. */
            dstImg[(width - 1 - c) * height + r] = src_row[c];
        }
    }
}

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
/* NEON path: transpose the complete 8x8 tile whose top-left source element
 * is (row i, column j) and store its columns bottom-up in the destination. */
static void rotate270_tile8(unsigned char *dstImg, const unsigned char *srcImg,
                            int width, int height, int i, int j)
{
    const unsigned char *src = srcImg + i * width + j;

    /* Step 0: load the eight 8-byte row segments of the tile. */
    uint8x8_t r0 = vld1_u8(src);
    uint8x8_t r1 = vld1_u8(src + width);
    uint8x8_t r2 = vld1_u8(src + 2 * width);
    uint8x8_t r3 = vld1_u8(src + 3 * width);
    uint8x8_t r4 = vld1_u8(src + 4 * width);
    uint8x8_t r5 = vld1_u8(src + 5 * width);
    uint8x8_t r6 = vld1_u8(src + 6 * width);
    uint8x8_t r7 = vld1_u8(src + 7 * width);

    /* Step 1: byte-wise transpose of adjacent row pairs.
     * val[0] gathers even columns, val[1] odd columns. */
    uint8x8x2_t b01 = vtrn_u8(r0, r1);
    uint8x8x2_t b23 = vtrn_u8(r2, r3);
    uint8x8x2_t b45 = vtrn_u8(r4, r5);
    uint8x8x2_t b67 = vtrn_u8(r6, r7);

    /* Step 2: 16-bit transpose combines row pairs into groups of four rows.
     * even_*: val[0] = columns 0 and 4, val[1] = columns 2 and 6.
     * odd_* : val[0] = columns 1 and 5, val[1] = columns 3 and 7.
     * *_top covers rows 0-3, *_bot covers rows 4-7. */
    uint16x4x2_t even_top = vtrn_u16(vreinterpret_u16_u8(b01.val[0]),
                                     vreinterpret_u16_u8(b23.val[0]));
    uint16x4x2_t odd_top  = vtrn_u16(vreinterpret_u16_u8(b01.val[1]),
                                     vreinterpret_u16_u8(b23.val[1]));
    uint16x4x2_t even_bot = vtrn_u16(vreinterpret_u16_u8(b45.val[0]),
                                     vreinterpret_u16_u8(b67.val[0]));
    uint16x4x2_t odd_bot  = vtrn_u16(vreinterpret_u16_u8(b45.val[1]),
                                     vreinterpret_u16_u8(b67.val[1]));

    /* Step 3: 32-bit transpose joins the top and bottom halves; each
     * resulting vector is one complete source column (rows 0..7).
     * cXY.val[0] = column X, cXY.val[1] = column Y. */
    uint32x2x2_t c04 = vtrn_u32(vreinterpret_u32_u16(even_top.val[0]),
                                vreinterpret_u32_u16(even_bot.val[0]));
    uint32x2x2_t c26 = vtrn_u32(vreinterpret_u32_u16(even_top.val[1]),
                                vreinterpret_u32_u16(even_bot.val[1]));
    uint32x2x2_t c15 = vtrn_u32(vreinterpret_u32_u16(odd_top.val[0]),
                                vreinterpret_u32_u16(odd_bot.val[0]));
    uint32x2x2_t c37 = vtrn_u32(vreinterpret_u32_u16(odd_top.val[1]),
                                vreinterpret_u32_u16(odd_bot.val[1]));

    /* Step 4: source column j+7 becomes destination row width-j-8, column
     * j+6 the next row, ... column j the row width-j-1. */
    unsigned char *dst = dstImg + (width - j - 8) * height + i;
    vst1_u8(dst,              vreinterpret_u8_u32(c37.val[1])); /* column 7 */
    vst1_u8(dst + height,     vreinterpret_u8_u32(c26.val[1])); /* column 6 */
    vst1_u8(dst + 2 * height, vreinterpret_u8_u32(c15.val[1])); /* column 5 */
    vst1_u8(dst + 3 * height, vreinterpret_u8_u32(c04.val[1])); /* column 4 */
    vst1_u8(dst + 4 * height, vreinterpret_u8_u32(c37.val[0])); /* column 3 */
    vst1_u8(dst + 5 * height, vreinterpret_u8_u32(c26.val[0])); /* column 2 */
    vst1_u8(dst + 6 * height, vreinterpret_u8_u32(c15.val[0])); /* column 1 */
    vst1_u8(dst + 7 * height, vreinterpret_u8_u32(c04.val[0])); /* column 0 */
}
#endif

/*
 * Rotate a width x height 8-bit matrix by 270 degrees clockwise
 * (equivalently 90 degrees counter-clockwise).
 *
 * dstImg : output buffer of width*height bytes; the result has `width` rows
 *          of `height` bytes each.
 * srcImg : input buffer of width*height bytes, row-major, row stride == width.
 * width  : number of source columns.
 * height : number of source rows.
 *
 * Element (r, c) of the source ends up at destination row width-1-c,
 * column r. Complete 8x8 tiles are processed with NEON when it is available;
 * the right/bottom remainders (and everything on non-NEON targets) use a
 * scalar loop, so width and height no longer have to be multiples of 8.
 * The two buffers are expected not to overlap. Non-positive dimensions are
 * a no-op.
 */
void rotate270(unsigned char* dstImg,unsigned char* srcImg,int width,int height)
{
    if (width <= 0 || height <= 0) {
        return;
    }

    /* Extent of the region covered by complete 8x8 tiles (0 without NEON). */
    int tiled_w = 0;
    int tiled_h = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    tiled_w = width - width % 8;
    tiled_h = height - height % 8;
    for (int i = 0; i < tiled_h; i += 8) {
        for (int j = 0; j < tiled_w; j += 8) {
            rotate270_tile8(dstImg, srcImg, width, height, i, j);
        }
    }
#endif

    /* Columns to the right of the tiled region, for the tiled rows. */
    rotate270_scalar_rect(dstImg, srcImg, width, height, 0, tiled_h, tiled_w, width);
    /* Rows below the tiled region, full width. */
    rotate270_scalar_rect(dstImg, srcImg, width, height, tiled_h, height, 0, width);
}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值