(NEON - intrinsics) - Swapping color channels

 在neon_programmers_guide中有个Swapping color channels示例,展示了neon技术的加速效果。

这里我编程实现了下,观察下加速效果,在两个平台上做对比试验,测试素材是1920*1080 rgb图像。

| 平台 | arm(de-interleave) | neon(de-interleave) | arm(interleave) | neon(interleave) |
| --- | --- | --- | --- | --- |
| nvidia tk1 | 38(ms) | 14(ms) | 46(ms) | 5(ms) |
| raspberry pi | 156(ms) | 20(ms) | 159(ms) | 24(ms) |

 

可以看到在nvidia tk1上 de-interleave的速度提升大概是2.7倍,interleave的提升要更多,大概是9.2倍。

可以看到在raspberry上 de-interleave的速度提升大概是7.8倍,interleave的提升大概是6.6倍(159ms / 24ms)。

这里需要说明的是raspberry是armv8架构,nvidia tk1是armv7架构的cortex-a15。

这里我也没指定特殊的编译选项,根据自己实验,指定相应arm架构的编译选项反而会引起效果更差,看来还需要进一步research。

 

下面贴一下makefile 和 源代码

# Builds de_inter.out from de_inter.cpp with the arm-linux-gnueabihf cross g++.
# -mfpu=neon and -mfloat-abi=hard are the only code-generation flags passed;
# no -O optimisation level and no -march/-mcpu option is given, so the timings
# reported by the program are for whatever the compiler's defaults are.
de_inter:
    arm-linux-gnueabihf-g++ de_inter.cpp -o de_inter.out -mfpu=neon -mfloat-abi=hard

 

 

  1 #include <stdlib.h>
  2 #include <memory.h>
  3 #include <fstream>
  4 #include <sys/time.h>
  5 
  6 #include "/usr/lib/gcc-cross/arm-linux-gnueabihf/5/include/arm_neon.h"
  7 
  8 using namespace std;
  9 
 10 int main(int argc, char **argv)
 11 {
 12     int width = 1920;
 13     int height = 1080;
 14 
 15     int pixel_number = width * height;
 16     int image_size = width * height * 3;
 17 
 18     struct timeval tstart, tend;
 19 
 20     unsigned char *rgb_buffer = (unsigned char *)malloc(image_size);
 21     unsigned char *rgb_buffer_result = (unsigned char *)malloc(image_size);
 22 
 23     for (int i = 0; i < pixel_number; i++)
 24     {
 25         rgb_buffer[3 * i + 0] = 1;
 26         rgb_buffer[3 * i + 1] = 2;
 27         rgb_buffer[3 * i + 2] = 3;
 28     }
 29 
 30     fstream wfile("./origin.dat", ios::binary | ios::out);
 31     wfile.write((char *)rgb_buffer, image_size);
 32     wfile.close();
 33 
 34     for (int j = 0; j < 5; j++)
 35     {
 36         gettimeofday(&tstart, NULL);
 37 
 38         for (int i = 0; i < pixel_number; i++)
 39         {
 40             rgb_buffer_result[i] = rgb_buffer[i * 3];
 41             rgb_buffer_result[i + pixel_number] = rgb_buffer[i * 3 + 1];
 42             rgb_buffer_result[i + 2 * pixel_number] = rgb_buffer[i * 3 + 2];
 43         }
 44 
 45         gettimeofday(&tend, NULL);
 46         int timeuse = (1000000 * (tend.tv_sec - tstart.tv_sec) + (tend.tv_usec - tstart.tv_usec)) / 1000;
 47         printf("tk1 de-interleave with arm (1920*1080) cost time(ms) = %d\n", timeuse);
 48     }
 49     
 50 
 51     fstream wfile1("./arm_de-interleave.dat", ios::binary | ios::out);
 52     wfile1.write((char *)rgb_buffer_result, image_size);
 53     wfile1.close();
 54 
 55     memset(rgb_buffer_result, 0, image_size);
 56 
 57     uint8x16x3_t neon_1;
 58 
 59     for (int j = 0; j < 5; j++)
 60     {
 61         gettimeofday(&tstart, NULL);
 62 
 63         for (int i = 0; i < image_size / 48; i++)
 64         {
 65             neon_1 = vld3q_u8((uint8_t *)rgb_buffer + i * 48);
 66             vst1q_u8(rgb_buffer_result + i * 16, neon_1.val[0]);
 67             vst1q_u8(rgb_buffer_result + pixel_number + i * 16, neon_1.val[1]);
 68             vst1q_u8(rgb_buffer_result + 2 * pixel_number + i * 16, neon_1.val[2]);
 69         }
 70 
 71         gettimeofday(&tend, NULL);
 72         int timeuse = (1000000 * (tend.tv_sec - tstart.tv_sec) + (tend.tv_usec - tstart.tv_usec)) / 1000;
 73         printf("tk1 de-interleave with neon (1920*1080) cost time(ms) = %d\n", timeuse);
 74     }
 75 
 76     fstream wfile2("./neon_de-interleave.dat", ios::binary | ios::out);
 77     wfile2.write((char *)rgb_buffer_result, image_size);
 78     wfile2.close();
 79 
 80     memset(rgb_buffer, 0, image_size);
 81 
 82     for (int j = 0; j < 5; j++)
 83     {
 84         gettimeofday(&tstart, NULL);
 85 
 86         for (int i = 0; i < pixel_number; i++)
 87         {
 88             rgb_buffer[3 * i + 0] = rgb_buffer_result[i];
 89             rgb_buffer[3 * i + 1] = rgb_buffer_result[pixel_number + i];
 90             rgb_buffer[3 * i + 2] = rgb_buffer_result[2 * pixel_number + i];
 91         }
 92 
 93         gettimeofday(&tend, NULL);
 94 
 95         int timeuse = (1000000 * (tend.tv_sec - tstart.tv_sec) + (tend.tv_usec - tstart.tv_usec)) / 1000;
 96         printf("tk1 interleave with arm (1920*1080) cost time(ms) = %d\n", timeuse);
 97     }
 98 
 99     fstream wfile3("./arm_interleave.dat", ios::binary | ios::out);
100     wfile3.write((char *)rgb_buffer, image_size);
101     wfile3.close();
102 
103     uint8x16x3_t neon_2;
104 
105     memset(rgb_buffer, 0, image_size);
106 
107     for (int j = 0; j < 5; j++)
108     {
109         gettimeofday(&tstart, NULL);
110 
111         for (int i = 0; i < image_size / 48; i++)
112         {
113             neon_2.val[0] = vld1q_u8((uint8_t *)rgb_buffer_result + i * 16);
114             neon_2.val[1] = vld1q_u8((uint8_t *)rgb_buffer_result + pixel_number + i * 16);
115             neon_2.val[2] = vld1q_u8((uint8_t *)rgb_buffer_result + 2 * pixel_number + i * 16);
116             vst3q_u8(rgb_buffer + i * 48, neon_2);
117         }
118 
119         gettimeofday(&tend, NULL);
120 
121         int timeuse = (1000000 * (tend.tv_sec - tstart.tv_sec) + (tend.tv_usec - tstart.tv_usec)) / 1000;
122         printf("tk1 interleave with neon (1920*1080) cost time(ms) = %d\n", timeuse);
123     }
124 
125     fstream wfile4("./neon_interleave.dat", ios::binary | ios::out);
126     wfile4.write((char *)rgb_buffer, image_size);
127     wfile4.close();
128 
129     return 0;
130 }

 

转载于:https://www.cnblogs.com/aperolchen/p/9991253.html

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值