在neon_programmers_guide中有个Swapping color channels示例,展示了neon技术的加速效果。
这里我编程实现了下,观察下加速效果,在两个平台上做对比试验,测试素材是1920*1080 rgb图像。
| | arm(de-interleave) | neon(de-interleave) | arm(interleave) | neon(interleave) |
|---|---|---|---|---|
| nvidia tk1 | 38(ms) | 14(ms) | 46(ms) | 5(ms) |
| raspberry pi | 156(ms) | 20(ms) | 159(ms) | 24(ms) |
可以看到在nvidia tk1上 de-interleave的速度提升大概是2.7倍,interleave的提升要更多,大概是9.2倍。
可以看到在raspberry pi上 de-interleave的速度提升大概是7.8倍,interleave的提升大概是6.6倍。
这里需要说明的是raspberry是armv8架构,nvidia tk1是armv7架构的cortex-a15。
这里我也没指定特殊的编译选项,根据自己实验,指定相应arm架构的编译选项反而会引起效果更差,看来还需要进一步research。
下面贴一下makefile 和 源代码
de_inter:
	arm-linux-gnueabihf-g++ de_inter.cpp -o de_inter.out -mfpu=neon -mfloat-abi=hard
1 #include <stdlib.h> 2 #include <memory.h> 3 #include <fstream> 4 #include <sys/time.h> 5 6 #include "/usr/lib/gcc-cross/arm-linux-gnueabihf/5/include/arm_neon.h" 7 8 using namespace std; 9 10 int main(int argc, char **argv) 11 { 12 int width = 1920; 13 int height = 1080; 14 15 int pixel_number = width * height; 16 int image_size = width * height * 3; 17 18 struct timeval tstart, tend; 19 20 unsigned char *rgb_buffer = (unsigned char *)malloc(image_size); 21 unsigned char *rgb_buffer_result = (unsigned char *)malloc(image_size); 22 23 for (int i = 0; i < pixel_number; i++) 24 { 25 rgb_buffer[3 * i + 0] = 1; 26 rgb_buffer[3 * i + 1] = 2; 27 rgb_buffer[3 * i + 2] = 3; 28 } 29 30 fstream wfile("./origin.dat", ios::binary | ios::out); 31 wfile.write((char *)rgb_buffer, image_size); 32 wfile.close(); 33 34 for (int j = 0; j < 5; j++) 35 { 36 gettimeofday(&tstart, NULL); 37 38 for (int i = 0; i < pixel_number; i++) 39 { 40 rgb_buffer_result[i] = rgb_buffer[i * 3]; 41 rgb_buffer_result[i + pixel_number] = rgb_buffer[i * 3 + 1]; 42 rgb_buffer_result[i + 2 * pixel_number] = rgb_buffer[i * 3 + 2]; 43 } 44 45 gettimeofday(&tend, NULL); 46 int timeuse = (1000000 * (tend.tv_sec - tstart.tv_sec) + (tend.tv_usec - tstart.tv_usec)) / 1000; 47 printf("tk1 de-interleave with arm (1920*1080) cost time(ms) = %d\n", timeuse); 48 } 49 50 51 fstream wfile1("./arm_de-interleave.dat", ios::binary | ios::out); 52 wfile1.write((char *)rgb_buffer_result, image_size); 53 wfile1.close(); 54 55 memset(rgb_buffer_result, 0, image_size); 56 57 uint8x16x3_t neon_1; 58 59 for (int j = 0; j < 5; j++) 60 { 61 gettimeofday(&tstart, NULL); 62 63 for (int i = 0; i < image_size / 48; i++) 64 { 65 neon_1 = vld3q_u8((uint8_t *)rgb_buffer + i * 48); 66 vst1q_u8(rgb_buffer_result + i * 16, neon_1.val[0]); 67 vst1q_u8(rgb_buffer_result + pixel_number + i * 16, neon_1.val[1]); 68 vst1q_u8(rgb_buffer_result + 2 * pixel_number + i * 16, neon_1.val[2]); 69 } 70 71 gettimeofday(&tend, NULL); 72 int timeuse = 
(1000000 * (tend.tv_sec - tstart.tv_sec) + (tend.tv_usec - tstart.tv_usec)) / 1000; 73 printf("tk1 de-interleave with neon (1920*1080) cost time(ms) = %d\n", timeuse); 74 } 75 76 fstream wfile2("./neon_de-interleave.dat", ios::binary | ios::out); 77 wfile2.write((char *)rgb_buffer_result, image_size); 78 wfile2.close(); 79 80 memset(rgb_buffer, 0, image_size); 81 82 for (int j = 0; j < 5; j++) 83 { 84 gettimeofday(&tstart, NULL); 85 86 for (int i = 0; i < pixel_number; i++) 87 { 88 rgb_buffer[3 * i + 0] = rgb_buffer_result[i]; 89 rgb_buffer[3 * i + 1] = rgb_buffer_result[pixel_number + i]; 90 rgb_buffer[3 * i + 2] = rgb_buffer_result[2 * pixel_number + i]; 91 } 92 93 gettimeofday(&tend, NULL); 94 95 int timeuse = (1000000 * (tend.tv_sec - tstart.tv_sec) + (tend.tv_usec - tstart.tv_usec)) / 1000; 96 printf("tk1 interleave with arm (1920*1080) cost time(ms) = %d\n", timeuse); 97 } 98 99 fstream wfile3("./arm_interleave.dat", ios::binary | ios::out); 100 wfile3.write((char *)rgb_buffer, image_size); 101 wfile3.close(); 102 103 uint8x16x3_t neon_2; 104 105 memset(rgb_buffer, 0, image_size); 106 107 for (int j = 0; j < 5; j++) 108 { 109 gettimeofday(&tstart, NULL); 110 111 for (int i = 0; i < image_size / 48; i++) 112 { 113 neon_2.val[0] = vld1q_u8((uint8_t *)rgb_buffer_result + i * 16); 114 neon_2.val[1] = vld1q_u8((uint8_t *)rgb_buffer_result + pixel_number + i * 16); 115 neon_2.val[2] = vld1q_u8((uint8_t *)rgb_buffer_result + 2 * pixel_number + i * 16); 116 vst3q_u8(rgb_buffer + i * 48, neon_2); 117 } 118 119 gettimeofday(&tend, NULL); 120 121 int timeuse = (1000000 * (tend.tv_sec - tstart.tv_sec) + (tend.tv_usec - tstart.tv_usec)) / 1000; 122 printf("tk1 interleave with neon (1920*1080) cost time(ms) = %d\n", timeuse); 123 } 124 125 fstream wfile4("./neon_interleave.dat", ios::binary | ios::out); 126 wfile4.write((char *)rgb_buffer, image_size); 127 wfile4.close(); 128 129 return 0; 130 }