效果不明显:如果查找表的长度只有 64 个 uint8_t 左右,用 NEON 查表应该可以提高查表速度;否则还是纯 C 实现更快。
#ifdef HAVE_NEON_AARCH64
void table_lookup_AArch64_neon(uint8_t* lookup_table, uint32_t length, uint8_t* input_ptr, uint8_t* output_ptr) { /* Load lookup table. */ uint8x16x4_t table0 = vld1q_u8_x4(lookup_table); uint8x16x4_t table1 = vld1q_u8_x4(lookup_table+64); uint8x16x4_t table2 = vld1q_u8_x4(lookup_table+128); uint8x16x4_t table3 =