ARMNEON优化

最新推荐文章于 2023-12-12 21:09:32 发布

sunjing_

最新推荐文章于 2023-12-12 21:09:32 发布

阅读量1k

点赞数

分类专栏： linux

原文链接：https://www.veryarm.com/95523.html

版权

linux 专栏收录该内容

268 篇文章 7 订阅

订阅专栏

https://www.veryarm.com/95523.html

确认处理器是否支持NEON cat /proc/cpuinfo | grep neon 看是否有如下内容 Features : swp half thumb fastmult vfp edsp neon vfpv3 tls vfpv4 idiva idivt 以Android为例, 并计算float数组的和。 标准 C 代码实现 01 // 标准 C 代码实现 02 static float calc_c(const float* data, int size) 03 { 04 float sum = 0.f; 05 06 for (int i = 0; i < size; ++i) { 07 sum += data[i]; 08 } 09 10 return sum; 11 } 运用 ARM NEON 优化 01 // 运用 ARM NEON 优化 02 #include <arm_neon.h> 03 04 static float calc_neon(const float* data, int size) 05 { 06 float sum = 0.f; 07 float32x4_t sum_vec = vdupq_n_f32(0); 08 09 for (int i = 0; i < size / 4; ++i) { 10 float32x4_t tmp_vec = vld1q_f32 (data + 4*i); 11 sum_vec = vaddq_f32(sum_vec, tmp_vec); 12 } 13 14 sum += vgetq_lane_f32(sum_vec, 0); 15 sum += vgetq_lane_f32(sum_vec, 1); 16 sum += vgetq_lane_f32(sum_vec, 2); 17 sum += vgetq_lane_f32(sum_vec, 3); 18 19 int odd = size & 3; 20 if(odd) { 21 for(int i = size - odd; i < size; ++i) { 22 sum += data[i]; 23 } 24 } 25 26 return sum; 27 } 运用 Ne10 优化 01 // 运用 Ne10 优化 02 #include <NE10.h> 03 #define ALIGH_UNIT 4 04 05 static float calc_ne10(const float* data, int size) 06 { 07 float sum = 0.f; 08 float sum_vec[ALIGH_UNIT] = {0}; 09 10 for (int i = 0; i < size / ALIGH_UNIT; ++i) { 11 ne10_add_float_neon (sum_vec, sum_vec, (float*)data+ALIGH_UNIT*i, ALIGH_UNIT); 12 } 13 14 for (int i = 0; i < ALIGH_UNIT; ++i) { 15 sum += sum_vec[i]; 16 } 17 18 int odd = size & (ALIGH_UNIT-1); 19 if(odd) { 20 for(int i = size - odd; i < size; ++i) { 21 sum += data[i]; 22 } 23 } 24 25 return sum; 26 } view source print ? 01 // 主程序 Main 02 #include <stdlib.h> 03 #include <time.h> 04 #include <android/log.h> 05 06 #define LOG_TAG "Neon/Pref" 07 #define LOGD(...) 
__android_log_print(ANDROID_LOG_DEBUG,LOG_TAG,__VA_ARGS__) 08 09 #define ARRAY_SIZE 5000 10 #define ELAPSE_BEGIN(a) struct timeval start##a = {0}; gettimeofday(&start##a, 0); 11 #define ELAPSE_END(a) struct timeval end##a = {0}; gettimeofday(&end##a, 0); 12 #define ELAPSE_COUNT(a) (1000000 * (end##a.tv_sec - start##a.tv_sec) + (end##a.tv_usec - start##a.tv_usec)) 13 14 #define DO_ELAPSE(fn,...) \ 15 { \ 16 ELAPSE_BEGIN(_##fn); \ 17 float sum = fn(__VA_ARGS__); \ 18 ELAPSE_END(_##fn); \ 19 LOGD( #fn " : %d, Result: %f", (int)ELAPSE_COUNT(_##fn), sum); \ 20 } 21 22 23 int main(int argc, char** argv) { 24 float data[ARRAY_SIZE] = {0}; 25 26 for (int i = 0; i < ARRAY_SIZE; ++i) { 27 data[i] = rand() % 5; 28 } 29 30 DO_ELAPSE(calc_c , data, ARRAY_SIZE); 31 DO_ELAPSE(calc_neon, data, ARRAY_SIZE); 32 DO_ELAPSE(calc_ne10, data, ARRAY_SIZE); 33 } [资料文档] ARM NEON: http://gcc.gnu.org/onlinedocs/gcc/ARM-NEON-Intrinsics.html NE10 Manual: http://blogs.arm.com/software-enablement/874-ne10-library-getting-started/ [参考资料] http://hilbert-space.de/?p=22 http://www.crickettechnology.com/blog/?p=691 有的时候其实网络上资料比较多,但是自己很难找到。譬如我一直想要做Android NDK的源代码优化,知道可以利用NEON,可以利用汇编进行。但是却找不到正确的门路。所以耗费了很多时间。在针对C代码的优化上,实在是收益甚微,对某个函数进行的代码优化,对整个系统来说,影响一般很小(一方面代码本来在优化上性能的提升倍数不多,另一方面单个函数在整个系统中占用的比重都很低),所以优化了几天也见不到明显的进展。找到一些相关的资料也花费了很多功夫, 首先找到了要在C源代码中使用NEON库需要的头文件 arm_neon.h [cpp] view plain copy #include <arm_neon.h> // 在代码中先添加了这行语句,然后执行ndk-build 却提示了错误 // 提示要增加什么标志,自己在 LOCAL_CXX_FLAGS 的后面添加了,但是仍然报错 // 后来搜索 NDK + NEON 终于找到了一点点苗头并开始发现。 // 遂总结如下内容 Android.mk 文件内容可以参考这个: http://download.csdn.net/download/carlonelong/4153631 我做了一点修改,改后的文件如下: LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) # 这里填写要编译的源文件路径,这里只列举了一部分 LOCAL_SRC_FILES := NcHevcDecoder.cpp JNI_OnLoad.cpp TAppDecTop.cpp # 默认包含的头文件路径 LOCAL_C_INCLUDES := $(LOCAL_PATH) $(LOCAL_PATH)/.. 
# -g 后面的一系列附加项目添加了才能使用 arm_neon.h 头文件 # -mfloat-abi=softfp -mfpu=neon 使用 arm_neon.h 必须 LOCAL_CFLAGS := -D__cpusplus -g -mfloat-abi=softfp -mfpu=neon -march=armv7-a -mtune=cortex-a8 LOCAL_LDLIBS := -lz -llog TARGET_ARCH_ABI := armeabi-v7a LOCAL_ARM_MODE := arm ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) # 采用NEON优化技术 LOCAL_ARM_NEON := true endif LOCAL_MODULE := NcHevcDecoder # 生成静态库 include $(BUILD_STATIC_LIBRARY) 同时需要修改一下Application.mk文件,其内容如下: 参考: http://blog.csdn.net/gg137608987/article/details/7565843 APP_PROJECT_PATH := $(call my-dir)/.. APP_PLATFORM := android-10 APP_STL := stlport_static APP_ABI := armeabi-v7a APP_CPPFLAGS += -fexceptions 其中APP_ABI这句指定了编译的目标平台类型,可以针对不同平台进行优化。当然这样指定了之后,就需要相应的设备支持NEON指令。我的一个NDK应用,在使用上述配置之后,即NEON优化等,程序的性能提升了近一倍。系统的处理延时由原来的 95ms左右降低到了 51ms。后续可以使用NEON库进一步优化 NDK 程序代码,实现更加优化的结果。 NEON优化的部分将在后面介绍,我会一边应用一边更新博客。网上有一个用NEON优化YUV转RGB的NEON优化例子,可以参见: http://hilbert-space.de/?p=22 这里摘录一下其优化过程: 1、原始代码 void reference_convert (uint8_t * __restrict dest, uint8_t * __restrict src, int n) { int i; for (i=0; i<n; i++) { int r = *src++; // load red int g = *src++; // load green int b = *src++; // load blue // build weighted average: int y = (r*77)+(g*151)+(b*28); // undo the scale by 256 and write to memory: *dest++ = (y>>8); } } 2、使用NEON库进行代码优化 Since NEON works in 64 or 128 bit registers it’s best to process eight pixels in parallel.

That way we can exploit the parallel nature of the SIMD-unit. Here is what I came up with: 因为NEON工作在64位或128位的寄存器上,因此最适合同时处理8个像素点的转换。这样就形成了下面这样的代码 void neon_convert (uint8_t * __restrict dest, uint8_t * __restrict src, int n) { int i; uint8x8_t rfac = vdup_n_u8 (77); // 转换权值 R uint8x8_t gfac = vdup_n_u8 (151); // 转换权值 G uint8x8_t bfac = vdup_n_u8 (28); // 转换权值 B n/=8; for (i=0; i<n; i++) { uint16x8_t temp; uint8x8x3_t rgb = vld3_u8 (src); uint8x8_t result; temp = vmull_u8 (rgb.val[0], rfac); // vmull_u8 每个字节(8bit)对应相乘,结果为每个单位2字节(16bit) temp = vmlal_u8 (temp,rgb.val[1], gfac); // 每个字节对应相乘并累加到 temp temp = vmlal_u8 (temp,rgb.val[2], bfac); result = vshrn_n_u16 (temp, 8); // 全部右移8位 vst1_u8 (dest, result); // 转存运算结果 src += 8*3; dest += 8; } } vmull.u8 multiplies each byte of the first argument with each corresponding byte of the second argument. Each result becomes a 16 bit unsigned integer, so no overflow can happen. The entire result is returned as a 128 bit NEON register pair. vmlal.u8 does the same thing as vmull.u8 but also adds the content of another register to the result. So we end up with just three instructions for weighted average of eight pixels. Nice. Now it’s time to undo the scaling of the weight factors. To do so I shift each 16 bit result to the right by 8 bits. This equals to a division by 256. ARM NEON has lots of instructions to do the shift, but also a “narrow” variant exists. This one does two things at once: It does the shift and afterwards converts the 16 bit integers back to 8 bit by removing all the high-bytes from the result. We get back from the 128 bit register pair to a single 64 bit register. 3、结果对比 (1)C语言NEON版本汇编 [cpp] view plain copy /* 未进行汇编优化的结果 C-version: 15.1 cycles per pixel. NEON-version: 9.9 cycles per pixel. 这里是说优化结果并不非常理想,所以查看了一下它的汇编文件 That’s only a speed-up of factor 1.5. I expected much more from the NEON implementation. It processes 8 pixels with just 6 instructions after all. What’s going on here? 
A look at the assembler output explained it all. Here is the inner-loop part of the convert function: */ 160: f46a040f vld3.8 {d16-d18}, [sl] 164: e1a0c005 mov ip, r5 168: ecc80b06 vstmia r8, {d16-d18} 16c: e1a04007 mov r4, r7 170: e2866001 add r6, r6, #1 ; 0x1 174: e28aa018 add sl, sl, #24 ; 0x18 178: e8bc000f ldm ip!, {r0, r1, r2, r3} 17c: e15b0006 cmp fp, r6 180: e1a08005 mov r8, r5 184: e8a4000f stmia r4!, {r0, r1, r2, r3} 188: eddd0b06 vldr d16, [sp, #24] 18c: e89c0003 ldm ip, {r0, r1} 190: eddd2b08 vldr d18, [sp, #32] 194: f3c00ca6 vmull.u8 q8, d16, d22 198: f3c208a5 vmlal.u8 q8, d18, d21 19c: e8840003 stm r4, {r0, r1} 1a0: eddd3b0a vldr d19, [sp, #40] 1a4: f3c308a4 vmlal.u8 q8, d19, d20 1a8: f2c80830 vshrn.i16 d16, q8, #8 1ac: f449070f vst1.8 {d16}, [r9] 1b0: e2899008 add r9, r9, #8 ; 0x8 1b4: caffffe9 bgt 160 (2)NEON汇编优化 Since the compiler can’t generate good code I wrote the same loop in assembler. In a nutshell I just took the intrinsic based loop and converted the instructions one by one. The loop-control is a bit different, but that’s all. [cpp] view plain copy // 这里针对生成的目标汇编代码进一步作了优化,优化的代码如下: convert_asm_neon: # r0: Ptr to destination data # r1: Ptr to source data # r2: Iteration count: push {r4-r5,lr} lsr r2, r2, #3 # build the three constants: mov r3, #77 mov r4, #151 mov r5, #28 vdup.8 d3, r3 vdup.8 d4, r4 vdup.8 d5, r5 .loop: # load 8 pixels: vld3.8 {d0-d2}, [r1]! # do the weight average: vmull.u8 q3, d0, d3 vmlal.u8 q3, d1, d4 vmlal.u8 q3, d2, d5 # shift and store: vshrn.u16 d6, q3, #8 vst1.8 {d6}, [r0]! subs r2, r2, #1 bne .loop pop { r4-r5, pc } 可以见到NEON优化在性能上提速了 7 倍多(同时处理8个像素) [cpp] view plain copy C-version: 15.1 cycles per pixel. NEON-version: 9.9 cycles per pixel. Assembler: 2.0 cycles per pixel. 
ARM cortex A8/9 - Android NDK - NEON介绍以及优化(资源的整理总结) (1)What is NDK: Android开发官网介绍: http://developer.android.com/sdk/ndk/overview.html The Android NDK is a toolset that lets you embed components that make use of native code in your Android applications. Android applications run in the Dalvik virtual machine. The NDK allows you to implement parts of your applications using native-code languages such as C and C++. This can provide benefits to certain classes of applications, in the form of reuse of existing code and in some cases increased speed. The NDK provides: · A set of tools and build files used to generate native code libraries from C and C++ sources · A way to embed the corresponding native libraries into an application package file (.apk) that can be deployed on Android devices · A set of native system headers and libraries that will be supported in all future versions of the Android platform, starting from Android 1.5. Applications that use native activities must be run on Android 2.3 or later. · Documentation, samples, and tutorials The latest release of the NDK supports these ARM instruction sets: · ARMv5TE (including Thumb-1 instructions) · ARMv7-A (including Thumb-2 and VFPv3-D16 instructions, with optional support for NEON/VFPv3-D32 instructions) · x86 instructions (see CPU-ARCH-ABIS.HTML for more information) ARMv5TE machine code will run on all ARM-based Android devices. ARMv7-A will run only on devices such as the Verizon Droid or Google Nexus One that have a compatible CPU. The main difference between the two instruction sets is that ARMv7-A supports hardware FPU, Thumb-2, and NEON instructions. You can target either or both of the instruction sets — ARMv5TE is the default, but switching to ARMv7-A is as easy as adding a single line to the application's Application.mk file, without needing to change anything else in the file. You can also build for both architectures at the same time and have everything stored in the final .apk. 
Complete information is provided in the CPU-ARCH-ABIS.HTML in the NDK package. The NDK provides stable headers for libc (the C library), libm (the Math library), OpenGL ES (3D graphics library), the JNI interface, and other libraries. (2) Sample applications The NDK includes sample applications that illustrate how to use native code in your Android applications: · hello-jni — a simple application that loads a string from a native method implemented in a shared library and then displays it in the application UI. · hello-neon — a simple application that shows how to use the cpufeatures library to check CPU capabilities at runtime, then use NEON intrinsics if supported by the CPU. Specifically, the application implements two versions of a tiny benchmark for a FIR filter loop, a C version and a NEON-optimized version for devices that support it. (3) Download the Android NDK http://developer.android.com/sdk/ndk/index.html (4)Android NDK安装设置 http://blog.csdn.net/zwcai/article/details/6211670 Android NDK 基本就是 Linux的开发,不过主要是生成.so形式供SDK调用。涉及的工具,就是Linux开发工具+SDK接口组件。 建立 NDK 编译环境 1. 下载android NDK r4 Windows 安装包,解压缩到你想放的位置,如d:/android 2. 安装较新版本的cygwin,安装中需要选择安装的Linux相关组件,主要是make、gcc、g++工具 3. 运行cygwin, 配置文件.bash_profile中添加环境变量 NDK=/cygdrive/d/android/android-ndk-r4; export NDK 。Windows的环境变量PATH 里面设置该路径 4.进入android-ndk-r4/sample中例子,如hello-jni,运行ndk-build,进行编译,结果是libxxx.so,成功说明编译环境搭建好了。 Eclipse NDK设置: 1. Eclipse中打开sample中的工程,具体操作是:新建android工程->从源代码建立。这时候可以编译Java工程 2. 设置 NDK编译选项,调用cygwin编译工具,完成后刷新相关文件(主要是libxxx.so),设置方法参考: http://www.360doc.com/content/11/0223/17/2734308_95473676.shtml (5)Java Native Interface http://en.wikipedia.org/wiki/Java_Native_Interface JNI enables one to write native methods to handle situations when an application cannot be written entirely in the Java programming language, e.g. when the standard Java class library does not support the platform-specific features or program library. 
It is also used to modify an existing application—written in another programming language—to be accessible to Java applications. Many of the standard library classes depend on JNI to provide functionality to the developer and the user, e.g. file I/O and sound capabilities. Including performance- and platform-sensitive API implementations in the standard library allows all Java applications to access this functionality in a safe and platform-independent manner. The JNI framework lets a native method use Java objects in the same way that Java code uses these objects. A native method can create Java objects and then inspect and use these objects to perform its tasks. A native method can also inspect and use objects created by Java application code. JNI is sometimes referred to as the "escape hatch" for Java developers because it enables them to add functionality to their Java application that the standard Java APIs cannot otherwise provide. It can be used to interface with code written in other languages, such as C and C++. It is also used for time-critical calculations or operations like solving complicated mathematical equations, because native code may be faster than JVM code. http://java.sun.com/docs/books/jni/ Oracle官网,针对JNI的详细介绍以及JNI说明文档: An entire chapter is devoted to avoiding common traps and pitfalls. The book uses numerous examples to illustrate programming techniques that have proven to be effective. View HTML Download HTML (zip, ~531K) Download PDF (~ 3608K) Download the example code in this book in ZIP or tar.gz formats. 
Order this book through DigitalGuru amazon.com (6)NDK Android* 应用移植方法 http://software.intel.com/zh-cn/articles/ndk-android-application-porting-methodologies/ 本指南用于帮助开发人员将现有的基于 ARM* 的 NDK 应用移植到 x86。如果您已经拥有一个正常运行的应用,需要知道如何能够快速让 x86 设备在 Android* market 中找到您的应用,本文将可以为您提供一些入门信息。同时本指南还提供了一些技巧和指南,以帮助您解决在移植过程中可能会遇到的编译器问题。 (7) Android NDK 之NEON优化针对ARM cortex A8/A9 http://blog.csdn.net/zwcai/article/details/6843531 近期正在往Android平台移植算法。确切地说,是针对ARM A8 A9 平台进行优化。发现不同芯片的浮点能力差别颇大。A9系列明显强于A8系列,大约有3倍多的提升,应该就是VFP管线化的优势。不过即使相同核心,不同厂家的芯片也会有不少差别。起初用本人手机,ATRIX,Tegra2处理器,A9双核。测算了一下,跑浮点算法速度是我台式机的三分之一。折算为相同频率的话,已经相差无几了。PC的算法直接编译就可以使用,速度直接达标,DSP时期的什么浮点转定点,直接就Pass掉,啥优化不用,真是惊叹。不过拿上其他A8板子,惊喜立马就飞走了,优化还是得做的,活省不了。主要可用的就是NEON了。优化NEON时,挑了几个典型函数,比如向量内积、比例求和、互相干系数,让人去尝试看看。一开始按照TI DSP的惯用招数,将运算用一系列NEON内联函数去整,发现速度仅提升了10%,搞不下去。我分析了汇编代码,发现编出来的有很多栈操作,比如关键的运算语句就一条,但前后 vstd 和 vldr 有十来条,不慢才怪。网上搜搜,也有类似情况,似乎编译器对NEON内联的优化较弱,没法把运算串起来。使出最后一招:嵌入式汇编手工优化,看了半天指令集,挑了最简单的比例求和函数,其实汇编的话,也就对应三条运算语句,就是算上加载和保存,也就十来句,比起编译器出来的几十条省了很多。运行一下,速度提升了5倍。这下有搞头了,让工程师把其他几个也整了,最高有20倍提升。就是编起来有点费劲,半天一个小函数,只能用于优化核心费时的部分。有两个比较不错的参考资料: http://hilbert-space.de/?p=22 RGB转灰度的优化实例,里面展现了函数对应的汇编指令,以及手工汇编的结果,主要是对加载、保存进行了优化。 http://blogs.arm.com/software-enablement/241-coding-for-neon-part-3-matrix-multiplication/ 官方实例,矩阵乘法的NEON移植分析与实现。另外 OpenMAX库也可以看看。 (7)Cortex™-A8 Technical Reference Manual http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0344k/index.html (8)Introducing NEON™ Development Article http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dht0002a/index.html (9)Coding for NEON: Coding for NEON - Part 1: Load and Stores - ARM Community http://blogs.arm.com/software-enablement/161-coding-for-neon-part-1-load-and-stores/ Coding for NEON - Part 2: Dealing With Leftovers - ARM ... http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/ Coding for NEON - Part 3: Matrix Multiplication - ARM ... 
http://blogs.arm.com/software-enablement/241-coding-for-neon-part-3-matrix-multiplication/ Coding for NEON - Part 4: Shifting Left and Right - ARM ... http://blogs.arm.com/software-enablement/277-coding-for-neon-part-4-shifting-left-and-right/ http://search.arm.com/search?q=Coding+for+NEON&site=Site-Search&btnG=Search&entqr=0&output=xml_no_dtd&sort=date%3AD%3AL%3Ad1&client=Search&ud=1&oe=UTF-8&ie=UTF-8&getfields=Description&proxystylesheet=Search (10)ARM NEON Optimization. An Example http://hilbert-space.de/?p=22 (11)What is the fastest way to copy memory on a Cortex-A8? Can AXI-based ARM cores generate bursts across 1KB boundaries? My Cortex-A8 DSM does not produce a tarmac log Performance Monitor Unit example code for ARM11 and Cortex-A/R What is the fastest way to copy memory on a Cortex-A8? http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka13544.html ARM Technical Support Knowledge Articles Applies to: Cortex-A8, RealView Development Suite (RVDS) Answer Many applications frequently copy substantial amounts of data from one area of memory to another, typically using the memcpy() C library function. As this can be quite time consuming, it may be worth spending some time optimizing the functions that do this. There is no single ‘best method’ for implementing a copy routine, as the performance will depend upon many factors (see below). This article looks at seven possible schemes, and compares their performance in a particular system. Target system and conditions The various schemes were tested using a Beagle Board from Texas Instruments. This board is based on an OMAP 3530 SoC, which is based on an ARM Cortex-A8 processor. The implementations have been written with the assumption that the source address, destination address and number of bytes to transfer are all multiples of the level 1 cache line size (64 bytes). 
However, the tests all copied 16 Mbytes of data, so the overhead of checking alignment and ensuring the main loop could assume 64 byte alignment would be insignificant by comparison (this may not be the case for smaller memory copies). The execution time was measured using the performance counters integrated into the Cortex-A8 processor, which provide a highly accurate measure. For all tests, the L1NEON bit was set, meaning that loads using the NEON instruction set can cause an L1 data cache linefill. Both the level 1 and level 2 caches are enabled, with both the code and data memory regions being used marked as cacheable. The MMU and branch prediction are also enabled. Some of the routines make use of the preload instruction (PLD). This instruction causes the level 2 cache to start loading the data sometime before the processor executes the code to access this data. This effectively starts the memory request early, so may mean the processor does not have to wait as long for it to be available. Routines: 1. Word by Word memory copy This is a very simple loop which reads one word from the source, writes it to the destination and decrements a counter. The performance of this function is taken as the reference for the other tests. WordCopy LDR r3, [r1],#4 STR r3, [r0],#4 SUBS r2, r2, #4 BGE WordCopy 2. Load-Multiple memory copy The previous example is modified to use LDM and STM instructions, transferring 8 words per iteration. Due to the extra registers used, these must be stored to the stack and later restored. LDMCopy PUSH {r4-r10} LDMloop LDMIA r1!, {r3-r10} STMIA r0!, {r3-r10} SUBS r2, r2,#32 BGE LDMloop

sunjing_

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
ARMNEON优化

https://www.veryarm.com/95523.html确认处理器是否支持NEON cat/proc/cpuinfo|grepneon 看是否有如下内容 Features:swphalfthumbfastmultvfpedspneonvfpv3tlsvfpv4idivaidivt 以Android为例, 并计算float数组的和标准 C 代...
复制链接

扫一扫

专栏目录