math-neon 是一个开源库,地址为 https://code.google.com/p/math-neon/ 。根据项目介绍,它是利用 NEON 指令实现的数学库,包括三角、对数、指数等浮点运算以及矩阵运算。由于基于 NEON 指令,它必须在支持 NEON 指令的 ARM Cortex-A 架构上才能运行。项目介绍中提到,因为 GCC 对 NEON 的支持不是很好(估计是指 NEON 内在函数的效率不如汇编),所以核心运算代码都是用内联汇编写成的。如果想编译并测试,可以下载作者写的 Makefile(地址为 http://gitorious.org/vjaquez-misc/math-neon/commit/14ba470caad37c33cf7245be69efc9a1366d8f99?format=patch )。
本人想在 WinCE 下使用该库(平台为 Cortex-A8 架构)。由于代码使用了大量内联汇编,要移植到 WinCE 平台,需要重写为汇编文件,或者利用 WEC7 编译器的内在函数功能(参见 http://blog.csdn.net/alien75/article/details/8740641 ),两者的工作量都很大。这时想到了久未使用的 mingw32ce 工具链(参见 http://blog.csdn.net/alien75/article/details/6998223 ):因为只需要编译出 PE 格式的静态库,这个工具完全能满足需要,只是要修改一下 Makefile 才能正常编译。
原 Makefile 内容如下:
# Upstream Makefile for math-neon: builds the static library libmathneon.a
# and the math_debug test program that links against it.

# Optimised build with gdb debug info, targeting a Cortex-A9 with NEON and
# the softfp float ABI.
# NOTE(review): -ansi and -std=gnu99 both select a C dialect; presumably the
# later -std=gnu99 is the one that takes effect -- confirm with the GCC manual.
CFLAGS := -O2 -ggdb -mcpu=cortex-a9 -mfloat-abi=softfp -mfpu=neon -ansi -std=gnu99 -pedantic
WARNINGS := -Wall -Wextra -Wno-unused-parameter -Wmissing-prototypes
# -Wa, forwards -mimplicit-it=thumb to the assembler.
ASSEMBLER := -Wa,-mimplicit-it=thumb
# "override" appends these flags even when CFLAGS is set on the command line.
override CFLAGS += $(WARNINGS) $(ASSEMBLER)
LIBS := -lm

all: math_debug

# Object files that make up the archive. This rule has no recipe of its own;
# the archive is created by the %.a pattern rule below.
libmathneon.a: math_acosf.o math_ldexpf.o math_powf.o math_sqrtfv.o \
	math_asinf.o math_expf.o math_log10f.o math_runfast.o math_tanf.o \
	math_atan2f.o math_fabsf.o math_logf.o math_sincosf.o math_tanhf.o \
	math_atanf.o math_floorf.o math_mat2.o math_sinf.o math_vec2.o \
	math_ceilf.o math_fmodf.o math_mat3.o math_sinfv.o math_vec3.o \
	math_cosf.o math_frexpf.o math_mat4.o math_sinhf.o math_vec4.o \
	math_coshf.o math_invsqrtf.o math_modf.o math_sqrtf.o

# Link the test program from its object file and the static library.
math_debug: math_debug.o libmathneon.a
	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)

# Compile one C source file into one object file.
%.o:: %.c
	$(CC) $(CFLAGS) -o $@ -c $<

# Put every prerequisite of the target into a static archive.
%.a::
	$(AR) rcs $@ $^

# Remove the test program, all object files and all archives.
clean:
	$(RM) -v math_debug *.o *.a
修改后的 Makefile 内容如下:
# Makefile for cross-compiling math-neon with the arm-mingw32ce toolchain:
# builds the static library libmathneon.a and the math_debug test program.

# Remove a half-written target when its recipe fails.
.DELETE_ON_ERROR:

# Cross toolchain. AR names the program only; the archive modifiers are
# given in the %.a recipe.
CC := arm-mingw32ce-gcc
AR := arm-mingw32ce-ar

# Optimised build with gdb debug info, targeting a Cortex-A8 with NEON and
# the softfp float ABI. NO_ERRNO_H and _WIN32_WCE are defined for every
# compilation unit.
# NOTE(review): -ansi and -std=gnu99 both select a C dialect; presumably the
# later -std=gnu99 is the one that takes effect -- confirm with the GCC manual.
CFLAGS := -O2 -ggdb -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon -ansi -std=gnu99 -pedantic -DNO_ERRNO_H -D_WIN32_WCE
# Search the current directory for libraries when linking.
LDFLAGS := -L.
WARNINGS := -Wall -Wextra -Wno-unused-parameter -Wmissing-prototypes
# -Wa, forwards -mimplicit-it=thumb to the assembler.
ASSEMBLER := -Wa,-mimplicit-it=thumb
# "override" appends these flags even when CFLAGS is set on the command line.
override CFLAGS += $(WARNINGS) $(ASSEMBLER)
# No extra libraries are linked in this build (-lm is intentionally not used).
# NOTE(review): presumably the CE toolchain needs no separate libm -- confirm.
LIBS :=

# all and clean are commands, not files.
.PHONY: all clean

all: math_debug

# Object files that make up the archive. This rule has no recipe of its own;
# the archive is created by the %.a pattern rule below.
libmathneon.a: math_acosf.o math_ldexpf.o math_powf.o math_sqrtfv.o \
	math_asinf.o math_expf.o math_log10f.o math_runfast.o math_tanf.o \
	math_atan2f.o math_fabsf.o math_logf.o math_sincosf.o math_tanhf.o \
	math_atanf.o math_floorf.o math_mat2.o math_sinf.o math_vec2.o \
	math_ceilf.o math_fmodf.o math_mat3.o math_sinfv.o math_vec3.o \
	math_cosf.o math_frexpf.o math_mat4.o math_sinhf.o math_vec4.o \
	math_coshf.o math_invsqrtf.o math_modf.o math_sqrtf.o

# Link the test program from its object file and the static library.
math_debug: math_debug.o libmathneon.a
	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)

# Compile one C source file into one object file.
%.o:: %.c
	$(CC) $(CFLAGS) -o $@ -c $<

# Put every prerequisite of the target into a static archive
# (r: insert/replace members, c: create silently, s: write the symbol index).
%.a::
	$(AR) rcs $@ $^

# Remove the test program, all object files and all archives.
clean:
	$(RM) -v math_debug *.o *.a
测试结果如下(系统函数、C 语言优化函数和 NEON 汇编函数的速度对比,见各行 rate/Rate 后的倍率数字):
RUNFAST: Enabled
------------------------------------------------------------------------------------------------------
MATRIX FUNCTION TESTS
------------------------------------------------------------------------------------------------------
matmul2_c =
|2.66, -2.73|
|-5.74, -15.83|
matmul2_neon =
|2.66, -2.73|
|-5.74, -15.83|
matmul2: c=112000 neon=65000 rate=1.72
matvec2_c = |2.66, -5.74|
matvec2_neon = |2.66, -5.74|
matvec2: c=66000 neon=53000 rate=1.25
matmul3_c =
|-17.73, -8.39, -1.10|
|8.30, -5.32, 23.03|
|-5.67, -7.81, 9.07|
matmul3_neon =
|-17.73, -8.39, -1.10|
|8.30, -5.32, 23.03|
|-5.67, -7.81, 9.07|
matmul3: c=394000 neon=120000 rate=3.28
matvec3_c = |-17.73, 8.30, -5.67|
matvec3_neon = |-17.73, 8.30, -5.67|
matvec3: c=66000 neon=53000 rate=1.25
matmul4_c =
|-8.86, 8.70, -17.78, -7.64|
|-13.15, 20.92, -10.97, -14.02|
|17.37, -14.46, -13.16, 33.82|
|15.42, -27.32, -5.66, -6.37|
matmul4_neon =
|-8.86, 8.70, -17.78, -7.64|
|-13.15, 20.92, -10.97, -14.02|
|17.37, -14.46, -13.16, 33.82|
|15.42, -27.32, -5.66, -6.37|
matmul4: c=991000 neon=141000 rate=7.03
matvec4_c = |-8.86, -13.15, 17.37, 15.418112|
matvec4_neon = |-8.86, -13.15, 17.37, 15.418112|
matvec4: c=66000 neon=53000 rate=1.25
dot2_c = 3.756326
dot2_neon = 3.756326
dot2: c=532000 neon=497000 rate=1.07
normalize2_c = [-0.74, -0.68]
normalize2_neon = [-0.74, -0.68]
normalize2: c=691000 neon=313000 rate=2.21
dot3_c = 3.698457
dot3_neon = 3.698457
dot3: c=572000 neon=514000 rate=1.11
normalize3_c = [-0.74, -0.68, -0.01]
normalize3_neon = [-0.74, -0.68, -0.01]
normalize3: c=806000 neon=353000 rate=2.28
cross3_c = [-4.69, 5.12, -1.46]
cross3_neon = [-4.69, 5.12, -1.46]
cross3: c=586000 neon=373000 rate=1.57
dot4_c = -4.564567
dot4_neon = -4.564566
dot4: c=625000 neon=487000 rate=1.28
normalize4_c = [-0.24, -0.22, -0.00, 0.95]
normalize4_neon = [-0.24, -0.22, -0.00, 0.95]
normalize4: c=924000 neon=343000 rate=2.69
------------------------------------------------------------------------------------------------------
CMATH FUNCTION TESTS
------------------------------------------------------------------------------------------------------
Function Range Number ABS Max Error REL Max Error RMS Error Time Rate
------------------------------------------------------------------------------------------------------
sinf [-3.14, 3.14] 500000 0.00e+000 0.00e+000% 0.00e+000 880000 x1.00
sinf_c [-3.14, 3.14] 500000 8.34e-007 1.00e+002% 4.09e-007 162000 x5.43
sinf_neon [-3.14, 3.14] 500000 8.34e-007 1.00e+002% 4.09e-007 96000 x9.17
cosf [-3.14, 3.14] 500000 0.00e+000 0.00e+000% 0.00e+000 906000 x1.00
cosf_c [-3.14, 3.14] 500000 8.34e-007 6.74e-001% 4.16e-007 192000 x4.72
cosf_neon [-3.14, 3.14] 500000 1.41e+000 6.64e+007% 1.00e+000 142000 x6.38
tanf [-0.79, 0.79] 500000 0.00e+000 0.00e+000% 0.00e+000 1140000 x1.00
tanf_c [-0.79, 0.79] 500000 2.98e-006 7.97e-004% 1.31e-006 200000 x5.70
tanf_neon [-0.79, 0.79] 500000 1.91e-006 3.62e-004% 6.66e-007 126000 x9.05
asinf [-1.00, 1.00] 500000 0.00e+000 0.00e+000% 0.00e+000 2732000 x1.00
asinf_c [-1.00, 1.00] 500000 5.53e-005 1.06e-002% 1.69e-005 277000 x9.86
asinf_neon [-1.00, 1.00] 500000 4.65e-005 8.87e-003% 1.#Re+000 151000 x18.09
acosf [-1.00, 1.00] 500000 0.00e+000 0.00e+000% 0.00e+000 2670000 x1.00
acosf_c [-1.00, 1.00] 500000 5.56e-005 6.46e-003% 1.69e-005 312000 x8.56
acosf_neon [-1.00, 1.00] 500000 4.67e-005 6.35e-003% 1.#Re+000 171000 x15.61
atanf [-1.00, 1.00] 500000 0.00e+000 0.00e+000% 0.00e+000 1021000 x1.00
atanf_c [-1.00, 1.00] 500000 1.67e-004 2.12e-002% 7.40e-005 198000 x5.16
atanf_neon [-1.00, 1.00] 500000 1.67e-004 2.12e-002% 7.40e-005 121000 x8.44
sinhf [-3.14, 3.14] 500000 0.00e+000 0.00e+000% 0.00e+000 1509000 x1.00
sinhf_c [-3.14, 3.14] 500000 1.91e-006 1.52e-001% 2.37e-007 280000 x5.39
sinhf_neon [-3.14, 3.14] 500000 1.91e-006 1.52e-001% 1.90e-007 108000 x13.97
coshf [-3.14, 3.14] 500000 0.00e+000 0.00e+000% 0.00e+000 1163000 x1.00
coshf_c [-3.14, 3.14] 500000 1.91e-006 2.37e-005% 2.28e-007 283000 x4.11
coshf_neon [-3.14, 3.14] 500000 1.91e-006 2.22e-005% 1.68e-007 108000 x10.77
tanhf [-3.14, 3.14] 500000 0.00e+000 0.00e+000% 0.00e+000 1555000 x1.00
tanhf_c [-3.14, 3.14] 500000 1.21e-005 2.48e-001% 5.48e-006 235000 x6.62
tanhf_neon [-3.14, 3.14] 500000 2.38e-007 2.47e-001% 5.40e-008 119000 x13.07
expf [0.00, 10.00] 500000 0.00e+000 0.00e+000% 0.00e+000 960000 x1.00
expf_c [0.00, 10.00] 500000 9.77e-003 6.58e-005% 1.64e-003 132000 x7.27
expf_neon [0.00, 10.00] 500000 9.77e-003 6.58e-005% 1.64e-003 88000 x10.91
logf [1.00, 1000.00] 500000 0.00e+000 0.00e+000% 0.00e+000 1027000 x1.00
logf_c [1.00, 1000.00] 500000 7.63e-006 1.03e-002% 1.07e-006 116000 x8.85
logf_neon [1.00, 1000.00] 500000 7.63e-006 1.03e-002% 1.07e-006 82000 x12.52
log10f [1.00, 1000.00] 500000 0.00e+000 0.00e+000% 0.00e+000 1202000 x1.00
log10f_c [1.00, 1000.00] 500000 3.34e-006 6.68e-003% 4.84e-007 116000 x10.36
log10f_neon [1.00, 1000.00] 500000 3.34e-006 6.68e-003% 4.84e-007 81000 x14.84
floorf [1.00, 1000.00] 5000000 0.00e+000 0.00e+000% 0.00e+000 4705000 x1.00
floorf_c [1.00, 1000.00] 5000000 0.00e+000 0.00e+000% 0.00e+000 819000 x5.74
floorf_neon [1.00, 1000.00] 5000000 0.00e+000 0.00e+000% 0.00e+000 671000 x7.01
ceilf [1.00, 1000.00] 5000000 0.00e+000 0.00e+000% 0.00e+000 5734000 x1.00
ceilf_c [1.00, 1000.00] 5000000 0.00e+000 0.00e+000% 0.00e+000 814000 x7.04
ceilf_neon [1.00, 1000.00] 5000000 0.00e+000 0.00e+000% 0.00e+000 696000 x8.24
fabsf [1.00, 1000.00] 5000000 0.00e+000 0.00e+000% 0.00e+000 2005000 x1.00
fabsf_c [1.00, 1000.00] 5000000 0.00e+000 0.00e+000% 0.00e+000 455000 x4.41
fabsf_neon [1.00, 1000.00] 5000000 0.00e+000 0.00e+000% 0.00e+000 446000 x4.50
sqrtf [1.00, 1000.00] 500000 0.00e+000 0.00e+000% 0.00e+000 3222000 x1.00
sqrtf_c [1.00, 1000.00] 500000 2.33e-004 1.06e-003% 8.69e-005 139000 x23.18
sqrtf_neon [1.00, 1000.00] 500000 7.63e-006 2.91e-005% 1.60e-006 85000 x37.91
invsqrtf [1.00, 1000.00] 500000 0.00e+000 0.00e+000% 0.00e+000 106000 x1.00
invsqrtf_c [1.00, 1000.00] 500000 4.35e-006 4.78e-004% 2.00e-007 94000 x1.13
invsqrtf_neon [1.00, 1000.00] 500000 1.19e-007 2.12e-005% 4.81e-009 70000 x1.51
atan2f [0.10, 10.00] 10000 0.00e+000 0.00e+000% 0.00e+000 2388000 x1.00
atan2f_c [0.10, 10.00] 10000 1.73e-004 2.23e-002% 0.00e+000 657000 x3.63
atan2f_neon [0.10, 10.00] 10000 1.67e-004 2.12e-002% 0.00e+000 278000 x8.59
powf [1.00, 10.00] 10000 0.00e+000 0.00e+000% 0.00e+000 8316000 x1.00
powf_c [1.00, 10.00] 10000 1.36e+005 5.88e-003% 0.00e+000 493000 x16.87
powf_neon [1.00, 10.00] 10000 1.36e+005 5.88e-003% 0.00e+000 292000 x28.48
fmodf [1.00, 10.00] 10000 0.00e+000 0.00e+000% 0.00e+000 1394000 x1.00
fmodf_c [1.00, 10.00] 10000 9.99e+000 8.06e-002% 0.00e+000 341000 x4.09
fmodf_neon [1.00, 10.00] 10000 9.97e+000 8.06e-002% 0.00e+000 238000 x5.86