一个基于NEON指令的数学库

最新推荐文章于 2024-08-10 08:41:10 发布

alien75

最新推荐文章于 2024-08-10 08:41:10 发布

阅读量9.8k

点赞数 2

分类专栏： wince

本文链接：https://blog.csdn.net/alien75/article/details/9128453

版权

wince 专栏收录该内容

101 篇文章 0 订阅

订阅专栏

这是一个开源库，地址为 https://code.google.com/p/math-neon/ 。根据项目介绍，它是利用NEON指令实现的数学库，包括三角、对数、指数等浮点运算以及矩阵运算。因为基于NEON指令，它必须在支持NEON的ARM Cortex-A架构上才能运行。项目介绍中提到，由于gcc对NEON的支持不是很好(估计是指NEON内在函数(intrinsics)的效率不如手写汇编)，核心的运算代码都是用内联汇编写成的。如果想编译并测试，可以下载作者写的Makefile(地址为 http://gitorious.org/vjaquez-misc/math-neon/commit/14ba470caad37c33cf7245be69efc9a1366d8f99?format=patch )。

本人想在WINCE下使用这个库(平台为Cortex-A8架构)。由于代码使用了大量的内联汇编，移植到WINCE平台需要重写汇编文件，或者利用WEC7编译器的内在函数功能(参见 http://blog.csdn.net/alien75/article/details/8740641 )，两者的工作量都很大。于是想到了久未使用的mingw32ce工具链(参见 http://blog.csdn.net/alien75/article/details/6998223 )：因为只需要编译出PE格式的静态库，这个工具完全能满足需要，只是要修改一下Makefile才能正常编译。

原Makefile内容如下

# Upstream build file for math-neon: builds the static library libmathneon.a
# and the math_debug program that links against it.

# Compiler flags: -O2 with debug info (-ggdb), target CPU Cortex-A9, NEON FPU
# with the softfp float ABI, plus the language-dialect flags.
CFLAGS := -O2 -ggdb -mcpu=cortex-a9 -mfloat-abi=softfp -mfpu=neon -ansi -std=gnu99 -pedantic
# Warning options, kept in their own variable and appended to CFLAGS below.
WARNINGS := -Wall -Wextra -Wno-unused-parameter -Wmissing-prototypes
# -Wa, forwards -mimplicit-it=thumb to the assembler.
ASSEMBLER := -Wa,-mimplicit-it=thumb

# `override` makes the append take effect even when CFLAGS is set on the
# make command line.
override CFLAGS += $(WARNINGS) $(ASSEMBLER)
# Libraries passed to the final link: the system math library.
LIBS := -lm

# Default goal: build the math_debug program.
all: math_debug

# Object files that make up the static library. This rule has no recipe of
# its own; the generic %.a rule further down supplies it.
libmathneon.a: math_acosf.o math_ldexpf.o math_powf.o math_sqrtfv.o \
	math_asinf.o math_expf.o math_log10f.o math_runfast.o math_tanf.o \
	math_atan2f.o  math_fabsf.o math_logf.o math_sincosf.o math_tanhf.o \
	math_atanf.o math_floorf.o math_mat2.o math_sinf.o math_vec2.o \
	math_ceilf.o math_fmodf.o math_mat3.o math_sinfv.o math_vec3.o \
	math_cosf.o math_frexpf.o math_mat4.o math_sinhf.o math_vec4.o \
	math_coshf.o math_invsqrtf.o math_modf.o math_sqrtf.o

# Link math_debug.o with the static library ($^ = all prerequisites).
math_debug: math_debug.o libmathneon.a
	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)

# Compile one C source file ($<) into its object file ($@).
%.o:: %.c
	$(CC) $(CFLAGS) -o $@ -c $<

# Archive all prerequisites into the target library; `rcs` = replace members,
# create the archive if needed, write a symbol index.
%.a::
	$(AR) rcs $@ $^

# Remove the program, all object files and all archives in this directory.
clean:
	$(RM) -v math_debug *.o *.a

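修改后的Makefile内容如下(主要改动：将CC/AR指定为mingw32ce工具链，-mcpu由cortex-a9改为cortex-a8，增加-DNO_ERRNO_H -D_WIN32_WCE宏定义，增加LDFLAGS，并去掉了-lm)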

# Cross-build of math-neon for Windows CE with the mingw32ce toolchain.
# Builds the static library libmathneon.a and the math_debug program that
# links against it.

# Remove a half-written target when its recipe fails, so a truncated file is
# never mistaken for an up-to-date one.
.DELETE_ON_ERROR:

# `all` and `clean` are commands, not files; without this a stray file with
# one of those names would silently disable the target.
.PHONY: all clean

# Toolchain. AR holds only the program name and its flags live in ARFLAGS, so
# either one can be overridden on the command line without losing the other.
CC := arm-mingw32ce-gcc
AR := arm-mingw32ce-ar
# r = replace/insert members, c = create the archive without a warning,
# s = write the symbol index (same flags as the upstream `ar rcs` recipe).
ARFLAGS := rcs

# Compiler flags: -O2 with debug info (-ggdb), target CPU Cortex-A8, NEON FPU
# with the softfp float ABI, the language-dialect flags, and the NO_ERRNO_H /
# _WIN32_WCE macros added for the Windows CE build.
CFLAGS := -O2 -ggdb -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon -ansi -std=gnu99 -pedantic -DNO_ERRNO_H -D_WIN32_WCE
# Search the current directory for libraries at link time.
LDFLAGS := -L.
# Warning options, kept in their own variable and appended to CFLAGS below.
WARNINGS := -Wall -Wextra -Wno-unused-parameter -Wmissing-prototypes
# -Wa, forwards -mimplicit-it=thumb to the assembler.
ASSEMBLER := -Wa,-mimplicit-it=thumb

# `override` makes the append take effect even when CFLAGS is set on the
# make command line.
override CFLAGS += $(WARNINGS) $(ASSEMBLER)
# No extra link libraries: the upstream `-lm` is intentionally dropped for
# this toolchain (presumably it ships no separate libm -- TODO confirm).
# Defined empty so that $(LIBS) in the link recipe is never undefined.
LIBS :=

# Default goal: build the math_debug program.
all: math_debug

# Object files that make up the static library. This rule has no recipe of
# its own; the generic %.a rule further down supplies it.
libmathneon.a: math_acosf.o math_ldexpf.o math_powf.o math_sqrtfv.o \
	math_asinf.o math_expf.o math_log10f.o math_runfast.o math_tanf.o \
	math_atan2f.o  math_fabsf.o math_logf.o math_sincosf.o math_tanhf.o \
	math_atanf.o math_floorf.o math_mat2.o math_sinf.o math_vec2.o \
	math_ceilf.o math_fmodf.o math_mat3.o math_sinfv.o math_vec3.o \
	math_cosf.o math_frexpf.o math_mat4.o math_sinhf.o math_vec4.o \
	math_coshf.o math_invsqrtf.o math_modf.o math_sqrtf.o

# Link math_debug.o with the static library ($^ = all prerequisites).
math_debug: math_debug.o libmathneon.a
	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)

# Compile one C source file ($<) into its object file ($@).
%.o:: %.c
	$(CC) $(CFLAGS) -o $@ -c $<

# Archive all prerequisites into the target library.
%.a::
	$(AR) $(ARFLAGS) $@ $^

# Remove the program, all object files and all archives in this directory.
# NOTE(review): MinGW-family toolchains may emit the program as
# math_debug.exe -- confirm; removing that name too is harmless because
# $(RM) is `rm -f`.
clean:
	$(RM) -v math_debug math_debug.exe *.o *.a

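测试结果如下。CMATH部分对每个函数依次列出系统函数、C语言优化函数(_c)和NEON汇编函数(_neon)三种实现，Rate列为相对于系统函数的加速倍数(系统函数耗时÷该实现耗时)；矩阵部分的rate为C实现耗时÷NEON实现耗时。注意：结果中cosf_neon的误差(ABS Max Error为1.41)以及asinf_neon、acosf_neon的RMS Error(输出为1.#R，即非正常数值)明显异常，使用这几个函数前需要先排查。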

RUNFAST: Enabled 
------------------------------------------------------------------------------------------------------
MATRIX FUNCTION TESTS 
------------------------------------------------------------------------------------------------------
matmul2_c = 
			|2.66, -2.73|
			|-5.74, -15.83|
matmul2_neon = 
			|2.66, -2.73|
			|-5.74, -15.83|
matmul2: c=112000 	 neon=65000 	 rate=1.72 

matvec2_c = |2.66, -5.74|
matvec2_neon = |2.66, -5.74|
matvec2: c=66000 	 neon=53000 	 rate=1.25
 
matmul3_c =
			|-17.73, -8.39, -1.10|
			|8.30, -5.32, 23.03|
			|-5.67, -7.81, 9.07|
matmul3_neon =
			|-17.73, -8.39, -1.10|
			|8.30, -5.32, 23.03|
			|-5.67, -7.81, 9.07|
matmul3: c=394000 	 neon=120000 	 rate=3.28 

matvec3_c = |-17.73, 8.30, -5.67|
matvec3_neon = |-17.73, 8.30, -5.67|
matvec3: c=66000 	 neon=53000 	 rate=1.25 

matmul4_c =
			|-8.86, 8.70, -17.78, -7.64|
			|-13.15, 20.92, -10.97, -14.02|
			|17.37, -14.46, -13.16, 33.82|
			|15.42, -27.32, -5.66, -6.37|
matmul4_neon =
			|-8.86, 8.70, -17.78, -7.64|
			|-13.15, 20.92, -10.97, -14.02|
			|17.37, -14.46, -13.16, 33.82|
			|15.42, -27.32, -5.66, -6.37|
matmul4: c=991000 	 neon=141000 	 rate=7.03 

matvec4_c = |-8.86, -13.15, 17.37, 15.418112|
matvec4_neon = |-8.86, -13.15, 17.37, 15.418112|
matvec4: c=66000 	 neon=53000 	 rate=1.25 

dot2_c = 3.756326
dot2_neon = 3.756326
dot2: c=532000 	 neon=497000 	 rate=1.07 

normalize2_c = [-0.74, -0.68]
normalize2_neon = [-0.74, -0.68]
normalize2: c=691000 	 neon=313000 	 rate=2.21 

dot3_c = 3.698457
dot3_neon = 3.698457
dot3: c=572000 	 neon=514000 	 rate=1.11 

normalize3_c = [-0.74, -0.68, -0.01]
normalize3_neon = [-0.74, -0.68, -0.01]
normalize3: c=806000 	 neon=353000 	 rate=2.28 

cross3_c = [-4.69, 5.12, -1.46]
cross3_neon = [-4.69, 5.12, -1.46]
cross3: c=586000 	 neon=373000 	 rate=1.57 

dot4_c = -4.564567
dot4_neon = -4.564566
dot4: c=625000 	 neon=487000 	 rate=1.28 

normalize4_c = [-0.24, -0.22, -0.00, 0.95]
normalize4_neon = [-0.24, -0.22, -0.00, 0.95]
normalize4: c=924000 	 neon=343000 	 rate=2.69 

------------------------------------------------------------------------------------------------------
CMATH FUNCTION TESTS 
------------------------------------------------------------------------------------------------------
Function	Range		Number	ABS Max Error	REL Max Error	RMS Error	Time	Rate
------------------------------------------------------------------------------------------------------
sinf       	[-3.14, 3.14]	500000	0.00e+000	0.00e+000%	0.00e+000	880000	x1.00	
sinf_c     	[-3.14, 3.14]	500000	8.34e-007	1.00e+002%	4.09e-007	162000	x5.43	
sinf_neon  	[-3.14, 3.14]	500000	8.34e-007	1.00e+002%	4.09e-007	96000	x9.17	
cosf       	[-3.14, 3.14]	500000	0.00e+000	0.00e+000%	0.00e+000	906000	x1.00	
cosf_c     	[-3.14, 3.14]	500000	8.34e-007	6.74e-001%	4.16e-007	192000	x4.72	
cosf_neon  	[-3.14, 3.14]	500000	1.41e+000	6.64e+007%	1.00e+000	142000	x6.38	
tanf       	[-0.79, 0.79]	500000	0.00e+000	0.00e+000%	0.00e+000	1140000	x1.00	
tanf_c     	[-0.79, 0.79]	500000	2.98e-006	7.97e-004%	1.31e-006	200000	x5.70	
tanf_neon  	[-0.79, 0.79]	500000	1.91e-006	3.62e-004%	6.66e-007	126000	x9.05	
asinf      	[-1.00, 1.00]	500000	0.00e+000	0.00e+000%	0.00e+000	2732000	x1.00	
asinf_c    	[-1.00, 1.00]	500000	5.53e-005	1.06e-002%	1.69e-005	277000	x9.86	
asinf_neon 	[-1.00, 1.00]	500000	4.65e-005	8.87e-003%	1.#Re+000	151000	x18.09	
acosf      	[-1.00, 1.00]	500000	0.00e+000	0.00e+000%	0.00e+000	2670000	x1.00	
acosf_c    	[-1.00, 1.00]	500000	5.56e-005	6.46e-003%	1.69e-005	312000	x8.56	
acosf_neon 	[-1.00, 1.00]	500000	4.67e-005	6.35e-003%	1.#Re+000	171000	x15.61	
atanf      	[-1.00, 1.00]	500000	0.00e+000	0.00e+000%	0.00e+000	1021000	x1.00	
atanf_c    	[-1.00, 1.00]	500000	1.67e-004	2.12e-002%	7.40e-005	198000	x5.16	
atanf_neon 	[-1.00, 1.00]	500000	1.67e-004	2.12e-002%	7.40e-005	121000	x8.44	
sinhf       	[-3.14, 3.14]	500000	0.00e+000	0.00e+000%	0.00e+000	1509000	x1.00	
sinhf_c     	[-3.14, 3.14]	500000	1.91e-006	1.52e-001%	2.37e-007	280000	x5.39	
sinhf_neon  	[-3.14, 3.14]	500000	1.91e-006	1.52e-001%	1.90e-007	108000	x13.97	
coshf       	[-3.14, 3.14]	500000	0.00e+000	0.00e+000%	0.00e+000	1163000	x1.00	
coshf_c     	[-3.14, 3.14]	500000	1.91e-006	2.37e-005%	2.28e-007	283000	x4.11	
coshf_neon  	[-3.14, 3.14]	500000	1.91e-006	2.22e-005%	1.68e-007	108000	x10.77	
tanhf       	[-3.14, 3.14]	500000	0.00e+000	0.00e+000%	0.00e+000	1555000	x1.00	
tanhf_c     	[-3.14, 3.14]	500000	1.21e-005	2.48e-001%	5.48e-006	235000	x6.62	
tanhf_neon  	[-3.14, 3.14]	500000	2.38e-007	2.47e-001%	5.40e-008	119000	x13.07	
expf       	[0.00, 10.00]	500000	0.00e+000	0.00e+000%	0.00e+000	960000	x1.00	
expf_c     	[0.00, 10.00]	500000	9.77e-003	6.58e-005%	1.64e-003	132000	x7.27	
expf_neon  	[0.00, 10.00]	500000	9.77e-003	6.58e-005%	1.64e-003	88000	x10.91	
logf       	[1.00, 1000.00]	500000	0.00e+000	0.00e+000%	0.00e+000	1027000	x1.00	
logf_c     	[1.00, 1000.00]	500000	7.63e-006	1.03e-002%	1.07e-006	116000	x8.85	
logf_neon  	[1.00, 1000.00]	500000	7.63e-006	1.03e-002%	1.07e-006	82000	x12.52	
log10f       	[1.00, 1000.00]	500000	0.00e+000	0.00e+000%	0.00e+000	1202000	x1.00	
log10f_c     	[1.00, 1000.00]	500000	3.34e-006	6.68e-003%	4.84e-007	116000	x10.36	
log10f_neon  	[1.00, 1000.00]	500000	3.34e-006	6.68e-003%	4.84e-007	81000	x14.84	
floorf     	[1.00, 1000.00]	5000000	0.00e+000	0.00e+000%	0.00e+000	4705000	x1.00	
floorf_c   	[1.00, 1000.00]	5000000	0.00e+000	0.00e+000%	0.00e+000	819000	x5.74	
floorf_neon	[1.00, 1000.00]	5000000	0.00e+000	0.00e+000%	0.00e+000	671000	x7.01	
ceilf     	[1.00, 1000.00]	5000000	0.00e+000	0.00e+000%	0.00e+000	5734000	x1.00	
ceilf_c   	[1.00, 1000.00]	5000000	0.00e+000	0.00e+000%	0.00e+000	814000	x7.04	
ceilf_neon	[1.00, 1000.00]	5000000	0.00e+000	0.00e+000%	0.00e+000	696000	x8.24	
fabsf     	[1.00, 1000.00]	5000000	0.00e+000	0.00e+000%	0.00e+000	2005000	x1.00	
fabsf_c   	[1.00, 1000.00]	5000000	0.00e+000	0.00e+000%	0.00e+000	455000	x4.41	
fabsf_neon	[1.00, 1000.00]	5000000	0.00e+000	0.00e+000%	0.00e+000	446000	x4.50	
sqrtf      	[1.00, 1000.00]	500000	0.00e+000	0.00e+000%	0.00e+000	3222000	x1.00	
sqrtf_c    	[1.00, 1000.00]	500000	2.33e-004	1.06e-003%	8.69e-005	139000	x23.18	
sqrtf_neon 	[1.00, 1000.00]	500000	7.63e-006	2.91e-005%	1.60e-006	85000	x37.91	
invsqrtf      	[1.00, 1000.00]	500000	0.00e+000	0.00e+000%	0.00e+000	106000	x1.00	
invsqrtf_c    	[1.00, 1000.00]	500000	4.35e-006	4.78e-004%	2.00e-007	94000	x1.13	
invsqrtf_neon 	[1.00, 1000.00]	500000	1.19e-007	2.12e-005%	4.81e-009	70000	x1.51	
atan2f       	[0.10, 10.00]	10000	0.00e+000	0.00e+000%	0.00e+000	2388000	x1.00	
atan2f_c     	[0.10, 10.00]	10000	1.73e-004	2.23e-002%	0.00e+000	657000	x3.63	
atan2f_neon  	[0.10, 10.00]	10000	1.67e-004	2.12e-002%	0.00e+000	278000	x8.59	
powf       	[1.00, 10.00]	10000	0.00e+000	0.00e+000%	0.00e+000	8316000	x1.00	
powf_c     	[1.00, 10.00]	10000	1.36e+005	5.88e-003%	0.00e+000	493000	x16.87	
powf_neon  	[1.00, 10.00]	10000	1.36e+005	5.88e-003%	0.00e+000	292000	x28.48	
fmodf       	[1.00, 10.00]	10000	0.00e+000	0.00e+000%	0.00e+000	1394000	x1.00	
fmodf_c     	[1.00, 10.00]	10000	9.99e+000	8.06e-002%	0.00e+000	341000	x4.09	
fmodf_neon  	[1.00, 10.00]	10000	9.97e+000	8.06e-002%	0.00e+000	238000	x5.86