ARM架构NEON intrinsic函数学习笔记(三)

        本篇文章记录一下如何使用结构体寄存器变量并行处理数据,并且实现最普通的一次处理单个字节的方法,比较两种方法的性能优劣。

        这次是通过实现一张rgb图像的r,g,b三通道分离来进行学习。rgb图像是三个通道组成的,一个像素点的色彩是由三个通道对应的r,g,b三个颜色来表示的,并且一个像素点的三个r,g,b是连续存储的。

rgbrgb

        通过通道分离,将属于一个通道的存放在一个数组当中:

rrr
ggg
bbb

                数据类型uint8x16x3_t 可以用来定义一个存储3个uint8x16_t的结构体。由于一个uint8x16_t就已经是一个128位的寄存器了,所以uint8x16x3_t其实就是三个128位寄存器的组合;

                函数vld3q_u8()从一个元素为uint8的数组中连续读取48个字节,并按步长3做解交织(de-interleave):第0、3、6…个字节存入val[0],第1、4、7…个字节存入val[1],第2、5、8…个字节存入val[2],即分别填满三个neon寄存器。

                另外,由于uint8占8比特,vld3q_u8一次可以将三个128位的寄存器填满,所以在读取rgb数组的时候,一次可以处理128/8=16个像素点(共48个字节)。

                对于一次处理16个像素点来讲,图像的像素数量如果不是16的整数倍的话,应该将前面是16整数倍的通过neon进行处理,后面不足16个像素组的像素点只能按照单个uint8的方式处理。

       

        通过上面的分析,可以得到处理函数:

/**
 * De-interleave a packed RGB buffer into three planar channel buffers.
 *
 * @param r    destination for the red channel, at least len bytes
 * @param g    destination for the green channel, at least len bytes
 * @param b    destination for the blue channel, at least len bytes
 * @param rgb  source pixels stored as r,g,b,r,g,b,...; at least 3*len bytes
 * @param len  number of pixels; values <= 0 make the call a no-op
 *
 * Pixels are handled 16 at a time with NEON; the remaining (len % 16)
 * pixels are copied one byte at a time.
 */
void get_rgb(uint8_t *r,uint8_t *g,uint8_t *b,const uint8_t *rgb,const int len){
        if(len<=0){
                return;
        }

        // Index in 64 bits: the byte offset px*3 no longer fits in an int
        // once len exceeds INT_MAX/3.
        const int64_t total=len;
        const int64_t vec_end=total-(total%16);
        int64_t px=0;

        // vld3q_u8 reads 48 bytes and routes every third byte to the same
        // register, so val[0]/val[1]/val[2] each hold 16 r/g/b samples.
        for(;px<vec_end;px+=16){
                const uint8x16x3_t intlv_rgb=vld3q_u8(rgb+px*3);
                vst1q_u8(r+px,intlv_rgb.val[0]);
                vst1q_u8(g+px,intlv_rgb.val[1]);
                vst1q_u8(b+px,intlv_rgb.val[2]);
        }

        // Scalar tail for the pixels that do not fill a whole vector.
        for(;px<total;px++){
                const uint8_t *src=rgb+px*3;
                r[px]=src[0];
                g[px]=src[1];
                b[px]=src[2];
        }
}

        ps:为了方便,我没有使用完整的一张图,而是定义了一个含有100000个像素点的数组来进行实验。

        实验结果:

                可以看出,使用intrinsic函数的效果还是很明显的。

------------------------------------------------------------------------------------------------------------------

补充:

        后面又了解到gcc编译器有多个级别的优化,分别对应着O0,O1,O2,Os,O3以及其他优化,现在来通过使用不同的优化级别来观察性能好坏。

-O0

-O1

-O2

-Os

-O3

        可以看出较高的优化等级确实进行了某些方面的加速,但具体加速在哪儿,由于我现在看汇编还很费劲,所以暂放哈哈哈哈哈哈,等能看懂汇编了再来补充。

        考虑到互联网上大佬很多,所以将O0和O3的汇编代码展示出来,希望有懂这块内容的大佬们伸出援助之手,帮忙解答一下。

-O0

	// Compiler-emitted preamble for test.cpp, targeting ARMv8-A (AArch64).
	.arch armv8-a
	.file	"test.cpp"
	.text
	// Read-only data, aligned to 2^3 = 8 bytes.
	.section	.rodata
	.align	3
	// One zero byte for the object whose mangled name decodes to
	// std::piecewise_construct (internal linkage; no .global directive).
	.type	_ZStL19piecewise_construct, %object
	.size	_ZStL19piecewise_construct, 1
_ZStL19piecewise_construct:
	.zero	1
	// std::__ioinit: file-local common symbol, 1 byte, 8-byte aligned.
	// Its address is passed to the iostream init/teardown calls later in
	// this listing.
	.local	_ZStL8__ioinit
	.comm	_ZStL8__ioinit,1,8
	// ---------------------------------------------------------------------
	// void get_rgb(uint8_t *r, uint8_t *g, uint8_t *b,
	//              const uint8_t *rgb, int len)            -- built with -O0
	// In:  x0 = r, x1 = g, x2 = b, x3 = rgb, w4 = len
	//
	// At -O0 every C variable lives in the 320-byte stack frame and is
	// reloaded from memory before each use. Frame layout (offsets from x29):
	//    28  len            32  rgb           40  b
	//    48  g              56  r             64  i   (vector loop counter)
	//    68  j  (tail)      72  i  (tail)     76  num (= len / 16)
	//    80  address passed to the ld3
	//    88 / 96 / 104      destination addresses for the r / g / b stores
	//   112 / 128 / 144     16-byte temporaries for the r / g / b stores
	//   160  raw result of the ld3 (3 x 16 bytes)
	//   208  copy of the three vectors that the stores read from
	//        (presumably the C variable intlv_rgb)
	//   256  intermediate copy of the three vectors
	//   312  stack-protector canary
	// ---------------------------------------------------------------------
	.text
	.align	2
	.global	_Z7get_rgbPhS_S_PKhi
	.type	_Z7get_rgbPhS_S_PKhi, %function
_Z7get_rgbPhS_S_PKhi:
.LFB5248:
	.cfi_startproc
	// Prologue: allocate the 320-byte frame, save FP/LR at its base.
	stp	x29, x30, [sp, -320]!
	.cfi_def_cfa_offset 320
	.cfi_offset 29, -320
	.cfi_offset 30, -312
	add	x29, sp, 0
	.cfi_def_cfa_register 29
	// Spill all five incoming arguments to the frame.
	str	x0, [x29, 56]
	str	x1, [x29, 48]
	str	x2, [x29, 40]
	str	x3, [x29, 32]
	str	w4, [x29, 28]
	// Copy the stack-protector guard value into the frame (offset 312).
	adrp	x0, :got:__stack_chk_guard
	ldr	x0, [x0, #:got_lo12:__stack_chk_guard]
	ldr	x1, [x0]
	str	x1, [x29, 312]
	mov	x1,0
	// num = len / 16 with C semantics (round toward zero):
	// add 15 first when len is negative, then arithmetic shift right by 4.
	ldr	w0, [x29, 28]
	add	w1, w0, 15
	cmp	w0, 0
	csel	w0, w1, w0, lt
	asr	w0, w0, 4
	str	w0, [x29, 76]
	// i = 0
	str	wzr, [x29, 64]
.L4:
	// Vector loop header: leave when i >= num.
	ldr	w1, [x29, 64]
	ldr	w0, [x29, 76]
	cmp	w1, w0
	bge	.L2
	// x0 = &rgb[i*48], computed as ((i << 1) + i) << 4, sign-extended.
	ldr	w1, [x29, 64]
	mov	w0, w1
	lsl	w0, w0, 1
	add	w0, w0, w1
	lsl	w0, w0, 4
	sxtw	x0, w0
	ldr	x1, [x29, 32]
	add	x0, x1, x0
	str	x0, [x29, 80]
	ldr	x0, [x29, 80]
	// vld3q_u8: load 48 bytes, de-interleaved into v0 / v1 / v2.
	ld3	{v0.16b - v2.16b}, [x0]
	// Everything up to the next blank comment is -O0 copy traffic: the
	// three vectors are written to frame+160, re-read three times to move
	// each lane set into frame+256/+272/+288, then copied to frame+208.
	add	x0, x29, 160
	st1	{v0.16b - v2.16b}, [x0]
	add	x0, x29, 160
	ld1	{v0.16b - v2.16b}, [x0]
	add	x0, x29, 256
	str	q0, [x0]
	add	x0, x29, 160
	ld1	{v0.16b - v2.16b}, [x0]
	mov	v0.16b, v1.16b
	add	x0, x29, 256
	str	q0, [x0, 16]
	add	x0, x29, 160
	ld1	{v0.16b - v2.16b}, [x0]
	mov	v0.16b, v2.16b
	add	x0, x29, 256
	str	q0, [x0, 32]
	add	x0, x29, 256
	ld1	{v0.16b - v2.16b}, [x0]
	add	x0, x29, 208
	st1	{v0.16b - v2.16b}, [x0]
	//
	// vst1q_u8(&r[i*16], val[0]): address and vector are each bounced
	// through a stack temporary before the actual 16-byte store.
	ldr	w0, [x29, 64]
	lsl	w0, w0, 4
	sxtw	x0, w0
	ldr	x1, [x29, 56]
	add	x0, x1, x0
	add	x1, x29, 208
	ldr	q0, [x1]
	str	x0, [x29, 88]
	str	q0, [x29, 112]
	ldr	q0, [x29, 112]
	ldr	x0, [x29, 88]
	str	q0, [x0]
	// vst1q_u8(&g[i*16], val[1])
	ldr	w0, [x29, 64]
	lsl	w0, w0, 4
	sxtw	x0, w0
	ldr	x1, [x29, 48]
	add	x0, x1, x0
	add	x1, x29, 208
	ldr	q0, [x1, 16]
	str	x0, [x29, 96]
	str	q0, [x29, 128]
	ldr	q0, [x29, 128]
	ldr	x0, [x29, 96]
	str	q0, [x0]
	// vst1q_u8(&b[i*16], val[2])
	ldr	w0, [x29, 64]
	lsl	w0, w0, 4
	sxtw	x0, w0
	ldr	x1, [x29, 40]
	add	x0, x1, x0
	add	x1, x29, 208
	ldr	q0, [x1, 32]
	str	x0, [x29, 104]
	str	q0, [x29, 144]
	ldr	q0, [x29, 144]
	ldr	x0, [x29, 104]
	str	q0, [x0]
	// i++ and back to the loop header.
	ldr	w0, [x29, 64]
	add	w0, w0, 1
	str	w0, [x29, 64]
	b	.L4
.L2:
	// Tail loop setup: j = num*16, i = num*48.
	ldr	w0, [x29, 76]
	lsl	w0, w0, 4
	str	w0, [x29, 68]
	ldr	w1, [x29, 76]
	mov	w0, w1
	lsl	w0, w0, 1
	add	w0, w0, w1
	lsl	w0, w0, 4
	str	w0, [x29, 72]
.L6:
	// Tail loop header: leave when i >= len*3 ...
	ldr	w1, [x29, 28]
	mov	w0, w1
	lsl	w0, w0, 1
	add	w0, w0, w1
	ldr	w1, [x29, 72]
	cmp	w1, w0
	bge	.L8
	// ... or when j >= len. len*3 is recomputed on every iteration.
	ldr	w1, [x29, 68]
	ldr	w0, [x29, 28]
	cmp	w1, w0
	bge	.L8
	// r[j] = rgb[i]
	ldrsw	x0, [x29, 72]
	ldr	x1, [x29, 32]
	add	x1, x1, x0
	ldrsw	x0, [x29, 68]
	ldr	x2, [x29, 56]
	add	x0, x2, x0
	ldrb	w1, [x1]
	strb	w1, [x0]
	// g[j] = rgb[i+1]
	ldrsw	x0, [x29, 72]
	add	x0, x0, 1
	ldr	x1, [x29, 32]
	add	x1, x1, x0
	ldrsw	x0, [x29, 68]
	ldr	x2, [x29, 48]
	add	x0, x2, x0
	ldrb	w1, [x1]
	strb	w1, [x0]
	// b[j] = rgb[i+2]
	ldrsw	x0, [x29, 72]
	add	x0, x0, 2
	ldr	x1, [x29, 32]
	add	x1, x1, x0
	ldrsw	x0, [x29, 68]
	ldr	x2, [x29, 40]
	add	x0, x2, x0
	ldrb	w1, [x1]
	strb	w1, [x0]
	// i += 3; j += 1
	ldr	w0, [x29, 72]
	add	w0, w0, 3
	str	w0, [x29, 72]
	ldr	w0, [x29, 68]
	add	w0, w0, 1
	str	w0, [x29, 68]
	b	.L6
.L8:
	nop
	// Stack-protector check: compare the saved canary with the guard and
	// call __stack_chk_fail if they differ.
	adrp	x0, :got:__stack_chk_guard
	ldr	x0, [x0, #:got_lo12:__stack_chk_guard]
	ldr	x1, [x29, 312]
	ldr	x0, [x0]
	eor	x0, x1, x0
	cmp	x0, 0
	beq	.L7
	bl	__stack_chk_fail
.L7:
	// Epilogue: restore FP/LR, release the 320-byte frame.
	ldp	x29, x30, [sp], 320
	.cfi_restore 30
	.cfi_restore 29
	.cfi_def_cfa 31, 0
	ret
	.cfi_endproc
.LFE5248:
	.size	_Z7get_rgbPhS_S_PKhi, .-_Z7get_rgbPhS_S_PKhi
	// int main(): as shown in this listing it only returns 0 in w0;
	// no call to get_rgb appears here.
	.align	2
	.global	main
	.type	main, %function
main:
.LFB5249:
	.cfi_startproc
	mov	w0, 0
	ret
	.cfi_endproc
.LFE5249:
	.size	main, .-main
	// __static_initialization_and_destruction_0(int, int)
	// In:  w0 = first int argument, w1 = second int argument
	// File-local helper (no .global). When called with (1, 65535) it
	// constructs the std::ios_base::Init object __ioinit and registers its
	// destructor with __cxa_atexit; for any other arguments it does nothing.
	.align	2
	.type	_Z41__static_initialization_and_destruction_0ii, %function
_Z41__static_initialization_and_destruction_0ii:
.LFB5730:
	.cfi_startproc
	// Prologue: 32-byte frame, save FP/LR, spill both arguments.
	stp	x29, x30, [sp, -32]!
	.cfi_def_cfa_offset 32
	.cfi_offset 29, -32
	.cfi_offset 30, -24
	add	x29, sp, 0
	.cfi_def_cfa_register 29
	str	w0, [x29, 28]
	str	w1, [x29, 24]
	// Skip everything unless the first argument is 1 ...
	ldr	w0, [x29, 28]
	cmp	w0, 1
	bne	.L13
	// ... and the second argument is 65535.
	ldr	w1, [x29, 24]
	mov	w0, 65535
	cmp	w1, w0
	bne	.L13
	// std::ios_base::Init::Init(&__ioinit)
	adrp	x0, _ZStL8__ioinit
	add	x0, x0, :lo12:_ZStL8__ioinit
	bl	_ZNSt8ios_base4InitC1Ev
	// __cxa_atexit(&std::ios_base::Init::~Init, &__ioinit, &__dso_handle)
	// x0 = destructor address (loaded from the GOT), x1 = object, x2 = DSO handle.
	adrp	x0, __dso_handle
	add	x2, x0, :lo12:__dso_handle
	adrp	x0, _ZStL8__ioinit
	add	x1, x0, :lo12:_ZStL8__ioinit
	adrp	x0, :got:_ZNSt8ios_base4InitD1Ev
	ldr	x0, [x0, #:got_lo12:_ZNSt8ios_base4InitD1Ev]
	bl	__cxa_atexit
.L13:
	nop
	// Epilogue: restore FP/LR and release the frame.
	ldp	x29, x30, [sp], 32
	.cfi_restore 30
	.cfi_restore 29
	.cfi_def_cfa 31, 0
	ret
	.cfi_endproc
.LFE5730:
	.size	_Z41__static_initialization_and_destruction_0ii, .-_Z41__static_initialization_and_destruction_0ii
	// Static-initialisation entry for this translation unit: calls
	// __static_initialization_and_destruction_0(1, 65535), i.e. the argument
	// pair that makes that helper actually perform the iostream setup.
	.align	2
	.type	_GLOBAL__sub_I__Z7get_rgbPhS_S_PKhi, %function
_GLOBAL__sub_I__Z7get_rgbPhS_S_PKhi:
.LFB5731:
	.cfi_startproc
	// Non-leaf prologue: save FP/LR in a 16-byte frame.
	stp	x29, x30, [sp, -16]!
	.cfi_def_cfa_offset 16
	.cfi_offset 29, -16
	.cfi_offset 30, -8
	add	x29, sp, 0
	.cfi_def_cfa_register 29
	// w0 = 1, w1 = 65535
	mov	w1, 65535
	mov	w0, 1
	bl	_Z41__static_initialization_and_destruction_0ii
	ldp	x29, x30, [sp], 16
	.cfi_restore 30
	.cfi_restore 29
	.cfi_def_cfa 31, 0
	ret
	.cfi_endproc
.LFE5731:
	.size	_GLOBAL__sub_I__Z7get_rgbPhS_S_PKhi, .-_GLOBAL__sub_I__Z7get_rgbPhS_S_PKhi
	// Place an 8-byte pointer to the function above in .init_array.
	.section	.init_array,"aw"
	.align	3
	.xword	_GLOBAL__sub_I__Z7get_rgbPhS_S_PKhi
	.hidden	__dso_handle
	// Compiler identification string and the empty .note.GNU-stack section.
	.ident	"GCC: (Ubuntu/Linaro 7.5.0-3ubuntu1~18.04) 7.5.0"
	.section	.note.GNU-stack,"",@progbits

-O3

	// Compiler-emitted preamble for test.cpp, targeting ARMv8-A (AArch64).
	.arch armv8-a
	.file	"test.cpp"
	.text
	.align	2
	.p2align 3,,7
	// ---------------------------------------------------------------------
	// void get_rgb(uint8_t *r, uint8_t *g, uint8_t *b,
	//              const uint8_t *rgb, int len)            -- built with -O3
	// In:  x0 = r, x1 = g, x2 = b, x3 = rgb, w4 = len
	//
	// No stack frame is created and sp is never touched: every value stays
	// in a register. Register roles:
	//   w5        num = len / 16, later reused as the tail counter j
	//   x6        read cursor into rgb        (vector loop)
	//   x9/x8/x7  write cursors into r/g/b    (vector loop)
	//   x10       rgb + num*48, end address of the vector part
	//   w7        tail byte index i (after .L2)
	//   w9        len*3, computed once
	//   x6        byte offset added to r/g/b  (tail loop)
	// ---------------------------------------------------------------------
	.global	_Z7get_rgbPhS_S_PKhi
	.type	_Z7get_rgbPhS_S_PKhi, %function
_Z7get_rgbPhS_S_PKhi:
.LFB5319:
	.cfi_startproc
	// w5 = len / 16, rounded toward zero (add 15 first when len < 0).
	cmp	w4, 0
	add	w5, w4, 15
	csel	w5, w5, w4, lt
	asr	w5, w5, 4
	// Skip the vector loop entirely when num <= 0.
	cmp	w5, 0
	ble	.L2
	// x10 = (rgb + 48) + (num - 1) * 48 = rgb + num*48.
	add	x7, x3, 48
	sub	w10, w5, #1
	mov	w11, 48
	mov	x6, x3
	mov	x9, x0
	mov	x8, x1
	umaddl	x10, w10, w11, x7
	mov	x7, x2
	.p2align 3
.L3:
	// One iteration = 16 pixels: a de-interleaving 48-byte load followed by
	// three 16-byte stores. All four accesses use post-index addressing, so
	// the pointers advance without separate add instructions.
	ld3	{v1.16b - v3.16b}, [x6], 48
	str	q1, [x9], 16
	// Loop until the read cursor reaches the precomputed end address.
	cmp	x10, x6
	str	q2, [x8], 16
	str	q3, [x7], 16
	bne	.L3
.L2:
	// Tail setup: w7 = num*48 (i), w5 = num*16 (j), w9 = len*3.
	add	w7, w5, w5, lsl 1
	lsl	w5, w5, 4
	add	w9, w4, w4, lsl 1
	cmp	w4, w5
	lsl	w7, w7, 4
	// Combined test "j < len && i < len*3": if len > j, compare i with
	// len*3; otherwise force NZCV = 0, which makes the following bge taken.
	ccmp	w7, w9, 0, gt
	bge	.L1
	// Advance all four pointers to the first unprocessed pixel.
	sxtw	x8, w5
	add	x3, x3, x7, sxtw
	add	x0, x0, x8
	add	x1, x1, x8
	add	x2, x2, x8
	mov	x6, 0
	.p2align 3
.L5:
	// r[j] = rgb[i]; x3 is bumped by 3 right away, so the g and b bytes are
	// read below at offsets -2 and -1.
	ldrb	w8, [x3]
	add	x3, x3, 3
	strb	w8, [x0, x6]
	// j++, i += 3, then evaluate the loop condition early; the loads,
	// stores and the plain add that follow do not modify the flags.
	add	w5, w5, 1
	add	w7, w7, 3
	cmp	w4, w5
	ldrb	w8, [x3, -2]
	ccmp	w7, w9, 0, gt
	strb	w8, [x1, x6]
	ldrb	w8, [x3, -1]
	strb	w8, [x2, x6]
	add	x6, x6, 1
	// Continue while j < len && i < len*3.
	blt	.L5
.L1:
	ret
	.cfi_endproc
.LFE5319:
	.size	_Z7get_rgbPhS_S_PKhi, .-_Z7get_rgbPhS_S_PKhi
	// int main(): placed in the .text.startup section; as shown in this
	// listing it only returns 0 in w0.
	.section	.text.startup,"ax",@progbits
	.align	2
	.p2align 3,,7
	.global	main
	.type	main, %function
main:
.LFB5320:
	.cfi_startproc
	mov	w0, 0
	ret
	.cfi_endproc
.LFE5320:
	.size	main, .-main
	// Static-initialisation entry for this translation unit (-O3 build).
	// It constructs the std::ios_base::Init object __ioinit directly and
	// then tail-calls __cxa_atexit to register its destructor; there is no
	// separate helper function in this listing.
	.align	2
	.p2align 3,,7
	.type	_GLOBAL__sub_I__Z7get_rgbPhS_S_PKhi, %function
_GLOBAL__sub_I__Z7get_rgbPhS_S_PKhi:
.LFB5801:
	.cfi_startproc
	// Prologue: 32-byte frame, save FP/LR and the callee-saved x19.
	stp	x29, x30, [sp, -32]!
	.cfi_def_cfa_offset 32
	.cfi_offset 29, -32
	.cfi_offset 30, -24
	add	x29, sp, 0
	.cfi_def_cfa_register 29
	str	x19, [sp, 16]
	.cfi_offset 19, -16
	// x19 = address of .LANCHOR0, which is defined below at the location
	// of _ZStL8__ioinit. Kept in x19 so it survives the call.
	adrp	x19, .LANCHOR0
	add	x19, x19, :lo12:.LANCHOR0
	// std::ios_base::Init::Init(&__ioinit)
	mov	x0, x19
	bl	_ZNSt8ios_base4InitC1Ev
	// Set up __cxa_atexit(&std::ios_base::Init::~Init, &__ioinit, &__dso_handle)
	// in x0/x1/x2 while restoring x19 and tearing down the frame.
	adrp	x0, :got:_ZNSt8ios_base4InitD1Ev
	mov	x1, x19
	ldr	x19, [sp, 16]
	adrp	x2, __dso_handle
	ldr	x0, [x0, #:got_lo12:_ZNSt8ios_base4InitD1Ev]
	add	x2, x2, :lo12:__dso_handle
	ldp	x29, x30, [sp], 32
	.cfi_restore 30
	.cfi_restore 29
	.cfi_restore 19
	.cfi_def_cfa 31, 0
	// Tail call: __cxa_atexit returns straight to this function's caller.
	b	__cxa_atexit
	.cfi_endproc
.LFE5801:
	.size	_GLOBAL__sub_I__Z7get_rgbPhS_S_PKhi, .-_GLOBAL__sub_I__Z7get_rgbPhS_S_PKhi
	// Place an 8-byte pointer to the function above in .init_array.
	.section	.init_array,"aw"
	.align	3
	.xword	_GLOBAL__sub_I__Z7get_rgbPhS_S_PKhi
	// Zero-initialised storage: the 1-byte __ioinit object, 8-byte aligned,
	// with .LANCHOR0 set to the same address.
	.bss
	.align	3
	.set	.LANCHOR0,. + 0
	.type	_ZStL8__ioinit, %object
	.size	_ZStL8__ioinit, 1
_ZStL8__ioinit:
	.zero	1
	.hidden	__dso_handle
	// Compiler identification string and the empty .note.GNU-stack section.
	.ident	"GCC: (Ubuntu/Linaro 7.5.0-3ubuntu1~18.04) 7.5.0"
	.section	.note.GNU-stack,"",@progbits

        我只知道O3优化后的代码变少了哈哈哈哈哈哈。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值