编译器对全局变量的处理____AV_COPY128比memcpy快
#include <stdio.h>
#include <stdlib.h>
/*
 * Record used by the demo program: one int followed by an array of 20 ints.
 * main() reads a value into buf1 and prints the address of buffer.
 */
typedef struct buf_st buf_st;

struct buf_st {
    int buf1;        /* value read from stdin */
    int buffer[20];  /* only its address is printed */
};
/* Global pointer to the heap-allocated record. Its address shows up as a
 * literal-pool word (.word st) in the assembly listing that follows. */
buf_st *st;
/*
 * Allocates one buf_st, reads an integer into st->buf1, then prints the
 * address of st->buffer together with the value that was read.
 *
 * NOTE(review): the results of malloc() and scanf() are not checked, and the
 * allocation is never freed. The code is left untouched so that it keeps
 * matching the compiler output shown after it.
 */
int main()
{
st=(buf_st *)malloc(sizeof(buf_st));
scanf("%d",&st->buf1);
/* NOTE(review): %p formally expects a void *; st->buffer is passed as int *. */
printf("%p: %d\n",st->buffer,st->buf1);
return 0;
}
----------------------编译成的汇编:(命令是: arm-none-linux-gnueabi-gcc test_address.c -S -o test_address.asm)
.cpu arm10tdmi
.fpu softvfp
.eabi_attribute 20, 1
.eabi_attribute 21, 1
.eabi_attribute 23, 3
.eabi_attribute 24, 1
.eabi_attribute 25, 1
.eabi_attribute 26, 2
.eabi_attribute 30, 6
.eabi_attribute 18, 4
.file "test_address.c"
.section .rodata
.align 2
.LC0:
.ascii "%d\000"
.align 2
.LC1:
.ascii "%p: %d\012\000"
.text
.align 2
.global main
.type main, %function
main:
.fnstart
.LFB2:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 1, uses_anonymous_args = 0
@ Prologue: push fp and lr, then set fp = sp + 4.
stmfd sp!, {fp, lr}
.save {fp, lr}
.LCFI0:
.setfp fp, sp, #4
add fp, sp, #4
.LCFI1:
@ malloc(84): 84 is the constant emitted for sizeof(buf_st) in the C source.
mov r0, #84
bl malloc
mov r3, r0
mov r2, r3
@ .L3 is the literal-pool word holding the address of the global st;
@ the value returned by malloc is stored through it (st = ...).
ldr r3, .L3
str r2, [r3, #0]
@ scanf call: r0 = word at .L3+4 (address of .LC0, "%d"), r1 = current value
@ of st. The C source passes &st->buf1, which is the same address as st here.
ldr r3, .L3
ldr r3, [r3, #0]
ldr r0, .L3+4
mov r1, r3
bl scanf
@ printf argument 1: r2 = st + 4, matching st->buffer in the C source.
ldr r3, .L3
ldr r3, [r3, #0]
add r2, r3, #4
ldr r3, .L3 /// load the address of the global pointer st (&st) from the literal pool
ldr r3, [r3, #0]
ldr r3, [r3, #0] /// two loads are needed: the previous one reads the pointer value st from &st, this one reads the int at offset 0 of the record st points to (st->buf1)
@ printf call: r0 = word at .L3+8 (address of .LC1, the format string),
@ r1 = st->buffer, r2 = st->buf1.
@ NOTE(review): st is re-read from memory before every use; the compile
@ command shown passes no -O flag, which presumably explains this.
ldr r0, .L3+8
mov r1, r2
mov r2, r3
bl printf
@ return 0: r0 = 0, then restore fp and load pc with the saved lr.
mov r3, #0
mov r0, r3
ldmfd sp!, {fp, pc}
.L4:
.align 2
.L3:
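.word st ---->这里说明这个全局指针的地址在使用之前(链接时)就已经填入 .L3 这个字面量池了,使用的时候只需要用 ldr r3, .L3 把它取出来。注意 .L3+4、.L3+8 指的是字面量池里后面的两个字(.word .LC0 和 .word .LC1,即两个格式字符串的地址),所以上面的 ldr r0, .L3+8 取的是 .LC1 的地址,而不是在 st 的地址上加偏移量。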
.word .LC0
.word .LC1
.LFE2:
.fnend
.size main, .-main
.comm st,4,4
.ident "GCC: (Sourcery G++ Lite 2009q1-203) 4.3.3"
.section .note.GNU-stack,"",%progbits
-----------------------------------------------------------------------------------
根据上面的实验:一个全局变量的地址在编译、链接阶段就已经确定(汇编中的 .comm st,4,4 同时给出了它的大小和对齐要求),因此编译器可以知道这个全局变量的地址是 4 字节对齐还是 8 字节对齐。
故:ffmpeg中的AV_COPY128 这个宏编译成汇编如下
.L4:
@ One iteration copies 16 bytes from [r0] to [ip]; both pointers advance by 16
@ and the loop repeats until r0 equals lr.
ldmia r0, {r3-r4} /// ldm is used directly, with no alignment check; the author attributes this to the compiler already knowing the address is 8-byte aligned, so it reads and writes straight away. NOTE(review): confirm the cause.
stmia ip, {r3-r4}
@ Second 8 bytes: loaded from r0 + 8, stored below at ip + 8 and ip + 12.
add r2, r0, #8
ldmia r2, {r1-r2}
add r0, r0, #16
cmp r0, lr
str r1, [ip, #8]
str r2, [ip, #12]
add ip, ip, #16
bne .L4
AV_COPY128定义如下:
/* Shorthand for the GCC may_alias type attribute applied to the union below. */
#define av_alias __attribute__((may_alias))
/* 8-byte union giving u64 / u32 / u16 / u8 / double / float views of the same storage. */
typedef union {
uint64_t u64;
uint32_t u32[2];
uint16_t u16[4];
uint8_t u8 [8];
double f64;
float f32[2];
} av_alias av_alias64;
/* Assign the u<n> member of the av_alias<n> at s to the u<n> member of the
 * av_alias<n> at d; n is pasted into both the type name and the member name. */
#define AV_COPY(n, d, s) \
(((av_alias##n*)(d))->u##n = ((const av_alias##n*)(s))->u##n)
/* 64-bit copy, unless a definition was already provided. */
#ifndef AV_COPY64
# define AV_COPY64(d, s) AV_COPY(64, d, s)
#endif
/* 128-bit copy built from two 64-bit copies at byte offsets 0 and 8.
 * Both d and s appear twice in the expansion, so arguments with side effects
 * are evaluated twice. */
#ifndef AV_COPY128
# define AV_COPY128(d, s) \
do { \
AV_COPY64(d, s); \
AV_COPY64((char*)(d)+8, (char*)(s)+8); \
} while(0)
#endif
-------------------------------------------------------------------------
测试一下:
源代码:
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
void *memset_snov6(void *s, int c, size_t n);
void bzero_snov6(void *s, size_t n);
void *memcpy_snov6(void *d,void *s, size_t len);
/* Shorthand for the GCC may_alias type attribute used on the unions below. */
#define av_alias __attribute__((may_alias))

/* 8 bytes of storage viewable as integers of several widths or as floating point. */
typedef union {
    uint64_t u64;
    uint32_t u32[2];
    uint16_t u16[4];
    uint8_t  u8[8];
    double   f64;
    float    f32[2];
} av_alias av_alias64;

/* 4-byte counterpart of av_alias64. */
typedef union {
    uint32_t u32;
    uint16_t u16[2];
    uint8_t  u8[4];
    float    f32;
} av_alias av_alias32;
/* Read the u<s> member of the av_alias<s> union located at p (s is 32 or 64). */
#define AV_RNA(s, p)    (((const av_alias##s*)(p))->u##s)
/* Store v into the u<s> member of the av_alias<s> union located at p;
 * the expression evaluates to the stored value. */
#define AV_WNA(s, p, v) (((av_alias##s*)(p))->u##s = (v))
/* 32-bit convenience wrappers around AV_WNA / AV_RNA. */
#define AV_WN32A(p, v)  AV_WNA(32, p, v)
#define AV_RN32A(p)     AV_RNA(32, p)
/* Set the u<n> member of the av_alias<n> union located at d to zero. */
#define AV_ZERO(n, d) (((av_alias##n*)(d))->u##n = 0)

/* 64-bit clear, unless a definition was already provided. */
#ifndef AV_ZERO64
#define AV_ZERO64(d) AV_ZERO(64, d)
#endif

/* 128-bit clear made of two 64-bit clears at byte offsets 0 and 8.
 * d appears twice in the expansion. */
#ifndef AV_ZERO128
#define AV_ZERO128(d)                 \
    do {                              \
        AV_ZERO64(d);                 \
        AV_ZERO64((char*)(d) + 8);    \
    } while (0)
#endif
/* Assign the u<n> member of the av_alias<n> at s to the u<n> member of the
 * av_alias<n> at d. */
#define AV_COPY(n, d, s) \
    (((av_alias##n*)(d))->u##n = ((const av_alias##n*)(s))->u##n)

/* Fixed-width wrappers; each one can be pre-empted by an earlier definition.
 * NOTE(review): no av_alias16 type is visible in this listing, so AV_COPY16
 * would not expand to valid code as shown. */
#ifndef AV_COPY16
#define AV_COPY16(d, s) AV_COPY(16, d, s)
#endif

#ifndef AV_COPY32
#define AV_COPY32(d, s) AV_COPY(32, d, s)
#endif

#ifndef AV_COPY64
#define AV_COPY64(d, s) AV_COPY(64, d, s)
#endif

/* 128-bit copy made of two 64-bit copies at byte offsets 0 and 8.
 * d and s each appear twice in the expansion. */
#ifndef AV_COPY128
#define AV_COPY128(d, s)                                \
    do {                                                \
        AV_COPY64(d, s);                                \
        AV_COPY64((char*)(d) + 8, (char*)(s) + 8);      \
    } while (0)
#endif
/*
 * Stop the stopwatch and print the elapsed time in microseconds.
 *
 * Expects three names in the caller's scope: struct timeval tpstart (already
 * filled by gettimeofday), struct timeval tpend, and float timeuse.
 *
 * The seconds are subtracted before scaling: multiplying an absolute tv_sec
 * by 1000*1000 overflows a 32-bit long. The expansion is a single braced
 * block, so it works both with and without a trailing semicolon at the call
 * site.
 */
#define END_PRINT_TIME(id) \
    { \
        gettimeofday(&tpend, NULL); \
        timeuse = (tpend.tv_sec - tpstart.tv_sec) * 1000000.0 \
                + (tpend.tv_usec - tpstart.tv_usec); \
        printf("%s :time is %f\n", (id), timeuse); \
    }
/*
 * Destination (buffer) and source (srcbuffer) areas for the copy benchmark,
 * 1 MiB each.
 *
 * AV_COPY128 accesses these bytes through uint64_t members, so the arrays are
 * declared 8-byte aligned rather than 4. Member offsets and the total size do
 * not change, because each array is a multiple of 8 bytes long.
 */
typedef struct xxx {
    unsigned char __attribute__((aligned(8))) buffer[1024 * 1024];
    unsigned char __attribute__((aligned(8))) srcbuffer[1024 * 1024];
} xxxs;
/*
 * Time one AV_COPY128 pass and one memcpy_snov6 pass over the first
 * buffer_size bytes of st, 16 bytes per step, and print both timings.
 *
 * tpstart, tpend and timeuse are the names END_PRINT_TIME expects in scope.
 * To benchmark zeroing instead of copying, swap the loop bodies for
 * AV_ZERO128(&st->buffer[i]) and bzero_snov6(&st->buffer[i], 16).
 */
static void run_copy_benchmark(xxxs *st, int buffer_size)
{
    struct timeval tpstart, tpend;
    float timeuse;
    int i;

    printf("\n-------------%s-------------------------------buffer_size is %d---\n",__TIME__,buffer_size);

    gettimeofday(&tpstart, NULL);
    for (i = 0; i < buffer_size; i += 16) {
        AV_COPY128(&st->buffer[i], &st->srcbuffer[i]);
    }
    END_PRINT_TIME("using AV_COPY128");

    gettimeofday(&tpstart, NULL);
    for (i = 0; i < buffer_size; i += 16) {
        memcpy_snov6(&st->buffer[i], &st->srcbuffer[i], 16);
    }
    END_PRINT_TIME("using memcpy_snov6");
}

/*
 * Runs the copy benchmark for three sizes: 100, 200 and 1000 blocks of
 * 16 bytes. Returns 0 on success, EXIT_FAILURE if the buffers cannot be
 * allocated.
 */
int main(void)
{
    static const int sizes[] = { 100 * 16, 200 * 16, 1000 * 16 };
    size_t k;
    /* calloc gives the source buffer defined contents before it is read. */
    xxxs *st = calloc(1, sizeof *st);

    if (st == NULL) {
        perror("calloc");
        return EXIT_FAILURE;
    }

    for (k = 0; k < sizeof sizes / sizeof sizes[0]; k++) {
        run_copy_benchmark(st, sizes[k]);
    }

    free(st);
    return 0;
}
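结果(时间单位为微秒。注意:下面输出中的标签是 AV_ZERO64 / AV_ZERO128 和 bzero_snov6,与上面代码打印的 "using AV_COPY128" / "using memcpy_snov6" 不一致,看起来是把拷贝换成清零版本后运行得到的结果):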
/*
-------------11:26:03-------------------------------buffer_size is 1600---
using AV_ZERO64 :time is 38.000000
using bzero_snov6 :time is 60.000000
-------------11:26:03-------------------------------buffer_size is 3200---
using AV_ZERO64 :time is 12.000000
using bzero_snov6 :time is 114.000000
-------------11:26:03-------------------------------buffer_size is 16000---
using AV_ZERO64 :time is 123.000000
using bzero_snov6 :time is 557.000000
=====================================================================================
-------------11:28:00-------------------------------buffer_size is 1600---
using AV_ZERO128 :time is 39.000000
using bzero_snov6 :time is 33.000000
-------------11:28:00-------------------------------buffer_size is 3200---
using AV_ZERO128 :time is 11.000000
using bzero_snov6 :time is 60.000000
-------------11:28:00-------------------------------buffer_size is 16000---
using AV_ZERO128 :time is 122.000000
using bzero_snov6 :time is 280.000000
*/
编译器对全局变量的处理____AV_COPY128比memcpy快
最新推荐文章于 2021-11-21 16:28:12 发布