基本上我们编写的第一个C程序都是打印hello world, 但很少有人去分析打印一个字符串是怎么实现的,认为这是理所当然的,起码我当时是这么认为,没有任何疑问,而且还很兴奋,当时大学的C语言都学完了,我都不知道printf的原理,或者说根本就没有去研究过,实际上一个简单的printf背后做了大量的工作。
从完全开发手册的stdio实例出发,看下printf的实现。
1、输入输出的终端设备采用串口,也可以使用LCD或者其他。
2、变参函数的实现。printf和scanf的参数个数是不固定的,但通过第一个参数的地址,可以找到其他参数的地址,通过fmt的格式可以确定参数的个数,一般都要用到这样一个宏:
/*
 * Minimal stdarg replacement for an environment without a C library.
 *
 * The whole group is guarded by va_arg so that it is skipped when a real
 * <stdarg.h> (or another copy of these macros) is already in effect; the
 * guard is closed by the final #endif below.
 *
 * NOTE(review): these macros walk the argument list with plain pointer
 * arithmetic, so they assume every variadic argument is stored in memory
 * directly after the last named one, at ascending addresses, each slot
 * rounded up to sizeof(NATIVE_INT) - confirm this for the target ABI and
 * compiler before relying on it.
 */
#ifndef va_arg

#ifndef _VALIST
#define _VALIST
/* Cursor into the argument area; a byte pointer so it can be advanced by sizes. */
typedef char *va_list;
#endif /* _VALIST */

/*
 * Storage alignment properties
 */
#define NATIVE_INT int
/* Round-up and round-down masks: one less than the native slot size. */
#define _AUPBND (sizeof (NATIVE_INT) - 1)
#define _ADNBND (sizeof (NATIVE_INT) - 1)

/*
 * Variable argument list macro definitions
 */

/* Size of X rounded up to the next multiple of (bnd + 1); bnd must be 2^k - 1. */
#define _bnd(X, bnd) (((sizeof (X)) + (bnd)) & (~(bnd)))

/*
 * Advance ap past one argument of type T and yield that argument: ap is
 * first moved forward by the rounded size of T, then the same amount is
 * subtracted to address the slot that was just skipped.
 */
#define va_arg(ap, T) (*(T *)(((ap) += (_bnd (T, _AUPBND))) - (_bnd (T,_ADNBND))))

/* Nothing to release: the cursor is an ordinary pointer. */
#define va_end(ap) (void) 0

/* Point ap just behind the last named parameter A (size rounded up). */
#define va_start(ap, A) (void) ((ap) = (((char *) &(A)) + (_bnd (A,_AUPBND))))

#endif /* va_arg */
具体的使用可以百度。
3、64位整数的处理。
64位整数的除法或者乘法是要自己实现的,毕竟2440是32位的CPU,如果使用现成的库就不用这么费劲了,现在是在没有任何库的情况下。先说下除法:
/*
 * do_div(n, base): divide the 64-bit lvalue n by the 32-bit value base.
 *
 * After the macro n holds the quotient; the value of the whole
 * expression is the 32-bit remainder.
 *
 * The work is done by the assembly routine __do_div64, which uses a
 * private register convention, so every operand is pinned to a fixed
 * register with a local register variable:
 *   r4          divisor
 *   r0 (pair)   64-bit dividend
 *   r2 (pair)   64-bit quotient
 *   __xh        remainder
 * The __asmeq() lines check at assembly time that the compiler really
 * placed each operand in the register named next to it.
 * __xh and __asmeq are expected to be defined before this macro is used.
 */
#define do_div(n,base) \
({ \
	register unsigned int __divisor asm("r4") = base; \
	register unsigned long long __dividend asm("r0") = n; \
	register unsigned long long __quotient asm("r2"); \
	register unsigned int __remainder asm(__xh); \
	asm(	__asmeq("%0", __xh) \
		__asmeq("%1", "r2") \
		__asmeq("%2", "r0") \
		__asmeq("%3", "r4") \
		"bl __do_div64" \
		: "=r" (__remainder), "=r" (__quotient) \
		: "r" (__dividend), "r" (__divisor) \
		: "ip", "lr", "cc"); \
	n = __quotient; \
	__remainder; \
})
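这是一个长整数除法的宏,其中n是64位的被除数(宏执行完后n被改写为商),base是32位的除数,整个宏表达式的值是余数。register关键字配合asm("寄存器名")表示把某个局部变量绑定到指定的寄存器,如register unsigned long long __n asm("r0")表示把__n放在从r0开始的寄存器里。这里有个地方我一开始不大理解:为什么r0=n之后,n的高32位就赋给r1了?原因是64位的变量在32位的ARM上要占用两个连续的寄存器,绑定到r0实际上占用的是r0和r1这一对,小端模式下r0存放低32位,r1存放高32位;同理__res绑定到r2,占用的是r2和r3。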
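内联汇编的格式是asm(code : output operand list : input operand list : clobber list);其中code要用""引起来,换行要用\n。%0表示操作数列表中的第一个操作数,%1表示第二个,其他依次类推(先数输出列表,再接着数输入列表)。=r表示该操作数是只写的输出,放在通用寄存器中;=&r同样是输出,但多了"提前改写(early clobber)"的限制,即它会在输入操作数全部用完之前就被写入,所以编译器不能让它和任何输入操作数共用同一个寄存器。clobber list列出的是汇编代码会改动、但没有出现在输入输出列表中的寄存器,以及"cc"(条件标志位)、"memory"(内存)这样的特殊项。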
__do_div64采用移位的方法实现除法,最多也就移动64次,从高位开始算。我曾经在DSP上也做过一个除法,现在想想当时的算法实在太笨了:我用减法实现除法,如果是一个64位的数除以一个比较小的数,那延时就大了。这个算法实在很巧,具体代码如下:
/*
 * Assembler helper macros for the division routine.
 *
 * ALIGN: alignment directive placed in front of every ENTRY() label.
 * No explicit fill value is given, so the assembler chooses the padding
 * itself; the former fill byte 0x90 is the x86 NOP opcode and is not a
 * meaningful instruction pattern on ARM.
 */
#define ALIGN .align 4
/*
 * Architecture level tested by the "#if __LINUX_ARM_ARCH__ >= 5" blocks
 * of the division routine; any value below 5 selects the variants that
 * do not use the CLZ instruction.
 */
#define __LINUX_ARM_ARCH__ 1
/* ENTRY(name): export name as a global symbol and start it on an aligned address. */
#define ENTRY(name) \
.globl name; \
ALIGN; \
name:
/*
 * Register aliases for the two 64-bit quantities of the division routine:
 * xh/xl are the high/low words of the register pair r0-r1, yh/yl the
 * high/low words of the pair r2-r3. Which register of a pair holds the
 * high word depends on the byte order (__ARMEB__ = big-endian build).
 */
#ifdef __ARMEB__
#define xh r0
#define xl r1
#define yh r2
#define yl r3
#else
#define xl r0
#define xh r1
#define yl r2
#define yh r3
#endif
/*
* __do_div64: perform a division with 64-bit dividend and 32-bit divisor.
*
* Note: Calling convention is totally non standard for optimal code.
* This is meant to be used by do_div() from include/asm/div64.h only.
*
* Input parameters:
* xh-xl = dividend (clobbered)
* r4 = divisor (preserved)
*
* Output values:
* yh-yl = result
* xh = remainder
*
* Clobbered regs: xl, ip
*
* xh, xl, yh and yl are preprocessor aliases for r0-r3; which register of
* each pair is the high word depends on __ARMEB__.
*
* Outline:
*   - divisor 0 or 1, and power-of-two divisors, take dedicated shortcuts;
*   - otherwise the upper quotient word (yh) is produced first by
*     shift-and-subtract against the high dividend word, then the lower
*     quotient word (yl) by shifting the 64-bit remainder left one bit
*     at a time.
*/
ENTRY(__do_div64)
@ Test for easy paths first.
subs ip, r4, #1 @ ip = divisor - 1, flags set for the branch below
bls 9f @ divisor is 0 or 1
tst ip, r4 @ divisor & (divisor - 1) is zero only for a single set bit
beq 8f @ divisor is power of 2
@ See if we need to handle upper 32-bit result.
cmp xh, r4
mov yh, #0 @ upper quotient word starts at 0 (mov leaves the flags alone)
blo 3f @ high dividend word below divisor: upper quotient word stays 0
@ Align divisor with upper part of dividend.
@ The aligned divisor is stored in yl preserving the original.
@ The bit position is stored in ip.
#if __LINUX_ARM_ARCH__ >= 5
clz yl, r4
clz ip, xh
sub yl, yl, ip @ shift count = difference of the leading-zero counts
mov ip, #1
mov ip, ip, lsl yl @ ip = quotient bit matching the aligned divisor
mov yl, r4, lsl yl @ yl = divisor shifted up to the top bit of xh
#else
mov yl, r4
mov ip, #1
@ Shift divisor copy and bit mask left together until the copy has its
@ top bit set or is no longer below xh.
1: cmp yl, #0x80000000
cmpcc yl, xh
movcc yl, yl, lsl #1
movcc ip, ip, lsl #1
bcc 1b
#endif
@ The division loop for needed upper bit positions.
@ Break out early if dividend reaches 0.
2: cmp xh, yl
orrcs yh, yh, ip @ xh >= shifted divisor: set this quotient bit
subcss xh, xh, yl @ ... and subtract; Z set if xh reached 0
movnes ip, ip, lsr #1 @ next lower bit; Z set once the mask is shifted out
mov yl, yl, lsr #1
bne 2b
@ See if we need to handle lower 32-bit result.
3: cmp xh, #0
mov yl, #0 @ lower quotient word starts at 0
cmpeq xl, r4
movlo xh, xl @ xh is 0 and xl < divisor: xl is the remainder
movlo pc, lr @ ... and we are done
@ The division loop for lower bit positions.
@ Here we shift remainder bits leftwards rather than moving the
@ divisor for comparisons, considering the carry-out bit as well.
mov ip, #0x80000000 @ start with the top bit of the lower quotient word
4: movs xl, xl, lsl #1 @ shift the 64-bit value xh:xl left by one ...
adcs xh, xh, xh @ ... the bit leaving xh ends up in the carry flag
beq 6f @ xh became zero: take the special case at 6
cmpcc xh, r4 @ no carry out: compare the remainder with the divisor
5: orrcs yl, yl, ip @ carry set (carry out, or xh >= divisor): set quotient bit
subcs xh, xh, r4 @ ... and subtract the divisor
movs ip, ip, lsr #1 @ next lower quotient bit
bne 4b @ loop until the bit mask has been shifted out
mov pc, lr
@ The top part of remainder became zero. If carry is set
@ (the 33rd bit) this is a false positive so resume the loop.
@ Otherwise, if lower part is also null then we are done.
6: bcs 5b
cmp xl, #0
moveq pc, lr
@ We still have remainder bits in the low part. Bring them up.
#if __LINUX_ARM_ARCH__ >= 5
clz xh, xl @ we know xh is zero here so...
add xh, xh, #1
mov xl, xl, lsl xh
mov ip, ip, lsr xh
#else
@ Shift xl left until its first set bit drops into the carry flag,
@ moving the quotient bit mask down by the same number of positions.
7: movs xl, xl, lsl #1
mov ip, ip, lsr #1
bcc 7b
#endif
@ Current remainder is now 1. It is worthless to compare with
@ divisor at this point since divisor can not be smaller than 3 here.
@ If possible, branch for another shift in the division loop.
@ If no bit position left then we are done.
movs ip, ip, lsr #1
mov xh, #1
bne 4b
mov pc, lr
8: @ Division by a power of 2: determine what that divisor order is
@ then simply shift values around
#if __LINUX_ARM_ARCH__ >= 5
clz ip, r4
rsb ip, ip, #31 @ ip = 31 - leading zeros = bit index of the single set bit
#else
@ Find the bit index of the divisor in ip: test the 16-, 8- and 4-bit
@ halves in turn, then resolve the last few bits directly.
mov yl, r4
cmp r4, #(1 << 16)
mov ip, #0
movhs yl, yl, lsr #16
movhs ip, #16
cmp yl, #(1 << 8)
movhs yl, yl, lsr #8
addhs ip, ip, #8
cmp yl, #(1 << 4)
movhs yl, yl, lsr #4
addhs ip, ip, #4
cmp yl, #(1 << 2)
addhi ip, ip, #3 @ yl above 4 (must be 8 here): three more bit positions
addls ip, ip, yl, lsr #1 @ yl = 1, 2 or 4 adds 0, 1 or 2
#endif
mov yh, xh, lsr ip @ quotient high word = xh >> order
mov yl, xl, lsr ip @ quotient low word = xl >> order ...
rsb ip, ip, #32 @ ip = 32 - order
orr yl, yl, xh, lsl ip @ ... plus the bits shifted down out of xh
mov xh, xl, lsl ip @ remainder = the low order bits of xl:
mov xh, xh, lsr ip @ shift them to the top and back down again
mov pc, lr
@ eq -> division by 1: obvious enough...
9: moveq yl, xl @ quotient = dividend
moveq yh, xh
moveq xh, #0 @ remainder = 0
moveq pc, lr
@ Division by 0:
str lr, [sp, #-4]! @ save the return address on the stack
/* bl __div0 */
@ (the call to a division-by-zero handler above is commented out)
@ as wrong as it could be...
mov yl, #0 @ report quotient 0 ...
mov yh, #0
mov xh, #0 @ ... and remainder 0
ldr pc, [sp], #4 @ return by popping the saved address into pc
注释已经比较详细了,其中xl(r0)为被除数的低位,xh(r1)为被除数的高位,yl(r2)商的低位,yh(r3)商的高位,xh还作为余数。
然后是64位的乘法。在不包含任何库的情况下,用arm-linux-gcc编译一个长整数的乘法,会提示找不到__muldi3的定义。我一开始就奇怪,我根本就没用到这个函数,实际上是我太想当然了:32位的CPU不能用一条指令直接完成64位的乘法,编译器会把它转换成对库函数__muldi3的调用,没有库的时候这个函数就要自己提供。看乘法的部分:
#define umul_ppmm(xh, xl, a, b) \
{register USItype __t0, __t1, __t2; \
__asm__ ("%@ Inlined umul_ppmm \n\
mov %2, %5, lsr #16 \n\
mov %0, %6, lsr #16 \n\
bic %3, %5, %2, lsl #16 \n\
bic %4, %6, %0, lsl #16 \n\
mul %1, %3, %4 \n\
mul %4, %2, %4 \n\
mul %3, %0, %3 \n\
mul %0, %2, %0 \n\
adds %3, %4, %3 \n\
addcs %0, %0, #65536 \n\
adds %1, %1, %3, lsl #16 \n\
adc %0, %0, %3, lsr #16" \
: "=&r" ((USItype) (xh)), \
"=r" ((USItype) (xl)), \
"=&r" (__t0), "=&r" (__t1), "=r" (__t2) \
: "r" ((USItype) (a)), \
"r" ((USItype) (b)));}
其中a,b是32位数,xh是a*b的高32位,xl是a*b的低32位,两个32位数相乘绝对不会超过64位,所以该算法把32位先分成了高16位和低16位,2个16位数相乘绝对不会超过32位,CPU是有这个能力计算的,把16位看成一个整体就是一个2位数的乘法了,跟我们小学学的数学一样。
看似很多理所当然的东西背后仔细研究一下都有大文章。
PS:不明白的地方可以留言。