usleep的--系统调用流程 Android4.0.1

最新推荐文章于 2023-02-03 18:45:15 发布

dbqy

最新推荐文章于 2023-02-03 18:45:15 发布

阅读量6.6k

点赞数 1

文章标签： android

1.由于在不同的硬件平台上经常遇到usleep不准确的问题，比如usleep(2*1000)，结果sleep了10ms，是不是有点过分，测试代码如下：

[cpp]view plaincopy
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/time.h>
 #include <unistd.h>
   
 /*
  * Converts a timeval to whole milliseconds.
  *
  * tv_sec is widened to long long before it is scaled: multiplying first and
  * casting afterwards overflows wherever time_t is only 32 bits wide.
  */
 static long long timeval_to_ms(const struct timeval *tv)
 {
     return (long long)tv->tv_sec * 1000LL + (long long)tv->tv_usec / 1000LL;
 }

 /*
  * Measures how long usleep() really sleeps.
  *
  * For each requested duration from 5 ms up to 64 ms (60 values) the program
  * sleeps ten times and prints the requested duration next to the measured
  * wall-clock time, both in milliseconds. The measurement uses
  * gettimeofday(), so the system time must not be changed during the run.
  *
  * argc and argv are unused. Returns 0.
  */
 int main(int argc,char **argv)
 {
     enum { FIRST_SLEEP_MS = 5, SLEEP_STEPS = 60, SAMPLES_PER_STEP = 10 };
     struct timeval oldTime, newTime;
     int iStime = FIRST_SLEEP_MS;

     (void)argc;
     (void)argv;

     for (int i = 0; i < SLEEP_STEPS; i++) {
         for (int j = 0; j < SAMPLES_PER_STEP; j++) {
             gettimeofday(&oldTime, NULL);
             usleep(iStime * 1000);
             gettimeofday(&newTime, NULL);
             printf("iStime:%d,actual time:%lld\n", iStime,
                    timeval_to_ms(&newTime) - timeval_to_ms(&oldTime));
         }
         iStime++;
     }
     return 0;
 }

当然为防止出现意外，禁止测试期间设置系统时间。

2. 根据以前的经验，此usleep不准主要是由于Kernel中系统timer的rating值过高引起的。

3. 下面从源码的角度分析一下usleep的实现细节，并进一步分析其原因。以下以Android4.0.1为例进行分析。注此问题主要与Kernel有关，与glibc或bionic无关。

4. 首先找到usleep的源码：

[cpp]view plaincopy
 //位于/bionic/libc/unistd/usleep.c  
 #include <time.h>  
 #include <errno.h>  
   
 /*
  * Suspends the caller for usec microseconds by delegating to nanosleep().
  *
  * usec is split into whole seconds and the leftover part expressed in
  * nanoseconds. The same timespec is passed as both the request and the
  * remainder argument of nanosleep().
  *
  * Returns 0 once nanosleep() succeeds, or -1 when it fails with an errno
  * other than EINTR.
  */
 int usleep(unsigned long usec)
 {
   struct timespec remaining;

   remaining.tv_sec = usec / 1000000UL;
 #ifdef __arm__
   /* Subtract rather than use '%' to avoid divisions and modulos on the ARM. */
   remaining.tv_nsec = (usec - remaining.tv_sec * 1000000UL) * 1000;
 #else
   remaining.tv_nsec = (usec % 1000000UL) * 1000UL;
 #endif

   while (nanosleep(&remaining, &remaining) != 0) {
     /*
      * Only an interrupted sleep (EINTR) is retried. Any other failure,
      * such as EINVAL, is passed through to the caller.
      */
     if (errno != EINTR)
       return -1;
   }

   return 0;
 }

它也很懒的，就调用了nanosleep，那就看看nanosleep的源码吧! 不幸的是只找到一个extern int nanosleep(const struct timespec *, struct timespec *); 它位于/bionic/libc/include/sys/linux-unistd.h，并没有找到它的实现。其实看看Linux系统调用，早就知道它是一个系统调用，那就分析一下是如何进行系统调用的，以前只是讲过原理，并没有实例，在此把它完成了。

5. 寻找系统调用函数

如果这个函数没有实现，那肯定是不能调用的，就像MIT教授在公开课上所讲的，搞计算机的不像搞别的，做不了假，别人不管你怎么设计的，只看你实现的结果，很有道理。也证明了搞if else的人不能作弊。那就从它的Android.mk入手吧，看看还Link了什么东东。打开libc的Android.mk发现，其中有一行

include $(LOCAL_PATH)/arch-$(TARGET_ARCH)/syscalls.mk

这就是关键所在，syscalls系统调用，不正是我们要找的吗？进入arch-arm/syscalls.mk一看，其中一大片.s,Search一下，看有没有nanosleep.s，还真有这么一行，真是大快人心：syscall_src += arch-arm/syscalls/nanosleep.S

赶紧去瞧瞧，ARM汇编水平不高，能看懂吗？先把代码贴上再说，不懂就问google.

[cpp]view plaincopy
 /* autogenerated by gensyscalls.py */  
 #include <sys/linux-syscalls.h>  
   
     /*
      * nanosleep system-call stub.
      *
      * Loads the system call number __NR_nanosleep into r7 and issues
      * swi #0. When the value left in r0 is non-negative it is returned to
      * the caller as is; otherwise control passes to __set_syscall_errno.
      */
     .text
     .type nanosleep, #function
     .globl nanosleep
     .align 4
     .fnstart

 nanosleep:
     .save   {r4, r7}
     stmfd   sp!, {r4, r7}           /* push r4 and r7 before r7 is overwritten */
     ldr     r7, =__NR_nanosleep     /* system call number goes in r7 */
     swi     #0                      /* software interrupt */
     ldmfd   sp!, {r4, r7}           /* pop the saved r4 and r7 */
     movs    r0, r0                  /* set the condition flags from r0 */
     bxpl    lr                      /* r0 >= 0: return to the caller */
     /* r0 < 0: NOTE(review) presumably records the error in errno; the
      * routine is defined elsewhere, confirm there. */
     b       __set_syscall_errno
     .fnend

__NR_nanosleep是个什么东东，凭直觉，肯定在sys/linux-syscalls.h中有定义。打开/libc/include/sys/linux-syscalls.h并search __NR_nanosleep, 明白了，它定义了__NR_nanosleep的值为(__NR_SYSCALL_BASE + 162),其实就是定义了其系统调用号。这就与前一文swi连接起来了。上面的代码把系统调用号传递给r7,然后触发了一个软中断，从而进入内核态执行。

6. 软中断处理流程

根据常识，既然是软中断，就一定有一个对应的ISR，打开/kernel/arch/arm/kernel/entry-common.S，发现其中有一个ENTRY(vector_swi),这就是我们要找的ISR,其详细代码如下：

[cpp]view plaincopy
     /*
      * vector_swi: software-interrupt (swi) entry.
      *
      * Saves the calling registers into a frame of S_FRAME_SIZE bytes on the
      * stack, determines the system call number (scno) for the configured
      * ABI, and jumps to the matching entry of sys_call_table (or of
      * sys_oabi_call_table for old-ABI calls under CONFIG_OABI_COMPAT).
      * Tasks with _TIF_SYSCALL_TRACE set go to __sys_trace instead, and
      * numbers outside the table end up in arm_syscall or sys_ni_syscall.
      */
     .align  5
 ENTRY(vector_swi)
     sub sp, sp, #S_FRAME_SIZE
     stmia   sp, {r0 - r12}          @ Calling r0 - r12
  ARM(   add r8, sp, #S_PC       )
  ARM(   stmdb   r8, {sp, lr}^       )   @ Calling sp, lr
  THUMB( mov r8, sp          )
  THUMB( store_user_sp_lr r8, r10, S_SP  )   @ calling sp, lr
     mrs r8, spsr            @ called from non-FIQ mode, so ok.
     str lr, [sp, #S_PC]         @ Save calling PC
     str r8, [sp, #S_PSR]        @ Save CPSR
     str r0, [sp, #S_OLD_R0]     @ Save OLD_R0
     zero_fp

     /*
      * Get the system call number.
      */

 #if defined(CONFIG_OABI_COMPAT)

     /*
      * If we have CONFIG_OABI_COMPAT then we need to look at the swi
      * value to determine if it is an EABI or an old ABI call.
      */
 #ifdef CONFIG_ARM_THUMB
     tst r8, #PSR_T_BIT
     movne   r10, #0             @ no thumb OABI emulation
     ldreq   r10, [lr, #-4]          @ get SWI instruction
 #else
     ldr r10, [lr, #-4]          @ get SWI instruction
   A710( and ip, r10, #0x0f000000        @ check for SWI     )
   A710( teq ip, #0x0f000000                     )
   A710( bne .Larm710bug                     )
 #endif
 #ifdef CONFIG_CPU_ENDIAN_BE8
     rev r10, r10            @ little endian instruction
 #endif

 #elif defined(CONFIG_AEABI)

     /*
      * Pure EABI user space always put syscall number into scno (r7).
      */
   A710( ldr ip, [lr, #-4]           @ get SWI instruction   )
   A710( and ip, ip, #0x0f000000     @ check for SWI     )
   A710( teq ip, #0x0f000000                     )
   A710( bne .Larm710bug                     )

 #elif defined(CONFIG_ARM_THUMB)

     /* Legacy ABI only, possibly thumb mode. */
     tst r8, #PSR_T_BIT          @ this is SPSR from save_user_regs
     addne   scno, r7, #__NR_SYSCALL_BASE    @ put OS number in
     ldreq   scno, [lr, #-4]

 #else

     /* Legacy ABI only. */
     ldr scno, [lr, #-4]         @ get SWI instruction
   A710( and ip, scno, #0x0f000000       @ check for SWI     )
   A710( teq ip, #0x0f000000                     )
   A710( bne .Larm710bug                     )

 #endif

 #ifdef CONFIG_ALIGNMENT_TRAP
     ldr ip, __cr_alignment
     ldr ip, [ip]
     mcr p15, 0, ip, c1, c0      @ update control register
 #endif
     enable_irq

     get_thread_info tsk
     adr tbl, sys_call_table     @ load syscall table pointer
     ldr ip, [tsk, #TI_FLAGS]        @ check for syscall tracing

 #if defined(CONFIG_OABI_COMPAT)
     /*
      * If the swi argument is zero, this is an EABI call and we do nothing.
      *
      * If this is an old ABI call, get the syscall number into scno and
      * get the old ABI syscall table address.
      */
     bics    r10, r10, #0xff000000
     eorne   scno, r10, #__NR_OABI_SYSCALL_BASE
     ldrne   tbl, =sys_oabi_call_table
 #elif !defined(CONFIG_AEABI)
     bic scno, scno, #0xff000000     @ mask off SWI op-code
     eor scno, scno, #__NR_SYSCALL_BASE  @ check OS number
 #endif

     stmdb   sp!, {r4, r5}           @ push fifth and sixth args
     tst ip, #_TIF_SYSCALL_TRACE     @ are we tracing syscalls?
     bne __sys_trace

     /*
      * Numbers below NR_syscalls (unsigned compare) load pc from the table
      * slot at tbl + scno * 4. lr is pointed at ret_fast_syscall first, so
      * that is where the sys_* routine returns to.
      */
     cmp scno, #NR_syscalls      @ check upper syscall limit
     adr lr, BSYM(ret_fast_syscall)  @ return address
     ldrcc   pc, [tbl, scno, lsl #2]     @ call sys_* routine

     /*
      * Reached only when scno is outside the table: numbers at or above
      * (__ARM_NR_BASE - __NR_SYSCALL_BASE) go to arm_syscall, everything
      * else to sys_ni_syscall.
      */
     add r1, sp, #S_OFF
 2:  mov why, #0             @ no longer a real syscall
     cmp scno, #(__ARM_NR_BASE - __NR_SYSCALL_BASE)
     eor r0, scno, #__NR_SYSCALL_BASE    @ put OS number back
     bcs arm_syscall
     b   sys_ni_syscall          @ not private func
 ENDPROC(vector_swi)

7. 找与nanosleep对应的处理函数

从上面的代码中可以看出，它将调用sys_call_table中的某个函数。在同一个文件中寻找sys_call_table,其代码如下：

[cpp]view plaincopy
     /*
      * sys_call_table: declared as an object; its contents come from the
      * included calls.S.
      */
     .type   sys_call_table, #object
 ENTRY(sys_call_table)
 #include "calls.S"

看看linux/arch/arm/kernel/calls.S中的内容：

[cpp]view plaincopy
 /*
  * Excerpt of the system call list: one CALL(...) per entry, in index order,
  * with a marker comment giving the index of the entry it precedes.
  */
 /* 0 */     CALL(sys_restart_syscall)
         CALL(sys_exit)
         CALL(sys_fork_wrapper)
         CALL(sys_read)
         CALL(sys_write)
                 ...
 /* 160 */   CALL(sys_sched_get_priority_min)
         CALL(sys_sched_rr_get_interval)
         /* index 162, two entries after the 160 marker */
         CALL(sys_nanosleep)
         CALL(sys_mremap)
         CALL(sys_setresuid16)

原来nanosleep系统调用在Kernel中的函数为sys_nanosleep,现在去分析一下是如何实现高精度的sleep的，是忙等（执行nop指令），还是闲等（让出CPU使用权）呢? 马上就会有答案了。由于小弟知识有限，没那么简单，我找了2个小时也没有找到答案，惭愧啊!

8. 先看看熟悉的系统调用open吧!

也不幸运，没有sys_open这样的函数。反正知道这个东东在fs/open.c中，基本原理应该是一样的。在此文件中找到了下面这个函数：

SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)

linux/syscalls.h定义如下：

asmlinkage long sys_open(const char __user *filename,int flags, int mode); (asmlinkage就是一个extern "C")

这兄弟俩长得太像了，再看看SYSCALL_DEFINE3的定义，看看能不能找到二者的关系。

哈哈哈哈哈哈.....,终于在linux/syscalls.h中找到答案了，SYSCALL_DEFINE3的定义如下：

[cpp]view plaincopy
 /*
  * Expansion chain behind SYSCALL_DEFINE3(name, ...):
  *
  *   SYSCALL_DEFINE3(open, ...)
  *     -> SYSCALL_DEFINEx(3, _open, ...)      underscore pasted onto the name
  *     -> __SYSCALL_DEFINEx(3, _open, ...)
  *     -> asmlinkage long sys_open(__SC_DECL3(...))
  *
  * __SC_DECL3 is defined elsewhere. NOTE(review): presumably it pairs each
  * type argument with the parameter name that follows it; confirm in
  * linux/syscalls.h.
  */
 #define __SYSCALL_DEFINEx(x, name, ...) asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))
 #define SYSCALL_DEFINEx(x, sname, ...)  __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
 #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)

把SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)还原就变成了：

asmlinkage long sys_open(const char __user *filename,int flags, int mode);是不是与要找的函数一模一样呢？终于找到如何看这个代码的方法了！

9. 继续找sys_nanosleep的实现代码

先看看linux/kernel/hrtimer.c中的comments:

*  High-resolution kernel timers
*
* In contrast to the low-resolution timeout API implemented in
* kernel/timer.c, hrtimers provide finer resolution and accuracy
* depending on system configuration and capabilities.
*
* These timers are currently used for:
*   - itimers
*   - POSIX timers
*   - nanosleep
*   - precise in-kernel timing

看到上面的nanosleep了吗?说明有机会找到了。

SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, struct timespec __user *, rmtp)这不就是我要找的吗? 由于这是一个宏，在SourceInsight中查找函数nanosleep是找不到的，search字符串nanosleep是可行的。其代码如下：

[cpp]view plaincopy
 /*
  * nanosleep system call.
  *
  * Copies the requested interval from user space (rqtp), rejects it when
  * timespec_valid() fails, and otherwise sleeps through hrtimer_nanosleep()
  * in HRTIMER_MODE_REL on CLOCK_MONOTONIC. rmtp is handed on unchanged.
  *
  * Returns -EFAULT when the request cannot be copied, -EINVAL when it is not
  * a valid timespec, and otherwise whatever hrtimer_nanosleep() returns.
  */
 SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
         struct timespec __user *, rmtp)
 {
     struct timespec request;
     long result;

     if (copy_from_user(&request, rqtp, sizeof(request)) != 0) {
         result = -EFAULT;
     } else if (!timespec_valid(&request)) {
         result = -EINVAL;
     } else {
         result = hrtimer_nanosleep(&request, rmtp, HRTIMER_MODE_REL,
                                    CLOCK_MONOTONIC);
     }

     return result;
 }

hrtimer_nanosleep实现如下：

[cpp]view plaincopy
 /*
  * Sleeps until the hrtimer described by rqtp, mode and clockid is done.
  *
  * rqtp:    requested expiry, already copied into kernel space.
  * rmtp:    user-space pointer passed to update_rmtp(), or NULL.
  * mode:    HRTIMER_MODE_REL or HRTIMER_MODE_ABS.
  * clockid: clock the timer is initialised on.
  *
  * Returns 0 when do_nanosleep() reports a non-zero result. Otherwise:
  *   - -ERESTARTNOHAND for absolute timers;
  *   - the result of update_rmtp() when rmtp is set and that result is <= 0;
  *   - -ERESTART_RESTARTBLOCK after the restart block has been filled in
  *     with hrtimer_nanosleep_restart as its handler.
  * The on-stack timer is destroyed on every path.
  */
 long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
                const enum hrtimer_mode mode, const clockid_t clockid)
 {
     struct restart_block *restart;
     struct hrtimer_sleeper t;
     int ret = 0;
     unsigned long slack;

     /* Real-time tasks get no slack; others use their own timer_slack_ns. */
     slack = current->timer_slack_ns;
     if (rt_task(current))
         slack = 0;

     hrtimer_init_on_stack(&t.timer, clockid, mode);
     hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
     if (do_nanosleep(&t, mode))
         goto out;

     /* Absolute timers do not update the rmtp value and restart: */
     if (mode == HRTIMER_MODE_ABS) {
         ret = -ERESTARTNOHAND;
         goto out;
     }

     if (rmtp) {
         ret = update_rmtp(&t.timer, rmtp);
         if (ret <= 0)
             goto out;
     }

     /*
      * Record what is needed to restart the sleep: the handler, the index
      * of the timer base, the user remainder pointer and the expiry time.
      */
     restart = &current_thread_info()->restart_block;
     restart->fn = hrtimer_nanosleep_restart;
     restart->nanosleep.index = t.timer.base->index;
     restart->nanosleep.rmtp = rmtp;
     restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);

     ret = -ERESTART_RESTARTBLOCK;
 out:
     destroy_hrtimer_on_stack(&t.timer);
     return ret;
 }

[cpp]view plaincopy
 /*
  * Blocks the current task on the sleeper's timer.
  *
  * t:    sleeper whose timer already has its expiry set.
  * mode: mode used the first time the timer is started; later iterations
  *       always use HRTIMER_MODE_ABS.
  *
  * Returns non-zero when t->task is NULL on exit, zero when the loop ended
  * with t->task still set (which requires a pending signal).
  */
 static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
 {
     hrtimer_init_sleeper(t, current);

     do {
         /*
          * The task state is changed before the timer is started.
          * NOTE(review): presumably so that a wakeup arriving before
          * schedule() is not lost -- confirm.
          */
         set_current_state(TASK_INTERRUPTIBLE);
         hrtimer_start_expires(&t->timer, mode);
         /* Timer not active after starting it: clear t->task so that
          * schedule() is skipped. */
         if (!hrtimer_active(&t->timer))
             t->task = NULL;

         if (likely(t->task))
             schedule();

         hrtimer_cancel(&t->timer);
         /* Any further pass restarts the timer in absolute mode. */
         mode = HRTIMER_MODE_ABS;

     } while (t->task && !signal_pending(current));

     __set_current_state(TASK_RUNNING);

     return t->task == NULL;
 }

调用流程如下：

nanosleep()--> sys_nanosleep()--> hrtimer_nanosleep()--> do_nanosleep()-->hrtimer_start_expires()--> enqueue_hrtimer() -->hrtimer_enqueue_reprogram()-->hrtimer_reprogram()-->int tick_program_event(ktime_t expires, int force)->
(struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 获得clock_event_device)