kernel 系统调用----system call

Init

在trap_init中对SYSCALL_VECTOR(编号0x80)的向量进行初始化。

 808     set_system_trap_gate(SYSCALL_VECTOR, &system_call);

将system_call注册为陷阱门(trap gate)并写入IDT。该向量被触发后,CPU会跳转到system_call的地址去执行后续的处理流程。从中断发生到跳转执行中断向量的过程,在《kernel 中断分析三——中断处理流程》中有详细解释,本篇只关注system_call的运行过程。

ENTRY(system_call)

 499 /*
 500  * syscall stub including irq exit should be protected against kprobes
 501  */
 502     .pushsection .kprobes.text, "ax"
 503     # system call handler stub
 504 ENTRY(system_call)
 505     RING0_INT_FRAME         # can't unwind into user space anyway
 506     ASM_CLAC
 507     pushl_cfi %eax          # save orig_eax  --------------1
 508     SAVE_ALL                                    -----------2
 509     GET_THREAD_INFO(%ebp)                       -----------3
 510                     # system call tracing in operation / emulation
 511     testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) ---------4
 512     jnz syscall_trace_entry
 513     cmpl $(NR_syscalls), %eax
 514     jae syscall_badsys
 515 syscall_call:                                      --------5
 516     call *sys_call_table(,%eax,4)
 517 syscall_after_call:                                --------6
 518     movl %eax,PT_EAX(%esp)      # store the return value
 519 syscall_exit:                                      --------7
 520     LOCKDEP_SYS_EXIT
 521     DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
 522                     # setting need_resched or sigpending
 523                     # between sampling and the iret
 524     TRACE_IRQS_OFF
 525     movl TI_flags(%ebp), %ecx
 526     testl $_TIF_ALLWORK_MASK, %ecx  # current->work
 527     jne syscall_exit_work
 528
  1. RING0_INT_FRAME是CFI(调用帧信息)注解宏,只为栈回溯描述此时的栈帧布局,并不修改esp、eip;切换到内核栈并保存用户态返回现场是CPU通过门进入内核时由硬件完成的。随后pushl_cfi %eax将eax中的系统调用号入栈,保存为orig_eax
  2. 保存现场,即用户态的一些寄存器值
  3. 将thread_info的地址保存到ebp寄存器
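  4. 检查当前进程的thread_info flags中是否设置了_TIF_WORK_SYSCALL_ENTRY里的标志位(即系统调用是否被trace/模拟);若设置,则跳转到syscall_trace_entry处理。否则继续检查系统调用号,若不小于NR_syscalls则跳转到syscall_badsys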
  5. 调用对应的系统调用函数
  6. 将返回值(eax)写入栈上保存的寄存器现场中的eax位置(PT_EAX(%esp)),这里是覆盖已保存的值而不是压栈;返回用户态恢复寄存器时,用户程序即可从eax得到返回值
  7. 屏蔽其他中断。检测当前进程是否还有工作没有完成,如果有,那么跳转到syscall_exit_work
  8. 然后恢复userspace被压入栈的寄存器,返回userspace
 529 restore_all:
 530     TRACE_IRQS_IRET
 531 restore_all_notrace:
 532 #ifdef CONFIG_X86_ESPFIX32
 533     movl PT_EFLAGS(%esp), %eax  # mix EFLAGS, SS and CS     ------------1
 534     # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
 535     # are returning to the kernel.
 536     # See comments in process.c:copy_thread() for details.
 537     movb PT_OLDSS(%esp), %ah
 538     movb PT_CS(%esp), %al
 539     andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
 540     cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
 541     CFI_REMEMBER_STATE
 542     je ldt_ss           # returning to user-space with LDT SS
 543 #endif
 544 restore_nocheck:
 545     RESTORE_REGS 4          # skip orig_eax/error_code
 546 irq_return:
 547     INTERRUPT_RETURN                            ----------
 548 .section .fixup,"ax"
 549 ENTRY(iret_exc)
 550     pushl $0            # no error code
 551     pushl $do_iret_error
 552     jmp error_code
 553 .previous
 554     _ASM_EXTABLE(irq_return,iret_exc)
 555
 556 #ifdef CONFIG_X86_ESPFIX32
 557     CFI_RESTORE_STATE
 558 ldt_ss:
 559 #ifdef CONFIG_PARAVIRT
 560     /*
 561      * The kernel can't run on a non-flat stack if paravirt mode
 562      * is active.  Rather than try to fixup the high bits of
 563      * ESP, bypass this code entirely.  This may break DOSemu
 564      * and/or Wine support in a paravirt VM, although the option
 565      * is still available to implement the setting of the high
 566      * 16-bits in the INTERRUPT_RETURN paravirt-op.
 567      */
 568     cmpl $0, pv_info+PARAVIRT_enabled
 569     jne restore_nocheck
 570 #endif
 571
 572 /*
 573  * Setup and switch to ESPFIX stack
 574  *
 575  * We're returning to userspace with a 16 bit stack. The CPU will not
 576  * restore the high word of ESP for us on executing iret... This is an
 577  * "official" bug of all the x86-compatible CPUs, which we can work
 578  * around to make dosemu and wine happy. We do this by preloading the
 579  * high word of ESP with the high word of the userspace ESP while
 580  * compensating for the offset by changing to the ESPFIX segment with
 581  * a base address that matches for the difference.
 582  */
 583 #define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
 584     mov %esp, %edx          /* load kernel esp */
 585     mov PT_OLDESP(%esp), %eax   /* load userspace esp */
 586     mov %dx, %ax            /* eax: new kernel esp */
 587     sub %eax, %edx          /* offset (low word is 0) */
 588     shr $16, %edx
 589     mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
 590     mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
 591     pushl_cfi $__ESPFIX_SS
 592     pushl_cfi %eax          /* new kernel esp */
 593     /* Disable interrupts, but do not irqtrace this section: we
 594      * will soon execute iret and the tracer was already set to
 595      * the irqstate after the iret */
 596     DISABLE_INTERRUPTS(CLBR_EAX)
 597     lss (%esp), %esp        /* switch to espfix segment */
 598     CFI_ADJUST_CFA_OFFSET -8
 599     jmp restore_nocheck
 600 #endif
 601     CFI_ENDPROC
 602 ENDPROC(system_call)

syscall_exit_work

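_TIF_ALLWORK_MASK 的定义如下(注意:这段定义中出现了_TIF_ASYNC_TLB,看起来出自arch/tile的thread_info.h而不是x86;x86下该宏的具体定义不同,请以所分析内核版本的arch/x86/include/asm/thread_info.h为准):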

144 /* Work to do on any return to user space. */
145 #define _TIF_ALLWORK_MASK \
146   (_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SINGLESTEP|\
147    _TIF_ASYNC_TLB|_TIF_NOTIFY_RESUME)

当以下情况之一发生时,返回用户态之前需要进入syscall_exit_work处理:
1. 当前进程有信号pending
2. 当前进程需要被重新调度
3. 设置了_TIF_SINGLESTEP:返回用户态时需要恢复单步执行(singlestep)
4. 设置了_TIF_ASYNC_TLB:在内核中发生了异步TLB fault
5. 设置了_TIF_NOTIFY_RESUME:返回用户态之前需要执行回调(callback)

 670 syscall_exit_work:
 671     testl $_TIF_WORK_SYSCALL_EXIT, %ecx----------1
 672     jz work_pending
 673     TRACE_IRQS_ON
 674     ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
 675                     # schedule() instead
 676     movl %esp, %eax
 677     call syscall_trace_leave
 678     jmp resume_userspace------------------------2
 679 END(syscall_exit_work)
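  1. 检测%ecx中的flags是否设置了_TIF_WORK_SYSCALL_EXIT里的标志位;若没有设置,跳转到work_pending处理其余pending的工作
  2. 若设置了,则开中断,将%esp的值放入%eax后调用syscall_trace_leave,然后跳转到resume_userspace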
 607 work_pending:
 608     testb $_TIF_NEED_RESCHED, %cl     -------------1
 609     jz work_notifysig                 -------------2
 610 work_resched:                         -------------3     
 611     call schedule
 612     LOCKDEP_SYS_EXIT
 613     DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
 614                     # setting need_resched or sigpending
 615                     # between sampling and the iret
 616     TRACE_IRQS_OFF
 617     movl TI_flags(%ebp), %ecx
 618     andl $_TIF_WORK_MASK, %ecx  # is there any work to be done other
 619                     # than syscall tracing?
 620     jz restore_all
 621     testb $_TIF_NEED_RESCHED, %cl
 622     jnz work_resched
 623
 624 work_notifysig:             # deal with pending signals and-------------------4
 625                     # notify-resume requests
 626 #ifdef CONFIG_VM86
 627     testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
 628     movl %esp, %eax
 629     jne work_notifysig_v86      # returning to kernel-space or
 630                     # vm86-space
 631 1:
 632 #else
 633     movl %esp, %eax
 634 #endif
 635     TRACE_IRQS_ON
 636     ENABLE_INTERRUPTS(CLBR_NONE)
 637     movb PT_CS(%esp), %bl
 638     andb $SEGMENT_RPL_MASK, %bl
 639     cmpb $USER_RPL, %bl
 640     jb resume_kernel
 641     xorl %edx, %edx
 642     call do_notify_resume -------------------5
 643     jmp resume_userspace
 644
 645 #ifdef CONFIG_VM86
 646     ALIGN
 647 work_notifysig_v86:
 648     pushl_cfi %ecx          # save ti_flags for do_notify_resume
 649     call save_v86_state     # %eax contains pt_regs pointer
 650     popl_cfi %ecx
 651     movl %eax, %esp
 652     jmp 1b
 653 #endif
 654 END(work_pending)
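  1. 检测_TIF_NEED_RESCHED是否被设置
  2. 若未设置,跳转到work_notifysig
  3. 若已设置,顺序进入work_resched:调用schedule主动让出CPU;返回后关中断并重新读取flags,若_TIF_WORK_MASK中已没有标志位则跳转到restore_all,若仍需调度则回到work_resched,否则继续向下进入work_notifysig
  4. work_notifysig:处理pending的信号和notify-resume请求;开中断后先检查PT_CS中的特权级,若小于USER_RPL(即将返回内核态)则跳转到resume_kernel
  5. 调用do_notify_resume,信号的具体处理流程在其中的do_signal里,完成后跳转到resume_userspace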

整个处理流程用流程图表现得更加直观:
(原文此处为流程图,图片未能保留。)

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值