Init
在trap_init中对SYSCALL_VECTOR(编号0x80)的向量进行初始化。
808 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
将system_call注册为trap门,加入到IDT table中。发生中断以后,CPU会跳转到system_call的地址去执行后续的中断流程。从发生中断到跳转执行中断向量的过程,在《kernel 中断分析三——中断处理流程》中有详细解释,本篇只关注system_call的运行过程。
ENTRY(system_call)
499 /*
500 * syscall stub including irq exit should be protected against kprobes
501 */
502 .pushsection .kprobes.text, "ax"
503 # system call handler stub
504 ENTRY(system_call)
505 RING0_INT_FRAME # can't unwind into user space anyway
506 ASM_CLAC
507 pushl_cfi %eax # save orig_eax --------------1
508 SAVE_ALL -----------2
509 GET_THREAD_INFO(%ebp) -----------3
510 # system call tracing in operation / emulation
511 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) ---------4
512 jnz syscall_trace_entry
513 cmpl $(NR_syscalls), %eax
514 jae syscall_badsys
515 syscall_call: --------5
516 call *sys_call_table(,%eax,4)
517 syscall_after_call: --------6
518 movl %eax,PT_EAX(%esp) # store the return value
519 syscall_exit: --------7
520 LOCKDEP_SYS_EXIT
521 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
522 # setting need_resched or sigpending
523 # between sampling and the iret
524 TRACE_IRQS_OFF
525 movl TI_flags(%ebp), %ecx
526 testl $_TIF_ALLWORK_MASK, %ecx # current->work
527 jne syscall_exit_work
528
- RING0_INT_FRAME只是一组CFI(栈回溯)注解,用来描述进入内核时栈上esp、eip的位置,本身不生成任何指令;随后将eax中的系统调用号入栈(保存为orig_eax)
- 保存现场,即用户态的一些寄存器值
- 将thread_info的地址保存到ebp寄存器
- 检测当前进程是否被trace(_TIF_WORK_SYSCALL_ENTRY),如果是就跳转到syscall_trace_entry执行相关的追踪动作;随后检查系统调用号是否越界(不小于NR_syscalls),越界则跳转到syscall_badsys
- 调用对应的系统调用函数
- 将返回值(eax)写入栈上保存的eax位置(PT_EAX),返回用户态恢复寄存器时用户程序即可从eax得到返回值
- 屏蔽其他中断。检测当前进程是否还有工作没有完成,如果有,那么跳转到syscall_exit_work
- 然后恢复userspace被压入栈的寄存器,返回userspace
529 restore_all:
530 TRACE_IRQS_IRET
531 restore_all_notrace:
532 #ifdef CONFIG_X86_ESPFIX32
533 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS ------------1
534 # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
535 # are returning to the kernel.
536 # See comments in process.c:copy_thread() for details.
537 movb PT_OLDSS(%esp), %ah
538 movb PT_CS(%esp), %al
539 andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
540 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
541 CFI_REMEMBER_STATE
542 je ldt_ss # returning to user-space with LDT SS
543 #endif
544 restore_nocheck:
545 RESTORE_REGS 4 # skip orig_eax/error_code
546 irq_return:
547 INTERRUPT_RETURN ----------
548 .section .fixup,"ax"
549 ENTRY(iret_exc)
550 pushl $0 # no error code
551 pushl $do_iret_error
552 jmp error_code
553 .previous
554 _ASM_EXTABLE(irq_return,iret_exc)
555
556 #ifdef CONFIG_X86_ESPFIX32
557 CFI_RESTORE_STATE
558 ldt_ss:
559 #ifdef CONFIG_PARAVIRT
560 /*
561 * The kernel can't run on a non-flat stack if paravirt mode
562 * is active. Rather than try to fixup the high bits of
563 * ESP, bypass this code entirely. This may break DOSemu
564 * and/or Wine support in a paravirt VM, although the option
565 * is still available to implement the setting of the high
566 * 16-bits in the INTERRUPT_RETURN paravirt-op.
567 */
568 cmpl $0, pv_info+PARAVIRT_enabled
569 jne restore_nocheck
570 #endif
571
572 /*
573 * Setup and switch to ESPFIX stack
574 *
575 * We're returning to userspace with a 16 bit stack. The CPU will not
576 * restore the high word of ESP for us on executing iret... This is an
577 * "official" bug of all the x86-compatible CPUs, which we can work
578 * around to make dosemu and wine happy. We do this by preloading the
579 * high word of ESP with the high word of the userspace ESP while
580 * compensating for the offset by changing to the ESPFIX segment with
581 * a base address that matches for the difference.
582 */
583 #define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
584 mov %esp, %edx /* load kernel esp */
585 mov PT_OLDESP(%esp), %eax /* load userspace esp */
586 mov %dx, %ax /* eax: new kernel esp */
587 sub %eax, %edx /* offset (low word is 0) */
588 shr $16, %edx
589 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
590 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
591 pushl_cfi $__ESPFIX_SS
592 pushl_cfi %eax /* new kernel esp */
593 /* Disable interrupts, but do not irqtrace this section: we
594 * will soon execute iret and the tracer was already set to
595 * the irqstate after the iret */
596 DISABLE_INTERRUPTS(CLBR_EAX)
597 lss (%esp), %esp /* switch to espfix segment */
598 CFI_ADJUST_CFA_OFFSET -8
599 jmp restore_nocheck
600 #endif
601 CFI_ENDPROC
602 ENDPROC(system_call)
syscall_exit_work
_TIF_ALLWORK_MASK 的定义如下:
144 /* Work to do on any return to user space. */
145 #define _TIF_ALLWORK_MASK \
146 (_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SINGLESTEP|\
147 _TIF_ASYNC_TLB|_TIF_NOTIFY_RESUME)
当以下情况之一发生时,返回用户态之前需要进入syscall_exit_work处理:
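1. 当前进程有信号pending(_TIF_SIGPENDING)
2. 当前进程需要被重新调度(_TIF_NEED_RESCHED)
3. 设置了_TIF_SINGLESTEP,返回用户态时需要恢复单步执行(restore singlestep on return to user mode)
4. 内核中发生了异步TLB fault(_TIF_ASYNC_TLB,got an async TLB fault in kernel)
5. 返回用户态之前有回调需要执行(_TIF_NOTIFY_RESUME,callback before returning to user)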
670 syscall_exit_work:
671 testl $_TIF_WORK_SYSCALL_EXIT, %ecx----------1
672 jz work_pending
673 TRACE_IRQS_ON
674 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
675 # schedule() instead
676 movl %esp, %eax
677 call syscall_trace_leave
678 jmp resume_userspace------------------------2
679 END(syscall_exit_work)
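- 检测是否设置了_TIF_WORK_SYSCALL_EXIT中的标志(系统调用退出时的跟踪工作);如果没有,跳转到work_pending处理其余待办工作
- 否则开中断,调用syscall_trace_leave,然后跳转到resume_userspace返回用户态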
607 work_pending:
608 testb $_TIF_NEED_RESCHED, %cl -------------1
609 jz work_notifysig -------------2
610 work_resched: -------------3
611 call schedule
612 LOCKDEP_SYS_EXIT
613 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
614 # setting need_resched or sigpending
615 # between sampling and the iret
616 TRACE_IRQS_OFF
617 movl TI_flags(%ebp), %ecx
618 andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
619 # than syscall tracing?
620 jz restore_all
621 testb $_TIF_NEED_RESCHED, %cl
622 jnz work_resched
623
624 work_notifysig: # deal with pending signals and-------------------4
625 # notify-resume requests
626 #ifdef CONFIG_VM86
627 testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
628 movl %esp, %eax
629 jne work_notifysig_v86 # returning to kernel-space or
630 # vm86-space
631 1:
632 #else
633 movl %esp, %eax
634 #endif
635 TRACE_IRQS_ON
636 ENABLE_INTERRUPTS(CLBR_NONE)
637 movb PT_CS(%esp), %bl
638 andb $SEGMENT_RPL_MASK, %bl
639 cmpb $USER_RPL, %bl
640 jb resume_kernel
641 xorl %edx, %edx
642 call do_notify_resume -------------------5
643 jmp resume_userspace
644
645 #ifdef CONFIG_VM86
646 ALIGN
647 work_notifysig_v86:
648 pushl_cfi %ecx # save ti_flags for do_notify_resume
649 call save_v86_state # %eax contains pt_regs pointer
650 popl_cfi %ecx
651 movl %eax, %esp
652 jmp 1b
653 #endif
654 END(work_pending)
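- 检测_TIF_NEED_RESCHED,若没有被设置,跳转到work_notifysig进行信号处理;若被设置,则顺序执行进入work_resched
- 调用schedule主动让出CPU;schedule返回后重新关中断并读取TI_flags,若已没有待处理的工作则跳转到restore_all,若仍需要调度则再次回到work_resched,否则继续进入work_notifysig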
- 处理pending的信号,具体的处理流程在do_notify_resume 中的do_signal
整个处理流程用流程图表现得更加直观: