内核分析-第五周

刘文学 原创作品转载请注明出处 http://blog.csdn.net/wdxz6547/article/details/50993837《Linux内核分析》MOOC课程http://mooc.study.163.com/course/USTC-1000029000

预备知识

  • 内核态
  • 用户态
  • 为什么要划分系统级别?
  • 如何区分内核和用户态? cs:eip
  • 寄存器上下文
  • 上下文切换
  • 系统调用号
  • 中断向量
  • 调度时机

系统调用过程中一定发生中断, 在系统调用执行过程中可能有进程的切换.

系统调用分类

进程控制

load
execute
end, abort
create process (for example, fork on Unix-like systems, or NtCreateProcess in the Windows NT Native API)
terminate process
get/set process attributes
wait for time, wait event, signal event
allocate, free memory

文件管理

create file, delete file
open, close
read, write, reposition
get/set file attributes

设备管理

request device, release device
read, write, reposition
get/set device attributes
logically attach or detach devices

信息管理

get/set time or date
get/set system data
get/set process, file, or device attributes

通信

create, delete communication connection
send, receive messages
transfer status information
attach or detach remote devices

什么是 system-call

linux 系统64 位系统调用表

32 位系统调用表

上面的系统调用并不需要完全掌握, 可以慢慢来; 但需要记住的是, 64 位系统共有 326 个系统调用.

系统调用入口

内核初始化从 init/main.c 中的 start_kernel 函数开始. 其中一个初始化步骤是 setup_arch,
对于 x86 来说, 实际调用的是 arch/x86/kernel/setup.c 中的 setup_arch 函数. 而
setup_arch 又调用了 early_trap_init 函数. 这部分与系统启动相关, 这里不详细讨论.

初始化

syscall 的初始化函数 syscall_init 定义在 arch/x86/kernel/cpu/common.c 中, 由 cpu_init 函数调用.

linux/arch/x86/kernel/cpu/common.c

    /*
     * Program the model-specific registers (MSRs) that hold the system-call
     * entry points: MSR_STAR, MSR_LSTAR, MSR_CSTAR, the three
     * MSR_IA32_SYSENTER_* registers and MSR_SYSCALL_MASK.
     */
    void syscall_init(void)
    {
        /*
         * LSTAR and STAR live in a bit strange symbiosis.
         * They both write to the same internal register. STAR allows to
         * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
         */
        /* wrmsr(msr, low, high): low half is 0, high half carries both selectors. */
        wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
        wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);

    #ifdef CONFIG_IA32_EMULATION // the 64-bit kernel is allowed to run 32-bit programs
        wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
        /*
         * This only works on Intel CPUs.
         * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
         * This does not cause SYSENTER to jump to the wrong location, because
         * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
         */
        wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
        wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
        wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
    #else // the 64-bit kernel is not allowed to run 32-bit programs
        /* Compat SYSCALL lands in a stub; the SYSENTER MSRs are invalidated/zeroed. */
        wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
        // GDT = Global Descriptor Table, see https://en.wikipedia.org/wiki/Global_Descriptor_Table
        wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
        wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
        wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
    #endif

        /* Flags to clear on syscall */
        wrmsrl(MSR_SYSCALL_MASK,
               X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
               X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
    }


linux/arch/x86/include/asm/segment.h

    /* Kernel code-segment selector: the GDT entry index multiplied by 8. */
    #define __KERNEL_CS         (GDT_ENTRY_KERNEL_CS*8)
    /*
     * 32-bit user code-segment selector: the GDT entry index multiplied by 8,
     * plus 3. NOTE(review): the +3 is presumably the requested privilege
     * level for user mode -- confirm against the selector format.
     */
    #define __USER32_CS         (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3)

linux/arch/x86/include/uapi/asm/processor-flags.h

    /*
     * EFLAGS masks. TF, DF, IF and AC are each built with _BITUL() from the
     * matching *_BIT position constant; IOPL is the value 3 shifted to
     * X86_EFLAGS_IOPL_BIT, i.e. a two-bit-wide mask.
     */
    #define X86_EFLAGS_TF       _BITUL(X86_EFLAGS_TF_BIT)
    #define X86_EFLAGS_DF       _BITUL(X86_EFLAGS_DF_BIT)
    #define X86_EFLAGS_IF       _BITUL(X86_EFLAGS_IF_BIT)
    #define X86_EFLAGS_AC       _BITUL(X86_EFLAGS_AC_BIT)
    #define X86_EFLAGS_IOPL     (_AC(3,UL) << X86_EFLAGS_IOPL_BIT)

linux/arch/x86/include/asm/msr.h


    static inline void native_write_msr(unsigned int msr,
                        unsigned low, unsigned high)
    {
        //Write the value in EDX:EAX to MSR specified by ECX. MSR[ECX] = EDX:EAX;
        asm volatile(""wrmsr"" : : ""c"" (msr), ""a""(low), ""d"" (high) : ""memory"");
        if (msr_tracepoint_active(__tracepoint_read_msr))
            do_trace_write_msr(msr, ((u64)high << 32 | low), 0);
    }

    /* Can be uninlined because referenced by paravirt */
    notrace static inline int native_write_msr_safe(unsigned int msr,
                        unsigned low, unsigned high)
    {
        int err;
        asm volatile(""2: wrmsr ; xor %[err],%[err]\n""
                 ""1:\n\t""
                 "".section .fixup,\"ax\"\n\t""
                 ""3:  mov %[fault],%[err] ; jmp 1b\n\t""
                 "".previous\n\t""
                 _ASM_EXTABLE(2b, 3b)
                 : [err] ""=a"" (err)
                 : ""c"" (msr), ""0"" (low), ""d"" (high),
                   [fault] ""i"" (-EIO)
                 : ""memory"");
        if (msr_tracepoint_active(__tracepoint_read_msr))
            do_trace_write_msr(msr, ((u64)high << 32 | low), err);
        return err;
    }

    /*
     * Write the 64-bit value high:low to the MSR numbered msr.
     * Forwards its arguments unchanged to native_write_msr().
     */
    static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
    {
        native_write_msr(msr, low, high);
    }

    /*
     * Write the 64-bit value val to the MSR numbered msr.
     * The value is split into its low and high 32-bit halves, which are
     * handed to native_write_msr() in that order.
     */
    static inline void wrmsrl(unsigned msr, u64 val)
    {
        const u32 lo_half = (u32)(val & 0xffffffffULL);
        const u32 hi_half = (u32)(val >> 32);

        native_write_msr(msr, lo_half, hi_half);
    }

    /*
     * wrmsr with exception handling.
     * Forwards its arguments to native_write_msr_safe() and returns that
     * function's result.
     */
    static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high)
    {
        return native_write_msr_safe(msr, low, high);
    }

    /*
     * 64-bit form of wrmsr_safe(): passes the low 32 bits of val and then
     * the high 32 bits (val >> 32) to wrmsr_safe(). val is evaluated twice.
     */
    #define wrmsrl_safe(msr, val) wrmsr_safe((msr), (u32)(val),     \
                             (u32)((val) >> 32))

linux/arch/x86/entry/entry_64.S 1485 行

    /*
     * Stub entry point: loads -ENOSYS into %eax and returns with sysret.
     * syscall_init() installs it in MSR_CSTAR when CONFIG_IA32_EMULATION
     * is not defined.
     */
    ENTRY(ignore_sysret)
        mov $-ENOSYS, %eax
        sysret
    END(ignore_sysret)

wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);

Bits 63:48 of MSR_STAR hold the user code segment selector. It is used
as the base for the CS and SS segment registers loaded by the sysret
instruction, which returns from a system call to user code with the
related privilege. Bits 47:32 of MSR_STAR hold the kernel code segment
selector, which is used as the base selector for the CS and SS segment
registers when a user space application executes a system call.

wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);

加载 entry_SYSCALL_64 到 MSR_LSTAR 中, 其中 entry_SYSCALL_64 的定义
在[这里](http://code.woboq.org/linux/linux/arch/x86/entry/entry_64.S.html)

wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);

加载 entry_SYSCALL_compat 到 MSR_CSTAR 中, 其中 entry_SYSCALL_compat 的定义
在[这里](http://code.woboq.org/linux/linux/arch/x86/entry/entry_64_compat.S.html)

wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);

保存 __KERNEL_CS 到 MSR_IA32_SYSENTER_CS

wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);

MSR_IA32_SYSENTER_ESP 清零

wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);

MSR_IA32_SYSENTER_EIP 指向 entry_SYSENTER_compat

wrmsrl(MSR_SYSCALL_MASK, X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF| X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);

这些标志位被写入 MSR_SYSCALL_MASK; 执行 syscall 指令时, rflags 中对应的标志位会被清零.

以上代码主要工作是将系统调用入口放入 MSR(model specific register)

系统调用准备

Linux 内核在处理异常, 中断和系统调用之前都要先做准备工作: 在异常被处理之前, idtentry 宏执行准备工作;
在中断被处理之前, interrupt 宏执行准备工作; 在系统调用被处理之前, entry_SYSCALL_64
将做准备工作.

在控制权由用户态转到内核态后, 并不是立即就执行系统调用表中的内核函数,
原因是在系统调用完成之后还要返回用户态, 因此在调用内核系统调用函数之前, 必须
做一些准备工作: 保存用户态的信息(堆栈, 寄存器), 待系统调用完成之后恢复现场;
并初始化内核态的寄存器, 堆栈等等.

/*
 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
 *
 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Registers on entry:
 * rax  system call number
 * rcx  return address
 * r11  saved rflags (note: r11 is callee-clobbered register in C ABI)
 * rdi  arg0
 * rsi  arg1
 * rdx  arg2
 * r10  arg3 (needs to be moved to rcx to conform to C ABI)
 * r8   arg4
 * r9   arg5
 * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
 *
 * Only called from user space.
 *
 * When user can change pt_regs->foo always force IRET. That is because
 * it deals with uncanonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */

ENTRY(entry_SYSCALL_64)
    /*
     * Interrupts are off on entry.
     * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
     * it is too small to ever cause noticeable irq latency.
     */
    SWAPGS_UNSAFE_STACK
    /*
     * A hypervisor implementation might want to use a label
     * after the swapgs, so that it can do the swapgs
     * for the guest and jump here on syscall.
     */
GLOBAL(entry_SYSCALL_64_after_swapgs)

    //将旧的 rsp 保存到 rsp_scratch
    movq    %rsp, PER_CPU_VAR(rsp_scratch)

    //rsp 指向 cpu_current_top_of_stack, 即切换到内核栈, 后续的压栈操作都从该地址开始
    movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp

    /* Construct struct pt_regs on stack */
    //将 $__USER_DS 压栈
    pushq   $__USER_DS         /* pt_regs->ss */

    //将 rsp_scratch 压栈, 实际为旧的 rsp
    pushq   PER_CPU_VAR(rsp_scratch)    /* pt_regs->sp */

    /*
     * Re-enable interrupts.
     * We use 'rsp_scratch' as a scratch space, hence irq-off block above
     * must execute atomically in the face of possible interrupt-driven
     * task preemption. We must enable interrupts only after we're done
     * with using rsp_scratch:
     */
    //重新开启中断
    ENABLE_INTERRUPTS(CLBR_NONE)

    //保存通用寄存器, -ENOSYS, flags, 主要原因是系统调用会用到.
    //rax - contains system call number;
    //rcx - contains return address to the user space;
    //r11 - contains register flags;
    //rdi - contains first argument of a system call handler;
    //rsi - contains second argument of a system call handler;
    //rdx - contains third argument of a system call handler;
    //r10 - contains fourth argument of a system call handler;
    //r8 - contains fifth argument of a system call handler;
    //r9 - contains sixth argument of a system call handler;
    //其他寄存器 rbp, rbx, r12~r15 在 C-ABI 作为 callee-preserved
    //其中 ENOSYS 是没有实现系统调用的错误代码
    pushq   %r11                /* pt_regs->flags */
    pushq   $__USER_CS         /* pt_regs->cs */
    pushq   %rcx                /* pt_regs->ip */
    pushq   %rax                /* pt_regs->orig_ax */
    pushq   %rdi                /* pt_regs->di */
    pushq   %rsi                /* pt_regs->si */
    pushq   %rdx                /* pt_regs->dx */
    pushq   %rcx                /* pt_regs->cx */
    pushq   $-ENOSYS           /* pt_regs->ax */
    pushq   %r8                 /* pt_regs->r8 */
    pushq   %r9                 /* pt_regs->r9 */
    pushq   %r10                /* pt_regs->r10 */
    pushq   %r11                /* pt_regs->r11 */
    sub $(6*8), %rsp           /* pt_regs->bp, bx, r12-15 not saved */

    //测试是否进入系统跟踪
    testl   $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
    jnz tracesys

entry_SYSCALL_64_fastpath:

#if __SYSCALL_MASK == ~0
    //__NR_syscall_max 为最大系统调用号
    cmpq    $__NR_syscall_max, %rax
#else
    andl    $__SYSCALL_MASK, %eax
    cmpl    $__NR_syscall_max, %eax
#endif
    //ja: 若系统调用号大于 __NR_syscall_max (无符号比较, 即 CF 和 ZF 均为 0), 则跳到标号 1:
    ja  1f              /* return -ENOSYS (already in pt_regs->ax) */

    //如果系统调用号合法, 将第四个参数 (r10) 赋值给 rcx, 以符合 C ABI.
    movq    %r10, %rcx
    //调用系统调用表中的函数, 系统调用表见前面分析.
    call    *sys_call_table(, %rax, 8)

    //#define RAX       10*8
    //将 rax(系统调用返回结果)保存到栈上 pt_regs 的 ax 字段, 即 RAX(%rsp)
    movq    %rax, RAX(%rsp)

1:
/*
 * Syscall return path ending with SYSRET (fast path).
 * Has incompletely filled pt_regs.
 */
    //见后面附注
    LOCKDEP_SYS_EXIT
    /*
     * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
     * it is too small to ever cause noticeable irq latency.
     */
    DISABLE_INTERRUPTS(CLBR_NONE)
    /*
     * We must check ti flags with interrupts (or at least preemption)
     * off because we must *never* return to userspace without
   
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值