刘文学 原创作品转载请注明出处 http://blog.csdn.net/wdxz6547/article/details/50993837《Linux内核分析》MOOC课程http://mooc.study.163.com/course/USTC-1000029000
预备知识
- 内核态
- 用户态
- 为什么要划分系统级别?
- 如何区分内核和用户态? cs:eip
- 寄存器上下文
- 上下文切换
- 系统调用号
- 中断向量
- 调度时机
系统调用过程中一定发生中断, 在系统调用执行过程中可能有进程的切换.
系统调用分类
进程控制
load
execute
end, abort
create process (for example, fork on Unix-like systems, or NtCreateProcess in the Windows NT Native API)
terminate process
get/set process attributes
wait for time, wait event, signal event
allocate, free memory
文件管理
create file, delete file
open, close
read, write, reposition
get/set file attributes
设备管理
request device, release device
read, write, reposition
get/set device attributes
logically attach or detach devices
信息管理
get/set time or date
get/set system data
get/set process, file, or device attributes
通信
create, delete communication connection
send, receive messages
transfer status information
attach or detach remote devices
什么是 system-call
linux 系统64 位系统调用表
上面的系统调用并不需要全部掌握, 可以慢慢来; 但需要记住的是, 64 位系统共有 326 个系统调用.
系统调用入口
内核初始化从 init/main.c 中的 start_kernel 函数开始. 其中一个初始化步骤是 setup_arch,
对于 x86 来说, 实际调用的是 arch/x86/kernel/setup.c 中的 setup_arch 函数. 而
setup_arch 又调用了 early_trap_init 函数. 这部分与系统启动相关, 这里不详细讨论.
初始化
syscall 的初始化由 syscall_init 完成, 它在 arch/x86/kernel/cpu/common.c 的 cpu_init 函数中被调用.
linux/arch/x86/kernel/cpu/common.c
/*
 * Program the MSRs that describe the SYSCALL/SYSENTER entry points:
 * the segment selectors (MSR_STAR), the 64-bit entry (MSR_LSTAR), the
 * compat entry (MSR_CSTAR), the three SYSENTER MSRs, and the mask of
 * flags to clear on syscall (MSR_SYSCALL_MASK).
 */
void syscall_init(void)
{
/*
* LSTAR and STAR live in a bit strange symbiosis.
* They both write to the same internal register. STAR allows to
* set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
*/
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION // 32-bit programs may run on the 64-bit kernel
wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
/*
* This only works on Intel CPUs.
* On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
* This does not cause SYSENTER to jump to the wrong location, because
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
*/
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else // 32-bit programs are not supported on the 64-bit kernel
/* Without compat support the compat SYSCALL entry is the ignore_sysret stub. */
wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
//[GDT](https://en.wikipedia.org/wiki/Global_Descriptor_Table)
/* SYSENTER gets an invalid segment and a zero stack pointer and entry address. */
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
#endif
/* Flags to clear on syscall */
wrmsrl(MSR_SYSCALL_MASK,
X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
}
linux/arch/x86/include/asm/segment.h
/* Kernel code segment selector: the GDT entry index scaled by 8. */
#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
/*
 * 32-bit user code segment selector: the GDT entry index scaled by 8, with
 * the low two bits set to 3 (presumably the requested privilege level,
 * i.e. ring 3 -- confirm against the x86 selector format).
 */
#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3)
linux/arch/x86/include/uapi/asm/processor-flags.h
/*
 * EFLAGS masks. Each single-bit flag is _BITUL() applied to its bit
 * position (presumably 1UL << bit -- confirm against the _BITUL definition).
 */
#define X86_EFLAGS_TF _BITUL(X86_EFLAGS_TF_BIT)
#define X86_EFLAGS_DF _BITUL(X86_EFLAGS_DF_BIT)
#define X86_EFLAGS_IF _BITUL(X86_EFLAGS_IF_BIT)
#define X86_EFLAGS_AC _BITUL(X86_EFLAGS_AC_BIT)
/* IOPL is a two-bit-wide mask: the value 3 shifted to X86_EFLAGS_IOPL_BIT. */
#define X86_EFLAGS_IOPL (_AC(3,UL) << X86_EFLAGS_IOPL_BIT)
linux/arch/x86/include/asm/msr.h
/*
 * native_write_msr - write a model-specific register with WRMSR.
 * @msr:  MSR index, passed to the instruction in ECX
 * @low:  low 32 bits of the value, passed in EAX
 * @high: high 32 bits of the value, passed in EDX
 *
 * After the write, if the checked tracepoint is active, the write is
 * reported through do_trace_write_msr() with the 64-bit value
 * (high << 32 | low) and an error code of 0.
 */
static inline void native_write_msr(unsigned int msr,
				    unsigned low, unsigned high)
{
	/* Write the value in EDX:EAX to MSR specified by ECX. MSR[ECX] = EDX:EAX; */
	asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
	/*
	 * NOTE(review): this write path tests the read_msr tracepoint; it looks
	 * like it should test the write_msr one -- confirm against the kernel
	 * version being quoted before changing it.
	 */
	if (msr_tracepoint_active(__tracepoint_read_msr))
		do_trace_write_msr(msr, ((u64)high << 32 | low), 0);
}
/*
 * native_write_msr_safe - write a model-specific register, catching faults.
 * @msr:  MSR index, passed to the instruction in ECX
 * @low:  low 32 bits of the value, passed in EAX
 * @high: high 32 bits of the value, passed in EDX
 *
 * Returns 0 when WRMSR completes, or -EIO when the instruction faults and
 * the exception-table fixup runs instead.
 *
 * Can be uninlined because referenced by paravirt.
 */
notrace static inline int native_write_msr_safe(unsigned int msr,
						unsigned low, unsigned high)
{
	int err;

	/*
	 * Label 2 is the WRMSR itself; on success err is zeroed and execution
	 * falls through to label 1. The exception table maps a fault at 2 to
	 * the fixup at label 3, which loads -EIO into err and jumps back to 1.
	 * err shares EAX with 'low' (the "0" constraint), so it is only
	 * written after WRMSR has consumed the input.
	 */
	asm volatile("2: wrmsr ; xor %[err],%[err]\n"
		     "1:\n\t"
		     ".section .fixup,\"ax\"\n\t"
		     "3: mov %[fault],%[err] ; jmp 1b\n\t"
		     ".previous\n\t"
		     _ASM_EXTABLE(2b, 3b)
		     : [err] "=a" (err)
		     : "c" (msr), "0" (low), "d" (high),
		       [fault] "i" (-EIO)
		     : "memory");
	/*
	 * NOTE(review): this write path tests the read_msr tracepoint; it looks
	 * like it should test the write_msr one -- confirm against the kernel
	 * version being quoted before changing it.
	 */
	if (msr_tracepoint_active(__tracepoint_read_msr))
		do_trace_write_msr(msr, ((u64)high << 32 | low), err);
	return err;
}
/*
 * wrmsr - write an MSR from two 32-bit halves.
 * @msr:  index of the model-specific register
 * @low:  low 32 bits of the value
 * @high: high 32 bits of the value
 *
 * Forwards its arguments unchanged to native_write_msr().
 */
static inline void
wrmsr(unsigned msr, unsigned low, unsigned high)
{
	native_write_msr(msr, low, high);
}
/*
 * wrmsrl - write a 64-bit value to an MSR.
 * @msr: index of the model-specific register
 * @val: 64-bit value to write
 *
 * Splits @val into its low and high 32-bit halves and hands them to
 * native_write_msr().
 */
static inline void wrmsrl(unsigned msr, u64 val)
{
	const u32 lo_half = (u32)(val & 0xffffffffULL);
	const u32 hi_half = (u32)(val >> 32);

	native_write_msr(msr, lo_half, hi_half);
}
/*
 * wrmsr_safe - wrmsr with exception handling.
 * @msr:  index of the model-specific register
 * @low:  low 32 bits of the value
 * @high: high 32 bits of the value
 *
 * Returns whatever native_write_msr_safe() returns for the same arguments.
 */
static inline int
wrmsr_safe(unsigned msr, unsigned low, unsigned high)
{
	int status = native_write_msr_safe(msr, low, high);

	return status;
}
#define wrmsrl_safe(msr, val) wrmsr_safe((msr), (u32)(val), \
(u32)((val) >> 32))
linux/arch/x86/entry/entry_64.S 1485 行
/*
 * ignore_sysret: load -ENOSYS into %eax and execute sysret.
 * syscall_init installs this stub in MSR_CSTAR when CONFIG_IA32_EMULATION
 * is not set.
 */
ENTRY(ignore_sysret)
mov $-ENOSYS, %eax
sysret
END(ignore_sysret)
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
Bits 63:48 of MSR_STAR hold the user code segment selector. These bits
are loaded into the CS and SS segment registers by the sysret
instruction, which returns from a system call to user code with the
corresponding privilege level. Bits 47:32 of MSR_STAR hold the kernel
code segment selector, which is used as the base selector for the CS
and SS segment registers when a user-space application executes a
system call.
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
加载 entry_SYSCALL_64 到 MSR_LSTAR 中, 其中 entry_SYSCALL_64 的定义
在[这里](http://code.woboq.org/linux/linux/arch/x86/entry/entry_64.S.html)
wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
加载 entry_SYSCALL_compat 到 MSR_CSTAR 中, 其中 entry_SYSCALL_compat 的定义
在[这里](http://code.woboq.org/linux/linux/arch/x86/entry/entry_64_compat.S.html)
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
保存 __KERNEL_CS 到 MSR_IA32_SYSENTER_CS
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
MSR_IA32_SYSENTER_ESP 清零
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
MSR_IA32_SYSENTER_EIP 指向 entry_SYSENTER_compat
wrmsrl(MSR_SYSCALL_MASK, X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF| X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
将这些标志位写入 MSR_SYSCALL_MASK; 执行 syscall 指令时, rflags 中对应的标志位会被清零.
以上代码主要工作是将系统调用入口放入 MSR(model specific register)
系统调用准备
Linux 内核在进入真正的处理函数之前都要先做准备工作: 在异常被处理之前, 由 idtentry 宏执行准备工作;
在中断被处理之前, 由 interrupt 宏执行准备工作; 在系统调用被处理之前, 由 entry_SYSCALL_64
执行准备工作.
在控制权由用户态转到内核态后, 并不是立即就执行系统调用表中的内核函数,
原因是在系统调用完成之后还要返回用户态, 因此在调用内核系统调用函数之前, 必须
做一些准备工作: 保存用户态的信息(堆栈, 寄存器), 待系统调用完成之后恢复现场;
同时初始化内核态的寄存器, 堆栈等.
/*
* 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
* 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
* then loads new ss, cs, and rip from previously programmed MSRs.
* rflags gets masked by a value from another MSR (so CLD and CLAC
* are not needed). SYSCALL does not save anything on the stack
* and does not change rsp.
*
* Registers on entry:
* rax system call number
* rcx return address
* r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
* rdi arg0
* rsi arg1
* rdx arg2
* r10 arg3 (needs to be moved to rcx to conform to C ABI)
* r8 arg4
* r9 arg5
* (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
*
* Only called from user space.
*
* When user can change pt_regs->foo always force IRET. That is because
* it deals with uncanonical addresses better. SYSRET has trouble
* with them due to bugs in both AMD and Intel CPUs.
*/
ENTRY(entry_SYSCALL_64)
/*
* Interrupts are off on entry.
* We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
* it is too small to ever cause noticeable irq latency.
*/
SWAPGS_UNSAFE_STACK
/*
* A hypervisor implementation might want to use a label
* after the swapgs, so that it can do the swapgs
* for the guest and jump here on syscall.
*/
GLOBAL(entry_SYSCALL_64_after_swapgs)
//将旧的 rsp 保存到 rsp_scratch
movq %rsp, PER_CPU_VAR(rsp_scratch)
//rsp 指向 cpu_current_top_of_stack, 后续指令执行从 cpu_current_top_of_stack 地址开始
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/* Construct struct pt_regs on stack */
//将 $__USER_DS 压栈
pushq $__USER_DS /* pt_regs->ss */
//将 rsp_scratch 压栈, 实际为旧的 rsp
pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
/*
* Re-enable interrupts.
* We use 'rsp_scratch' as a scratch space, hence irq-off block above
* must execute atomically in the face of possible interrupt-driven
* task preemption. We must enable interrupts only after we're done
* with using rsp_scratch:
*/
//重新开启中断
ENABLE_INTERRUPTS(CLBR_NONE)
//保存通用寄存器, -ENOSYS, flags, 主要原因是系统调用会用到.
//rax - contains system call number;
//rcx - contains return address to the user space;
//r11 - contains register flags;
//rdi - contains first argument of a system call handler;
//rsi - contains second argument of a system call handler;
//rdx - contains third argument of a system call handler;
//r10 - contains fourth argument of a system call handler;
//r8 - contains fifth argument of a system call handler;
//r9 - contains sixth argument of a system call handler;
//其他寄存器 rbp, rbx, r12~r15 在 C-ABI 作为 callee-preserved
//其中 ENOSYS 是没有实现系统调用的错误代码
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
pushq %rcx /* pt_regs->cx */
pushq $-ENOSYS /* pt_regs->ax */
pushq %r8 /* pt_regs->r8 */
pushq %r9 /* pt_regs->r9 */
pushq %r10 /* pt_regs->r10 */
pushq %r11 /* pt_regs->r11 */
sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
//测试是否进入系统跟踪
testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz tracesys
entry_SYSCALL_64_fastpath:
#if __SYSCALL_MASK == ~0
//__NR_syscall_max 为最大系统调用号
cmpq $__NR_syscall_max, %rax
#else
andl $__SYSCALL_MASK, %eax
cmpl $__NR_syscall_max, %eax
#endif
//CF, ZF 标志是否清零, 如果是跳到 1:
ja 1f /* return -ENOSYS (already in pt_regs->ax) */
//如果有正确的系统调用, 第四个参数赋值给给 rcx.
movq %r10, %rcx
//调用系统调用表中的函数, 系统调用表见前面分析.
call *sys_call_table(, %rax, 8)
//#define RAX 10*8
//将 rax(系统调用返回结果)保存到 rsp
movq %rax, RAX(%rsp)
1:
/*
* Syscall return path ending with SYSRET (fast path).
* Has incompletely filled pt_regs.
*/
//见后面附注
LOCKDEP_SYS_EXIT
/*
* We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
* it is too small to ever cause noticeable irq latency.
*/
DISABLE_INTERRUPTS(CLBR_NONE)
/*
* We must check ti flags with interrupts (or at least preemption)
* off because we must *never* return to userspace without