Linux open系统调用的完整调用流程分析

系统调用,中断,异常是3种Linux用户空间切换到内核空间的方法
今天以open系统调用为例,分析从用户空间层到内核空间层的完整流程
我以安卓8.1源代码为例,对应的Linux内核版本是4.15.0

谷歌专门为android定制了C库叫做bionic库给安卓系统层的c/c++代码调用,open()函数定义在下面:
bionic/libc/bionic/open.cpp
int open(const char* pathname, int flags, ...) {
mode_t mode = 0;

if ((flags & O_CREAT) != 0) {
va_list args;
va_start(args, flags);
mode = static_cast<mode_t>(va_arg(args, int));
va_end(args);
}

return __openat(AT_FDCWD, pathname, force_O_LARGEFILE(flags), mode);
}

bionic/libc/arch-arm/syscalls/__openat.S
#include <private/bionic_asm.h>

ENTRY(__openat)
mov ip, r7
.cfi_register r7, ip
ldr r7, =__NR_openat
swi #0
mov r7, ip
.cfi_restore r7
cmn r0, #(MAX_ERRNO + 1)
bxls lr
neg r0, r0
b __set_errno_internal
END(__openat)

对上面的代码进行分析
__NR_openat是openat系统调用对应的系统调用号(bionic的open()最终是通过openat系统调用实现的)
r7寄存器保存系统调用号,这样子内核代码可以从这里获取系统调用号,然后调用不同的实现函数
ENTRY和END是头文件private/bionic_asm.h中定义的宏,用来标记函数符号__openat的起始和结束,因此这段汇编可以像普通C函数一样被上面的open()调用
依靠swi指令用户空间切换到内核空间
发生swi后进入系统中断向量然后执行vector_swi,源代码分析如下(CONFIG_AEABI):
//内核入口
kernel/arch/arm/kernel/entry-common.S
.align 5
ENTRY(vector_swi)
#ifdef CONFIG_CPU_V7M
v7m_exception_entry
#else
sub sp, sp, #S_FRAME_SIZE
stmia sp, {r0 - r12} @ Calling r0 - r12
ARM( add r8, sp, #S_PC )
ARM( stmdb r8, {sp, lr}^ ) @ Calling sp, lr
THUMB( mov r8, sp )
THUMB( store_user_sp_lr r8, r10, S_SP ) @ calling sp, lr
mrs r8, spsr @ called from non-FIQ mode, so ok.
str lr, [sp, #S_PC] @ Save calling PC
str r8, [sp, #S_PSR] @ Save CPSR
str r0, [sp, #S_OLD_R0] @ Save OLD_R0

#endif
zero_fp
alignment_trap r10, ip, __cr_alignment
enable_irq
ct_user_exit
get_thread_info tsk
/*
* Get the system call number.
*/

#if defined(CONFIG_OABI_COMPAT)
/*
* If we have CONFIG_OABI_COMPAT then we need to look at the swi
* value to determine if it is an EABI or an old ABI call.
*/
#ifdef CONFIG_ARM_THUMB
tst r8, #PSR_T_BIT
movne r10, #0 @ no thumb OABI emulation
USER( ldreq r10, [lr, #-4] ) @ get SWI instruction
#else
USER( ldr r10, [lr, #-4] ) @ get SWI instruction
/*r10中存放的就是引起软中断的那条指令的机器码
发生软中断的时候,系统自动将PC-4存放到了lr寄存器,由于是三级流水,
并且是ARM状态,还需要减4才能得到发生软中断的那条指令的机器码所在的地址
*/
#endif
ARM_BE8(rev r10, r10) @ little endian instruction

#elif defined(CONFIG_AEABI)
/*
 * Pure EABI user space always put syscall number into scno (r7).
 */
#elif defined(CONFIG_ARM_THUMB)
/* Legacy ABI only, possibly thumb mode. */
tst r8, #PSR_T_BIT @ this is SPSR from save_user_regs
addne scno, r7, #__NR_SYSCALL_BASE @ put OS number in
USER( ldreq scno, [lr, #-4] )

#else
/* Legacy ABI only. */
USER( ldr scno, [lr, #-4] ) @ get SWI instruction
#endif
uaccess_disable tbl
adr tbl, sys_call_table @ load syscall table pointer
/* 此时tbl(r8)中存放的就是sys_call_table的起始地址 */

#if defined(CONFIG_OABI_COMPAT)
/*
* If the swi argument is zero, this is an EABI call and we do nothing.
*
* If this is an old ABI call, get the syscall number into scno and
* get the old ABI syscall table address.
*/
bics r10, r10, #0xff000000
eorne scno, r10, #__NR_OABI_SYSCALL_BASE
ldrne tbl, =sys_oabi_call_table
#elif !defined(CONFIG_AEABI)
bic scno, scno, #0xff000000 @ mask off SWI op-code
eor scno, scno, #__NR_SYSCALL_BASE @ check OS number

#endif

local_restart:
ldr r10, [tsk, #TI_FLAGS] @ check for syscall tracing
stmdb sp!, {r4, r5} @ push fifth and sixth args
tst r10, #_TIF_SYSCALL_WORK @ are we tracing syscalls?
bne __sys_trace
invoke_syscall tbl, scno, r10, ret_fast_syscall
add r1, sp, #S_OFF
2: cmp scno, #(__ARM_NR_BASE - __NR_SYSCALL_BASE)
eor r0, scno, #__NR_SYSCALL_BASE @ put OS number back
bcs arm_syscall
mov why, #0 @ no longer a real syscall
b sys_ni_syscall @ not private func

#if defined(CONFIG_OABI_COMPAT) || !defined(CONFIG_AEABI)
/*
* We failed to handle a fault trying to access the page
* containing the swi instruction, but we’re not really in a
* position to return -EFAULT. Instead, return back to the
* instruction and re-enter the user fault handling path trying
* to page it in. This will likely result in sending SEGV to the
* current task.
*/
9001:
sub lr, lr, #4
str lr, [sp, #S_PC]
b ret_fast_syscall
#endif
ENDPROC(vector_swi)

kernel/arch/arm/kernel/entry-header.S
scno .req r7 @ syscall number
tbl .req r8 @ syscall table pointer
why .req r8 @ Linux syscall (!= 0)
tsk .req r9 @ current thread_info
.req 是汇编伪指令,以 scno .req r7 为例,表示scno是寄存器r7的别名。

get_thread_info tsk
其中,tsk是寄存器r9的别名,get_thread_info是一个宏定义,如下:
1: .macro get_thread_info, rd
2: mov \rd, sp, lsr #13
3: mov \rd, \rd, lsl #13
4: .endm
即:将sp进行8KB对齐后的值赋给寄存器r9,什么意思?
这个就涉及到Linux的内核栈了。Linux为每个进程都分配了一个8KB的内核栈,在内核栈的尾端(即这8KB区域的低地址端)存放有关于这个进程的struct thread_info结构:
1: struct thread_info {
2: unsigned long flags; /* low level flags */
3: int preempt_count; /* 0 => preemptable, <0 => bug */
4: mm_segment_t addr_limit; /* address limit */
5: struct task_struct *task; /* main task structure */
6: struct exec_domain *exec_domain; /* execution domain */
7: __u32 cpu; /* cpu */
8: __u32 cpu_domain; /* cpu domain */
9: struct cpu_context_save cpu_context; /* cpu context */
10: __u32 syscall; /* syscall number */
11: __u8 used_cp[16]; /* thread used copro */
12: unsigned long tp_value;
13: struct crunch_state crunchstate;
14: union fp_state fpstate __attribute__((aligned(8)));
15: union vfp_state vfpstate;
16: #ifdef CONFIG_ARM_THUMBEE
17: unsigned long thumbee_state; /* ThumbEE Handler Base register */
18: #endif
19: struct restart_block restart_block;
20: };
通过上面的操作,寄存器r9中就是这个进程的thread_info结构的起始地址。
在linux内核中进程以及线程(多线程也是通过一组轻量级进程实现的)都是通过task_struct结构体来描述的,我们称它为进程描述符。而thread_info则是一个与进程描述符相关的小数据结构,它同进程的内核态栈stack存放在一个单独为进程分配的内存区域。由于这个内存区域同时保存了thread_info和stack,所以使用了联合体来定义,相关数据结构如下:
thread_union联合体定义:
union thread_union {
struct thread_info thread_info;
unsigned long stack[THREAD_SIZE/sizeof(long)];
};
这样设计的好处就是,得到stack,thread_info或task_struct任意一个数据结构的地址,就可以很快得到另外两个数据的地址。

kernel/arch/arm/kernel/entry-common.S

ENTRY(sys_call_table)
#include "calls.S"
#undef ABI
#undef OBSOLETE

/*============================================================================
 * Special system call wrappers
 */
    @ r0 = syscall number
    @ r8 = syscall table
    sys_syscall:
    bic scno, r0, #__NR_OABI_SYSCALL_BASE
    cmp scno, #__NR_syscall - __NR_SYSCALL_BASE
    cmpne scno, #NR_syscalls @ check range
    #ifdef CONFIG_CPU_SPECTRE
    movhs scno, #0
    csdb
    #endif
    stmloia sp, {r5, r6} @ shuffle args
    movlo r0, r1
    movlo r1, r2
    movlo r2, r3
    movlo r3, r4
    ldrlo pc, [tbl, scno, lsl #2]
    b sys_ni_syscall
    ENDPROC(sys_syscall)

kernel/arch/arm/kernel/calls.S
/* 0 */ CALL(sys_restart_syscall)
CALL(sys_exit)
CALL(sys_fork)
CALL(sys_read)
CALL(sys_write)
/* 5 */ CALL(sys_open)
......(中间省略)
/* 320 */ CALL(sys_get_mempolicy)
CALL(sys_set_mempolicy)
CALL(sys_openat)

对应__openat()的是sys_openat()
去内核源代码里搜索sys_openat的定义如下:
kernel/include/linux/syscalls.h
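//asmlinkage是一个宏:在x86-32上它展开为__attribute__((regparm(0))),表示函数的参数全部从栈中读取;在ARM上它不改变调用约定,前4个参数仍然通过r0-r3传递,第5、6个参数则由vector_swi中的stmdb sp!, {r4, r5}压入栈中。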
asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
umode_t mode);

内核源代码里搜索sys_openat的实现(通过SYSCALL_DEFINE4(openat, ...)宏定义)如下:
kernel/fs/open.c
SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
umode_t, mode)
{
if (force_o_largefile())
flags |= O_LARGEFILE;
return do_sys_open(dfd, filename, flags, mode);
}

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值