系统调用由内核分配的一个编号(系统调用号)唯一标识。所有的系统调用都由一处中枢代码处理,根据调用编号和一个静态表,将调用分派到具体的函数。传递的参数也是由中枢代码处理,这样参数的传递独立于实际的系统调用。从用户态到内核态的切换,以及调用分派和参数传递,都是由汇编语言代码实现的。为容许用户态和内核态之间的切换,用户进程必须通过一条专用的机器指令,引起处理器/内核对该进程的关注,这需要 C 标准库的协助。内核也必须提供一个例程,来满足切换请求并执行相关操作。该例程不能在用户空间中实现,因为其中需要执行普通应用程序不允许执行的指令。
系统调用表 (armV7)
1 /*
2 * linux/arch/arm/kernel/calls.S
3 *
4 * Copyright (C) 1995-2005 Russell King
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * This file is included thrice in entry-common.S
11 */
12 /* 0 */ CALL(sys_restart_syscall)
13 CALL(sys_exit)
14 CALL(sys_fork_wrapper)
15 CALL(sys_read)
16 CALL(sys_write)
17 /* 5 */ CALL(sys_open)
18 CALL(sys_close)
19 CALL(sys_ni_syscall) /* was sys_waitpid */
20 CALL(sys_creat)
21 CALL(sys_link)
22 /* 10 */ CALL(sys_unlink)
23 CALL(sys_execve_wrapper)
24 CALL(sys_chdir)
25 CALL(OBSOLETE(sys_time)) /* used by libc4 */
以系统调用 open() 函数为例:
1,X86 平台:
1,用户空间 1,函数 open() 的声明:在使用 open() 函数时,要 include <fcntl.h>
<glibc-2.17\include\fcntl.h>
1 #ifndef _FCNTL_H
2 #include <io/fcntl.h>
3 #ifndef _ISOMAC
4 /* Now define the internal interfaces. */
5 extern int __open64 (const char *__file, int __oflag, ...);
6 libc_hidden_proto (__open64)
7 extern int __libc_open64 (const char *file, int oflag, ...);
8 extern int __libc_open (const char *file, int oflag, ...);
9 libc_hidden_proto (__libc_open)
10 extern int __libc_creat (const char *file, mode_t mode);
11 extern int __libc_fcntl (int fd, int cmd, ...);
12 ...
<glibc-2.17\io\fcntl.h>
1 ...
2 /* Open FILE and return a new file descriptor for it, or -1 on error.
3 OFLAG determines the type of access used. If O_CREAT is on OFLAG,
4 the third argument is taken as a `mode_t', the mode of the created file.
5 This function is a cancellation point and therefore not marked with
6 __THROW. */
7 #ifndef __USE_FILE_OFFSET64
8 extern int open (const char *__file, int __oflag, ...) __nonnull ((1));
9 #else
10 # ifdef __REDIRECT
11 extern int __REDIRECT (open, (const char *__file, int __oflag, ...), open64)
12 __nonnull ((1));
13 # else
14 # define open open64
15 # endif
16 #endif
17 #ifdef __USE_LARGEFILE64
18 extern int open64 (const char *__file, int __oflag, ...) __nonnull ((1));
19 #endif
20 ...
1 /* Define a macro which expands inline into the wrapper code for a system
2 call. */
3 # undef INLINE_SYSCALL
4 # define INLINE_SYSCALL(name, nr, args...) \
5 ({ \
6 unsigned long int resultvar = INTERNAL_SYSCALL (name, , nr, args); \
7 if (__builtin_expect (INTERNAL_SYSCALL_ERROR_P (resultvar, ), 0)) \
8 { \
9 __set_errno (INTERNAL_SYSCALL_ERRNO (resultvar, )); \
10 resultvar = (unsigned long int) -1; \
11 } \
12 (long int) resultvar; })
1 ENTRY(system_call)
2 RING0_INT_FRAME # cant unwind into user space anyway
3 pushl %eax # save orig_eax ,将系统调用号压入栈中
4 CFI_ADJUST_CFA_OFFSET 4
5 SAVE_ALL #将寄存器的值压入堆栈当中,压入堆栈的顺序对应着结构体struct pt_regs ,
6 #压栈完成后,栈上的这些值正好构成一个struct pt_regs,C函数可以通过该结构体的成员访问它们,
7 #从而实现汇编代码向C程序传递参数
8
9 GET_THREAD_INFO(%ebp)
10 # system call tracing in operation / emulation
11 #GET_THREAD_INFO宏获得当前进程的thread_info结构的地址,获取当前进程的信息。
12 #检查thread_info结构中flags字段的_TIF_SYSCALL_TRACE或_TIF_SYSCALL_AUDIT
13 #是否被置1。如果发生被跟踪的情况则转向相应的处理命令处。
14 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
15 jnz syscall_trace_entry #比较结果不为零的时候跳转。
16 #对用户态进程传递过来的系统调用号的合法性进行检查
17 #如果不合法则跳到syscall_badsys标记的命令处。
18 cmpl $(nr_syscalls), %eax
19 jae syscall_badsys #系统调用号大于或者等于系统调用总数(nr_syscalls)的时候跳转,不合法
20 #合法则跳转到相应系统调用号所对应的服务例程当中,
21 #也就是在sys_call_table表中找到了相应的函数入口点。
22 #由于sys_call_table表的表项占4字节,因此获得服务例程指针的具体方法
23 #是将由eax保存的系统调用号乘以4再与sys_call_table表的基址相加。
24 syscall_call:
25 call *sys_call_table(,%eax,4)
26 movl %eax,PT_EAX(%esp) # store the return value 将返回值保存到栈上的PT_EAX位置。
<arch\x86\include\asm\ptrace.h>
1 struct pt_regs {
2 unsigned long bx;
3 unsigned long cx;
4 unsigned long dx;
5 unsigned long si;
6 unsigned long di;
7 unsigned long bp;
8 unsigned long ax;
9 unsigned long ds;
10 unsigned long es;
11 unsigned long fs;
12 unsigned long gs;
13 unsigned long orig_ax;
14 unsigned long ip;
15 unsigned long cs;
16 unsigned long flags;
17 unsigned long sp;
18 unsigned long ss;
19 };
1 /*
2 * Name resolution.
3 * This is the basic name resolution function, turning a pathname into
4 * the final dentry. We expect 'base' to be positive and a directory.
5 *
6 * Returns 0 and nd will have valid dentry and mnt on success.
7 * Returns error and drops reference to input namei data on failure.
8 */
9 static int link_path_walk(const char *name, struct nameidata *nd)
10 {
11 struct path next;
12 int err;
13
14 ...
15 //查找文件
16 err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
17 if (err < 0)
18 return err;
19 ...
20 }
walk_component():
1 static inline int walk_component(struct nameidata *nd, struct path *path,
2 struct qstr *name, int type, int follow)
3 {
4 struct inode *inode;
5 int err;
6 /*
7 * "." and ".." are special - ".." especially so because it has
8 * to be able to know about the current root directory and
9 * parent relationships.
10 */
11 ...
12 //查找文件的具体实现
13 err = do_lookup(nd, name, path, &inode);
14 if (unlikely(err)) {
15 terminate_walk(nd);
16 return err;
17 }
18 ...
19 }