Linux5.x 系统调用过程学习记录
简介
进程使用标准库例程,库例程接下来调用内核函数,最终,由内核负责在各个请求进程之间公平而且流畅地共享资源和服务
演示时使用的是虚拟机,并且事先做了快照备份
虚拟机环境
uname -a
Linux plr 5.4.0-148-generic #165-Ubuntu SMP Tue Apr 18 08:53:12 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description: Ubuntu 20.04 LTS
Release: 20.04
Codename: focal
gcc 版本
gcc -v
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/9/lto-wrapper
OFFLOAD_TARGET_NAMES=nvptx-none:hsa
OFFLOAD_TARGET_DEFAULT=1
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu 9.4.0-1ubuntu1~20.04.1' --with-bugurl=file:///usr/share/doc/gcc-9/README.Bugs --enable-languages=c,ada,c++,go,brig,d,fortran,objc,obj-c++,gm2 --prefix=/usr --with-gcc-major-version-only --program-suffix=-9 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-plugin --enable-default-pie --with-system-zlib --with-target-system-zlib=auto --enable-objc-gc=auto --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none=/build/gcc-9-Av3uEd/gcc-9-9.4.0/debian/tmp-nvptx/usr,hsa --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
Thread model: posix
gcc version 9.4.0 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
ldd --version
ldd (Ubuntu GLIBC 2.31-0ubuntu9.7) 2.31
Copyright (C) 2020 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
Written by Roland McGrath and Ulrich Drepper.
用户态
测试文件test1.c
#include <stdio.h>
int main() /* Demo: write "test\n" to test.txt through stdio so the resulting syscalls can be traced. */
{
    FILE *fp = NULL;
    // "w": open a text file for writing; the file is created when it does not exist
    fp = fopen("test.txt", "w");
    if (fp == NULL) {
        /* fopen failed (e.g. directory not writable): never pass NULL to fprintf */
        perror("fopen");
        return 1;
    }
    fprintf(fp, "test\n");
    fflush(fp);
    fclose(fp);
    return 0;
}
测试文件test2.c
#include <fcntl.h>
#include <unistd.h>
int main() { // Demo: write 4 bytes to test.txt through the POSIX wrappers directly (no stdio layer)
int fd = open("test.txt", O_WRONLY | O_CREAT | O_TRUNC, 0666); // create or truncate test.txt, write-only
write(fd, "test", 4); // NOTE(review): neither fd nor the write() result is checked - tolerable only for a tracing demo
fsync(fd); // ask the kernel to flush the file's data to storage
close(fd);
return 0;
}
编译
gcc test1.c -o test1
ldd test1 # ldd 用于打印程序或者库文件所依赖的共享库列表
linux-vdso.so.1 (0x00007ffdcf191000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f3ac1903000)
/lib64/ld-linux-x86-64.so.2 (0x00007f3ac1b03000)
gcc test2.c -o test2
ldd test2
linux-vdso.so.1 (0x00007ffd72d98000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007faebe85c000)
/lib64/ld-linux-x86-64.so.2 (0x00007faebea5c000)
strace 最终系统调用
# 对于test, 记得加./test
which test
/usr/bin/test
# -e trace=file 跟踪和文件访问相关的调用(参数中有文件名)
strace -e trace=file,write ./test1
execve("./test1", ["./test1"], 0x7ffc5fbea7e0 /* 30 vars */) = 0
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
openat(AT_FDCWD, "test.txt", O_WRONLY|O_CREAT|O_TRUNC, 0666) = 3
write(3, "test\n", 5) = 5
strace -e trace=file,write ./test2
execve("./test2", ["./test2"], 0x7ffcfcc015d0 /* 30 vars */) = 0
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
openat(AT_FDCWD, "test.txt", O_WRONLY|O_CREAT|O_TRUNC, 0666) = 3
write(3, "test", 4)
gdb 调试
# 安装的debug版c库的动态链接文件
sudo apt install -y libc6-dbg
# 重新编译test1
gcc -g -Wall test1.c -o test1 -Wl,-rpath=/usr/lib/debug
# 链接库的指向位置发生变化
ldd test1
linux-vdso.so.1 (0x00007ffc847dd000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007fa913452000)
/lib64/ld-linux-x86-64.so.2 (0x00007fa913652000)
gdb ./test1
(gdb) n
7 fp = fopen("test.txt", "w");
(gdb) s
_IO_new_fopen (filename=0x555555556006 "test.txt", mode=0x555555556004 "w") at iofopen.c:85
85 iofopen.c: No such file or directory.
(gdb) s
86 in iofopen.c
(gdb) s
__fopen_internal (is32=1, mode=0x555555556004 "w", filename=0x555555556006 "test.txt") at iofopen.c:58
58 in iofopen.c
(gdb) s
[Inferior 1 (process 26284) exited normally]
gcc -g -Wall test2.c -o test2 -Wl,-rpath=/usr/lib/debug
gdb ./test2
(gdb) n
5 int fd = open("test.txt", O_WRONLY | O_CREAT | O_TRUNC, 0666);
(gdb) s
__libc_open64 (file=0x555555556004 "test.txt", oflag=577) at ../sysdeps/unix/sysv/linux/open64.c:37
37 ../sysdeps/unix/sysv/linux/open64.c: No such file or directory.
(gdb) s
40 in ../sysdeps/unix/sysv/linux/open64.c
(gdb) s
43 in ../sysdeps/unix/sysv/linux/open64.c
(gdb) s
45 in ../sysdeps/unix/sysv/linux/open64.c
(gdb) s
48 in ../sysdeps/unix/sysv/linux/open64.c
(gdb) s
main () at test2.c:6
6 write(fd, "test", 4);
(gdb) s
__GI___libc_write (fd=3, buf=0x55555555600d, nbytes=4) at ../sysdeps/unix/sysv/linux/write.c:25
25 ../sysdeps/unix/sysv/linux/write.c: No such file or directory.
(gdb) s
26 in ../sysdeps/unix/sysv/linux/write.c
glibc 库函数
从上面的 test2 调用__libc_open64,在glibc查找实现
// https://elixir.bootlin.com/glibc/glibc-2.31/source/sysdeps/unix/sysv/linux/open64.c#L36
/* glibc's open(): implemented as an openat system call relative to AT_FDCWD. */
int __libc_open64 (const char *file, int oflag, ...)
{
int mode = 0;
/* The variadic mode argument is read only when __OPEN_NEEDS_MODE (oflag) holds. */
if (__OPEN_NEEDS_MODE (oflag))
{
va_list arg;
va_start (arg, oflag);
mode = va_arg (arg, int);
va_end (arg);
}
/* SYSCALL_CANCEL is the macro that ultimately issues the openat system call. */
return SYSCALL_CANCEL (openat, AT_FDCWD, file, oflag | EXTRA_OPEN_FLAGS,
mode);
}
宏SYSCALL_CANCEL 展开过程
综合 sysdeps/unix/sysdep.h 通过接口
和特定架构 sysdeps/unix/sysv/linux/x86_64/sysdep.h 所使用的宏
/* Issue a 4-argument system call as a GNU statement expression.
   The call number goes in through rax (the "0" constraint ties it to the "=a" output),
   arguments 1-4 are pinned to rdi, rsi, rdx and r10, and the value found in rax after
   the `syscall` instruction is the value of the expression.  `err` is not used. */
#undef internal_syscall4
#define internal_syscall4(number, err, arg1, arg2, arg3, arg4) \
({ \
unsigned long int resultvar; \
TYPEFY (arg4, __arg4) = ARGIFY (arg4); \
TYPEFY (arg3, __arg3) = ARGIFY (arg3); \
TYPEFY (arg2, __arg2) = ARGIFY (arg2); \
TYPEFY (arg1, __arg1) = ARGIFY (arg1); \
register TYPEFY (arg4, _a4) asm ("r10") = __arg4; \
register TYPEFY (arg3, _a3) asm ("rdx") = __arg3; \
register TYPEFY (arg2, _a2) asm ("rsi") = __arg2; \
register TYPEFY (arg1, _a1) asm ("rdi") = __arg1; \
asm volatile ( \
"syscall\n\t" \
: "=a" (resultvar) \
: "0" (number), "r" (_a1), "r" (_a2), "r" (_a3), "r" (_a4) \
: "memory", REGISTERS_CLOBBERED_BY_SYSCALL); \
(long int) resultvar; \
})
/* errno handling: a raw result in [-4095, -1] (compared as unsigned) is an error,
   and its negation is the errno value. */
# define __set_errno(val) (errno = (val))
# define INTERNAL_SYSCALL_ERRNO(val, err) (-(val))
# define INTERNAL_SYSCALL_ERROR_P(val, err) \
((unsigned long int) (long int) (val) >= -4095L)
/* Turn a syscall name into its number macro, e.g. openat -> __NR_openat. */
#define SYS_ify(syscall_name) __NR_##syscall_name
/* Select internal_syscall<nr> by pasting the argument count onto the name. */
#define INTERNAL_SYSCALL(name, err, nr, args...) \
internal_syscall##nr (SYS_ify (name), err, args)
/* Raw syscall plus libc convention: on error set errno and yield -1. */
# define INLINE_SYSCALL(name, nr, args...) \
({ \
unsigned long int resultvar = INTERNAL_SYSCALL (name, , nr, args); \
if (__glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (resultvar, ))) \
{ \
__set_errno (INTERNAL_SYSCALL_ERRNO (resultvar, )); \
resultvar = (unsigned long int) -1; \
} \
(long int) resultvar; })
/* Two-level concatenation so that arguments are macro-expanded before pasting. */
#define __SYSCALL_CONCAT_X(a,b) a##b
#define __SYSCALL_CONCAT(a,b) __SYSCALL_CONCAT_X (a, b)
/* NOTE(review): __INTERNAL_SYSCALL_DISP is not defined in this excerpt. */
#define INTERNAL_SYSCALL_CALL(...) \
__INTERNAL_SYSCALL_DISP (__INTERNAL_SYSCALL, __VA_ARGS__)
#define __INLINE_SYSCALL4(name, a1, a2, a3, a4) \
INLINE_SYSCALL (name, 4, a1, a2, a3, a4)
// Count the system-call arguments: the 9th parameter (n) is the result
#define __INLINE_SYSCALL_NARGS_X(a,b,c,d,e,f,g,h,n,...) n
// e.g. __INLINE_SYSCALL_NARGS_X(1,2,3,4,5,7,6,5,4,3,2,1,0,) selects n = 4 (name plus four arguments)
#define __INLINE_SYSCALL_NARGS(...) \
__INLINE_SYSCALL_NARGS_X (__VA_ARGS__,7,6,5,4,3,2,1,0,)
/* Dispatch to __INLINE_SYSCALL<N>, where N is the computed argument count. */
#define __INLINE_SYSCALL_DISP(b,...) \
__SYSCALL_CONCAT (b,__INLINE_SYSCALL_NARGS(__VA_ARGS__))(__VA_ARGS__)
#define INLINE_SYSCALL_CALL(...) \
__INLINE_SYSCALL_DISP (__INLINE_SYSCALL, __VA_ARGS__)
/* As written in this excerpt, SYSCALL_CANCEL simply evaluates INLINE_SYSCALL_CALL and yields its result. */
#define SYSCALL_CANCEL(...) \
({ \
long int sc_ret; \
sc_ret = INLINE_SYSCALL_CALL (__VA_ARGS__); \
sc_ret; \
})
SYSCALL_CANCEL (openat, AT_FDCWD, file, oflag | EXTRA_OPEN_FLAGS, mode);
然后使用gcc -E -o test.i test.c
只进行预处理
#define TYPEFY(X, name) __typeof__ ((X) - (X)) name
#define ARGIFY(X) ((__typeof__ ((X) - (X))) (X))
/* Expansion of SYSCALL_CANCEL (openat, AT_FDCWD, file, oflag | EXTRA_OPEN_FLAGS, mode);
   TYPEFY/ARGIFY are left unexpanded for readability. */
({
    long int sc_ret;
    sc_ret = ({
        /* Inner statement expression: the raw syscall (internal_syscall4). */
        unsigned long int resultvar = ({
            unsigned long int resultvar;
            TYPEFY(mode, __arg4) = ARGIFY(mode);
            TYPEFY(oflag | EXTRA_OPEN_FLAGS, __arg3) = ARGIFY(oflag | EXTRA_OPEN_FLAGS);
            TYPEFY(file, __arg2) = ARGIFY(file);
            TYPEFY(AT_FDCWD, __arg1) = ARGIFY(AT_FDCWD);
            register TYPEFY(mode, _a4) asm("r10") = __arg4;
            register TYPEFY(oflag | EXTRA_OPEN_FLAGS, _a3) asm("rdx") = __arg3;
            register TYPEFY(file, _a2) asm("rsi") = __arg2;
            register TYPEFY(AT_FDCWD, _a1) asm("rdi") = __arg1;
            asm volatile("syscall\n\t"
                         : "=a" (resultvar)
                         : "0" (__NR_openat),
                           "r" (_a1), "r" (_a2), "r" (_a3), "r" (_a4)
                         : "memory", REGISTERS_CLOBBERED_BY_SYSCALL);
            (long int) resultvar;
        });
        /* A raw result in [-4095, -1] is an error: store its negation in errno, yield -1. */
        if (__glibc_unlikely(((unsigned long int)(long int)(resultvar) >= -4095L))) {
            (errno = ((-(resultvar))));
            resultvar = (unsigned long int) -1;
        }
        (long int) resultvar;
    });
    sc_ret;
});
// x86 64位定义
#define __NR_openat 257
#define __NR_readlink 89
// aarch64 定义
#define __NR_openat 56
linux 自身库函数搜索路径
比如使用内联汇编代码自己调用系统调用函数
参考 Linux 系统调用号列表、系统调用的三种方式
#include <stdio.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <errno.h>
int main()
{
long rc;
char *file_name = "/etc/passwd" ;
unsigned short mode = 0444;
asm(
"int $0x80"
: "=a" (rc)
: "0" (SYS_chmod), "b" ((long)file_name), "c" ((long)mode)
);
if ((unsigned long)rc >= (unsigned long)-132) {
errno = -rc;
rc = -1;
}
if (rc == -1)
fprintf(stderr, "chmode failed, errno = %d\n" , errno);
else
printf ( "success!\n" );
return 0;
}
通过 gcc -v
输出 gcc 工作的详细过程 , 来查找头文件所在路径
echo '#include <sys/syscall.h> int main(){}' | gcc -E -v - # 进行查看
# 1 "/usr/include/x86_64-linux-gnu/sys/syscall.h" 1 3 4
# 24 "/usr/include/x86_64-linux-gnu/sys/syscall.h" 3 4
# 1 "/usr/include/x86_64-linux-gnu/asm/unistd.h" 1 3 4
# 20 "/usr/include/x86_64-linux-gnu/asm/unistd.h" 3 4
# 1 "/usr/include/x86_64-linux-gnu/asm/unistd_64.h" 1 3 4
# 21 "/usr/include/x86_64-linux-gnu/asm/unistd.h" 2 3 4
# 25 "/usr/include/x86_64-linux-gnu/sys/syscall.h" 2 3 4
# 1 "/usr/include/x86_64-linux-gnu/bits/syscall.h" 1 3 4
# 30 "/usr/include/x86_64-linux-gnu/sys/syscall.h" 2 3 4
查看头文件内容:
在32位x86 Linux系统中,可用的系统调用定义在/usr/include/<arch>/asm/unistd_32.h
头文件中
在64位x86_64 Linux系统中,可用的系统调用定义在/usr/include/<arch>/asm/unistd_64.h
头文件中
// /usr/include/x86_64-linux-gnu/sys/syscall.h
#include <asm/unistd.h>
#include <bits/syscall.h>
// /usr/include/x86_64-linux-gnu/bits/syscall.h
#ifdef __NR_openat
# define SYS_openat __NR_openat
#endif
// /usr/include/x86_64-linux-gnu/asm/unistd_64.h
#define __NR_openat 257
汇编
gcc -o test2 test2.c -static
objdump -S test2 > test2.S
cat test2.S | grep 'syscall' | wc -l
170
通过strace 查看 strace -e trace=file -x -i ./test2
47bea3, 刚好是readlink的系统调用入口
openat的调用入口
进入内核态
以下源码参考:linux-5.10
指令陷入处理
x86
SYSCALL
是在x86-64上进入内核模式的默认方法。该指令在Intel处理器的32位操作模式下不可用。
SYSENTER
是在32位操作模式下最常用的系统调用指令。它与SYSCALL类似,只是用法稍微复杂一些,不过这部分复杂性由内核负责处理。
int 0x80
是调用系统调用的传统方法,应避免使用。
x86_64位系统调用使用 SYSCALL 指令进入内核空间,使CPU切换到ring 0。SYSCALL 指令主要工作为从MSR寄存器加载CS/SS,以及系统调用入口(entry_SYSCALL_64),从而进入系统调用处理流程
从80486之后的x86架构CPU,内部增加了一组新的寄存器,统称为MSR寄存器,中文直译是模型特定寄存器,意思是这些寄存器不像上面列出的寄存器是固定的,这些寄存器可能随着不同的版本有所变化。这些寄存器主要用来支持一些新的功能。
在系统启动时会调用 syscall_init
进行初始化,将 MSR_LSTAR 寄存器设置为函数 entry_SYSCALL_64 的起始地址
// arch/x86/kernel/cpu/common.c L:1749
/*
 * Program the MSRs used for SYSCALL/SYSENTER entry into the kernel.
 * Per the author's notes, these MSRs hold the segment selector, stack top and
 * handler address needed at entry, so no in-memory table lookup is required.
 * May not be marked __init: used by software suspend.
 */
void syscall_init(void)
{
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
// MSR_LSTAR receives the address of entry_SYSCALL_64, the 64-bit SYSCALL entry point
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION
wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
/*
* This only works on Intel CPUs.
*/
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
(unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
// Without IA32 emulation, MSR_CSTAR points at ignore_sysret
wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
// and the three SYSENTER MSRs (code segment, stack pointer, entry address) get an invalid segment / zero
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
#endif
/* Flags to clear on syscall */
wrmsrl(MSR_SYSCALL_MASK,
X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
}
Linux5.10 entry_SYSCALL_64 处理流程
- 程序从用户空间进入内核空间,保存用户态现场,载入内核态的信息,程序工作状态从用户态转变为内核态。
- 根据系统调用号,从系统跳转表中,调用对应的系统调用函数。
- 系统调用函数完成逻辑后,需要从内核空间回到用户空间,程序内核态转变为用户态,需要把之前保存的用户态现场进行恢复。
/*
 * 64-bit SYSCALL entry point.  Builds a struct pt_regs on the kernel stack,
 * calls do_syscall_64(nr, regs) and finally returns to user space.
 * The "...." line marks code omitted from this excerpt.
 */
SYM_CODE_START(entry_SYSCALL_64)
UNWIND_HINT_EMPTY
swapgs
/* tss.sp2 is scratch space. */
/* Stash the user stack pointer in the per-CPU TSS scratch slot (sp2). */
movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
// Leaving the user stack for the kernel stack; registers will be stored into a struct pt_regs there.
// NOTE(review): SWITCH_TO_KERNEL_CR3 presumably switches to the kernel page tables - confirm.
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
// Load the kernel stack top into %rsp
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
/* Construct struct pt_regs on stack */
/* User data-segment selector. */
pushq $__USER_DS /* pt_regs->ss */
/* User stack pointer that was stashed in sp2 above. */
pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */
/* Saved CPU flags. */
pushq %r11 /* pt_regs->flags */
/* User code-segment selector. */
pushq $__USER_CS /* pt_regs->cs */
/* Address of the instruction following syscall (the return address). */
pushq %rcx /* pt_regs->ip */
SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
/* System call number. */
pushq %rax /* pt_regs->orig_ax */
/* Fill the remaining struct pt_regs members from the other registers. */
PUSH_AND_CLEAR_REGS rax=$-ENOSYS
/* IRQs are off. */
/* First argument (%rdi) = system call number, second (%rsi) = pointer to the pt_regs just built. */
movq %rax, %rdi
movq %rsp, %rsi
/* Call do_syscall_64 to run the system-call logic. */
call do_syscall_64 /* returns with IRQs disabled */
....
/* Restore the saved context and return to user space. */
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
popq %rdi
popq %rsp
USERGS_SYSRET64
SYM_CODE_END(entry_SYSCALL_64)
call do_syscall_64
根据系统调用号,调用 系统跳转表(sys_call_table) 对应的函数。
保存系统调用函数执行结果。
恢复程序的工作模式,从内核模式切换回用户模式。
#ifdef CONFIG_X86_64
/*
 * C-level dispatcher called from entry_SYSCALL_64.
 * @nr:   system call number
 * @regs: register snapshot built on the kernel stack by the entry code
 */
__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
nr = syscall_enter_from_user_mode(regs, nr);
instrumentation_begin();
if (likely(nr < NR_syscalls)) {
/* Call the handler registered for this number in the system call table.
* Its return value is stored in regs->ax, from where it is later handed
* back to user space in the rax register. */
/* array_index_nospec clamps the index before it is used for the table lookup. */
nr = array_index_nospec(nr, NR_syscalls);
regs->ax = sys_call_table[nr](regs);
}
instrumentation_end();
syscall_exit_to_user_mode(regs);
}
#endif
struct pt_regs
// arch/x86/include/asm/ptrace.h L:56
/* Register snapshot that the x86-64 entry code builds on the kernel stack. */
struct pt_regs {
/*
* C ABI says these regs are callee-preserved. They aren't saved on kernel entry
* unless syscall needs a complete, fully filled "struct pt_regs".
*/
unsigned long r15;
unsigned long r14;
unsigned long r13;
unsigned long r12;
unsigned long rbp;
unsigned long rbx;
/* These regs are callee-clobbered. Always saved on kernel entry. */
unsigned long r11;
unsigned long r10; /* 4th argument passed to the kernel. */
unsigned long r9; /* 6th argument passed to the kernel. */
unsigned long r8; /* 5th argument passed to the kernel. */
unsigned long ax; /* System call number on entry; receives the handler's return value. */
unsigned long cx; /* Address of the instruction following syscall. */
unsigned long dx; /* 3rd argument passed to the kernel. */
unsigned long si; /* 2nd argument passed to the kernel. */
unsigned long di; /* 1st argument passed to the kernel. */
/*
* On syscall entry, this is syscall#. On CPU exception, this is error code.
* On hw interrupt, it's IRQ number:
*/
/* NOTE(review): the entry code in this document labels this slot pt_regs->orig_ax - confirm the field name against the kernel tree. */
unsigned long orig_rax; /* System call number. */
/* Return frame for iretq
* Data that must be restored when returning from kernel mode to user mode. */
unsigned long ip; /* Address of the instruction following syscall. */
unsigned long cs; /* User-mode code segment selector. */
unsigned long flags; /* User-mode CPU flags. */
unsigned long sp; /* User-mode stack pointer (the stack grows downwards). */
unsigned long ss; /* User-mode data/stack segment selector. */
/* top of stack page */
};
aarch64
用户层进入内核态执行系统调用函数,通过异常方式(库函数完成),将当前系统调用函数的调用号放入x8
寄存器,然后使用svc
指令,发起同步异常。参考系统调用实现原理
Supervisor Call causes an exception to be taken to EL1. On executing an SVC instruction, the PE records the exception as a Supervisor Call exception in ESR_ELx, using the EC value 0x15, and the value of the immediate argument.
// arch/arm64/kernel/entry.S
// Exception vector table entry points (excerpt: no SYM_CODE_END is shown here)
SYM_CODE_START(vectors)
kernel_ventry 1, sync_invalid // Synchronous EL1t
kernel_ventry 1, irq_invalid // IRQ EL1t
kernel_ventry 1, fiq_invalid // FIQ EL1t
kernel_ventry 1, error_invalid // Error EL1t
kernel_ventry 1, sync // Synchronous EL1h
kernel_ventry 1, irq // IRQ EL1h
kernel_ventry 1, fiq_invalid // FIQ EL1h
kernel_ventry 1, error // Error EL1h
kernel_ventry 0, sync // Synchronous 64-bit EL0: entry for synchronous exceptions, which includes system calls
kernel_ventry 0, irq // IRQ 64-bit EL0
kernel_ventry 0, fiq_invalid // FIQ 64-bit EL0
kernel_ventry 0, error // Error 64-bit EL0
// kernel_ventry macro (as excerpted): reserve S_FRAME_SIZE bytes on the stack, then
// branch to the handler label el<el>_<label>
.macro kernel_ventry, el, label, regsize = 64
.align 7
sub sp, sp, #S_FRAME_SIZE
b el\()\el\()_\label // e.g. for "kernel_ventry 0, sync" this expands to: b el0_sync
.endm
// Handler stub for synchronous exceptions taken from 64-bit EL0
SYM_CODE_START_LOCAL_NOALIGN(el0_sync)
kernel_entry 0 // save the user-mode register state
mov x0, sp // first argument: the stack pointer, received by el0_sync_handler as struct pt_regs *
bl el0_sync_handler // C handler for el0_sync
b ret_to_user
SYM_CODE_END(el0_sync)
找到系统调用函数
// arch/arm64/kernel/entry-common.c
/* Dispatch a synchronous EL0 exception by the exception class (EC) read from ESR_EL1. */
asmlinkage void noinstr el0_sync_handler(struct pt_regs *regs) {
unsigned long esr = read_sysreg(esr_el1);
switch (ESR_ELx_EC(esr)) {
// defined under arch/arm64/include/asm:
// #define ESR_ELx_EC_SVC64 (0x15)
case ESR_ELx_EC_SVC64:
el0_svc(regs);
break;
... // other exception classes (omitted in this excerpt)
}
}
/* SVC (system call) path; the "..." marks code omitted in this excerpt. */
static void noinstr el0_svc(struct pt_regs *regs) {
...
do_el0_svc(regs);
}
// arch/arm64/kernel/syscall.c
/* Entry for a 64-bit system call: forwards the number found in regs->regs[8] (x8) to the common path. */
void do_el0_svc(struct pt_regs *regs){
sve_user_discard();
// __NR_syscalls: total number of system calls
// sys_call_table: the system call table, an array of syscall_fn_t function pointers
el0_svc_common(regs, regs->regs[8], __NR_syscalls, sys_call_table);
}
/*
 * Common system-call path ("...." marks code omitted in this excerpt).
 * @scno:          system call number
 * @sc_nr:         number of entries in @syscall_table
 * @syscall_table: table to dispatch through
 */
static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr,
const syscall_fn_t syscall_table[]) {
unsigned long flags = current_thread_info()->flags;
/* Keep a copy of the original x0 and record the syscall number in pt_regs. */
regs->orig_x0 = regs->regs[0];
regs->syscallno = scno;
....
invoke_syscall(regs, scno, sc_nr, syscall_table);
....
}
/* Look up and run the handler for @scno, then store its result in regs->regs[0] (x0). */
static void invoke_syscall(struct pt_regs *regs, unsigned int scno,
unsigned int sc_nr,
const syscall_fn_t syscall_table[])
{
long ret;
if (scno < sc_nr) {
syscall_fn_t syscall_fn;
/* array_index_nospec clamps the index before the table lookup. */
syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)];
ret = __invoke_syscall(regs, syscall_fn);
} else {
// Undefined system call; per the original note this yields -ENOSYS
ret = do_ni_syscall(regs, scno);
}
/* Compat tasks only get the low 32 bits of the result. */
if (is_compat_task())
ret = lower_32_bits(ret);
regs->regs[0] = ret;
}
/* Call the handler with the saved register set and return its result. */
static long __invoke_syscall(struct pt_regs *regs, syscall_fn_t syscall_fn){
return syscall_fn(regs);
}
struct pt_regs 数据结构
// arch/arm64/include/asm/ptrace.h
/* arm64 register snapshot handed to the exception and system-call handlers. */
struct pt_regs {
union {
struct user_pt_regs user_regs;
struct {
u64 regs[31]; /* regs[8] carries the syscall number (see do_el0_svc); regs[0] receives the result (see invoke_syscall) */
u64 sp;
u64 pc;
u64 pstate;
};
};
u64 orig_x0; /* copy of regs[0] taken in el0_svc_common before dispatch */
/* syscallno and its padding swap places depending on endianness (__AARCH64EB__). */
#ifdef __AARCH64EB__
u32 unused2;
s32 syscallno;
#else
s32 syscallno;
u32 unused2;
#endif
u64 orig_addr_limit;
/* Only valid when ARM64_HAS_IRQ_PRIO_MASKING is enabled. */
u64 pmr_save;
u64 stackframe[2];
/* Only valid for some EL1 exceptions. */
u64 lockdep_hardirqs;
u64 exit_rcu;
};
// arch/arm64/include/asm/syscall.h
typedef long (*syscall_fn_t)(const struct pt_regs *regs);
系统调用表
x86
系统调用表 arch/x86/entry/syscalls/syscall_64.tbl
,建立了系统调用号与系统调用函数名的映射关系。脚本会根据这个表,自动生成相关的映射源码
# 64-bit system call numbers and entry vectors
#
# The format is:
# <number> <abi> <name> <entry point>
#
# The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls
#
# The abi is "common", "64" or "x32" for this file.
0 common read sys_read
1 common write sys_write
2 common open sys_open
3 common close sys_close
4 common stat sys_newstat
5 common fstat sys_newfstat
6 common lstat sys_newlstat
7 common poll sys_poll
8 common lseek sys_lseek
9 common mmap sys_mmap
10 common mprotect sys_mprotect
11 common munmap sys_munmap
12 common brk sys_brk
...
257 common openat sys_openat
...
547 x32 pwritev2 compat_sys_pwritev64v2
# This is the end of the legacy x32 range. Numbers 548 and above are
# not special and are not to be used for x32-specific syscalls.
sys_call_table 的定义。#include <asm/syscalls_64.h>
这行源码对应的文件是在内核编译的时候,通过脚本创建的
/* include/generated/asm-offsets.h */
#define __NR_syscall_max 547 /* sizeof(syscalls_64) - 1 */
// arch/x86/entry/syscall_64.c L:18
/* Each __SYSCALL_64(nr, sym, qual) line of the generated header becomes the designated initializer "[nr] = sym,". */
#define __SYSCALL_64(nr, sym, qual) [nr] = sym,
/* x86-64 system call table: every slot first defaults to the "not implemented" stub,
   then the generated header overrides the slots that have a handler. */
asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
[0 ... __NR_syscall_max] = &__x86_sys_ni_syscall,
#include <asm/syscalls_64.h>
};
// arch/x86/entry/syscalls/syscalltbl.sh 脚本类似会生成下面一个文件
// arch/x86/include/generated/asm/syscalls_64.h
#ifdef CONFIG_X86
__SYSCALL_64(0, __x64_sys_read, )
#else /* CONFIG_UML */
__SYSCALL_64(0, sys_read, )
#endif
#ifdef CONFIG_X86
__SYSCALL_64(1, __x64_sys_write, )
#else /* CONFIG_UML */
__SYSCALL_64(1, sys_write, )
#endif
aarch64
// arch/arm64/kernel/sys.c
/* First pass over the syscall list: __SYSCALL emits a prototype for every __arm64_<sym> handler. */
#undef __SYSCALL
#define __SYSCALL(nr, sym) asmlinkage long __arm64_##sym(const struct pt_regs *);
#include <asm/unistd.h>
// On ARM64 the header "asm/unistd.h" is "arch/arm64/include/asm/unistd.h".
/* Second pass: __SYSCALL now emits the designated initializer "[nr] = __arm64_<sym>," for the table. */
#undef __SYSCALL
#define __SYSCALL(nr, sym) [nr] = __arm64_##sym,
/* Every slot defaults to __arm64_sys_ni_syscall and is then overridden by the included list. */
const syscall_fn_t sys_call_table[__NR_syscalls] = {
[0 ... __NR_syscalls - 1] = __arm64_sys_ni_syscall,
#include <asm/unistd.h>
};
// arch/arm64/include/asm/unistd.h
#include <uapi/asm/unistd.h>
#define NR_syscalls (__NR_syscalls)
#define __ARCH_WANT_RENAMEAT
#define __ARCH_WANT_NEW_STAT
#define __ARCH_WANT_SET_GET_RLIMIT
#define __ARCH_WANT_TIME32_SYSCALLS
#define __ARCH_WANT_SYS_CLONE3
#include <asm-generic/unistd.h>
// include/uapi/asm-generic/unistd.h
#define __NR_io_setup 0
__SC_COMP(__NR_io_setup, sys_io_setup, compat_sys_io_setup)
#define __NR_io_destroy 1
__SYSCALL(__NR_io_destroy, sys_io_destroy)
....
#define __NR_syscalls 441
// 系统调用表展开为
const syscall_fn_t sys_call_table[__NR_syscalls] = {
[0 ... __NR_syscalls - 1] = __arm64_sys_ni_syscall,__arm64_compat_sys_io_setup,__arm64_sys_io_destroy,......
};
系统调用宏
注:在 Linux 2.6.28 及更早版本的内核中,IBM/S390、PowerPC、Sparc64 以及 MIPS 等 64 位平台的 ABI 要求:系统调用时,用户空间程序把 32 位参数放入 64 位寄存器时必须做正确的符号扩展。但内核无法保证用户空间程序一定这样做,因此攻击者可以向存在漏洞的系统调用传入特制参数,从而导致系统崩溃或获得权限提升。下面宏中的 __se_sys 包装函数(先以 long 类型接收参数,再强制转换为声明的类型)正是为了解决这个问题。
// arch/arm64/include/asm/syscall_wrapper.h
/*
 * arm64 expansion of SYSCALL_DEFINEx.  For a syscall <name> it generates three functions:
 *   __arm64_sys<name>(regs) - the table entry; unpacks the arguments from pt_regs
 *                             via SC_ARM64_REGS_TO_ARGS and calls __se_sys<name>
 *   __se_sys<name>(...)     - takes the arguments through __SC_LONG, casts them with
 *                             __SC_CAST and calls __do_sys<name>
 *   __do_sys<name>(...)     - carries the declared argument types; its body is the
 *                             block written after the SYSCALL_DEFINEx(...) invocation
 */
#define __SYSCALL_DEFINEx(x, name, ...) \
asmlinkage long __arm64_sys##name(const struct pt_regs *regs); \
ALLOW_ERROR_INJECTION(__arm64_sys##name, ERRNO); \
static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
asmlinkage long __arm64_sys##name(const struct pt_regs *regs) \
{ \
return __se_sys##name(SC_ARM64_REGS_TO_ARGS(x,__VA_ARGS__)); \
} \
static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
{ \
long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
__MAP(x,__SC_TEST,__VA_ARGS__); \
__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
return ret; \
} \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
/* arm64 variant for syscalls without arguments: the handler still receives pt_regs but leaves it unused. */
#define SYSCALL_DEFINE0(sname) \
SYSCALL_METADATA(_##sname, 0); \
asmlinkage long __arm64_sys_##sname(const struct pt_regs *__unused); \
ALLOW_ERROR_INJECTION(__arm64_sys_##sname, ERRNO); \
asmlinkage long __arm64_sys_##sname(const struct pt_regs *__unused)
// include/linux/syscalls.h
/* Generic fallback, used only when the architecture has not defined its own SYSCALL_DEFINE0. */
#ifndef SYSCALL_DEFINE0
#define SYSCALL_DEFINE0(sname) \
SYSCALL_METADATA(_##sname, 0); \
asmlinkage long sys_##sname(void); \
ALLOW_ERROR_INJECTION(sys_##sname, ERRNO); \
asmlinkage long sys_##sname(void)
#endif /* SYSCALL_DEFINE0 */
/* SYSCALL_DEFINE<n>: prefix the name with '_' and forward the argument count n to SYSCALL_DEFINEx. */
#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE_MAXARGS 6
/* Emit the syscall metadata, then the architecture-specific __SYSCALL_DEFINEx expansion. */
#define SYSCALL_DEFINEx(x, sname, ...) \
SYSCALL_METADATA(sname, x, __VA_ARGS__) \
__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
对于 write
系统调用宏展开
#define __NR_write 64 __SYSCALL(__NR_write, sys_write)
asmlinkage long __arm64_sys_write(const struct pt_regs *);
// fs/read_write.c L:667
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) {
return ksys_write(fd, buf, count);
}
x86 处理
#define __SC_DECL(t, a) t a
#define __TYPE_AS(t, v) __same_type((__force t)0, v)
#define __SC_ARGS(t, a) a
#define __MAP0(m, ...)
#define __MAP1(m, t, a, ...) m(t, a)
#define __MAP2(m, t, a, ...) m(t, a), __MAP1(m, __VA_ARGS__)
#define __MAP3(m, t, a, ...) m(t, a), __MAP2(m, __VA_ARGS__)
#define __MAP4(m, t, a, ...) m(t, a), __MAP3(m, __VA_ARGS__)
#define __MAP5(m, t, a, ...) m(t, a), __MAP4(m, __VA_ARGS__)
#define __MAP6(m, t, a, ...) m(t, a), __MAP5(m, __VA_ARGS__)
#define __MAP(n, ...) __MAP##n(__VA_ARGS__)
#define asmlinkage_protect(n, ret, args...) \
do { \
} while (0)
#define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
/* Yield the first x of regs->di, si, dx, r10, r8, r9; the empty "type" slots keep the
   (type, name) pairing that __MAP expects. */
#define SC_X86_64_REGS_TO_ARGS(x, ...) \
__MAP(x, __SC_ARGS, , regs->di, , regs->si, , regs->dx, , regs->r10, , regs->r8, , regs->r9)
/* Declare and define __<abi>_<name>(regs), which calls __se_<name> with the given arguments. */
#define __SYS_STUBx(abi, name, ...) \
long __##abi##_##name(const struct pt_regs *regs); \
long __##abi##_##name(const struct pt_regs *regs) \
{ \
return __se_##name(__VA_ARGS__); \
}
/* x64 flavour: the stub is named __x64_sys<name> and reads its arguments from pt_regs. */
#define __X64_SYS_STUBx(x, name, ...) \
__SYS_STUBx(x64, sys##name, \
SC_X86_64_REGS_TO_ARGS(x, __VA_ARGS__))
// arch/x86/include/asm/syscall_wrapper.h
/*
 * x86 expansion of SYSCALL_DEFINEx.  Generates, in order: declarations of __se_sys<name>
 * and __do_sys<name>, the __x64_sys<name> stub (via __X64_SYS_STUBx), the definition of
 * __se_sys<name> (casts each argument with __SC_CAST and calls __do_sys<name>), and the
 * header of __do_sys<name>, whose body is the block following the macro invocation.
 */
#define __SYSCALL_DEFINEx(x, name, ...) \
static long __se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)); \
static inline long __do_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__)); \
__X64_SYS_STUBx(x, name, __VA_ARGS__) static long __se_sys##name(__MAP(x, __SC_LONG, __VA_ARGS__)) \
{ \
long ret = __do_sys##name(__MAP(x, __SC_CAST, __VA_ARGS__)); \
__MAP(x, __SC_TEST, __VA_ARGS__); \
__PROTECT(x, ret, __MAP(x, __SC_ARGS, __VA_ARGS__)); \
return ret; \
} \
static inline long __do_sys##name(__MAP(x, __SC_DECL, __VA_ARGS__))
// include/linux/syscalls.h L:222
#define SYSCALL_DEFINEx(x, sname, ...) \
__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
// include/linux/syscalls.h L:215
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
size_t, count)
{
return ksys_write(fd, buf, count);
}
通过 gcc -E
预处理后,展开为
// 声明__se_sys_write
static long __se_sys_write(__SC_LONG(unsigned int, fd), __SC_LONG(const char __user *, buf), __SC_LONG(size_t, count));
// 声明__do_sys_write
static inline long __do_sys_write(unsigned int fd,
const char __user *buf, size_t count);
// 声明__x64_sys_write
long __x64_sys_write(const struct pt_regs *regs);
long __x64_sys_write(const struct pt_regs *regs)
{
return __se_sys_write(regs - > di, regs - > si, regs - > dx);
}
static long __se_sys_write(__SC_LONG(unsigned int, fd), __SC_LONG(const char __user *, buf), __SC_LONG(size_t, count))
{
long ret = __do_sys_write(__SC_CAST(unsigned int, fd), __SC_CAST(const char __user *, buf), __SC_CAST(size_t, count));
__SC_TEST(unsigned int, fd), __SC_TEST(const char __user *, buf), __SC_TEST(size_t, count);
return ret;
}
static inline long __do_sys_write(unsigned int fd,
const char __user *buf, size_t count)
{
return ksys_write(fd, buf, count);
}
系统调用传递
__SYSCALL_DEFINEx宏展开后,传给__se_sys_openat函数的参数是SC_X86_64_REGS_TO_ARGS宏
/* Yield the first x system-call arguments from the saved registers, in the order
   di, si, dx, r10, r8, r9.  The empty slots before each register stand in for the
   "type" half of the (type, name) pairs that __MAP consumes.
   The last line must not end in a backslash, or the line after the macro would be
   swallowed into its body. */
#define SC_X86_64_REGS_TO_ARGS(x, ...) \
	__MAP(x,__SC_ARGS \
	,,regs->di,,regs->si,,regs->dx \
	,,regs->r10,,regs->r8,,regs->r9)
内核函数处理
用户程序 -> write -> 系统调用号(1)-> syscall -> 内核跳转表 -> 系统调用函数 (__x64_sys_write) -> 内核处理函数(ksys_write)
// fs/read_write.c L:647
/*
 * Kernel-side implementation behind the write system call.
 * @fd:    file descriptor to write to
 * @buf:   user-space buffer holding the data
 * @count: number of bytes requested
 * Returns the result of vfs_write, or -EBADF when fd does not resolve to a file.
 */
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
loff_t pos, *ppos = file_ppos(f.file);
/* Work on a local copy of the file position so f_pos is only updated after a successful write. */
if (ppos) {
pos = *ppos;
ppos = &pos;
}
ret = vfs_write(f.file, buf, count, ppos);
if (ret >= 0 && ppos)
f.file->f_pos = pos;
fdput_pos(f);
}
return ret;
}
hook 系统调用
Linux Hook系统调用 和 Linux内核系统调用劫持之kallsyms
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/kallsyms.h>
#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>

#include <asm/nops.h>
#include <asm/paravirt.h>
#include <asm/syscall.h>
#include <asm/unistd_64.h>
/* Dummy memory operand shared by the two CR0 accessors below.
   NOTE(review): presumably there so the compiler keeps the CR0 read and write
   ordered relative to each other - confirm. */
static unsigned long __lkm_order;
/* Read and return the current value of the CR0 control register. */
static inline unsigned long lkm_read_cr0(void)
{
unsigned long val;
asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__lkm_order));
return val;
}
/* Write val into the CR0 control register with a plain mov. */
static inline void lkm_write_cr0(unsigned long val)
{
asm volatile("mov %0,%%cr0": : "r" (val), "m" (__lkm_order));
}
/* Handler signature used for the entries of the system call table. */
typedef asmlinkage long (*sys_call_ptr_t)(const struct pt_regs *);
/* Address of sys_call_table, resolved in rootkits_lsm_init. */
static sys_call_ptr_t *call_table;
/* Original openat handler: the hook chains to it and module unload restores it. */
sys_call_ptr_t old_openat;
static asmlinkage long my_openat(const struct pt_regs *regs)
{
int dfd = regs->di;
char __user *filename = (char *)regs->si;
char user_filename[256] = {0};
int ret = raw_copy_from_user(user_filename, filename, sizeof(user_filename));
printk("%s. proc:%s, pid:%d, dfd:%d, filename:[%s], copy ret:%d\n", __func__,
current->group_leader->comm, current->tgid, dfd, user_filename, ret);
return old_openat(regs);
}
/* Clear bit 16 of CR0 so that the system call table slot can be overwritten. */
void disable_write_protection(void)
{
	unsigned long cr0_val = lkm_read_cr0();

	cr0_val &= ~(1UL << 16);
	lkm_write_cr0(cr0_val);
}
/* Set bit 16 of CR0 again, undoing disable_write_protection(). */
void enable_write_protection(void)
{
	unsigned long cr0_val = lkm_read_cr0();

	cr0_val |= 1UL << 16;
	lkm_write_cr0(cr0_val);
}
/*
 * Module entry point: resolve sys_call_table, remember the original openat
 * handler and install my_openat in its slot while CR0 write protection is lifted.
 *
 * NOTE(review): relies on kallsyms_lookup_name() being callable from modules;
 * newer kernels may not export it - confirm for the target kernel.
 *
 * Returns 0 on success, -ENOENT when sys_call_table cannot be resolved.
 */
static int __init rootkits_lsm_init(void)
{
	call_table = (sys_call_ptr_t *)kallsyms_lookup_name("sys_call_table");
	if (!call_table) {
		printk("[info] %s. sys_call_table not found\n", __func__);
		return -ENOENT;
	}
	/* %px prints the raw address; integer conversions are a type mismatch for pointers */
	printk("call_table:%px\n", call_table);

	old_openat = call_table[__NR_openat];
	printk("[info] %s. old_openat:0x%px\n", __func__, old_openat);

	/* Keep the unprotected window as short as possible: only the store itself. */
	disable_write_protection();
	call_table[__NR_openat] = my_openat;
	enable_write_protection();

	printk("fake_openat:%px,sys_openat:%px,table:%px\n",
	       my_openat, old_openat, call_table[__NR_openat]);
	return 0;
}
/* Module exit: put the original openat handler back into the system call table. */
static void __exit root_kits_lsm_uninit(void)
{
/* The store must happen between lifting and re-enabling write protection. */
disable_write_protection();
call_table[__NR_openat] = old_openat;
enable_write_protection();
printk("%s removed.\n",__func__);
}
module_init(rootkits_lsm_init);
module_exit(root_kits_lsm_uninit);
MODULE_LICENSE("GPL");