简介
execve syscall 做了什么
重要参考
Linux 进程启动 execve 系统调用内核源码解析
execve 一次简单的跟踪
入口
/*
 * execve(2) system call entry point.
 *
 * Takes three user-space pointers (path, argument vector, environment
 * vector), passes the path through getname() and forwards everything to
 * do_execve(). argv and envp are handed over untouched, still as
 * user-space pointers.
 */
SYSCALL_DEFINE3(execve,
const char __user *, filename,
const char __user *const __user *, argv,
const char __user *const __user *, envp)
{
return do_execve(getname(filename), argv, envp);
}
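SYSCALL_DEFINE3 是一个宏定义,表示这个 syscall 有三个参数;宏展开后实际的函数名是 __do_sys_execve(外层还有 __se_sys_execve 以及各架构的入口包装,例如 x86-64 上的 __x64_sys_execve)
- 还有个 execveat 系统调用,区别仅仅是多了 dirfd 和 flags 两个参数:filename 为相对路径时,可以指定相对哪个目录(dirfd)解析,而不是只能相对 cwd;execve 相当于固定传入 AT_FDCWD、flags 为 0(见下面的 do_execve)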
/*
 * Thin wrapper behind the execve() syscall.
 *
 * Packs the raw user-space argv/envp pointers into struct user_arg_ptr
 * (using the .ptr.native member) and calls do_execveat_common() with
 * AT_FDCWD as the directory fd and 0 as flags.
 *
 * Returns whatever do_execveat_common() returns.
 */
static int do_execve(struct filename *filename,
const char __user *const __user *__argv,
const char __user *const __user *__envp)
{
struct user_arg_ptr argv = { .ptr.native = __argv };
struct user_arg_ptr envp = { .ptr.native = __envp };
return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
}
ftrace
对bash中敲ls跟踪 部分不重要的地方已省略…
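抓的是 do_execveat_common,因为 do_sys_execve(实际符号名是 __do_sys_execve)和 do_execve 都不让抓,会报错。
原因大概是:这两个函数都被编译器内联掉了(SYSCALL_DEFINE 生成的 __do_sys_* 是 static inline,do_execve 是很小的 static 函数),内核里没有对应的独立符号;ftrace 只能跟踪 available_filter_functions 中列出的函数,可以用 grep execve /sys/kernel/tracing/available_filter_functions 确认。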
# perf ftrace --graph-opts depth=7 -a -G do_execveat_common
# tracer: function_graph
#
# CPU DURATION FUNCTION CALLS
# | | | | | | |
1) | do_execveat_common() {
1) | alloc_bprm() {
1) | ...
1) + 13.133 us | }
1) | copy_string_kernel() {
...
1) + 14.515 us | }
1) | copy_strings.isra.0() {
...
1) + 66.078 us | }
1) | copy_strings.isra.0() {
...
1) 6.495 us | }
1) | bprm_execve() {
1) 0.188 us | mutex_lock_interruptible();
1) | prepare_exec_creds() {
...
1) 3.646 us | }
1) | bprm_execve.part.0() {
1) | check_unsafe_exec() {
...
1) 3.999 us | }
1) | do_open_execat() {
...
1) + 57.539 us | }
1) | sched_exec() {
...
1) 3.658 us | }
1) | security_bprm_creds_for_exec() {
...
1) + 13.107 us | }
1) | exec_binprm() {
1) | search_binary_handler() {
1) | kernel_read() {
...
1) ! 445.465 us | }
1) | security_bprm_check() {
1) 1.134 us | ima_bprm_check();
1) 1.431 us | }
1) 0.229 us | _raw_read_lock();
1) 0.407 us | try_module_get();
1) 0.124 us | load_script();
1) 0.119 us | _raw_read_lock();
1) 0.094 us | module_put();
1) 0.095 us | try_module_get();
1) | load_elf_binary() {
1) 3.002 us | load_elf_phdrs();
1) 0.263 us | __kmalloc();
1) 0.933 us | kernel_read();
1) + 12.149 us | open_exec();
1) 0.207 us | irq_enter_rcu();
1) 0.532 us | __sysvec_irq_work();
1) 0.230 us | irq_exit_rcu();
1) 0.283 us | kfree();
1) 0.413 us | would_dump();
1) 0.314 us | kmem_cache_alloc_trace();
1) 1.875 us | kernel_read();
1) 1.352 us | load_elf_phdrs();
1) ! 273.565 us | begin_new_exec();
1) 0.418 us | irq_enter_rcu();
1) 0.336 us | __sysvec_irq_work();
1) 0.169 us | irq_exit_rcu();
1) 0.368 us | set_personality_64bit();
1) 0.844 us | setup_new_exec();
1) 0.152 us | randomize_stack_top();
1) + 10.054 us | setup_arg_pages();
1) 0.168 us | arch_mmap_rnd();
1) 0.410 us | total_mapping_size();
1) 6.781 us | elf_map();
1) 1.867 us | elf_map();
1) 1.614 us | elf_map();
1) 1.853 us | elf_map();
1) 1.048 us | set_brk();
1) ! 558.256 us | clear_user();
1) 0.332 us | irq_enter_rcu();
1) 0.541 us | __sysvec_irq_work();
1) 0.250 us | irq_exit_rcu();
1) + 43.014 us | load_elf_interp.constprop.0();
1) 0.237 us | irq_enter_rcu();
1) 0.521 us | __sysvec_irq_work();
1) 0.318 us | irq_exit_rcu();
1) 0.145 us | fput();
1) 0.398 us | kfree();
1) 0.287 us | kfree();
1) 0.327 us | kfree();
1) 0.345 us | set_binfmt();
1) 2.448 us | arch_setup_additional_pages();
1) 8.669 us | create_elf_tables();
1) 0.294 us | arch_randomize_brk();
1) 0.338 us | finalize_exec();
1) 0.186 us | start_thread();
1) ! 951.498 us | }
1) 0.170 us | _raw_read_lock();
1) 0.140 us | module_put();
1) # 1441.605 us | } /* search_binary_handler */
1) 0.321 us | proc_exec_connector();
1) # 1444.211 us | }
1) | acct_update_integrals() {
1) 0.109 us | task_cputime();
1) 0.649 us | }
1) 0.133 us | task_numa_free();
1) # 1524.423 us | }
1) # 1528.936 us | }
1) | free_bprm() {
1) | fput() {
1) 0.095 us | fput_many();
1) 0.272 us | }
1) 0.438 us | kfree();
1) 0.336 us | kfree();
1) 1.893 us | }
1) | putname() {
1) 0.225 us | kmem_cache_free();
1) 0.535 us | }
1) # 1690.790 us | }
流程
bprm
bprm 结构体 贯穿execve的过程
/root/linux-5.10.202/fs/exec.c
static int do_execveat_common(int fd, struct filename *filename,
struct user_arg_ptr argv,
struct user_arg_ptr envp,
int flags)
{
struct linux_binprm *bprm;
/root/linux-5.10.202/include/linux/binfmts.h
/*
* This structure is used to hold the arguments that are used when loading binaries.
*
* One instance is allocated per exec attempt (see the local "bprm" pointer in
* do_execveat_common()) and is passed to every binary-format handler.
*/
struct linux_binprm {
#ifdef CONFIG_MMU
/* NOTE(review): presumably the temporary stack VMA that receives the
 * argument/environment strings, and its size in pages - confirm against
 * bprm_mm_init(). */
struct vm_area_struct *vma;
unsigned long vma_pages;
#else
# define MAX_ARG_PAGES 32
/* Without an MMU the argument strings live in a fixed array of pages. */
struct page *page[MAX_ARG_PAGES];
#endif
/* Address space of the new program; begin_new_exec() hands it to
 * exec_mmap() and then clears this pointer. */
struct mm_struct *mm;
unsigned long p; /* current top of mem */
unsigned long argmin; /* rlimit marker for copy_strings() */
unsigned int
/* Should an execfd be passed to userspace? */
have_execfd:1,
/* Use the creds of a script (see binfmt_misc) */
execfd_creds:1,
/*
* Set by bprm_creds_for_exec hook to indicate a
* privilege-gaining exec has happened. Used to set
* AT_SECURE auxv for glibc.
*/
secureexec:1,
/*
* Set when errors can no longer be returned to the
* original userspace.
*/
point_of_no_return:1;
#ifdef __alpha__
unsigned int taso:1;
#endif
struct file *executable; /* Executable to pass to the interpreter */
struct file *interpreter;
/* File being executed; begin_new_exec() records it as the mm's exe file
 * and runs would_dump() on it. */
struct file *file;
struct cred *cred; /* new credentials */
int unsafe; /* how unsafe this exec is (mask of LSM_UNSAFE_*) */
unsigned int per_clear; /* bits to clear in current->personality */
/* NOTE(review): presumably the number of argv / envp strings - confirm. */
int argc, envc;
const char *filename; /* Name of binary as seen by procps */
const char *interp; /* Name of the binary really executed. Most
of the time same as filename, but could be
different for binfmt_{misc,script} */
const char *fdpath; /* generated filename for execveat */
/* Flag bits; begin_new_exec() tests BINPRM_FLAGS_ENFORCE_NONDUMP here. */
unsigned interp_flags;
int execfd; /* File descriptor of the executable */
/* NOTE(review): these look like addresses on the argument stack rather
 * than entry points - confirm against setup_arg_pages(). */
unsigned long loader, exec;
struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */
/* NOTE(review): presumably the first BINPRM_BUF_SIZE bytes of the file,
 * read so that handlers can recognise the format - confirm against
 * prepare_binprm(). */
char buf[BINPRM_BUF_SIZE];
} __randomize_layout;
以下段落照抄于Linux 进程启动 execve 系统调用内核源码解析
它用于存储加载可执行文件时所需的参数,包括程序的参数列表、环境变量列表、限制信息等。下面是对该结构体的一些解释:
vma 和 vma_pages 成员变量用于存储新程序的地址空间信息。vma 是一个指向 vm_area_struct 结构体的指针,该结构体用于描述一个虚拟内存区域;vma_pages 是一个无符号长整型变量,表示新程序占用的虚拟内存页数。
mm 成员变量是一个指向 mm_struct 结构体的指针,用于表示进程的内存映射信息。
p 成员变量是一个无符号长整型变量,表示新程序的内存布局的顶部位置。
argmin 成员变量是一个无符号长整型变量,表示 RLIMIT_STACK 限制的标记位置。
have_execfd、execfd_creds 和 secureexec 成员变量是用于表示一些特殊情况的标志位。
point_of_no_return 成员变量是一个标志位,用于表示在执行新程序时是否可以返回错误给原始用户空间。
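executable、interpreter 和 file 成员变量是指向 file 结构体的指针:file 是当前正在加载(将要执行)的文件,begin_new_exec 中会把它设置为新 mm 的 exe_file;interpreter 是解释器文件;executable 是需要以 fd 形式传递给解释器的原始可执行文件(配合 have_execfd 使用,见 binfmt_misc)。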
cred 成员变量是一个指向 cred 结构体的指针,表示新程序的执行凭证。
unsafe 成员变量是一个整型变量,用于表示执行新程序的安全级别。
per_clear 成员变量是一个无符号整型变量,表示在执行新程序时需要清除的当前进程的 personality 标志位。
argc 和 envc 成员变量分别表示新程序的参数数量和环境变量数量。
filename、interp 和 fdpath 成员变量分别表示新程序的名称、解释器的名称和在执行 execveat() 系统调用时生成的文件名。
interp_flags 成员变量是一个无符号整型变量,表示解释器的标志位。
execfd 成员变量是一个整型变量,表示要执行的程序文件的文件描述符。
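loader 和 exec 成员变量保存的是参数栈上的地址,而不是入口地址:exec 是 filename 字符串被拷贝到新栈之后所在的位置(之后通过 auxv 的 AT_EXECFN 传给用户态),loader 只有个别 binfmt 使用;栈被搬移时(setup_arg_pages)两者会一起调整。程序真正的入口地址来自 ELF 头,由 load_elf_binary 最后传给 start_thread。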
rlim_stack 成员变量是一个 rlimit 结构体,表示新程序的栈空间大小限制。
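buf 成员变量是一个字符数组(BINPRM_BUF_SIZE 字节),用于保存可执行文件开头的一小段内容(由 prepare_binprm 通过 kernel_read 读入),各个 binfmt 的 load_binary 据此识别文件格式,例如 ELF 魔数或脚本的 #! 行;它并不存放新程序的代码和数据。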
这个结构体是 execve() 系统调用的底层实现所需的参数集合,它会在内核中的加载可执行文件时被使用。
后面一段巴拉巴拉的加载,同样建议查阅Linux 进程启动 execve 系统调用内核源码解析
加载elf
elf 是linux平台的二进制可执行文件格式
/*
* cycle the list of binary formats handler, until one recognizes the image
*/
static int search_binary_handler(struct linux_binprm *bprm)
{
bool need_retry = IS_ENABLED(CONFIG_MODULES);
struct linux_binfmt *fmt;
int retval;
retval = prepare_binprm(bprm);
if (retval < 0)
return retval;
retval = security_bprm_check(bprm);
if (retval)
return retval;
retval = -ENOENT;
retry:
read_lock(&binfmt_lock);
list_for_each_entry(fmt, &formats, lh) {
if (!try_module_get(fmt->module))
continue;
read_unlock(&binfmt_lock);
retval = fmt->load_binary(bprm);
- 最后一句
retval = fmt->load_binary(bprm);
调用相应 binfmt 注册的加载函数,完成加载,很重要。ELF 格式注册的是 load_elf_binary:
static struct linux_binfmt elf_format = {
	.module = THIS_MODULE,
	.load_binary = load_elf_binary,
	.load_shlib = load_elf_library,
	.core_dump = elf_core_dump,
	.min_coredump = ELF_EXEC_PAGESIZE,
};
通过ftrace的跟踪,bash中执行ls,使用的是load_elf_binary
load_elf_binary
再次祭出
ftrace
分析load_elf_binary
做了什么
load_elf_binary() {
1) 3.002 us | load_elf_phdrs();
1) 0.263 us | __kmalloc();
1) 0.933 us | kernel_read();
1) + 12.149 us | open_exec();
1) 0.207 us | irq_enter_rcu();
1) 0.532 us | __sysvec_irq_work();
1) 0.230 us | irq_exit_rcu();
1) 0.283 us | kfree();
1) 0.413 us | would_dump();
1) 0.314 us | kmem_cache_alloc_trace();
1) 1.875 us | kernel_read();
1) 1.352 us | load_elf_phdrs();
1) ! 273.565 us | begin_new_exec();
1) 0.418 us | irq_enter_rcu();
1) 0.336 us | __sysvec_irq_work();
1) 0.169 us | irq_exit_rcu();
1) 0.368 us | set_personality_64bit();
1) 0.844 us | setup_new_exec();
1) 0.152 us | randomize_stack_top();
1) + 10.054 us | setup_arg_pages();
1) 0.168 us | arch_mmap_rnd();
1) 0.410 us | total_mapping_size();
1) 6.781 us | elf_map();
1) 1.867 us | elf_map();
1) 1.614 us | elf_map();
1) 1.853 us | elf_map();
1) 1.048 us | set_brk();
1) ! 558.256 us | clear_user();
1) 0.332 us | irq_enter_rcu();
1) 0.541 us | __sysvec_irq_work();
1) 0.250 us | irq_exit_rcu();
1) + 43.014 us | load_elf_interp.constprop.0();
1) 0.237 us | irq_enter_rcu();
1) 0.521 us | __sysvec_irq_work();
1) 0.318 us | irq_exit_rcu();
1) 0.145 us | fput();
1) 0.398 us | kfree();
1) 0.287 us | kfree();
1) 0.327 us | kfree();
1) 0.345 us | set_binfmt();
1) 2.448 us | arch_setup_additional_pages();
1) 8.669 us | create_elf_tables();
1) 0.294 us | arch_randomize_brk();
1) 0.338 us | finalize_exec();
1) 0.186 us | start_thread();
1) ! 951.498 us | }
这里面一大堆,真正影响到进程描述、修改 current(task_struct 指针)所指内容的,主要有下面几个函数
begin_new_exec
/*
 * begin_new_exec() - commit the calling task to the new program image.
 *
 * Steps performed below, in order:
 *   1. compute the credentials from the file (bprm_creds_from_file);
 *   2. mark the point of no return;
 *   3. de_thread() so this is the only thread in the group;
 *   4. record the exe file on the new mm, then switch to it via exec_mmap();
 *   5. tear down POSIX CPU timers / itimers (CONFIG_POSIX_TIMERS only);
 *   6. unshare the signal handler table;
 *   7. clear per-task flags and personality bits, apply close-on-exec;
 *   8. decide dumpability, rename the task, bump self_exec_id;
 *   9. reset signal handlers, commit the new credentials;
 *  10. optionally install the executable as a file descriptor (have_execfd).
 *
 * Returns 0 on success, otherwise the non-zero value returned by the step
 * that failed.
 */
/*
* Calling this is the point of no return. None of the failures will be
* seen by userspace since either the process is already taking a fatal
* signal (via de_thread() or coredump), or will have SEGV raised
* (after exec_mmap()) by search_binary_handler (see below).
*/
int begin_new_exec(struct linux_binprm * bprm)
{
struct task_struct *me = current;
int retval;
/* Once we are committed compute the creds */
retval = bprm_creds_from_file(bprm);
if (retval)
return retval;
/*
* Ensure all future errors are fatal.
*/
bprm->point_of_no_return = true;
/*
* Make this the only thread in the thread group.
*/
retval = de_thread(me);
if (retval)
goto out;
/*
* Must be called _before_ exec_mmap() as bprm->mm is
* not visibile until then. This also enables the update
* to be lockless.
*/
set_mm_exe_file(bprm->mm, bprm->file);
/* If the binary is not readable then enforce mm->dumpable=0 */
would_dump(bprm, bprm->file);
if (bprm->have_execfd)
would_dump(bprm, bprm->executable);
/*
* Release all of the old mmap stuff
*/
acct_arg_size(bprm, 0);
retval = exec_mmap(bprm->mm);
if (retval)
goto out;
/*
 * NOTE(review): presumably the mm now belongs to the task, and clearing
 * the pointer keeps later bprm cleanup from touching it - confirm
 * against free_bprm().
 */
bprm->mm = NULL;
#ifdef CONFIG_POSIX_TIMERS
spin_lock_irq(&me->sighand->siglock);
posix_cpu_timers_exit(me);
spin_unlock_irq(&me->sighand->siglock);
exit_itimers(me);
flush_itimer_signals();
#endif
/*
* Make the signal table private.
*/
retval = unshare_sighand(me);
if (retval)
goto out_unlock;
/*
* Ensure that the uaccess routines can actually operate on userspace
* pointers:
*/
force_uaccess_begin();
me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
PF_NOFREEZE | PF_NO_SETAFFINITY);
flush_thread();
me->personality &= ~bprm->per_clear;
/*
* We have to apply CLOEXEC before we change whether the process is
* dumpable (in setup_new_exec) to avoid a race with a process in userspace
* trying to access the should-be-closed file descriptors of a process
* undergoing exec(2).
*/
do_close_on_exec(me->files);
if (bprm->secureexec) {
/* Make sure parent cannot signal privileged process. */
me->pdeath_signal = 0;
/*
* For secureexec, reset the stack limit to sane default to
* avoid bad behavior from the prior rlimits. This has to
* happen before arch_pick_mmap_layout(), which examines
* RLIMIT_STACK, but after the point of no return to avoid
* needing to clean up the change on failure.
*/
if (bprm->rlim_stack.rlim_cur > _STK_LIM)
bprm->rlim_stack.rlim_cur = _STK_LIM;
}
/* Forget any alternate signal stack registered by the old program. */
me->sas_ss_sp = me->sas_ss_size = 0;
/*
* Figure out dumpability. Note that this checking only of current
* is wrong, but userspace depends on it. This should be testing
* bprm->secureexec instead.
*/
if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
!(uid_eq(current_euid(), current_uid()) &&
gid_eq(current_egid(), current_gid())))
set_dumpable(current->mm, suid_dumpable);
else
set_dumpable(current->mm, SUID_DUMP_USER);
perf_event_exec();
/* The task name becomes the basename of the executed file. */
__set_task_comm(me, kbasename(bprm->filename), true);
/* An exec changes our domain. We are no longer part of the thread
group */
WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
/*
 * force_default == 0: handlers are reset to SIG_DFL, but dispositions
 * that are SIG_IGN are left ignored.
 */
flush_signal_handlers(me, 0);
/*
* install the new credentials for this executable
*/
security_bprm_committing_creds(bprm);
commit_creds(bprm->cred);
/*
 * NOTE(review): presumably commit_creds() took over the reference, so
 * the pointer is cleared to avoid a second release - confirm.
 */
bprm->cred = NULL;
/*
* Disable monitoring for regular users
* when executing setuid binaries. Must
* wait until new credentials are committed
* by commit_creds() above
*/
if (get_dumpable(me->mm) != SUID_DUMP_USER)
perf_event_exit_task(me);
/*
* cred_guard_mutex must be held at least to this point to prevent
* ptrace_attach() from altering our determination of the task's
* credentials; any time after this it may be unlocked.
*/
security_bprm_committed_creds(bprm);
/* Pass the opened binary to the interpreter. */
if (bprm->have_execfd) {
retval = get_unused_fd_flags(0);
if (retval < 0)
goto out_unlock;
/* The file is installed in the fd table; drop it from bprm. */
fd_install(retval, bprm->executable);
bprm->executable = NULL;
bprm->execfd = retval;
}
return 0;
/*
 * NOTE(review): only failures after exec_mmap() come here; the matching
 * acquisition of exec_update_lock is not visible in this function -
 * presumably taken inside exec_mmap(); confirm.
 */
out_unlock:
up_write(&me->signal->exec_update_lock);
out:
return retval;
}
EXPORT_SYMBOL(begin_new_exec);
retval = exec_mmap(bprm->mm);
current->mm = bprm->mm
切换到为新程序准备的 mm(bprm->mm),原先的 mm 被释放
retval = unshare_sighand(me);
- 复制信号回调表,这里没有对信号额外处理:
newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
memcpy(newsighand->action, oldsighand->action, sizeof(newsighand->action));
flush_signal_handlers(me, 0);
重置部分信号:传参 force_default 为 0,不会全部重置
- 在这里,除了处理方式为 SIG_IGN(忽略)的信号保持忽略之外,其余信号全部置为默认信号处理(SIG_DFL)
/*
 * Flush all handlers for a task.
 */
void flush_signal_handlers(struct task_struct *t, int force_default)
{
	int i;
	struct k_sigaction *ka = &t->sighand->action[0];
	for (i = _NSIG ; i != 0 ; i--) {
		if (force_default || ka->sa.sa_handler != SIG_IGN)
			ka->sa.sa_handler = SIG_DFL;
		ka->sa.sa_flags = 0;
#ifdef __ARCH_HAS_SA_RESTORER
		ka->sa.sa_restorer = NULL;
#endif
		sigemptyset(&ka->sa.sa_mask);
		ka++;
	}
}
最后
sp 就绪 执行环境ok
start_thread(regs, elf_entry, bprm->p);
/*
 * start_thread() - set up the saved user register frame for the new program.
 *
 * @regs: register frame that will be restored on return to user space
 * @pc:   entry address, forwarded to start_thread_common()
 * @sp:   initial user stack pointer, stored in regs->sp
 *
 * Also sets regs->pstate to PSR_MODE_EL0t and calls
 * spectre_v4_enable_task_mitigation() for the current task.
 *
 * NOTE(review): pstate / PSR_MODE_EL0t suggest this is the arm64 variant,
 * whereas the ftrace output in this article looks like x86-64 - confirm.
 */
static inline void start_thread(struct pt_regs *regs, unsigned long pc,
unsigned long sp)
{
start_thread_common(regs, pc);
regs->pstate = PSR_MODE_EL0t;
spectre_v4_enable_task_mitigation(current);
regs->sp = sp;
}
总结
- maps 映射
- 原进程(通常是 fork 时从父进程复制来的)的所有 maps 都会消失:在 begin_new_exec 里(exec_mmap)整个 mm 被替换为新程序的 mm
- files 文件描述符
- 不变:对 current->files 除了复制下来外没有额外处理(除了 open 时候标记为 O_CLOEXEC 的会在 do_close_on_exec 中被关闭)
- stdin、stdout、stderr 和其他 fd 都不会有变化,继承父进程,bash 中的管道得以实现
- 信号
- 除了忽略的,其他都被重置为默认信号处理
- nohup得以实现,execve前hup信号置为忽略
- 优先级
- 不变 没有额外处理 fork 也没有额外处理
- ptrace
- 会继承,可以 debug execve 后的进程
- execve 前通常会 fork,ptrace 关系是否延续到子进程取决于 fork 时候的 CLONE_PTRACE 标志,execve 本身对 ptrace 没有变化
- ptracer 如果之前有对内存区域修改,因为 mm 已全部替换,这些修改都会消失