execve 继承了什么?映射、信号、文件描述符、优先级、ptrace

简介

execve syscall 做了什么

重要参考
Linux 进程启动 execve 系统调用内核源码解析

execve 一次简单的跟踪

入口

/*
 * execve system-call entry point (three arguments, hence SYSCALL_DEFINE3).
 *
 * The user-space path is passed through getname() and the result, together
 * with the untouched user-space argv/envp pointer arrays, is forwarded to
 * do_execve(). Whatever do_execve() returns is the syscall's return value.
 */
SYSCALL_DEFINE3(execve,
		const char __user *, filename,
		const char __user *const __user *, argv,
		const char __user *const __user *, envp)
{
	return do_execve(getname(filename), argv, envp);
}
  • SYSCALL_DEFINE3 是宏定义,表示这个syscall有三个参数;宏展开后函数体的实际名字是 __do_sys_execve(static inline),对外的入口符号在 x86_64 上是 __x64_sys_execve
  • 还有个execveat系统调用 区别仅仅是filename的cwd可以指定
    /*
     * Thin wrapper around do_execveat_common().
     *
     * Wraps the raw user argv/envp pointers in struct user_arg_ptr (native
     * pointer variant) and calls do_execveat_common() with AT_FDCWD as the
     * directory fd and flags == 0.
     */
    static int do_execve(struct filename *filename,
    	const char __user *const __user *__argv,
    	const char __user *const __user *__envp)
    {
    	struct user_arg_ptr argv = { .ptr.native = __argv };
    	struct user_arg_ptr envp = { .ptr.native = __envp };
    	return do_execveat_common(AT_FDCWD, filename, argv, envp, 0);
    }

ftrace

对bash中敲ls跟踪 部分不重要的地方已省略…

抓的是do_execveat_common,因为 do_sys_execve、do_execve 都不让抓,会报错。原因应该是这两个函数是 static(inline) 的,被编译器内联进了调用者,没有独立的符号,所以不在 ftrace 的 available_filter_functions 列表里

# perf ftrace --graph-opts depth=7 -a -G do_execveat_common
# tracer: function_graph
#
# CPU  DURATION                  FUNCTION CALLS
# |     |   |                     |   |   |   |
   1)               |  do_execveat_common() {
   1)               |    alloc_bprm() {
   1)               |      ...
   1) + 13.133 us   |    }
   1)               |    copy_string_kernel() {
                           ...   
   1) + 14.515 us   |    }
   1)               |    copy_strings.isra.0() {
                           ...
   1) + 66.078 us   |    }
   1)               |    copy_strings.isra.0() {
                           ...
   1)   6.495 us    |    }
   1)               |    bprm_execve() {
   1)   0.188 us    |      mutex_lock_interruptible();
   1)               |      prepare_exec_creds() {
                              ...
   1)   3.646 us    |      }
   1)               |      bprm_execve.part.0() {
   1)               |        check_unsafe_exec() {
                                 ...
   1)   3.999 us    |        }
   1)               |        do_open_execat() {
                                 ...
   1) + 57.539 us   |        }
   1)               |        sched_exec() {
                                 ...
   1)   3.658 us    |        }
   1)               |        security_bprm_creds_for_exec() {
                                 ...
   1) + 13.107 us   |        }
   1)               |        exec_binprm() {
   1)               |          search_binary_handler() {
   1)               |            kernel_read() {
                                 ...
   1) ! 445.465 us  |            }
   1)               |            security_bprm_check() {
   1)   1.134 us    |              ima_bprm_check();
   1)   1.431 us    |            }
   1)   0.229 us    |            _raw_read_lock();
   1)   0.407 us    |            try_module_get();
   1)   0.124 us    |            load_script();
   1)   0.119 us    |            _raw_read_lock();
   1)   0.094 us    |            module_put();
   1)   0.095 us    |            try_module_get();
   1)               |            load_elf_binary() {
   1)   3.002 us    |              load_elf_phdrs();
   1)   0.263 us    |              __kmalloc();
   1)   0.933 us    |              kernel_read();
   1) + 12.149 us   |              open_exec();
   1)   0.207 us    |              irq_enter_rcu();
   1)   0.532 us    |              __sysvec_irq_work();
   1)   0.230 us    |              irq_exit_rcu();
   1)   0.283 us    |              kfree();
   1)   0.413 us    |              would_dump();
   1)   0.314 us    |              kmem_cache_alloc_trace();
   1)   1.875 us    |              kernel_read();
   1)   1.352 us    |              load_elf_phdrs();
   1) ! 273.565 us  |              begin_new_exec();
   1)   0.418 us    |              irq_enter_rcu();
   1)   0.336 us    |              __sysvec_irq_work();
   1)   0.169 us    |              irq_exit_rcu();
   1)   0.368 us    |              set_personality_64bit();
   1)   0.844 us    |              setup_new_exec();
   1)   0.152 us    |              randomize_stack_top();
   1) + 10.054 us   |              setup_arg_pages();
   1)   0.168 us    |              arch_mmap_rnd();
   1)   0.410 us    |              total_mapping_size();
   1)   6.781 us    |              elf_map();
   1)   1.867 us    |              elf_map();
   1)   1.614 us    |              elf_map();
   1)   1.853 us    |              elf_map();
   1)   1.048 us    |              set_brk();
   1) ! 558.256 us  |              clear_user();
   1)   0.332 us    |              irq_enter_rcu();
   1)   0.541 us    |              __sysvec_irq_work();
   1)   0.250 us    |              irq_exit_rcu();
   1) + 43.014 us   |              load_elf_interp.constprop.0();
   1)   0.237 us    |              irq_enter_rcu();
   1)   0.521 us    |              __sysvec_irq_work();
   1)   0.318 us    |              irq_exit_rcu();
   1)   0.145 us    |              fput();
   1)   0.398 us    |              kfree();
   1)   0.287 us    |              kfree();
   1)   0.327 us    |              kfree();
   1)   0.345 us    |              set_binfmt();
   1)   2.448 us    |              arch_setup_additional_pages();
   1)   8.669 us    |              create_elf_tables();
   1)   0.294 us    |              arch_randomize_brk();
   1)   0.338 us    |              finalize_exec();
   1)   0.186 us    |              start_thread();
   1) ! 951.498 us  |            }
   1)   0.170 us    |            _raw_read_lock();
   1)   0.140 us    |            module_put();
   1) # 1441.605 us |          } /* search_binary_handler */
   1)   0.321 us    |          proc_exec_connector();
   1) # 1444.211 us |        }
   1)               |        acct_update_integrals() {
   1)   0.109 us    |          task_cputime();
   1)   0.649 us    |        }
   1)   0.133 us    |        task_numa_free();
   1) # 1524.423 us |      }
   1) # 1528.936 us |    }
   1)               |    free_bprm() {
   1)               |      fput() {
   1)   0.095 us    |        fput_many();
   1)   0.272 us    |      }
   1)   0.438 us    |      kfree();
   1)   0.336 us    |      kfree();
   1)   1.893 us    |    }
   1)               |    putname() {
   1)   0.225 us    |      kmem_cache_free();
   1)   0.535 us    |    }
   1) # 1690.790 us |  }

流程

bprm

bprm 结构体 贯穿execve的过程

/root/linux-5.10.202/fs/exec.c

static int do_execveat_common(int fd, struct filename *filename,
			      struct user_arg_ptr argv,
			      struct user_arg_ptr envp,
			      int flags)
{
	struct linux_binprm *bprm;



/root/linux-5.10.202/include/linux/binfmts.h

/*
 * This structure is used to hold the arguments that are used when loading binaries.
 *
 * A single instance (named bprm throughout the exec code) is threaded
 * through the exec path and handed to the binary-format loader's
 * load_binary() callback.
 */
struct linux_binprm {
#ifdef CONFIG_MMU
	/* MMU builds: a VMA plus a page count (presumably the arg/env area — TODO confirm). */
	struct vm_area_struct *vma;
	unsigned long vma_pages;
#else
	/* No-MMU builds: a fixed array of MAX_ARG_PAGES pages is used instead. */
# define MAX_ARG_PAGES	32
	struct page *page[MAX_ARG_PAGES];
#endif
	/* mm for the new image; begin_new_exec() passes it to exec_mmap() and then clears this field. */
	struct mm_struct *mm;
	unsigned long p; /* current top of mem */
	unsigned long argmin; /* rlimit marker for copy_strings() */
	unsigned int
		/* Should an execfd be passed to userspace? */
		have_execfd:1,

		/* Use the creds of a script (see binfmt_misc) */
		execfd_creds:1,
		/*
		 * Set by bprm_creds_for_exec hook to indicate a
		 * privilege-gaining exec has happened. Used to set
		 * AT_SECURE auxv for glibc.
		 */
		secureexec:1,
		/*
		 * Set when errors can no longer be returned to the
		 * original userspace.
		 */
		point_of_no_return:1;
#ifdef __alpha__
	unsigned int taso:1;
#endif
	struct file *executable; /* Executable to pass to the interpreter */
	struct file *interpreter;
	/* The file being executed; begin_new_exec() installs it as the mm's exe file. */
	struct file *file;
	struct cred *cred;	/* new credentials */
	int unsafe;		/* how unsafe this exec is (mask of LSM_UNSAFE_*) */
	unsigned int per_clear;	/* bits to clear in current->personality */
	/* NOTE(review): presumably the number of argv / envp strings — confirm. */
	int argc, envc;
	const char *filename;	/* Name of binary as seen by procps */
	const char *interp;	/* Name of the binary really executed. Most
				   of the time same as filename, but could be
				   different for binfmt_{misc,script} */
	const char *fdpath;	/* generated filename for execveat */
	/* BINPRM_FLAGS_* bits; begin_new_exec() tests BINPRM_FLAGS_ENFORCE_NONDUMP here. */
	unsigned interp_flags;
	int execfd;		/* File descriptor of the executable */
	/* NOTE(review): usage not visible in this excerpt — confirm against fs/exec.c. */
	unsigned long loader, exec;

	struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */

	/* NOTE(review): presumably the leading bytes of the file, used for format detection — confirm. */
	char buf[BINPRM_BUF_SIZE];
} __randomize_layout;

以下段落照抄于Linux 进程启动 execve 系统调用内核源码解析

它用于存储加载可执行文件时所需的参数,包括程序的参数列表、环境变量列表、限制信息等。下面是对该结构体的一些解释:

vma 和 vma_pages 成员变量用于存储新程序的地址空间信息。vma 是一个指向 vm_area_struct 结构体的指针,该结构体用于描述一个虚拟内存区域;vma_pages 是一个无符号长整型变量,表示新程序占用的虚拟内存页数。

mm 成员变量是一个指向 mm_struct 结构体的指针,用于表示进程的内存映射信息。

p 成员变量是一个无符号长整型变量,表示新程序的内存布局的顶部位置。

argmin 成员变量是一个无符号长整型变量,表示 RLIMIT_STACK 限制的标记位置。

have_execfd、execfd_creds 和 secureexec 成员变量是用于表示一些特殊情况的标志位。

point_of_no_return 成员变量是一个标志位,用于表示在执行新程序时是否可以返回错误给原始用户空间。

executable、interpreter 和 file 成员变量是指向 file 结构体的指针,分别表示要传给解释器的原始可执行文件、解释器文件,以及当前正在被加载的文件(也就是 do_open_execat 打开的那个文件,begin_new_exec 里会把它设置为新 mm 的 exe_file)。

cred 成员变量是一个指向 cred 结构体的指针,表示新程序的执行凭证。

unsafe 成员变量是一个整型变量,用于表示执行新程序的安全级别。

per_clear 成员变量是一个无符号整型变量,表示在执行新程序时需要清除的当前进程的 personality 标志位。

argc 和 envc 成员变量分别表示新程序的参数数量和环境变量数量。

filename、interp 和 fdpath 成员变量分别表示新程序的名称、解释器的名称和在执行 execveat() 系统调用时生成的文件名。

interp_flags 成员变量是一个无符号整型变量,表示解释器的标志位。

execfd 成员变量是一个整型变量,表示要执行的程序文件的文件描述符。

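loader 和 exec 成员变量并不是入口地址,而是新栈上的地址:exec 记录的是可执行文件名字符串在栈上的位置(do_execveat_common 中拷贝完 filename 后执行 bprm->exec = bprm->p,之后通过 auxv 的 AT_EXECFN 传给用户态);loader 含义类似,只在个别架构(如 alpha)的加载器中使用。真正的程序入口地址来自 ELF 头的 e_entry。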

rlim_stack 成员变量是一个 rlimit 结构体,表示新程序的栈空间大小限制。

buf 成员变量是一个字符数组,用于存放可执行文件开头的 BINPRM_BUF_SIZE 个字节(即文件头),各个 binfmt 处理函数根据这些字节识别文件格式(例如 ELF 魔数、脚本的 #!),而不是存储新程序的代码和数据。

这个结构体是 execve() 系统调用的底层实现所需的参数集合,它会在内核中的加载可执行文件时被使用。

后面一段巴拉巴拉的加载,同样建议查阅Linux 进程启动 execve 系统调用内核源码解析

加载elf

elf 是linux平台的二进制可执行文件格式

/*
 * cycle the list of binary formats handler, until one recognizes the image
 */
static int search_binary_handler(struct linux_binprm *bprm)
{
	bool need_retry = IS_ENABLED(CONFIG_MODULES);
	struct linux_binfmt *fmt;
	int retval;

	retval = prepare_binprm(bprm);
	if (retval < 0)
		return retval;

	retval = security_bprm_check(bprm);
	if (retval)
		return retval;

	retval = -ENOENT;
 retry:
	read_lock(&binfmt_lock);
	list_for_each_entry(fmt, &formats, lh) {
		if (!try_module_get(fmt->module))
			continue;
		read_unlock(&binfmt_lock);

		retval = fmt->load_binary(bprm);
  • 最后一句retval = fmt->load_binary(bprm);调用相应的函数,加载,很重要
    /*
     * Binary-format handler descriptor for ELF.
     *
     * search_binary_handler() invokes the .load_binary callback of each
     * registered format; for this entry that callback is load_elf_binary().
     */
    static struct linux_binfmt elf_format = {
    	.module		= THIS_MODULE,
    	.load_binary	= load_elf_binary,
    	.load_shlib	= load_elf_library,
    	.core_dump	= elf_core_dump,
    	.min_coredump	= ELF_EXEC_PAGESIZE,
    };
    

通过ftrace的跟踪,bash中执行ls,使用的是load_elf_binary

load_elf_binary

再次祭出ftrace分析load_elf_binary做了什么

load_elf_binary() {
   1)   3.002 us    |              load_elf_phdrs();
   1)   0.263 us    |              __kmalloc();
   1)   0.933 us    |              kernel_read();
   1) + 12.149 us   |              open_exec();
   1)   0.207 us    |              irq_enter_rcu();
   1)   0.532 us    |              __sysvec_irq_work();
   1)   0.230 us    |              irq_exit_rcu();
   1)   0.283 us    |              kfree();
   1)   0.413 us    |              would_dump();
   1)   0.314 us    |              kmem_cache_alloc_trace();
   1)   1.875 us    |              kernel_read();
   1)   1.352 us    |              load_elf_phdrs();
   1) ! 273.565 us  |              begin_new_exec();
   1)   0.418 us    |              irq_enter_rcu();
   1)   0.336 us    |              __sysvec_irq_work();
   1)   0.169 us    |              irq_exit_rcu();
   1)   0.368 us    |              set_personality_64bit();
   1)   0.844 us    |              setup_new_exec();
   1)   0.152 us    |              randomize_stack_top();
   1) + 10.054 us   |              setup_arg_pages();
   1)   0.168 us    |              arch_mmap_rnd();
   1)   0.410 us    |              total_mapping_size();
   1)   6.781 us    |              elf_map();
   1)   1.867 us    |              elf_map();
   1)   1.614 us    |              elf_map();
   1)   1.853 us    |              elf_map();
   1)   1.048 us    |              set_brk();
   1) ! 558.256 us  |              clear_user();
   1)   0.332 us    |              irq_enter_rcu();
   1)   0.541 us    |              __sysvec_irq_work();
   1)   0.250 us    |              irq_exit_rcu();
   1) + 43.014 us   |              load_elf_interp.constprop.0();
   1)   0.237 us    |              irq_enter_rcu();
   1)   0.521 us    |              __sysvec_irq_work();
   1)   0.318 us    |              irq_exit_rcu();
   1)   0.145 us    |              fput();
   1)   0.398 us    |              kfree();
   1)   0.287 us    |              kfree();
   1)   0.327 us    |              kfree();
   1)   0.345 us    |              set_binfmt();
   1)   2.448 us    |              arch_setup_additional_pages();
   1)   8.669 us    |              create_elf_tables();
   1)   0.294 us    |              arch_randomize_brk();
   1)   0.338 us    |              finalize_exec();
   1)   0.186 us    |              start_thread();
   1) ! 951.498 us  |            }

这里面一大堆,真正影响到进程描述、修改current task_struct指针内容的,主要有下面几个函数

begin_new_exec

/*
 * Calling this is the point of no return. None of the failures will be
 * seen by userspace since either the process is already taking a fatal
 * signal (via de_thread() or coredump), or will have SEGV raised
 * (after exec_mmap()) by search_binary_handler (see below).
 *
 * Switches the current task over to the new image described by @bprm:
 * computes credentials, kills sibling threads, swaps in the new mm,
 * resets timers and signal handlers, applies close-on-exec, renames the
 * task and commits the new credentials.
 *
 * Returns 0 on success, otherwise the return value of the helper that
 * failed.
 */
int begin_new_exec(struct linux_binprm * bprm)
{
	struct task_struct *me = current;
	int retval;

	/* Once we are committed compute the creds */
	retval = bprm_creds_from_file(bprm);
	if (retval)
		return retval;

	/*
	 * Ensure all future errors are fatal.
	 */
	bprm->point_of_no_return = true;

	/*
	 * Make this the only thread in the thread group.
	 */
	retval = de_thread(me);
	if (retval)
		goto out;

	/*
	 * Must be called _before_ exec_mmap() as bprm->mm is
	 * not visibile until then. This also enables the update
	 * to be lockless.
	 */
	set_mm_exe_file(bprm->mm, bprm->file);

	/* If the binary is not readable then enforce mm->dumpable=0 */
	would_dump(bprm, bprm->file);
	if (bprm->have_execfd)
		would_dump(bprm, bprm->executable);

	/*
	 * Release all of the old mmap stuff
	 */
	acct_arg_size(bprm, 0);
	retval = exec_mmap(bprm->mm);
	if (retval)
		goto out;

	/*
	 * NOTE(review): presumably exec_mmap() has taken over the mm at this
	 * point, so bprm no longer refers to it — confirm in exec_mmap().
	 */
	bprm->mm = NULL;

#ifdef CONFIG_POSIX_TIMERS
	/* Tear down the old image's POSIX CPU timers and interval timers. */
	spin_lock_irq(&me->sighand->siglock);
	posix_cpu_timers_exit(me);
	spin_unlock_irq(&me->sighand->siglock);
	exit_itimers(me);
	flush_itimer_signals();
#endif

	/*
	 * Make the signal table private.
	 */
	retval = unshare_sighand(me);
	if (retval)
		goto out_unlock;

	/*
	 * Ensure that the uaccess routines can actually operate on userspace
	 * pointers:
	 */
	force_uaccess_begin();

	/* Drop per-task flags that must not carry over into the new image. */
	me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
					PF_NOFREEZE | PF_NO_SETAFFINITY);
	flush_thread();
	me->personality &= ~bprm->per_clear;

	/*
	 * We have to apply CLOEXEC before we change whether the process is
	 * dumpable (in setup_new_exec) to avoid a race with a process in userspace
	 * trying to access the should-be-closed file descriptors of a process
	 * undergoing exec(2).
	 */
	do_close_on_exec(me->files);

	if (bprm->secureexec) {
		/* Make sure parent cannot signal privileged process. */
		me->pdeath_signal = 0;

		/*
		 * For secureexec, reset the stack limit to sane default to
		 * avoid bad behavior from the prior rlimits. This has to
		 * happen before arch_pick_mmap_layout(), which examines
		 * RLIMIT_STACK, but after the point of no return to avoid
		 * needing to clean up the change on failure.
		 */
		if (bprm->rlim_stack.rlim_cur > _STK_LIM)
			bprm->rlim_stack.rlim_cur = _STK_LIM;
	}

	/* Zero the task's sas_ss_sp / sas_ss_size fields. */
	me->sas_ss_sp = me->sas_ss_size = 0;

	/*
	 * Figure out dumpability. Note that this checking only of current
	 * is wrong, but userspace depends on it. This should be testing
	 * bprm->secureexec instead.
	 */
	if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
	    !(uid_eq(current_euid(), current_uid()) &&
	      gid_eq(current_egid(), current_gid())))
		set_dumpable(current->mm, suid_dumpable);
	else
		set_dumpable(current->mm, SUID_DUMP_USER);

	perf_event_exec();
	/* The task's comm becomes the basename of bprm->filename. */
	__set_task_comm(me, kbasename(bprm->filename), true);

	/* An exec changes our domain. We are no longer part of the thread
	   group */
	WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
	/*
	 * force_default == 0: handlers currently set to SIG_IGN stay ignored,
	 * every other handler goes back to SIG_DFL.
	 */
	flush_signal_handlers(me, 0);

	/*
	 * install the new credentials for this executable
	 */
	security_bprm_committing_creds(bprm);

	commit_creds(bprm->cred);
	bprm->cred = NULL;

	/*
	 * Disable monitoring for regular users
	 * when executing setuid binaries. Must
	 * wait until new credentials are committed
	 * by commit_creds() above
	 */
	if (get_dumpable(me->mm) != SUID_DUMP_USER)
		perf_event_exit_task(me);
	/*
	 * cred_guard_mutex must be held at least to this point to prevent
	 * ptrace_attach() from altering our determination of the task's
	 * credentials; any time after this it may be unlocked.
	 */
	security_bprm_committed_creds(bprm);

	/* Pass the opened binary to the interpreter. */
	if (bprm->have_execfd) {
		retval = get_unused_fd_flags(0);
		if (retval < 0)
			goto out_unlock;
		fd_install(retval, bprm->executable);
		bprm->executable = NULL;
		bprm->execfd = retval;
	}
	return 0;

	/*
	 * NOTE(review): the matching down_write() of exec_update_lock is not
	 * in this function — presumably it is taken inside exec_mmap(), which
	 * is why only failures after exec_mmap() jump here. Confirm.
	 */
out_unlock:
	up_write(&me->signal->exec_update_lock);
out:
	return retval;
}
EXPORT_SYMBOL(begin_new_exec);
  • retval = exec_mmap(bprm->mm);
    • current->mm = bprm->mm 使用elf加载的mm,原先mm释放
  • retval = unshare_sighand(me);
    • 复制信号回调表 这里没有对信号额外处理
      newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
      memcpy(newsighand->action, oldsighand->action,
      	       sizeof(newsighand->action));
      
  • flush_signal_handlers(me, 0); 重置部分信号 传参force_default为0,不会全部重置
    • 在这里,除了处理方式为SIG_IGN(忽略)的信号保持忽略之外,其余信号的处理函数全部重置为默认处理(SIG_DFL)
      /*
       * Flush all handlers for a task.
       *
       * Walks all _NSIG entries of t->sighand->action. For each entry the
       * handler is reset to SIG_DFL, except that when force_default is zero
       * an entry whose handler is SIG_IGN keeps SIG_IGN. Regardless of the
       * handler, sa_flags, sa_mask and (where the architecture has one)
       * sa_restorer are cleared.
       */
      
      void
      flush_signal_handlers(struct task_struct *t, int force_default)
      {
      	int i;
      	struct k_sigaction *ka = &t->sighand->action[0];
      	for (i = _NSIG ; i != 0 ; i--) {
      		/* Ignored signals stay ignored unless the caller forces defaults. */
      		if (force_default || ka->sa.sa_handler != SIG_IGN)
      			ka->sa.sa_handler = SIG_DFL;
      		ka->sa.sa_flags = 0;
      #ifdef __ARCH_HAS_SA_RESTORER
      		ka->sa.sa_restorer = NULL;
      #endif
      		sigemptyset(&ka->sa.sa_mask);
      		ka++;
      	}
      }
      

最后

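sp 就绪 执行环境ok(注意:下面贴的 start_thread 是 arm64 的实现;上面的 ftrace 是在 x86_64 上抓的,x86_64 的对应实现在 arch/x86/kernel/process_64.c,思路相同:设置用户态的 pc 和 sp)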

start_thread(regs, elf_entry, bprm->p);

/*
 * Prepare the saved user register frame so the task starts running the
 * new image: start_thread_common() is given the entry pc, then pstate is
 * set to PSR_MODE_EL0t and the stack pointer to sp.
 *
 * NOTE(review): PSR_MODE_EL0t and spectre_v4_enable_task_mitigation()
 * indicate this is the arm64 variant of start_thread() — confirm.
 */
static inline void start_thread(struct pt_regs *regs, unsigned long pc,
				unsigned long sp)
{
	start_thread_common(regs, pc);
	regs->pstate = PSR_MODE_EL0t;
	spectre_v4_enable_task_mitigation(current);
	regs->sp = sp;
}

总结

  • maps 映射
    • 从父进程继承(fork 时复制)来的所有maps都会消失 begin_new_exec里(exec_mmap)被替换为新程序的mm
  • files 文件描述符
    • 不变 对current->files除了复制下来外没有额外处理(除了open时候标记为O_CLOEXEC的会被关闭)
    • stdin stdout stderr 和其他 都不会有变化 继承父进程,bash中的管道得以实现
  • 信号
    • 除了忽略的,其他都被重置为默认信号处理
    • nohup得以实现,execve前hup信号置为忽略
  • 优先级
    • 不变 没有额外处理 fork 也没有额外处理
  • ptrace
    • 会继承 可以debug execve后的进程
    • execve前通常会fork ptrace取决于fork时候的CLONE_PTRACE标志,execve本身对ptrace没有变化
    • ptracer 如果之前有对内存区域修改 因为mm已全部替换 这些修改都会消失
  • 17
    点赞
  • 16
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值