可执行文件运行的系统调用

最新推荐文章于 2023-03-31 16:11:04 发布

Fred_HBUT

最新推荐文章于 2023-03-31 16:11:04 发布

阅读量845

点赞数

分类专栏： Kernel

Kernel 专栏收录该内容

29 篇文章 0 订阅

订阅专栏

系统调用execve的入口sys_execve()

/*
 * sys_execve() executes a new program.
 *
 * @name: path of the file to execute (user-space pointer)
 * @argv: argument vector passed to the system call (user-space pointers)
 * @envp: environment vector (user-space pointers)
 * @regs: register state handed on to do_execve()
 *
 * The path is first brought into kernel space with getname(); if that
 * fails, the error encoded in the returned pointer is returned directly.
 * Otherwise the result of do_execve() is returned after the name buffer
 * has been released with putname().
 */
long sys_execve(const char __user *name,
		const char __user *const __user *argv,
		const char __user *const __user *envp, struct pt_regs *regs)
{
	char *filename = getname(name);	/* copy the file name from user space */
	long error;

	if (IS_ERR(filename))
		return PTR_ERR(filename);

	error = do_execve(filename, argv, envp, regs);

#ifdef CONFIG_X86_32
	/* Make sure we don't return using sysenter.. */
	if (!error)
		set_thread_flag(TIF_IRET);
#endif

	putname(filename);
	return error;
}

我们首先关注标签__user，这个标签表示其后边的变量是指向用户空间的地址的（详细的解释，请参看深入Linux内核框架P27）。

关于sys_execve参数的说明：Not only the register set with the arguments and the name of the executable file (filename) but also pointers to the arguments and the environment of the program are passed as in system programming. The notation is slightly clumsy because argv and envp are arrays of pointers, and both the pointer to the array itself as well as all pointers in the array are located in the userspace portion of the virtual address space. Recall from the Introduction that some precautions are required when userspace memory is accessed from the kernel, and that the __user annotations allow automated tools to check if everything is handled properly.

接下来的getname将要执行的文件名从用户空间拷贝到系统空间会调用如下函数：

/*
 * getname_flags() - fetch a user-space file name into a kernel buffer.
 *
 * @filename: user-space pointer to the name
 * @flags:    lookup flags; only LOOKUP_EMPTY is examined here
 *
 * A buffer is obtained with __getname() (a heap buffer rather than a
 * stack array, since a path can be long) and filled by do_getname().
 * A failed copy releases the buffer and yields ERR_PTR(error), except
 * that -ENOENT is tolerated when LOOKUP_EMPTY is set, in which case the
 * buffer itself is returned.  If no buffer could be obtained the result
 * is ERR_PTR(-ENOMEM).  Whatever the outcome, the result is passed to
 * audit_getname() before being returned.
 */
static char *getname_flags(const char __user *filename, int flags)
{
	char *result;
	char *buf = __getname();

	if (!buf) {
		result = ERR_PTR(-ENOMEM);
	} else {
		int err = do_getname(filename, buf);

		if (err < 0 && (err != -ENOENT || !(flags & LOOKUP_EMPTY))) {
			/* real failure: give the buffer back and report it */
			__putname(buf);
			result = ERR_PTR(err);
		} else {
			result = buf;
		}
	}

	audit_getname(result);
	return result;
}

注意函数中的__getname();为文件名分配一个物理页面作为缓冲区，因为一个绝对路径可能很长，因此如果用临时变量的话，这个路径就被存储在系统堆栈段中，这显然是不合适的，因为系统堆栈段只有约7KB的空间。

之后调用do_getname()将filename从用户空间拷贝到分配到的系统物理页面上：

/*
 * do_getname() - copy a NUL-terminated name from user space into @page.
 *
 * @filename: user-space pointer to the name
 * @page:     kernel buffer that receives the copy
 *
 * Returns 0 on success, -EFAULT if @filename lies at or above TASK_SIZE
 * (checked only when the address limit is not KERNEL_DS), -ENOENT if
 * strncpy_from_user() reports 0, -ENAMETOOLONG if the reported length
 * reaches the copy limit, or the negative value from
 * strncpy_from_user() unchanged.
 */
static int do_getname(const char __user *filename, char *page)
{
	unsigned long limit = PATH_MAX;
	int copied;

	if (!segment_eq(get_fs(), KERNEL_DS)) {
		/* address limit differs from KERNEL_DS: validate the pointer */
		if ((unsigned long) filename >= TASK_SIZE)
			return -EFAULT;
		/*
		 * Fewer than PATH_MAX bytes remain between filename and
		 * TASK_SIZE: clamp the copy so it cannot run past the top
		 * of the user address range.
		 */
		if (TASK_SIZE - (unsigned long) filename < PATH_MAX)
			limit = TASK_SIZE - (unsigned long) filename;
	}

	copied = strncpy_from_user(page, filename, limit);
	if (copied < 0)
		return copied;
	if (copied == 0)
		return -ENOENT;
	if (copied < limit)
		return 0;
	return -ENAMETOOLONG;
}

对划红线部分代码的理解：在创建新进程的时候，有个copy_mm操作，将父进程的页目录和页表拷贝给子进程，同时将父进程中的可写页面也拷贝给子进程，只读的页面是不用拷贝的。但是我们运用了COW（写时复制）技术，因此在copy_mm中实际上并没有将页面拷贝给子进程，而是要等到子进程实际要用到这些页面，具体地说就是要往这些页面中写的时候，才会为子进程分配空闲页面。显然用户空间中用于实现堆栈的页面是可写的，因此在copy_mm的时候并没有将这些页面拷贝给子进程，当子进程用到自己的堆栈的时候，会重新分配新的干净的页面。那么子进程的第一个操作就是execve(argv)，我们知道用户空间参数是通过堆栈给定的，因此filename作为参数压入子进程的堆栈时，子进程会分配干净的堆栈页面，然后将*filename压栈，这是第一次使用子进程的堆栈，当然堆栈是空的，因此红线部分可以得出filename的长度。（但是有一个问题：库函数实现了不同的方式去调用系统调用sys_execve，而这些库函数大都不止一个参数，而filename或者pathname一般都是第一个参数，按照参数入栈次序，是不应该最先入栈的，那样的话从filename指针到TASK_SIZE之间就不仅仅存储的是filename了。难道库函数会对这个做处理？）这样的理解欠妥，更准确的解释可以参看博文《一个简单的进程创建的例子》。从代码本身看，这两行只是把拷贝长度限制在从filename到用户地址空间上界TASK_SIZE之间的字节数以内，避免拷贝越过用户空间的边界。

完成拷贝动作的函数，最终调用：

/*
 * Copy a null terminated string from userspace.
 *
 * @dst:   kernel destination buffer
 * @src:   user-space source pointer
 * @count: maximum number of bytes to copy (modified by the macro)
 * @res:   receives the result
 *
 * Operand map: %0 = res (edx), %1 = count (ecx), %2 = eax scratch,
 * %3 = esi (source), %4 = edi (destination), %5 = -EFAULT immediate.
 * res starts out equal to count (matching constraint "0").
 *
 * The loop copies one byte per iteration with lodsb/stosb and stops
 * either on the NUL byte (which is copied, but not counted because the
 * decl is skipped) or when count reaches zero.  "subl %1,%0" then
 * leaves res = initial count - remaining count, i.e. the number of
 * non-NUL bytes copied; res == initial count means no terminator was
 * found within the limit.  With count == 0 nothing is copied and res
 * is 0.  A fault on the lodsb at label 0 is redirected through the
 * exception table to label 3, which sets res to -EFAULT.
 *
 * Note: annotations inside the macro body must be block comments placed
 * before the trailing backslash; anything after the backslash (or a
 * "//" comment) would terminate or swallow the rest of the definition.
 */

#define __do_strncpy_from_user(dst, src, count, res)			   \
do {									   \
	int __d0, __d1, __d2;						   \
	might_fault();							   \
	__asm__ __volatile__(						   \
		"	testl %1,%1\n"					   \
		"	jz 2f\n"					   \
		"0:	lodsb\n"					   \
		"	stosb\n"					   \
		"	testb %%al,%%al\n"				   \
		"	jz 1f\n"					   \
		"	decl %1\n"					   \
		"	jnz 0b\n"					   \
		"1:	subl %1,%0\n"					   \
		"2:\n"							   \
		".section .fixup,\"ax\"\n"				   \
		"3:	movl %5,%0\n"					   \
		"	jmp 2b\n"					   \
		".previous\n"						   \
		_ASM_EXTABLE(0b,3b)					   \
		/* outputs: res in edx, count in ecx, __d0 in eax, */	   \
		/* __d1 in esi, __d2 in edi */				   \
		: "=&d"(res), "=&c"(count), "=&a" (__d0), "=&S" (__d1),	   \
		  "=&D" (__d2)						   \
		/* inputs: "0"/"1" preload edx and ecx with count, */	   \
		/* "3"/"4" preload esi with src and edi with dst */	   \
		: "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
		: "memory");						   \
} while (0)

这个函数高效的完成了拷贝工作，具体的解释可以参考《情景分析》P250。

现在我们终于将要执行的可执行文件的名字和路径拷贝到了系统空间，下面回到sys_execve，调用do_execve(filename, argv, envp, regs); 其主要工作在下面的do_execve_common()中完成：

/*
 * do_execve_common() - common worker that executes a new program.
 * (The header originally carried here read "sys_execve() executes a
 * new program.")
 *
 * @filename: kernel-space copy of the path of the file to execute
 * @argv:     user-space argument vector
 * @envp:     user-space environment vector
 * @regs:     register state, handed on to search_binary_handler()
 *
 * In order: unshare the file table, allocate a linux_binprm, prepare
 * credentials, run the unsafe-exec check, open the file, set up a
 * temporary mm, count and copy the argument/environment strings, and
 * finally let search_binary_handler() load the image.
 *
 * Returns the (non-negative) value of search_binary_handler() on
 * success.  On failure the error code in retval is returned after
 * unwinding through the out* labels, each of which releases what had
 * been acquired up to that point.
 */
static int do_execve_common(const char *filename,
				struct user_arg_ptr argv,
				struct user_arg_ptr envp,
				struct pt_regs *regs)
{
	struct linux_binprm *bprm; /* holds the arguments that are used when loading binaries; */
                                   /* the structure is listed further down for reference. */
	struct file *file;
	struct files_struct *displaced;
	bool clear_in_exec;
	int retval;

	retval = unshare_files(&displaced);
/*
 *   Kernel source comment for unshare_files():
 *    Helper to unshare the files of the current task.
 *    We don't want to expose copy_files internals to
 *    the exec layer of the kernel.
 *   NOTE(review): per the original annotation, unsharing only gives the
 *    task its own copy of the file-descriptor table (copy_files in
 *    do_fork copies the files_struct, not anything deeper); the files
 *    themselves are not copied.  Not visible here -- confirm.
 */
	if (retval)
		goto out_ret;

	retval = -ENOMEM;
	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
	if (!bprm)
		goto out_files;

	retval = prepare_bprm_creds(bprm); /* Prepare credentials and lock ->cred_guard_mutex. */
	if (retval)
		goto out_free;

	retval = check_unsafe_exec(bprm);
/*
 * determine how safe it is to execute the proposed program
 * - the caller must hold ->cred_guard_mutex to protect against
 *   PTRACE_ATTACH
 */

	if (retval < 0)
		goto out_free;
	clear_in_exec = retval; /* non-negative result decides whether out_unmark clears fs->in_exec */
	current->in_execve = 1;/* Tell the LSMs that the process is doing an execve */
	file = open_exec(filename); /* open the executable; the result is a struct file * or an error pointer (checked below) */
	retval = PTR_ERR(file);
	if (IS_ERR(file))
		goto out_unmark;

	sched_exec();
/*
 * When a new process is started with the exec system call, a good
 * opportunity for the scheduler to move the task across CPUs arises.
 * Naturally, it has not been running yet, so there cannot be any
 * negative effects on the CPU cache by moving the task to another CPU.
 * (quoted from Professional Linux Kernel Architecture, p. 125)
 */
	bprm->file = file;
	bprm->filename = filename;
	bprm->interp = filename;

	retval = bprm_mm_init(bprm);
/*
 * Create a new mm_struct and populate it with a temporary stack
 * vm_area_struct.  We don't have enough context at this point to set the stack
 * flags, permissions, and offset, so we use temporary values.  We'll update
 * them later in setup_arg_pages().
 */
	if (retval)
		goto out_file;

	/* count the argv entries; a negative count is an error code */
	bprm->argc = count(argv, MAX_ARG_STRINGS);
	if ((retval = bprm->argc) < 0)
		goto out;

	/* same for the environment */
	bprm->envc = count(envp, MAX_ARG_STRINGS);
	if ((retval = bprm->envc) < 0)
		goto out;

	retval = prepare_binprm(bprm);
/*
 * Fill the binprm structure from the inode.
 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
 * NOTE(review): per the original annotation, these first 128 bytes are
 * enough to identify the executable whatever its format -- confirm.
 */
	if (retval < 0)
		goto out;

	/* copy one string, the file name, into bprm (NOTE(review): presumably onto the new program's stack area -- confirm) */
	retval = copy_strings_kernel(1, &bprm->filename, bprm);
	if (retval < 0)
		goto out;

	bprm->exec = bprm->p;  /* remember the current top-of-memory position before the environment is copied */
	retval = copy_strings(bprm->envc, envp, bprm);
	if (retval < 0)
		goto out;

	retval = copy_strings(bprm->argc, argv, bprm);/* with this, file name, environment and arguments have all been collected into bprm */
	if (retval < 0)
		goto out;

	retval = search_binary_handler(bprm,regs);  /* core of the function: find a binary format handler that recognizes the image */
	if (retval < 0)
		goto out;

	/* execve succeeded */
	current->fs->in_exec = 0;
	current->in_execve = 0;
	acct_update_integrals(current);
	free_bprm(bprm);
	if (displaced)
		put_files_struct(displaced);
	return retval;

	/* error unwinding: labels are entered top-down, latest acquisition first */
out:
	if (bprm->mm) {
		acct_arg_size(bprm, 0);
		mmput(bprm->mm);
	}

out_file:
	if (bprm->file) {
		allow_write_access(bprm->file);
		fput(bprm->file);
	}

out_unmark:
	if (clear_in_exec)
		current->fs->in_exec = 0;
	current->in_execve = 0;

out_free:
	free_bprm(bprm);

out_files:
	if (displaced)
		reset_files_struct(displaced);
out_ret:
	return retval;
}

重要的结构体：

/*
 * This structure is used to hold the arguments that are used when loading binaries.
 * One instance is allocated per exec by do_execve_common() and handed to
 * the binary format handlers through search_binary_handler().
 */
struct linux_binprm {
	char buf[BINPRM_BUF_SIZE];	/* leading bytes of the file; search_binary_handler() inspects buf[0..3] */
#ifdef CONFIG_MMU
	struct vm_area_struct *vma;
	unsigned long vma_pages;
#else
# define MAX_ARG_PAGES	32
	struct page *page[MAX_ARG_PAGES];
#endif
	struct mm_struct *mm;	/* released with mmput() on the exec error path */
	unsigned long p; /* current top of mem */
	unsigned int
		cred_prepared:1,/* true if creds already prepared (multiple
				 * preps happen for interpreters) */
		cap_effective:1;/* true if has elevated effective capabilities,
				 * false if not; except for init which inherits
				 * its parent's caps anyway */
#ifdef __alpha__
	unsigned int taso:1;
#endif
	unsigned int recursion_depth;	/* saved and restored around each load_binary call */
	struct file * file;	/* the opened executable */
	struct cred *cred;	/* new credentials */
	int unsafe;		/* how unsafe this exec is (mask of LSM_UNSAFE_*) */
	unsigned int per_clear;	/* bits to clear in current->personality */
	int argc, envc;		/* number of argument / environment strings */
	const char * filename;	/* Name of binary as seen by procps */
	const char * interp;	/* Name of the binary really executed. Most
				   of the time same as filename, but could be
				   different for binfmt_{misc,script} */
	unsigned interp_flags;
	unsigned interp_data;
	unsigned long loader, exec;	/* exec is set from p in do_execve_common() before the environment is copied */
};

回到do_execve_common，我们来看一下这个函数的核心部分search_binary_handler()，关于这个函数的概要性的介绍可以参看《情景分析》P311：

/*
 * cycle the list of binary formats handler, until one recognizes the image
 *
 * @bprm: the binary being loaded
 * @regs: register state passed through to each load_binary handler
 *
 * Returns the handler's non-negative result on success; otherwise an
 * error code (the value from the security/audit checks, the last
 * handler's error, or -ENOENT if no handler was tried).
 *
 * The format list is walked at most twice: if the first pass ends with
 * -ENOEXEC and module support is configured, a "binfmt-XXXX" module is
 * requested and the list is walked once more.
 */
int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
{
	unsigned int depth = bprm->recursion_depth;
	int try,retval;
	struct linux_binfmt *fmt;

	retval = security_bprm_check(bprm);
	if (retval)
		return retval;

	retval = audit_bprm(bprm);
	if (retval)
		return retval;

	retval = -ENOENT;
	for (try=0; try<2; try++) {
		read_lock(&binfmt_lock);
		list_for_each_entry(fmt, &formats, lh) {
			int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
			if (!fn)
				continue;
			/* skip formats whose module reference cannot be taken */
			if (!try_module_get(fmt->module))
				continue;
			/* the list lock is dropped while the handler runs */
			read_unlock(&binfmt_lock);
			retval = fn(bprm, regs);
			/*
			 * Restore the depth counter to its starting value
			 * in this call, so we don't have to rely on every
			 * load_binary function to restore it on return.
			 */
			bprm->recursion_depth = depth;
			if (retval >= 0) {
				/* handler accepted the image: finish up and return */
				if (depth == 0)
					tracehook_report_exec(fmt, bprm, regs);
				put_binfmt(fmt);
				allow_write_access(bprm->file);
				if (bprm->file)
					fput(bprm->file);
				bprm->file = NULL;
				current->did_exec = 1;
				proc_exec_connector(current);
				return retval;
			}
			/* handler failed: re-take the lock before touching the list again */
			read_lock(&binfmt_lock);
			put_binfmt(fmt);
			/* only -ENOEXEC with an mm still attached lets the next format try */
			if (retval != -ENOEXEC || bprm->mm == NULL)
				break;
			if (!bprm->file) {
				read_unlock(&binfmt_lock);
				return retval;
			}
		}
		read_unlock(&binfmt_lock);
		if (retval != -ENOEXEC || bprm->mm == NULL) {
			break;
#ifdef CONFIG_MODULES
		} else {
			/* first four bytes all printable: no module is requested */
#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
			if (printable(bprm->buf[0]) &&
			    printable(bprm->buf[1]) &&
			    printable(bprm->buf[2]) &&
			    printable(bprm->buf[3]))
				break; /* -ENOEXEC */
			/* module name is built from bytes 2-3 of the file, then the list is walked again */
			request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
#endif
		}
	}
	return retval;
}

这个函数的核心是两层循环，内层循环对formats队列中的每个成员循环，让队列中的成员逐个试试它们的load_binary()函数，看能否对上号，如果对上了号，则将目标文件装入并投入运行。如果内层循环结束后没有找到适合运行这个文件的成员，那么如果内核支持动态安装模块，就调用request_module()函数，从文件系统中寻找适合执行该文件的代理人。如果有，就将该模块加载进来，再进行一次内层循环，查找适合的成员。如果还是没有找到，则返回出错。

这涉及到不同类型执行文件的不同的操作方式，不能详述，可以参看《情景分析》的a.out格式目标文件的装载和投运，以及《深入Linux内核框架》的elf格式目标文件的装载和投运。不过，不管什么类型的执行文件，基本上都做以下事情：

（1） It releases all resources used by the old process.
（2） It maps the application into virtual address space. The following segments must be taken into account (the variables specified are elements of the task structure and are set to the correct values by binary format handler)
（3） The text segment contains the executable code of the program. start_code and end_code
specify the area in address space where the segment resides.

（4） The pre-initialized data (variables supplied with a specific value at compilation time) are
        located between start_data and end_data and are mapped from the corresponding
          segment of the executable file.
（5）The heap used for dynamic memory allocation is placed in virtual address space; start_brk
   and brk specify its boundaries.
（6）The position of the stack is defined by start_stack; the stack grows downward
   automatically on nearly all machines. The only exception is currently PA-Risc. The inverse direction
         of stack growth must be noted by the architecture by setting the configuration symbol
         STACK_GROWSUP.

（7）The program arguments and the environment are mapped into the virtual address space
         and are located between arg_start and arg_end and env_start and env_end,
         respectively.
                                                                                         ————《深入Linux内核框架》 P81
回到do_execve_common（）中，在search_binary_handler后，做收尾工作：

	/* execve succeeded */
	current->fs->in_exec = 0;
	current->in_execve = 0;
	acct_update_integrals(current);  /* update mm integral fields in task_struct. NOTE(review): the original annotation says these are mainly time-related fields used later by the scheduler -- not visible here, confirm. */
	free_bprm(bprm);
	if (displaced)
		put_files_struct(displaced);
	return retval;

至此，完成了execve的过程！

总结起来，主要干了如下工作：

（1）将可执行文件的文件名从用户空间拷贝到内核空间： filename = getname(name);

（2）打开可执行文件： file = open_exec(filename);
（3）初始化用于在加载二进制可执行文件时存储与其相关的所有信息的linux_binprm数据结构： retval = bprm_mm_init(bprm);
（4）将运行所需的参数和环境变量收集到bprm中：连续的三个copy_strings()

（5）函数的核心是：search_binary_handler。加载可执行文件。

完成了execve的过程！

Fred_HBUT

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
可执行文件运行的系统调用

系统调用execve的入口sys_execve()/* * sys_execve() executes a new program. */long sys_execve(const char __user *name, //需要执行的文件的绝对路径（存于用户空间） const char __user *const __user *
复制链接

扫一扫