Linux内核模块加载深度剖析(上篇)

文章详细剖析了Linux系统中使用insmod命令加载内核模块的过程,从busybox的insmod入口函数insmod_main()开始,经过参数解析,映射模块文件,直至内核空间的init_module()系统调用。讲解了模块如何从用户空间拷贝到内核空间,并简要提及了内核加载模块的一些关键步骤。
摘要由CSDN通过智能技术生成

Linux 内核模块加载深度剖析(上篇)

作者简介

微信公众号『嵌入式 Linux 开发』作者,专注于嵌入式 Linux 下的内核、驱动和系统软件开发,专注于基础知识和项目实战分享。

insmod 入口函数

本文用到的 busybox 版本为 1.34.1,Linux 内核版本为 4.14.294

insmod_main()函数是 insmod 命令的入口函数,该函数首先通过函数参数获取被加载模块的名字并存入局部指针变量 filename,然后调用bb_init_module()函数进行后续操作。

int insmod_main(int argc UNUSED_PARAM, char **argv)
{
	char *filename;
	int rc;

	/* Compat note:
	 * 2.6 style insmod has no options and required filename
	 * (not module name - .ko can't be omitted).
	 * 2.4 style insmod can take module name without .o
	 * and performs module search in default directories
	 * or in $MODPATH.
	 */

	IF_FEATURE_2_4_MODULES(
		getopt32(argv, INSMOD_OPTS INSMOD_ARGS);
		argv += optind - 1;
	);

	filename = *++argv;
	if (!filename)
		bb_show_usage();

	rc = bb_init_module(filename, parse_cmdline_module_options(argv, /*quote_spaces:*/ 0));
	if (rc)
		bb_error_msg("can't insert '%s': %s", filename, moderror(rc));

	return rc;
}

模块参数解析函数

parse_cmdline_module_options()函数会解析模块加载时传给模块的参数,通过while循环挨个解析模块后面传给模块的参数,并将解析出来的参数值val存入指针变量options指向的内存空间,最后返回该内存空间的首地址。

char* FAST_FUNC parse_cmdline_module_options(char **argv, int quote_spaces)
{
	char *options;
	int optlen;

	options = xzalloc(1);
	optlen = 0;
	while (*++argv) {
		const char *fmt;
		const char *var;
		const char *val;

		var = *argv;
		options = xrealloc(options, optlen + 2 + strlen(var) + 2);
		fmt = "%.*s%s ";
		val = strchrnul(var, '=');
		if (quote_spaces) {
			/*
			 * modprobe (module-init-tools version 3.11.1) compat:
			 * quote only value:
			 * var="val with spaces", not "var=val with spaces"
			 * (note: var *name* is not checked for spaces!)
			 */
			if (*val) { /* has var=val format. skip '=' */
				val++;
				if (strchr(val, ' '))
					fmt = "%.*s\"%s\" ";
			}
		}
		optlen += sprintf(options + optlen, fmt, (int)(val - var), var, val);
	}
	/* Remove trailing space. Disabled */
	/* if (optlen != 0) options[optlen-1] = '\0'; */
	return options;
}

映射模块文件

bb_init_module()函数首先判断模块有没有参数传入,调用try_to_mmap_module()函数完成后续映射工作,该函数接收两个参数:被加载模块的名字(filename),模块的大小(image_size)作为出参参数传入。最后调用init_module()函数,init_module()函数是系统调用函数,对应的内核函数是sys_init_module()函数,进入到内核空间。传入的参数分别是:模块内存空间首地址(image),模块大小(image_size),模块参数内存空间首地址(options)。

int FAST_FUNC bb_init_module(const char *filename, const char *options)
{
	size_t image_size;
	char *image;
	int rc;
	bool mmaped;

	if (!options)
		options = "";

//TODO: audit bb_init_module_24 to match error code convention
#if ENABLE_FEATURE_2_4_MODULES
	if (get_linux_version_code() < KERNEL_VERSION(2,6,0))
		return bb_init_module_24(filename, options);
#endif

	/*
	 * First we try finit_module if available.  Some kernels are configured
	 * to only allow loading of modules off of secure storage (like a read-
	 * only rootfs) which needs the finit_module call.  If it fails, we fall
	 * back to normal module loading to support compressed modules.
	 */
# ifdef __NR_finit_module
	{
		int fd = open(filename, O_RDONLY | O_CLOEXEC);
		if (fd >= 0) {
			rc = finit_module(fd, options, 0) != 0;
			close(fd);
			if (rc == 0)
				return rc;
		}
	}
# endif

	image_size = INT_MAX - 4095;
	mmaped = 0;
	image = try_to_mmap_module(filename, &image_size);
	if (image) {
		mmaped = 1;
	} else {
		errno = ENOMEM; /* may be changed by e.g. open errors below */
		image = xmalloc_open_zipped_read_close(filename, &image_size);
		if (!image)
			return -errno;
	}

	errno = 0;
	init_module(image, image_size, options);
	rc = errno;
	if (mmaped)
		munmap(image, image_size);
	else
		free(image);
	return rc;
}

try_to_mmap_module()函数首先打开模块文件获取模块文件描述符fd,然后通过fstat()函数获取模块文件的详细信息,判断模块文件的大小st_size是否超过了设定的文件最大值,调用mmap_read()函数以只读的方式将模块文件的内容映射进内存空间,并返回该内存空间的首地址,通过*(uint32_t*)image != SWAP_BE32(0x7f454C46)检查模块文件是否符号 ELF 标准格式,最后将内存空间的首地址image返回。通过try_to_mmap_module()函数我们就获取了模块文件内容在内存空间的地址。

void* FAST_FUNC try_to_mmap_module(const char *filename, size_t *image_size_p)
{
	/* We have user reports of failure to load 3MB module
	 * on a 16MB RAM machine. Apparently even a transient
	 * memory spike to 6MB during module load
	 * is too big for that system. */
	void *image;
	struct stat st;
	int fd;

	fd = xopen(filename, O_RDONLY);
	fstat(fd, &st);
	image = NULL;
	/* st.st_size is off_t, we can't just pass it to mmap */
	if (st.st_size <= *image_size_p) {
		size_t image_size = st.st_size;
		image = mmap_read(fd, image_size);
		if (image == MAP_FAILED) {
			image = NULL;
		} else if (*(uint32_t*)image != SWAP_BE32(0x7f454C46)) {
			/* No ELF signature. Compressed module? */
			munmap(image, image_size);
			image = NULL;
		} else {
			/* Success. Report the size */
			*image_size_p = image_size;
		}
	}
	close(fd);
	return image;
}

init_module()开始调用关系会进入到 Linux 内核源码。

init_module()其实是一个宏定义,最终会调用到__NR_init_module系统调用号对应的系统调用函数是sys_init_module(),该对应关系位于 Linux 内核源码include/uapi/asm-generic/unistd.h文件中。关于 Linux 系统调用的知识,后面会专门写个文章分析 Linux 系统调用的实现机制,并手写一个内核没有的系统调用。

#define init_module(mod, len, opts) syscall(__NR_init_module, mod, len, opts)
#define __NR_init_module 105
__SYSCALL(__NR_init_module, sys_init_module)

sys_init_module()函数是由宏定义SYSCALL_DEFINE3展开形成的,该定义位于文件include/linux/syscalls.h

#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINEx(x, sname, ...)				\
	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\
	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
#define __SYSCALL_DEFINEx(x, name, ...)					\
	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
		__attribute__((alias(__stringify(SyS##name))));		\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));	\
	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
	{								\
		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
		__MAP(x,__SC_TEST,__VA_ARGS__);				\
		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
		return ret;						\
	}								\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))

SYSCALL_DEFINE3的实现位于kernel/module.c文件中,该函数首先调用may_init_module()函数判断用户是否有加载模块的权限,调用copy_module_from_user()函数将模块文件的内容从用户空间内存地址拷贝到内核空间内存地址,具体实现后面会分析,最后调用load_module()函数,细节详见下面分析。

SYSCALL_DEFINE3(init_module, void __user *, umod,
		unsigned long, len, const char __user *, uargs)
{
	int err;
	struct load_info info = { };

	err = may_init_module();
	if (err)
		return err;

	pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
	       umod, len, uargs);

	err = copy_module_from_user(umod, len, &info);
	if (err)
		return err;

	return load_module(&info, uargs, 0);
}

copy_module_from_user()函数首先给load_info结构体成员info->len赋值为模块大小len,调用__vmalloc()函数在内核空间为模块分配info->len大小的内存空间,并返回内核内存空间的的起始地址info->hdr,最后调用copy_chunked_from_user()函数其实就是copy_from_user()函数将用户空间内存模块文件内容拷贝到info->hdr所指向的内核空间内存地址

static int copy_module_from_user(const void __user *umod, unsigned long len,
				  struct load_info *info)
{
	int err;

	info->len = len;
	if (info->len < sizeof(*(info->hdr)))
		return -ENOEXEC;

	err = security_kernel_read_file(NULL, READING_MODULE);
	if (err)
		return err;

	/* Suck in entire file: we'll want most of it. */
	info->hdr = __vmalloc(info->len,
			GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL);
	if (!info->hdr)
		return -ENOMEM;

	if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) {
		vfree(info->hdr);
		return -EFAULT;
	}

	return 0;
}

至此,模块文件已经从用户空间拷贝到内核空间。

模块加载

鉴于模块加载函数load_module()比较复杂,限于篇幅限制,具体的加载过程会在《Linux内核模块加载深度剖析(中篇)》一文中分析。

static int load_module(struct load_info *info, const char __user *uargs,
		       int flags)
{
	struct module *mod;
	long err;
	char *after_dashes;

	err = module_sig_check(info, flags);
	if (err)
		goto free_copy;

	err = elf_header_check(info);
	if (err)
		goto free_copy;

	/* Figure out module layout, and allocate all the memory. */
	mod = layout_and_allocate(info, flags);
	if (IS_ERR(mod)) {
		err = PTR_ERR(mod);
		goto free_copy;
	}

	audit_log_kern_module(mod->name);

	/* Reserve our place in the list. */
	err = add_unformed_module(mod);
	if (err)
		goto free_module;

#ifdef CONFIG_MODULE_SIG
	mod->sig_ok = info->sig_ok;
	if (!mod->sig_ok) {
		pr_notice_once("%s: module verification failed: signature "
			       "and/or required key missing - tainting "
			       "kernel\n", mod->name);
		add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
	}
#endif

	/* To avoid stressing percpu allocator, do this once we're unique. */
	err = percpu_modalloc(mod, info);
	if (err)
		goto unlink_mod;

	/* Now module is in final location, initialize linked lists, etc. */
	err = module_unload_init(mod);
	if (err)
		goto unlink_mod;

	init_param_lock(mod);

	/* Now we've got everything in the final locations, we can
	 * find optional sections. */
	err = find_module_sections(mod, info);
	if (err)
		goto free_unload;

	err = check_module_license_and_versions(mod);
	if (err)
		goto free_unload;

	/* Set up MODINFO_ATTR fields */
	setup_modinfo(mod, info);

	/* Fix up syms, so that st_value is a pointer to location. */
	err = simplify_symbols(mod, info);
	if (err < 0)
		goto free_modinfo;

	err = apply_relocations(mod, info);
	if (err < 0)
		goto free_modinfo;

	err = post_relocation(mod, info);
	if (err < 0)
		goto free_modinfo;

	flush_module_icache(mod);

	/* Now copy in args */
	mod->args = strndup_user(uargs, ~0UL >> 1);
	if (IS_ERR(mod->args)) {
		err = PTR_ERR(mod->args);
		goto free_arch_cleanup;
	}

	dynamic_debug_setup(mod, info->debug, info->num_debug);

	/* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
	ftrace_module_init(mod);

	/* Finally it's fully formed, ready to start executing. */
	err = complete_formation(mod, info);
	if (err)
		goto ddebug_cleanup;

	err = prepare_coming_module(mod);
	if (err)
		goto bug_cleanup;

	/* Module is ready to execute: parsing args may do that. */
	after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
				  -32768, 32767, mod,
				  unknown_module_param_cb);
	if (IS_ERR(after_dashes)) {
		err = PTR_ERR(after_dashes);
		goto coming_cleanup;
	} else if (after_dashes) {
		pr_warn("%s: parameters '%s' after `--' ignored\n",
		       mod->name, after_dashes);
	}

	/* Link in to sysfs. */
	err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
	if (err < 0)
		goto coming_cleanup;

	if (is_livepatch_module(mod)) {
		err = copy_module_elf(mod, info);
		if (err < 0)
			goto sysfs_cleanup;
	}

	/* Get rid of temporary copy. */
	free_copy(info);

	/* Done! */
	trace_module_load(mod);

	return do_init_module(mod);

 sysfs_cleanup:
	mod_sysfs_teardown(mod);
 coming_cleanup:
	mod->state = MODULE_STATE_GOING;
	destroy_params(mod->kp, mod->num_kp);
	blocking_notifier_call_chain(&module_notify_list,
				     MODULE_STATE_GOING, mod);
	klp_module_going(mod);
 bug_cleanup:
	mod->state = MODULE_STATE_GOING;
	/* module_bug_cleanup needs module_mutex protection */
	mutex_lock(&module_mutex);
	module_bug_cleanup(mod);
	mutex_unlock(&module_mutex);

	/* we can't deallocate the module until we clear memory protection */
	module_disable_ro(mod);
	module_disable_nx(mod);

 ddebug_cleanup:
	dynamic_debug_remove(mod, info->debug);
	synchronize_sched();
	kfree(mod->args);
 free_arch_cleanup:
	module_arch_cleanup(mod);
 free_modinfo:
	free_modinfo(mod);
 free_unload:
	module_unload_free(mod);
 unlink_mod:
	mutex_lock(&module_mutex);
	/* Unlink carefully: kallsyms could be walking list. */
	list_del_rcu(&mod->list);
	mod_tree_remove(mod);
	wake_up_all(&module_wq);
	/* Wait for RCU-sched synchronizing before releasing mod->list. */
	synchronize_sched();
	mutex_unlock(&module_mutex);
 free_module:
	/*
	 * Ftrace needs to clean up what it initialized.
	 * This does nothing if ftrace_module_init() wasn't called,
	 * but it must be called outside of module_mutex.
	 */
	ftrace_release_mod(mod);
	/* Free lock-classes; relies on the preceding sync_rcu() */
	lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);

	module_deallocate(mod, info);
 free_copy:
	free_copy(info);
	return err;
}

未完待续。。。

总结

上篇主要介绍了模块文件从用户空间拷贝到内核空间的过程,从busybox源码开始一直到Linux内核源码。关注“嵌入式Linux开发”,持续更新更多嵌入式Linux开发方面的知识。

评论 12
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

→嵌入式Linux开发

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值