linux/module.h
/* Lifecycle states of a kernel module. */
enum module_state {
	/* The module is loaded and in normal use (live). */
	MODULE_STATE_LIVE = 0,
	/* The module is currently being loaded. */
	MODULE_STATE_COMING = 1,
	/* The module is currently being unloaded. */
	MODULE_STATE_GOING = 2,
};
struct module
{
enum module_state state;//模块状态
/* Member of list of modules */
struct list_head list;
//list是作为一个列表的成员,所有的内核模块都被维护在一个全局链表中,链表头是一个全局变量struct
// module modules。任何一个新创建的模块,都会被加入到这个链表的头部
/ Unique handle for this module /
char name[MODULE_NAME_LEN];//模块句柄 模块的名字
/ Sysfs stuff. */
struct module_kobject mkobj;
struct module_attribute *modinfo_attrs;
const char *version;
const char *srcversion;
struct kobject holders_dir;
/ Exported symbols */
const struct kernel_symbol *syms;//导出的符号
const unsigned long crcs;
unsigned int num_syms; //符号数量
/ GPL-only exported symbols. */
unsigned int num_gpl_syms;
const struct kernel_symbol *gpl_syms;
const unsigned long gpl_crcs;
#ifdef CONFIG_UNUSED_SYMBOLS
/ unused exported symbols. */
const struct kernel_symbol *unused_syms;
const unsigned long unused_crcs;
unsigned int num_unused_syms;
/ GPL-only, unused exported symbols. */
unsigned int num_unused_gpl_syms;
const struct kernel_symbol *unused_gpl_syms;
const unsigned long unused_gpl_crcs;
#endif
/ symbols that will be GPL-only in the near future. */
const struct kernel_symbol *gpl_future_syms;
const unsigned long gpl_future_crcs;
unsigned int num_gpl_future_syms;
/ Exception table */
unsigned int num_exentries;
struct exception_table_entry extable;
/ Startup function. */
int (init)(void); //模块初始化函数 模块入口函数
/ If this is non-NULL, vfree after init() returns */
void module_init;//???
/ Here is the actual code + data, vfree’d on unload. */
void module_core;
/ Here are the sizes of the init and core sections /
unsigned int init_size, core_size;
/ The size of the executable code in each section. /
unsigned int init_text_size, core_text_size;
/ The handle returned from unwind_add_table. */
void unwind_info;
/ Arch-specific module values /
struct mod_arch_specific arch;//体系结构
unsigned int taints; / same bits as kernel:tainted /
#ifdef CONFIG_GENERIC_BUG
/ Support for BUG */
unsigned num_bugs;
struct list_head bug_list;
struct bug_entry bug_table;
#endif
#ifdef CONFIG_KALLSYMS
/ We keep the symbol and string tables for kallsyms. */
Elf_Sym *symtab;
unsigned int num_symtab;
char strtab;
/ Section attributes */
struct module_sect_attrs sect_attrs;
/ Notes attributes */
struct module_notes_attrs notes_attrs;
#endif
/ Per-cpu data. */
void percpu;//针对每个CPU的数据
/ The command line arguments (may be mangled). People like
keeping pointers to this stuff */
char *args; //命令行参数
#ifdef CONFIG_MARKERS
struct marker *markers;
unsigned int num_markers;
#endif
#ifdef CONFIG_TRACEPOINTS
struct tracepoint tracepoints;
unsigned int num_tracepoints;
#endif
#ifdef CONFIG_MODULE_UNLOAD
/ What modules depend on me? /
struct list_head modules_which_use_me;//这个模块所依赖的模块链表
/ Who is waiting for us to be unloaded */
struct task_struct waiter; //正在等待这么模块被卸载的任务
/ Destruction function. */
void (exit)(void); //模块出口函数
/ Reference counts */
struct module_ref ref[NR_CPUS];//引用了一个计数
#endif
};
include/linux/init.h
/* initcalls are now grouped by functionality into separate
 * subsections. Ordering inside the subsections is determined
 * by link order.
 * For backwards compatibility, initcall() puts the call in
 * the device init subsection.
 *
 * The `id' arg to __define_initcall() is needed so that multiple initcalls
 * can point at the same handler without causing duplicate-symbol build errors.
 */
/*
 * Expands to a static initcall_t variable named __initcall_<fn><id>,
 * initialised to fn and placed in the section ".initcall<level>.init".
 */
#define __define_initcall(level,fn,id) \
	static initcall_t __initcall_##fn##id __used \
	__attribute__((__section__(".initcall" level ".init"))) = fn
///
/* Plain __initcall() is an alias for the device-level initcall. */
#define __initcall(fn) device_initcall(fn)
/**
 * module_init() - driver initialization entry point
 * @x: function to be run at kernel boot time or module insertion
 *
 * module_init() will either be called during do_initcalls() (if
 * builtin) or at module insertion time (if a module).  There can only
 * be one per module.
 */
/include/linux/init.h
/*
 * module_init(x) forwards to __initcall(x). The trailing semicolon is
 * part of the expansion, exactly as quoted from the header.
 */
#define module_init(x)	__initcall(x);
/* A device initcall is an initcall at level "6" with id 6. */
#define device_initcall(fn)	__define_initcall("6",fn,6)
可以发现这些*_initcall(fn)最终都是通过__define_initcall(level,fn)宏定义生成的。(注:下面引用的是较旧版本的宏定义,没有id参数。)
__define_initcall宏定义如下:
/*
 * Older two-argument form (no id): expands to a static initcall_t
 * variable named __initcall_<fn>, initialised to fn and placed in the
 * section ".initcall<level>.init".
 */
#define __define_initcall(level,fn) \
	static initcall_t __initcall_##fn __attribute_used__ \
	__attribute__((__section__(".initcall" level ".init"))) = fn
这句话的意思为:定义一个initcall_t类型的函数指针并将其初始化为fn,该指针被存放在名为“.initcall<level>.init”的section内(<level>即宏的level参数)。这些.initcall<level>.init section定义在vmlinux.lds内。
/* arch/arm/kernel/vmlinux.lds */
/* Start of the initcall function-pointer table. */
__initcall_start = .;
	*(.initcallearly.init)
/* End of the early entries; do_initcalls() begins its loop here. */
__early_initcall_end = .;
	*(.initcall0.init)
	*(.initcall0s.init)
	*(.initcall1.init)
	*(.initcall1s.init)
	*(.initcall2.init)
	*(.initcall2s.init)
	*(.initcall3.init)
	*(.initcall3s.init)
	*(.initcall4.init)
	*(.initcall4s.init)
	*(.initcall5.init)
	*(.initcall5s.init)
	*(.initcallrootfs.init)
	*(.initcall6.init)
	*(.initcall6s.init)
	*(.initcall7.init)
	*(.initcall7s.init)
/* End of the initcall table. */
__initcall_end = .;
正好包括了上面init.h里定义的从core_initcall到late_initcall等7个level等级的.initcall”level”.init section. 因此通过不同的*_initcall声明的函数指针最终都会存放不同level等级的.initcall”level”.init section内。这些不同level的section按level等级高低依次存放。
下面我们再来看看,内核是什么时候调用存储在.initcall”level”.init section内的函数的。
内核是通过do_initcalls函数循环调用执行initcall.init section内的函数的,流程如下:
main.c
start_kernel -> rest_init -> kernel_thread -> kernel_init -> do_basic_setup -> do_initcalls
init/main.c
extern initcall_t __initcall_start[], __initcall_end[], __early_initcall_end[];

/*
 * Invoke, in table order, every initcall stored from
 * __early_initcall_end up to (not including) __initcall_end, then
 * flush any scheduled work left over from the sequence.
 *
 * do_one_initcall() is the same helper the init_module system call in
 * kernel/module.c uses to run a module's init function.
 */
static void __init do_initcalls(void)
{
	initcall_t *entry = __early_initcall_end;

	while (entry < __initcall_end) {
		do_one_initcall(*entry);	/* call back into the registered function */
		entry++;
	}

	/* Make sure there is no pending stuff from the initcall sequence */
	flush_scheduled_work();
}
/
/*
 * Search for module by name: must hold module_mutex.
 *
 * Walks the global `modules` list and returns the first entry whose
 * name compares equal to @name, or NULL when no entry matches.
 */
static struct module *find_module(const char *name)
{
	struct module *found = NULL;
	struct module *pos;

	list_for_each_entry(pos, &modules, list) {
		if (!strcmp(pos->name, name)) {
			found = pos;
			break;
		}
	}

	return found;
}
///
static int percpu_modinit(void)
{
pcpu_num_used = 2;
pcpu_num_allocated = 2;
pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
GFP_KERNEL);
/* Static in-kernel percpu data (used). /
pcpu_size[0] = -(__per_cpu_end-__per_cpu_start);
/ Free room. */
pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
if (pcpu_size[1] < 0) {
printk(KERN_ERR “No per-cpu room for modules.\n”);
pcpu_num_used = 1;
}
return 0;
}
__initcall(percpu_modinit);
模块加载由内核的系统调用init_module完成。
linux3.5.2/kernel/module.c:3009
/* This is where the real work happens */
/*
 * Abridged excerpt of the init_module system call: the elided parts are
 * marked with "……". It shows only the two key steps, loading the
 * module image and then running the module's init function.
 */
SYSCALL_DEFINE3(init_module, void __user *, umod,
unsigned long, len, const char __user *, uargs)
{
struct module *mod;
int ret = 0;
……
/* Do all the hard work */
mod = load_module(umod, len, uargs);// load the module
……
/* Start the module */
if (mod->init != NULL)
ret = do_one_initcall(mod->init);// invoke the module's init function
……
return 0;
}
模块加载
系统调用init_module由SYSCALL_DEFINE3(init_module…)实现,其中有两个关键的函数调用。load_module用于模块加载,do_one_initcall用于回调模块的init函数。
/* This is where the real work happens */
/*
 * init_module system call.
 *
 * Requires CAP_SYS_MODULE (-EPERM otherwise). Takes module_mutex
 * (-EINTR if interrupted), loads the module with load_module() and
 * returns its error if that fails. The mutex is then dropped, the
 * MODULE_STATE_COMING notifiers run, and the module's init function,
 * if any, is invoked through do_one_initcall().
 *
 * A negative init result tears the module down and is returned to the
 * caller. A positive result is only warned about. On success the module
 * becomes MODULE_STATE_LIVE, its init region is freed, and 0 is returned.
 */
SYSCALL_DEFINE3(init_module, void __user *, umod,
		unsigned long, len, const char __user *, uargs)
{
	struct module *mod;
	int ret = 0;

	/* Must have permission */
	if (!capable(CAP_SYS_MODULE))
		return -EPERM;

	/* Only one module load at a time, please */
	if (mutex_lock_interruptible(&module_mutex) != 0)
		return -EINTR;

	/* Do all the hard work */
	mod = load_module(umod, len, uargs);
	if (IS_ERR(mod)) {
		mutex_unlock(&module_mutex);
		return PTR_ERR(mod);
	}

	/* Drop lock so they can recurse */
	mutex_unlock(&module_mutex);

	blocking_notifier_call_chain(&module_notify_list,
			MODULE_STATE_COMING, mod);

	/*
	 * Start the module. This is the insertion-time path for loadable
	 * modules; code built into the kernel has its initcalls run from
	 * do_initcalls(), which goes through do_one_initcall() as well.
	 */
	if (mod->init != NULL)
		ret = do_one_initcall(mod->init);
	if (ret < 0) {
		/* Init routine failed: abort.  Try to protect us from
		   buggy refcounters. */
		mod->state = MODULE_STATE_GOING;
		synchronize_sched();
		module_put(mod);
		blocking_notifier_call_chain(&module_notify_list,
					     MODULE_STATE_GOING, mod);
		mutex_lock(&module_mutex);
		free_module(mod);
		mutex_unlock(&module_mutex);
		wake_up(&module_wq);
		return ret;
	}
	if (ret > 0) {
		printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, "
				    "it should follow 0/-E convention\n"
		       KERN_WARNING "%s: loading module anyway...\n",
		       __func__, mod->name, ret,
		       __func__);
		dump_stack();
	}

	/* Now it's a first class citizen!  Wake up anyone waiting for it. */
	mod->state = MODULE_STATE_LIVE;
	wake_up(&module_wq);

	mutex_lock(&module_mutex);
	/* Drop initial reference. */
	module_put(mod);
	unwind_remove_table(mod->unwind_info, 1);
	/* The init region is no longer needed once init has run. */
	module_free(mod, mod->module_init);
	mod->module_init = NULL;
	mod->init_size = 0;
	mod->init_text_size = 0;
	mutex_unlock(&module_mutex);

	return 0;
}
//
模块卸载由内核的系统调用delete_module完成。
/*
 * delete_module system call: unload the module named by name_user.
 *
 * Returns -EPERM without CAP_SYS_MODULE, -EFAULT if the name cannot be
 * copied from user space, -EINTR if taking module_mutex is interrupted,
 * -ENOENT if no such module is found, -EWOULDBLOCK if other modules
 * still use it, and -EBUSY if it is not in MODULE_STATE_LIVE or if it
 * has an init function but no exit function and forcing is refused.
 * Otherwise returns the result of try_stop_module() (0 on success).
 */
SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
unsigned int, flags)
{
struct module *mod;
char name[MODULE_NAME_LEN];
int ret, forced = 0;
if (!capable(CAP_SYS_MODULE))
return -EPERM;
if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
return -EFAULT;
/* Guarantee termination of the copied name. */
name[MODULE_NAME_LEN-1] = '\0';
if (mutex_lock_interruptible(&module_mutex) != 0)
return -EINTR;
mod = find_module(name);
if (!mod) {
ret = -ENOENT;
goto out;
}
if (!list_empty(&mod->modules_which_use_me)) {
/* Other modules depend on us: get rid of them first. */
ret = -EWOULDBLOCK;
goto out;
}
/* Doing init or already dying? */
if (mod->state != MODULE_STATE_LIVE) {
/* FIXME: if (force), slam module count and wake up
waiter --RR */
DEBUGP("%s already dying\n", mod->name);
ret = -EBUSY;
goto out;
}
/* If it has an init func, it must have an exit func to unload */
if (mod->init && !mod->exit) {
forced = try_force_unload(flags);
if (!forced) {
/* This module can't be removed */
ret = -EBUSY;
goto out;
}
}
/* Set this up before setting mod->state */
mod->waiter = current;
/* Stop the machine so refcounts can't move and disable module. */
ret = try_stop_module(mod, flags, &forced);
if (ret != 0)
goto out;
/* Never wait if forced. */
if (!forced && module_refcount(mod) != 0)
wait_for_zero_refcount(mod);
/* The mutex is released while the exit hook and notifiers run. */
mutex_unlock(&module_mutex);
/* Final destruction now no one is using it. */
if (mod->exit != NULL)
mod->exit();// call the module's exit function
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
/* Retake the mutex; the shared "out" path below releases it. */
mutex_lock(&module_mutex);
/* Store the name of the last unloaded module for diagnostic purposes */
strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
unregister_dynamic_debug_module(mod->name);
free_module(mod);// free (unload) the module
out:
mutex_unlock(&module_mutex);
return ret;
}
函数在内核中的位置:linux-2.6.30/kernel/module.c
该函数的功能是将一个特定模块module的引用计数减1 ,这样当一个模块的引用计数因为不为0而不能从内核中卸载时,可以调用此函数一次或多次,实现对模块计数的清零,从而实现模块卸载。
void module_put(struct module module)
{
if (module) {
unsigned int cpu = get_cpu();
local_dec(&module->ref[cpu].count);
/ Maybe they’re waiting for us to drop reference? /
if (unlikely(!module_is_live(module)))
wake_up_process(module->waiter);
put_cpu();
}
}
//
/
modprobe_path is set via /proc/sys.
*/
char modprobe_path[KMOD_PATH_LEN] = “/sbin/modprobe”;
/**
 * request_module - try to load a kernel module
 * @fmt: printf style format string for the name of the module
 * @varargs: arguments as specified in the format string
 *
 * Load a module using the user mode module loader. The function returns
 * zero on success or a negative errno code on failure. Note that a
 * successful module load does not mean the module did not then unload
 * and exit on an error of its own. Callers must check that the service
 * they requested is now available not blindly invoke it.
 *
 * If module auto-loading support is disabled then this function
 * becomes a no-operation.
 */
kernel/kmod.c
int request_module(const char *fmt, …)
{
va_list args;
char module_name[MODULE_NAME_LEN];
unsigned int max_modprobes;
int ret;
char *argv[] = { modprobe_path, “-q”, “–”, module_name, NULL };
static char envp[] = { “HOME=/”,
“TERM=linux”,
“PATH=/sbin:/usr/sbin:/bin:/usr/bin”,
NULL };
static atomic_t kmod_concurrent = ATOMIC_INIT(0);
#define MAX_KMOD_CONCURRENT 50 / Completely arbitrary value - KAO */
static int kmod_loop_msg;
va_start(args, fmt);
ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
va_end(args);
if (ret >= MODULE_NAME_LEN)
return -ENAMETOOLONG;
/* If modprobe needs a service that is in a module, we get a recursive
- loop. Limit the number of running kmod threads to max_threads/2 or
- MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method
- would be to run the parents of this process, counting how many times
- kmod was invoked. That would mean accessing the internals of the
- process tables to get the command line, proc_pid_cmdline is static
- and it is not worth changing the proc code just to handle this case.
- KAO.
- “trace the ppid” is simple, but will fail if someone’s
- parent exits. I think this is as good as it gets. --RR
/
max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT);
atomic_inc(&kmod_concurrent);
if (atomic_read(&kmod_concurrent) > max_modprobes) {
/ We may be blaming an innocent here, but unlikely */
if (kmod_loop_msg++ < 5)
printk(KERN_ERR
“request_module: runaway loop modprobe %s\n”,
module_name);
atomic_dec(&kmod_concurrent);
return -ENOMEM;
}
ret = call_usermodehelper(modprobe_path, argv, envp, 1);
atomic_dec(&kmod_concurrent);
return ret;
}
Linux把内核也看作一个模块。那么模块与模块之间如何进行交互呢?一种常用的方法就是共享变量和函数。但并不是模块中的每个变量和函数都能被共享,内核只把各个模块中主要的变量和函数放在一个特定的区段,这些变量和函数就统称为符号。到底哪些符号可以被共享?Linux内核有自己的规定,例如fs/sysfs/symlink.c中的 EXPORT_SYMBOL_GPL(sysfs_create_link);
其中宏定义EXPORT_SYMBOL()本身的含义是“移出符号”。为什么说是“移出”呢?因为这些符号本来是内核内部的符号,通过这个宏放在一个公开的地方,使得装入到内核中的其他模块可以引用它们。
实际上,仅仅知道这些符号的名字是不够的,还得知道它们在内核映像中的地址才有意义。因此,内核中定义了如下结构来描述模块的符号:
/* Describes one exported symbol: its address and its name. */
struct module_symbol
{
	/* Address of the symbol in the kernel image. */
	unsigned long value;
	/* Pointer to the symbol's name string. */
	const char *name;
};
从后面对EXPORT_SYMBOL宏的定义可以看出,连接程序(ld)在连接内核映像时将这个结构存放在一个叫做“__ksymtab”的区段中,而这个区段中所有的符号就组成了模块对外“移出”的符号表,这些符号可供内核及已安装的模块来引用。而其他“对内”的符号则由连接程序自行生成,并仅供内部使用。
与EXPORT_SYMBOL相关的定义在include/linux/module.h中:
/* Two-step stringification so that macro arguments are expanded first. */
#define __MODULE_STRING_1(x) #x
#define __MODULE_STRING(x) __MODULE_STRING_1(x)

/*
 * Emit two objects for an exported symbol:
 *   __kstrtab_<sym>: the name string, placed in section ".kstrtab";
 *   __ksymtab_<sym>: a struct module_symbol holding the symbol's address
 *                    and a pointer to that string, placed in "__ksymtab".
 */
#define __EXPORT_SYMBOL(sym, str)			\
const char __kstrtab_##sym[]				\
__attribute__((section(".kstrtab"))) = str;		\
const struct module_symbol __ksymtab_##sym		\
__attribute__((section("__ksymtab"))) =			\
{ (unsigned long)&sym, __kstrtab_##sym }
#if defined(MODVERSIONS) || !defined(CONFIG_MODVERSIONS)
#define EXPORT_SYMBOL(var) __EXPORT_SYMBOL(var, __MODULE_STRING(var))
下面我们以EXPORT_SYMBOL(schedule)为例,来看一下这个宏的结果是什么。首先EXPORT_SYMBOL(schedule)被展开成__EXPORT_SYMBOL(schedule, "schedule")。而__EXPORT_SYMBOL()定义了两个语句:第一个语句定义了一个名为__kstrtab_schedule的字符串,将字符串的内容初始化为"schedule",并将其置于内核映像中的.kstrtab区段,注意这是一个专门存放符号名字符串的区段。第二个语句则定义了一个名为__ksymtab_schedule的module_symbol结构,将其初始化为{(unsigned long)&schedule, __kstrtab_schedule},并将其置于内核映像中的__ksymtab区段。这样,module_symbol结构中的域value的值就为schedule在内核映像中的地址,而指针name则指向字符串"schedule"。