linux启动流程(二)

最新推荐文章于 2023-04-16 15:31:45 发布

NeilLiu200

最新推荐文章于 2023-04-16 15:31:45 发布

阅读量132

点赞数

分类专栏： linux 文章标签： linux 系统启动

本文链接：https://blog.csdn.net/liumingzhuo/article/details/102457975

版权

linux 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

Linux 启动流程-内核初始化

想要了解内核的启动过程，就需要查看 Linux 源码了（本文基于 Linux 5.2）。查看源码的过程中我们发现，内核启动的入口是 start_kernel() 函数，它定义在 init 目录下的 main.c 文件中（init/main.c）。

我们在此方法里发现很多xxx_init的方法，也就是做一些初始化操作。

/*
 * Abridged excerpt of start_kernel(); each "..." marks code omitted from
 * this excerpt. Only the calls discussed in the article are shown, in the
 * order they appear here.
 */
asmlinkage __visible void __init start_kernel(void)
{
  ...
  /* Stamp the end-of-stack magic for init_task. */
  set_task_stack_end_magic(&init_task);
  ...
  vfs_caches_init();
  ...
  sched_init();
	... 
	/* Do the rest non-__init'ed, we're now alive */
	arch_call_rest_init();
}

初始化第一个进程

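我们知道 Linux 系统中的所有进程都是通过父进程 fork 来的，那么内核启动时需要一个祖先进程，这个进程我们称之为0号进程。0号进程的 task_struct 是静态定义的 init_task，start_kernel() 首先调用 set_task_stack_end_magic(&init_task)，在它的内核栈末端写入 STACK_END_MAGIC，用于检测栈溢出。set_task_stack_end_magic 方法位于 kernel/fork.c 文件中。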

/*
 * Store STACK_END_MAGIC in the word that end_of_stack() returns for @tsk,
 * so that a stack overflow can be detected later.
 */
void set_task_stack_end_magic(struct task_struct *tsk)
{
	unsigned long *magic_slot = end_of_stack(tsk);

	/* for overflow detection */
	*magic_slot = STACK_END_MAGIC;
}

参数init_task是一个结构体，其定义在init_task.c文件中，做一些结构体的填充。

struct task_struct init_task
#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK
	__init_task_data
#endif
= {
#ifdef CONFIG_THREAD_INFO_IN_TASK
	.thread_info	= INIT_THREAD_INFO(init_task),
	.stack_refcount	= REFCOUNT_INIT(1),
#endif
	.state		= 0,
	.stack		= init_stack,
	.usage		= REFCOUNT_INIT(2),
	.flags		= PF_KTHREAD,
	.prio		= MAX_PRIO - 20,
	.static_prio	= MAX_PRIO - 20,
	.normal_prio	= MAX_PRIO - 20,
	.policy		= SCHED_NORMAL,
	.cpus_allowed	= CPU_MASK_ALL,
	.nr_cpus_allowed= NR_CPUS,
	...

接下来是 **boot_cpu_init()** 函数，位于 kernel/cpu.c 文件中。

/*
 * Flag the CPU returned by smp_processor_id() as online, active, present
 * and possible. When CONFIG_SMP is set, also remember its id in
 * __boot_cpu_id.
 */
void __init boot_cpu_init(void)
{
	int this_cpu = smp_processor_id();

	/* Mark the boot cpu "present", "online" etc for SMP and UP case */
	set_cpu_online(this_cpu, true);
	set_cpu_active(this_cpu, true);
	set_cpu_present(this_cpu, true);
	set_cpu_possible(this_cpu, true);

#ifdef CONFIG_SMP
	__boot_cpu_id = this_cpu;
#endif
}

smp_processor_id是一个宏，在smp的情况下获取cpu_id ，如果不是smp，那么返回0。

vfs_caches_init() 位于fs/dcache.c文件中，用来初始化基于内存的文件系统rootfs。

/*
 * Create the "names_cache" slab cache (PATH_MAX-byte objects) and then run
 * the VFS sub-initializers below, one after another.
 */
void __init vfs_caches_init(void)
{
	names_cachep = kmem_cache_create_usercopy("names_cache", PATH_MAX, 0,
			SLAB_HWCACHE_ALIGN|SLAB_PANIC, 0, PATH_MAX, NULL);

	dcache_init();
	inode_init();
	files_init();
	files_maxfiles_init();
	/* mnt_init() is where init_rootfs() and init_mount_tree() get called. */
	mnt_init();
	bdev_cache_init();
	chrdev_init();
}

vfs_caches_init() 内会调用 mnt_init() 方法，mnt_init() 方法位于 fs/namespace.c 中

/*
 * Set up the mount infrastructure: the "mnt_cache" slab cache, the mount
 * and mountpoint hash tables, kernfs, sysfs and the "fs" kobject, and
 * finally rootfs and the initial mount tree.
 *
 * A failed hash-table allocation panics; failures of sysfs_init() and of
 * creating the "fs" kobject are only logged as warnings.
 */
void __init mnt_init(void)
{
	int err;

	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

	mount_hashtable = alloc_large_system_hash("Mount-cache",
				sizeof(struct hlist_head),
				mhash_entries, 19,
				HASH_ZERO,
				&m_hash_shift, &m_hash_mask, 0, 0);
	mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
				sizeof(struct hlist_head),
				mphash_entries, 19,
				HASH_ZERO,
				&mp_hash_shift, &mp_hash_mask, 0, 0);

	/* Both tables are mandatory; give up if either is missing. */
	if (!mount_hashtable || !mountpoint_hashtable)
		panic("Failed to allocate mount hash table\n");

	kernfs_init();

	/* A sysfs_init() error is reported but does not stop initialization. */
	err = sysfs_init();
	if (err)
		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
			__func__, err);
	fs_kobj = kobject_create_and_add("fs", NULL);
	if (!fs_kobj)
		printk(KERN_WARNING "%s: kobj create error\n", __func__);
	/* Set up rootfs, then build the initial mount tree. */
	init_rootfs();
	init_mount_tree();
}

mnt_init() 调用了 init_rootfs() 方法，此方法会在 VFS 中注册一种类型为 struct file_system_type 的文件系统 rootfs_fs_type。VFS（Virtual File System，虚拟文件系统）是内核为各种具体文件系统提供的统一抽象层，对上层提供一致的文件操作接口。

此时0号进程初始化完成了

初始化第二个进程

start_kernel() 方法中的注释我们发现，有一个名叫 arch_call_rest_init() 的方法来做其余初始化的事，实际调用的是rest_init()方法

/*
 * Finish boot from the initial task: spawn kernel_init first (so it gets
 * pid 1), then kthreadd, record kthreadd's task in kthreadd_task, switch
 * system_state to SYSTEM_SCHEDULING, signal kthreadd_done, and finally
 * schedule once and enter cpu_startup_entry().
 */
noinline void __ref rest_init(void)
{
	struct task_struct *tsk;
	int pid;

	rcu_scheduler_starting();
	/*
	 * We need to spawn init first so that it obtains pid 1, however
	 * the init task will end up wanting to create kthreads, which, if
	 * we schedule it before we create kthreadd, will OOPS.
	 */
	pid = kernel_thread(kernel_init, NULL, CLONE_FS);
	/*
	 * Pin init on the boot CPU. Task migration is not properly working
	 * until sched_init_smp() has been run. It will set the allowed
	 * CPUs for init to the non isolated CPUs.
	 */
	rcu_read_lock();
	tsk = find_task_by_pid_ns(pid, &init_pid_ns);
	set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
	rcu_read_unlock();

	numa_default_policy();
	/* Second thread: kthreadd; its task is looked up by pid and saved. */
	pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
	rcu_read_lock();
	kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
	rcu_read_unlock();

	/*
	 * Enable might_sleep() and smp_processor_id() checks.
	 * They cannot be enabled earlier because with CONFIG_PREEMPT=y
	 * kernel_thread() would trigger might_sleep() splats. With
	 * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled
	 * already, but it's stuck on the kthreadd_done completion.
	 */
	system_state = SYSTEM_SCHEDULING;

	/* Release whoever is waiting on kthreadd_done (see comment above). */
	complete(&kthreadd_done);

	/*
	 * The boot idle thread must execute schedule()
	 * at least once to get things moving:
	 */
	schedule_preempt_disabled();
	/* Call into cpu_idle with preempt disabled */
	cpu_startup_entry(CPUHP_ONLINE);
}

发现第12行 pid = kernel_thread(kernel_init, NULL, CLONE_FS); 会创建一个进程，我们称之为1号进程。1号进程最终会切换到用户态运行，是所有用户态进程的祖先进程；当此进程进入用户态后，会开枝散叶创建出很多子进程，形成一棵进程树。

从内核态到用户态

kernel_thread() 创建的新进程会以传入的 kernel_init 函数作为入口开始执行

/*
 * Abridged excerpt of kernel_init(); each "..." marks omitted code.
 *
 * Runs kernel_init_freeable(), waits for async init code, frees init
 * memory, then tries ramdisk_execute_command (if set) and after that the
 * fallback paths below. Returns 0 as soon as one of the
 * try_to_run_init_process() calls returns 0.
 */
static int __ref kernel_init(void *unused)
{
	int ret;

	kernel_init_freeable();
	/* need to finish all async __init code before freeing the memory */
	async_synchronize_full();
	ftrace_free_init_mem();
	free_initmem();
	mark_readonly();
  ...
  	if (ramdisk_execute_command) {
		ret = run_init_process(ramdisk_execute_command);
		...
	}

	...
	/* Fallback candidates, tried left to right until one returns 0. */
	if (!try_to_run_init_process("/sbin/init") ||
	    !try_to_run_init_process("/etc/init") ||
	    !try_to_run_init_process("/bin/init") ||
	    !try_to_run_init_process("/bin/sh"))
		return 0;
  ...
}
///执行ramdisk
/*
 * Abridged excerpt of kernel_init_freeable(); "..." marks omitted code.
 * The part shown defaults ramdisk_execute_command to "/init" when it has
 * not been set.
 */
static noinline void __init kernel_init_freeable(void)
{
  ...
  if (!ramdisk_execute_command)
		ramdisk_execute_command = "/init";
  
}

///执行系统调用
/*
 * Put @init_filename into argv_init[0], log which program is being started,
 * and exec it through do_execve() with argv_init and envp_init.
 *
 * Returns the value returned by do_execve().
 */
static int run_init_process(const char *init_filename)
{
	int rc;

	argv_init[0] = init_filename;
	pr_info("Run %s as init process\n", init_filename);

	rc = do_execve(getname_kernel(init_filename),
		       (const char __user *const __user *)argv_init,
		       (const char __user *const __user *)envp_init);
	return rc;
}

从以上代码可以看出，1号进程运行的是一个文件。run_init_process() 中执行了 do_execve()。kernel_init() 会先尝试运行 ramdisk 上的 "/init"，然后依次尝试普通文件系统上的 "/sbin/init"、"/etc/init"、"/bin/init"、"/bin/sh"。不同版本的 Linux 会选择不同的文件启动，但是只要有一个起来了就可以了。

那么具体的执行流程是什么？

//调用do_execve方法
/*
 * Wrap the native @__argv and @__envp pointers in struct user_arg_ptr and
 * hand off to do_execveat_common() with AT_FDCWD and no flags.
 *
 * Returns the value returned by do_execveat_common().
 */
int do_execve(struct filename *filename,
	const char __user *const __user *__argv,
	const char __user *const __user *__envp)
{
	struct user_arg_ptr arg_vec = { .ptr.native = __argv };
	struct user_arg_ptr env_vec = { .ptr.native = __envp };

	return do_execveat_common(AT_FDCWD, filename, arg_vec, env_vec, 0);
}
//调用do_execveat_common
/*
 * Forward all arguments unchanged to __do_execve_file(), passing NULL for
 * its trailing file argument, and return its result.
 */
static int do_execveat_common(int fd, struct filename *filename,
			      struct user_arg_ptr argv,
			      struct user_arg_ptr envp,
			      int flags)
{
	return __do_execve_file(fd, filename, argv, envp, flags, NULL);
}
//__do_execve_file 方法中，我们会发现调用了 exec_binprm() 来执行二进制文件
/*
 * Abridged excerpt of __do_execve_file(); "..." marks omitted code.
 * The part shown stores the result of exec_binprm(bprm) in retval.
 */
static int __do_execve_file(int fd, struct filename *filename,
			    struct user_arg_ptr argv,
			    struct user_arg_ptr envp,
			    int flags, struct file *file)
{
  ...
  retval = exec_binprm(bprm);
  ...
}
//调用exec_binprm,此时又调用了search_binary_handler方法
/*
 * Run search_binary_handler() on @bprm. If it returns a non-negative
 * value, emit the audit, trace, ptrace and proc-connector notifications
 * for the exec.
 *
 * Returns the value returned by search_binary_handler().
 */
static int exec_binprm(struct linux_binprm *bprm)
{
	pid_t pid_before, vpid_before;
	int rc;

	/* Need to fetch pid before load_binary changes it */
	pid_before = current->pid;
	rcu_read_lock();
	vpid_before = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
	rcu_read_unlock();

	rc = search_binary_handler(bprm);
	if (rc < 0)
		return rc;

	/* Non-negative result: report the exec to the interested subsystems. */
	audit_bprm(bprm);
	trace_sched_process_exec(current, pid_before, bprm);
	ptrace_event(PTRACE_EVENT_EXEC, vpid_before);
	proc_exec_connector(current);

	return rc;
}

//查看search_binary_handler,我们会发现有一个名为linux_binfmt的结构体
/*
 * Abridged excerpt of search_binary_handler(); "..." marks omitted code.
 * The part shown calls the load_binary callback of a struct linux_binfmt
 * on @bprm and returns retval.
 *
 * NOTE(review): the lone closing brace before the return presumably ends a
 * block whose opening is in the omitted code, so the excerpt is not
 * brace-balanced as written.
 */
int search_binary_handler(struct linux_binprm *bprm)
{
	bool need_retry = IS_ENABLED(CONFIG_MODULES);
	struct linux_binfmt *fmt;
	int retval;
  ...
	retval = fmt->load_binary(bprm);
	...
	}
	return retval;
}
//结构体linux_binfmt，描述一种二进制文件格式的处理方式（load_binary 等函数指针）；下面的 elf_format 就是 ELF 格式对应的实例
/*
 * One binary-format handler: list linkage, a module pointer, and the
 * callbacks for loading a binary, loading a shared library and dumping
 * core.
 */
struct linux_binfmt {
	/* List linkage; presumably the list of registered formats — confirm. */
	struct list_head lh;
	/* Module pointer; elf_format sets this to THIS_MODULE. */
	struct module *module;
	/* Invoked by search_binary_handler() as fmt->load_binary(bprm). */
	int (*load_binary)(struct linux_binprm *);
	/* Shared-library loader callback (load_elf_library for ELF). */
	int (*load_shlib)(struct file *);
	/* Core-dump callback (elf_core_dump for ELF). */
	int (*core_dump)(struct coredump_params *cprm);
	unsigned long min_coredump;	/* minimal dump size */
} __randomize_layout;

/*
 * The linux_binfmt instance for ELF: wires the load, shared-library and
 * core-dump callbacks to the ELF implementations and sets the minimal dump
 * size to ELF_EXEC_PAGESIZE.
 */
static struct linux_binfmt elf_format = {
	.module		= THIS_MODULE,
	.load_binary	= load_elf_binary,
	.load_shlib	= load_elf_library,
	.core_dump	= elf_core_dump,
	.min_coredump	= ELF_EXEC_PAGESIZE,
};

//load_elf_binary 加载elf文件
/*
 * Abridged excerpt of load_elf_binary(); "..." marks omitted code.
 * The part shown calls start_thread() with regs, elf_entry as the new
 * instruction pointer and bprm->p as the new stack pointer.
 */
static int load_elf_binary(struct linux_binprm *bprm)
{
  ...
  start_thread(regs, elf_entry, bprm->p);
  ...
}
//打开位于/arch/x86/kernel/process_32.c文件
/*
 * Fill in @regs for starting a new program: gs via set_user_gs(regs, 0),
 * fs = 0, ds/es/ss = __USER_DS, cs = __USER_CS, ip = @new_ip,
 * sp = @new_sp and flags = X86_EFLAGS_IF, then call force_iret().
 *
 * NOTE(review): presumably @regs holds the values restored when the task
 * returns to user mode — confirm against the x86 entry code.
 */
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	set_user_gs(regs, 0);
	regs->fs		= 0;
	/* Data and stack segments all use the user data selector. */
	regs->ds		= __USER_DS;
	regs->es		= __USER_DS;
	regs->ss		= __USER_DS;
	/* Code segment uses the user code selector. */
	regs->cs		= __USER_CS;
	/* Caller-supplied instruction and stack pointers. */
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	regs->flags		= X86_EFLAGS_IF;
	force_iret();
}
EXPORT_SYMBOL_GPL(start_thread);

Tip:

ELF(Executable and Linkable Format，可执行与可链接格式)是Linux中的常用格式

最后我们会发现 start_thread 把段寄存器都设置成了 __USER_XX，也就是将代码段寄存器 CS 设置成了 __USER_CS，将数据段寄存器 DS 设置成了 __USER_DS，指令指针寄存器 IP 和栈指针寄存器 SP 也分别被设置为新程序的入口地址 new_ip 和新的栈顶 new_sp。即此函数的作用是改写 pt_regs 中保存的寄存器值，使得返回用户态时从新程序的入口开始执行。

force_iret() 强制内核通过 iret 指令从系统调用返回，这样上面设置的寄存器才会全部恢复生效，即下一条指令就从用户态开始运行了。

进入到用户态之后就需要运行 ramdisk 上的 /init 来加载存储设备的驱动，有了驱动就可以设置真正的根文件系统了。接下来 ramdisk 上的 /init 会启动真正根文件系统上的 init。

初始化2号进程

当1号进程进入用户态之后，也就有了所有用户态进程的祖先进程。接下来还需要一个内核态的进程来统一管理内核态的线程，这个进程我们称之为2号进程。

在main.c 文件中

/*
 * Abridged excerpt of rest_init(); "..." marks omitted code.
 * Shows the two kernel_thread() calls: first kernel_init, then kthreadd.
 */
noinline void __ref rest_init(void)
{
   ...
  pid = kernel_thread(kernel_init, NULL, CLONE_FS);
  ...
  pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
}

上述代码中，我们知道 kernel_thread() 是用来创建进程的，那么以 kthreadd 为入口函数的第二次 kernel_thread() 调用，创建的就是2号进程了。

kthreadd 负责所有内核态线程的创建和管理，是所有内核线程的祖先。
Ok，启动流程大体上就是这样了。

NeilLiu200

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
linux启动流程(二)

Linux 启动流程-内核初始化想要了解内核的启动过程，就需要查看linux源码了(此文是基于linux 5.2)。查看源码的过程中我们发现，内核的启动是start_kernel()函数，它是位于init包下main.c文件中的方法。我们在此方法里发现很多xxx_init的方法，也就是做一些初始化操作。asmlinkage __visible void __init start_kernel...
复制链接

扫一扫