说明:文档代码来自linux 6.1.20内核
0号进程如何创建
0 号进程是静态定义出来的,是所有进程的祖先,所以,是唯一不是fork出来的进程
总体调用流程如下:
start_kernel() -> arch_call_rest_init() -> rest_init() -> cpu_startup_entry() -> do_idle()
init_task代码如下
这里就是静态定义的位置
< init/init_task.c >
struct task_struct init_task
#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK
__init_task_data
#endif
__aligned(L1_CACHE_BYTES)
= {
#ifdef CONFIG_THREAD_INFO_IN_TASK
.thread_info = INIT_THREAD_INFO(init_task),
.stack_refcount = REFCOUNT_INIT(1),
#endif
.__state = 0,
.stack = init_stack,
.usage = REFCOUNT_INIT(2),
.flags = PF_KTHREAD,
.prio = MAX_PRIO - 20,
.static_prio = MAX_PRIO - 20,
.normal_prio = MAX_PRIO - 20,
.policy = SCHED_NORMAL,
.cpus_ptr = &init_task.cpus_mask,
.user_cpus_ptr = NULL,
.cpus_mask = CPU_MASK_ALL,
.nr_cpus_allowed= NR_CPUS,
.mm = NULL,
.active_mm = &init_mm,
.restart_block = {
.fn = do_no_restart_syscall,
},
.se = {
.group_node = LIST_HEAD_INIT(init_task.se.group_node),
},
.rt = {
.run_list = LIST_HEAD_INIT(init_task.rt.run_list),
.time_slice = RR_TIMESLICE,
},
.tasks = LIST_HEAD_INIT(init_task.tasks),
#ifdef CONFIG_SMP
.pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO),
#endif
#ifdef CONFIG_CGROUP_SCHED
.sched_task_group = &root_task_group,
#endif
.ptraced = LIST_HEAD_INIT(init_task.ptraced),
.ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry),
.real_parent = &init_task,
.parent = &init_task,
.children = LIST_HEAD_INIT(init_task.children),
.sibling = LIST_HEAD_INIT(init_task.sibling),
.group_leader = &init_task,
RCU_POINTER_INITIALIZER(real_cred, &init_cred),
RCU_POINTER_INITIALIZER(cred, &init_cred),
.comm = INIT_TASK_COMM,
.thread = INIT_THREAD,
.fs = &init_fs,
.files = &init_files,
#ifdef CONFIG_IO_URING
.io_uring = NULL,
#endif
.signal = &init_signals,
.sighand = &init_sighand,
.nsproxy = &init_nsproxy,
.pending = {
.list = LIST_HEAD_INIT(init_task.pending.list),
.signal = {{0}}
},
.blocked = {{0}},
.alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock),
.journal_info = NULL,
INIT_CPU_TIMERS(init_task)
.pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
.timer_slack_ns = 50000, /* 50 usec default slack */
.thread_pid = &init_struct_pid,
.thread_group = LIST_HEAD_INIT(init_task.thread_group),
.thread_node = LIST_HEAD_INIT(init_signals.thread_head),
#ifdef CONFIG_AUDIT
.loginuid = INVALID_UID,
.sessionid = AUDIT_SID_UNSET,
#endif
#ifdef CONFIG_PERF_EVENTS
.perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex),
.perf_event_list = LIST_HEAD_INIT(init_task.perf_event_list),
#endif
#ifdef CONFIG_PREEMPT_RCU
.rcu_read_lock_nesting = 0,
.rcu_read_unlock_special.s = 0,
.rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry),
.rcu_blocked_node = NULL,
#endif
#ifdef CONFIG_TASKS_RCU
.rcu_tasks_holdout = false,
.rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list),
.rcu_tasks_idle_cpu = -1,
#endif
#ifdef CONFIG_TASKS_TRACE_RCU
.trc_reader_nesting = 0,
.trc_reader_special.s = 0,
.trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list),
.trc_blkd_node = LIST_HEAD_INIT(init_task.trc_blkd_node),
#endif
#ifdef CONFIG_CPUSETS
.mems_allowed_seq = SEQCNT_SPINLOCK_ZERO(init_task.mems_allowed_seq,
&init_task.alloc_lock),
#endif
#ifdef CONFIG_RT_MUTEXES
.pi_waiters = RB_ROOT_CACHED,
.pi_top_task = NULL,
#endif
INIT_PREV_CPUTIME(init_task)
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
.vtime.seqcount = SEQCNT_ZERO(init_task.vtime_seqcount),
.vtime.starttime = 0,
.vtime.state = VTIME_SYS,
#endif
#ifdef CONFIG_NUMA_BALANCING
.numa_preferred_nid = NUMA_NO_NODE,
.numa_group = NULL,
.numa_faults = NULL,
#endif
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
.kasan_depth = 1,
#endif
#ifdef CONFIG_KCSAN
.kcsan_ctx = {
.scoped_accesses = {LIST_POISON1, NULL},
},
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
.softirqs_enabled = 1,
#endif
#ifdef CONFIG_LOCKDEP
.lockdep_depth = 0, /* no locks held yet */
.curr_chain_key = INITIAL_CHAIN_KEY,
.lockdep_recursion = 0,
#endif
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
.ret_stack = NULL,
.tracing_graph_pause = ATOMIC_INIT(0),
#endif
#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPTION)
.trace_recursion = 0,
#endif
#ifdef CONFIG_LIVEPATCH
.patch_state = KLP_UNDEFINED,
#endif
#ifdef CONFIG_SECURITY
.security = NULL,
#endif
#ifdef CONFIG_SECCOMP_FILTER
.seccomp = { .filter_count = ATOMIC_INIT(0) },
#endif
};
EXPORT_SYMBOL(init_task);
start_kernel()
< init/main.c >
asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
{
char *command_line;
char *after_dashes;
/* 设置栈魔数,魔数的目的是为了进行栈溢出检测 */
set_task_stack_end_magic(&init_task);
smp_setup_processor_id();
sched_init();
arch_call_rest_init();
}
arch_call_rest_init()
< init/main.c >
void __init __weak arch_call_rest_init(void)
{
rest_init();
}
rest_init();
< init/main.c >
noinline void __ref rest_init(void)
{
struct task_struct *tsk;
int pid;
rcu_scheduler_starting();
/*
* We need to spawn init first so that it obtains pid 1, however
* the init task will end up wanting to create kthreads, which, if
* we schedule it before we create kthreadd, will OOPS.
*/
pid = user_mode_thread(kernel_init, NULL, CLONE_FS); /* 1号进程的创建 */
/*
* Pin init on the boot CPU. Task migration is not properly working
* until sched_init_smp() has been run. It will set the allowed
* CPUs for init to the non isolated CPUs.
*/
rcu_read_lock();
tsk = find_task_by_pid_ns(pid, &init_pid_ns);
tsk->flags |= PF_NO_SETAFFINITY;
set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
rcu_read_unlock();
numa_default_policy();
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); /* 2号进程的创建 */
rcu_read_lock();
kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
rcu_read_unlock();
/*
* Enable might_sleep() and smp_processor_id() checks.
* They cannot be enabled earlier because with CONFIG_PREEMPTION=y
* kernel_thread() would trigger might_sleep() splats. With
* CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled
* already, but it's stuck on the kthreadd_done completion.
*/
system_state = SYSTEM_SCHEDULING;
complete(&kthreadd_done);
/*
* The boot idle thread must execute schedule()
* at least once to get things moving:
*/
schedule_preempt_disabled();
/* Call into cpu_idle with preempt disabled */
cpu_startup_entry(CPUHP_ONLINE);
}
cpu_startup_entry()
< sched/idle.c >
void cpu_startup_entry(enum cpuhp_state state)
{
arch_cpu_idle_prepare();
cpuhp_online_idle(state);
while (1)
do_idle();
}
0号进程在忙什么
从cpu_startup_entry()可以看出,0号进程就是一直死循环在do_idle(),当系统中没有进程可选中被调度,就选择 0 号进程运行,这就是IDLE进程
do_idle()
< sched/idle.c >
static void do_idle(void)
{
int cpu = smp_processor_id();
/*
* Check if we need to update blocked load
*/
nohz_run_idle_balance(cpu);
/*
* If the arch has a polling bit, we maintain an invariant:
*
* Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
* rq->idle). This means that, if rq->idle has the polling bit set,
* then setting need_resched is guaranteed to cause the CPU to
* reschedule.
*/
__current_set_polling();
tick_nohz_idle_enter();
/* 如果不需要调度,就一直在这循环 */
while (!need_resched()) {
rmb();
local_irq_disable();
if (cpu_is_offline(cpu)) {
tick_nohz_idle_stop_tick();
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
arch_cpu_idle_enter();
rcu_nocb_flush_deferred_wakeup();
/*
* In poll mode we reenable interrupts and spin. Also if we
* detected in the wakeup from idle path that the tick
* broadcast device expired for us, we don't want to go deep
* idle as we know that the IPI is going to arrive right away.
*/
/* 判断是否要进入更深层的休眠 */
if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
tick_nohz_idle_restart_tick();
cpu_idle_poll();
} else {
/* 进入更深层的idle */
cpuidle_idle_call();
}
arch_cpu_idle_exit();
}
/*
* Since we fell out of the loop above, we know TIF_NEED_RESCHED must
* be set, propagate it into PREEMPT_NEED_RESCHED.
*
* This is required because for polling idle loops we will not have had
* an IPI to fold the state for us.
*/
preempt_set_need_resched();
tick_nohz_idle_exit();
__current_clr_polling();
/*
* We promise to call sched_ttwu_pending() and reschedule if
* need_resched() is set while polling is set. That means that clearing
* polling needs to be visible before doing these things.
*/
smp_mb__after_atomic();
/*
* RCU relies on this call to be done outside of an RCU read-side
* critical section.
*/
/* 会处理其它ipi中断发过来的请求和触发软中断 */
flush_smp_call_function_queue();
/* 任务切换,切换到要运行的其他任务 */
schedule_idle();
if (unlikely(klp_patch_pending(current)))
klp_update_patch_state(current);
}
其余CPU IDLE进程如何创建
前门的分析看到的是第一颗CPU一直在运行idle进程,那么其他CPU的idle怎么创建
之前看到过,0号进程创建了1号进程,1号进程就是运行了kernel_init()
总体流程如下:
start_kernel() -> arch_call_rest_init() -> rest_init() -> kernel_init() -> kernel_init_freeable() -> smp_init() ->
idle_threads_init() -> idle_init()
kernel_init()
< init/main.c >
static int __ref kernel_init(void *unused)
{
int ret;
/*
* Wait until kthreadd is all set-up.
*/
wait_for_completion(&kthreadd_done); /* 1号进程的运行要等待2号进程创建完成 */
kernel_init_freeable(); /* 这里就是唤醒其他CPU的入口 */
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();
system_state = SYSTEM_FREEING_INITMEM;
kprobe_free_init_mem();
ftrace_free_init_mem();
kgdb_free_init_mem();
exit_boot_config();
free_initmem();
mark_readonly();
/*
* Kernel mappings are now finalized - update the userspace page-table
* to finalize PTI.
*/
pti_finalize();
system_state = SYSTEM_RUNNING;
numa_default_policy();
rcu_end_inkernel_boot();
do_sysctl_args();
if (ramdisk_execute_command) {
ret = run_init_process(ramdisk_execute_command);
if (!ret)
return 0;
pr_err("Failed to execute %s (error %d)\n",
ramdisk_execute_command, ret);
}
/*
* We try each of these until one succeeds.
*
* The Bourne shell can be used instead of init if we are
* trying to recover a really broken machine.
*/
if (execute_command) {
ret = run_init_process(execute_command);
if (!ret)
return 0;
panic("Requested init %s failed (error %d).",
execute_command, ret);
}
if (CONFIG_DEFAULT_INIT[0] != '\0') {
ret = run_init_process(CONFIG_DEFAULT_INIT);
if (ret)
pr_err("Default init %s failed (error %d)\n",
CONFIG_DEFAULT_INIT, ret);
else
return 0;
}
if (!try_to_run_init_process("/sbin/init") ||
!try_to_run_init_process("/etc/init") ||
!try_to_run_init_process("/bin/init") ||
!try_to_run_init_process("/bin/sh"))
return 0;
panic("No working init found. Try passing init= option to kernel. "
"See Linux Documentation/admin-guide/init.rst for guidance.");
}
kernel_init_freeable()
< init/main.c >
static noinline void __init kernel_init_freeable(void)
{
/* Now the scheduler is fully set up and can do blocking allocations */
gfp_allowed_mask = __GFP_BITS_MASK;
/*
* init can allocate pages on any node
*/
set_mems_allowed(node_states[N_MEMORY]);
cad_pid = get_pid(task_pid(current));
smp_prepare_cpus(setup_max_cpus);
workqueue_init();
init_mm_internals();
rcu_init_tasks_generic();
do_pre_smp_initcalls();
lockup_detector_init();
smp_init(); /* 这里初始化其他CPU */
sched_init_smp();
padata_init();
page_alloc_init_late();
/* Initialize page ext after all struct pages are initialized. */
if (!early_page_ext_enabled())
page_ext_init();
do_basic_setup();
kunit_run_all_tests();
wait_for_initramfs();
console_on_rootfs();
/*
* check if there is an early userspace init. If yes, let it do all
* the work
*/
if (init_eaccess(ramdisk_execute_command) != 0) {
ramdisk_execute_command = NULL;
prepare_namespace();
}
/*
* Ok, we have completed the initial bootup, and
* we're essentially up and running. Get rid of the
* initmem segments and start the user-mode stuff..
*
* rootfs is available now, try loading the public keys
* and default modules
*/
integrity_load_keys();
}
smp_init()
< kernel/smp.c >
void __init smp_init(void)
{
int num_nodes, num_cpus;
idle_threads_init(); /* 复制出IDLE进程 */
cpuhp_threads_init();
pr_info("Bringing up secondary CPUs ...\n");
bringup_nonboot_cpus(setup_max_cpus);
num_nodes = num_online_nodes();
num_cpus = num_online_cpus();
pr_info("Brought up %d node%s, %d CPU%s\n",
num_nodes, (num_nodes > 1 ? "s" : ""),
num_cpus, (num_cpus > 1 ? "s" : ""));
/* Any cleanup work */
smp_cpus_done(setup_max_cpus);
}
idle_threads_init()
< kernel/smpboot.c >
void __init idle_threads_init(void)
{
unsigned int cpu, boot_cpu;
boot_cpu = smp_processor_id();
for_each_possible_cpu(cpu) { /* 有多少cpu就创建多少idle 进程 */
if (cpu != boot_cpu) /* 排除掉启动CPU */
idle_init(cpu);
}
}
idle_init()
< kernel/smpboot.c >
static __always_inline void idle_init(unsigned int cpu)
{
struct task_struct *tsk = per_cpu(idle_threads, cpu);
if (!tsk) {
tsk = fork_idle(cpu); /* fork出 idle 进程 */
if (IS_ERR(tsk))
pr_err("SMP: fork_idle() failed for CPU %u\n", cpu);
else
per_cpu(idle_threads, cpu) = tsk; /* 通过per_cpu绑定 */
}
}
IDLE进程初始化后,开始加载其余CPU,入口函数分析,以ARM64为例
加载其余CPU
secondary_start_kernel() - >cpu_startup_entry() -> do_idle()
secondary_start_kernel() 分析
< arch/arm64/kernel/smp.c >
asmlinkage notrace void secondary_start_kernel(void)
{
u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK;
struct mm_struct *mm = &init_mm;
const struct cpu_operations *ops;
unsigned int cpu = smp_processor_id();
/*
* All kernel threads share the same mm context; grab a
* reference and switch to it.
*/
mmgrab(mm);
current->active_mm = mm;
/*
* TTBR0 is only used for the identity mapping at this stage. Make it
* point to zero page to avoid speculatively fetching new entries.
*/
cpu_uninstall_idmap();
if (system_uses_irq_prio_masking())
init_gic_priority_masking();
rcu_cpu_starting(cpu);
trace_hardirqs_off();
/*
* If the system has established the capabilities, make sure
* this CPU ticks all of those. If it doesn't, the CPU will
* fail to come online.
*/
check_local_cpu_capabilities();
ops = get_cpu_ops(cpu);
if (ops->cpu_postboot)
ops->cpu_postboot();
/*
* Log the CPU info before it is marked online and might get read.
*/
cpuinfo_store_cpu();
store_cpu_topology(cpu);
/*
* Enable GIC and timers.
*/
notify_cpu_starting(cpu);
ipi_setup(cpu);
numa_add_cpu(cpu);
/*
* OK, now it's safe to let the boot CPU continue. Wait for
* the CPU migration code to notice that the CPU is online
* before we continue.
*/
pr_info("CPU%u: Booted secondary processor 0x%010lx [0x%08x]\n",
cpu, (unsigned long)mpidr,
read_cpuid_id());
update_cpu_boot_status(CPU_BOOT_SUCCESS);
set_cpu_online(cpu, true);
complete(&cpu_running);
local_daif_restore(DAIF_PROCCTX);
/*
* OK, it's off to the idle thread for us
*/
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); /* 运行idle进程的入口 */
}
cpu_startup_entry()上面看过了,我们再回顾一遍,就是个do_idle()死循环
cpu_startup_entry()
< kernel/sched/idle.c >
void cpu_startup_entry(enum cpuhp_state state)
{
arch_cpu_idle_prepare();
cpuhp_online_idle(state);
while (1)
do_idle();
}