Systemd 1-poweroff命令背后的技术原理（一）-CSDN博客

前言

poweroff 命令用于关闭 Linux 系统的电源。它不仅停止所有正在运行的进程，还会卸载文件系统，最终关闭计算机的电源。poweroff 通常需要超级用户权限（即 root 权限）才能执行。

基本用法

要关闭系统电源，可以执行以下命令：

sudo poweroff

或者，如果你已经是 root 用户，可以直接运行：

poweroff

`poweroff` 与 `systemctl`

在许多现代 Linux 发行版中（如 Ubuntu 22.04），poweroff 实际上是一个指向 systemctl 的软链接。这意味着当你运行 poweroff 时，实际上是在调用 systemctl poweroff。此处以笔者的 x86-64 架构的 Ubuntu 22.04 系统为例：

root:~# which poweroff
/usr/sbin/poweroff
root:~# ls -ln /usr/sbin/poweroff
lrwxrwxrwx 1 0 0 14 11月 22  2023 /usr/sbin/poweroff -> /bin/systemctl

命令选项

poweroff 命令有一些选项，可以用于调整其行为：

root:~# poweroff --help
poweroff [OPTIONS...]

Power off the system.

Options:
     --help      Show this help
     --halt      Halt the machine
  -p --poweroff  Switch off the machine
     --reboot    Reboot the machine
  -f --force     Force immediate halt/power-off/reboot
  -w --wtmp-only Don't halt/power-off/reboot, just write wtmp record
  -d --no-wtmp   Don't write wtmp record
     --no-wall   Don't send wall message before halt/power-off/reboot

See the halt(8) man page for details.

一、Systemd poweroff命令源码分析

systemd的关机流程非常复杂，一次讲解难以做到面面俱到，所以笔者准备拆分开来分析其实现。此次分析，笔者以如下命令（强制关机）分析其具体流程，下次再分析poweroff的正常关机流程。

poweroff -f

（一）用户态流程

systemctl程序的入口函数为run。poweroff命令在shell解析后，其命令行参数会作为systemctl的参数传入systemctl，然后在systemctl_dispatch_parse_argv函数内解析。

static int run(int argc, char *argv[]) {
        _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
        _cleanup_(umount_and_freep) char *mounted_dir = NULL;
        int r;

        setlocale(LC_ALL, "");
        log_setup();

        /* The journal merging logic potentially needs a lot of fds. */
        (void) rlimit_nofile_bump(HIGH_RLIMIT_NOFILE);

        sigbus_install();

        r = systemctl_dispatch_parse_argv(argc, argv);
        if (r <= 0)
                goto finish;

systemctl_dispatch_parse_argv函数通过invoked_as(argv, "poweroff")判断出程序是以poweroff这个名字被调用后，会将arg_action全局变量设置为ACTION_POWEROFF，表征关机指令，然后再调用halt_parse_argv函数解析命令行选项。

int systemctl_dispatch_parse_argv(int argc, char *argv[]) {
        assert(argc >= 0);
        assert(argv);

        if (invoked_as(argv, "halt")) {
                arg_action = ACTION_HALT;
                return halt_parse_argv(argc, argv);

        } else if (invoked_as(argv, "poweroff")) {
                arg_action = ACTION_POWEROFF;
                return halt_parse_argv(argc, argv);

halt_parse_argv函数在解析到‘-f’选项后，会将全局变量arg_force置为2；如果没有在调用命令的时候指定‘-f’，则该变量默认为0（此外，当utmp记录的runlevel为'0'或'6'时，该变量也会被置为2）。

int halt_parse_argv(int argc, char *argv[]) {
        enum {
                ARG_HELP = 0x100,
                ARG_HALT,
                ARG_REBOOT,
                ARG_NO_WALL
        };

        static const struct option options[] = {
                { "help",      no_argument,       NULL, ARG_HELP    },
                { "halt",      no_argument,       NULL, ARG_HALT    },
                { "poweroff",  no_argument,       NULL, 'p'         },
                { "reboot",    no_argument,       NULL, ARG_REBOOT  },
                { "force",     no_argument,       NULL, 'f'         },
                { "wtmp-only", no_argument,       NULL, 'w'         },
                { "no-wtmp",   no_argument,       NULL, 'd'         },
                { "no-sync",   no_argument,       NULL, 'n'         },
                { "no-wall",   no_argument,       NULL, ARG_NO_WALL },
                {}
        };

        int c, r, runlevel;

        assert(argc >= 0);
        assert(argv);

        /* called in sysvinit system as last command in shutdown/reboot so this is always forceful */
        if (utmp_get_runlevel(&runlevel, NULL) >= 0)
                if (IN_SET(runlevel, '0', '6'))
                        arg_force = 2;

        while ((c = getopt_long(argc, argv, "pfwdnih", options, NULL)) >= 0)
                switch (c) {

                case ARG_HELP:
                        return halt_help();

                case ARG_HALT:
                        arg_action = ACTION_HALT;
                        break;

                case 'p':
                        if (arg_action != ACTION_REBOOT)
                                arg_action = ACTION_POWEROFF;
                        break;

                case ARG_REBOOT:
                        arg_action = ACTION_REBOOT;
                        break;

                case 'f':
                        arg_force = 2;
                        break;

捋清楚这些后，继续回到run函数：systemctl_dispatch_parse_argv返回后，run函数会根据arg_action响应对应的操作，其中ACTION_POWEROFF的执行操作由halt_main函数实现。

switch (arg_action) {

        case ACTION_SYSTEMCTL:
                r = systemctl_main(argc, argv);
                break;

        /* Legacy command aliases set arg_action. They provide some fallbacks, e.g. to tell sysvinit to
         * reboot after you have installed systemd binaries. */

        case ACTION_HALT:
        case ACTION_POWEROFF:
        case ACTION_REBOOT:
        case ACTION_KEXEC:
                r = halt_main();
                break;
        /* ...（此处省略其他case分支）... */
        }

finish:
        release_busses();

        /* Note that we return r here, not 0, so that we can implement the LSB-like return codes */
        return r;
}

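systemctl-compat-halt.c内的halt_main函数在默认不指定‘-f’参数（arg_force为0）的情况下，会先尝试通过 logind 服务进行关机、重启等操作。如果 logind 返回成功，或者返回的错误是没有足够权限（-EACCES）、本机不支持该操作（-EOPNOTSUPP）、操作已在进行中（-EINPROGRESS），则直接返回该结果；只有遇到其他错误时，才会回退到低级别的操作（start_with_fallback）。回退时会显式开启非阻塞模式（arg_no_block = true），以尽量减小有无 logind 两种方式之间的行为差异。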

int halt_main(void) {
        int r;

        if (arg_force == 0) {
                /* always try logind first */
                if (arg_when > 0)
                        r = logind_schedule_shutdown(arg_action);
                else {
                        r = logind_check_inhibitors(arg_action);
                        if (r < 0)
                                return r;

                        r = logind_reboot(arg_action);
                }
                if (r >= 0)
                        return r;
                if (IN_SET(r, -EACCES, -EOPNOTSUPP, -EINPROGRESS))
                        /* Requested operation requires auth, is not supported on the local system or already in
                         * progress */
                        return r;
                /* on all other errors, try low-level operation */

                /* In order to minimize the difference between operation with and without logind, we explicitly
                 * enable non-blocking mode for this, as logind's shutdown operations are always non-blocking. */
                arg_no_block = true;

                if (!arg_dry_run)
                        return start_with_fallback();
        }

这段代码是整个systemd关机流程的核心，尤其是其中的logind_reboot函数，调用链极其复杂，笔者在本章内会略过，作为下一次的主要内容。最新的systemd支持指定时间执行关机等操作（--when选项），不指定的情况下arg_when默认为0，此处以不指定的情况分析。接下来继续分析halt_main函数的剩余部分。

" --when=TIME Schedule halt/power-off/reboot/kexec action after\n"

// check is root?
        if (geteuid() != 0) {
                (void) must_be_root();
                return -EPERM;
        }

        if (!arg_no_wtmp) {
                if (sd_booted() > 0)
                        log_debug("Not writing utmp record, assuming that systemd-update-utmp is used.");
                else {
                        r = utmp_put_shutdown();
                        if (r < 0)
                                log_warning_errno(r, "Failed to write utmp record: %m");
                }
        }

        if (arg_dry_run)
                return 0;

        r = halt_now(arg_action); 
        return log_error_errno(r, "Failed to %s: %m", action_table[arg_action].verb);

arg_dry_run变量用于判断是否只模拟执行操作、而不实际执行任何会改变系统状态的动作。重点来看halt_now函数在ACTION_POWEROFF时执行的操作：使用 reboot(RB_POWER_OFF) 执行系统关机，若调用失败则返回错误码（-errno）。

int halt_now(enum action a) {
        /* The kernel will automatically flush ATA disks and suchlike on reboot(), but the file systems need
         * to be synced explicitly in advance. */
        if (!arg_no_sync && !arg_dry_run)
                sync();

        /* Make sure C-A-D is handled by the kernel from this point on... */
        if (!arg_dry_run)
                (void) reboot(RB_ENABLE_CAD);

        switch (a) {

        case ACTION_HALT:
                if (!arg_quiet)
                        log_info("Halting.");
                if (arg_dry_run)
                        return 0;
                (void) reboot(RB_HALT_SYSTEM);
                return -errno;

        case ACTION_POWEROFF:
                if (!arg_quiet)
                        log_info("Powering off.");
                if (arg_dry_run)
                        return 0;
                (void) reboot(RB_POWER_OFF);
                return -errno;

到此处，整个调用流程陷入内核态reboot syscall。

（二）内核态流程

* 笔者分析的内核源码版本为5.19.17。

kernel/reboot.c

reboot syscall的源码如下，其中比较重要的函数为reboot_pid_ns，先来分析这个函数

SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
		void __user *, arg)
{
	struct pid_namespace *pid_ns = task_active_pid_ns(current);
	char buffer[256];
	int ret = 0;

	/* We only trust the superuser with rebooting the system. */
	if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
		return -EPERM;

	/* For safety, we require "magic" arguments. */
	if (magic1 != LINUX_REBOOT_MAGIC1 ||
			(magic2 != LINUX_REBOOT_MAGIC2 &&
			magic2 != LINUX_REBOOT_MAGIC2A &&
			magic2 != LINUX_REBOOT_MAGIC2B &&
			magic2 != LINUX_REBOOT_MAGIC2C))
		return -EINVAL;

	/*
	 * If pid namespaces are enabled and the current task is in a child
	 * pid_namespace, the command is handled by reboot_pid_ns() which will
	 * call do_exit().
	 */
	ret = reboot_pid_ns(pid_ns, cmd);
	if (ret)
		return ret;

内核函数reboot_pid_ns的作用是处理特定的命名空间（pid namespace）下的重启或关闭操作，pid命名空间是一种隔离机制，使得不同的命名空间可以有独立的进程号空间，是docker等容器的基础技术之一。以下是对代码进行分析和注释：

// reboot_pid_ns - handle a reboot/halt/power-off request issued inside a pid namespace.
// Parameters:
//   - pid_ns: the pid namespace the request applies to
//   - cmd:    the LINUX_REBOOT_CMD_* command being requested
// Returns:
//   - 0 when pid_ns is the initial pid namespace (nothing is done here)
//   - -EINVAL when cmd is not one of the commands handled below
//   - otherwise the function ends in do_exit(0); the trailing "return 0" is
//     presumably never reached (do_exit() is not expected to return)
int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
{
    // The initial (root) pid namespace is not handled here: return 0 without doing anything
    if (pid_ns == &init_pid_ns)
        return 0;

    // Record in pid_ns->reboot which signal number represents the requested command
    switch (cmd) {
    case LINUX_REBOOT_CMD_RESTART2:
    case LINUX_REBOOT_CMD_RESTART:
        // Restart requests are recorded as SIGHUP
        pid_ns->reboot = SIGHUP;
        break;

    case LINUX_REBOOT_CMD_POWER_OFF:
    case LINUX_REBOOT_CMD_HALT:
        // Power-off and halt requests are recorded as SIGINT
        pid_ns->reboot = SIGINT;
        break;
    default:
        // Any other command is rejected as an invalid argument
        return -EINVAL;
    }

    // Hold tasklist_lock for reading while signalling the child reaper
    read_lock(&tasklist_lock);
    // Send SIGKILL to the namespace's child_reaper task
    send_sig(SIGKILL, pid_ns->child_reaper, 1);
    // Drop the tasklist read lock
    read_unlock(&tasklist_lock);

    // The calling task itself exits here
    do_exit(0);

    return 0;
}

pid_ns->child_reaper是PID命名空间内的一号进程，在容器内就是容器的init进程，send_sig(SIGKILL, pid_ns->child_reaper, 1)这行代码的作用就是直接kill掉init进程。容器init进程退出时，内核的退出路径（do_exit）会调用exit_notify函数，exit_notify再调用forget_original_parent函数。

/*
 * Send signals to all our closest relatives so that they know
 * to properly mourn us..
 */
static void exit_notify(struct task_struct *tsk, int group_dead)
{
	bool autoreap;
	struct task_struct *p, *n;
	LIST_HEAD(dead);

	write_lock_irq(&tasklist_lock);
	forget_original_parent(tsk, &dead);

forget_original_parent函数用于处理进程的父子关系。当一个进程终止时，内核需要确保它留下的子进程（即孤儿进程）能够被正确地重新分配给一个新的父进程。通常，这个新父进程是所在PID命名空间中的 init 进程（PID 1），因为 init 进程是所有孤儿进程的最终收容者。

/*
 * This does two things:
 *
 * A.  Make init inherit all the child processes
 * B.  Check to see if any process groups have become orphaned
 *	as a result of our exiting, and if they have any stopped
 *	jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 */
static void forget_original_parent(struct task_struct *father,
					struct list_head *dead)
{
	struct task_struct *p, *t, *reaper;

	if (unlikely(!list_empty(&father->ptraced)))
		exit_ptrace(father, dead);

	/* Can drop and reacquire tasklist_lock */
	reaper = find_child_reaper(father, dead);

`find_child_reaper` 函数，用于找到指定 PID 命名空间中的“child reaper”进程，即在该命名空间中负责处理孤儿进程的进程。通常，这个进程是命名空间中的初始化进程（通常是 PID 为 1 的进程）。

/*
 * find_child_reaper - pick the task that takes over the children of the
 * exiting task @father.
 *
 * @father: the exiting task
 * @dead:   list of tasks (linked through ptrace_entry) that are released
 *          here if the namespace has to be torn down
 *
 * Returns the child reaper of @father's pid namespace, or @father itself
 * when @father was the reaper and has no other live thread to hand over to.
 * The sparse annotations below indicate that tasklist_lock may be dropped
 * and reacquired inside this function.
 */
static struct task_struct *find_child_reaper(struct task_struct *father,
						struct list_head *dead)
	__releases(&tasklist_lock)
	__acquires(&tasklist_lock)
{
	struct pid_namespace *pid_ns = task_active_pid_ns(father);
	struct task_struct *reaper = pid_ns->child_reaper;
	struct task_struct *p, *n;

	/* Common case: the exiting task is not the namespace's reaper. */
	if (likely(reaper != father))
		return reaper;

	/*
	 * The reaper itself is exiting: if find_alive_thread() yields a
	 * task, that task becomes the new child_reaper.
	 */
	reaper = find_alive_thread(father);
	if (reaper) {
		pid_ns->child_reaper = reaper;
		return reaper;
	}

	/*
	 * No replacement was found. Drop tasklist_lock before the cleanup
	 * below; it is taken again before returning.
	 */
	write_unlock_irq(&tasklist_lock);

	/* Unlink and release every task queued on the dead list. */
	list_for_each_entry_safe(p, n, dead, ptrace_entry) {
		list_del_init(&p->ptrace_entry);
		release_task(p);
	}

	/* Tear down the remaining processes of this pid namespace. */
	zap_pid_ns_processes(pid_ns);
	write_lock_irq(&tasklist_lock);

	return father;
}

zap_pid_ns_processes函数，用于清理指定 PID 命名空间 (pid_namespace) 中的所有进程。这个过程通常在命名空间的初始化进程终止时触发。以下是对该函数的详细注释和分析：

/*
 * zap_pid_ns_processes - kill and wait for the processes remaining in
 * @pid_ns. Runs in the context of the current task: it stops further pid
 * allocation, ignores SIGCHLD, sends SIGKILL to the tasks found in the
 * namespace's idr, reaps children, and then sleeps until only init_pids
 * pids remain allocated in the namespace.
 */
void zap_pid_ns_processes(struct pid_namespace *pid_ns)
{
    int nr;
    int rc;
    struct task_struct *task, *me = current;
    int init_pids = thread_group_leader(me) ? 1 : 2; // pid count expected to remain at the end: 1 if the current task is the thread-group leader, otherwise 2
    struct pid *pid;

    /* Do not allow any more processes into this pid namespace */
    disable_pid_allocation(pid_ns);

    /*
     * Ignore SIGCHLD so that all terminated children are reaped automatically.
     * This speeds up the namespace shutdown; see also the comment further below.
     */
    spin_lock_irq(&me->sighand->siglock);
    me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN; // set the SIGCHLD handler to SIG_IGN (ignore)
    spin_unlock_irq(&me->sighand->siglock);

    /*
     * The last thread in the cgroup-init thread group is terminating.
     * Find the remaining pid_ts in the namespace, signal them and wait
     * for them to exit.
     *
     * Note: this signals every thread in the namespace, even those that
     * belong to the same thread group. Avoiding that would require walking
     * the entire task list looking for processes in this namespace, which
     * could be needlessly expensive when the pid namespace holds only a few
     * processes; alternatively a per-pid-namespace task list would have to
     * be maintained.
     */
    rcu_read_lock();
    read_lock(&tasklist_lock);
    nr = 2; // the idr walk starts at id 2 (presumably skipping pid 1, the namespace init)
    idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
        task = pid_task(pid, PIDTYPE_PID); // look up the task_struct for this pid
        if (task && !__fatal_signal_pending(task)) // the task exists and has no fatal signal pending yet
            group_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_MAX); // send SIGKILL to terminate it
    }
    read_unlock(&tasklist_lock);
    rcu_read_unlock();

    /*
     * Reap the children that became EXIT_ZOMBIE before SIGCHLD was ignored.
     * kernel_wait4() also blocks until our children that are traced from the
     * parent namespace are detached and become EXIT_DEAD.
     */
    do {
        clear_thread_flag(TIF_SIGPENDING); // clear the current thread's pending-signal flag
        rc = kernel_wait4(-1, NULL, __WALL, NULL); // wait for a child state change
    } while (rc != -ECHILD); // loop until there are no children left to wait for

    /*
     * kernel_wait4() misses EXIT_DEAD children, as well as EXIT_ZOMBIE
     * processes whose parents are not in this pid namespace. Such processes
     * can be created with setns()+fork().
     *
     * If those EXIT_ZOMBIE processes are not reaped before their parents
     * exit, they are reparented to pid_ns->child_reaper. Therefore
     * pid_ns->child_reaper has to stay valid until all of them are gone.
     *
     * The code relies on pid_ns->child_reaper ignoring SIGCHLD so that
     * these reparented EXIT_ZOMBIE processes are reaped automatically.
     *
     * Semantically it also makes sense to wait for the EXIT_ZOMBIE processes
     * before the child_reaper may be reaped, because this guarantees that
     * once the pid namespace's init process is reaped, every process in the
     * namespace is already gone.
     *
     * Once all the other processes in the pid_namespace are gone,
     * free_pid() will wake up this task.
     */
    for (;;) {
        set_current_state(TASK_INTERRUPTIBLE); // mark the task TASK_INTERRUPTIBLE before checking, so it can be woken up
        if (pid_ns->pid_allocated == init_pids) // only the expected init pid(s) remain allocated: done
            break;
        schedule(); // yield the CPU until woken up
    }
    __set_current_state(TASK_RUNNING); // back to TASK_RUNNING

    if (pid_ns->reboot)
        current->signal->group_exit_code = pid_ns->reboot; // use the value recorded in pid_ns->reboot as this thread group's exit code

    acct_exit_ns(pid_ns); // NOTE(review): presumably process-accounting cleanup for the namespace — not visible here
    return;
}

总的来说，zap_pid_ns_processes 确保 PID 命名空间中的所有进程都能被正确终止和回收，无论它们是直接的子进程，还是后续被重新收养的孤儿进程。对于容器环境，'systemctl poweroff -f'命令会直接强杀容器内的所有进程，这对于维护系统的稳定性和资源回收尤为重要。

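如果是非容器环境（pid_ns就是init_pid_ns），reboot_pid_ns会直接返回0，不会杀死任何进程，reboot syscall继续向下执行。顺带一提，对于普通进程的退出，forget_original_parent会让child reaper（通常是init进程）收养其子进程；如果因此产生了孤儿进程组，且其中有已停止的作业，内核会向它们先发送SIGHUP，然后再发送SIGCONT。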

接下来继续分析reboot syscall函数的剩余部分。

/* Instead of trying to make the power_off code look like
	 * halt when pm_power_off is not set do it the easy way.
	 */
	if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !kernel_can_power_off())
		cmd = LINUX_REBOOT_CMD_HALT;

	mutex_lock(&system_transition_mutex);
	switch (cmd) {
	/* ...（此处省略其他case分支）... */

	case LINUX_REBOOT_CMD_HALT:
		kernel_halt();
		do_exit(0);

	case LINUX_REBOOT_CMD_POWER_OFF:
		kernel_power_off();
		do_exit(0);
		break;
	/* ...（此处省略其他case分支）... */
	}
	mutex_unlock(&system_transition_mutex);
	return ret;
}

LINUX_REBOOT_CMD_POWER_OFF命令对应的操作为kernel_power_off函数，用于执行系统的电源关闭操作。以下是对代码进行详细分析和注释：

/*
 * kernel_power_off - run the power-off sequence: prepare the shutdown,
 * move onto the reboot CPU, shut down syscore, log and dump kernel
 * messages, then hand over to the architecture's machine_power_off().
 */
void kernel_power_off(void)
{
    // Prepare the system for shutdown; SYSTEM_POWER_OFF selects the power-off variant
    kernel_shutdown_prepare(SYSTEM_POWER_OFF);

    // Run the power-off specific preparation step
    do_kernel_power_off_prepare();

    // Continue on the designated reboot CPU: the current task is migrated
    // there (this does not migrate other processors)
    migrate_to_reboot_cpu();

    // Shut down the system-core (syscore) components
    // NOTE(review): presumably things like timers and interrupt controllers — confirm
    syscore_shutdown();

    // Emit an emergency-level message announcing the power down
    pr_emerg("Power down\n");

    // Dump the kernel log with reason KMSG_DUMP_SHUTDOWN so it can be preserved across shutdown
    kmsg_dump(KMSG_DUMP_SHUTDOWN);

    // Call the machine-specific power-off routine that performs the actual power-off
    machine_power_off();
}

每个架构都会有对应的machine_power_off函数，如下是arm64的machine_power_off函数：

/*
 * Power-off simply requires that the secondary CPUs stop performing any
 * activity (executing tasks, handling interrupts). smp_send_stop()
 * achieves this. When the system power is turned off, it will take all CPUs
 * with it.
 */
void machine_power_off(void)
{
  // Disable interrupts on the current CPU
	local_irq_disable();
  
  // Stop all other CPUs so that none of them keeps executing tasks or handling interrupts during power-off
	smp_send_stop();
  
  // Execute the registered kernel power-off handler call chain, which performs the actual power-off
	do_kernel_power_off();
}

最后一个函数，do_kernel_power_off内核开发者对其的注释如下：

/**
 *	do_kernel_power_off - Execute kernel power-off handler call chain
 *
 *	Expected to be called as last step of the power-off sequence.
 *
 *	Powers off the system immediately if a power-off handler function has
 *	been registered. Otherwise does nothing.
 */
void do_kernel_power_off(void)

do_kernel_power_off函数的目的是执行已注册的内核关机处理函数链，它应当在关机流程的最后一步被调用。如果有注册的关机处理函数存在，它将立即关闭系统，否则不会执行任何操作，通过这种机制，内核确保在关机时能够调用适当的函数以关闭系统电源，如果没有合适的函数可调用，则避免出错。这样的设计可以带来灵活性，使平台相关的关机处理逻辑可以通过注册相应的函数来实现。