1、问题现象
chrt设置RT不成功
[root@test1:/root] chrt -f 99 whoami
chrt: failed to set pid 0's policy: Operation not permitted
上述测试语句是将 whoami 这个命令进程的调度策略调整成 FIFO 99 优先级执行,但是报错了。注意错误里的 pid 0 并不是真的 0 号进程:chrt 传给 sched_setattr 的 pid 参数是 0,按系统调用约定表示"调用进程自身"(即 chrt 先给自己设置调度策略,成功后再 exec 执行 whoami)。所以这个错误直译过来就是:不允许(没有权限)给当前进程设置该调度策略。
2、初步分析
先搜一下有没有类似问题:
Ubuntu – chrt(): “failed to set pid XXX’s policy” on one machine, but not others – iTecTec
[原创] chrt: failed to set pid xxxx's policy: Operation not permitted_Dream.Seeker的博客-CSDN博客
解决方式是:sysctl -w kernel.sched_rt_runtime_us=-1
这条命令意思是将rt任务的执行时间设置为不受限制,即只要有rt任务,那么这个任务会一直占据CPU直到运行结束或主动让权,这是个高危设置,容易造成hungtask或softlockup,其他任务得不到调度或直接系统卡死。
为什么这样设置后就可以让chrt 执行成功了呢?以及之前执行失败的真正原因是什么呢?搜索知识库已经没能找到答案了,正面分析一下。
3、寻找根因
strace一下系统调用:
strace chrt -f 99 whoami
execve("/usr/bin/chrt", ["chrt", "-f", "99", "whoami"], 0x7fff36a99428 /* 29 vars */) = 0
brk(NULL) = 0x608000
......
sched_get_priority_min(SCHED_FIFO) = 1
sched_get_priority_max(SCHED_FIFO) = 99
sched_setattr(0, {size=48, sched_policy=SCHED_FIFO, sched_flags=0, sched_nice=0, sched_priority=99, sched_runtime=0, sched_deadline=0, sched_period=0}, 0) = -1 EPERM (Operation not permitted)
write(2, "chrt: ", 6chrt: ) = 6
write(2, "failed to set pid 0's policy", 28failed to set pid 0's policy) = 28
write(2, ": ", 2: ) = 2
......
+++ exited with 1 +++
发现是sched_setattr返回了EPERM错误(操作无权限),继续走读内核代码,函数调用关系:sched_setattr -> __sched_setscheduler
对应函数代码:
/*
 * __sched_setscheduler() - core worker behind sched_setscheduler()/sched_setattr().
 *
 * @p:    target task
 * @attr: requested policy/priority/deadline parameters
 * @user: true when the request comes from userspace (enables permission,
 *        rlimit, LSM and bandwidth checks); false for in-kernel callers
 * @pi:   true when priority-inheritance boosting must be taken into account
 *
 * Returns 0 on success, or a negative errno (-EINVAL/-EPERM/-EBUSY).
 */
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user, bool pi)
{
/*
 * Map the user-visible priority onto the kernel-internal prio scale,
 * where a lower numeric value means a higher priority (DL below RT).
 */
int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
MAX_RT_PRIO - 1 - attr->sched_priority;
int retval, oldprio, oldpolicy = -1, queued, running;
int new_effective_prio, policy = attr->sched_policy;
const struct sched_class *prev_class;
struct rq_flags rf;
int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq *rq;
/* The pi code expects interrupts enabled */
BUG_ON(pi && in_interrupt());
recheck:
/* Double check policy once rq lock held: */
if (policy < 0) {
/* policy < 0 means "keep the task's current policy". */
reset_on_fork = p->sched_reset_on_fork;
policy = oldpolicy = p->policy;
} else {
reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
if (!valid_policy(policy))
return -EINVAL;
}
if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
return -EINVAL;
/*
 * Valid priorities for SCHED_FIFO and SCHED_RR are
 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
 * SCHED_BATCH and SCHED_IDLE is 0.
 */
if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
(!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
return -EINVAL;
if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
(rt_policy(policy) != (attr->sched_priority != 0)))
return -EINVAL;
/*
 * Allow unprivileged RT tasks to decrease priority:
 */
if (user && !capable(CAP_SYS_NICE)) {
if (fair_policy(policy)) {
if (attr->sched_nice < task_nice(p) &&
!can_nice(p, attr->sched_nice))
return -EPERM;
}
if (rt_policy(policy)) {
/* Without CAP_SYS_NICE, RLIMIT_RTPRIO bounds what is allowed. */
unsigned long rlim_rtprio =
task_rlimit(p, RLIMIT_RTPRIO);
/* Can't set/change the rt policy: */
if (policy != p->policy && !rlim_rtprio)
return -EPERM;
/* Can't increase priority: */
if (attr->sched_priority > p->rt_priority &&
attr->sched_priority > rlim_rtprio)
return -EPERM;
}
/*
 * Can't set/change SCHED_DEADLINE policy at all for now
 * (safest behavior); in the future we would like to allow
 * unprivileged DL tasks to increase their relative deadline
 * or reduce their runtime (both ways reducing utilization)
 */
if (dl_policy(policy))
return -EPERM;
/*
 * Treat SCHED_IDLE as nice 20. Only allow a switch to
 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
 */
if (idle_policy(p->policy) && !idle_policy(policy)) {
if (!can_nice(p, task_nice(p)))
return -EPERM;
}
/* Can't change other user's priorities: */
if (!check_same_owner(p))
return -EPERM;
/* Normal users shall not reset the sched_reset_on_fork flag: */
if (p->sched_reset_on_fork && !reset_on_fork)
return -EPERM;
}
if (user) {
/* SCHED_FLAG_SUGOV is reserved for in-kernel (schedutil) use. */
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return -EINVAL;
/* LSM hook (e.g. SELinux) may still veto the change. */
retval = security_task_setscheduler(p);
if (retval)
return retval;
}
/*
 * Make sure no PI-waiters arrive (or leave) while we are
 * changing the priority of the task:
 *
 * To be able to change p->policy safely, the appropriate
 * runqueue lock must be held.
 */
rq = task_rq_lock(p, &rf);
update_rq_clock(rq);
/*
 * Changing the policy of the stop threads its a very bad idea:
 */
if (p == rq->stop) {
task_rq_unlock(rq, p, &rf);
return -EINVAL;
}
/*
 * If not changing anything there's no need to proceed further,
 * but store a possible modification of reset_on_fork.
 */
if (unlikely(policy == p->policy)) {
if (fair_policy(policy) && attr->sched_nice != task_nice(p))
goto change;
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
goto change;
if (dl_policy(policy) && dl_param_changed(p, attr))
goto change;
p->sched_reset_on_fork = reset_on_fork;
task_rq_unlock(rq, p, &rf);
return 0;
}
change:
if (user) {
#ifdef CONFIG_RT_GROUP_SCHED
/*
 * Do not allow realtime tasks into groups that have no runtime
 * assigned.
 *
 * NOTE(review): this is the -EPERM hit in the case analyzed in
 * this article: RT throttling is enabled (sched_rt_runtime_us >= 0)
 * while the task's cgroup has cpu.rt_runtime_us == 0.
 */
if (rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p))) {
task_rq_unlock(rq, p, &rf);
return -EPERM;
}
#endif
#ifdef CONFIG_SMP
if (dl_bandwidth_enabled() && dl_policy(policy) &&
!(attr->sched_flags & SCHED_FLAG_SUGOV)) {
cpumask_t *span = rq->rd->span;
/*
 * Don't allow tasks with an affinity mask smaller than
 * the entire root_domain to become SCHED_DEADLINE. We
 * will also fail if there's no bandwidth available.
 */
if (!cpumask_subset(span, &p->cpus_allowed) ||
rq->rd->dl_bw.bw == 0) {
task_rq_unlock(rq, p, &rf);
return -EPERM;
}
}
#endif
}
/* Re-check policy now with rq lock held: */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
/* Policy changed under us: drop the lock and start over. */
policy = oldpolicy = -1;
task_rq_unlock(rq, p, &rf);
goto recheck;
}
/*
 * If setscheduling to SCHED_DEADLINE (or changing the parameters
 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
 * is available.
 */
if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
task_rq_unlock(rq, p, &rf);
return -EBUSY;
}
p->sched_reset_on_fork = reset_on_fork;
oldprio = p->prio;
if (pi) {
/*
 * Take priority boosted tasks into account. If the new
 * effective priority is unchanged, we just store the new
 * normal parameters and do not touch the scheduler class and
 * the runqueue. This will be done when the task deboost
 * itself.
 */
new_effective_prio = rt_effective_prio(p, newprio);
if (new_effective_prio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}
/* Dequeue/stop the task, switch scheduler class, then requeue. */
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
dequeue_task(rq, p, queue_flags);
if (running)
put_prev_task(rq, p);
prev_class = p->sched_class;
__setscheduler(rq, p, attr, pi);
if (queued) {
/*
 * We enqueue to tail when the priority of a task is
 * increased (user space view).
 */
if (oldprio < p->prio)
queue_flags |= ENQUEUE_HEAD;
enqueue_task(rq, p, queue_flags);
}
if (running)
set_curr_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
/* Avoid rq from going away on us: */
preempt_disable();
task_rq_unlock(rq, p, &rf);
if (pi)
rt_mutex_adjust_pi(p);
/* Run balance callbacks after we've adjusted the PI chain: */
balance_callback(rq);
preempt_enable();
return 0;
}
直接分析EPERM返回点,发现一处跟rt_runtime有关的判断分支:
/*
* Do not allow realtime tasks into groups that have no runtime
* assigned.
*/
if (rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p))) {
task_rq_unlock(rq, p, &rf);
return -EPERM;
}
/*
 * RT bandwidth throttling is in effect unless the administrator
 * disabled it by writing -1 to kernel.sched_rt_runtime_us.
 */
static inline int rt_bandwidth_enabled(void)
{
	return !(sysctl_sched_rt_runtime < 0);
}
{
.procname = "sched_rt_runtime_us",
.data = &sysctl_sched_rt_runtime,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = sched_rt_handler,
}
这个判断的直接意思是:当同时满足三个条件——sched_rt_runtime_us 大于或等于 0(即使能了实时任务的带宽/运行时间限制)、目标进程所在 cgroup(任务分组)的 rt_runtime 配额为 0、且该分组不是 autogroup——时,sched_setattr 设置实时调度策略的操作会被拒绝,返回 -EPERM。
sysctl -w kernel.sched_rt_runtime_us=-1 是将 /proc/sys/kernel/sched_rt_runtime_us 设置为 -1,即 sysctl_sched_rt_runtime = -1。这个操作过后 rt_bandwidth_enabled() 返回假,上述条件整体不再满足,实际上就是放开了设置 rt 进程的权限检查,但同时也带来了前面所说的系统隐患(RT 任务不再受运行时间限制)。
4、解决方案
真正的解决方案应该在这里:
if (rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p)))
这个意思是进程组的实时进程限制运行时间为0,只要把它调整为大于0就可以了,系统默认的数值应该是950000 或 0.95s。这数值怎么调呢?
先看下新进程所在 cgroup 的 rt_runtime_us 配置,果然是 0:
[root@test1:/root]
cat /sys/fs/cgroup/cpu/system.slice/cpu.rt_runtime_us
0
解决步骤
1、找到当前环境下新进程的cgroup。cat /proc/(进程pid)/cgroup
[root@test1:/root]
cat /proc/35216/cgroup
12:devices:/system.slice
11:cpuset:/
10:perf_event:/
9:freezer:/
8:memory:/system.slice
7:pids:/
6:blkio:/
5:cpu,cpuacct:/system.slice
4:net_cls,net_prio:/
3:rdma:/
2:hugetlb:/
1:name=systemd:/system.slice
2、进入到对应的 cgroup 修改 cpu.rt_runtime_us 为 950000(或其他大于 0 的值,取决于你的 rt 策略)。echo 950000 > /sys/fs/cgroup/cpu/(对应进程的cgroup)/cpu.rt_runtime_us
[root@test1:/root]
echo 950000 > /sys/fs/cgroup/cpu/system.slice/cpu.rt_runtime_us