1、问题现象
chrt设置RT不成功
[root@test1:/root] chrt -f 99 whoami
chrt: failed to set pid 0's policy: Operation not permitted
上述测试语句是将 whoami 这个命令进程的调度策略调整成 FIFO 99 优先级执行,但是报错了。注意错误里的 pid 0 并不是真的 0 号进程:chrt 传给 sched_setattr 的 pid 参数是 0,按系统调用约定表示"调用进程自身"(即 chrt 先给自己设置调度策略,成功后再 exec 执行 whoami)。所以这个错误直译过来就是:不允许(没有权限)给当前进程设置该调度策略。
2、初步分析
先搜一下有没有类似问题:
Ubuntu – chrt(): “failed to set pid XXX’s policy” on one machine, but not others – iTecTec
[原创] chrt: failed to set pid xxxx's policy: Operation not permitted_Dream.Seeker的博客-CSDN博客
解决方式是:sysctl -w kernel.sched_rt_runtime_us=-1
这条命令意思是将rt任务的执行时间设置为不受限制,即只要有rt任务,那么这个任务会一直占据CPU直到运行结束或主动让权,这是个高危设置,容易造成hungtask或softlockup,其他任务得不到调度或直接系统卡死。
为什么这样设置后就可以让chrt 执行成功了呢?以及之前执行失败的真正原因是什么呢?搜索知识库已经没能找到答案了,正面分析一下。
3、寻找根因
strace一下系统调用:
strace chrt -f 99 whoami
execve("/usr/bin/chrt", ["chrt", "-f", "99", "whoami"], 0x7fff36a99428 /* 29 vars */) = 0
brk(NULL) = 0x608000
......
sched_get_priority_min(SCHED_FIFO) = 1
sched_get_priority_max(SCHED_FIFO) = 99
sched_setattr(0, {size=48, sched_policy=SCHED_FIFO, sched_flags=0, sched_nice=0, sched_priority=99, sched_runtime=0, sched_deadline=0, sched_period=0}, 0) = -1 EPERM (Operation not permitted)
write(2, "chrt: ", 6chrt: ) = 6
write(2, "failed to set pid 0's policy", 28failed to set pid 0's policy) = 28
write(2, ": ", 2: ) = 2
......
+++ exited with 1 +++
发现是sched_setattr返回了EPERM错误(操作无权限),继续走读内核代码,函数调用关系:sched_setattr -> __sched_setscheduler
对应函数代码:
/*
 * __sched_setscheduler() - core worker behind sched_setscheduler()/sched_setattr().
 *
 * @p:    target task
 * @attr: requested policy/priority/deadline parameters
 * @user: true when the request comes from userspace (enables permission,
 *        rlimit, LSM and bandwidth checks); false for in-kernel callers
 * @pi:   true when priority-inheritance boosting must be taken into account
 *
 * Returns 0 on success, or a negative errno (-EINVAL/-EPERM/-EBUSY).
 */
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user, bool pi)
{
/*
 * Map the user-visible priority onto the kernel-internal prio scale,
 * where a lower numeric value means a higher priority (DL below RT).
 */
int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
MAX_RT_PRIO - 1 - attr->sched_priority;
int retval, oldprio, oldpolicy = -1, queued, running;
int new_effective_prio, policy = attr->sched_policy;
const struct sched_class *prev_class;
struct rq_flags rf;
int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq *rq;
/* The pi code expects interrupts enabled */
BUG_ON(pi && in_interrupt());
recheck:
/* Double check policy once rq lock held: */
if (policy < 0) {
/* policy < 0 means "keep the task's current policy". */
reset_on_fork = p->sched_reset_on_fork;
policy = oldpolicy = p->policy;
} else {
reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
if (!valid_policy(policy))
return -EINVAL;
}
if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
return -EINVAL;
/*
 * Valid priorities for SCHED_FIFO and SCHED_RR are
 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
 * SCHED_BATCH and SCHED_IDLE is 0.
 */
if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
(!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
return -EINVAL;
if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
(rt_policy(policy) != (attr->sched_priority != 0)))
return -EINVAL;
/*
 * Allow unprivileged RT tasks to decrease priority:
 */
if (user && !capable(CAP_SYS_NICE)) {
if (fair_policy(policy)) {
if (attr->sched_nice < task_nice(p) &&
!can_nice(p, attr->sched_nice))
return -EPERM;
}
if (rt_policy(policy)) {
/* Without CAP_SYS_NICE, RLIMIT_RTPRIO bounds what is allowed. */
unsigned long rlim_rtprio =
task_rlimit(p, RLIMIT_RTPRIO);
/* Can't set/change the rt policy: */
if (policy != p->policy && !rlim_rtprio)
return -EPERM;
/* Can't increase priority: */
if (attr->sched_priority > p->rt_priority &&
attr->sched_priority > rlim_rtprio)
return -EPERM;
}
/*
 * Can't set/change SCHED_DEADLINE policy at all for now
 * (safest behavior); in the future we would like to allow
 * unprivileged DL tasks to increase their relative deadline
 * or reduce their runtime (both ways reducing utilization)
 */
if (dl_policy(policy))
return -EPERM;
/*
 * Treat SCHED_IDLE as nice 20. Only allow a switch to
 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
 */
if (idle_policy(p->policy) && !idle_policy(policy)) {
if (!can_nice(p, task_nice(p)))
return -EPERM;
}
/* Can't change other user's priorities: */
if (!check_same_owner(p))
return -EPERM;
/* Normal users shall not reset the sched_reset_on_fork flag: */
if (p->sched_reset_on_fork && !reset_on_fork)
return -EPERM;
}
if (user) {
/* SCHED_FLAG_SUGOV is reserved for in-kernel (schedutil) use. */
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return -EINVAL;
/* LSM hook (e.g. SELinux) may still veto the change. */
retval = security_task_setscheduler(p);
if (retval)
return retval;
}
/*
 * Make sure no PI-waiters arrive (or leave) while we are
 * changing the priority of the task:
 *
 * To be able to change p->policy safely, the appropriate
 * runqueue lock must be held.
 */
rq = task_rq_lock(p, &rf);
update_rq_clock(rq);
/*
 * Changing the policy of the stop threads its a very bad idea:
 */
if (p == rq->stop) {
task_rq_unlock(rq, p, &rf);
return -EINVAL;
}
/*
 * If not changing anything there's no need to proceed further,
 * but store a possible modification of reset_on_fork.
 */
if (unlikely(policy == p->policy)) {
if (fair_policy(policy) && attr->sched_nice != task_nice(p))
goto change;
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
goto change;
if (dl_policy(policy) && dl_param_changed(p, attr))
goto change;
p->sched_reset_on_fork = reset_on_fork;
task_rq_unlock(rq, p, &rf);
return 0;
}
change:
if (user) {
#ifdef CONFIG_RT_GROUP_SCHED
/*
 * Do not allow realtime tasks into groups that have no runtime
 * assigned.
 *
 * NOTE(review): this is the -EPERM hit in the case analyzed in
 * this article: RT throttling is enabled (sched_rt_runtime_us >= 0)
 * while the task's cgroup has cpu.rt_runtime_us == 0.
 */
if (rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p))) {
task_rq_unlock(rq, p, &rf);
return -EPERM;
}
#endif
#ifdef CONFIG_SMP
if (dl_bandwidth_enabled() && dl_policy(policy) &&
!(attr->sched_flags & SCHED_FLAG_SUGOV)) {
cpumask_t *span = rq->rd->span;
/*
 * Don't allow tasks with an affinity mask smaller than
 * the entire root_domain to become SCHED_DEADLINE. We
 * will also fail if there's no bandwidth available.
 */
if (!cpumask_subset(span, &p->cpus_allowed) ||
rq->rd->dl_bw.bw == 0) {
task_rq_unlock(rq, p, &rf);
return -EPERM;
}
}
#endif
}
/* Re-check policy now with rq lock held: */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
/* Policy changed under us: drop the lock and start over. */
policy = oldpolicy = -1;
task_rq_unlock(rq, p, &rf);
goto recheck;
}
/*
 * If setscheduling to SCHED_DEADLINE (or changing the parameters
 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
 * is available.
 */
if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
task_rq_unlock(rq, p, &rf);
return -EBUSY;
}
p->sched_reset_on_fork = reset_on_fork;
oldprio = p->prio;
if (pi) {
/*
 * Take priority boosted tasks into account. If the new
 * effective priority is unchanged, we just store the new
 * normal parameters and do not touch the scheduler class and
 * the runqueue. This will be done when the task deboost
 * itself.
 */
new_effective_prio = rt_effective_prio(p, newprio);
if (new_effective_prio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}
/* Dequeue/stop the task, switch scheduler class, then requeue. */
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
dequeue_task(rq, p, queue_flags);
if (running)
put_prev_task(rq, p);
prev_class = p->sched_class;
__setscheduler(rq, p, attr, pi);
if (queued) {
/*
 * We enqueue to tail when the priority of a task is
 * increased (user space view).
 */
if (oldprio < p->prio)
queue_flags |= ENQUEUE_HEAD;
enqueue_task(rq, p, queue_flags);
}
if (running)
set_curr_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
/* Avoid rq from going away on us: */
preempt_disable();
task_rq_unlock(rq, p, &rf);
if (pi)
rt_mutex_adjust_pi(p);
/* Run balance callbacks after we've adjusted the PI chain: */
balance_callback(rq);
preempt_enable();
return 0;
}
直接分析EPERM返回点,发现一处跟rt_runtime有关的判断分支:
/*
* Do not allow realtime tasks into groups that have no runtime
* assigned.
*/
if (rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p))) {
task_rq_unlock(rq, p, &rf);
return -EPERM;
}
/*
 * RT bandwidth throttling is in effect unless the administrator
 * disabled it by writing -1 to kernel.sched_rt_runtime_us.
 */
static inline int rt_bandwidth_enabled(void)
{
	return !(sysctl_sched_rt_runtime < 0);
}
{
.procname = "sched_rt_runtime_us",
.data = &sysctl_sched_rt_runtime,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = sched_rt_handler,
}
这个判断的直接意思是:当同时满足三个条件——sched_rt_runtime_us 大于或等于 0(即使能了实时任务的带宽/运行时间限制)、目标进程所在 cgroup(任务分组)的 rt_runtime 配额为 0、且该分组不是 autogroup——时,sched_setattr 设置实时调度策略的操作会被拒绝,返回 -EPERM。
sysctl -w kernel.sched_rt_runtime_us=-1 是将 /proc/sys/kernel/sched_rt_runtime_us 设置为 -1,即 sysctl_sched_rt_runtime = -1。这个操作过后 rt_bandwidth_enabled() 返回假,上述条件整体不再满足,实际上就是放开了设置 rt 进程的权限检查,但同时也带来了前面所说的系统隐患(RT 任务不再受运行时间限制)。
4、解决方案
真正的解决方案应该在这里:
if (rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p)))
这个意思是进程组的实时进程限制运行时间为0,只要把它调整为大于0就可以了,系统默认的数值应该是950000 或 0.95s。这数值怎么调呢?
先看下新进程所在 cgroup 的 rt_runtime_us 配置,果然是 0:
[root@test1:/root]
cat /sys/fs/cgroup/cpu/system.slice/cpu.rt_runtime_us
0
解决步骤
1、找到当前环境下新进程的cgroup。cat /proc/(进程pid)/cgroup
[root@test1:/root]
cat /proc/35216/cgroup
12:devices:/system.slice
11:cpuset:/
10:perf_event:/
9:freezer:/
8:memory:/system.slice
7:pids:/
6:blkio:/
5:cpu,cpuacct:/system.slice
4:net_cls,net_prio:/
3:rdma:/
2:hugetlb:/
1:name=systemd:/system.slice
2、进入到对应的 cgroup 修改 cpu.rt_runtime_us 为 950000(或其他大于 0 的值,取决于你的 rt 策略)。echo 950000 > /sys/fs/cgroup/cpu/(对应进程的cgroup)/cpu.rt_runtime_us
[root@test1:/root]
echo 950000 > /sys/fs/cgroup/cpu/system.slice/cpu.rt_runtime_us