1. task_struct 中标示linux 进程优先级的几个重要变量
他们之间究竟是什么关系,究竟什么样的优先级值才能最快能被调度器调度执行?
详细解读之前,可以先明确上面的一个问题,task_struct 中的成员变量 prio越小,进程的优先级越高。prio 值的取值范围为0..139。
上述描述在include/linux/sched.h 中也有描述,
/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
* tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
* values are inverted: lower p->prio value means higher priority.
*
* The MAX_USER_RT_PRIO value allows the actual maximum
* RT priority to be separate from the value exported to
* user-space. This allows kernel threads to set their
* priority to a value higher than any user task. Note:
* MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
*/
#define MAX_USER_RT_PRIO 100
#define MAX_RT_PRIO MAX_USER_RT_PRIO /* RT prio range is 0..MAX_RT_PRIO-1, i.e. 0..99 */
#define MAX_PRIO (MAX_RT_PRIO + 40) /* 140: total prio range is 0..139 */
#define DEFAULT_PRIO (MAX_RT_PRIO + 20) /* 120: default priority of a normal task (nice 0) */
需要提到调度器不同的调度策略,其中 SCHED_ISO 是预留还没有实现的(所以数值 4 被跳过了),不用管。SCHED_FIFO/SCHED_RR 为实时进程调度策略,SCHED_NORMAL/SCHED_BATCH 为非实时进程也就是普通进程的调度策略。
/*
* Scheduling policies
*/
#define SCHED_NORMAL 0 /* non-RT: default time-sharing policy */
#define SCHED_FIFO 1 /* RT: first-in first-out */
#define SCHED_RR 2 /* RT: round-robin */
#define SCHED_BATCH 3 /* non-RT: batch workloads */
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE 5 /* value 4 is skipped for the reserved SCHED_ISO */
/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
#define SCHED_RESET_ON_FORK 0x40000000
其中,实时进程(sched policy 为 SCHED_FIFO/SCHED_RR)的优先级取值范围是0..99,非实时进程的取值范围为100..139。
如果问题就到这里结束也许觉得也还挺清晰,但是这只是开始,问题来源于linux kernel 中设置调度器的方式似乎正在颠覆你刚刚有的这点认识。
接下来会一一描述,但是有一条请绝对坚信并牢记,
task_struct 中的成员变量 prio越小,进程的优先级越高。prio 值的取值范围为0..139。
2.设置进程的调度策略和进程优先级
下面以linux kernel watchdog 进程作为例子,kernel/watchdog.c
为每个cpu 创建 watchdog/N 进程,watchdog 进程的循环体在 watchdog()函数。
static int watchdog_enable(int cpu)
{
struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
int err = 0;
/* enable the perf event */
err = watchdog_nmi_enable(cpu);
/* Regardless of err above, fall through and start softlockup */
/* create the watchdog thread */
if (!p) {
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
设置 watchdog/N 进程的调度策略和进程优先级,sched policy 很明确SCHED_FIFO,为RT 实时进程。
.sched_priority 被设置成了MAX_RT_PRIO-1 也就是99了,这是个什么优先级的进程呢? 别急,且看代码且分析。
//struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
sched_setscheduler(p, SCHED_FIFO, &param);
sched_setscheduler()
-> __sched_setscheduler()
-> __setscheduler()
在__sched_setscheduler() 函数中有这么一段代码,向我们描述了一个很重要的事实,想用sched_setscheduler()设置进程优先级,
必须满足: 实时进程有效优先级为1..99,非实时进程的优先级为0。 这里你可能觉得我是在开玩笑,非实时进程的有效优先级怎么可能为0 ?
请注意,这里指的是内核通过sched_setscheduler() 接口设置的优先级,在后面你就会看到非实时进程的优先级我们有个初始值(fork/init),
你只能通过内核提供的nice 设置函数,间接修改非实时进程的优先级。
/*
* Valid priorities for SCHED_FIFO and SCHED_RR are
* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
* SCHED_BATCH and SCHED_IDLE is 0.
*/
if (param->sched_priority < 0 ||
(p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
(!p->mm && param->sched_priority > MAX_RT_PRIO-1))
return -EINVAL;
if (rt_policy(policy) != (param->sched_priority != 0))
return -EINVAL;
后来调到__setscheduler() 真正进行进程优先级设置,
/* Actually do priority change: must hold rq lock. */
static void
__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
{
p->policy = policy;/* the policy passed to sched_setscheduler() is stored in p->policy as-is */
p->rt_priority = prio;/* the prio argument lands in p->rt_priority, NOT p->prio — for non-RT policies this value can only be 0 */
p->normal_prio = normal_prio(p);/* derive the effective priority; see normal_prio() below */
/* we are holding p->pi_lock already */
p->prio = rt_mutex_getprio(p);/* normally normal_prio; presumably adjusted for rt-mutex priority inheritance */
if (rt_prio(p->prio))
p->sched_class = &rt_sched_class;
else
p->sched_class = &fair_sched_class;
set_load_weight(p);
}
看下normal_prio()
/*
* Calculate the expected normal priority: i.e. priority
* without taking RT-inheritance into account. Might be
* boosted by interactivity modifiers. Changes upon fork,
* setprio syscalls, and whenever the interactivity
* estimator recalculates.
*/
static inline int normal_prio(struct task_struct *p)
{
int prio;
if (task_has_rt_policy(p))
prio = MAX_RT_PRIO-1 - p->rt_priority;/* RT task: prio = 99 - rt_priority. The watchdog thread passes 99 to sched_setscheduler(), so its p->prio becomes 0 — the highest RT priority in the kernel */
else
prio = __normal_prio(p);/* non-RT task: priority comes from static_prio, not from the sched_setscheduler() argument */
return prio;
}
看下__normal_prio(),
/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
	/*
	 * Non-RT tasks: the value passed to sched_setscheduler() is never
	 * assigned to p->prio here; the priority is taken directly from
	 * p->static_prio, which is only set at fork time or through the
	 * nice interface (set_user_nice / INIT_TASK / sched_fork).
	 */
	return p->static_prio;
}
遍历整个kernel 代码树,只有三个地方会对p->static_prio进行赋值,
--------------------------
void sched_fork(struct task_struct *p)
{
/*
* Revert to default priority/policy on fork if requested.
*/
if (unlikely(p->sched_reset_on_fork)) {
if (task_has_rt_policy(p)) {
p->policy = SCHED_NORMAL;
p->static_prio = NICE_TO_PRIO(0);
p->rt_priority = 0;
} else if (PRIO_TO_NICE(p->static_prio) < 0)
p->static_prio = NICE_TO_PRIO(0);
--------------------------
void set_user_nice(struct task_struct *p, long nice)
{
/*
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
* it wont have any effect on scheduling until the task is
* SCHED_FIFO/SCHED_RR:
*/
if (task_has_rt_policy(p)) {
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
on_rq = p->on_rq;
if (on_rq)
dequeue_task(rq, p, 0);
p->static_prio = NICE_TO_PRIO(nice);
--------------------------
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
*/
#define INIT_TASK(tsk) \
{ \
.state = 0, \
.stack = &init_thread_info, \
.usage = ATOMIC_INIT(2), \
.flags = PF_KTHREAD, \
.prio = MAX_PRIO-20, \
.static_prio = MAX_PRIO-20,
明白了吧? p->static_prio 的值,也就是非实时进程的优先级,只能通过fork 或是设置nice 值间接修改。
3.查看进程优先级
ps 提供这个选项,Android ps 的实现不同于ubuntu,看下system/core/toolbox/ps.c,源代码告诉我们选项足够了,调度策略,优先级,RT优先级都能看,
ps -p -P
看下吧:PRIO 标识进程的优先级,怎么样? 有没有世界观又一次被颠覆的感觉? 你不是告诉我进程优先级取值范围是 0..139吗???
别急, 看下ps 到底是怎么得到进程的优先级的。
ps 是通过 cat /proc/<pid>/stat 获取到的值,
对应kernel 里面的实现在 fs/proc/base.c 中,
static const struct pid_entry tgid_base_stuff[] = {
INF("cmdline", S_IRUGO, proc_pid_cmdline),
ONE("stat", S_IRUGO, proc_tgid_stat),
看下priority 是通过task_prio()获取的,
static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task, int whole)
{
/* scale priority and nice values from timeslices to -20..20 */
/* to make it look like a "normal" Unix priority/nice value */
priority = task_prio(task);
nice = task_nice(task);
看下task_prio()
/**
* task_prio - return the priority value of a given task.
* @p: the task in question.
*
* This is the priority value as seen by users in /proc.
* RT tasks are offset by -200. Normal tasks are centered
* around 0, value goes from -16 to +15.
*/
int task_prio(const struct task_struct *p)
{
return p->prio - MAX_RT_PRIO;/* the priority ps reports is p->prio - 100, so RT tasks show negative values (e.g. -100 for p->prio == 0) and normal tasks show 0..39 */
}
赶紧找到你熟悉的进程看看,有没有突然又相信本文开始提醒你坚持的世界观:
task_struct 中的成员变量 prio越小,进程的优先级越高。prio 值的取值范围为0..139。
USER PID PPID VSIZE RSS PRIO NICE RTPRI SCHED PCY WCHAN PC NAME
root 1 0 932 716 20 0 0 0 fg c0179270 00029eb4 S /init
root 2 0 0 0 -2 0 1 1 fg c00ba720 00000000 S kthreadd
root 3 2 0 0 20 0 0 0 fg c00a1e00 00000000 S ksoftirqd/0
root 6 2 0 0 20 0 0 0 fg c0081080 00000000 D kworker/u:0
root 7 2 0 0 0 -20 0 0 fg c007e2e0 00000000 D kworker/u:0H
root 8 2 0 0 -100 0 99 1 fg c00f494c 00000000 S migration/0
root 21 2 0 0 0 -20 0 0 fg c00b51b0 00000000 S khelper
root 22 2 0 0 0 -20 0 0 fg c00b51b0 00000000 S netns
root 27 2 0 0 0 -20 0 0 fg c00b60a8 00000000 S kworker/0:1H
root 28 2 0 0 0 -20 0 0 fg c00b51b0 00000000 S modem_notifier
root 29 2 0 0 0 -20 0 0 fg c00b51b0 00000000 S smd_channel_clo
root 30 2 0 0 0 -20 0 0 fg c00b51b0 00000000 S smsm_cb_wq
root 32 2 0 0 0 -20 0 0 fg c00b51b0 00000000 S rpm-smd
root 33 2 0 0 0 -20 0 0 fg c00b60a8 00000000 S kworker/u:1H
root 50 2 0 0 -51 0 50 1 fg c0100ac8 00000000 S irq/47-cpr
root 51 2 0 0 0 -20 0 0 fg c00b51b0 00000000 S mpm
root 52 2 0 0 20 0 0 0 fg c0144d74 00000000 S sync_supers
root 53 2 0 0 20 0 0 0 fg c0145d9c 00000000 S bdi-default
root 54 2 0 0 0 -20 0 0 fg c00b51b0 00000000 S kblockd
root 55 2 0 0 20 0 0 5 fg c03599c8 00000000 S system
root 56 2 0 0 20 0 0 0 fg c0414830 00000000 S khubd
root 57 2 0 0 -51 0 50 1 fg c0100ac8 00000000 S irq/102-msm_iom