- Overview
How are tasks managed?
- The enqueue task path
The entry function is shown below:

/*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
    struct cfs_rq *cfs_rq;
    struct sched_entity *se = &p->se;
    int idle_h_nr_running = task_has_idle_policy(p);

    /*
     * The code below (indirectly) updates schedutil which looks at
     * the cfs_rq utilization to select a frequency.
     * Let's add the task's estimated utilization to the cfs_rq's
     * estimated utilization, before we update schedutil.
     */
    util_est_enqueue(&rq->cfs, p);  /* util estimation of the rq; studied separately later */

    /*
     * If in_iowait is set, the code below may not trigger any cpufreq
     * utilization updates, so do it here explicitly with the IOWAIT flag
     * passed.
     */
    if (p->in_iowait)
        cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);  /* set the IO boost; revisited when we study frequency scaling */

    /* with CONFIG_FAIR_GROUP_SCHED this walks up the group hierarchy and enqueues each level onto its runqueue */
    for_each_sched_entity(se) {
        if (se->on_rq)
            break;
        cfs_rq = cfs_rq_of(se);   /* get this sched_entity's cfs_rq: tasks are managed inside the cfs_rq */
        enqueue_entity(cfs_rq, se, flags);  /* the core function */

        /*
         * end evaluation on encountering a throttled cfs_rq
         *
         * note: in the case of encountering a throttled cfs_rq we will
         * post the final h_nr_running increment below.
         */
        if (cfs_rq_throttled(cfs_rq))
            break;
        cfs_rq->h_nr_running++;
        cfs_rq->idle_h_nr_running += idle_h_nr_running;

        flags = ENQUEUE_WAKEUP;
    }

    for_each_sched_entity(se) {
        cfs_rq = cfs_rq_of(se);
        cfs_rq->h_nr_running++;
        cfs_rq->idle_h_nr_running += idle_h_nr_running;

        if (cfs_rq_throttled(cfs_rq))
            break;

        update_load_avg(cfs_rq, se, UPDATE_TG);
        update_cfs_group(se);
    }

    if (!se) {
        add_nr_running(rq, 1);
        /*
         * Since new tasks are assigned an initial util_avg equal to
         * half of the spare capacity of their CPU, tiny tasks have the
         * ability to cross the overutilized threshold, which will
         * result in the load balancer ruining all the task placement
         * done by EAS. As a way to mitigate that effect, do not account
         * for the first enqueue operation of new tasks during the
         * overutilized flag detection.
         *
         * A better way of solving this problem would be to wait for
         * the PELT signals of tasks to converge before taking them
         * into account, but that is not straightforward to implement,
         * and the following generally works well enough in practice.
         */
        if (flags & ENQUEUE_WAKEUP)
            update_overutilized_status(rq);
    }

    if (cfs_bandwidth_used()) {
        /*
         * When bandwidth control is enabled; the cfs_rq_throttled()
         * breaks in the above iteration can result in incomplete
         * leaf list maintenance, resulting in triggering the assertion
         * below.
         */
        for_each_sched_entity(se) {
            cfs_rq = cfs_rq_of(se);

            if (list_add_leaf_cfs_rq(cfs_rq))
                break;
        }
    }

    assert_list_leaf_cfs_rq(rq);

    hrtick_update(rq);
}

If CONFIG_FAIR_GROUP_SCHED is enabled there may be multiple cfs_rqs; the details are studied later, for now we focus on task management. In that case cfs_rq_of() returns the cfs_rq the entity is attached to:

/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
    return se->cfs_rq;
}

If CONFIG_FAIR_GROUP_SCHED is not enabled, all sched entities belong to the single cfs_rq embedded in the rq:

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
    struct task_struct *p = task_of(se);
    struct rq *rq = task_rq(p);

    return &rq->cfs;
}
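The for_each_sched_entity() loop used above depends on the same config option. For reference, in this era of kernel/sched/fair.c it looks roughly like the following (quoted from memory, so treat it as a sketch of the shape rather than the exact source); with group scheduling it walks up the parent entities, without it the body runs exactly once for the task's own entity, and task_of() is simply container_of():

#ifdef CONFIG_FAIR_GROUP_SCHED
/* Walk up the scheduling-entity hierarchy */
#define for_each_sched_entity(se) \
        for (; se; se = se->parent)
#else
#define for_each_sched_entity(se) \
        for (; se; se = NULL)
#endif

/* !CONFIG_FAIR_GROUP_SCHED variant: the entity is embedded in task_struct */
static inline struct task_struct *task_of(struct sched_entity *se)
{
    return container_of(se, struct task_struct, se);
}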
enqueue_task_fair mainly takes the task's sched_entity and enqueues that entity onto the corresponding cfs_rq. Without CONFIG_FAIR_GROUP_SCHED, the rq manages the task through its single cfs_rq.
At the same time, nr_running in the rq is incremented by 1, so nr_running represents the number of runnable (ready) tasks currently on the rq.
static inline void add_nr_running(struct rq *rq, unsigned count)
{
    unsigned prev_nr = rq->nr_running;  /* value before the increment */

    rq->nr_running = prev_nr + count;   /* value after the increment */

#ifdef CONFIG_SMP
    /* once the rq holds two or more tasks, mark the root domain as overloaded so load balancing can act */
    if (prev_nr < 2 && rq->nr_running >= 2) {
        if (!READ_ONCE(rq->rd->overload))
            WRITE_ONCE(rq->rd->overload, 1);
    }
#endif

    sched_update_tick_dependency(rq);   /* related to NO_HZ; studied later */
}
enqueue_entity is shown below:
/*
 * MIGRATION
 *
 *      dequeue
 *        update_curr()
 *          update_min_vruntime()
 *        vruntime -= min_vruntime
 *
 *      enqueue
 *        update_curr()
 *          update_min_vruntime()
 *        vruntime += min_vruntime
 *
 * this way the vruntime transition between RQs is done when both
 * min_vruntime are up-to-date.
 *
 * WAKEUP (remote)
 *
 *      ->migrate_task_rq_fair() (p->state == TASK_WAKING)
 *        vruntime -= min_vruntime
 *
 *      enqueue
 *        update_curr()
 *          update_min_vruntime()
 *        vruntime += min_vruntime
 *
 * this way we don't have the most up-to-date min_vruntime on the originating
 * CPU and an up-to-date min_vruntime on the destination CPU.
 */
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
    bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
    /* cfs_rq->curr is the entity picked last time, i.e. the currently running task (when fair group scheduling is not enabled) */
    bool curr = cfs_rq->curr == se;

    /*
     * If we're the current task, we must renormalise before calling
     * update_curr().
     */
    /* open question: if curr means this task is already the one running, why is it being enqueued again? */
    if (renorm && curr)
        se->vruntime += cfs_rq->min_vruntime;   /* renormalise the se's vruntime */

    /* update the statistics of the currently running task, e.g. its vruntime */
    update_curr(cfs_rq);

    /*
     * Otherwise, renormalise after, such that we're placed at the current
     * moment in time, instead of some random moment in the past. Being
     * placed in the past could significantly boost this task to the
     * fairness detriment of existing tasks.
     */
    if (renorm && !curr)
        se->vruntime += cfs_rq->min_vruntime;

    /*
     * As the comment below says, the following mainly updates the load of
     * the task and of the rq.
     *
     * When enqueuing a sched_entity, we must:
     *   - Update loads to have both entity and cfs_rq synced with now.
     *   - Add its load to cfs_rq->runnable_avg
     *   - For group_entity, update its weight to reflect the new share of
     *     its group cfs_rq
     *   - Add its new weight to cfs_rq->load.weight
     */
    update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); /* update the task's load */
    update_cfs_group(se);
    enqueue_runnable_load_avg(cfs_rq, se);  /* account the task's load into the cfs_rq */
    account_entity_enqueue(cfs_rq, se);     /* count the entity into the cfs_rq */

    if (flags & ENQUEUE_WAKEUP)
        place_entity(cfs_rq, se, 0);

    check_schedstat_required();
    update_stats_enqueue(cfs_rq, se, flags);
    check_spread(cfs_rq, se);
    if (!curr)
        __enqueue_entity(cfs_rq, se);
    se->on_rq = 1;  /* mark the task as on the runqueue */

    /*
     * When bandwidth control is enabled, cfs might have been removed
     * because of a parent been throttled but cfs->nr_running > 1. Try to
     * add it unconditionnally.
     */
    if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
        list_add_leaf_cfs_rq(cfs_rq);

    if (cfs_rq->nr_running == 1)
        check_enqueue_throttle(cfs_rq);
}
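The MIGRATION comment at the top of enqueue_entity can be made concrete with a small user-space sketch. This is not kernel code: toy_cfs_rq, toy_se and the numbers are made up purely for illustration. The point is that vruntime is made relative to the source cfs_rq's min_vruntime on dequeue and re-anchored to the destination's min_vruntime on enqueue, so the task keeps its position relative to the other tasks on whichever runqueue it lands on:

#include <stdio.h>

/* toy model of the vruntime renormalisation done on migration */
struct toy_cfs_rq { unsigned long long min_vruntime; };
struct toy_se     { unsigned long long vruntime; };

int main(void)
{
    struct toy_cfs_rq src = { .min_vruntime = 1000000 };
    struct toy_cfs_rq dst = { .min_vruntime = 5000000 };
    struct toy_se se       = { .vruntime = 1002000 };  /* 2000 ahead of the source min_vruntime */

    se.vruntime -= src.min_vruntime;    /* dequeue on the source CPU: keep only the relative part */
    printf("relative vruntime while migrating: %llu\n", se.vruntime);

    se.vruntime += dst.min_vruntime;    /* enqueue on the destination CPU: re-anchor to its min_vruntime */
    printf("vruntime on destination: %llu (still 2000 ahead of dst min_vruntime %llu)\n",
           se.vruntime, dst.min_vruntime);
    return 0;
}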
enqueue_entity above mainly does the preparation work before the actual enqueue: it updates the load of the task and of the cfs_rq, together with the timing-related statistics. Those topics get their own chapters. Once the preparation is done, the real enqueue takes place:
/* As the comment says, entities are kept in a red-black tree. */
/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    /* the root of the rbtree is kept in the cfs_rq's tasks_timeline */
    struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
    struct rb_node *parent = NULL;
    struct sched_entity *entry;
    bool leftmost = true;

    /*
     * Find the right place in the rbtree:
     */
    while (*link) {
        parent = *link;
        entry = rb_entry(parent, struct sched_entity, run_node);
        /*
         * We dont care about collisions. Nodes with
         * the same key stay together.
         */
        if (entity_before(se, entry)) { /* as entity_before below shows, the tree is ordered by vruntime */
            link = &parent->rb_left;
        } else {
            link = &parent->rb_right;
            leftmost = false;
        }
    }

    rb_link_node(&se->run_node, parent, link);
    rb_insert_color_cached(&se->run_node,
                           &cfs_rq->tasks_timeline, leftmost);
}

static inline int entity_before(struct sched_entity *a,
                                struct sched_entity *b)
{
    return (s64)(a->vruntime - b->vruntime) < 0;
}
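One detail worth noting in entity_before() is the signed comparison of an unsigned difference: vruntime is a u64 that is allowed to wrap around, and casting (a - b) to s64 keeps the ordering correct across the wrap, which a plain unsigned compare would not. A minimal user-space check, for illustration only:

#include <stdio.h>
#include <stdint.h>

/* same comparison as entity_before(), on plain integers */
static int before(uint64_t a, uint64_t b)
{
    return (int64_t)(a - b) < 0;
}

int main(void)
{
    uint64_t a = UINT64_MAX - 100;  /* just before the u64 wrap */
    uint64_t b = 50;                /* just after the wrap */

    /* "a < b" is false as an unsigned compare, but logically a ran first */
    printf("before(a, b) = %d\n", before(a, b));    /* prints 1 */
    printf("before(b, a) = %d\n", before(b, a));    /* prints 0 */
    return 0;
}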
In summary, CFS manages tasks per sched_entity through a red-black tree, and the tree is ordered by vruntime.
The leftmost node in the tree has the smallest vruntime, so each time a task has to be picked it is enough to take the leftmost one.
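This is exactly what the pick side relies on. For reference, the helper that returns the leftmost entity in fair.c looks roughly like this in the same kernel series (shown here only to close the loop; picking the next task is covered separately):

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
    struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);

    if (!left)
        return NULL;

    return rb_entry(left, struct sched_entity, run_node);
}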
- dequeue task
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
    if (!(flags & DEQUEUE_NOCLOCK))
        update_rq_clock(rq);    /* unless asked not to, update the rq clock (the rq clock is studied later) */

    if (!(flags & DEQUEUE_SAVE)) {
        sched_info_dequeued(rq, p);
        psi_dequeue(p, flags & DEQUEUE_SLEEP);
    }

    uclamp_rq_dec(rq, p);   /* update uclamp once the task is dequeued; uclamp is studied later */
    p->sched_class->dequeue_task(rq, p, flags);
}

/*
 * The dequeue_task method is called before nr_running is
 * decreased. We remove the task from the rbtree and
 * update the fair scheduling stats:
 */
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
    struct cfs_rq *cfs_rq;
    struct sched_entity *se = &p->se;
    int task_sleep = flags & DEQUEUE_SLEEP;
    int idle_h_nr_running = task_has_idle_policy(p);

    for_each_sched_entity(se) {
        cfs_rq = cfs_rq_of(se);
        /* dequeue the entity first; nr_running is only decremented afterwards */
        dequeue_entity(cfs_rq, se, flags);

        /*
         * end evaluation on encountering a throttled cfs_rq
         *
         * note: in the case of encountering a throttled cfs_rq we will
         * post the final h_nr_running decrement below.
         */
        if (cfs_rq_throttled(cfs_rq))
            break;
        cfs_rq->h_nr_running--;
        cfs_rq->idle_h_nr_running -= idle_h_nr_running;

        /* Don't dequeue parent if it has other entities besides us */
        if (cfs_rq->load.weight) {
            /* Avoid re-evaluating load for this entity: */
            se = parent_entity(se);
            /*
             * Bias pick_next to pick a task from this cfs_rq, as
             * p is sleeping when it is within its sched_slice.
             */
            if (task_sleep && se && !throttled_hierarchy(cfs_rq))
                set_next_buddy(se);
            break;
        }
        flags |= DEQUEUE_SLEEP;
    }

    for_each_sched_entity(se) {
        cfs_rq = cfs_rq_of(se);
        cfs_rq->h_nr_running--;
        cfs_rq->idle_h_nr_running -= idle_h_nr_running;

        if (cfs_rq_throttled(cfs_rq))
            break;

        update_load_avg(cfs_rq, se, UPDATE_TG);
        update_cfs_group(se);
    }

    if (!se)
        sub_nr_running(rq, 1);

    util_est_dequeue(&rq->cfs, p, task_sleep);
    hrtick_update(rq);
}
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
    /*
     * Update run-time statistics of the 'current'.
     */
    update_curr(cfs_rq);

    /*
     * When dequeuing a sched_entity, we must:
     *   - Update loads to have both entity and cfs_rq synced with now.
     *   - Subtract its load from the cfs_rq->runnable_avg.
     *   - Subtract its previous weight from cfs_rq->load.weight.
     *   - For group entity, update its weight to reflect the new share
     *     of its group cfs_rq.
     */
    update_load_avg(cfs_rq, se, UPDATE_TG);
    dequeue_runnable_load_avg(cfs_rq, se);

    update_stats_dequeue(cfs_rq, se, flags);

    clear_buddies(cfs_rq, se);

    if (se != cfs_rq->curr)
        __dequeue_entity(cfs_rq, se);
    se->on_rq = 0;
    account_entity_dequeue(cfs_rq, se);

    /*
     * Normalize after update_curr(); which will also have moved
     * min_vruntime if @se is the one holding it back. But before doing
     * update_min_vruntime() again, which will discount @se's position and
     * can move min_vruntime forward still more.
     */
    if (!(flags & DEQUEUE_SLEEP))
        se->vruntime -= cfs_rq->min_vruntime;

    /* return excess runtime on last dequeue */
    return_cfs_rq_runtime(cfs_rq);

    update_cfs_group(se);

    /*
     * Now advance min_vruntime if @se was the entity holding it back,
     * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
     * put back on, and if we advance min_vruntime, we'll be placed back
     * further than we started -- ie. we'll be penalized.
     */
    if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
        update_min_vruntime(cfs_rq);
}
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}
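For completeness, sub_nr_running(), called at the end of dequeue_task_fair() above, is the counterpart of add_nr_running(). In this kernel series it is roughly the following (quoted as a sketch from memory); note that it does not clear rd->overload here, that is left to the load-balancing paths:

static inline void sub_nr_running(struct rq *rq, unsigned count)
{
    rq->nr_running -= count;
    /* Check if we still need preemption */
    sched_update_tick_dependency(rq);
}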
dequeue_task is responsible for removing a task from the rq.
- Summary
In summary, CFS tasks are managed through the red-black tree in the cfs_rq, ordered by vruntime.
On enqueue the task's on_rq is set to 1; on dequeue it is set back to 0.
On both enqueue and dequeue, the load of the task and of the cfs_rq is updated.
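To tie the summary together, here is a small self-contained user-space toy. It is not kernel code: toy_se, pick_next, the weights and the linear scan standing in for the rbtree are all simplifications for illustration. It mimics the behaviour described above: entities are ordered by vruntime, the "scheduler" always runs the entity with the smallest vruntime, and running advances that entity's vruntime inversely to its weight, so over time the entities share the CPU roughly in proportion to their weights:

#include <stdio.h>

#define NENT   3
#define TICKS  12
#define SLICE  1000ULL  /* pretend each pick runs for 1000 time units */

struct toy_se {
    const char        *name;
    unsigned int       weight;     /* larger weight => vruntime grows slower => more CPU time */
    unsigned long long vruntime;
    unsigned int       runs;
};

/* pick the entity with the smallest vruntime (the "leftmost" one) */
static struct toy_se *pick_next(struct toy_se *se, int n)
{
    struct toy_se *best = &se[0];
    for (int i = 1; i < n; i++)
        if (se[i].vruntime < best->vruntime)
            best = &se[i];
    return best;
}

int main(void)
{
    struct toy_se rq[NENT] = {
        { "heavy",  2048, 0, 0 },   /* twice the weight of "normal" */
        { "normal", 1024, 0, 0 },
        { "light",   512, 0, 0 },   /* half the weight of "normal" */
    };

    for (int t = 0; t < TICKS; t++) {
        struct toy_se *cur = pick_next(rq, NENT);
        cur->runs++;
        /* vruntime advances inversely to weight, in the spirit of CFS: delta * (1024 / weight) */
        cur->vruntime += SLICE * 1024 / cur->weight;
    }

    for (int i = 0; i < NENT; i++)
        printf("%-6s weight=%4u runs=%u vruntime=%llu\n",
               rq[i].name, rq[i].weight, rq[i].runs, rq[i].vruntime);
    return 0;
}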