[scheduler] 6. How processes created via fork/clone get scheduled

Note:

The analysis of select_task_rq() in this chapter is not very thorough; two dedicated follow-up articles will analyze how the traditional load balancer migrates tasks and how EAS picks a target CPU for a task based on energy efficiency.

The two articles:
1. "9. How EAS selects a target CPU for a task based on energy efficiency"
2. "10. How the traditional load balancer picks a suitable CPU for a task"

From kernel/fork.c we can see that whether a process is created from userspace or from kernel space, the call paths are ultimately the same: everything ends up in the _do_fork() function. Let's look at the source:

#ifndef CONFIG_HAVE_COPY_THREAD_TLS
/* For compatibility with architectures that call do_fork directly rather than 
 * using the syscall entry points below. */  
long do_fork(unsigned long clone_flags,  
          unsigned long stack_start,  
          unsigned long stack_size,  
          int __user *parent_tidptr,  
          int __user *child_tidptr)  
{  
    return _do_fork(clone_flags, stack_start, stack_size,  
            parent_tidptr, child_tidptr, 0);  
}  
#endif  
  
/* 
 * Create a kernel thread. 
 *//* Creates a kernel thread; e.g. rest_init() (called from start_kernel) uses this to create PID 2, kthreadd. */  
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)  
{  
    return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,  
        (unsigned long)arg, NULL, NULL, 0);  
}  
/* Below are the entry points provided for userspace: fork/vfork. */  
#ifdef __ARCH_WANT_SYS_FORK  
SYSCALL_DEFINE0(fork)  
{  
#ifdef CONFIG_MMU  
    return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);  
#else  
    /* can not support in nommu mode */  
    return -EINVAL;  
#endif  
}  
#endif  
  
#ifdef __ARCH_WANT_SYS_VFORK  
SYSCALL_DEFINE0(vfork)  
{  
    return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,  
            0, NULL, NULL, 0);  
}  
#endif  
/* Below are the clone-related system calls. */
#ifdef __ARCH_WANT_SYS_CLONE  
#ifdef CONFIG_CLONE_BACKWARDS  
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,  
         int __user *, parent_tidptr,  
         unsigned long, tls,  
         int __user *, child_tidptr)  
#elif defined(CONFIG_CLONE_BACKWARDS2)  
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,  
         int __user *, parent_tidptr,  
         int __user *, child_tidptr,  
         unsigned long, tls)  
#elif defined(CONFIG_CLONE_BACKWARDS3)  
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,  
        int, stack_size,  
        int __user *, parent_tidptr,  
        int __user *, child_tidptr,  
        unsigned long, tls)  
#else  
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,  
         int __user *, parent_tidptr,  
         int __user *, child_tidptr,  
         unsigned long, tls)  
#endif  
{  
    return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);  
}  
#endif  
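
As a side note, here is a minimal userspace sketch (a hypothetical test program, not from the kernel tree) showing that both fork() and a raw clone() enter through the syscall stubs above and therefore converge on _do_fork(). The raw clone argument order assumes the default (non-CLONE_BACKWARDS) layout shown in the #else branch:

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <signal.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <sys/wait.h>

int main(void)
{
    pid_t a = fork();                 /* enters SYSCALL_DEFINE0(fork) */
    if (a == 0)
        _exit(0);

    /* A raw clone with only SIGCHLD behaves like fork(): a NULL child
     * stack means the child runs on a copy-on-write copy of ours. */
    long b = syscall(SYS_clone, SIGCHLD, NULL, NULL, NULL, 0UL);
    if (b == 0)
        _exit(0);

    printf("fork child=%d, clone child=%ld\n", (int)a, b);
    while (wait(NULL) > 0)            /* reap both children */
        ;
    return 0;
}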

As shown, every entry point ultimately calls _do_fork(). Exactly how userspace traps into the kernel to create a process will be covered in detail later; here we only look at the scheduling-related parts.
Let's look at the _do_fork() source:

/* 
 *  Ok, this is the main fork-routine. 
 * 
 * It copies the process, and if successful kick-starts 
 * it and waits for it to finish using the VM if required. 
 */  
long _do_fork(unsigned long clone_flags,  
          unsigned long stack_start,  
          unsigned long stack_size,  
          int __user *parent_tidptr,  
          int __user *child_tidptr,  
          unsigned long tls)  
{  
    struct task_struct *p;  
    int trace = 0;  
    long nr;  
  
    /* 
     * Determine whether and which event to report to ptracer.  When 
     * called from kernel_thread or CLONE_UNTRACED is explicitly 
     * requested, no event is reported; otherwise, report if the event 
     * for the type of forking is enabled. 
     */  
    if (!(clone_flags & CLONE_UNTRACED)) {  
        if (clone_flags & CLONE_VFORK)  
            trace = PTRACE_EVENT_VFORK;  
        else if ((clone_flags & CSIGNAL) != SIGCHLD)  
            trace = PTRACE_EVENT_CLONE;  
        else  
            trace = PTRACE_EVENT_FORK;  
  
        if (likely(!ptrace_event_enabled(current, trace)))  
            trace = 0;  
    }  
    /* The key process-creation function: it fills in the new task's
    task_struct and calls sched_fork(), which sets up the new task's
    scheduling state, such as its weight and vruntime. */
    p = copy_process(clone_flags, stack_start, stack_size,  
             child_tidptr, NULL, trace, tls, NUMA_NO_NODE);  
    /* 
     * Do this prior waking up the new thread - the thread pointer 
     * might get invalid after that point, if the thread exits quickly. 
     */  
    if (!IS_ERR(p)) {  
        struct completion vfork;  
        struct pid *pid;  
  
        trace_sched_process_fork(current, p);  
  
        pid = get_task_pid(p, PIDTYPE_PID);  
        nr = pid_vnr(pid);  
  
        if (clone_flags & CLONE_PARENT_SETTID)  
            put_user(nr, parent_tidptr);  
  
        if (clone_flags & CLONE_VFORK) {  
            p->vfork_done = &vfork;  
            init_completion(&vfork);  
            get_task_struct(p);  
        }  
        /* Put the new task on the appropriate rq and let it be scheduled. */
        wake_up_new_task(p);  
  
        /* forking complete and child started to run, tell ptracer */  
        if (unlikely(trace))  
            ptrace_event_pid(trace, pid);  
  
        if (clone_flags & CLONE_VFORK) {  
            if (!wait_for_vfork_done(p, &vfork))  
                ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);  
        }  
  
        put_pid(pid);  
    } else {  
        nr = PTR_ERR(p);  
    }  
    return nr;  
}  
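
Putting the pieces together, the scheduler-relevant call flow of _do_fork() looks like this:

_do_fork()
 +- copy_process()
 |   +- sched_fork()
 |       +- __sched_fork()                  /* reset se, vruntime, WALT stats   */
 |       +- init_entity_runnable_average()  /* initial PELT load/util           */
 |       +- __set_task_cpu()                /* provisional cpu, cfs_rq, group   */
 |       +- sched_class->task_fork()        /* task_fork_fair() for CFS         */
 +- wake_up_new_task()
     +- select_task_rq()                    /* pick the final cpu, SD_BALANCE_FORK */
     +- activate_task() -> enqueue_task() -> enqueue_task_fair() -> enqueue_entity()
     +- check_preempt_curr()                /* may preempt the current task     */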

The two core functions, sched_fork() (called from copy_process()) and wake_up_new_task(), are analyzed below.

###1 Analysis of the sched_fork function

/* 
 * This creates a new process as a copy of the old one, 
 * but does not actually start it yet. 
 * 
 * It copies the registers, and all the appropriate 
 * parts of the process environment (as per the clone 
 * flags). The actual kick-off is left to the caller. 
 */  
static struct task_struct *copy_process(unsigned long clone_flags,  
                    unsigned long stack_start,  
                    unsigned long stack_size,  
                    int __user *child_tidptr,  
                    struct pid *pid,  
                    int trace,  
                    unsigned long tls,  
                    int node)  
{  
............  
        /* Perform scheduler related setup. Assign this task to a CPU. */  
    retval = sched_fork(clone_flags, p);  
............  
}  
/* 
 * fork()/clone()-time setup: 
 */ /* The concrete implementation of sched_fork: */ 
int sched_fork(unsigned long clone_flags, struct task_struct *p)  
{  
    unsigned long flags;
    /* Disable preemption and get the id of the cpu we are running on */  
    int cpu = get_cpu();  
    
    __sched_fork(clone_flags, p);  
    /* 
     * We mark the process as NEW here. This guarantees that 
     * nobody will actually run it, and a signal or other external 
     * event cannot wake it up and insert it on the runqueue either. 
     */
    /* Mark the task TASK_NEW. The state keeps changing as the task
    progresses, and the scheduler behaves differently depending on it. */  
    p->state = TASK_NEW;  
  
    /* 
     * Make sure we do not leak PI boosting priority to the child. 
     */  
    /* The child inherits the parent's normal_prio rather than prio, so a
    PI-boosted priority is not leaked to it. */
    p->prio = current->normal_prio;  
  
    /* 
     * Revert to default priority/policy on fork if requested. 
     */  /* If requested, reset the task's priority/weight and policy to defaults */
    if (unlikely(p->sched_reset_on_fork)) {  
        if (task_has_dl_policy(p) || task_has_rt_policy(p)) {  
            p->policy = SCHED_NORMAL;  
            p->static_prio = NICE_TO_PRIO(0);  
            p->rt_priority = 0;  
        } else if (PRIO_TO_NICE(p->static_prio) < 0)  
            p->static_prio = NICE_TO_PRIO(0);  
  
        p->prio = p->normal_prio = __normal_prio(p);  
        set_load_weight(p);  
  
        /* 
         * We don't need the reset flag anymore after the fork. It has 
         * fulfilled its duty: 
         */  
        p->sched_reset_on_fork = 0;  
    }  
    /* Pick the scheduling class based on the task's priority. */
    if (dl_prio(p->prio)) {  
        put_cpu();  
        return -EAGAIN;  
    } else if (rt_prio(p->prio)) {  
        p->sched_class = &rt_sched_class;  
    } else {  
        p->sched_class = &fair_sched_class;  
    }  
    /* Initialize the struct sched_avg embedded in the task's struct
    sched_entity: the initial load update time, load_sum, util_sum,
    util_avg and load_avg. These values are updated later by the PELT
    code, i.e. update_load_avg(). */
    init_entity_runnable_average(&p->se);  
  
    /* 
     * The child is not yet in the pid-hash so no cgroup attach races, 
     * and the cgroup is pinned to this child due to cgroup_fork() 
     * is ran before sched_fork(). 
     * 
     * Silence PROVE_RCU. 
     */  
    raw_spin_lock_irqsave(&p->pi_lock, flags);  
    /* 
     * We're setting the cpu for the first time, we don't migrate, 
     * so use __set_task_cpu(). 
     *//* Bind the task to a cpu, setting its cfs_rq, task_group, etc. */  
    __set_task_cpu(p, cpu);  
    /* Call the task_fork hook of the chosen scheduling class */
    if (p->sched_class->task_fork)  
        p->sched_class->task_fork(p);  
    raw_spin_unlock_irqrestore(&p->pi_lock, flags);  
  
#ifdef CONFIG_SCHED_INFO  
    if (likely(sched_info_on()))  
        memset(&p->sched_info, 0, sizeof(p->sched_info));  
#endif  
#if defined(CONFIG_SMP)  
    p->on_cpu = 0;  
#endif  
    /* Initialize the task's preempt counter */
    init_task_preempt_count(p);  
#ifdef CONFIG_SMP
    plist_node_init(&p->pushable_tasks, MAX_PRIO);  
    RB_CLEAR_NODE(&p->pushable_dl_tasks);  
#endif  
    /* Re-enable preemption */
    put_cpu();  
    return 0;  
}  
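
For reference, here is a sketch of the priority ranges that drive the class choice above, based on mainline constants (MAX_DL_PRIO == 0, MAX_RT_PRIO == 100; assumed unchanged in this tree). class_for_prio() is a hypothetical helper for illustration only:

static inline const char *class_for_prio(int prio)
{
    if (prio < 0)           /* dl_prio(): deadline task; sched_fork()
                               refuses these and returns -EAGAIN */
        return "dl";
    if (prio < 100)         /* rt_prio(): SCHED_FIFO / SCHED_RR */
        return "rt_sched_class";
    return "fair_sched_class";  /* prio 100..139: SCHED_NORMAL etc. */
}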

The key functions inside sched_fork() are analyzed below.
#####1.1 __sched_fork(clone_flags, p)

/* 
 * Perform scheduler related setup for a newly forked process p. 
 * p is forked by current. 
 * 
 * __sched_fork() is basic setup used by init_idle() too: 
 */  
static void __sched_fork(unsigned long clone_flags, struct task_struct *p)  
{   /* Clear on_rq: the task is not on any runqueue yet */
    p->on_rq         = 0;  
    /* Initialize the new task's scheduling-entity fields */
    p->se.on_rq          = 0;  
    p->se.exec_start     = 0;  
    p->se.sum_exec_runtime       = 0;  
    p->se.prev_sum_exec_runtime  = 0;  
    p->se.nr_migrations      = 0;
    /* The entity's vruntime; CFS keys its red-black tree on this value */  
    p->se.vruntime           = 0;
    /* Timestamp used by WALT to mark when the task went to sleep */  
#ifdef CONFIG_SCHED_WALT  
    p->last_sleep_ts     = 0;  
#endif  
    /* Initialize the entity's group list node */
    INIT_LIST_HEAD(&p->se.group_node);  
    /* WALT tracks a task's runnable time over a number of windows and uses
    it to scale cpu frequency. The function below initializes the demand and
    sum_history[8] fields in the new task's struct task_struct --> struct
    ravg: demand is the task's initial runnable time (its task load), and
    sum_history[8] holds the per-window history, each window being updated
    in turn. For the details of how the windows are updated see:
    https://blog.csdn.net/wukongmingjing/article/details/81633225 */
    walt_init_new_task_load(p);  
  
#ifdef CONFIG_FAIR_GROUP_SCHED  
    p->se.cfs_rq         = NULL;  
#endif  
  
#ifdef CONFIG_SCHEDSTATS  
    memset(&p->se.statistics, 0, sizeof(p->se.statistics));  
#endif  
  
    RB_CLEAR_NODE(&p->dl.rb_node);  
    init_dl_task_timer(&p->dl);  
    __dl_clear_params(p);  
  
    INIT_LIST_HEAD(&p->rt.run_list);  
    /* Initialize the preempt notifier list */
#ifdef CONFIG_PREEMPT_NOTIFIERS  
    INIT_HLIST_HEAD(&p->preempt_notifiers);  
#endif  
  
#ifdef CONFIG_NUMA_BALANCING  
    if (p->mm && atomic_read(&p->mm->mm_users) == 1) {  
        p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);  
        p->mm->numa_scan_seq = 0;  
    }  
  
    if (clone_flags & CLONE_VM)  
        p->numa_preferred_nid = current->numa_preferred_nid;  
    else  
        p->numa_preferred_nid = -1;  
  
    p->node_stamp = 0ULL;  
    p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;  
    p->numa_scan_period = sysctl_numa_balancing_scan_delay;  
    p->numa_work.next = &p->numa_work;  
    p->numa_faults = NULL;  
    p->last_task_numa_placement = 0;  
    p->last_sum_exec_runtime = 0;  
  
    p->numa_group = NULL;  
#endif /* CONFIG_NUMA_BALANCING */  
}
 
void walt_init_new_task_load(struct task_struct *p)  
{  
    int i;  
    u32 init_load_windows =  
            div64_u64((u64)sysctl_sched_walt_init_task_load_pct *  
                          (u64)walt_ravg_window, 100);  
    u32 init_load_pct = current->init_load_pct;  
  
    p->init_load_pct = 0;  
    memset(&p->ravg, 0, sizeof(struct ravg));  
  
    if (init_load_pct) {  
        init_load_windows = div64_u64((u64)init_load_pct *  
              (u64)walt_ravg_window, 100);  
    }  
  
    p->ravg.demand = init_load_windows;  
    for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)  
        p->ravg.sum_history[i] = init_load_windows;  
}  
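
A quick worked example of the initialization above (the concrete numbers are assumptions for illustration): with a window size walt_ravg_window of 20 ms and sysctl_sched_walt_init_task_load_pct = 15, a new task starts with demand = 20000000 * 15 / 100 = 3000000 ns, and every slot of sum_history[] is pre-filled with that same value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Assumed values: 20 ms window, 15% initial task load. */
    uint64_t walt_ravg_window = 20000000ULL;   /* ns */
    uint64_t init_task_load_pct = 15;

    uint64_t demand = walt_ravg_window * init_task_load_pct / 100;
    printf("initial demand = %llu ns\n", (unsigned long long)demand);
    /* walt_init_new_task_load() then copies this value into all
     * RAVG_HIST_SIZE_MAX history slots. */
    return 0;
}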

#####1.2 set_load_weight(p)

static void set_load_weight(struct task_struct *p)  
{   /* Convert static_prio into an index into the weight tables (0..39) */
    int prio = p->static_prio - MAX_RT_PRIO;  
    struct load_weight *load = &p->se.load;  
  
    /* 
     * SCHED_IDLE tasks get minimal weight: 
     *//* SCHED_IDLE tasks get the minimal weight */  
    if (idle_policy(p->policy)) {  
        load->weight = scale_load(WEIGHT_IDLEPRIO);  
        load->inv_weight = WMULT_IDLEPRIO;  
        return;  
    }  
    /* Weight for a normal task's priority */
    load->weight = scale_load(prio_to_weight[prio]); 
    /* The inverse of the weight, i.e. 2^32/weight */ 
    load->inv_weight = prio_to_wmult[prio]; 
    /* Both values are simple table lookups */ 
}
/* 
 * Nice levels are multiplicative, with a gentle 10% change for every 
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to 
 * nice 1, it will get ~10% less CPU time than another CPU-bound task 
 * that remained on nice 0. 
 * 
 * The "10% effect" is relative and cumulative: from _any_ nice level, 
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level 
 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. 
 * If a task goes up by ~10% and another task goes down by ~10% then 
 * the relative distance between them is ~25%.) 
 */  
static const int prio_to_weight[40] = {  
 /* -20 */     88761,     71755,     56483,     46273,     36291,  
 /* -15 */     29154,     23254,     18705,     14949,     11916,  
 /* -10 */      9548,      7620,      6100,      4904,      3906,  
 /*  -5 */      3121,      2501,      1991,      1586,      1277,  
 /*   0 */      1024,       820,       655,       526,       423,  
 /*   5 */       335,       272,       215,       172,       137,  
 /*  10 */       110,        87,        70,        56,        45,  
 /*  15 */        36,        29,        23,        18,        15,  
};  
  
/* 
 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 
 * 
 * In cases where the weight does not change often, we can use the 
 * precalculated inverse to speed up arithmetics by turning divisions 
 * into multiplications: 
 *//* 2^32/weight: 2^32 = 4294967296, and 2^32/NICE_0_LOAD = 2^32/1024 =
   4194304, which matches the nice-0 entry below, as expected. */  
static const u32 prio_to_wmult[40] = {  
 /* -20 */     48388,     59856,     76040,     92818,    118348,  
 /* -15 */    147320,    184698,    229616,    287308,    360437,  
 /* -10 */    449829,    563644,    704093,    875809,   1099582,  
 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,  
 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,  
 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,  
 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,  
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,  
}; 

The weight calculation itself is simple: both values are direct table lookups indexed by priority.
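The table invariants are easy to check: inv_weight is 2^32/weight rounded, and consecutive nice levels differ by a factor of about 1.25, which is where the "10% effect" in the comment above comes from. A small sanity check:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* nice 0 and nice 1 entries from prio_to_weight[] above */
    uint32_t w0 = 1024, w1 = 820;

    /* inv_weight == 2^32 / weight, precomputed so that the hot path can
     * replace divisions with multiplications */
    printf("2^32/1024 = %llu (table says 4194304)\n",
           (unsigned long long)((1ULL << 32) / w0));

    /* adjacent nice levels differ by ~1.25x: 1024/820 ~= 1.249 */
    printf("nice0/nice1 weight ratio = %.3f\n", (double)w0 / w1);
    return 0;
}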

#####1.3 init_entity_runnable_average(&p->se)

/* Give new sched_entity start runnable values to heavy its load in infant time */  
void init_entity_runnable_average(struct sched_entity *se)  
{  
    /* Get the new entity's sched_avg structure, which holds the data used
    to compute the entity's util and load, so it can be initialized */
    struct sched_avg *sa = &se->avg;  
    /* Initialize the load update timestamp */
    sa->last_update_time = 0;  
    /* 
     * sched_avg's period_contrib should be strictly less than 1024, so 
     * we give it 1023 to make sure it is almost a period (1024us), and 
     * will definitely be updated (after enqueue). 
     */  
    sa->period_contrib = 1023;  
    /* 
     * Tasks are initialized with full load to be seen as heavy tasks until 
     * they get a chance to stabilize to their real load level. 
     * Group entities are initialized with zero load to reflect the fact that 
     * nothing has been attached to the task group yet. 
     */  
    if (entity_is_task(se))  
        sa->load_avg = scale_load_down(se->load.weight);  
    sa->load_sum = sa->load_avg * LOAD_AVG_MAX;  
    /* 
     * In previous Android versions, we used to have: 
     *  sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); 
     *  sa->util_sum = sa->util_avg * LOAD_AVG_MAX; 
     * However, that functionality has been moved to enqueue. 
     * It is unclear if we should restore this in enqueue. 
     */  
    /* 
     * At this point, util_avg won't be used in select_task_rq_fair anyway 
     */  
    sa->util_avg = 0;  
    sa->util_sum = 0;  
    /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */  
}  
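
Numerically (assuming a nice-0 task with weight 1024 and the v4.x kernel value LOAD_AVG_MAX == 47742), a new task entity therefore starts out looking fully loaded:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Assumptions: nice-0 weight 1024, LOAD_AVG_MAX == 47742 (v4.x). */
    uint64_t load_avg = 1024;                 /* scale_load_down(se->load.weight) */
    uint64_t load_sum = load_avg * 47742ULL;  /* a fully "saturated" history */
    printf("initial load_avg=%llu load_sum=%llu\n",
           (unsigned long long)load_avg, (unsigned long long)load_sum);
    /* util_avg/util_sum stay 0 here; they get their initial value later
     * from post_init_entity_util_avg(), called in wake_up_new_task(). */
    return 0;
}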

#####1.4 __set_task_cpu(p, cpu)

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */  
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)  
{  
#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)  
    struct task_group *tg = task_group(p);  
#endif  
  
#ifdef CONFIG_FAIR_GROUP_SCHED  
    set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);  
    p->se.cfs_rq = tg->cfs_rq[cpu];  
    p->se.parent = tg->se[cpu];  
#endif  
  
#ifdef CONFIG_RT_GROUP_SCHED  
    p->rt.rt_rq  = tg->rt_rq[cpu];  
    p->rt.parent = tg->rt_se[cpu];  
#endif  
}  
  
#else /* CONFIG_CGROUP_SCHED */  
  
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }  
static inline struct task_group *task_group(struct task_struct *p)  
{  
    return NULL;  
}  
  
#endif /* CONFIG_CGROUP_SCHED */  
  
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)  
{   /* Attach the task to its task group's structures for this cpu, i.e.
    the group's per-cpu cfs_rq and parent se */
    set_task_rq(p, cpu);  
#ifdef CONFIG_SMP  
    /* 
     * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 
     * successfully executed on another CPU. We must ensure that updates of 
     * per-task data have been completed by this moment. 
     */  
    smp_wmb();  
  /* Record the given cpu as the task's cpu */
#ifdef CONFIG_THREAD_INFO_IN_TASK  
    p->cpu = cpu;  
#else  
    task_thread_info(p)->cpu = cpu;  
#endif  
    p->wake_cpu = cpu;  
#endif  
}  

#####1.5 task_fork_fair (the most important piece of code here)

/*  
 * called on fork with the child task as argument from the parent's context  
 *  - child not yet on the tasklist  
 *  - preemption disabled  
 */    
static void task_fork_fair(struct task_struct *p)    
{    
    struct cfs_rq *cfs_rq;    
    struct sched_entity *se = &p->se, *curr;    
    struct rq *rq = this_rq();    
    
    raw_spin_lock(&rq->lock);
    /* Update the rq clock */    
    update_rq_clock(rq);    
    /* Get the current task's cfs_rq */
    cfs_rq = task_cfs_rq(current);    
    /* Get the cfs_rq's currently running entity */
    curr = cfs_rq->curr;
    /* If there is a current entity, start the child from the parent's
    vruntime */    
    if (curr) {
        /* Bring the current task's vruntime up to date, weighted by its
        load, before copying it */    
        update_curr(cfs_rq);    
        se->vruntime = curr->vruntime;    
    }
    /* Adjust the new task's vruntime (apply the fork placement penalty) */    
    place_entity(cfs_rq, se, 1);    
    /* If sched_child_runs_first is set and the parent's vruntime is still
    smaller than the child's, swap the two so the child runs first, and
    mark the current task for rescheduling; the flag is checked on
    interrupt/exception return */
    if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {    
        /*  
         * Upon rescheduling, sched_class::put_prev_task() will place  
         * 'current' within the tree based on its new key value.  
         */    
        swap(curr->vruntime, se->vruntime);    
        resched_curr(rq);    
    }    
    /* Subtract this cfs_rq's min_vruntime from the child's vruntime. We do
    not yet know which cpu the child will finally run on; once that is
    decided, the destination cpu's cfs_rq min_vruntime is added back (in
    enqueue_entity). Every entity's vruntime is an offset accumulated on
    top of its cfs_rq's min_vruntime, which makes this trick work neatly. */
    se->vruntime -= cfs_rq->min_vruntime;    
    raw_spin_unlock(&rq->lock);    
}  
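
The se->vruntime -= cfs_rq->min_vruntime line deserves a concrete illustration (the numbers below are made up): the child's vruntime is stored as a cpu-independent offset at fork time, and enqueue_entity() re-bases it on whatever cfs_rq the child finally lands on:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* fork happens on CPU0; the child is finally enqueued on CPU1 */
    uint64_t min_vruntime_cpu0 = 1000000, min_vruntime_cpu1 = 4000000;
    uint64_t child = min_vruntime_cpu0 + 3000;  /* placed near the parent */

    child -= min_vruntime_cpu0;   /* task_fork_fair(): make it relative  */
    child += min_vruntime_cpu1;   /* enqueue_entity(): re-base on CPU1   */

    /* the child keeps its small 3000 ns offset on the new queue */
    printf("child vruntime on CPU1 = %llu\n", (unsigned long long)child);
    return 0;
}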

That completes the analysis of sched_fork().

###2 Analysis of wake_up_new_task

/* 
 * wake_up_new_task - wake up a newly created task for the first time. 
 * 
 * This function will do some initial scheduler statistics housekeeping 
 * that must be done for every newly created context, then puts the task 
 * on the runqueue and wakes it. 
 */  
void wake_up_new_task(struct task_struct *p)  
{  
    unsigned long flags;  
    struct rq *rq;  
  
    raw_spin_lock_irqsave(&p->pi_lock, flags); 
    /* OK, the new task is now marked TASK_RUNNING and can be picked by the scheduler */ 
    p->state = TASK_RUNNING;  
    /* Re-initialize the struct ravg members embedded in task_struct */
    walt_init_new_task_load(p);  
    /* Re-initialize the new entity's load/util */
    /* Initialize new task's runnable average */  
    init_entity_runnable_average(&p->se);  
#ifdef CONFIG_SMP  
    /* 
     * Fork balancing, do it here and not earlier because: 
     *  - cpus_allowed can change in the fork path 
     *  - any previously selected cpu might disappear through hotplug 
     * 
     * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, 
     * as we're not fully set-up yet. 
     *//* Pick a suitable cpu for the task, passing the SD_BALANCE_FORK
     balance flag: at fork/clone time the new task is balanced onto an
     appropriate cpu based on the current system state. A core function. */  
    __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));  
#endif  
    /* Lock and get the task's rq */
    rq = __task_rq_lock(p);  
    /* Update the rq clock */
    update_rq_clock(rq);  
    /* Give the new entity a sensible initial util; leaving it at 0 would make the whole rq's util look too small */
    post_init_entity_util_avg(&p->se);  
    /* Record the start of the new task's WALT window accounting, i.e. set
    struct task_struct --> struct ravg member mark_start to the current
    time. Explained in detail in the WALT article. */
    walt_mark_task_starting(p);
    /* Enqueue the new task: a core function */  
    activate_task(rq, p, ENQUEUE_WAKEUP_NEW);  
    /* The new task is now on the rq and runnable */
    p->on_rq = TASK_ON_RQ_QUEUED;  
    trace_sched_wakeup_new(p);  
    /* Check whether the new task should preempt the current one */
    check_preempt_curr(rq, p, WF_FORK);  
#ifdef CONFIG_SMP  
    if (p->sched_class->task_woken) {  
        /* 
         * Nothing relies on rq->lock after this, so its fine to 
         * drop it. 
         */  
        lockdep_unpin_lock(&rq->lock);  
        p->sched_class->task_woken(rq, p);  
        lockdep_pin_lock(&rq->lock);  
    }  
#endif  
    task_rq_unlock(rq, p, &flags);  
}  

#####2.1 The call path of the core function activate_task

void activate_task(struct rq *rq, struct task_struct *p, int flags)  
{   /* Check the task's state: if it was counting towards uninterruptible
    load, decrement the rq's nr_uninterruptible */
    if (task_contributes_to_load(p))  
        rq->nr_uninterruptible--;  
    /* The core enqueue function */
    enqueue_task(rq, p, flags);  
} 
 
#define task_contributes_to_load(task)  \  
                ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \  
                 (task->flags & PF_FROZEN) == 0 && \  
                 (task->state & TASK_NOLOAD) == 0)  
 
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)  
{  
    update_rq_clock(rq);  
    if (!(flags & ENQUEUE_RESTORE))  
        sched_info_queued(rq, p);  
#ifdef CONFIG_INTEL_DWS  
    if (sched_feat(INTEL_DWS))  
        update_rq_runnable_task_avg(rq);  
#endif  
    /* Call the enqueue_task method of the task's scheduling class */
    p->sched_class->enqueue_task(rq, p, flags);  
}  
  
/* 
 * The enqueue_task method is called before nr_running is 
 * increased. Here we update the fair scheduling stats and 
 * then put the task into the rbtree: 
 */  /* The CFS enqueue function */
static void  
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)  
{  
    struct cfs_rq *cfs_rq;  
    struct sched_entity *se = &p->se;  
#ifdef CONFIG_SMP  
    int task_new = flags & ENQUEUE_WAKEUP_NEW;  
#endif  
    /* Add the new task's p->ravg.demand to the rq's cumulative runnable
    time (WALT) */
    walt_inc_cumulative_runnable_avg(rq, p);  
  
    /* 
     * Update SchedTune accounting. 
     * 
     * We do it before updating the CPU capacity to ensure the 
     * boost value of the current task is accounted for in the 
     * selection of the OPP. 
     * 
     * We do it also in the case where we enqueue a throttled task; 
     * we could argue that a throttled task should not boost a CPU, 
     * however: 
     * a) properly implementing CPU boosting considering throttled 
     *    tasks will increase a lot the complexity of the solution 
     * b) it's not easy to quantify the benefits introduced by 
     *    such a more complex solution. 
     * Thus, for the time being we go for the simple solution and boost 
     * also for throttled RQs. 
     */  
    schedtune_enqueue_task(p, cpu_of(rq));  
  
    /* 
     * If in_iowait is set, the code below may not trigger any cpufreq 
     * utilization updates, so do it here explicitly with the IOWAIT flag 
     * passed. 
     *//* If the task is in iowait, kick a cpufreq update with the IOWAIT flag so the iowait boost frequency can be applied */  
    if (p->in_iowait)  
        cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);  
  /* This is a walk up the hierarchy: the task may belong to a task group,
     so when it is enqueued on its group's queue we must iterate upwards
     through the parent entities (see the for_each_sched_entity sketch
     after this function) */
    for_each_sched_entity(se) { 
        /* A newly created task has on_rq == 0; it only becomes TASK_ON_RQ_QUEUED after enqueueing */
        if (se->on_rq)  
            break;  
       /* Without CONFIG_FAIR_GROUP_SCHED this is the cfs_rq of the cpu's
       rq; with CONFIG_FAIR_GROUP_SCHED it is the group's own cfs_rq */
        cfs_rq = cfs_rq_of(se);  
        walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
        /* The core enqueue function */  
        enqueue_entity(cfs_rq, se, flags);  
  
        /* 
         * end evaluation on encountering a throttled cfs_rq 
         * 
         * note: in the case of encountering a throttled cfs_rq we will 
         * post the final h_nr_running increment below. 
         *//* Stop the walk when a throttled cfs_rq is encountered */  
        if (cfs_rq_throttled(cfs_rq))  
            break;  
        cfs_rq->h_nr_running++;  
        /* Enqueue the remaining (ancestor) levels as a plain ENQUEUE_WAKEUP */
        flags = ENQUEUE_WAKEUP;  
    }  
    /* This loop runs only if the first one broke out early (se != NULL),
    i.e. an ancestor entity was already queued or a throttled cfs_rq was hit */
    for_each_sched_entity(se) {  
        cfs_rq = cfs_rq_of(se);  
        cfs_rq->h_nr_running++;  
        walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);  
  
        if (cfs_rq_throttled(cfs_rq))  
            break;  
  
        update_load_avg(se, UPDATE_TG);  
        update_cfs_shares(se);  
    }  
    /* If we walked the whole hierarchy (se == NULL), bump the rq's nr_running */
    if (!se)  
        add_nr_running(rq, 1);  
  
#ifdef CONFIG_SMP  
    if (!se) {  
        struct sched_domain *sd;  
  
        rcu_read_lock();  
        sd = rcu_dereference(rq->sd);  
        if (!task_new && sd) {  
            if (cpu_overutilized(rq->cpu))  
                set_sd_overutilized(sd);  
            if (rq->misfit_task && sd->parent)  
                set_sd_overutilized(sd->parent);  
        }  
        rcu_read_unlock();  
    }  
  
#endif /* CONFIG_SMP */  
    hrtick_update(rq);  
}  
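
The two loops above are built on for_each_sched_entity(), which walks from the task's own entity up through its group hierarchy; without group scheduling it degenerates to a single iteration. The mainline definitions (assumed identical in this tree):

#ifdef CONFIG_FAIR_GROUP_SCHED
/* walk upwards: task se -> its group's se -> ... -> root */
#define for_each_sched_entity(se) \
                for (; se; se = se->parent)
#else
/* no group scheduling: the loop body runs exactly once */
#define for_each_sched_entity(se) \
                for (; se; se = NULL)
#endif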

#####2.2 The core function enqueue_entity

static void  
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)  
{  
    /* 
     * Update the normalized vruntime before updating min_vruntime 
     * through calling update_curr(). 
     */ 
    /* task_fork_fair() subtracted the source cfs_rq's min_vruntime from
    the new task's vruntime. A new task is enqueued with flags =
    ENQUEUE_WAKEUP_NEW = 0x20 (ENQUEUE_WAKEUP = 0x01, ENQUEUE_WAKING = 0x04),
    so !(0x20 & 0x01) || (0x20 & 0x04) evaluates true and the destination
    cfs_rq's min_vruntime is added back here. */ 
    if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))  
        se->vruntime += cfs_rq->min_vruntime;  
  
    /* 
     * Update run-time statistics of the 'current'. 
     */ 
    /* Update the current entity's vruntime and the related scheduling statistics */ 
    update_curr(cfs_rq);  
    /* Update (decay) the entity's util/load according to the PELT algorithm */
    update_load_avg(se, UPDATE_TG);  
    /* Update the cfs_rq's runnable_load_sum/avg, accumulate the entity's
    struct sched_avg contributions into the cfs_rq's struct sched_avg, and
    trigger a frequency adjustment. */
    enqueue_entity_load_avg(cfs_rq, se);  
    update_cfs_shares(se);  
    account_entity_enqueue(cfs_rq, se);  
    /* A newly created task arrives with ENQUEUE_WAKEUP_NEW rather than
    ENQUEUE_WAKEUP, so this branch is skipped for it */
    if (flags & ENQUEUE_WAKEUP) {  
        place_entity(cfs_rq, se, 0);  
        enqueue_sleeper(cfs_rq, se);  
    }  
    /* Update enqueue-related state and statistics */
    update_stats_enqueue(cfs_rq, se);  
    check_spread(cfs_rq, se);
    /* If this entity is not the cfs_rq's currently running entity, insert
    it into the rb-tree, positioned by its vruntime */  
    if (se != cfs_rq->curr)  
        __enqueue_entity(cfs_rq, se);
    /* The entity is now on the runqueue */  
    se->on_rq = 1;  
    
    if (cfs_rq->nr_running == 1) {  
        list_add_leaf_cfs_rq(cfs_rq);  
        check_enqueue_throttle(cfs_rq);  
    }  
}  

This completes the walkthrough of how a newly created task gets scheduled. The next chapter explains how the idle task is scheduled after being woken up; later articles will cover the PELT algorithm and the wake_up_process flow.
The most important function not analyzed here is select_task_rq(), which involves load balancing; it will be analyzed in detail in the follow-up articles.
