以下是调度器用于判断优先级最高进程的主要代码:
- struct task_struct *prev, *next;
- struct list_head *queue;
- struct prio_array *array;
- int indx;
- prev = current;
- array = rq->active;
- indx = sched_find_first_bit(array->bitmap);
- queue = array->queue + indx;
- next = list_entry(queue->next,struct task_struct,run_list);
我们先看看上面提到的几个数据结构和函数。
-
struct task_struct
内核把进程存放在叫做任务队列的双向循环链表中。链表中的每一项都是类型为task_struct(称为进程描述符)的结构。该结构在<linux/sched.h>中定义。该结构中,包含了两个结构成员struct list_head run_list和struct prio_array *array。
-
-
- struct task_struct {
- volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
- struct thread_info *thread_info;
- atomic_t usage;
- unsigned long flags; /* per process flags, defined below */
- unsigned long ptrace;
- int lock_depth; /* BKL lock depth */
- #ifdef CONFIG_SMP
- #ifdef __ARCH_WANT_UNLOCKED_CTXSW
- int oncpu;
- #endif
- #endif
- int load_weight; /* for niceness load balancing purposes */
- int prio, static_prio, normal_prio;
- struct list_head run_list;
- struct prio_array *array;
- unsigned short ioprio;
- #ifdef CONFIG_BLK_DEV_IO_TRACE
- unsigned int btrace_seq;
- #endif
- unsigned long sleep_avg;
- unsigned long long timestamp, last_ran;
- unsigned long long sched_time; /* sched_clock time spent running */
- enum sleep_type sleep_type;
- unsigned long policy;
- cpumask_t cpus_allowed;
- unsigned int time_slice, first_time_slice;
- #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
- struct sched_info sched_info;
- #endif
- struct list_head tasks;
- /*
- * ptrace_list/ptrace_children forms the list of my children
- * that were stolen by a ptracer.
- */
- struct list_head ptrace_children;
- struct list_head ptrace_list;
- struct mm_struct *mm, *active_mm;
- /* task state */
- struct linux_binfmt *binfmt;
- long exit_state;
- int exit_code, exit_signal;
- int pdeath_signal; /* The signal sent when the parent dies */
- /* ??? */
- unsigned long personality;
- unsigned did_exec:1;
- pid_t pid;
- pid_t tgid;
- #ifdef CONFIG_CC_STACKPROTECTOR
- /* Canary value for the -fstack-protector gcc feature */
- unsigned long stack_canary;
- #endif
- /*
- * pointers to (original) parent process, youngest child, younger sibling,
- * older sibling, respectively. (p->father can be replaced with
- * p->parent->pid)
- */
- struct task_struct *real_parent; /* real parent process (when being debugged) */
- struct task_struct *parent; /* parent process */
- /*
- * children/sibling forms the list of my children plus the
- * tasks I'm ptracing.
- */
- struct list_head children; /* list of my children */
- struct list_head sibling; /* linkage in my parent's children list */
- struct task_struct *group_leader; /* threadgroup leader */
- /* PID/PID hash table linkage. */
- struct pid_link pids[PIDTYPE_MAX];
- struct list_head thread_group;
- struct completion *vfork_done; /* for vfork() */
- int __user *set_child_tid; /* CLONE_CHILD_SETTID */
- int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
- unsigned long rt_priority;
- cputime_t utime, stime;
- unsigned long nvcsw, nivcsw; /* context switch counts */
- struct timespec start_time;
- /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
- unsigned long min_flt, maj_flt;
- cputime_t it_prof_expires, it_virt_expires;
- unsigned long long it_sched_expires;
- struct list_head cpu_timers[3];
- /* process credentials */
- uid_t uid,euid,suid,fsuid;
- gid_t gid,egid,sgid,fsgid;
- struct group_info *group_info;
- kernel_cap_t cap_effective, cap_inheritable, cap_permitted;
- unsigned keep_capabilities:1;
- struct user_struct *user;
- #ifdef CONFIG_KEYS
- struct key *request_key_auth; /* assumed request_key authority */
- struct key *thread_keyring; /* keyring private to this thread */
- unsigned char jit_keyring; /* default keyring to attach requested keys to */
- #endif
- /*
- * fpu_counter contains the number of consecutive context switches
- * that the FPU is used. If this is over a threshold, the lazy fpu
- * saving becomes unlazy to save the trap. This is an unsigned char
- * so that after 256 times the counter wraps and the behavior turns
- * lazy again; this to deal with bursty apps that only use FPU for
- * a short time
- */
- unsigned char fpu_counter;
- int oomkilladj; /* OOM kill score adjustment (bit shift). */
- char comm[TASK_COMM_LEN]; /* executable name excluding path
- - access with [gs]et_task_comm (which lock
- it with task_lock())
- - initialized normally by flush_old_exec */
- /* file system info */
- int link_count, total_link_count;
- #ifdef CONFIG_SYSVIPC
- /* ipc stuff */
- struct sysv_sem sysvsem;
- #endif
- /* CPU-specific state of this task */
- struct thread_struct thread;
- /* filesystem information */
- struct fs_struct *fs;
- /* open file information */
- struct files_struct *files;
- /* namespaces */
- struct nsproxy *nsproxy;
- /* signal handlers */
- struct signal_struct *signal;
- struct sighand_struct *sighand;
- sigset_t blocked, real_blocked;
- sigset_t saved_sigmask; /* To be restored with TIF_RESTORE_SIGMASK */
- struct sigpending pending;
- unsigned long sas_ss_sp;
- size_t sas_ss_size;
- int (*notifier)(void *priv);
- void *notifier_data;
- sigset_t *notifier_mask;
- void *security;
- struct audit_context *audit_context;
- seccomp_t seccomp;
- /* Thread group tracking */
- u32 parent_exec_id;
- u32 self_exec_id;
- /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
- spinlock_t alloc_lock;
- /* Protection of the PI data structures: */
- spinlock_t pi_lock;
- #ifdef CONFIG_RT_MUTEXES
- /* PI waiters blocked on a rt_mutex held by this task */
- struct plist_head pi_waiters;
- /* Deadlock detection and priority inheritance handling */
- struct rt_mutex_waiter *pi_blocked_on;
- #endif
- #ifdef CONFIG_DEBUG_MUTEXES
- /* mutex deadlock detection */
- struct mutex_waiter *blocked_on;
- #endif
- #ifdef CONFIG_TRACE_IRQFLAGS
- unsigned int irq_events;
- int hardirqs_enabled;
- unsigned long hardirq_enable_ip;
- unsigned int hardirq_enable_event;
- unsigned long hardirq_disable_ip;
- unsigned int hardirq_disable_event;
- int softirqs_enabled;
- unsigned long softirq_disable_ip;
- unsigned int softirq_disable_event;
- unsigned long softirq_enable_ip;
- unsigned int softirq_enable_event;
- int hardirq_context;
- int softirq_context;
- #endif
- #ifdef CONFIG_LOCKDEP
- # define MAX_LOCK_DEPTH 30UL
- u64 curr_chain_key;
- int lockdep_depth;
- struct held_lock held_locks[MAX_LOCK_DEPTH];
- unsigned int lockdep_recursion;
- #endif
- /* journalling filesystem info */
- void *journal_info;
- /* VM state */
- struct reclaim_state *reclaim_state;
- struct backing_dev_info *backing_dev_info;
- struct io_context *io_context;
- unsigned long ptrace_message;
- siginfo_t *last_siginfo; /* For ptrace use. */
- /*
- * current io wait handle: wait queue entry to use for io waits
- * If this thread is processing aio, this points at the waitqueue
- * inside the currently handled kiocb. It may be NULL (i.e. default
- * to a stack based synchronous wait) if its doing sync IO.
- */
- wait_queue_t *io_wait;
- #ifdef CONFIG_TASK_XACCT
- /* i/o counters(bytes read/written, #syscalls */
- u64 rchar, wchar, syscr, syscw;
- #endif
- struct task_io_accounting ioac;
- #if defined(CONFIG_TASK_XACCT)
- u64 acct_rss_mem1; /* accumulated rss usage */
- u64 acct_vm_mem1; /* accumulated virtual memory usage */
- cputime_t acct_stimexpd;/* stime since last update */
- #endif
- #ifdef CONFIG_NUMA
- struct mempolicy *mempolicy;
- short il_next;
- #endif
- #ifdef CONFIG_CPUSETS
- struct cpuset *cpuset;
- nodemask_t mems_allowed;
- int cpuset_mems_generation;
- int cpuset_mem_spread_rotor;
- #endif
- struct robust_list_head __user *robust_list;
- #ifdef CONFIG_COMPAT
- struct compat_robust_list_head __user *compat_robust_list;
- #endif
- struct list_head pi_state_list;
- struct futex_pi_state *pi_state_cache;
- atomic_t fs_excl; /* holding fs exclusive resources */
- struct rcu_head rcu;
- /*
- * cache last used pipe for splice
- */
- struct pipe_inode_info *splice_pipe;
- #ifdef CONFIG_TASK_DELAY_ACCT
- struct task_delay_info *delays;
- #endif
- #ifdef CONFIG_FAULT_INJECTION
- int make_it_fail;
- #endif
- };
-
struct list_head
-
在Linux中list无处不在,如果定义一个结构使之成为链表,只需添加struct list_head结构作为它的一个成员就可以了。该结构定义在include/linux/list.h中。
-
- typedef struct list_head {
- struct list_head *next, *prev; //双向链表
- } list_t;
- #define LIST_HEAD_INIT(name) { &(name), &(name) }
- #define LIST_HEAD(name) \
- struct list_head name = LIST_HEAD_INIT(name) //定义一个空的链表
- #define INIT_LIST_HEAD(ptr) do { /* 初始化一个已定义的链表 */ \
- (ptr)->next = (ptr); (ptr)->prev = (ptr); \
- } while (0)
-
struct rq
-
- /*
- * This is the main, per-CPU runqueue data structure.
- *
- * Locking rule: those places that want to lock multiple runqueues
- * (such as the load balancing or the thread migration code), lock
- * acquire operations must be ordered by ascending &runqueue.
- */
- struct rq {
- spinlock_t lock;
- /*
- * nr_running and cpu_load should be in the same cacheline because
- * remote CPUs use both these fields when doing load calculation.
- */
- unsigned long nr_running;
- unsigned long raw_weighted_load;
- #ifdef CONFIG_SMP
- unsigned long cpu_load[3];
- #endif
- unsigned long long nr_switches;
- /*
- * This is part of a global counter where only the total sum
- * over all CPUs matters. A task can increase this counter on
- * one CPU and if it got migrated afterwards it may decrease
- * it on another CPU. Always updated under the runqueue lock:
- */
- unsigned long nr_uninterruptible;
- unsigned long expired_timestamp;
- /* Cached timestamp set by update_cpu_clock() */
- unsigned long long most_recent_timestamp;
- struct task_struct *curr, *idle;
- unsigned long next_balance;
- struct mm_struct *prev_mm;
- struct prio_array *active, *expired, arrays[2];
- int best_expired_prio;
- atomic_t nr_iowait;
- #ifdef CONFIG_SMP
- struct sched_domain *sd;
- /* For active balancing */
- int active_balance;
- int push_cpu;
- int cpu; /* cpu of this runqueue */
- struct task_struct *migration_thread;
- struct list_head migration_queue;
- #endif
- #ifdef CONFIG_SCHEDSTATS
- /* latency stats */
- struct sched_info rq_sched_info;
- /* sys_sched_yield() stats */
- unsigned long yld_exp_empty;
- unsigned long yld_act_empty;
- unsigned long yld_both_empty;
- unsigned long yld_cnt;
- /* schedule() stats */
- unsigned long sched_switch;
- unsigned long sched_cnt;
- unsigned long sched_goidle;
- /* try_to_wake_up() stats */
- unsigned long ttwu_cnt;
- unsigned long ttwu_local;
- #endif
- struct lock_class_key rq_lock_key;
- };
-
struct prio_array
-
-
- /*
- * These are the runqueue data structures:
- */
- struct prio_array {
- unsigned int nr_active;
- DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
- struct list_head queue[MAX_PRIO];
- };
- #define DECLARE_BITMAP(name,bits) \
unsigned long name[BITS_TO_LONGS(bits)]
#define BITS_TO_LONGS(bits) \
(((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
-
-
sched_find_first_bit(array->bitmap)
该函数的功能是在活动优先级数组中找到第一个被设置的位。由下面的代码可以知道,最终实现使用了汇编代码。
-
- /*
- * Every architecture must define this function. It's the fastest
- * way of searching a 140-bit bitmap where the first 100 bits are
- * unlikely to be set. It's guaranteed that at least one of the 140
- * bits is cleared.
- */
- static inline int sched_find_first_bit(unsigned long *b)
- {
- return find_first_bit(b, 140);
- }
- /**
- * find_first_bit - find the first set bit in a memory region
- * @addr: The address to start the search at
- * @size: The maximum size to search
- *
- * Returns the bit-number of the first set bit, not the number of the byte
- * containing a bit.
- */
- long find_first_bit(const unsigned long * addr, unsigned long size)
- {
- return __find_first_bit(addr,size);
- }
- static inline long
- __find_first_bit(const unsigned long * addr, unsigned long size)
- {
- long d0, d1;
- long res;
- /*
- * We must test the size in words, not in bits, because
- * otherwise incoming sizes in the range -63..-1 will not run
- * any scasq instructions, and then the flags used by the jz
- * instruction will have whatever random value was in place
- * before. Nobody should call us like that, but
- * find_next_bit() does when offset and size are at the same
- * word and it fails to find a one itself.
- */
- size += 63;
- size >>= 6;
- if (!size)
- return 0;
- asm volatile(
- " repe; scasq\n"
- " jz 1f\n"
- " subq $8,%%rdi\n"
- " bsfq (%%rdi),%%rax\n"
- "1: subq %[addr],%%rdi\n"
- " shlq $3,%%rdi\n"
- " addq %%rdi,%%rax"
- :"=a" (res), "=&c" (d0), "=&D" (d1)
- :"0" (0ULL), "1" (size), "2" (addr),
- [addr] "r" (addr) : "memory");
- return res;
- }
-
list_entry(ptr,type,member)
list_entry作用就是通过list_head型指针ptr换算成其宿主结构的起始地址,该宿主结构是type型的,
而ptr在其宿主结构中定义为member成员。定义在内核源文件include/linux/list.h中。
list_entry(queue->next,struct task_struct,run_list)分析:queue和queue->next都是指向list_head结构的指针。queue是优先级数组中对应优先级的链表头,而queue->next所指向的list_head则嵌在该队列第一个进程的struct task_struct中,即其run_list成员。这句代码的主要作用是通过queue->next的地址求出它所在的struct task_struct结构的起始地址。
- #define list_entry(ptr, type, member) container_of(ptr, type, member)