深入Linux内核架构笔记 - 进程管理与调度2

最新推荐文章于 2023-10-05 18:49:20 发布

snoopyljc

最新推荐文章于 2023-10-05 18:49:20 发布

阅读量104

点赞数

分类专栏： Linux 文章标签： Linux Process

本文链接：https://blog.csdn.net/snoopyljc/article/details/94771309

版权

Linux 专栏收录该内容

15 篇文章 0 订阅

订阅专栏

进程表示

Linux内核涉及进程的所有算法都围绕一个名为task_struct的结构建立，简化版本定义如下:

/*
 * Simplified listing of the process descriptor around which the kernel's
 * process-related algorithms are built. Members are grouped by the
 * section comments below; the listing is abridged and ends with an
 * elision marker.
 */
struct task_struct {
	volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
	void *stack;
	atomic_t usage;
	unsigned long flags; /* per process flags, defined below */
	unsigned long ptrace;
	int lock_depth; /* BKL lock depth */

	int prio, static_prio, normal_prio;
	struct list_head run_list;
	const struct sched_class *sched_class;
	struct sched_entity se;
	unsigned short ioprio;
	unsigned long policy;
	cpumask_t cpus_allowed;
	unsigned int time_slice;

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
	struct sched_info sched_info;
#endif

	struct list_head tasks;
	/*
	 * ptrace_list/ptrace_children forms the list of my children
	 * that were stolen by a ptracer.
	 */
	struct list_head ptrace_children;
	struct list_head ptrace_list;

	struct mm_struct *mm, *active_mm;

	/* task state */
	struct linux_binfmt *binfmt;
	long exit_state;
	int exit_code, exit_signal;
	int pdeath_signal; /* The signal sent when the parent dies */

	unsigned int personality;
	unsigned did_exec:1;
	pid_t pid;
	pid_t tgid;
	/*
	 * pointers to (original) parent process, youngest child, younger sibling,
	 * older sibling, respectively. (p->father can be replaced with
	 * p->parent->pid)
	 */
	struct task_struct *real_parent; /* real parent process (when being debugged) */
	struct task_struct *parent; /* parent process */

	/*
	 * children/sibling forms the list of my children plus the
	 * tasks I'm ptracing.
	 */
	struct list_head children; /* list of my children */
	struct list_head sibling; /* linkage in my parent's children list */
	struct task_struct *group_leader; /* threadgroup leader */

	/* PID/PID hash table linkage. */
	struct pid_link pids[PIDTYPE_MAX];
	struct list_head thread_group;

	struct completion *vfork_done; /* for vfork() */
	int __user *set_child_tid; /* CLONE_CHILD_SETTID */
	int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */

	unsigned long rt_priority;
	cputime_t utime, stime, utimescaled, stimescaled;
	unsigned long nvcsw, nivcsw; /* context switch counts */
	struct timespec start_time; /* monotonic time */
	struct timespec real_start_time; /* boot based time */
	/*
	 * mm fault and swap info: this can arguably be seen as either
	 * mm-specific or thread-specific
	 */
	unsigned long min_flt, maj_flt;

	cputime_t it_prof_expires, it_virt_expires;
	unsigned long long it_sched_expires;
	struct list_head cpu_timers[3];

	/* process credentials */
	uid_t uid, euid, suid, fsuid;
	gid_t gid, egid, sgid, fsgid;
	struct group_info *group_info;
	kernel_cap_t cap_effective, cap_inheritable, cap_permitted;
	unsigned keep_capabilities:1;
	struct user_struct *user;

	/*
	 * executable name excluding path
	 * - access with [gs]et_task_comm (which lock it with task_lock())
	 * - initialized normally by flush_old_exec
	 */
	char comm[TASK_COMM_LEN];
	/* file system info */
	int link_count, total_link_count;
	/* ipc stuff */
	struct sysv_sem sysvsem;
	/* CPU-specific state of this task */
	struct thread_struct thread;
	/* filesystem information */
	struct fs_struct *fs;
	/* open file information */
	struct files_struct *files;
	/* namespace */
	struct nsproxy *nsproxy;

	/* signal handlers */
	struct signal_struct *signal;
	struct sighand_struct *sighand;
	sigset_t blocked, real_blocked;
	sigset_t saved_sigmask; /* To be restored with TIF_RESTORE_SIGMASK */
	struct sigpending pending;

	unsigned long sas_ss_sp;
	size_t sas_ss_size;
	int (*notifier)(void *priv);
	void *notifier_data;
	sigset_t *notifier_mask;

#ifdef CONFIG_SECURITY
	void *security;
#endif

	/* Thread group tracking */
	u32 parent_exec_id;
	u32 self_exec_id;

	/* journalling filesystem info */
	void *journal_info;

	/* VM state */
	struct reclaim_state *reclaim_state;
	struct backing_dev_info *backing_dev_info;
	struct io_context *io_context;
	unsigned long ptrace_message;
	siginfo_t *last_siginfo; /* For ptrace use. */
	/* ... remaining members omitted in this simplified listing ... */
};

状态和执行信息，如待决信号，使用的二进制格式，进程ID号，到父进程以及其他相关进程的指针，优先级和程序有关的时间信息。
有关已经分配的虚拟内存的信息
进程身份凭据，用户ID，组ID以及权限等
使用的文件包含程序代码的二进制文件，以及进程所处理的所有文件的文件系统信息
线程信息记录该进程特定于CPU的运行时数据
进程间通信有关的信息
用于处理信号的相关信息
比较重要的成员介绍

state : 执行进程的状态
TASK_RUNNING : 表示进程可以运行
TASK_INTERRUPTIBLE : 进程因为等待某种事件或者资源而睡眠，可以通过信号唤醒
TASK_UNINTERRUPTIBLE : 进程因为等待某种事件或者资源而睡眠，不能由信号唤醒
TASK_STOPPED : 进程停止运行，比如由调试器暂停
TASK_TRACED : 用于从停止的进程中，将被调试的进程与常规的进程分开
EXIT_ZOMBIE : 进程已经停止运行，但是资源未完全释放
EXIT_DEAD : 父进程发出wait系统调用，但是进程资源完全从系统移除之前的状态，只有多个线程同时对同一个进程发wait调用时，该状态才有意义

资源限制

	/* One resource limit, expressed as a soft/hard pair. */
	struct rlimit {
		unsigned long rlim_cur;	/* current (soft) limit; adjustable via setrlimit */
		unsigned long rlim_max;	/* maximum permitted value (hard limit) */
	};
	rlim_cur : 进程的当前资源限制，可由setrlimit系统调用来修改，也称之为软限制
	rlim_max : 进程限制的最大容许值，也称之为硬限制

进程类型

典型的UNIX进程包括：由二进制代码组成的应用程序，线程，分配给应用程序的一组资源
新进程是通过fork和exec系统调用产生的，fork负责生成当前进程的一个副本，原进程的资源都以合适的方式复制到子进程; exec用于从一个可执行的二进制文件加载另外一个程序，来代替当前运行的程序; clone用于实现线程，但需要用户空间的支持才能提供完整的实现

命名空间

一种隔离资源的方法，不同于KVM和VMWare提供的虚拟化环境，命名空间在只使用一个内核的一台物理计算机上，将进程或者其它资源放到不同的容器中来实现隔离
子容器不了解系统中的其他容器，但是父容器知道子命名空间的存在，也可以看到其中执行的所有进程。
命名空间的创建方式
- 使用fork／clone系统调用的时候，通过指定特殊的选项来创建新的命名空间
- unshare系统调用将进程的某些部分从父进程分离，其中也包括命名空间。
- 实现子系统此前的全局属性封装到命名空间中，每个进程关联到一个选定的命名空间，每个可以感知命名空间的子系统都必须提供一个数据结构，将所有通过命名空间形式提供的对象集中起来。
```
    /*
     * Gathers, in one place, pointers to the per-subsystem namespace
     * objects that a process is associated with. Each member below refers
     * to the namespace structure of one subsystem.
     */
    struct nsproxy {
   	atomic_t count;			/* presumably a reference count -- TODO confirm */
   	struct uts_namespace *uts_ns;	/* UTS namespace */
   	struct ipc_namespace *ipc_ns;	/* IPC namespace */
   	struct mnt_namespace *mnt_ns;	/* mount namespace */
   	struct pid_namespace *pid_ns;	/* PID namespace */
   	struct user_namespace *user_ns;	/* user namespace */
   	struct net *net_ns;		/* network namespace */
   };
```
- 默认命名空间, init_nsproxy (kernel/nsproxy.c)

进程ID号

Unix进程总是会分配一个号码用于在其命名空间中唯一地标识它们，该号码称为进程ID号，简称PID。使用fork或者clone产生的每个进程都由内核自动分配了一个新的唯一的PID.

进程ID
- 每个进程除了PID这个特征值，还有其他的ID
  1. 线程组ID(TGID)：线程组中的主进程ID，通过clone创建的所有线程的task_struct的group_leader成员，会指向组长的task_struct实例
  2. 进程组ID(PGID) : 独立的进程可以通过setpgrp系统调用合并成进程组，进程组成员的task_struct的pgrp的属性值为进程组组长的PID
  3. 会话ID(SID) : 几个进程组可以合并成会话，会话中的所有进程都用相同的会话ID，保存在task_struct的session成员中
- 命名空间增加了ID管理的复杂性，因为父命名空间能看到子命名空间的ID，反之则不行，这意味着某些进程具有多个PID，这必须反映在数据结构中，我们必须区分全局ID和局部ID
  1. 全局ID是在内核本身和初始命名空间中的唯一ID号，init进程属于初始命名空间，对于每个ID类型，都有一个给定的在这个系统中唯一的全局ID
  2. 局部ID是属于某个局部空间的，只在所属的命名空间内部有效
- task_struct中的ID
  1. pid_t pid : 全局PID
  2. pid_t tgid : 全局TGID
  3. signal->__pgrp : 进程组ID
  4. signal->__session : 会话ID
管理PID
- 数据结构
  1. PID命名空间
```
/*
 * PID namespace (abridged: only the members discussed in the text are
 * shown, the elided ones are marked with comments).
 */
struct pid_namespace {
	/* ... */
	struct task_struct *child_reaper;	/* acts like the global init process for this namespace */
	/* ... */
	int level;				/* depth of this namespace in the namespace hierarchy */
	struct pid_namespace *parent;		/* parent namespace */
};
```
    每个命名空间都有一个进程child_reaper，作用相当于全局的init进程
    parent是指向父命名空间的指针，level表示当前命名空间在命名空间层次中的深度
  2. PID管理的数据结构
```
/*
 * One numeric ID as seen from one particular namespace.
 */
struct upid {
	int nr;				/* numeric value of the ID */
	struct pid_namespace *ns;	/* namespace this ID belongs to */
	struct hlist_node pid_chain;	/* hash overflow chain; find_pid_ns() walks it in pid_hash */
};
/*
 * Kernel-internal representation of an ID: the tasks sharing it plus its
 * numeric value in each namespace it is visible in.
 */
struct pid
{
	atomic_t count;		/* presumably a reference count -- TODO confirm */
	/* lists of tasks that use this pid */
	struct hlist_head tasks[PIDTYPE_MAX];	/* one list head per ID type */
	int level;		/* compared against ns->level in pid_nr_ns() */
	/*
	 * One upid per namespace, indexed by namespace level.
	 * NOTE(review): declared with a single element yet indexed with
	 * ns->level elsewhere; presumably the object is allocated larger
	 * than sizeof(struct pid) -- confirm against the allocator.
	 */
	struct upid numbers[1];
};
```
    nr : 表示ID的数值
    ns : 该ID所属的命名空间
    pid_chain : 散列溢出链表，散列表用于保存所有的upid(参考下面的pid_hash)
    tasks : 散列表头数组，共享同一个ID的task_struct通过该列表连接起来.
    numbers：upid的数组，每个数组项代表一个命名空间
  3. 由于所有共享同一ID的task_struct实例都按进程存储在一个散列表中，因此需要在task_struct中增加一个散列表元素:
```
/* Excerpt of the process descriptor: only the PID linkage member is shown. */
struct task_struct {
	...
	/* PID/PID hash table linkage: one pid_link slot per ID type. */
	struct pid_link pids[PIDTYPE_MAX];
	...
};
/*
 * Connects a task to a struct pid for one ID type.
 */
struct pid_link
{
	struct hlist_node node;	/* entry in pid->tasks[type]; added by attach_pid() */
	struct pid *pid;	/* the struct pid this task is attached to */
};
```
    假定已经分配了struct pid的一个新实例，并设置用于给定的ID类型，会使用如下的方法附加到task_struct
```
/*
 * Attach @pid to @task for the ID type @type.
 *
 * Stores @pid in the task's pid_link slot for @type and adds that slot to
 * the head of the pid's task list for the same type, so each side can
 * reach the other. Always returns 0.
 */
int fastcall attach_pid(struct task_struct *task, enum pid_type type,
	struct pid *pid)
{
	struct pid_link *slot = &task->pids[type];

	slot->pid = pid;
	hlist_add_head_rcu(&slot->node, &pid->tasks[type]);

	return 0;
}
```
    上述函数建立了task_struct和pid的双向链接，通过task_struct->pids[type].pid可以访问到pid，从pid实例开始，可以遍历tasks[type]散列表找到task_struct.
  4. 用于根据命名空间和指定的PID数值查找pid结构实例的散列表
```
/* Hash table used to find a struct pid from a numeric ID and its namespace. */
static struct hlist_head *pid_hash;
```

函数

给定task_struct, ID类型和命名空间，获取命名空间局部的数字ID

static inline struct pid *task_pid(struct task_struct *task)
{
	return task->pids[PIDTYPE_PID].pid;
}
/*
 * Return the process-group struct pid of @task. The PIDTYPE_PGID link is
 * read from the task's group leader, not from @task itself.
 */
static inline struct pid *task_pgrp(struct task_struct *task)
{
	struct task_struct *leader = task->group_leader;

	return leader->pids[PIDTYPE_PGID].pid;
}
/*
 * Return the numeric ID of @pid as seen from namespace @ns.
 *
 * Returns 0 when @pid is NULL, when @ns lies at a deeper level than @pid
 * has entries for, or when the entry at @ns's level belongs to a
 * different namespace.
 */
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
	struct upid *entry;

	/* No pid at all, or the pid has no entry at this namespace's level. */
	if (!pid || ns->level > pid->level)
		return 0;

	entry = &pid->numbers[ns->level];
	if (entry->ns != ns)
		return 0;

	return entry->nr;
}

给出局部数字ID和对应的命名空间，查找task_struct实例
根据进程的局部数字ID和关联的命名空间的指针进行散列，得到在pid_hash数组的索引，然后遍历散列表直至找到所要的元素，如下：

/*
 * Look up the struct pid whose numeric ID in namespace @ns is @nr.
 *
 * Hashes (@nr, @ns) to pick a pid_hash bucket, then walks that bucket's
 * pid_chain list for a upid matching both values. The enclosing struct
 * pid is recovered from the matching upid's slot numbers[ns->level].
 * Returns NULL if no entry matches.
 */
struct pid * fastcall find_pid_ns(int nr, struct pid_namespace *ns)
{
	struct hlist_head *bucket = &pid_hash[pid_hashfn(nr, ns)];
	struct hlist_node *pos;
	struct upid *cand;

	hlist_for_each_entry_rcu(cand, pos, bucket, pid_chain) {
		if (cand->nr != nr || cand->ns != ns)
			continue;
		return container_of(cand, struct pid, numbers[ns->level]);
	}

	return NULL;
}

pid_task取出pid->tasks[type]散列表中的第一个task_struct实例

/*
 * Find a task from a numeric ID @nr of ID type @type in namespace @ns:
 * resolve the struct pid first, then take the task attached to it for
 * that type.
 */
struct task_struct *find_task_by_pid_type_ns(int type, int nr,
	struct pid_namespace *ns)
{
	struct pid *pid = find_pid_ns(nr, ns);

	return pid_task(pid, type);
}
/*
 * Return the first task on @pid's task list for ID type @type, or NULL
 * when @pid is NULL or that list is empty.
 */
struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
{
	struct hlist_node *head;

	if (!pid)
		return NULL;

	head = rcu_dereference(pid->tasks[type].first);
	if (!head)
		return NULL;

	/* The list node is embedded in the task's pid_link slot for @type. */
	return hlist_entry(head, struct task_struct, pids[(type)].node);
}

生成唯一的PID
- 因为其他的ID可以派生自PID，所以只需要为PID生成唯一的数值即可，内核使用一个大的位图来跟踪已经分配和仍然可用的PID，其中每个PID由一个比特标识，PID可以通过对应比特在位图中的位置来计算，位图在pid的命名空间中维护。

进程关系

父子关系和兄弟关系

struct task_struct {
   ...
   struct list_head children; /* list of my children */
   struct list_head sibling; /* linkage in my parent’s children list */
   ...
}

在这里插入图片描述

snoopyljc

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
深入Linux内核架构笔记 - 进程管理与调度2

进程表示 Linux内核涉及进程的所有算法都围绕一个名为task_struct的结构建立，简化版本定义如下:struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ void *stack; atomic_t usage; unsigned lon...
复制链接

扫一扫

专栏目录