进程的数据结构
- 软件平台:运行于VMware Workstation 12 Player下UbuntuLTS16.04_x64 系统
- 开发环境:Linux-4.19-rc3内核,glibc-2.9
目录
在linux中,对于进程的管理是通过一个task_struct
结构体描述
1、项目列表描述——tasks
struct list_head tasks;
2、任务ID
在内核中,进程和线程都会被描述为任务。
对于每个任务,都需要一个ID来描述和识别它们,代码如下:
pid_t pid;
pid_t tgid;
struct task_struct *group_leader;
- pid(process id):描述的是线程的id
- tgid(thread group id):描述的是进程的主线程的id
- group_leader:指向进程的主线程的
task_struct
-
对于一个只有一个线程的进程
只有主线程,那pid是自己,tgid是自己,group_leader指向的还是自己
-
对于一个有多个线程的进程
线程有自己的pid,tgid就是进程的主线程的pid,group_leader指向的就是进程的主线程。
通过比较三者,主要是pid
与tgid
,可得知tast_struct
代表的是一个进程还是代表一个线程。
3、信号处理
对于每个线程或进程,会对系统的相关信号进行处理。
在该结构体中,有如下的代码描述:
/* Signal handlers: */
struct signal_struct *signal; /* 相关的信号:其中有struct sigpending shared_pending一个是线程组共享的 */
struct sighand_struct *sighand; /* 哪些信号正在通过信号处理函数进行处理 */
sigset_t blocked; /* 哪些信号被阻塞暂不处理 */
sigset_t real_blocked;
sigset_t saved_sigmask;
struct sigpending pending; /* 哪些信号尚等待处理 */
/* 默认使用用户态的函数栈,当然也可以开辟新的栈专门用于信号处理 */
unsigned long sas_ss_sp;
size_t sas_ss_size;
unsigned int sas_ss_flags;
在这里也可以通过如下结构可区分线程与进程:
struct sigpending pending
——本任务;struct signal_struct *signal->struct sigpending shared_pending
——线程组共享的;
4、任务状态
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
int exit_state;
unsigned int flags;
-
对于状态的描述有如下宏定义:其是通过bitset的方式来进行设置的,即每一位对应一个状态
/* Used in tsk->state: */ #define TASK_RUNNING 0x0000 #define TASK_INTERRUPTIBLE 0x0001 #define TASK_UNINTERRUPTIBLE 0x0002 #define __TASK_STOPPED 0x0004 #define __TASK_TRACED 0x0008 /* Used in tsk->exit_state: */ #define EXIT_DEAD 0x0010 #define EXIT_ZOMBIE 0x0020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->state again: */ #define TASK_PARKED 0x0040 #define TASK_DEAD 0x0080 #define TASK_WAKEKILL 0x0100 #define TASK_WAKING 0x0200 #define TASK_NOLOAD 0x0400 #define TASK_NEW 0x0800 #define TASK_STATE_MAX 0x1000 /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) #define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) #define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) /* Convenience macros for the sake of wake_up(): */ #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) /* get_task_state(): */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
-
TASK_RUNNING:表示进程在时刻准备运行的状态;
当处于此状态的进程获得时间片的时候,就是在运行中;如果没有获得时间片,就说明它被其他进程抢占了,在等待再次分配时间片。
-
TASK_INTERRUPTIBLE:可中断的睡眠状态。是一种浅睡眠的状态,即可以通过一个信号来唤醒进程;
-
TASK_UNINTERRUPTIBLE:不可中断的睡眠状态。是一种深度睡眠状态,不可被信号唤醒,只能死等I/O操作完成;
-
TASK_KILLABLE:可以终止的新睡眠状态。通过定义可知,其运行原理类似TASK_UNINTERRUPTIBLE,只可以响应致命信号;
-
TASK_STOPPED:进程接收到SIGSTOP、SIGTTIN、SIGTSTP或者SIGTTOU信号之后进入;
-
TASK_TRACED:进程被debugger等进程监视,进程执行被调试程序所停止。当一个进程被另外的进程所监视,每一个信号都会让进程进入该状态;
-
EXIT_ZOMBIE:僵尸状态。若一个进程结束时,其父进程未调用
wait()
系统调用来获知它的终止信息,此时进程就成了僵尸进程; -
EXIT_DEAD:是进程的最终状态。
-
-
上述EXIT_ZOMBIE与EXIT_DEAD可用于设置
exit_state
; -
对于
flags
字段,使用如下宏定义进行设置:/* * Per process flags */ #define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* Getting shut down */ #define PF_EXITPIDONE 0x00000008 /* PI exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* Dumped core */ #define PF_SIGNALED 0x00000400 /* Killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_FROZEN 0x00010000 /* Frozen for system suspend */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
-
PF_EXITING:表示正在退出。
当有这个flag的时候,在函数
find_alive_thread
中,找活着的线程,遇到有这个flag的,就直接跳过;static struct task_struct *find_alive_thread(struct task_struct *p) { struct task_struct *t; for_each_thread(p, t) { if (!(t->flags & PF_EXITING)) return t; } return NULL; }
-
PF_VCPU:表示进程运行在虚拟CPU上。
在函数
account_system_time
中,统计进程的系统运行时间,如果有这个flag,就调用account_guest_time
,按照客户机的时间进行统计;/* * Account guest CPU time to a process. * @p: the process that the CPU time gets accounted to * @cputime: the CPU time spent in virtual machine since the last update */ void account_guest_time(struct task_struct *p, u64 cputime) { u64 *cpustat = kcpustat_this_cpu->cpustat; /* Add guest time to process. */ p->utime += cputime; account_group_user_time(p, cputime); p->gtime += cputime; /* Add guest time to cpustat. */ if (task_nice(p) > 0) { cpustat[CPUTIME_NICE] += cputime; cpustat[CPUTIME_GUEST_NICE] += cputime; } else { cpustat[CPUTIME_USER] += cputime; cpustat[CPUTIME_GUEST] += cputime; } } /* * Account system CPU time to a process. * @p: the process that the CPU time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() * @cputime: the CPU time spent in kernel space since the last update */ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) { int index; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { account_guest_time(p, cputime); return; } if (hardirq_count() - hardirq_offset) index = CPUTIME_IRQ; else if (in_serving_softirq()) index = CPUTIME_SOFTIRQ; else index = CPUTIME_SYSTEM; account_system_index_time(p, cputime, index); }
-
PF_FORKNOEXEC:表示fork完了,还没有exec。
在_do_fork函数里面调用
copy_process
,这个时候把flag设置为PF_FORKNOEXEC。当exec中调用了load_elf_binary
的时候,又把这个flag去掉;static __latent_entropy struct task_struct *copy_process( unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *child_tidptr, struct pid *pid, int trace, unsigned long tls, int node){ int retval; struct task_struct *p; struct multiprocess_signals delayed; /* ...... */ /* 一些判断处理 */ /* ...... */ retval = -EAGAIN; if (atomic_read(&p->real_cred->user->processes) >= task_rlimit(p, RLIMIT_NPROC)) { if (p->real_cred->user != INIT_USER && !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) goto bad_fork_free; } current->flags &= ~PF_NPROC_EXCEEDED; retval = copy_creds(p, clone_flags); if (retval < 0) goto bad_fork_free; /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there * to stop root fork bombs. */ retval = -EAGAIN; if (nr_threads >= max_threads) goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE); p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); rcu_copy_process(p); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); /* ...... */}
-
5、进程调度
这里只列出部分字段供理解
//是否在运行队列上int on_rq;//优先级int prio;int static_prio;int normal_prio;unsigned int rt_priority;//调度器类const struct sched_class *sched_class;//调度实体struct sched_entity se;struct sched_rt_entity rt;struct sched_dl_entity dl;//调度策略unsigned int policy;//可以使用哪些CPUint nr_cpus_allowed;cpumask_t cpus_allowed;struct sched_info sched_info;
6、运行统计信息
对于每一个任务,有时候需要了解运行时的信息。
u64 utime; //用户态消耗的CPU时间u64 stime; //内核态消耗的CPU时间unsigned long nvcsw; //自愿(voluntary)上下文切换计数unsigned long nivcsw; //非自愿(involuntary)上下文切换计数u64 start_time; //进程启动时间,不包含睡眠时间u64 real_start_time;//进程启动时间,包含睡眠时间
7、进程的亲缘关系
对于Linux上的所有进程,其创建时都是有父进程的。
struct task_struct __rcu *real_parent; /* real parent process */struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */struct list_head children; /* list of my children */struct list_head sibling; /* linkage in my parent's children list */
- **real_parent:**指向真正的父进程(亲爹,即调用fork()函数的进程)
- **parent:**指向其父进程(干爹)。当它终止时,必须向它的父进程发送信号;
- **children:**表示链表的头部。链表中的所有元素都是它的子进程;
- **sibling:**用于把当前进程插入到兄弟链表中。
对于real_parent
和parent
,其通俗的理解如下:
- real_parent是亲爹,调fork的那个
- parent是干爹
- 大部分情况下亲爹干爹是一个人,ps看到的是干爹
- 亲爹干爹不一样的:比如有一种情况,比如亲爹死了,但是又得有一个父进程,比如1号进程就会被当成父进程。但进程不是1号进程fork出来的
对于亲缘关系,有如下的图表示(树形结构)
![进程亲缘关系](https://i-blog.csdnimg.cn/blog_migrate/6ea109f34381c83b01e617af7ad5ca46.png)
8、进程的权限
对于每个进程,都需要规定其权限来决定其可做什么不可做什么,以此来保证程序的安全。
/* Tracer's credentials at attach: */
const struct cred __rcu *ptracer_cred;
/* Objective and real subjective task credentials (COW): 谁操作我*/
const struct cred __rcu *real_cred;
/* Effective (overridable) subjective task credentials (COW): 我能操作谁*/
const struct cred __rcu *cred;
对于struct cred
结构体,其定义如下:
struct cred {
atomic_t usage;
#ifdef CONFIG_DEBUG_CREDENTIALS
atomic_t subscribers; /* number of processes subscribed */
void *put_addr;
unsigned magic;
#define CRED_MAGIC 0x43736564
#define CRED_MAGIC_DEAD 0x44656144
#endif
kuid_t uid; /* real UID of the task */
kgid_t gid; /* real GID of the task */
kuid_t suid; /* saved UID of the task */
kgid_t sgid; /* saved GID of the task */
kuid_t euid; /* effective UID of the task */
kgid_t egid; /* effective GID of the task */
kuid_t fsuid; /* UID for VFS ops */
kgid_t fsgid; /* GID for VFS ops */
unsigned securebits; /* SUID-less security management */
kernel_cap_t cap_inheritable; /* caps our children can inherit */
kernel_cap_t cap_permitted; /* caps we're permitted */
kernel_cap_t cap_effective; /* caps we can actually use */
kernel_cap_t cap_bset; /* capability bounding set */
kernel_cap_t cap_ambient; /* Ambient capability set */
#ifdef CONFIG_KEYS
unsigned char jit_keyring; /* default keyring to attach requested
* keys to */
struct key __rcu *session_keyring; /* keyring inherited over fork */
struct key *process_keyring; /* keyring private to this process */
struct key *thread_keyring; /* keyring private to this thread */
struct key *request_key_auth; /* assumed request_key authority */
#endif
#ifdef CONFIG_SECURITY
void *security; /* subjective LSM security */
#endif
struct user_struct *user; /* real user ID subscription */
struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
struct group_info *group_info; /* supplementary groups for euid/fsgid */
struct rcu_head rcu; /* RCU deletion hook */
} __randomize_layout;
uid和gid
,注释是real user/group id。一般情况下,谁启动的进程,就是谁的ID。但是权限审核的时候,往往不比较这两个,也就是说不大起作用。euid和egid
,注释是effective user/group id。一看这个名字,就知道这个是起“作用”的。当这个进程要操作消息队列、共享内存、信号量等对象的时候,其实就是在比较这个用户和组是否有权限。fsuid和fsgid
,也就是filesystem user/group id。这个是对文件操作会审核的权限。suid和sgid
:会把用户id和组id保存在一个地方,也就是saved uid和save gid。
对于一般的进程,其需要做特定的事时并不需要完整的权限,只需要给定特定权限即可,故引入capabilities机制(用位图表示权限)。
kernel_cap_t cap_inheritable; /* caps our children can inherit */
kernel_cap_t cap_permitted; /* caps we're permitted */
kernel_cap_t cap_effective; /* caps we can actually use */
kernel_cap_t cap_bset; /* capability bounding set */
kernel_cap_t cap_ambient; /* Ambient capability set */
-
cap_permitted:表示进程能够使用的权限。但是真正起作用的是cap_effective。cap_permitted中可以包含cap_effective中没有的权限;
-
cap_effective:表示进程起作用的权限;
-
cap_inheritable:表示当可执行文件的扩展属性;
设置了
inheritable
位时,调用exec
执行该程序会继承调用者的inheritable
集合,并将其加入到permitted
集合。但在非root用户下执行exec时,通常不会保留inheritable
集合,但是往往又是非root用户,才想保留权限,所以非常鸡肋。 -
cap_bset,也就是capability bounding set,是系统中所有进程允许保留的权限。如果这个集合中不存在某个权限,那么系统中的所有进程都没有这个权限。即使以超级用户权限执行的进程,也是一样的。
这样有很多好处。例如,系统启动以后,将加载内核模块的权限去掉,那所有进程都不能加载内核模块。这样,即便这台机器被攻破,也做不了太多有害的事情。
- cap_ambient:就是为了解决cap_inheritable鸡肋的状况,也就是,非root用户进程使用exec执行一个程序的时候,如何保留权限的问题。当执行exec的时候,
cap_ambient
会被添加到cap_permitted
中,同时设置到cap_effective
中。
9、内存管理
对于每个进程,其都需要一个独立的虚拟内存空间,用以保存相关数据。
struct mm_struct *mm;
struct mm_struct *active_mm;
10、文件与文件系统
在Linux中万物皆文件,需要有一个结构体去描述该文件系统与一个打开的文件
/* Filesystem information: */
struct fs_struct *fs;
/* Open file information: */
struct files_struct *files;
11、(看不懂)
上两节,我们解读了task_struct的大部分的成员变量。这样一个任务执行的方方面面,都可以很好地管理起来,但是其中有一个问题我们没有谈。在程序执行过程中,一旦调用到系统调用,就需要进入内核继续执行。那如何将用户态的执行和内核态的执行串起来呢?
这就需要以下两个重要的成员变量:
struct thread_info thread_info;
void *stack;
用户态函数栈
在用户态中,程序的执行往往是一个函数调用另一个函数。函数调用都是通过栈来进行的。我们前面大致讲过函数栈的原理,今天我们仔细分析一下。
函数调用其实也很简单。如果你去看汇编语言的代码,其实就是指令跳转,从代码的一个地方跳到另外一个地方。这里比较棘手的问题是,参数和返回地址应该怎么传递过去呢?
我们看函数的调用过程,A调用B、调用C、调用D,然后返回C、返回B、返回A,这是一个后进先出的过程。有没有觉得这个过程很熟悉?没错,咱们数据结构里学的栈,也是后进先出的,所以用栈保存这些最合适。
在进程的内存空间里面,栈是一个从高地址到低地址,往下增长的结构,也就是上面是栈底,下面是栈顶,入栈和出栈的操作都是从下面的栈顶开始的。
我们先来看32位操作系统的情况。在CPU里,ESP(Extended Stack Pointer)是栈顶指针寄存器,入栈操作Push和出栈操作Pop指令,会自动调整ESP的值。另外有一个寄存器EBP(Extended Base Pointer),是栈基地址指针寄存器,指向当前栈帧的最底部。
例如,A调用B,A的栈里面包含A函数的局部变量,然后是调用B的时候要传给它的参数,然后返回A的地址,这个地址也应该入栈,这就形成了A的栈帧。接下来就是B的栈帧部分了,先保存的是A栈帧的栈底位置,也就是EBP。因为在B函数里面获取A传进来的参数,就是通过这个指针获取的,接下来保存的是B的局部变量等等。
当B返回的时候,返回值会保存在EAX寄存器中,从栈中弹出返回地址,将指令跳转回去,参数也从栈中弹出,然后继续执行A。
对于64位操作系统,模式多少有些不一样。因为64位操作系统的寄存器数目比较多。rax用于保存函数调用的返回结果。栈顶指针寄存器变成了rsp,指向栈顶位置。堆栈的Pop和Push操作会自动调整rsp,栈基指针寄存器变成了rbp,指向当前栈帧的起始位置。
改变比较多的是参数传递。rdi、rsi、rdx、rcx、r8、r9这6个寄存器,用于传递存储函数调用时的6个参数。如果超过6的时候,还是需要放到栈里面。
然而,前6个参数有时候需要进行寻址,但是如果在寄存器里面,是没有地址的,因而还是会放到栈里面,只不过放到栈里面的操作是被调用函数做的。
以上的栈操作,都是在进程的内存空间里面进行的。
内核态函数栈
接下来,我们通过系统调用,从进程的内存空间到内核中了。内核中也有各种各样的函数调用来调用去的,也需要这样一个机制,这该怎么办呢?
这时候,上面的成员变量stack,也就是内核栈,就派上了用场。
Linux给每个task都分配了内核栈。在32位系统上arch/x86/include/asm/page_32_types.h,是这样定义的:一个PAGE_SIZE是4K,左移一位就是乘以2,也就是8K。
#define THREAD_SIZE_ORDER 1
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
内核栈在64位系统上arch/x86/include/asm/page_64_types.h,是这样定义的:在PAGE_SIZE的基础上左移两位,也即16K,并且要求起始地址必须是8192的整数倍。
#ifdef CONFIG_KASAN
#define KASAN_STACK_ORDER 1
#else
#define KASAN_STACK_ORDER 0
#endif
#define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER)
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
内核栈是一个非常特殊的结构,如下图所示:
这段空间的最低位置,是一个thread_info结构。这个结构是对task_struct结构的补充。因为task_struct结构庞大但是通用,不同的体系结构就需要保存不同的东西,所以往往与体系结构有关的,都放在thread_info里面。
在内核代码里面有这样一个union,将thread_info和stack放在一起,在include/linux/sched.h文件中就有。
union thread_union {
#ifndef CONFIG_THREAD_INFO_IN_TASK
struct thread_info thread_info;
#endif
unsigned long stack[THREAD_SIZE/sizeof(long)];
};
这个union就是这样定义的,开头是thread_info,后面是stack。
在内核栈的最高地址端,存放的是另一个结构pt_regs,定义如下。其中,32位和64位的定义不一样。
#ifdef __i386__
struct pt_regs {
unsigned long bx;
unsigned long cx;
unsigned long dx;
unsigned long si;
unsigned long di;
unsigned long bp;
unsigned long ax;
unsigned long ds;
unsigned long es;
unsigned long fs;
unsigned long gs;
unsigned long orig_ax;
unsigned long ip;
unsigned long cs;
unsigned long flags;
unsigned long sp;
unsigned long ss;
};
#else
struct pt_regs {
unsigned long r15;
unsigned long r14;
unsigned long r13;
unsigned long r12;
unsigned long bp;
unsigned long bx;
unsigned long r11;
unsigned long r10;
unsigned long r9;
unsigned long r8;
unsigned long ax;
unsigned long cx;
unsigned long dx;
unsigned long si;
unsigned long di;
unsigned long orig_ax;
unsigned long ip;
unsigned long cs;
unsigned long flags;
unsigned long sp;
unsigned long ss;
/* top of stack page */
};
#endif
看到这个是不是很熟悉?咱们在讲系统调用的时候,已经多次见过这个结构。当系统调用从用户态到内核态的时候,首先要做的第一件事情,就是将用户态运行过程中的CPU上下文保存起来,其实主要就是保存在这个结构的寄存器变量里。这样当从内核系统调用返回的时候,才能让进程在刚才的地方接着运行下去。
如果我们对比系统调用那一节的内容,你会发现系统调用的时候,压栈的值的顺序和struct pt_regs中寄存器定义的顺序是一样的。
在内核中,CPU的寄存器ESP或者RSP,已经指向内核栈的栈顶,在内核态里的调用都有和用户态相似的过程。
通过task_struct找内核栈
如果有一个task_struct的stack指针在手,你可以通过下面的函数找到这个线程内核栈:
static inline void *task_stack_page(const struct task_struct *task)
{
return task->stack;
}
从task_struct如何得到相应的pt_regs呢?我们可以通过下面的函数:
/*
* TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack.
* This is necessary to guarantee that the entire "struct pt_regs"
* is accessible even if the CPU haven't stored the SS/ESP registers
* on the stack (interrupt gate does not save these registers
* when switching to the same priv ring).
* Therefore beware: accessing the ss/esp fields of the
* "struct pt_regs" is possible, but they may contain the
* completely wrong values.
*/
#define task_pt_regs(task) \
({ \
unsigned long __ptr = (unsigned long)task_stack_page(task); \
__ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \
((struct pt_regs *)__ptr) - 1; \
})
你会发现,这是先从task_struct找到内核栈的开始位置。然后这个位置加上THREAD_SIZE就到了最后的位置,然后转换为struct pt_regs,再减一,就相当于减少了一个pt_regs的位置,就到了这个结构的首地址。
这里面有一个TOP_OF_KERNEL_STACK_PADDING,这个的定义如下:
#ifdef CONFIG_X86_32
# ifdef CONFIG_VM86
# define TOP_OF_KERNEL_STACK_PADDING 16
# else
# define TOP_OF_KERNEL_STACK_PADDING 8
# endif
#else
# define TOP_OF_KERNEL_STACK_PADDING 0
#endif
也就是说,32位机器上是8,其他是0。这是为什么呢?因为压栈pt_regs有两种情况。我们知道,CPU用ring来区分权限,从而Linux可以区分内核态和用户态。
因此,第一种情况,我们拿涉及从用户态到内核态的变化的系统调用来说。因为涉及权限的改变,会压栈保存SS、ESP寄存器的,这两个寄存器共占用8个byte。
另一种情况是,不涉及权限的变化,就不会压栈这8个byte。这样就会使得两种情况不兼容。如果没有压栈还访问,就会报错,所以还不如预留在这里,保证安全。在64位上,修改了这个问题,变成了定长的。
好了,现在如果你task_struct在手,就能够轻松得到内核栈和内核寄存器。
通过内核栈找task_struct
那如果一个当前在某个CPU上执行的进程,想知道自己的task_struct在哪里,又该怎么办呢?
这个艰巨的任务要交给thread_info这个结构。
struct thread_info { struct task_struct *task; /* main task structure */ __u32 flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ mm_segment_t addr_limit; unsigned int sig_on_uaccess_error:1; unsigned int uaccess_err:1; /* uaccess failed */};
这里面有个成员变量task指向task_struct,所以我们常用current_thread_info()->task来获取task_struct。
static inline struct thread_info *current_thread_info(void){ return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE);}
而thread_info的位置就是内核栈的最高位置,减去THREAD_SIZE,就到了thread_info的起始地址。
但是现在变成这样了,只剩下一个flags。
struct thread_info { unsigned long flags; /* low level flags */};
那这时候怎么获取当前运行中的task_struct呢?current_thread_info有了新的实现方式。
在include/linux/thread_info.h中定义了current_thread_info。
#include <asm/current.h>#define current_thread_info() ((struct thread_info *)current)#endif
那current又是什么呢?在arch/x86/include/asm/current.h中定义了。
struct task_struct;
DECLARE_PER_CPU(struct task_struct *, current_task);
static __always_inline struct task_struct *get_current(void)
{
return this_cpu_read_stable(current_task);
}
#define current get_current
到这里,你会发现,新的机制里面,每个CPU运行的task_struct不通过thread_info获取了,而是直接放在Per CPU 变量里面了。
多核情况下,CPU是同时运行的,但是它们共同使用其他的硬件资源的时候,我们需要解决多个CPU之间的同步问题。
Per CPU变量是内核中一种重要的同步机制。顾名思义,Per CPU变量就是为每个CPU构造一个变量的副本,这样多个CPU各自操作自己的副本,互不干涉。比如,当前进程的变量current_task就被声明为Per CPU变量。
要使用Per CPU变量,首先要声明这个变量,在arch/x86/include/asm/current.h中有:
DECLARE_PER_CPU(struct task_struct *, current_task);
然后是定义这个变量,在arch/x86/kernel/cpu/common.c中有:
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
也就是说,系统刚刚初始化的时候,current_task都指向init_task。
当某个CPU上的进程进行切换的时候,current_task被修改为将要切换到的目标进程。例如,进程切换函数__switch_to就会改变current_task。
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
......
this_cpu_write(current_task, next_p);
......
return prev_p;
}
当要获取当前的运行中的task_struct的时候,就需要调用this_cpu_read_stable进行读取。
#define this_cpu_read_stable(var) percpu_stable_op("mov", var)
好了,现在如果你是一个进程,正在某个CPU上运行,就能够轻松得到task_struct了。
总结时刻
这一节虽然只介绍了内核栈,但是内容更加重要。如果说task_struct的其他成员变量都是和进程管理有关的,内核栈是和进程运行有关系的。
我这里画了一张图总结一下32位和64位的工作模式,左边是32位的,右边是64位的。
- 在用户态,应用程序进行了至少一次函数调用。32位和64的传递参数的方式稍有不同,32位的就是用函数栈,64位的前6个参数用寄存器,其他的用函数栈。
- 在内核态,32位和64位都使用内核栈,格式也稍有不同,主要集中在pt_regs结构上。
- 在内核态,32位和64位的内核栈和task_struct的关联关系不同。32位主要靠thread_info,64位主要靠Per-CPU变量。
附录:task_struct
结构体源码
struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
* For reasons of header soup (see current_thread_info()), this
* must be the first element of task_struct.
*/
struct thread_info thread_info;
#endif
/* -1 unrunnable, 0 runnable, >0 stopped: */
volatile long state;
/*
* This begins the randomizable portion of task_struct. Only
* scheduling-critical items should be added above here.
*/
randomized_struct_fields_start
void *stack;
atomic_t usage;
/* Per task flags (PF_*), defined further below: */
unsigned int flags;
unsigned int ptrace;
#ifdef CONFIG_SMP
struct llist_node wake_entry;
int on_cpu;
#ifdef CONFIG_THREAD_INFO_IN_TASK
/* Current CPU: */
unsigned int cpu;
#endif
unsigned int wakee_flips;
unsigned long wakee_flip_decay_ts;
struct task_struct *last_wakee;
/*
* recent_used_cpu is initially set as the last CPU used by a task
* that wakes affine another task. Waker/wakee relationships can
* push tasks around a CPU where each wakeup moves to the next one.
* Tracking a recently used CPU allows a quick search for a recently
* used CPU that may be idle.
*/
int recent_used_cpu;
int wake_cpu;
#endif
int on_rq;
int prio;
int static_prio;
int normal_prio;
unsigned int rt_priority;
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
struct sched_dl_entity dl;
#ifdef CONFIG_PREEMPT_NOTIFIERS
/* List of struct preempt_notifier: */
struct hlist_head preempt_notifiers;
#endif
#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif
unsigned int policy;
int nr_cpus_allowed;
cpumask_t cpus_allowed;
#ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
union rcu_special rcu_read_unlock_special;
struct list_head rcu_node_entry;
struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TASKS_RCU
unsigned long rcu_tasks_nvcsw;
u8 rcu_tasks_holdout;
u8 rcu_tasks_idx;
int rcu_tasks_idle_cpu;
struct list_head rcu_tasks_holdout_list;
#endif /* #ifdef CONFIG_TASKS_RCU */
struct sched_info sched_info;
struct list_head tasks;
#ifdef CONFIG_SMP
struct plist_node pushable_tasks;
struct rb_node pushable_dl_tasks;
#endif
struct mm_struct *mm;
struct mm_struct *active_mm;
/* Per-thread vma caching: */
struct vmacache vmacache;
#ifdef SPLIT_RSS_COUNTING
struct task_rss_stat rss_stat;
#endif
int exit_state;
int exit_code;
int exit_signal;
/* The signal sent when the parent dies: */
int pdeath_signal;
/* JOBCTL_*, siglock protected: */
unsigned long jobctl;
/* Used for emulating ABI behavior of previous Linux versions: */
unsigned int personality;
/* Scheduler bits, serialized by scheduler locks: */
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
unsigned sched_remote_wakeup:1;
/* Force alignment to the next boundary: */
unsigned :0;
/* Unserialized, strictly 'current' */
/* Bit to tell LSMs we're in execve(): */
unsigned in_execve:1;
unsigned in_iowait:1;
#ifndef TIF_RESTORE_SIGMASK
unsigned restore_sigmask:1;
#endif
#ifdef CONFIG_MEMCG
unsigned in_user_fault:1;
#ifdef CONFIG_MEMCG_KMEM
unsigned memcg_kmem_skip_account:1;
#endif
#endif
#ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
#endif
#ifdef CONFIG_CGROUPS
/* disallow userland-initiated cgroup migration */
unsigned no_cgroup_migration:1;
#endif
#ifdef CONFIG_BLK_CGROUP
/* to be used once the psi infrastructure lands upstream. */
unsigned use_memdelay:1;
#endif
unsigned long atomic_flags; /* Flags requiring atomic access. */
struct restart_block restart_block;
pid_t pid;
pid_t tgid;
#ifdef CONFIG_STACKPROTECTOR
/* Canary value for the -fstack-protector GCC feature: */
unsigned long stack_canary;
#endif
/*
* Pointers to the (original) parent process, youngest child, younger sibling,
* older sibling, respectively. (p->father can be replaced with
* p->real_parent->pid)
*/
/* Real parent process: */
struct task_struct __rcu *real_parent;
/* Recipient of SIGCHLD, wait4() reports: */
struct task_struct __rcu *parent;
/*
* Children/sibling form the list of natural children:
*/
struct list_head children;
struct list_head sibling;
struct task_struct *group_leader;
/*
* 'ptraced' is the list of tasks this task is using ptrace() on.
*
* This includes both natural children and PTRACE_ATTACH targets.
* 'ptrace_entry' is this task's link on the p->parent->ptraced list.
*/
struct list_head ptraced;
struct list_head ptrace_entry;
/* PID/PID hash table linkage. */
struct pid *thread_pid;
struct hlist_node pid_links[PIDTYPE_MAX];
struct list_head thread_group;
struct list_head thread_node;
struct completion *vfork_done;
/* CLONE_CHILD_SETTID: */
int __user *set_child_tid;
/* CLONE_CHILD_CLEARTID: */
int __user *clear_child_tid;
u64 utime;
u64 stime;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
u64 utimescaled;
u64 stimescaled;
#endif
u64 gtime;
struct prev_cputime prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
struct vtime vtime;
#endif
#ifdef CONFIG_NO_HZ_FULL
atomic_t tick_dep_mask;
#endif
/* Context switch counts: */
unsigned long nvcsw;
unsigned long nivcsw;
/* Monotonic time in nsecs: */
u64 start_time;
/* Boot based time in nsecs: */
u64 real_start_time;
/* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
unsigned long min_flt;
unsigned long maj_flt;
#ifdef CONFIG_POSIX_TIMERS
struct task_cputime cputime_expires;
struct list_head cpu_timers[3];
#endif
/* Process credentials: */
/* Tracer's credentials at attach: */
const struct cred __rcu *ptracer_cred;
/* Objective and real subjective task credentials (COW): */
const struct cred __rcu *real_cred;
/* Effective (overridable) subjective task credentials (COW): */
const struct cred __rcu *cred;
/*
* executable name, excluding path.
*
* - normally initialized setup_new_exec()
* - access it with [gs]et_task_comm()
* - lock it with task_lock()
*/
char comm[TASK_COMM_LEN];
struct nameidata *nameidata;
#ifdef CONFIG_SYSVIPC
struct sysv_sem sysvsem;
struct sysv_shm sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
unsigned long last_switch_count;
unsigned long last_switch_time;
#endif
/* Filesystem information: */
struct fs_struct *fs;
/* Open file information: */
struct files_struct *files;
/* Namespaces: */
struct nsproxy *nsproxy;
/* Signal handlers: */
struct signal_struct *signal;
struct sighand_struct *sighand;
sigset_t blocked;
sigset_t real_blocked;
/* Restored if set_restore_sigmask() was used: */
sigset_t saved_sigmask;
struct sigpending pending;
unsigned long sas_ss_sp;
size_t sas_ss_size;
unsigned int sas_ss_flags;
struct callback_head *task_works;
struct audit_context *audit_context;
#ifdef CONFIG_AUDITSYSCALL
kuid_t loginuid;
unsigned int sessionid;
#endif
struct seccomp seccomp;
/* Thread group tracking: */
u32 parent_exec_id;
u32 self_exec_id;
/* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
spinlock_t alloc_lock;
/* Protection of the PI data structures: */
raw_spinlock_t pi_lock;
struct wake_q_node wake_q;
#ifdef CONFIG_RT_MUTEXES
/* PI waiters blocked on a rt_mutex held by this task: */
struct rb_root_cached pi_waiters;
/* Updated under owner's pi_lock and rq lock */
struct task_struct *pi_top_task;
/* Deadlock detection and priority inheritance handling: */
struct rt_mutex_waiter *pi_blocked_on;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
/* Mutex deadlock detection: */
struct mutex_waiter *blocked_on;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
unsigned int irq_events;
unsigned long hardirq_enable_ip;
unsigned long hardirq_disable_ip;
unsigned int hardirq_enable_event;
unsigned int hardirq_disable_event;
int hardirqs_enabled;
int hardirq_context;
unsigned long softirq_disable_ip;
unsigned long softirq_enable_ip;
unsigned int softirq_disable_event;
unsigned int softirq_enable_event;
int softirqs_enabled;
int softirq_context;
#endif
#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH 48UL
u64 curr_chain_key;
int lockdep_depth;
unsigned int lockdep_recursion;
struct held_lock held_locks[MAX_LOCK_DEPTH];
#endif
#ifdef CONFIG_UBSAN
unsigned int in_ubsan;
#endif
/* Journalling filesystem info: */
void *journal_info;
/* Stacked block device info: */
struct bio_list *bio_list;
#ifdef CONFIG_BLOCK
/* Stack plugging: */
struct blk_plug *plug;
#endif
/* VM state: */
struct reclaim_state *reclaim_state;
struct backing_dev_info *backing_dev_info;
struct io_context *io_context;
/* Ptrace state: */
unsigned long ptrace_message;
siginfo_t *last_siginfo;
struct task_io_accounting ioac;
#ifdef CONFIG_TASK_XACCT
/* Accumulated RSS usage: */
u64 acct_rss_mem1;
/* Accumulated virtual memory usage: */
u64 acct_vm_mem1;
/* stime + utime since last update: */
u64 acct_timexpd;
#endif
#ifdef CONFIG_CPUSETS
/* Protected by ->alloc_lock: */
nodemask_t mems_allowed;
/* Seqence number to catch updates: */
seqcount_t mems_allowed_seq;
int cpuset_mem_spread_rotor;
int cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
/* Control Group info protected by css_set_lock: */
struct css_set __rcu *cgroups;
/* cg_list protected by css_set_lock and tsk->alloc_lock: */
struct list_head cg_list;
#endif
#ifdef CONFIG_INTEL_RDT
u32 closid;
u32 rmid;
#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
struct compat_robust_list_head __user *compat_robust_list;
#endif
struct list_head pi_state_list;
struct futex_pi_state *pi_state_cache;
#endif
#ifdef CONFIG_PERF_EVENTS
struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
struct mutex perf_event_mutex;
struct list_head perf_event_list;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
unsigned long preempt_disable_ip;
#endif
#ifdef CONFIG_NUMA
/* Protected by alloc_lock: */
struct mempolicy *mempolicy;
short il_prev;
short pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
int numa_scan_seq;
unsigned int numa_scan_period;
unsigned int numa_scan_period_max;
int numa_preferred_nid;
unsigned long numa_migrate_retry;
/* Migration stamp: */
u64 node_stamp;
u64 last_task_numa_placement;
u64 last_sum_exec_runtime;
struct callback_head numa_work;
struct numa_group *numa_group;
/*
* numa_faults is an array split into four regions:
* faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
* in this precise order.
*
* faults_memory: Exponential decaying average of faults on a per-node
* basis. Scheduling placement decisions are made based on these
* counts. The values remain static for the duration of a PTE scan.
* faults_cpu: Track the nodes the process was running on when a NUMA
* hinting fault was incurred.
* faults_memory_buffer and faults_cpu_buffer: Record faults per node
* during the current scan window. When the scan completes, the counts
* in faults_memory and faults_cpu decay and these values are copied.
*/
unsigned long *numa_faults;
unsigned long total_numa_faults;
/*
* numa_faults_locality tracks if faults recorded during the last
* scan window were remote/local or failed to migrate. The task scan
* period is adapted based on the locality of the faults with different
* weights depending on whether they were shared or private faults
*/
unsigned long numa_faults_locality[3];
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_RSEQ
struct rseq __user *rseq;
u32 rseq_len;
u32 rseq_sig;
/*
* RmW on rseq_event_mask must be performed atomically
* with respect to preemption.
*/
unsigned long rseq_event_mask;
#endif
struct tlbflush_unmap_batch tlb_ubc;
struct rcu_head rcu;
/* Cache last used pipe for splice(): */
struct pipe_inode_info *splice_pipe;
struct page_frag task_frag;
#ifdef CONFIG_TASK_DELAY_ACCT
struct task_delay_info *delays;
#endif
#ifdef CONFIG_FAULT_INJECTION
int make_it_fail;
unsigned int fail_nth;
#endif
/*
* When (nr_dirtied >= nr_dirtied_pause), it's time to call
* balance_dirty_pages() for a dirty throttling pause:
*/
int nr_dirtied;
int nr_dirtied_pause;
/* Start of a write-and-pause period: */
unsigned long dirty_paused_when;
#ifdef CONFIG_LATENCYTOP
int latency_record_count;
struct latency_record latency_record[LT_SAVECOUNT];
#endif
/*
* Time slack values; these are used to round up poll() and
* select() etc timeout values. These are in nanoseconds.
*/
u64 timer_slack_ns;
u64 default_timer_slack_ns;
#ifdef CONFIG_KASAN
unsigned int kasan_depth;
#endif
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/* Index of current stored address in ret_stack: */
int curr_ret_stack;
/* Stack of return addresses for return function tracing: */
struct ftrace_ret_stack *ret_stack;
/* Timestamp for last schedule: */
unsigned long long ftrace_timestamp;
/*
* Number of functions that haven't been traced
* because of depth overrun:
*/
atomic_t trace_overrun;
/* Pause tracing: */
atomic_t tracing_graph_pause;
#endif
#ifdef CONFIG_TRACING
/* State flags for use by tracers: */
unsigned long trace;
/* Bitmask and counter of trace recursion: */
unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
#ifdef CONFIG_KCOV
/* Coverage collection mode enabled for this task (0 if disabled): */
unsigned int kcov_mode;
/* Size of the kcov_area: */
unsigned int kcov_size;
/* Buffer for coverage collection: */
void *kcov_area;
/* KCOV descriptor wired with this task or NULL: */
struct kcov *kcov;
#endif
#ifdef CONFIG_MEMCG
struct mem_cgroup *memcg_in_oom;
gfp_t memcg_oom_gfp_mask;
int memcg_oom_order;
/* Number of pages to reclaim on returning to userland: */
unsigned int memcg_nr_pages_over_high;
/* Used by memcontrol for targeted memcg charge: */
struct mem_cgroup *active_memcg;
#endif
#ifdef CONFIG_BLK_CGROUP
struct request_queue *throttle_queue;
#endif
#ifdef CONFIG_UPROBES
struct uprobe_task *utask;
#endif
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
unsigned int sequential_io;
unsigned int sequential_io_avg;
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
unsigned long task_state_change;
#endif
int pagefault_disabled;
#ifdef CONFIG_MMU
struct task_struct *oom_reaper_list;
#endif
#ifdef CONFIG_VMAP_STACK
struct vm_struct *stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
/* A live task holds one reference: */
atomic_t stack_refcount;
#endif
#ifdef CONFIG_LIVEPATCH
int patch_state;
#endif
#ifdef CONFIG_SECURITY
/* Used by LSM modules for access restriction: */
void *security;
#endif
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
*/
randomized_struct_fields_end
/* CPU-specific state of this task: */
struct thread_struct thread;
/*
* WARNING: on x86, 'thread_struct' contains a variable-sized
* structure. It *MUST* be at the end of 'task_struct'.
*
* Do not put anything below here!
*/
};