linux 进程管理

【星星之火】

已于 2022-09-27 16:18:52 修改

阅读量353

点赞数

分类专栏： Linux子系统文章标签： linux 运维服务器

于 2022-08-01 16:03:04 首次发布

本文链接：https://blog.csdn.net/fengyuwuzu0519/article/details/126101885

版权

Linux子系统专栏收录该内容

21 篇文章 21 订阅

订阅专栏

进程运行体现了操作系统的价值。本文介绍进程相关的基本概念：linux内核是如何产生、管理进程的，以及操作系统在多个进程之间是如何进行调度的，毕竟linux是一个可抢占的操作系统。

内核如何分配处理器资源给多个进程使用。或许没有最优的方案，因为完全理解和适应不同进程是一个不可能的事情，内核不断的发展适应当下最普世、最常见的情况。调度算法也经过了不断的发展和重写。

本文重点在于

进程基础，内核如何管理及维护进程，进程虚拟地址空间
进程的产生、加载，即 fork，exec过程
进程调度算法

一、进程基础

进程与线程理解

进程: 正在执行的程序及其相关联的资源，linux也称之为任务task

线程：进程中活动的对象(内核调度的对象是线程)，线程可以当做一个特殊的进程。

虚拟机制

虚拟处理器，让进程感觉在独享处理器资源

虚拟内存，让进程感觉独享空间

进程产生

fork()系统调用，调用fork()的进程为父进程，fork()产生的进程是子进程。fork返回后，父子进程均从fork返回后开始执行。

进程空间创建

exec()来创建进程空间。

进程描述符

struct task_struct 结构体描述一个进程的所有信息，占约1.7KB空间。包含：打开的文件、进程地址空间、挂起的信号、进程状态等等。

thread_info在内核栈的尾端存放，里面有指向task_struct指针。

task_struct由slab分配器分配。

处理进程的代码，大都需要task_struct。

进程描述符管理

task_list 双向循环链表管理所有进程描述符task_struct 。遍历该链表可以找到所有进程。

进程识别

pid来识别每个不同的进程。pid_t，实际为一个int类型。默认最大32768个，与老linux兼容。所有进程都是pid=1的init进程的后代。

进程五种状态

运行、可中断、不可中断、被其他进程跟踪、停止。内核通过set_task_state(task,state)来设置进程的状态。

进程上下文

系统调用、异常可以使得用户态进程切换到内核态，表明进程处于进程上下文中。只有这样进程才能访问到内核。

进程家族树

拥有同一个父进程的进程称为“兄弟”

进程描述符中有一个父进程指针parent指针指向父进程描述符和子进程链表children。

进程地址空间(内存描述符、虚拟内存区域)

linux采用虚拟内存技术，加载到内存的可执行程序，即进程可以独立的看到4GB的进程地址空间(32位机)，但是实际的物理内存可能只有256MB或者512MB，尤其在嵌入式方面，物理内存很少达到4GB。那么4GB的虚拟地址空间如何使用512MB的物理内存呢？往往进程使用的内存总和也不大。

进程地址空间通过mm_struct来描述，包含进程地址空间的所有信息。

内核所有mm_struct通过自身域的mmlist连接在一起。

/*
 * Memory descriptor for one address space.
 *
 * The fields visible in this snippet cover: the VMA list and rb-tree root,
 * the mmap placement hook and bases, the page directory pointer, user and
 * reference counts, page-table accounting, locks, the global mmlist linkage,
 * memory usage counters, the start/end addresses of code, data, heap (brk),
 * stack, arguments and environment, plus a number of config-dependent
 * members guarded by #ifdef.
 */
struct mm_struct {
	struct vm_area_struct *mmap;		/* list of VMAs */
	/* rb-tree root; presumably indexes the same VMAs as ->mmap -- TODO confirm */
	struct rb_root mm_rb;
	u32 vmacache_seqnum;                   /* per-thread vmacache */
#ifdef CONFIG_MMU
	/* Hook taking (file, addr, len, pgoff, flags) and returning an address. */
	unsigned long (*get_unmapped_area) (struct file *filp,
				unsigned long addr, unsigned long len,
				unsigned long pgoff, unsigned long flags);
#endif
	unsigned long mmap_base;		/* base of mmap area */
	unsigned long mmap_legacy_base;         /* base of mmap area in bottom-up allocations */
	unsigned long task_size;		/* size of task vm space */
	unsigned long highest_vm_end;		/* highest vma end address */
	/* presumably the top-level page table (page global directory) -- TODO confirm */
	pgd_t * pgd;
	atomic_t mm_users;			/* How many users with user space? */
	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
	atomic_long_t nr_ptes;			/* PTE page table pages */
#if CONFIG_PGTABLE_LEVELS > 2
	atomic_long_t nr_pmds;			/* PMD page table pages */
#endif
	int map_count;				/* number of VMAs */

	spinlock_t page_table_lock;		/* Protects page tables and some counters */
	/* NOTE(review): looks like the rw-semaphore guarding the mmap/VMA data -- confirm */
	struct rw_semaphore mmap_sem;

	struct list_head mmlist;		/* List of maybe swapped mm's.	These are globally strung
						 * together off init_mm.mmlist, and are protected
						 * by mmlist_lock
						 */


	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
	unsigned long hiwater_vm;	/* High-water virtual memory usage */

	unsigned long total_vm;		/* Total pages mapped */
	unsigned long locked_vm;	/* Pages that have PG_mlocked set */
	unsigned long pinned_vm;	/* Refcount permanently increased */
	unsigned long shared_vm;	/* Shared pages (files) */
	unsigned long exec_vm;		/* VM_EXEC & ~VM_WRITE */
	unsigned long stack_vm;		/* VM_GROWSUP/DOWN */
	unsigned long def_flags;
	/* Boundary addresses of the code and data segments. */
	unsigned long start_code, end_code, start_data, end_data;
	/* Heap start and current break, and the start of the stack. */
	unsigned long start_brk, brk, start_stack;
	/* Boundary addresses of the command-line arguments and the environment. */
	unsigned long arg_start, arg_end, env_start, env_end;

	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

	/*
	 * Special counters, in some configurations protected by the
	 * page_table_lock, in other configurations by being atomic.
	 */
	struct mm_rss_stat rss_stat;

	struct linux_binfmt *binfmt;

	cpumask_var_t cpu_vm_mask_var;

	/* Architecture-specific MM context */
	mm_context_t context;

	unsigned long flags; /* Must use atomic bitops to access the bits */

	struct core_state *core_state; /* coredumping support */
#ifdef CONFIG_AIO
	spinlock_t			ioctx_lock;
	struct kioctx_table __rcu	*ioctx_table;
#endif
#ifdef CONFIG_MEMCG
	/*
	 * "owner" points to a task that is regarded as the canonical
	 * user/owner of this mm. All of the following must be true in
	 * order for it to be changed:
	 *
	 * current == mm->owner
	 * current->mm != mm
	 * new_owner->mm == mm
	 * new_owner->alloc_lock is held
	 */
	struct task_struct __rcu *owner;
#endif

	/* store ref to file /proc/<pid>/exe symlink points to */
	struct file __rcu *exe_file;
#ifdef CONFIG_MMU_NOTIFIER
	struct mmu_notifier_mm *mmu_notifier_mm;
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
	pgtable_t pmd_huge_pte; /* protected by page_table_lock */
#endif
#ifdef CONFIG_CPUMASK_OFFSTACK
	struct cpumask cpumask_allocation;
#endif
#ifdef CONFIG_NUMA_BALANCING
	/*
	 * numa_next_scan is the next time that the PTEs will be marked
	 * pte_numa. NUMA hinting faults will gather statistics and migrate
	 * pages to new nodes if necessary.
	 */
	unsigned long numa_next_scan;

	/* Restart point for scanning and setting pte_numa */
	unsigned long numa_scan_offset;

	/* numa_scan_seq prevents two threads setting pte_numa */
	int numa_scan_seq;
#endif
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
	/*
	 * An operation with batched TLB flushing is going on. Anything that
	 * can move process memory needs to flush the TLB when moving a
	 * PROT_NONE or PROT_NUMA mapped page.
	 */
	bool tlb_flush_pending;
#endif
	struct uprobes_state uprobes_state;
#ifdef CONFIG_X86_INTEL_MPX
	/* address of the bounds directory */
	void __user *bd_addr;
#endif
};

其中包含了内存区域链接及树，用户数，使用数，mm_struct链表、代码段、数据、堆、栈、命令行、环境变量的首末地址等。

mm_struct这里重点关注 *mmap，它是指向vm_area_struct结构体的指针。整个地址空间通过mm_struct描述，地址空间被分成很多内存区域VMA，每个区域通过vm_area_struct结构体描述。如代码段、数据段、堆、栈可能分别位于不同的VMA区域。数据结构如下:

/*
 * This struct defines a memory VMM memory area. There is one of these
 * per VM-area/task.  A VM area is any part of the process virtual memory
 * space that has a special rule for the page-fault handlers (ie a shared
 * library, the executable area etc).
 *
 * Each area records its [vm_start, vm_end) address range, its links into
 * the per-task list and rb-tree, the owning mm_struct, its protection and
 * flags, and information about the backing file, if any.
 */
struct vm_area_struct {
	/* The first cache line has the info for VMA tree walking. */
    // start address
	unsigned long vm_start;		/* Our start address within vm_mm. */
	// end address
    unsigned long vm_end;		/* The first byte after our end address
					   within vm_mm. */

	/* linked list of VM areas per task, sorted by address */
	// linked-list neighbours
    struct vm_area_struct *vm_next, *vm_prev;
    // this area's node in the rb-tree
	struct rb_node vm_rb;

	/*
	 * Largest free memory gap in bytes to the left of this VMA.
	 * Either between this VMA and vma->vm_prev, or between one of the
	 * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
	 * get_unmapped_area find a free area of the right size.
	 */
	unsigned long rb_subtree_gap;

	/* Second cache line starts here. */

    // the mm_struct this area belongs to
	struct mm_struct *vm_mm;	/* The address space we belong to. */
	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
    // attribute flags, e.g. whether the area is readable/writable
	unsigned long vm_flags;		/* Flags, see mm.h. */

	/*
	 * For areas with an address space and backing store,
	 * linkage into the address_space->i_mmap interval tree.
	 */
	struct {
		struct rb_node rb;
		unsigned long rb_subtree_last;
	} shared;

	/*
	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
	 * list, after a COW of one of the file pages.	A MAP_SHARED vma
	 * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
	 * or brk vma (with NULL file) can only be in an anon_vma list.
	 */
	struct list_head anon_vma_chain; /* Serialized by mmap_sem &
					  * page_table_lock */
	struct anon_vma *anon_vma;	/* Serialized by page_table_lock */

	/* Function pointers to deal with this struct. */
    // table of operation callbacks for this area
	const struct vm_operations_struct *vm_ops;

	/* Information about our backing store: */
	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
					   units, *not* PAGE_CACHE_SIZE */
	struct file * vm_file;		/* File we map to (can be NULL). */
	void * vm_private_data;		/* was vm_pte (shared mem) */

#ifndef CONFIG_MMU
	struct vm_region *vm_region;	/* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
#endif
};

二、进程创建及程序加载

当我们在shell中通过./启动应用程序的时候，涉及到新进程的创建，之后新进程会将程序的各个代码段加载，产生进程地址空间，然后开始运行程序。

linux进程创建通过fork()来产生，在应用层使用fork系统调用是一个简单的事情，fork的复杂之处在于内核中fork的实现。

应用层fork

拷贝当前进程创建一个子进程，子进程与父进程区别仅在于pid和ppid和某些单独进程的资源(如挂起的信号，子进程没必要继承)。

如下，fork创建子进程后，子进程返回0，父进程返回子进程pid。且父子进程均从fork调用后开始被调度。说明了子进程前期的代码段，数据段与父进程一致。

下面例子中子进程首先退出，但是父进程没有通过waitpid获取子进程的退出状态，因此子进程在退出后将变成僵尸进程。

如果父子进程都存在的时候，杀死父进程，那么子进程会变成孤儿进程被init进程接管，因此子进程的父进程会变为pid为1的init进程。

#include<stdio.h>
#include<unistd.h>
#include<stdlib.h>
#include<sys/types.h>

/*
 * fork() demonstration.
 *
 * The child prints its pid/ppid, sleeps for roughly a minute and exits with
 * status 1. The parent prints its pid/ppid and sleeps for roughly two
 * minutes. The parent deliberately never calls wait()/waitpid(), so once the
 * child has exited it remains a zombie until the parent itself terminates.
 *
 * Returns 0 from the parent on success, 1 if fork() fails.
 */
int main(void)
{
    pid_t pid = fork(); /* create the child process */

    if (pid < 0)
    {
        /* fork failed: no child was created, so report the reason and fail */
        perror("fork");
        return 1;
    }

    if (pid == 0)
    {
        /* child process: fork() returned 0 */
        int cnt = 0;

        /* pid_t is cast to int so the argument type matches %d */
        printf("I am child: pid:%d ppid:%d\n", (int)getpid(), (int)getppid());

        /* sleep one second per iteration; leave once cnt has reached 60 */
        while (1)
        {
            sleep(1);
            if (cnt == 60)
                break;
            cnt++;
        }

        exit(1); /* terminate the child */
    }

    /* parent process: fork() returned the child's pid (> 0) */
    {
        int cnt = 0;

        printf("I am father: pid:%d ppid:%d\n", (int)getpid(), (int)getppid());

        /* outlive the child (120 vs 60) without reaping it */
        while (1)
        {
            sleep(1);
            if (cnt == 120)
                break;
            cnt++;
        }
    }

    return 0;
}

内核层fork

应用层通过fork来到内核态的时候，主要任务:

为子进程分配所需要的内存和数据结构
将父进程的资源拷贝一部分给子进程继承
将子进程加载到内核进程表
调度器调度子进程

clone系统调用

fork、vfork都调用了clone系统调用。其在内核中调用了 do_fork()函数实现，因此重点在于do_fork的分析。

/*
 * clone system call entry: forwards the request to do_fork().
 *
 * clone_flags, newsp, parent_tidptr and child_tidptr are passed through
 * unchanged (newsp becomes do_fork()'s stack_start); the stack_size argument
 * is always 0. tls_val is accepted but not forwarded by this wrapper.
 * Returns whatever do_fork() returns.
 */
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
         int __user *, parent_tidptr,
         int, tls_val,
         int __user *, child_tidptr)
{
    return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
}

do_fork内核实现

/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * @clone_flags:   CLONE_* flags (plus the CSIGNAL bits) selecting behaviour
 * @stack_start:   passed through to copy_process()
 * @stack_size:    passed through to copy_process()
 * @parent_tidptr: user pointer that receives the new id when
 *                 CLONE_PARENT_SETTID is set
 * @child_tidptr:  passed through to copy_process()
 *
 * Returns pid_vnr() of the new task's pid on success, or PTR_ERR() of the
 * error pointer returned by copy_process() on failure.
 */
long do_fork(unsigned long clone_flags,
	      unsigned long stack_start,
	      unsigned long stack_size,
	      int __user *parent_tidptr,
	      int __user *child_tidptr)
{
	struct task_struct *p;
	int trace = 0;
	long nr;

	/*
	 * Determine whether and which event to report to ptracer.  When
	 * called from kernel_thread or CLONE_UNTRACED is explicitly
	 * requested, no event is reported; otherwise, report if the event
	 * for the type of forking is enabled.
	 */
    // decode the flags into a ptrace event type (vfork / clone / fork)
	if (!(clone_flags & CLONE_UNTRACED)) {
		if (clone_flags & CLONE_VFORK)
			trace = PTRACE_EVENT_VFORK;
		else if ((clone_flags & CSIGNAL) != SIGCHLD)
			trace = PTRACE_EVENT_CLONE;
		else
			trace = PTRACE_EVENT_FORK;

		/* drop the event again unless it is enabled for the current task */
		if (likely(!ptrace_event_enabled(current, trace)))
			trace = 0;
	}
    
    // copy_process() does the bulk of the work; it yields the new task or an error pointer
	p = copy_process(clone_flags, stack_start, stack_size,
			 child_tidptr, NULL, trace);
	/*
	 * Do this prior waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
	if (!IS_ERR(p)) {
		struct completion vfork;
		struct pid *pid;

		trace_sched_process_fork(current, p);

        // look up the pid assigned to the child; nr is the value returned to the caller
		pid = get_task_pid(p, PIDTYPE_PID);
		nr = pid_vnr(pid);

		/* NOTE(review): the result of put_user() is not checked here */
		if (clone_flags & CLONE_PARENT_SETTID)
			put_user(nr, parent_tidptr);

		/* for vfork, arm a completion and take a task reference before the child can run */
		if (clone_flags & CLONE_VFORK) {
			p->vfork_done = &vfork;
			init_completion(&vfork);
			get_task_struct(p);
		}

        // presumably places the child on a run queue so the scheduler can pick it -- confirm
		wake_up_new_task(p);

		/* forking complete and child started to run, tell ptracer */
		if (unlikely(trace))
			ptrace_event_pid(trace, pid);

		if (clone_flags & CLONE_VFORK) {
            // with CLONE_VFORK, wait on the completion armed above before returning
			if (!wait_for_vfork_done(p, &vfork))
				ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
		}

		/* release the reference obtained via get_task_pid() above */
		put_pid(pid);
	} else {
		nr = PTR_ERR(p);
	}
	return nr;
}