命名空间是一种资源划分方案。资源有很多类别,比如:process IDs, hostnames, user IDs, file names, and some names associated with network access, and interprocess communication.
Process ID (pid)
一个PID命名空间具有独立的process IDs空间。PID命名空间是有层级的,上一级命名空间可以看到其所有下级的PIDs。从而最初始的命名空间可看到所有的进程。PID命名空间中第一个进程ID为1,它和init进程一样特殊,最典型的就是负责接管所有的孤儿进程(orphaned processes)。终止进程PID 1将直接终止其所在的以及所有下级PID命名空间里的进程。
现在让我们走进Linux的实现:
1. 进程与命名空间[1]
/*
 * Excerpt of struct task_struct: only the namespace-related member is
 * shown; the "..." lines stand for members elided from this listing.
 */
struct task_struct {
...
/* Namespaces: */
/* Pointer to the struct nsproxy that groups this task's namespaces. */
struct nsproxy *nsproxy;
...
};
每个进程绑定的命名空间[2]
/*
* A structure to contain pointers to all per-process
* namespaces - fs (mount), uts, network, sysvipc, etc.
*
* The pid namespace is an exception -- it's accessed using
* task_active_pid_ns. The pid namespace here is the
* namespace that children will use.
*
* 'count' is the number of tasks holding a reference.
* The count for each namespace, then, will be the number
* of nsproxies pointing to it, not the number of tasks.
*
* The nsproxy is shared by tasks which share all namespaces.
* As soon as a single namespace is cloned or unshared, the
* nsproxy is copied.
*/
struct nsproxy {
/* Number of tasks holding a reference to this nsproxy. */
atomic_t count;
struct uts_namespace *uts_ns; /* UTS (UNIX Timesharing System) namespace */
struct ipc_namespace *ipc_ns; /* IPC (sysvipc) namespace */
struct mnt_namespace *mnt_ns; /* mount (fs) namespace */
/*
 * PID namespace that children of the task will use. This is not
 * necessarily the task's own PID namespace, which is obtained with
 * task_active_pid_ns() instead.
 */
struct pid_namespace *pid_ns_for_children;
struct net *net_ns; /* network-related namespace parameters */
struct time_namespace *time_ns; /* time namespace */
/*
 * NOTE(review): by analogy with pid_ns_for_children this is presumably
 * the time namespace that children will use -- confirm.
 */
struct time_namespace *time_ns_for_children;
struct cgroup_namespace *cgroup_ns; /* cgroup namespace */
};
/*
 * An nsproxy instance defined elsewhere.
 * NOTE(review): presumably the nsproxy of the initial task -- confirm.
 */
extern struct nsproxy init_nsproxy;
上述结构只提供子进程的PID命名空间,进程自身的PID命名空间通过下面函数获得[3]
/*
 * task_active_pid_ns() - PID namespace that @tsk itself belongs to.
 *
 * Looks up the struct pid attached to @tsk with task_pid() and returns the
 * namespace recorded for it by ns_of_pid(). The result is whatever
 * ns_of_pid() yields, including NULL when the task has no struct pid.
 */
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
	struct pid *tsk_pid = task_pid(tsk);

	return ns_of_pid(tsk_pid);
}
EXPORT_SYMBOL_GPL(task_active_pid_ns);
// https://github.com/torvalds/linux/blob/master/include/linux/sched.h#L1309
/*
 * task_pid() - struct pid attached to @task.
 *
 * Plain accessor for task->thread_pid; the pointer is returned as stored.
 */
static inline struct pid *task_pid(struct task_struct *task)
{
	struct pid *attached = task->thread_pid;

	return attached;
}
// https://github.com/torvalds/linux/blob/master/include/linux/pid.h#L144
/*
 * ns_of_pid() - PID namespace in which @pid was allocated.
 *
 * The namespace is read from the upid entry at index @pid->level.
 *
 * NOTE:
 *	Callers are expected to pass the struct pid of a task that has one
 *	attached (see attach_pid(), detach_pid()), i.e. a non-NULL @pid.
 *	A NULL @pid is tolerated: the function then returns NULL and the
 *	caller has to cope with the missing pid namespace.
 */
static inline struct pid_namespace *ns_of_pid(struct pid *pid)
{
	if (!pid)
		return NULL;

	return pid->numbers[pid->level].ns;
}
// 一个PID命名空间init进程的判断
/*
 * is_child_reaper() - is @pid the init process of its namespace?
 *
 * The decision is made on the pid number (it must be 1 in the entry at
 * index @pid->level) rather than on pid_ns->child_reaper, because this
 * helper may be called before child_reaper has been assigned in
 * copy_process.
 */
static inline bool is_child_reaper(struct pid *pid)
{
	int nr_in_own_ns = pid->numbers[pid->level].nr;

	return nr_in_own_ns == 1;
}
2. PID命名空间数据结构 [4]
/*
 * Values stored in the hide_pid field of struct pid_namespace.
 * The numeric values are spelled out explicitly and must not change.
 */
enum {
	HIDEPID_OFF       = 0,
	HIDEPID_NO_ACCESS = 1,
	HIDEPID_INVISIBLE = 2,
};
/*
 * State kept for one PID namespace. Namespaces are linked to their
 * parent namespace through the 'parent' member.
 */
struct pid_namespace {
struct kref kref;
struct idr idr;
struct rcu_head rcu;
unsigned int pid_allocated;
/* Task acting as the init process (child reaper) of this namespace. */
struct task_struct *child_reaper;
struct kmem_cache *pid_cachep;
/*
 * NOTE(review): presumably the depth of this namespace in the
 * hierarchy; init_pid_ns sets it to 0 -- confirm.
 */
unsigned int level;
struct pid_namespace *parent; /* parent namespace */
#ifdef CONFIG_PROC_FS
struct dentry *proc_self;
struct dentry *proc_thread_self;
#endif
#ifdef CONFIG_BSD_PROCESS_ACCT
struct fs_pin *bacct;
#endif
struct user_namespace *user_ns;
struct ucounts *ucounts;
kgid_t pid_gid;
/* Holds one of the HIDEPID_* values. */
int hide_pid;
int reboot; /* group exit code if this pidns was rebooted */
struct ns_common ns;
} __randomize_layout;
/* The global default PID namespace; defined elsewhere. */
extern struct pid_namespace init_pid_ns;
2. 初始化全局默认PID命名空间[5]
/*
 * PID-map pages start out as NULL, they get allocated upon
 * first use and are never deallocated. This way a low pid_max
 * value does not cause lots of bitmaps to be allocated, but
 * the scheme scales to up to 4 million PIDs, runtime.
 *
 * NOTE(review): the paragraph above talks about PID-map bitmaps, while
 * the initializer below sets up an IDR (.idr) -- the text may be stale;
 * confirm before relying on it.
 *
 * init_pid_ns is the global default PID namespace: level 0, init_task as
 * its child reaper and init_user_ns as its user namespace. Members not
 * listed here (e.g. 'parent') are zero-initialized.
 */
struct pid_namespace init_pid_ns = {
/* NOTE(review): why the reference count starts at 2 is not visible here. */
.kref = KREF_INIT(2),
.idr = IDR_INIT(init_pid_ns.idr),
.pid_allocated = PIDNS_ADDING,
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns,
.ns.inum = PROC_PID_INIT_INO,
/* The namespace operations are only wired up when CONFIG_PID_NS is set. */
#ifdef CONFIG_PID_NS
.ns.ops = &pidns_operations,
#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);
3. PID与Namespace的关联[6]
/*
 * Statically initialized struct pid: a single upid entry at level 0 that
 * carries number 0 in init_pid_ns, with one reference held.
 */
struct pid init_struct_pid = {
.count = REFCOUNT_INIT(1),
/*
 * Empty task lists. Three heads are written out; struct pid declares
 * tasks[PIDTYPE_MAX], and any element not listed here is
 * zero-initialized.
 */
.tasks = {
{ .first = NULL },
{ .first = NULL },
{ .first = NULL },
},
.level = 0,
.numbers = { {
.nr = 0, /* PID number as seen in the namespace below */
.ns = &init_pid_ns, /* namespace this number belongs to */
}, }
};
3.1 Related Structs[7]
/*
 * Index into the per-type task lists of struct pid
 * (struct pid::tasks[PIDTYPE_MAX]).
 */
enum pid_type {
	PIDTYPE_PID = 0,
	PIDTYPE_TGID,
	PIDTYPE_PGID,
	PIDTYPE_SID,
	PIDTYPE_MAX,	/* number of types above; used as an array bound */
};
/*
 * struct upid is used to get the id of the struct pid, as it is
 * seen in particular namespace. Later the struct pid is found with
 * find_pid_ns() using the int nr and struct pid_namespace *ns.
 */
struct upid {
int nr; /* numeric id as seen in the namespace 'ns' */
struct pid_namespace *ns; /* namespace in which 'nr' is valid */
};
/*
 * Kernel-internal representation of a process identifier. One struct pid
 * carries a (number, namespace) pair per entry of 'numbers'.
 */
struct pid
{
refcount_t count;
/*
 * Index of the numbers[] entry that describes the namespace in which
 * this pid was allocated (this is the entry ns_of_pid() reads).
 */
unsigned int level;
spinlock_t lock;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
struct hlist_head inodes;
/* wait queue for pidfd notifications */
wait_queue_head_t wait_pidfd;
struct rcu_head rcu;
/*
 * Declared with one element but indexed with 'level'.
 * NOTE(review): presumably allocated with extra trailing entries for
 * nested namespaces -- confirm against the allocation code.
 */
struct upid numbers[1];
};
4. PID命名空间的创建
两种方式:
(1)fork or clone: 使用特定选项。
(2)unshare系统调用将进程的某些部分从父进程分离,其中也包括命名空间。
预定义的Fork/Clone的命名空间FLAG:
// https://github.com/torvalds/linux/blob/master/include/uapi/linux/sched.h#L8
/*
* cloning flags:
*/
...
/*
 * Clone flags that request a new namespace of the named kind.
 * Excerpt: the "..." line stands for flags elided from this listing.
 */
#define CLONE_NEWNS 0x00020000 /* New mount namespace group */
...
#define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
#define CLONE_NEWUTS 0x04000000 /* New utsname namespace */
#define CLONE_NEWIPC 0x08000000 /* New ipc namespace */
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
#define CLONE_NEWPID 0x20000000 /* New pid namespace */
#define CLONE_NEWNET 0x40000000 /* New network namespace */
[1] https://github.com/torvalds/linux/blob/master/include/linux/sched.h#L922
[2] https://github.com/torvalds/linux/blob/master/include/linux/nsproxy.h#L16
[3] https://github.com/torvalds/linux/blob/master/kernel/pid.c#L487
[4] https://github.com/torvalds/linux/blob/master/include/linux/pid_namespace.h
[5] https://github.com/torvalds/linux/blob/master/kernel/pid.c#L73
[6] https://github.com/torvalds/linux/blob/master/kernel/pid.c#L56
[7] https://github.com/torvalds/linux/blob/master/include/linux/pid.h#L48