user_namespace
/*
 * The initial user namespace.
 *
 * The uid, gid and projid maps each contain exactly one extent that maps
 * id 0 at this level onto id 0 at the lower level for 4294967295 ids,
 * i.e. an identity mapping over [0, 4294967294].
 */
struct user_namespace init_user_ns = {
.uid_map = {
.nr_extents = 1, /* a single extent is in use */
.extent[0] = {
.first = 0, /* first id as seen at this level */
.lower_first = 0, /* first id at the lower level */
.count = 4294967295U, /* number of consecutive ids covered */
},
},
.gid_map = {
.nr_extents = 1,
.extent[0] = {
.first = 0,
.lower_first = 0,
.count = 4294967295U,
},
},
.projid_map = {
.nr_extents = 1,
.extent[0] = {
.first = 0,
.lower_first = 0,
.count = 4294967295U,
},
},
.count = ATOMIC_INIT(3), /* NOTE(review): presumably the reference count - confirm */
.owner = GLOBAL_ROOT_UID, /* owner is the global root uid (0) */
.group = GLOBAL_ROOT_GID,
.proc_inum = PROC_USER_INIT_INO,
};
/*
 * Credentials used by the initial task (INIT_TASK points both real_cred
 * and cred at this object).
 *
 * Every uid/gid variant is the global root id, and user_ns refers to
 * init_user_ns.
 */
struct cred init_cred = {
.usage = ATOMIC_INIT(4),
#ifdef CONFIG_DEBUG_CREDENTIALS
/* only present when credential debugging is configured */
.subscribers = ATOMIC_INIT(2),
.magic = CRED_MAGIC,
#endif
.uid = GLOBAL_ROOT_UID,
.gid = GLOBAL_ROOT_GID,
.suid = GLOBAL_ROOT_UID,
.sgid = GLOBAL_ROOT_GID,
.euid = GLOBAL_ROOT_UID, /* create_user_ns() takes the owner of a new user_ns from euid */
.egid = GLOBAL_ROOT_GID, /* ... and its group from egid */
.fsuid = GLOBAL_ROOT_UID,
.fsgid = GLOBAL_ROOT_GID,
.securebits = SECUREBITS_DEFAULT,
.cap_inheritable = CAP_EMPTY_SET, /* inheritable set starts empty */
.cap_permitted = CAP_FULL_SET, /* permitted, effective and bounding sets start full */
.cap_effective = CAP_FULL_SET,
.cap_bset = CAP_FULL_SET,
.user = INIT_USER,
.user_ns = &init_user_ns, /* the initial credentials live in the initial user namespace */
.group_info = &init_groups,
};
/*
 * The initial pid namespace: level 0, reaped by init_task and associated
 * with init_user_ns.
 */
struct pid_namespace init_pid_ns = {
.kref = {
.refcount = ATOMIC_INIT(2),
},
.pidmap = {
/*
 * Every pidmap entry gets the same initial value.
 * NOTE(review): presumably { free-bit count, bitmap page not yet allocated } - confirm.
 */
[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
},
.last_pid = 0,
.level = 0,
.child_reaper = &init_task,
.user_ns = &init_user_ns, /* user namespace this pid namespace refers to */
.proc_inum = PROC_PID_INIT_INO,
};
/*
 * Namespace proxy of the initial task (INIT_TASK sets .nsproxy to this
 * object).  It collects pointers to the initial uts, ipc, pid and net
 * namespaces; the mount namespace pointer starts out as NULL.
 */
struct nsproxy init_nsproxy = {
.count = ATOMIC_INIT(1),
.uts_ns = &init_uts_ns,
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
/* ipc namespace is only set when POSIX mqueues or SysV IPC are configured */
.ipc_ns = &init_ipc_ns,
#endif
.mnt_ns = NULL,
.pid_ns = &init_pid_ns,
#ifdef CONFIG_NET
/* network namespace is only set when networking is configured */
.net_ns = &init_net,
#endif
};
/* Excerpt of the INIT_TASK initializer; the remainder of the macro is not shown. */
#define INIT_TASK(tsk) \
{
RCU_POINTER_INITIALIZER(real_cred, &init_cred), \
RCU_POINTER_INITIALIZER(cred, &init_cred), /* this also sets up the user_ns reachable through cred (init_cred.user_ns is &init_user_ns) */ \
.nsproxy = &init_nsproxy, /* per the original note: this sets up the user_ns under the pid, uts and ipc namespaces */ \
int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
int cap, int audit)
{
if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid))
return 0;
.owner = GLOBAL_ROOT_UID, //初始化为0
.group = GLOBAL_ROOT_GID,
.proc_inum = PROC_USER_INIT_INO,
};
sys_clone ->do_fork -> copy_process ->copy_creds ->create_user_ns
进程 —— cred
/* Excerpt of copy_creds(): only the CLONE_NEWUSER handling is shown. */
int copy_creds(struct task_struct *p, unsigned long clone_flags)
{
new = prepare_creds(); /* first prepare a new cred; per the original note it is a full memory copy of the old cred */
if (!new)
return -ENOMEM;
if (clone_flags & CLONE_NEWUSER) {
ret = create_user_ns(new);
if (ret < 0)
goto error_put;
}
/* without CLONE_NEWUSER the owner of the user_ns is left unchanged */
/* Excerpt of create_user_ns(): only the initial mapping check is shown. */
int create_user_ns(struct cred *new)
{
struct user_namespace *ns, *parent_ns = new->user_ns;
kuid_t owner = new->euid; /* the owner of the new user_ns is cred->euid */
kgid_t group = new->egid; /* likewise, its group is cred->egid */
int ret;
/* The creator needs a mapping in the parent user namespace
* or else we won't be able to reasonably tell userspace who
* created a user_namespace.
*/
if (!kuid_has_mapping(parent_ns, owner) ||
!kgid_has_mapping(parent_ns, group))
return -EPERM;
struct cred init_cred = {
.euid = GLOBAL_ROOT_UID,
.egid = GLOBAL_ROOT_GID, //默认初始euid和egid都为0
Mar 1 11:33:48 linux-haj6 kernel: [ 3740.307809] wyf:oldns:uid_map:create_user_ns:nr_extents=1
Mar 1 11:33:48 linux-haj6 kernel: [ 3740.307812] wyf:oldns:uid_map:create_user_ns:extent[0]:first=0:lower_first=0:count=4294967295
将owner = 0带入map_id_up:
first = 0
last = 0 + 4294967295 - 1 = 4294967294
0属于[0, 4294967294]
因此map_id_up返回0
/*
 * Map a lower-level id to the id seen at the current level.
 *
 * Searches the extents of @map for one whose lower-level range
 * [lower_first, lower_first + count - 1] contains @id and returns the
 * corresponding id at this level, or (u32) -1 if no extent matches.
 *
 * The trailing comments trace a call with id = 0 against a map holding
 * the single extent { first = 0, lower_first = 0, count = 4294967295 }.
 */
static u32 map_id_up(struct uid_gid_map *map, u32 id) /* traced with id = owner = 0 */
{
unsigned idx, extents;
u32 first, last;
/* Find the matching extent */
extents = map->nr_extents; /* map->nr_extents = 1 */
smp_read_barrier_depends();
for (idx = 0; idx < extents; idx++) {
first = map->extent[idx].lower_first; /* map->extent[0].lower_first = 0, so first = 0 */
last = first + map->extent[idx].count - 1; /* map->extent[0].count = 4294967295, so last = 4294967294 */
if (id >= first && id <= last) /* 0 >= 0 && 0 <= 4294967294: match at idx = 0 */
break;
}
/* Map the id or note failure */
if (idx < extents) /* 0 < 1 */
id = (id - first) + map->extent[idx].first; /* id = 0 - 0 + 0 = 0 */
else
id = (u32) -1;
return id; /* returns 0 */
}
kuid_has_mapping要自己设定,默认都是0。
如果是默认的情况,
usernamespace 1 下用户:user1
usernamespace 2 下用户:user2
sys_execve ->do_execve -> do_execve_common->prepare_binprm -> security_bprm_set_creds ->selinux_bprm_set_creds -> cap_bprm_set_creds
http://lwn.net/Articles/528078/
inode_capable -> kuid_has_mapping
prepare_binprm -> kuid_has_mapping
find_keyring_by_name -> kuid_has_mapping
key_serial_next -> kuid_has_mapping
find_ge_key -> kuid_has_mapping
__key_user_next -> kuid_has_mapping
create_user_ns -> kuid_has_mapping
/proc/PID/ns
/proc/PID/uid_map —— uid映射就是通过写这个文件来设置的
/proc/sys/kernel/overflowuid
通过proc接口来写uid_map信息:proc_uid_map_write
fs/proc/base.c
/*
 * File operations for the uid_map proc file.  Writes go to
 * proc_uid_map_write; reads and seeks use the seq_file helpers.
 */
static const struct file_operations proc_uid_map_operations = {
.open = proc_uid_map_open,
.write = proc_uid_map_write, /* entry point used to set the uid mapping */
.read = seq_read,
.llseek = seq_lseek,
.release = proc_id_map_release,
};
#ifdef CONFIG_USER_NS
REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
#endif
echo "0 1000 100" > /proc/$pid/uid_map
echo "0 1000 100" > /proc/$pid/gid_map
也就是user_ns中的id 0对应host中的id 1000,一共映射100个id:user_ns中的[0,99]对应host中的[1000,1099]
最新的lxc已经给出具体代码如下:
/*
 * Write one id-mapping line to /proc/<pid>/uid_map or /proc/<pid>/gid_map.
 *
 * idtype     - ID_TYPE_UID selects uid_map, any other value selects gid_map
 * pid        - process whose map file is written
 * host_start - first id of the range on the host side
 * ns_start   - first id of the range inside the namespace
 * range      - number of consecutive ids to map
 *
 * The line written is "<ns_start> <host_start> <range>".
 *
 * Returns 0 on success, -E2BIG if the proc path does not fit in PATH_MAX,
 * -EINVAL if the file cannot be opened, or another negative value if the
 * mapping could not be written.
 */
int add_id_mapping(enum idtype idtype, pid_t pid, uid_t host_start, uid_t ns_start, int range)
{
	char path[PATH_MAX];
	int ret;
	FILE *f;

	ret = snprintf(path, PATH_MAX, "/proc/%d/%cid_map", pid, idtype == ID_TYPE_UID ? 'u' : 'g');
	if (ret < 0 || ret >= PATH_MAX) {
		fprintf(stderr, "%s: path name too long", __func__);
		return -E2BIG;
	}

	f = fopen(path, "w");
	if (!f) {
		perror("open");
		return -EINVAL;
	}

	/*
	 * Ids are printed as unsigned values so that large ids are not
	 * written out as negative numbers.
	 */
	ret = fprintf(f, "%u %u %d", (unsigned int)ns_start, (unsigned int)host_start, range);
	if (ret < 0)
		perror("write");

	/*
	 * stdio buffers the line, so the real write to the proc file may only
	 * happen when the stream is flushed by fclose().  A mapping rejected
	 * at that point must be reported as a failure, not silently dropped.
	 */
	if (fclose(f) != 0 && ret >= 0) {
		perror("write");
		ret = -1;
	}

	return ret < 0 ? ret : 0;
}
int lxc_spawn(struct lxc_handler *handler)
{
/* Create a process in a new set of namespaces */
handler->pid = lxc_clone(do_start, handler, handler->clone_flags);
if (lxc_map_ids(&handler->conf->id_map, handler->pid)) {
ERROR("failed to set up id mapping");
goto out_delete_net;
}
linux-haj6:/usr/src/linux-3.7.8 # id root
uid=0(root) gid=0(root) groups=105(sfcb),0(root)
linux-haj6:/usr/src/linux-3.7.8 # id wangyufei
uid=1000(wangyufei) gid=100(users) groups=16(dialout),33(video),100(users)
设置了uid_map的mount动作
Mar 5 10:48:37 linux-haj6 kernel: [87268.547947] wyf:may_mount:user_ns=ffff8816059acd98:parent=ffffffff81a30ea0:owner=0:group=0:proc_inum=4026532326
Mar 5 10:48:37 linux-haj6 kernel: [87268.547949] wyf:uid_map:may_mount:nr_extents=1
Mar 5 10:48:37 linux-haj6 kernel: [87268.547952] wyf:uid_map:may_mount:extent[0]:first=0:lower_first=1000:count=100
Mar 5 10:48:37 linux-haj6 kernel: [87268.547954] wyf:uid_map:may_mount:extent[1]:first=0:lower_first=0:count=0
Mar 5 10:48:37 linux-haj6 kernel: [87268.547956] wyf:uid_map:may_mount:extent[2]:first=0:lower_first=0:count=0
Mar 5 10:48:37 linux-haj6 kernel: [87268.547959] wyf:uid_map:may_mount:extent[3]:first=0:lower_first=0:count=0
Mar 5 10:48:37 linux-haj6 kernel: [87268.547961] wyf:uid_map:may_mount:extent[4]:first=0:lower_first=0:count=0
/*
 * Translate an id from the lower level into the id used at the current
 * level.
 *
 * Each extent of @map describes a lower-level range starting at
 * lower_first and spanning count ids.  If @id falls inside one of those
 * ranges, its offset within the range is applied to the extent's
 * current-level start (first).  If no extent contains @id, (u32) -1 is
 * returned.
 */
static u32 map_id_up(struct uid_gid_map *map, u32 id)
{
	unsigned int pos, nr;
	u32 lower_start, lower_end;

	/* Read the extent count before looking at the extents themselves. */
	nr = map->nr_extents;
	smp_read_barrier_depends();

	pos = 0;
	while (pos < nr) {
		/* Lower-level range covered by this extent. */
		lower_start = map->extent[pos].lower_first;
		lower_end = lower_start + map->extent[pos].count - 1;

		/* Inside the range: return the matching current-level id. */
		if (lower_start <= id && id <= lower_end)
			return (id - lower_start) + map->extent[pos].first;

		pos++;
	}

	/* No extent covers this id. */
	return (u32) -1;
}
每个用户空间可以配置其专有权限的uid范围
sys_execve ->do_execve -> do_execve_common->prepare_binprm -> security_bprm_set_creds ->selinux_bprm_set_creds -> cap_bprm_set_creds
map_id_down——将当前层id映射成底层id
map_id_up——将底层id映射成当前层id
kernel/timer.c
/*
 * getuid system call: returns the calling task's uid as seen from the
 * caller's own user namespace.
 *
 * NOTE(review): from_kuid_munged() presumably substitutes the overflow uid
 * when the uid has no mapping in that namespace - confirm against its
 * definition.
 */
SYSCALL_DEFINE0(getuid)
{
/* Only we change this so SMP safe */
return from_kuid_munged(current_user_ns(), current_uid());
}
分析研究权限相关问题。
1)只有root用户才可以写/proc下的uid_map。其它用户即使赋给了所有的cap,将其uid euid gid egid等都变成了root的0,也不会成功写入。
非root用户是不可能通过setuid或者seteuid取得其他权限(包括root权限)的,它只能恢复原来的权限。允许通过setuid或者seteuid取得root权限是非常危险的,这样他就可以在程序的后边做任何想做的事了(包括kill掉你的系统)。只能通过exec一个设置了setuid位的可执行程序,来取得其他(程序文件所有者)权限(包括root权限)
2)要设置uid_map就一定要root用户权限,随后,要在clone出来的new_namespace中做clone等动作就要检查uid是不是在设定的map范围内,如root当初设定了这个new usernamespace中uid的范围为[1000,1099],那么随后的root因为uid为0就无法在其clone出的空间中执行一些动作(作茧自缚),这时root可以通过setuid将自己的身份变为[1000,1099]范围内的,如1000,则后续的动作就会合法了。
另外,普通用户要想成功的clone出空间,并在其中做一些动作,也要先获得root权限,设定map,如下:
先将xxx 设置成root拥有的setuid的可执行文件。然后在其clone出来的newns中uid应该为普通用户id,euid为0,再用seteuid将其设置回普通用户id,然后在newns下的操作,包括clone在内就合法了。
3)通过setcap命令可以设置可执行文件的capability,使调用它的用户拥有设定的capability,属于setuid动作的子集。
libcap-progs - Libcap utility programs——setcap命令的安装包
2013年5月26日上传