runc rootless模式代码解析
本来以为在user namespace中映射出root权限后,就可以直接mount /proc、/dev,从而实现容器隔离。但实际测试发现,在userns中即使拥有映射出来的root权限,也不能把udev mount到自身的/dev上。然而docker实现了类似的功能,所以这里研究了相应的实现。docker通过containerd与runc交互来创建容器;而runc本身可以直接以rootless模式启动,让普通用户映射为容器内的root,并生成/dev下的设备文件。这里研究的是/dev目录下文件的生成过程。
“libcontainer/rootfs_linux.go”
// mountToRootfs mounts m inside the container root filesystem rootfs.
//
// This is an abridged excerpt: "..." marks elided code, the bodies of the
// "if err := ...;" checks are omitted, and the closing braces of the switch
// and of the function are not shown.
func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error {
var (
dest = m.Destination
)
// Make dest a path under rootfs unless it already starts with rootfs.
if !strings.HasPrefix(dest, rootfs) {
dest = filepath.Join(rootfs, dest)
}
// Dispatch on the device (filesystem type) named by the mount.
switch m.Device {
...
case "tmpfs":
...
// Perform the mount itself and apply its propagation flags.
if err := mountPropagate(m, rootfs, mountLabel);
// copyUp and tmpDir are presumably set up in the elided code above.
if copyUp {
// Copy the contents of dest into tmpDir, then move the mount at tmpDir
// onto dest with MS_MOVE.
if err := fileutils.CopyDirectory(dest, tmpDir);
if err := unix.Mount(tmpDir, dest, "", unix.MS_MOVE, "")
}
函数mount_root()挂载ROOT_DEV(真正的根文件系统所在块设备)对应的设备到rootfs的/root目录下。根文件系统挂载后,进程的当前目录是/root。sys_mount(".", "/", NULL, MS_MOVE, NULL);将当前目录下的实际根文件系统移到根目录"/"下。sys_chroot((const char __user __force *)".");将当前目录设为进程的根目录。
// mountPropagate mounts m.Source at dest and then applies each of the
// mount's propagation flags to that mount point.
//
// This is an abridged excerpt: "..." marks elided code (including where dest,
// flags and data are computed), the return type is not shown, and the error
// checks are omitted.
func mountPropagate(m *configs.Mount, rootfs string, mountLabel string)
{
// Clean-up handling for the /dev directory (code elided).
...
// The actual mount of the source onto dest.
if err := unix.Mount(m.Source, dest, m.Device, uintptr(flags), data);
// Re-mount dest once per propagation flag; only the flag argument is set.
for _, pflag := range m.PropagationFlags {
unix.Mount("", dest, "", uintptr(pflag), "")
}
...
}
// CopyDirectory copies the files under the source directory
// to dest directory. The dest directory is created if it
// does not exist.
//
// This is an abridged excerpt: the error checks are elided (the results of
// the calls below appear unchecked here), and the "if err != nil {" inside
// the walk callback has neither its body nor its closing brace.
func CopyDirectory(source string, dest string) error {
fi, err := os.Stat(source)
// Get owner.
st, ok := fi.Sys().(*syscall.Stat_t)
// We have to pick an owner here anyway.
// The source directory's mode, uid and gid are passed on for dest.
MkdirAllNewAs(dest, fi.Mode(), int(st.Uid), int(st.Gid));
// Walk everything under source and recreate each entry under dest.
return filepath.Walk(source, func(path string, info os.FileInfo, err error) error {
// Get the relative path
relPath, err := filepath.Rel(source, path)
if err != nil {
if info.IsDir() {
// Skip the source directory.
if path != source {
// Get the owner.
st, ok := info.Sys().(*syscall.Stat_t)
uid := int(st.Uid)
gid := int(st.Gid)
// Recreate the directory under dest with the same mode, then set its owner.
os.Mkdir(filepath.Join(dest, relPath), info.Mode());
os.Lchown(filepath.Join(dest, relPath), uid, gid);
}
return nil
}
// Copy the file.
CopyFile(path, filepath.Join(dest, relPath));
return nil
})
}
mknod 创建设备节点
这里根据host上设备文件的主、次设备号,用mknod在目标位置创建相同的设备节点,相当于复制了host的设备文件。
// CopyFile copies the file at source to dest
//
// Symbolic links, block/character device nodes and regular files are each
// recreated at dest; afterwards the owner and (except for symlinks) the mode
// of the source are applied to dest.
//
// This is an abridged excerpt: the error checks are elided, and the
// "if err := os.Lchown(...);" line has no body.
func CopyFile(source string, dest string) error {
// Lstat so that a symlink is inspected itself rather than its target.
si, err := os.Lstat(source)
st, ok := si.Sys().(*syscall.Stat_t)
uid := int(st.Uid)
gid := int(st.Gid)
// Handle symlinks
// Recreate the link at dest pointing at the same target string.
if si.Mode()&os.ModeSymlink != 0 {
target, err := os.Readlink(source)
os.Symlink(target, dest);
}
// Handle device files
// A block or character device is recreated with mknod, using the major and
// minor numbers taken from the source's Rdev.
if st.Mode&syscall.S_IFMT == syscall.S_IFBLK || st.Mode&syscall.S_IFMT == syscall.S_IFCHR {
devMajor := int64(major(uint64(st.Rdev)))
devMinor := int64(minor(uint64(st.Rdev)))
// Start from the low mode bits, then add back the device-type bit.
mode := uint32(si.Mode() & 07777)
if st.Mode&syscall.S_IFMT == syscall.S_IFBLK {
mode |= syscall.S_IFBLK
}
if st.Mode&syscall.S_IFMT == syscall.S_IFCHR {
mode |= syscall.S_IFCHR
}
syscall.Mknod(dest, mode, int(mkdev(devMajor, devMinor)));
}
// Handle regular files: copy the file contents
if si.Mode().IsRegular() {
sf, err := os.Open(source)
defer sf.Close()
df, err := os.Create(dest)
defer df.Close()
_, err = io.Copy(df, sf)
}
// Chown the file
// Lchown changes the owner of dest itself, without following a symlink.
if err := os.Lchown(dest, uid, gid);
// Chmod the file
// os.Chmod follows symlinks, so it is skipped when the source is a symlink.
if !(si.Mode()&os.ModeSymlink == os.ModeSymlink) {
os.Chmod(dest, si.Mode());
}
return nil
}
// CopyDirectory copi
建立dev目录的开始
mountToRootfs再往上一层就是prepareRootfs:prepareRootfs调用mountToRootfs完成挂载,并在其中创建设备节点、复制设备文件。
// prepareRootfs sets up the devices, mount points, and filesystems for use
// inside a new mount namespace. It doesn't set anything as ro. You must call
// finalizeRootfs after this function to finish setting up the rootfs.
//
// iConfig.Config supplies the rootfs path, the mount list and the device
// list. pipe is not used by the steps shown here. The first failing step
// aborts the setup and its error is returned wrapped with a short description
// of that step; nil is returned when every step succeeded.
func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
	config := iConfig.Config
	if err := prepareRoot(config); err != nil {
		return newSystemErrorWithCause(err, "preparing rootfs")
	}

	hasCgroupns := config.Namespaces.Contains(configs.NEWCGROUP)
	// Decide before mounting whether the /dev content must be created below.
	setupDev := needsSetupDev(config)
	for _, m := range config.Mounts {
		for _, precmd := range m.PremountCmds {
			if err := mountCmd(precmd); err != nil {
				return newSystemErrorWithCause(err, "running premount command")
			}
		}
		// A failed mount must stop the setup: continuing would leave the
		// container with an incomplete filesystem while reporting success.
		if err := mountToRootfs(m, config.Rootfs, config.MountLabel, hasCgroupns); err != nil {
			return newSystemErrorWithCause(err, "mounting to rootfs")
		}
		for _, postcmd := range m.PostmountCmds {
			if err := mountCmd(postcmd); err != nil {
				return newSystemErrorWithCause(err, "running postmount command")
			}
		}
	}

	if setupDev {
		if err := createDevices(config); err != nil {
			return newSystemErrorWithCause(err, "creating device nodes")
		}
		if err := setupPtmx(config); err != nil {
			return newSystemErrorWithCause(err, "setting up ptmx")
		}
		if err := setupDevSymlinks(config.Rootfs); err != nil {
			return newSystemErrorWithCause(err, "setting up /dev symlinks")
		}
	}
	return nil
}
这里还创建了设备节点。
// createDevices creates every device listed in config.Devices inside the
// container rootfs. The process umask is cleared while the nodes are created
// and restored before returning. The first error from createDeviceNode stops
// the loop and is returned unchanged.
func createDevices(config *configs.Config) error {
	// Containers running in a user namespace are not allowed to mknod
	// devices, so in that case the nodes are bind mounted from the host.
	bindOnly := system.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER)

	previous := unix.Umask(0000)
	defer unix.Umask(previous)

	for _, dev := range config.Devices {
		if err := createDeviceNode(config.Rootfs, dev, bindOnly); err != nil {
			return err
		}
	}
	return nil
}
// createDeviceNode creates the device node described by node in the rootfs of
// the container.
//
// The parent directory of the destination is created first, and a failure to
// create it is returned immediately. When bind is true the node is set up
// with bindMountDeviceNode. Otherwise it is created with mknodDevice: a node
// that already exists counts as success, a permission error falls back to
// bindMountDeviceNode, and any other error is returned.
func createDeviceNode(rootfs string, node *configs.Device, bind bool) error {
	dest := filepath.Join(rootfs, node.Path)
	// Without the parent directory neither mknod nor the bind mount can
	// succeed, so report this error instead of a misleading later one.
	if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil {
		return err
	}
	if bind {
		return bindMountDeviceNode(dest, node)
	}

	err := mknodDevice(dest, node)
	switch {
	case err == nil, os.IsExist(err):
		return nil
	case os.IsPermission(err):
		return bindMountDeviceNode(dest, node)
	default:
		return err
	}
}
先在目标位置创建一个空文件作为挂载点,再把node.Path对应的设备节点bind mount到该文件上。
// bindMountDeviceNode bind mounts the device at node.Path onto dest.
//
// dest is first created as a regular file so that the bind mount has a target
// to attach to. An "already exists" error from that step is tolerated; any
// other creation error is returned. The result of the mount call is returned
// as is.
func bindMountDeviceNode(dest string, node *configs.Device) error {
	target, createErr := os.Create(dest)
	if createErr == nil {
		// Only the file's existence matters; the handle itself is not needed.
		target.Close()
	} else if !os.IsExist(createErr) {
		return createErr
	}
	return unix.Mount(node.Path, dest, "bind", unix.MS_BIND, "")
}