晚上差点没赶上班车。明天要share,继续准备
## main.go -> initCommand
## main_unix.go
var
initCommand
= cli.Command{
Name:
"init"
,
Usage:
`initialize the namespaces and launch the process (do not call it outside of runc)`
,
Action:
func
(context *cli.Context)
error
{
factory
,
_
:= libcontainer.
New
(
""
)
if
err
:= factory.
StartInitialization
(); err !=
nil
{
// as the error is sent back to the parent there is no need to log
// or write it to stderr because the parent process will handle this
os.
Exit
(
1
)
}
panic
(
"libcontainer: container init failed to exec"
)
},
}
- 从环境变量中解析出 childPipe、rootDir 的 fd 以及 initType(默认为 standard,有时间再看一下还有哪些其他的初始化方式),并清除当前进程的所有环境变量。
- 设置一个 trap 以及 panic recover,如果初始化容器失败,会往 childPipe 中写入 procError。
- 调用 `newContainerInit` 创建一个 init 对象(两种类型,standard 或 setns,下面以 standard 为例):首先从 childPipe 中获取 config 配置,从配置中读取环境变量并设置到当前进程;然后构造一个 linuxStandardInit 对象,主要包括 pipe、parentPid、config 和 stateDirFD(即 rootDir 的 fd)等字段。
- 调用 linuxStandardInit 对象的 `Init` 方法进行初始化。
// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
// This is a low level implementation detail of the reexec and should not be consumed externally
func
(l *LinuxFactory)
StartInitialization
() (err
error
) {
var
pipefd
,
rootfd
int
for
_
,
pair
:=
range
[]
struct
{
k
string
v *
int
}{
{
"_LIBCONTAINER_INITPIPE"
, &pipefd},
{
"_LIBCONTAINER_STATEDIR"
, &rootfd},
} {
s
:= os.
Getenv
(pair.k)
i
,
err
:= strconv.
Atoi
(s)
if
err !=
nil
{
return
fmt.
Errorf
(
"unable to convert %s=%s to int"
, pair.k, s)
}
*pair.
v
= i
}
var
(
pipe
= os.
NewFile
(
uintptr
(pipefd),
"pipe"
)
it
=
initType
(os.
Getenv
(
"_LIBCONTAINER_INITTYPE"
))
)
// clear the current process's environment to clean any libcontainer
// specific env vars.
os.
Clearenv
()
var
i
initer
defer
func
() {
// We have an error during the initialization of the container's init,
// send it back to the parent process in the form of an initError.
// If container's init successed, syscall.Exec will not return, hence
// this defer function will never be called.
if
_
,
ok
:= i.(*linuxStandardInit); ok {
// Synchronisation only necessary for standard init.
if
werr
:= utils.
WriteJSON
(pipe, syncT{procError}); werr !=
nil
{
panic
(err)
}
}
if
werr
:= utils.
WriteJSON
(pipe,
newSystemError
(err)); werr !=
nil
{
panic
(err)
}
// ensure that this pipe is always closed
pipe.
Close
()
}()
defer
func
() {
if
e
:=
recover
(); e !=
nil
{
err
= fmt.
Errorf
(
"panic from initialization: %v, %v"
, e,
string
(debug.
Stack
()))
}
}()
i
,
err
=
newContainerInit
(it, pipe, rootfd)
if
err !=
nil
{
return
err
}
return
i.
Init
()
}
func
newContainerInit
(t initType, pipe *os.File, stateDirFD
int
) (initer,
error
) {
var
config
*initConfig
if
err
:= json.
NewDecoder
(pipe).
Decode
(&config); err !=
nil
{
return
nil
, err
}
if
err
:=
populateProcessEnvironment
(config.Env); err !=
nil
{
return
nil
, err
}
switch
t {
case
initSetns:
return
&linuxSetnsInit{
config: config,
},
nil
case
initStandard:
return
&linuxStandardInit{
pipe: pipe,
parentPid: syscall.
Getppid
(),
config: config,
stateDirFD: stateDirFD,
},
nil
}
return
nil
, fmt.
Errorf
(
"unknown init type %q"
, t)
}
## standard_init_linux.go
func
(l *linuxStandardInit)
Init
()
error
{
// Max首先是针对Session keyring的一些配置,不是很清楚这里的Session是什么?
// Max首先是针对Session keyring的一些配置,不是很清楚这里的Session是什么?
if
!l.config.Config.NoNewKeyring {
ringname
,
keepperms
,
newperms
:= l.
getSessionRingParams
()
// do not inherit the parent's session keyring
sessKeyId
,
err
:= keys.
JoinSessionKeyring
(ringname)
if
err !=
nil
{
return
err
}
// make session keyring searcheable
if
err
:= keys.
ModKeyringPerm
(sessKeyId, keepperms, newperms); err !=
nil
{
return
err
}
}
// Max 配置console和tty。如果配置文件中指定有Console字段,则从该字段中获取tty的slave路径创建一个linuxConsole对象,调用其
dupStdio
打开slave设备,将其fd复制(dup3)到当前进程的标准IO。如果console对象创建好以后,便调用ioctl的TIOCSCTTY分配控制终端
var
console
*linuxConsole
if
l.config.Console !=
""
{
console
=
newConsoleFromPath
(l.config.Console)
if
err
:= console.
dupStdio
(); err !=
nil
{
return
err
}
}
if
console !=
nil
{
if
err
:= system.
Setctty
(); err !=
nil
{
return
err
}
}
// Max 调用
// Max 调用
setupNetwork
配置容器的网络。奇怪网络不是在前面配置过了吗,还是调用同样的函数
if
err
:=
setupNetwork
(l.config); err !=
nil
{
return
err
}
// Max 调用setupRoute配置容器的静态路由信息。
// Max 调用setupRoute配置容器的静态路由信息。
if
err
:=
setupRoute
(l.config.Config); err !=
nil
{
return
err
}
// Max selinux,调用label.Init()检查selinux是否被启动以及是否检查过,并将结果存入全局变量。此处的label并非是用户label,而是selinux相关的processLabel。
label.
Init
()
// InitializeMountNamespace() can be executed only for a new mount namespace
// Max 如果设置了mount namespace,则调用
// Max 如果设置了mount namespace,则调用
setupRootfs
在新的mount namespace中配置设备、挂载点以及文件系统。
if
l.config.Config.Namespaces.
Contains
(configs.NEWNS) {
if
err
:=
setupRootfs
(l.config.Config, console, l.pipe); err !=
nil
{
return
err
}
}
// Max 根据需要配置hostname、apparmor、processLabel、sysctl、readonlyPath、maskPath。这些都是一些feature,对容器启动本身没有太多影响。
// Max 根据需要配置hostname、apparmor、processLabel、sysctl、readonlyPath、maskPath。这些都是一些feature,对容器启动本身没有太多影响。
if
hostname
:= l.config.Config.Hostname; hostname !=
""
{
if
err
:= syscall.
Sethostname
([]
byte
(hostname)); err !=
nil
{
return
err
}
}
if
err
:= apparmor.
ApplyProfile
(l.config.AppArmorProfile); err !=
nil
{
return
err
}
if
err
:= label.
SetProcessLabel
(l.config.ProcessLabel); err !=
nil
{
return
err
}
for
key
,
value
:=
range
l.config.Config.Sysctl {
if
err
:=
writeSystemProperty
(key, value); err !=
nil
{
return
err
}
}
for
_
,
path
:=
range
l.config.Config.ReadonlyPaths {
if
err
:=
remountReadonly
(path); err !=
nil
{
return
err
}
}
for
_
,
path
:=
range
l.config.Config.MaskPaths {
if
err
:=
maskPath
(path); err !=
nil
{
return
err
}
}
pdeath
,
err
:= system.
GetParentDeathSignal
()
if
err !=
nil
{
return
err
}
// Max 获取父进程的退出信号量。
// Max 获取父进程的退出信号量。
if
l.config.NoNewPrivileges {
if
err
:= system.
Prctl
(PR_SET_NO_NEW_PRIVS,
1
,
0
,
0
,
0
); err !=
nil
{
return
err
}
}
// Tell our parent that we're ready to Execv. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
// Max 通过管道与父进程进行同步,先发出procReady再等待procRun。
// Max 通过管道与父进程进行同步,先发出procReady再等待procRun。
if
err
:=
syncParentReady
(l.pipe); err !=
nil
{
return
err
}
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
// Max 初始化seccomp。
// Max 初始化seccomp。
if
l.config.Config.Seccomp !=
nil
&& !l.config.NoNewPrivileges {
if
err
:= seccomp.
InitSeccomp
(l.config.Config.Seccomp); err !=
nil
{
return
err
}
}
// Max 调用
// Max 调用
finalizeNamespace
根据config配置将需要的特权capabilities加入白名单,设置user namespace,关闭不需要的文件描述符。
if
err
:=
finalizeNamespace
(l.config); err !=
nil
{
return
err
}
// finalizeNamespace can change user/group which clears the parent death
// signal, so we restore it here.
// 恢复parent进程的death信号量并检查当前父进程pid是否为我们原来记录的。不是的话,自杀
// 恢复parent进程的death信号量并检查当前父进程pid是否为我们原来记录的。不是的话,自杀
if
err
:= pdeath.
Restore
(); err !=
nil
{
return
err
}
// compare the parent from the inital start of the init process and make sure that it did not change.
// if the parent changes that means it died and we were reparented to something else so we should
// just kill ourself and not cause problems for someone else.
// Max 恢复parent进程的death信号量并检查当前父进程pid是否为我们原来记录的。不是的话,kill ourself。。。
// Max 恢复parent进程的death信号量并检查当前父进程pid是否为我们原来记录的。不是的话,kill ourself。。。
if
syscall.
Getppid
() != l.parentPid {
return
syscall.
Kill
(syscall.
Getpid
(), syscall.SIGKILL)
}
// check for the arg before waiting to make sure it exists and it is returned
// as a create time error.
name
,
err
:= exec.
LookPath
(l.config.Args[
0
])
if
err !=
nil
{
return
err
}
// close the pipe to signal that we have completed our init.
// Max 与父进程之间的同步已经完成,关闭pipe。
// Max 与父进程之间的同步已经完成,关闭pipe。
l.pipe.
Close
()
// wait for the fifo to be opened on the other side before
// exec'ing the users process.
// Max 尝试以只写方式打开fifo管道,并往管道中写入“0” 。该操作会一直保持阻塞,直到管道的另一端以读方式打开,并读取内容。至此,create操作流程已经结束
// Max 尝试以只写方式打开fifo管道,并往管道中写入“0” 。该操作会一直保持阻塞,直到管道的另一端以读方式打开,并读取内容。至此,create操作流程已经结束
fd
,
err
:= syscall.
Openat
(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC,
0
)
if
err !=
nil
{
return
newSystemErrorWithCause
(err,
"openat exec fifo"
)
}
if
_
,
err
:= syscall.
Write
(fd, []
byte
(
"0"
)); err !=
nil
{
return
newSystemErrorWithCause
(err,
"write 0 exec fifo"
)
}
// Max 下面实际上是start的时候才会触发的操作了,阻塞清除后,根据config配置初始化seccomp,并调用syscall.Exec执行config里面指定的命令。
// Max 下面实际上是start的时候才会触发的操作了,阻塞清除后,根据config配置初始化seccomp,并调用syscall.Exec执行config里面指定的命令。
if
l.config.Config.Seccomp !=
nil
&& l.config.NoNewPrivileges {
if
err
:= seccomp.
InitSeccomp
(l.config.Config.Seccomp); err !=
nil
{
return
newSystemErrorWithCause
(err,
"init seccomp"
)
}
}
if
err
:= syscall.
Exec
(name, l.config.Args[
0
:], os.
Environ
()); err !=
nil
{
return
newSystemErrorWithCause
(err,
"exec user process"
)
}
return
nil
}