docker study --- runc

his blog is based on the master branck of ruc. I had fork it and save it under https://github.com/savagecm/runc.

There are four background process background: docker daemon、docker-containerd、docker-containerd-shim and docker-runc.

this part is mainly for docker-runc.

source code:

the entry point is under here:

app.Commands = []cli.Command{
		checkpointCommand,
		createCommand,
		deleteCommand,
		eventsCommand,
		execCommand,
		initCommand,
		killCommand,
		listCommand,
		pauseCommand,
		psCommand,
		restoreCommand,
		resumeCommand,
		runCommand,
		specCommand,
		startCommand,
		stateCommand,
		updateCommand,
	}
	app.Before = func(context *cli.Context) error {
		if context.GlobalBool("debug") {
			logrus.SetLevel(logrus.DebugLevel)
		}
		if path := context.GlobalString("log"); path != "" {
			f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0666)
			if err != nil {
				return err
			}
			logrus.SetOutput(f)
		}
		switch context.GlobalString("log-format") {
		case "text":
			// retain logrus's default.
		case "json":
			logrus.SetFormatter(new(logrus.JSONFormatter))
		default:
			return fmt.Errorf("unknown log-format %q", context.GlobalString("log-format"))
		}
		return nil
	}
	// If the command returns an error, cli takes upon itself to print
	// the error on cli.ErrWriter and exit.
	// Use our own writer here to ensure the log gets sent to the right location.
	cli.ErrWriter = &FatalWriter{cli.ErrWriter}
	if err := app.Run(os.Args); err != nil {
		fatal(err)
	}
}


we define the supported command here.match the related command according to the parameter passed in. if the command passed in is "create" command, that is related to createCommand.

createCommand contains the related action.

package main

import (
	"os"

	"github.com/urfave/cli"
)

var createCommand = cli.Command{
	Name:  "create",
	Usage: "create a container",
	ArgsUsage: `<container-id>
Where "<container-id>" is your name for the instance of the container that you
are starting. The name you provide for the container instance must be unique on
your host.`,
	Description: `The create command creates an instance of a container for a bundle. The bundle
is a directory with a specification file named "` + specConfig + `" and a root
filesystem.
The specification file includes an args parameter. The args parameter is used
to specify command(s) that get run when the container is started. To change the
command(s) that get executed on start, edit the args parameter of the spec. See
"runc spec --help" for more explanation.`,
	Flags: []cli.Flag{
		cli.StringFlag{
			Name:  "bundle, b",
			Value: "",
			Usage: `path to the root of the bundle directory, defaults to the current directory`,
		},
		cli.StringFlag{
			Name:  "console-socket",
			Value: "",
			Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal",
		},
		cli.StringFlag{
			Name:  "pid-file",
			Value: "",
			Usage: "specify the file to write the process id to",
		},
		cli.BoolFlag{
			Name:  "no-pivot",
			Usage: "do not use pivot root to jail process inside rootfs.  This should be used whenever the rootfs is on top of a ramdisk",
		},
		cli.BoolFlag{
			Name:  "no-new-keyring",
			Usage: "do not create a new session keyring for the container.  This will cause the container to inherit the calling processes session key",
		},
	},
	Action: func(context *cli.Context) error {
		if err := checkArgs(context, 1, exactArgs); err != nil {
			return err
		}
		if err := revisePidFile(context); err != nil {
			return err
		}
		spec, err := setupSpec(context)
		if err != nil {
			return err
		}
		status, err := startContainer(context, spec, true)
		if err != nil {
			return err
		}
		// exit with the container's exit status so any external supervisor is
		// notified of the exit with the correct exit status.
		os.Exit(status)
		return nil
	},
}
context definition:

// Context is a type that is passed through to
// each Handler action in a cli application. Context
// can be used to retrieve context-specific Args and
// parsed command-line options.
type Context struct {
	App           *App
	Command       Command
	shellComplete bool
	flagSet       *flag.FlagSet
	setFlags      map[string]bool
	parentContext *Context
}

spec definition:

// Spec is the base configuration for the container.
type Spec struct {
	// Version of the Open Container Runtime Specification with which the bundle complies.
	Version string `json:"ociVersion"`
	// Platform specifies the configuration's target platform.
	Platform Platform `json:"platform"`
	// Process configures the container process.
	Process Process `json:"process"`
	// Root configures the container's root filesystem.
	Root Root `json:"root"`
	// Hostname configures the container's hostname.
	Hostname string `json:"hostname,omitempty"`
	// Mounts configures additional mounts (on top of Root).
	Mounts []Mount `json:"mounts,omitempty"`
	// Hooks configures callbacks for container lifecycle events.
	Hooks *Hooks `json:"hooks,omitempty"`
	// Annotations contains arbitrary metadata for the container.
	Annotations map[string]string `json:"annotations,omitempty"`

	// Linux is platform specific configuration for Linux based containers.
	Linux *Linux `json:"linux,omitempty" platform:"linux"`
	// Solaris is platform specific configuration for Solaris containers.
	Solaris *Solaris `json:"solaris,omitempty" platform:"solaris"`
	// Windows is platform specific configuration for Windows based containers, including Hyper-V containers.
	Windows *Windows `json:"windows,omitempty" platform:"windows"`
}

take create command as an example. it mainly calls fucntion startContainer()

func startContainer(context *cli.Context, spec *specs.Spec, create bool) (int, error) {
	id := context.Args().First()
	if id == "" {
		return -1, errEmptyID
	}
	container, err := createContainer(context, id, spec)
	if err != nil {
		return -1, err
	}
	// Support on-demand socket activation by passing file descriptors into the container init process.
	listenFDs := []*os.File{}
	if os.Getenv("LISTEN_FDS") != "" {
		listenFDs = activation.Files(false)
	}
	r := &runner{
		enableSubreaper: !context.Bool("no-subreaper"),
		shouldDestroy:   true,
		container:       container,
		listenFDs:       listenFDs,
		consoleSocket:   context.String("console-socket"),
		detach:          context.Bool("detach"),
		pidFile:         context.String("pid-file"),
		create:          create,
	}
	return r.run(&spec.Process)
}

definition of Process:

// Process contains information to start a specific application inside the container.
type Process struct {
	// Terminal creates an interactive terminal for the container.
	Terminal bool `json:"terminal,omitempty"`
	// ConsoleSize specifies the size of the console.
	ConsoleSize Box `json:"consoleSize,omitempty"`
	// User specifies user information for the process.
	User User `json:"user"`
	// Args specifies the binary and arguments for the application to execute.
	Args []string `json:"args"`
	// Env populates the process environment for the process.
	Env []string `json:"env,omitempty"`
	// Cwd is the current working directory for the process and must be
	// relative to the container's root.
	Cwd string `json:"cwd"`
	// Capabilities are Linux capabilities that are kept for the container.
	Capabilities []string `json:"capabilities,omitempty" platform:"linux"`
	// Rlimits specifies rlimit options to apply to the process.
	Rlimits []LinuxRlimit `json:"rlimits,omitempty" platform:"linux"`
	// NoNewPrivileges controls whether additional privileges could be gained by processes in the container.
	NoNewPrivileges bool `json:"noNewPrivileges,omitempty" platform:"linux"`
	// ApparmorProfile specifies the apparmor profile for the container.
	ApparmorProfile string `json:"apparmorProfile,omitempty" platform:"linux"`
	// SelinuxLabel specifies the selinux context that the container process is run as.
	SelinuxLabel string `json:"selinuxLabel,omitempty" platform:"linux"`
}

startContainer() mainly create a container object, initializerunner object, call run function of runner.

1. create container object

here is createContainer()

func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) {
	config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{
		CgroupName:       id,
		UseSystemdCgroup: context.GlobalBool("systemd-cgroup"),
		NoPivotRoot:      context.Bool("no-pivot"),
		NoNewKeyring:     context.Bool("no-new-keyring"),
		Spec:             spec,
	})
	if err != nil {
		return nil, err
	}

	factory, err := loadFactory(context)
	if err != nil {
		return nil, err
	}
	return factory.Create(id, config)
}

here:

1. load LinuxFactory.

loadFactory()

// loadFactory returns the configured factory instance for execing containers.
func loadFactory(context *cli.Context) (libcontainer.Factory, error) {
	root := context.GlobalString("root")
	abs, err := filepath.Abs(root)
	if err != nil {
		return nil, err
	}
	cgroupManager := libcontainer.Cgroupfs
	if context.GlobalBool("systemd-cgroup") {
		if systemd.UseSystemd() {
			cgroupManager = libcontainer.SystemdCgroups
		} else {
			return nil, fmt.Errorf("systemd cgroup flag passed, but systemd support for managing cgroups is not available")
		}
	}
	return libcontainer.New(abs, cgroupManager, libcontainer.CriuPath(context.GlobalString("criu")))
}

New()

func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
	if root != "" {
		if err := os.MkdirAll(root, 0700); err != nil {
			return nil, newGenericError(err, SystemError)
		}
	}
	l := &LinuxFactory{
		Root:      root,
		InitArgs:  []string{"/proc/self/exe", "init"},
		Validator: validate.New(),
		CriuPath:  "criu",
	}
	Cgroupfs(l)
	for _, opt := range options {
		if err := opt(l); err != nil {
			return nil, err
		}
	}
	return l, nil
}
// LinuxFactory implements the default factory interface for linux based systems.
type LinuxFactory struct {
    // Root directory for the factory to store state.
    Root string

    // InitArgs are arguments for calling the init responsibilities for spawning
    // a container.
    InitArgs []string

    // CriuPath is the path to the criu binary used for checkpoint and restore of
    // containers.
    CriuPath string

    // Validator provides validation to container configurations.
    Validator validate.Validator

    // NewCgroupsManager returns an initialized cgroups manager for a single container.
    NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager
}

the /proc/self/exe init  it will call the process itself

InitArgs:  []string{"/proc/self/exe", "init"},

2. call Create command

constrtuc
type Factory interface {
	// Creates a new container with the given id and starts the initial process inside it.
	// id must be a string containing only letters, digits and underscores and must contain
	// between 1 and 1024 characters, inclusive.
	//
	// The id must not already be in use by an existing container. Containers created using
	// a factory with the same path (and file system) must have distinct ids.
	//
	// Returns the new container with a running process.
	//
	// errors:
	// IdInUse - id is already in use by a container
	// InvalidIdFormat - id has incorrect format
	// ConfigInvalid - config is invalid
	// Systemerror - System error
	//
	// On error, any partially created container parts are cleaned up (the operation is atomic).
Create(id string, config *configs.Config) (Container, error)
......
}

create function:

func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
	if l.Root == "" {
		return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
	}
	if err := l.validateID(id); err != nil {
		return nil, err
	}
	if err := l.Validator.Validate(config); err != nil {
		return nil, newGenericError(err, ConfigInvalid)
	}
	uid, err := config.HostUID()
	if err != nil {
		return nil, newGenericError(err, SystemError)
	}
	gid, err := config.HostGID()
	if err != nil {
		return nil, newGenericError(err, SystemError)
	}
	containerRoot := filepath.Join(l.Root, id)
	if _, err := os.Stat(containerRoot); err == nil {
		return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
	} else if !os.IsNotExist(err) {
		return nil, newGenericError(err, SystemError)
	}
	if err := os.MkdirAll(containerRoot, 0711); err != nil {
		return nil, newGenericError(err, SystemError)
	}
	if err := os.Chown(containerRoot, uid, gid); err != nil {
		return nil, newGenericError(err, SystemError)
	}
	fifoName := filepath.Join(containerRoot, execFifoFilename)
	oldMask := syscall.Umask(0000)
	if err := syscall.Mkfifo(fifoName, 0622); err != nil {
		syscall.Umask(oldMask)
		return nil, newGenericError(err, SystemError)
	}
	syscall.Umask(oldMask)
	if err := os.Chown(fifoName, uid, gid); err != nil {
		return nil, newGenericError(err, SystemError)
	}
//```Maxx
 c := &linuxContainer{
		id:            id,
		root:          containerRoot,
		config:        config,
		initArgs:      l.InitArgs,
		criuPath:      l.CriuPath,
		cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
	}
	c.state = &stoppedState{c: c}
	return c, nil
}

2. call run function

func (r *runner) run(config *specs.Process) (int, error) {
	// Maxx: new process instance
	process, err := newProcess(*config)
	if err != nil {
		r.destroy()
		return -1, err
	}
	if len(r.listenFDs) > 0 {
		process.Env = append(process.Env, fmt.Sprintf("LISTEN_FDS=%d", len(r.listenFDs)), "LISTEN_PID=1")
		process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...)
	}

	rootuid, err := r.container.Config().HostUID()
	if err != nil {
		r.destroy()
		return -1, err
	}

	rootgid, err := r.container.Config().HostGID()
	if err != nil {
		r.destroy()
		return -1, err
	}

	detach := r.detach || r.create

	// Check command-line for sanity.
	if detach && config.Terminal && r.consoleSocket == "" {
		r.destroy()
		return -1, fmt.Errorf("cannot allocate tty if runc will detach without setting console socket")
	}
	// XXX: Should we change this?
	if (!detach || !config.Terminal) && r.consoleSocket != "" {
		r.destroy()
		return -1, fmt.Errorf("cannot use console socket if runc will not detach or allocate tty")
	}
	//Maxx note here  this will call start 
	startFn := r.container.Start
	if !r.create {
		startFn = r.container.Run
	}
	// Setting up IO is a two stage process. We need to modify process to deal
	// with detaching containers, and then we get a tty after the container has
	// started.
	handler := newSignalHandler(r.enableSubreaper)
	tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach)
	if err != nil {
		r.destroy()
		return -1, err
	}
	// // Maxx   call container start 
	// D:\study\go\runc-master\libcontainer\container_linux.go 看调用的run方法
	if err := startFn(process); err != nil {
		r.destroy()
		return -1, err
	}
	if config.Terminal {
		if err := tty.recvtty(process, r.detach || r.create); err != nil {
			r.terminate(process)
			r.destroy()
			return -1, err
		}
	}
	defer tty.Close()

	if config.Terminal && detach {
		conn, err := net.Dial("unix", r.consoleSocket)
		if err != nil {
			r.terminate(process)
			r.destroy()
			return -1, err
		}
		defer conn.Close()

		unixconn, ok := conn.(*net.UnixConn)
		if !ok {
			r.terminate(process)
			r.destroy()
			return -1, fmt.Errorf("casting to UnixConn failed")
		}

		socket, err := unixconn.File()
		if err != nil {
			r.terminate(process)
			r.destroy()
			return -1, err
		}
		defer socket.Close()

		err = tty.sendtty(socket, r.terminalinfo())
		if err != nil {
			r.terminate(process)
			r.destroy()
			return -1, err
		}
	}

	if err := tty.ClosePostStart(); err != nil {
		r.terminate(process)
		r.destroy()
		return -1, err
	}
	if r.pidFile != "" {
		if err := createPidFile(r.pidFile, process); err != nil {
			r.terminate(process)
			r.destroy()
			return -1, err
		}
	}
	if detach {
		return 0, nil
	}
	status, err := handler.forward(process, tty)
	if err != nil {
		r.terminate(process)
	}
	r.destroy()
	return status, err
}

func (c *linuxContainer) Start(process *Process) error {
	c.m.Lock()
	defer c.m.Unlock()
	status, err := c.currentStatus()
	if err != nil {
		return err
	}
	return c.start(process, status == Stopped)
}

note: for the first time, it will call the following Run()


func (c *linuxContainer) Run(process *Process) error {
	c.m.Lock()
	defer c.m.Unlock()
	status, err := c.currentStatus()
	if err != nil {
		return err
	}
	if err := c.start(process, status == Stopped); err != nil {
		return err
	}
	if status == Stopped {
		return c.exec()
	}
	return nil
}



start()

func (c *linuxContainer) start(process *Process, isInit bool) error {
// Maxx construct process object 
 parent, err := c.newParentProcess(process, isInit)
	if err != nil {
		return newSystemErrorWithCause(err, "creating new parent process")
	}
// Maxx init container process
 if err := parent.start(); err != nil {
		// terminate the process to ensure that it properly is reaped.
		if err := parent.terminate(); err != nil {
			logrus.Warn(err)
		}
		return newSystemErrorWithCause(err, "starting container process")
	}
	// generate a timestamp indicating when the container was started
	c.created = time.Now().UTC()
	c.state = &runningState{
		c: c,
	}
	if isInit {
		c.state = &createdState{
			c: c,
		}
		state, err := c.updateState(parent)
		if err != nil {
			return err
		}
		c.initProcessStartTime = state.InitProcessStartTime

		if c.config.Hooks != nil {
			s := configs.HookState{
				Version:    c.config.Version,
				ID:         c.id,
				Pid:        parent.pid(),
				BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
			}
			for i, hook := range c.config.Hooks.Poststart {
				if err := hook.Run(s); err != nil {
					if err := parent.terminate(); err != nil {
						logrus.Warn(err)
					}
					return newSystemErrorWithCausef(err, "running poststart hook %d", i)
				}
			}
		}
	}
	return nil
}

newParentProcess()

func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
// Maxx create pipeline between parent and child
 parentPipe, childPipe, err := newPipe()
	if err != nil {
		return nil, newSystemErrorWithCause(err, "creating new init pipe")
	}
	rootDir, err := os.Open(c.root)
	if err != nil {
		return nil, err
	}
// Maxx /proc/self/exe $path of runc init
 cmd, err := c.commandTemplate(p, childPipe, rootDir)
	if err != nil {
		return nil, newSystemErrorWithCause(err, "creating new command template")
	}
	if !doInit {
		return c.newSetnsProcess(p, cmd, parentPipe, childPipe, rootDir)
	}
// Maxx init the first process in the container
 return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
}

newInitProcess()

func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
	nsMaps := make(map[configs.NamespaceType]string)
	for _, ns := range c.config.Namespaces {
		if ns.Path != "" {
			nsMaps[ns.Type] = ns.Path
		}
	}
	_, sharePidns := nsMaps[configs.NEWPID]
// Maxx set the clone flag here
 data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
	if err != nil {
		return nil, err
	}
	p.consoleChan = make(chan *os.File, 1)
	return &initProcess{
		cmd:           cmd,
		childPipe:     childPipe,
		parentPipe:    parentPipe,
		manager:       c.cgroupManager,
		config:        c.newInitConfig(p),
		container:     c,
		process:       p,
		bootstrapData: data,
		sharePidns:    sharePidns,
		rootDir:       rootDir,
	}, nil
}

newInitProcess() return initProcess

type initProcess struct {
	cmd           *exec.Cmd
	parentPipe    *os.File
	childPipe     *os.File
	config        *initConfig
	manager       cgroups.Manager
	container     *linuxContainer
	fds           []string
	process       *Process
	bootstrapData io.Reader
	sharePidns    bool
	rootDir       *os.File
}
the start function of initProcess:

func (p *initProcess) start() error {
	defer p.parentPipe.Close()
	// Maxx /proc/self/exe path_to_runc init
	err := p.cmd.Start()
	p.process.ops = p
	p.childPipe.Close()
	p.rootDir.Close()
	if err != nil {
		p.process.ops = nil
		return newSystemErrorWithCause(err, "starting init process command")
	}
	// Maxx 通过pipe向nsexec传输信息包含namespace 信息
	if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
		return err
	}
	// Maxx  wait setns stop
	if err := p.execSetns(); err != nil {
		return newSystemErrorWithCause(err, "running exec setns process for init")
	}
	// Save the standard descriptor names before the container process
	// can potentially move them (e.g., via dup2()).  If we don't do this now,
	// we won't know at checkpoint time which file descriptor to look up.
	// Maxx get pid of nsexec return 
	fds, err := getPipeFds(p.pid())
	if err != nil {
		return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
	}
	p.setExternalDescriptors(fds)
	// Do this before syncing with child so that no children
	// can escape the cgroup
	if err := p.manager.Apply(p.pid()); err != nil {
		return newSystemErrorWithCause(err, "applying cgroup configuration for process")
	}
	defer func() {
		if err != nil {
			// TODO: should not be the responsibility to call here
			p.manager.Destroy()
		}
	}()
	if err := p.createNetworkInterfaces(); err != nil {
		return newSystemErrorWithCause(err, "creating network interfaces")
	}
	// Maxx 前面调用过cmd.start,通过调用/proc/self/exe 传入init参数
	// 通过管道向子进程传递配置 initCommand  will use
	// D:\study\go\runc-master\main.go
	if err := p.sendConfig(); err != nil {
		return newSystemErrorWithCause(err, "sending config to init process")
	}
	var (
		sentRun    bool
		sentResume bool
	)

	ierr := parseSync(p.parentPipe, func(sync *syncT) error {
		switch sync.Type {
		case procConsole:
			if err := writeSync(p.parentPipe, procConsoleReq); err != nil {
				return newSystemErrorWithCause(err, "writing syncT 'request fd'")
			}

			masterFile, err := utils.RecvFd(p.parentPipe)
			if err != nil {
				return newSystemErrorWithCause(err, "getting master pty from child pipe")
			}

			if p.process.consoleChan == nil {
				// TODO: Don't panic here, do something more sane.
				panic("consoleChan is nil")
			}
			p.process.consoleChan <- masterFile

			if err := writeSync(p.parentPipe, procConsoleAck); err != nil {
				return newSystemErrorWithCause(err, "writing syncT 'ack fd'")
			}
		case procReady:
			if err := p.manager.Set(p.config.Config); err != nil {
				return newSystemErrorWithCause(err, "setting cgroup config for ready process")
			}
			// set oom_score_adj
			if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
				return newSystemErrorWithCause(err, "setting oom score for ready process")
			}
			// set rlimits, this has to be done here because we lose permissions
			// to raise the limits once we enter a user-namespace
			if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
				return newSystemErrorWithCause(err, "setting rlimits for ready process")
			}
			// call prestart hooks
			if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
				if p.config.Config.Hooks != nil {
					s := configs.HookState{
						Version:    p.container.config.Version,
						ID:         p.container.id,
						Pid:        p.pid(),
						BundlePath: utils.SearchLabels(p.config.Config.Labels, "bundle"),
					}
					for i, hook := range p.config.Config.Hooks.Prestart {
						if err := hook.Run(s); err != nil {
							return newSystemErrorWithCausef(err, "running prestart hook %d", i)
						}
					}
				}
			}
			// Sync with child.
			if err := writeSync(p.parentPipe, procRun); err != nil {
				return newSystemErrorWithCause(err, "writing syncT 'run'")
			}
			sentRun = true
		case procHooks:
			if p.config.Config.Hooks != nil {
				s := configs.HookState{
					Version:    p.container.config.Version,
					ID:         p.container.id,
					Pid:        p.pid(),
					BundlePath: utils.SearchLabels(p.config.Config.Labels, "bundle"),
				}
				for i, hook := range p.config.Config.Hooks.Prestart {
					if err := hook.Run(s); err != nil {
						return newSystemErrorWithCausef(err, "running prestart hook %d", i)
					}
				}
			}
			// Sync with child.
			if err := writeSync(p.parentPipe, procResume); err != nil {
				return newSystemErrorWithCause(err, "writing syncT 'resume'")
			}
			sentResume = true
		default:
			return newSystemError(fmt.Errorf("invalid JSON payload from child"))
		}

		return nil
	})

	if !sentRun {
		return newSystemErrorWithCause(ierr, "container init")
	}
	if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
		return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
	}
	if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
		return newSystemErrorWithCause(err, "shutting down init pipe")
	}

	// Must be done after Shutdown so the child will exit and we can wait for it.
	if ierr != nil {
		p.wait()
		return ierr
	}
	return nil
}

a little summary:

p.cmd.Start()
actually the command is /proc/self/exe $runc_path init, that is call runc itself again and pass init as its argument. So the next step is to look in to the initCommand of runc.

Note:when do we create the child process?

As we defined the cgo code in the main_unix.go, before calling p.cmd.Start(), it will call the nsexec method in thensexec.c the description of nsenter:

nsexec will get bootstrap data(namespace paths, clone flags, uid/gid mapping, console path)from its father process and clone a child process, and return the processid.
when the nsexec finish running . p.cmd.Start() will execute. and pass config info via pipeline.



after

p.cmd.Start()
runc will call initCommand

var initCommand = cli.Command{
	Name:  "init",
	Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`,
	Action: func(context *cli.Context) error {
		factory, _ := libcontainer.New("")
		if err := factory.StartInitialization(); err != nil {
			// as the error is sent back to the parent there is no need to log
			// or write it to stderr because the parent process will handle this
			os.Exit(1)
		}
		panic("libcontainer: container init failed to exec")
	},
}
new()
// New returns a linux based container factory based in the root directory and
// configures the factory with the provided option funcs.
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
	if root != "" {
		if err := os.MkdirAll(root, 0700); err != nil {
			return nil, newGenericError(err, SystemError)
		}
	}
	l := &LinuxFactory{
		Root:      root,
		InitArgs:  []string{"/proc/self/exe", "init"},
		Validator: validate.New(),
		CriuPath:  "criu",
	}
	// Maxx  LinuxFactory.Create()中构造了linuxContainer对象

	Cgroupfs(l)
	for _, opt := range options {
		if err := opt(l); err != nil {
			return nil, err
		}
	}
	return l, nil
}
for StartInitialization()
// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
// This is a low level implementation detail of the reexec and should not be consumed externally
func (l *LinuxFactory) StartInitialization() (err error) {
	var pipefd, rootfd int
	for _, pair := range []struct {
		k string
		v *int
	}{
		{"_LIBCONTAINER_INITPIPE", &pipefd},
		{"_LIBCONTAINER_STATEDIR", &rootfd},
	} {

		s := os.Getenv(pair.k)

		i, err := strconv.Atoi(s)
		if err != nil {
			return fmt.Errorf("unable to convert %s=%s to int", pair.k, s)
		}
		*pair.v = i
	}
	var (
		pipe = os.NewFile(uintptr(pipefd), "pipe")
		it   = initType(os.Getenv("_LIBCONTAINER_INITTYPE"))
	)
	defer pipe.Close()

	// clear the current process's environment to clean any libcontainer
	// specific env vars.
	os.Clearenv()

	defer func() {
		// We have an error during the initialization of the container's init,
		// send it back to the parent process in the form of an initError.
		if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
			fmt.Fprintln(os.Stderr, err)
			return
		}
		if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil {
			fmt.Fprintln(os.Stderr, err)
			return
		}
	}()
	defer func() {
		if e := recover(); e != nil {
			err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
		}
	}()
// Maxx 通过管道解析父进程传来的信息  config
 i, err := newContainerInit(it, pipe, rootfd)
	if err != nil {
		return err
	}

	// If Init succeeds, syscall.Exec will not return, hence none of the defers will be called.
	return i.Init()
}
newContainerInit

func newContainerInit(t initType, pipe *os.File, stateDirFD int) (initer, error) {
	var config *initConfig
	if err := json.NewDecoder(pipe).Decode(&config); err != nil {
		return nil, err
	}
	if err := populateProcessEnvironment(config.Env); err != nil {
		return nil, err
	}
	switch t {
	case initSetns:
		return &linuxSetnsInit{
			pipe:       pipe,
			config:     config,
			stateDirFD: stateDirFD,
		}, nil
	case initStandard:
		return &linuxStandardInit{
			pipe:       pipe,
			parentPid:  syscall.Getppid(),
			config:     config,
			stateDirFD: stateDirFD,
		}, nil
	}
	return nil, fmt.Errorf("unknown init type %q", t)
}

i.init()

type linuxStandardInit struct {
	pipe       *os.File
	parentPid  int
	stateDirFD int
	config     *initConfig
}
func (l *linuxStandardInit) Init() error {
	if !l.config.Config.NoNewKeyring {
		ringname, keepperms, newperms := l.getSessionRingParams()

		// do not inherit the parent's session keyring
		sessKeyId, err := keys.JoinSessionKeyring(ringname)
		if err != nil {
			return err
		}
		// make session keyring searcheable
		if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
			return err
		}
	}
	// Maxx  setup network

	if err := setupNetwork(l.config); err != nil {
		return err
	}
	if err := setupRoute(l.config.Config); err != nil {
		return err
	}

	label.Init()

	// prepareRootfs() can be executed only for a new mount namespace.
	if l.config.Config.Namespaces.Contains(configs.NEWNS) {
		if err := prepareRootfs(l.pipe, l.config.Config); err != nil {
			return err
		}
	}

	// Set up the console. This has to be done *before* we finalize the rootfs,
	// but *after* we've given the user the chance to set up all of the mounts
	// they wanted.
	if l.config.CreateConsole {
		if err := setupConsole(l.pipe, l.config, true); err != nil {
			return err
		}
		if err := system.Setctty(); err != nil {
			return err
		}
	}

	// Finish the rootfs setup.
	if l.config.Config.Namespaces.Contains(configs.NEWNS) {
		if err := finalizeRootfs(l.config.Config); err != nil {
			return err
		}
	}

	if hostname := l.config.Config.Hostname; hostname != "" {
		if err := syscall.Sethostname([]byte(hostname)); err != nil {
			return err
		}
	}
	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
		return err
	}
	if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
		return err
	}

	for key, value := range l.config.Config.Sysctl {
		if err := writeSystemProperty(key, value); err != nil {
			return err
		}
	}
	for _, path := range l.config.Config.ReadonlyPaths {
		if err := readonlyPath(path); err != nil {
			return err
		}
	}
	for _, path := range l.config.Config.MaskPaths {
		if err := maskPath(path); err != nil {
			return err
		}
	}
	pdeath, err := system.GetParentDeathSignal()
	if err != nil {
		return err
	}
	if l.config.NoNewPrivileges {
		if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
			return err
		}
	}
	// Tell our parent that we're ready to Execv. This must be done before the
	// Seccomp rules have been applied, because we need to be able to read and
	// write to a socket.
	if err := syncParentReady(l.pipe); err != nil {
		return err
	}
	// Without NoNewPrivileges seccomp is a privileged operation, so we need to
	// do this before dropping capabilities; otherwise do it as late as possible
	// just before execve so as few syscalls take place after it as possible.
	if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
			return err
		}
	}
	if err := finalizeNamespace(l.config); err != nil {
		return err
	}
	// finalizeNamespace can change user/group which clears the parent death
	// signal, so we restore it here.
	if err := pdeath.Restore(); err != nil {
		return err
	}
	// compare the parent from the initial start of the init process and make sure that it did not change.
	// if the parent changes that means it died and we were reparented to something else so we should
	// just kill ourself and not cause problems for someone else.
	if syscall.Getppid() != l.parentPid {
		return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
	}
	// check for the arg before waiting to make sure it exists and it is returned
	// as a create time error.
	name, err := exec.LookPath(l.config.Args[0])
	if err != nil {
		return err
	}
	// close the pipe to signal that we have completed our init.
	l.pipe.Close()
	// wait for the fifo to be opened on the other side before
	// exec'ing the users process.
	fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0)
	if err != nil {
		return newSystemErrorWithCause(err, "openat exec fifo")
	}
	if _, err := syscall.Write(fd, []byte("0")); err != nil {
		return newSystemErrorWithCause(err, "write 0 exec fifo")
	}
	if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
		if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
			return newSystemErrorWithCause(err, "init seccomp")
		}
	}
	//Maxx  用户进程
	if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
		return newSystemErrorWithCause(err, "exec user process")
	}
	return nil
}





评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值