命令:
ctr run -h
NAME:
ctr run - run a container
USAGE:
ctr run [command options] Image|RootFS ID [COMMAND] [ARG...]
OPTIONS:
--tty, -t allocate a TTY for the container
--runtime value runtime name (io.containerd.runtime.v1.linux, io.containerd.runtime.v1.windows, io.containerd.runtime.v1.com.vmware.linux) (default: "io.containerd.runtime.v1.linux")
--readonly set the containers filesystem as readonly
--net-host enable host networking for the container
--mount value specify additional container mount (ex: type=bind,src=/tmp,dest=/host,options=rbind:ro)
--env value specify additional container environment variables (i.e. FOO=bar)
--label value specify additional labels (foo=bar)
--rm remove the container after running
--checkpoint value provide the checkpoint digest to restore the container
--snapshotter value Snapshotter name. Empty value stands for the daemon default value.
--rootfs Use custom rootfs that is not managed by containerd snapshotter.
本文以 ctr run docker.io/library/redis:latest containerd-redis 命令为例子
一. 客户端 ctr run 命令分析
1.1 在 cmd/ctr/run.go 中,根据用法 ctr run [command options] Image|RootFS ID [COMMAND] [ARG...],Image 和 ID 都是必填参数,缺少任意一个都会直接返回错误:
Action: func(context *cli.Context) error {
var (
err error
checkpointIndex digest.Digest
ctx, cancel = appContext(context)
id = context.Args().Get(1)
imageRef = context.Args().First()
tty = context.Bool("tty")
)
defer cancel()
if imageRef == "" {
return errors.New("image ref must be provided")
}
if id == "" {
return errors.New("container id must be provided")
}
if raw := context.String("checkpoint"); raw != "" {
if checkpointIndex, err = digest.Parse(raw); err != nil {
return err
}
}
1.2 如果指定了 --rm(即 context.Bool("rm") 为 true),则通过 defer 在命令返回时删除容器:
// Action implements the body of `ctr run`: it creates the container and its
// task, prepares the console (tty) or signal forwarding (non-tty), starts the
// task, waits for it to exit and reports a non-zero exit status to the CLI.
//
// NOTE(review): ctx, tty, err and checkpointIndex are not declared in this
// excerpt; presumably they come from the var block at the top of the full
// function — confirm against the complete source.
Action: func(context *cli.Context) error {
// Parse the optional --checkpoint value into a digest.
if raw := context.String("checkpoint"); raw != "" {
if checkpointIndex, err = digest.Parse(raw); err != nil {
return err
}
}
client, err := newClient(context)
if err != nil {
return err
}
container, err := newContainer(ctx, client, context)
if err != nil {
return err
}
// With --rm, delete the container when Action returns, passing
// containerd.WithSnapshotCleanup to the delete call.
if context.Bool("rm") {
defer container.Delete(ctx, containerd.WithSnapshotCleanup)
}
task, err := newTask(ctx, container, checkpointIndex, tty)
if err != nil {
return err
}
// NOTE(review): task.Delete is also called explicitly once the task has
// exited (see below), so on the success path this deferred call runs a second
// time and its result is ignored — confirm that is intended.
defer task.Delete(ctx)
// Capacity 1 lets the waiter goroutine deliver the status without blocking.
statusC := make(chan uint32, 1)
go func() {
status, err := task.Wait(ctx)
if err != nil {
logrus.WithError(err).Error("wait process")
}
// The status is sent even when Wait returned an error.
statusC <- status
}()
var con console.Console
if tty {
// Switch the current console to raw mode; Reset is deferred so it runs
// when Action returns.
con = console.Current()
defer con.Reset()
if err := con.SetRaw(); err != nil {
return err
}
}
if err := task.Start(ctx); err != nil {
return err
}
if tty {
// A resize-handling failure is only logged; the run continues.
if err := handleConsoleResize(ctx, task, con); err != nil {
logrus.WithError(err).Error("console resize")
}
} else {
// Without a tty, forward signals to the task until Action returns.
sigc := forwardAllSignals(ctx, task)
defer stopCatch(sigc)
}
// Block until the waiter goroutine reports the task's exit status.
status := <-statusC
if _, err := task.Delete(ctx); err != nil {
return err
}
// Surface a non-zero task status as the CLI exit code.
if status != 0 {
return cli.NewExitError("", int(status))
}
return nil
},
1.3 newContainer 最终调用 NewContainer,通过 gRPC 向服务端发送 Create 请求,服务端的实现在第二章讲解:
// NewContainer creates a new container record through the container service.
//
// The record starts with the given id and the client's configured runtime
// name; every option in opts is then applied in order and may modify it. The
// first option that fails aborts the call. The id must be unique within the
// namespace.
func (c *Client) NewContainer(ctx context.Context, id string, opts ...NewContainerOpts) (Container, error) {
	record := containers.Container{ID: id}
	record.Runtime = containers.RuntimeInfo{Name: c.runtime}

	for i := range opts {
		if err := opts[i](ctx, c, &record); err != nil {
			return nil, err
		}
	}

	stored, err := c.ContainerService().Create(ctx, record)
	if err != nil {
		return nil, err
	}
	return containerFromRecord(c, stored), nil
}
1.4 newTask 主要调用 container 实现的 NewTask 方法:
// NewTask builds a task for this container.
//
// It creates the task IO via ioCreate, assembles a CreateTaskRequest from the
// container's snapshot mounts (when c.c.RootFS is set) plus any mounts and
// options contributed by opts, and then either:
//   - stores the request on the task as deferred when a checkpoint was
//     supplied, or
//   - sends the request to the task service immediately and records the
//     returned pid.
//
// The container mutex is held for the whole call.
func (c *container) NewTask(ctx context.Context, ioCreate IOCreation, opts ...NewTaskOpts) (Task, error) {
	c.mu.Lock()
	defer c.mu.Unlock()

	taskIO, err := ioCreate(c.c.ID)
	if err != nil {
		return nil, err
	}

	req := &tasks.CreateTaskRequest{
		ContainerID: c.c.ID,
		Terminal:    taskIO.Terminal,
		Stdin:       taskIO.Stdin,
		Stdout:      taskIO.Stdout,
		Stderr:      taskIO.Stderr,
	}

	// Mounts of the container's own snapshot come first in the rootfs list.
	if c.c.RootFS != "" {
		snapshotMounts, err := c.client.SnapshotService(c.c.Snapshotter).Mounts(ctx, c.c.RootFS)
		if err != nil {
			return nil, err
		}
		for _, sm := range snapshotMounts {
			req.Rootfs = append(req.Rootfs, &types.Mount{
				Type:    sm.Type,
				Source:  sm.Source,
				Options: sm.Options,
			})
		}
	}

	var info TaskInfo
	for _, apply := range opts {
		if err := apply(ctx, c.client, &info); err != nil {
			return nil, err
		}
	}

	// Mounts supplied through the task options are appended after the
	// snapshot mounts; ranging over a nil slice is a no-op.
	for _, om := range info.RootFS {
		req.Rootfs = append(req.Rootfs, &types.Mount{
			Type:    om.Type,
			Source:  om.Source,
			Options: om.Options,
		})
	}

	if info.Options != nil {
		encoded, err := typeurl.MarshalAny(info.Options)
		if err != nil {
			return nil, err
		}
		req.Options = encoded
	}

	t := &task{
		client: c.client,
		io:     taskIO,
		id:     c.ID(),
	}

	if info.Checkpoint != nil {
		req.Checkpoint = info.Checkpoint
		// The create call is deferred: the request is kept on the task.
		t.deferred = req
		return t, nil
	}

	resp, err := c.client.TaskService().Create(ctx, req)
	if err != nil {
		return nil, err
	}
	t.pid = resp.Pid
	return t, nil
}
c.client.TaskService().Create 通过 gRPC 调用服务端对应的方法(位于 services/tasks/service.go),在第三章讲解
二. 服务端 create 命令分析
2.1 _Containers_Create_Handler 函数位于 api/services/containers/v1/containers.pb.go,服务端收到 gRPC 请求后,由它调用 Create 方法
// _Containers_Create_Handler is the server-side gRPC handler for the
// Containers/Create method.
//
// It decodes the incoming message into a CreateContainerRequest using dec and
// then calls Create on srv (asserted to ContainersServer). When an interceptor
// is configured, the call is routed through it together with the method
// metadata; otherwise Create is invoked directly.
func _Containers_Create_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
in := new(CreateContainerRequest)
if err := dec(in); err != nil {
return nil, err
}
// No interceptor: dispatch straight to the service implementation.
if interceptor == nil {
return srv.(ContainersServer).Create(ctx, in)
}
info := &grpc.UnaryServerInfo{
Server: srv,
FullMethod: "/containerd.services.containers.v1.Containers/Create",
}
// handler is what the interceptor eventually calls to reach the service.
handler := func(ctx context.Context, req interface{}) (interface{}, error) {
return srv.(ContainersServer).Create(ctx, req.(*CreateContainerRequest))
}
return interceptor(ctx, in, info, handler)
}
2.2 在 services/containers/service.go 中,Create 的核心是调用 store.Create(containers.Store 接口的实现):
// Create stores a new container record and publishes a "/containers/create"
// event describing it.
//
// The record is converted from the request, written with store.Create inside
// withStoreUpdate, and the stored result is placed in the response. A store
// failure is converted with errdefs.ToGRPC; a publish failure is returned
// as-is. In both error cases the (possibly partially filled) response pointer
// is returned alongside the error, as before.
func (s *Service) Create(ctx context.Context, req *api.CreateContainerRequest) (*api.CreateContainerResponse, error) {
	var resp api.CreateContainerResponse

	if err := s.withStoreUpdate(ctx, func(ctx context.Context, store containers.Store) error {
		container := containerFromProto(&req.Container)

		created, err := store.Create(ctx, container)
		if err != nil {
			// Propagate the failure; otherwise a failed create would be
			// reported as success and an event published for an empty record.
			return err
		}

		resp.Container = containerToProto(&created)
		return nil
	}); err != nil {
		return &resp, errdefs.ToGRPC(err)
	}

	if err := s.publisher.Publish(ctx, "/containers/create", &eventsapi.ContainerCreate{
		ID:    resp.Container.ID,
		Image: resp.Container.Image,
		Runtime: &eventsapi.ContainerCreate_Runtime{
			Name:    resp.Container.Runtime.Name,
			Options: resp.Container.Runtime.Options,
		},
	}); err != nil {
		return &resp, err
	}

	return &resp, nil
}
2.3 根据 withStore 函数可以得知 store 由 metadata.NewContainerStore 创建,代码位于 /metadata/containers.go;containerStore 结构体只是对数据库事务(bolt.Tx)的一层封装
// containerStore wraps a bolt transaction; NewContainerStore returns it as a
// containers.Store.
type containerStore struct {
// tx is the bolt transaction that store operations run against.
tx *bolt.Tx
}
// NewContainerStore returns a containers.Store whose operations run against
// the given bolt transaction.
func NewContainerStore(tx *bolt.Tx) containers.Store {
	store := containerStore{tx: tx}
	return &store
}
2.4 Create 函数中为容器创建对应的 bucket(bucket 的细节在 2.4.1 中讲解):如果以该 ID 命名的 bucket 已经存在,说明容器已存在,返回 ErrAlreadyExists;创建成功则说明此前没有该容器,随后由 writeContainer 将 container 的数据写入 bucket 中。
// Create writes a new container record into the store's transaction.
//
// The namespace is taken from ctx and the container ID is validated first. A
// bucket named after the ID is then created under the namespace's containers
// bucket; if that bucket already exists the call fails with an error wrapping
// errdefs.ErrAlreadyExists. CreatedAt and UpdatedAt are both set to the
// current UTC time before the record is written.
//
// On any error a zero containers.Container is returned.
func (s *containerStore) Create(ctx context.Context, container containers.Container) (containers.Container, error) {
	namespace, err := namespaces.NamespaceRequired(ctx)
	if err != nil {
		return containers.Container{}, err
	}

	if err := identifiers.Validate(container.ID); err != nil {
		return containers.Container{}, err
	}

	bkt, err := createContainersBucket(s.tx, namespace)
	if err != nil {
		return containers.Container{}, err
	}

	cbkt, err := bkt.CreateBucket([]byte(container.ID))
	if err != nil {
		if err == bolt.ErrBucketExists {
			// The ID identifies a container, so say "container" rather than
			// "content" in the message.
			err = errors.Wrapf(errdefs.ErrAlreadyExists, "container %q", container.ID)
		}
		return containers.Container{}, err
	}

	container.CreatedAt = time.Now().UTC()
	container.UpdatedAt = container.CreatedAt
	if err := writeContainer(cbkt, &container); err != nil {
		return containers.Container{}, errors.Wrap(err, "failed to write container")
	}

	return container, nil
}
这部分主要是创建 bucket 操作数据库
三. 服务端 Task 分析
3.1 路径 services/tasks/service.go
func (s *Service) Create(ctx context.Context, r *api.CreateTaskRequest) (*api.CreateTaskResponse, error)
3.1.1 getContainer 根据容器 ID 拿到容器信息
container, err := s.getContainer(ctx, r.ContainerID)
if err != nil {
return nil, errdefs.ToGRPC(err)
}
3.1.2 创建 runtime 的参数
opts := runtime.CreateOpts{
Spec: container.Spec,
IO: runtime.IO{
Stdin: r.Stdin,
Stdout: r.Stdout,
Stderr: r.Stderr,
Terminal: r.Terminal,
},
Checkpoint: checkpointPath,
Options: r.Options,
}
for _, m := range r.Rootfs {
opts.Rootfs = append(opts.Rootfs, mount.Mount{
Type: m.Type,
Source: m.Source,
Options: m.Options,
})
}
3.1.3 getRuntime 根据 runtime 名称返回对应的 Runtime 接口实现,接口定义如 3.1.3.1 所示:
runtime, err := s.getRuntime(container.Runtime.Name)
if err != nil {
return nil, err
}
// Service is the task service. Its Create method looks up the container,
// selects a runtime by the container's runtime name and asks it to create the
// task.
type Service struct {
// runtimes holds the available runtime implementations; presumably keyed by
// runtime name, since lookups pass container.Runtime.Name — confirm against
// getRuntime.
runtimes map[string]runtime.Runtime
// db is the bolt database handle.
db *bolt.DB
// store is the content store.
store content.Store
// publisher is the events publisher.
publisher events.Publisher
}
3.1.3.1 Runtime 接口
// Runtime is responsible for the creation of containers for a certain platform,
// arch, or custom usage.
//
// Implementations are selected by name by the task service, which then calls
// Create with the container ID and the assembled CreateOpts.
type Runtime interface {
// ID returns the ID of the runtime.
ID() string
// Create creates a task with the provided id and options.
Create(ctx context.Context, id string, opts CreateOpts) (Task, error)
// Get returns the task identified by the given string.
Get(context.Context, string) (Task, error)
// Tasks returns all the current tasks for the runtime.
// Any container runs at most one task at a time.
Tasks(context.Context) ([]Task, error)
// Delete removes the task in the runtime and returns an *Exit for it.
Delete(context.Context, Task) (*Exit, error)
}
3.1.4 runtime.Create 调用的为 linux/runtime.go 中 Create 方法,第四章节讲解
c, err := runtime.Create(ctx, r.ContainerID, opts)
if err != nil {
return nil, errors.Wrap(err, "runtime create failed")
}
四. 服务端 Runtime 分析
4.1 路径 linux/runtime.go 函数 Create
func (r *Runtime) Create(ctx context.Context, id string, opts runtime.CreateOpts) (_ runtime.Task, err error)
4.1.1 newBundle 根据传入的路径和 ID 创建目录文件,/var/lib/containerd/io.containerd.runtime.v1.linux/default
bundle, err := newBundle(filepath.Join(r.root, namespace), namespace, id, opts.Spec.Value, r.events)
if err != nil {
return nil, err
}
4.1.2 NewShim 连接到 shim,返回连接到 shim 的客户端结构体
s, err := bundle.NewShim(ctx, r.shim, r.address, r.remote, r.shimDebug, opts)
if err != nil {
return nil, err
}
4.1.2 client.WithStart 启动一个 containerd-shim 进程,containerd-shim --namespace default --address /run/containerd/containerd.sock
// NewShim connects to the shim managing the bundle and tasks.
//
// The client option defaults to client.WithStart(binary, grpcAddress, debug);
// when remote is false it is replaced by client.WithLocal(b.events). If
// createOpts.Options is set it is unmarshalled as *runcopts.CreateOptions and
// its ShimCgroup is passed on as the client's CgroupPath.
func (b *bundle) NewShim(ctx context.Context, binary, grpcAddress string, remote, debug bool, createOpts runtime.CreateOpts) (*client.Client, error) {
	clientOpt := client.WithStart(binary, grpcAddress, debug)
	if !remote {
		clientOpt = client.WithLocal(b.events)
	}

	var runcOptions runcopts.CreateOptions
	if raw := createOpts.Options; raw != nil {
		decoded, err := typeurl.UnmarshalAny(raw)
		if err != nil {
			return nil, err
		}
		runcOptions = *decoded.(*runcopts.CreateOptions)
	}

	cfg := client.Config{
		Address:    b.shimAddress(),
		Path:       b.path,
		Namespace:  b.namespace,
		CgroupPath: runcOptions.ShimCgroup,
	}
	return client.New(ctx, cfg, clientOpt)
}
4.1.3 填充 CreateTaskRequest 结构体,发送 GRPC 给 shim 创建,Create 路径为 linux/shim/service.go,4.1.3.1 节讲解
sopts := &shim.CreateTaskRequest{
ID: id,
Bundle: bundle.path,
Runtime: r.runtime,
Stdin: opts.IO.Stdin,
Stdout: opts.IO.Stdout,
Stderr: opts.IO.Stderr,
Terminal: opts.IO.Terminal,
Checkpoint: opts.Checkpoint,
Options: opts.Options,
}
for _, m := range opts.Rootfs {
sopts.Rootfs = append(sopts.Rootfs, &types.Mount{
Type: m.Type,
Source: m.Source,
Options: m.Options,
})
}
if _, err = s.Create(ctx, sopts); err != nil {
return nil, errdefs.FromGRPC(err)
}
4.1.3.1 Create 函数中 newInitProcess 做的事情比较多在第五章节讲解
// Create creates the init process for the task described by r and registers
// it with the shim service.
//
// After newInitProcess succeeds, the task id, bundle and init process are
// saved on the service (under s.mu) for later requests, the pid is registered
// with the default reaper, a TaskCreate event is sent on s.events and a
// goroutine is started to wait for the process to exit. The response carries
// the init process pid.
func (s *Service) Create(ctx context.Context, r *shimapi.CreateTaskRequest) (*shimapi.CreateTaskResponse, error) {
	process, err := newInitProcess(ctx, s.platform, s.path, s.namespace, r)
	if err != nil {
		// Without this check a failed create would go on to use process.
		return nil, err
	}

	// Guard the shared service state; this Lock pairs with the Unlock below.
	s.mu.Lock()
	// save the main task id and bundle to the shim for additional requests
	s.id = r.ID
	s.bundle = r.Bundle
	s.initProcess = process
	pid := process.Pid()
	s.processes[r.ID] = process
	s.mu.Unlock()

	cmd := &reaper.Cmd{
		ExitCh: make(chan int, 1),
	}
	reaper.Default.Register(pid, cmd)

	s.events <- &eventsapi.TaskCreate{
		ContainerID: r.ID,
		Bundle:      r.Bundle,
		Rootfs:      r.Rootfs,
		IO: &eventsapi.TaskIO{
			Stdin:    r.Stdin,
			Stdout:   r.Stdout,
			Stderr:   r.Stderr,
			Terminal: r.Terminal,
		},
		Checkpoint: r.Checkpoint,
		Pid:        uint32(pid),
	}

	go s.waitExit(process, pid, cmd)

	return &shimapi.CreateTaskResponse{
		Pid: uint32(pid),
	}, nil
}
五. 服务端 newInitProcess 分析
5.1 路径 linux/shim/init.go
rootfs := filepath.Join(path, "rootfs")
path 大致为 /var/lib/containerd/io.containerd.runtime.v1.linux/default/${container-id}
填充 Runc 结构体和 initProcess 结构体
runtime := &runc.Runc{
Command: r.Runtime,
Log: filepath.Join(path, "log.json"),
LogFormat: runc.JSON,
PdeathSignal: syscall.SIGKILL,
Root: filepath.Join(RuncRoot, namespace),
}
p := &initProcess{
id: r.ID,
bundle: r.Bundle,
runtime: runtime,
platform: plat,
stdio: stdio{
stdin: r.Stdin,
stdout: r.Stdout,
stderr: r.Stderr,
terminal: r.Terminal,
},
rootfs: rootfs,
}
5.2 NewPipeIO 为 runc 创建管道
if r.Terminal {
if socket, err = runc.NewConsoleSocket(filepath.Join(path, "pty.sock")); err != nil {
return nil, errors.Wrap(err, "failed to create OCI runtime console socket")
}
defer os.Remove(socket.Path())
} else {
if io, err = runc.NewPipeIO(0, 0); err != nil {
return nil, errors.Wrap(err, "failed to create OCI runtime io pipes")
}
p.io = io
}
5.3 如果是第一次创建,则执行 p.runtime.Create 函数(实现位于 github.com/containerd/go-runc/runc.go),第六章讲解
if err := p.runtime.Create(context, r.ID, r.Bundle, opts); err != nil {
return nil, p.runtimeError(err, "OCI runtime create failed")
}
六. 服务端 runc 分析
路径 vendor/github.com/containerd/go-runc/runc.go
结构体 Runc,内容也不复杂
// Runc is the client to the runc cli.
type Runc struct {
// Command is the command to run; if it is empty, DefaultCommand is used.
Command string
// Root is presumably passed to runc as --root — confirm against the
// argument-building code.
Root string
Debug bool
// Log is presumably the path passed to runc as --log — confirm.
Log string
// LogFormat selects the format of the log (e.g. the JSON format constant).
LogFormat Format
// PdeathSignal is presumably the parent-death signal configured on the
// spawned command — confirm.
PdeathSignal syscall.Signal
Setpgid bool
Criu string
SystemdCgroup string
}
6.1 Create 创建一个新的容器,函数本身只返回 error(容器的 pid 由 runc 写入 --pid-file 指定的文件);Monitor.Start 主要是创建文件描述符,启动进程等
cmd 大致为 runc --root /run/containerd/runc/default --log /var/lib/containerd/io.containerd.runtime.v1.linux/default/${container-id}/log.json --pid-file /var/lib/containerd/io.containerd.runtime.v1.linux/default/${container-id}/init.pid,最终调用的是 runc 命令
// Create runs `runc create --bundle <bundle> [opts...] <id>` to create a new
// container and returns an error if the command fails.
//
// opts may be nil, in which case no extra arguments, IO or extra files are
// configured. When the command ends up with neither Stdout nor Stderr set,
// its combined output is captured and included in the returned error.
// Otherwise the command is started, the IO is closed after start when it
// implements StartCloser, and Create waits for the command to finish.
func (r *Runc) Create(context context.Context, id, bundle string, opts *CreateOpts) error {
	args := []string{"create", "--bundle", bundle}
	if opts != nil {
		oargs, err := opts.args()
		if err != nil {
			return err
		}
		args = append(args, oargs...)
	}
	cmd := r.command(context, append(args, id)...)
	if opts != nil {
		if opts.IO != nil {
			opts.Set(cmd)
		}
		// Only touch opts when it is non-nil: the unguarded assignment
		// dereferenced a nil pointer for callers passing opts == nil.
		cmd.ExtraFiles = opts.ExtraFiles
	}
	if cmd.Stdout == nil && cmd.Stderr == nil {
		data, err := Monitor.CombinedOutput(cmd)
		if err != nil {
			return fmt.Errorf("%s: %s", err, data)
		}
		return nil
	}
	if err := Monitor.Start(cmd); err != nil {
		return err
	}
	if opts != nil && opts.IO != nil {
		if c, ok := opts.IO.(StartCloser); ok {
			if err := c.CloseAfterStart(); err != nil {
				return err
			}
		}
	}
	_, err := Monitor.Wait(cmd)
	return err
}
文件 /var/lib/containerd/io.containerd.runtime.v1.linux/default/${container-id}/config.json 的内容大致如下:
{
"ociVersion": "1.0.0",
"process": {
"user": {
"uid": 0,
"gid": 0
},
"args": [
"docker-entrypoint.sh",
"redis-server"
],
"env": [
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"GOSU_VERSION=1.10",
"REDIS_VERSION=4.0.1",
"REDIS_DOWNLOAD_URL=http://download.redis.io/releases/redis-4.0.1.tar.gz",
"REDIS_DOWNLOAD_SHA=2049cd6ae9167f258705081a6ef23bb80b7eff9ff3d0d7481e89510f27457591"
],
"cwd": "/data",
"capabilities": {
"bounding": [
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE"
],
"effective": [
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE"
],
"inheritable": [
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE"
],
"permitted": [
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE"
]
},
"rlimits": [
{
"type": "RLIMIT_NOFILE",
"hard": 1024,
"soft": 1024
}
],
"noNewPrivileges": true
},
"root": {
"path": "rootfs"
},
"mounts": [
{
"destination": "/proc",
"type": "proc",
"source": "proc"
},
{
"destination": "/dev",
"type": "tmpfs",
"source": "tmpfs",
"options": [
"nosuid",
"strictatime",
"mode=755",
"size=65536k"
]
},
{
"destination": "/dev/pts",
"type": "devpts",
"source": "devpts",
"options": [
"nosuid",
"noexec",
"newinstance",
"ptmxmode=0666",
"mode=0620",
"gid=5"
]
},
{
"destination": "/dev/shm",
"type": "tmpfs",
"source": "shm",
"options": [
"nosuid",
"noexec",
"nodev",
"mode=1777",
"size=65536k"
]
},
{
"destination": "/dev/mqueue",
"type": "mqueue",
"source": "mqueue",
"options": [
"nosuid",
"noexec",
"nodev"
]
},
{
"destination": "/sys",
"type": "sysfs",
"source": "sysfs",
"options": [
"nosuid",
"noexec",
"nodev",
"ro"
]
},
{
"destination": "/run",
"type": "tmpfs",
"source": "tmpfs",
"options": [
"nosuid",
"strictatime",
"mode=755",
"size=65536k"
]
}
],
"linux": {
"resources": {
"devices": [
{
"allow": false,
"access": "rwm"
}
]
},
"namespaces": [
{
"type": "pid"
},
{
"type": "ipc"
},
{
"type": "uts"
},
{
"type": "mount"
},
{
"type": "network"
}
]
}
}