本文代码基于containerd 1.16分支
最近在看kubernetes中cni相关的内容,想研究下pod创建时网络是如何调用的,新版的kubernetes中已经没有了docker-shim,所以从kubelet中/pkg/kubelet/kuberuntime/kuberuntime_manager.go中的createPodSandbox方法开始分析,该方法调用RunPodSandbox
cri是Kubernetes容器运行时接口(CRI)的containerd插件实现。使用它,可以将containerd用作Kubernetes集群的容器运行时。
1.RunPodSandbox函数
1.1 生成id和name,将name和id进行绑定,防止创建同名的sandbox,ReleaseByName在发生错误后释放掉Name。
// RunPodSandbox creates and starts a pod-level sandbox. Runtimes should ensure
// the sandbox is in ready state.
func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandboxRequest) (_ *runtime.RunPodSandboxResponse, retErr error) {
config := r.GetConfig()
log.G(ctx).Debugf("Sandbox config %+v", config)
// Generate unique id and name for the sandbox and reserve the name.
id := util.GenerateID()
metadata := config.GetMetadata()
if metadata == nil {
return nil, errors.New("sandbox config must include metadata")
}
name := makeSandboxName(metadata)
log.G(ctx).WithField("podsandboxid", id).Debugf("generated id for sandbox name %q", name)
// Reserve the sandbox name to avoid concurrent `RunPodSandbox` request starting the
// same sandbox.
if err := c.sandboxNameIndex.Reserve(name, id); err != nil {
return nil, fmt.Errorf("failed to reserve sandbox name %q: %w", name, err)
}
defer func() {
// Release the name if the function returns with an error.
if retErr != nil {
c.sandboxNameIndex.ReleaseByName(name)
}
}()
1.2 实例化sandbox
// Create initial internal sandbox object.
sandbox := sandboxstore.NewSandbox(
sandboxstore.Metadata{
ID: id,
Name: name,
Config: config,
RuntimeHandler: r.GetRuntimeHandler(),
},
sandboxstore.Status{
State: sandboxstore.StateUnknown,
},
)
1.3 确保镜像存在,如果不存在则拉取,并将 镜像转换成containerd.Image接口形式
// Ensure sandbox container image snapshot.
image, err := c.ensureImageExists(ctx, c.config.SandboxImage, config)
if err != nil {
return nil, fmt.Errorf("failed to get sandbox image %q: %w", c.config.SandboxImage, err)
}
containerdImage, err := c.toContainerdImage(ctx, *image)
if err != nil {
return nil, fmt.Errorf("failed to get image from containerd %q: %w", image.ID, err)
}
1.3.1 优先从本地加载镜像,不存在则通过PullImage拉取。
func (c *criService) ensureImageExists(ctx context.Context, ref string, config *runtime.PodSandboxConfig) (*imagestore.Image, error) {
image, err := c.localResolve(ref)
if err != nil && !errdefs.IsNotFound(err) {
return nil, fmt.Errorf("failed to get image %q: %w", ref, err)
}
if err == nil {
return &image, nil
}
// Pull image to ensure the image exists
resp, err := c.PullImage(ctx, &runtime.PullImageRequest{Image: &runtime.ImageSpec{Image: ref}, SandboxConfig: config})
if err != nil {
return nil, fmt.Errorf("failed to pull image %q: %w", ref, err)
}
imageID := resp.GetImageRef()
newImage, err := c.imageStore.Get(imageID)
if err != nil {
// It's still possible that someone removed the image right after it is pulled.
return nil, fmt.Errorf("failed to get image %q after pulling: %w", imageID, err)
}
return &newImage, nil
}
1.4 获取 sandbox runtime
// getSandboxRuntime returns the runtime configuration for sandbox.
// If the sandbox contains untrusted workload, runtime for untrusted workload will be returned,
// or else default runtime will be returned.
func (c *criService) getSandboxRuntime(config *runtime.PodSandboxConfig, runtimeHandler string) (criconfig.Runtime, error) {
if untrustedWorkload(config) {
// If the untrusted annotation is provided, runtimeHandler MUST be empty.
if runtimeHandler != "" && runtimeHandler != criconfig.RuntimeUntrusted {
return criconfig.Runtime{}, errors.New("untrusted workload with explicit runtime handler is not allowed")
}
// If the untrusted workload is requesting access to the host/node, this request will fail.
//
// Note: If the workload is marked untrusted but requests privileged, this can be granted, as the
// runtime may support this. For example, in a virtual-machine isolated runtime, privileged
// is a supported option, granting the workload to access the entire guest VM instead of host.
// TODO(windows): Deprecate this so that we don't need to handle it for windows.
if hostAccessingSandbox(config) {
return criconfig.Runtime{}, errors.New("untrusted workload with host access is not allowed")
}
runtimeHandler = criconfig.RuntimeUntrusted
}
if runtimeHandler == "" {
runtimeHandler = c.config.ContainerdConfig.DefaultRuntimeName
}
handler, ok := c.config.ContainerdConfig.Runtimes[runtimeHandler]
if !ok {
return criconfig.Runtime{}, fmt.Errorf("no runtime for %q is configured", runtimeHandler)
}
return handler, nil
}
1.5 如果linux不是host network则设置pod网络,网络命名空间的路径 为:/var/run/netns
podNetwork := true
if goruntime.GOOS != "windows" &&
config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetNetwork() == runtime.NamespaceMode_NODE {
// Pod network is not needed on linux with host network.
podNetwork = false
}
if goruntime.GOOS == "windows" &&
config.GetWindows().GetSecurityContext().GetHostProcess() {
//Windows HostProcess pods can only run on the host network
podNetwork = false
}
if podNetwork {
netStart := time.Now()
// If it is not in host network namespace then create a namespace and set the sandbox
// handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network
// namespaces. If the pod is in host network namespace then both are empty and should not
// be used.
var netnsMountDir = "/var/run/netns"
if c.config.NetNSMountsUnderStateDir {
netnsMountDir = filepath.Join(c.config.StateDir, "netns")
}
sandbox.NetNS, err = netns.NewNetNS(netnsMountDir)
if err != nil {
return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err)
}
sandbox.NetNSPath = sandbox.NetNS.GetPath()
defer func() {
if retErr != nil {
deferCtx, deferCancel := ctrdutil.DeferContext()
defer deferCancel()
// Teardown network if an error is returned.
if err := c.teardownPodNetwork(deferCtx, sandbox); err != nil {
log.G(ctx).WithError(err).Errorf("Failed to destroy network for sandbox %q", id)
}
if err := sandbox.NetNS.Remove(); err != nil {
log.G(ctx).WithError(err).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id)
}
sandbox.NetNSPath = ""
}
}()
// Setup network for sandbox.
// Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524)
// rely on the assumption that CRI shim will not be querying the network namespace to check the
// network states such as IP.
// In future runtime implementation should avoid relying on CRI shim implementation details.
// In this case however caching the IP will add a subtle performance enhancement by avoiding
// calls to network namespace of the pod to query the IP of the veth interface on every
// SandboxStatus request.
if err := c.setupPodNetwork(ctx, &sandbox); err != nil {
return nil, fmt.Errorf("failed to setup network for sandbox %q: %w", id, err)
}
sandboxCreateNetworkTimer.UpdateSince(netStart)
}
1.5.1 获取netPlugin
通过netPlugin.Setup进行网络设置
selectPodIPs 获取pod ip
// setupPodNetwork setups up the network for a pod
func (c *criService) setupPodNetwork(ctx context.Context, sandbox *sandboxstore.Sandbox) error {
var (
id = sandbox.ID
config = sandbox.Config
path = sandbox.NetNSPath
netPlugin = c.getNetworkPlugin(sandbox.RuntimeHandler)
)
if netPlugin == nil {
return errors.New("cni config not initialized")
}
opts, err := cniNamespaceOpts(id, config)
if err != nil {
return fmt.Errorf("get cni namespace options: %w", err)
}
log.G(ctx).WithField("podsandboxid", id).Debugf("begin cni setup")
result, err := netPlugin.Setup(ctx, id, path, opts...)
if err != nil {
return err
}
logDebugCNIResult(ctx, id, result)
// Check if the default interface has IP config
if configs, ok := result.Interfaces[defaultIfName]; ok && len(configs.IPConfigs) > 0 {
sandbox.IP, sandbox.AdditionalIPs = selectPodIPs(ctx, configs.IPConfigs, c.config.IPPreference)
sandbox.CNIResult = result
return nil
}
return fmt.Errorf("failed to find network info for sandbox %q", id)
}
1.5.1.1 getNetworkPlugin
其中c.netPlugin是在初始化initCRIService中的server.NewCRIService(c, client)时,c.initPlatform()中
// getNetworkPlugin returns the network plugin to be used by the runtime class
// defaults to the global CNI options in the CRI config
func (c *criService) getNetworkPlugin(runtimeClass string) cni.CNI {
if c.netPlugin == nil {
return nil
}
i, ok := c.netPlugin[runtimeClass]
if !ok {
if i, ok = c.netPlugin[defaultNetworkPlugin]; !ok {
return nil
}
}
return i
}
// initPlatform handles linux specific initialization for the CRI service.
func (c *criService) initPlatform() (err error) {
//...
pluginDirs := map[string]string{
defaultNetworkPlugin: c.config.NetworkPluginConfDir,
}
for name, conf := range c.config.Runtimes {
if conf.NetworkPluginConfDir != "" {
pluginDirs[name] = conf.NetworkPluginConfDir
}
}
c.netPlugin = make(map[string]cni.CNI)
for name, dir := range pluginDirs {
max := c.config.NetworkPluginMaxConfNum
if name != defaultNetworkPlugin {
if m := c.config.Runtimes[name].NetworkPluginMaxConfNum; m != 0 {
max = m
}
}
// Pod needs to attach to at least loopback network and a non host network,
// hence networkAttachCount is 2. If there are more network configs the
// pod will be attached to all the networks but we will only use the ip
// of the default network interface as the pod IP.
i, err := cni.New(cni.WithMinNetworkCount(networkAttachCount),
cni.WithPluginConfDir(dir),
cni.WithPluginMaxConfNum(max),
cni.WithPluginDir([]string{c.config.NetworkPluginBinDir}))
if err != nil {
return fmt.Errorf("failed to initialize cni: %w", err)
}
c.netPlugin[name] = i
}
if c.allCaps == nil {
c.allCaps, err = cap.Current()
if err != nil {
return fmt.Errorf("failed to get caps: %w", err)
}
}
return nil
}
New方法创建了一个libcni实例
// New creates a new libcni instance.
func New(config ...Opt) (CNI, error) {
cni := defaultCNIConfig()
var err error
for _, c := range config {
if err = c(cni); err != nil {
return nil, err
}
}
return cni, nil
}
func defaultCNIConfig() *libcni {
return &libcni{
config: config{
pluginDirs: []string{DefaultCNIDir}, //opt/cni/bin
pluginConfDir: DefaultNetDir, //etc/cni/net.d
pluginMaxConfNum: DefaultMaxConfNum, //1
prefix: DefaultPrefix, //eth
},
cniConfig: cnilibrary.NewCNIConfig(
[]string{
DefaultCNIDir,
},
&invoke.DefaultExec{
RawExec: &invoke.RawExec{Stderr: os.Stderr},
PluginDecoder: version.PluginDecoder{},
},
),
networkCount: 1,
}
}
1.5.1.2
判断CNI插件是否就绪
初始化新namespace
// Setup setups the network in the namespace and returns a Result
func (c *libcni) Setup(ctx context.Context, id string, path string, opts ...NamespaceOpts) (*Result, error) {
if err := c.Status(); err != nil {
return nil, err
}
ns, err := newNamespace(id, path, opts...)
if err != nil {
return nil, err
}
result, err := c.attachNetworks(ctx, ns)
if err != nil {
return nil, err
}
return c.createResult(result)
}
asynchAttach 开启协程,创建网络
func (c *libcni) attachNetworks(ctx context.Context, ns *Namespace) ([]*types100.Result, error) {
var wg sync.WaitGroup
var firstError error
results := make([]*types100.Result, len(c.Networks()))
rc := make(chan asynchAttachResult)
for i, network := range c.Networks() {
wg.Add(1)
go asynchAttach(ctx, i, network, ns, &wg, rc)
}
for range c.Networks() {
rs := <-rc
if rs.err != nil && firstError == nil {
firstError = rs.err
}
results[rs.index] = rs.res
}
wg.Wait()
return results, firstError
}
执行network.Attach方法
func asynchAttach(ctx context.Context, index int, n *Network, ns *Namespace, wg *sync.WaitGroup, rc chan asynchAttachResult) {
defer wg.Done()
r, err := n.Attach(ctx, ns)
rc <- asynchAttachResult{index: index, res: r, err: err}
}
func (n *Network) Attach(ctx context.Context, ns *Namespace) (*types100.Result, error) {
r, err := n.cni.AddNetworkList(ctx, n.config, ns.config(n.ifName))
if err != nil {
return nil, err
}
return types100.NewResultFromResult(r)
}
进入到containernetworking/cni/libcni/api.go中调用cni进行网络创建
// AddNetworkList executes a sequence of plugins with the ADD command
func (c *CNIConfig) AddNetworkList(ctx context.Context, list *NetworkConfigList, rt *RuntimeConf) (types.Result, error) {
var err error
var result types.Result
for _, net := range list.Plugins {
result, err = c.addNetwork(ctx, list.Name, list.CNIVersion, net, result, rt)
if err != nil {
return nil, fmt.Errorf("plugin %s failed (add): %w", pluginDescription(net.Network), err)
}
}
if err = c.cacheAdd(result, list.Bytes, list.Name, rt); err != nil {
return nil, fmt.Errorf("failed to set network %q cached result: %w", list.Name, err)
}
return result, nil
}
参考:;【containerd 源码分析】containerd cri PodRunSandbox 源码分析之二_张忠琳的博客-CSDN博客