Docker存储驱动之overlay新镜像存储的实现和inode耗尽问题

镜像是按层下载和管理的,新镜像下载的文件临时存放在/var/lib/docker/tmp,文件命名方式是GetImageBlobxxx(xxx是一串随机数字),这些临时文件是按层打包的tar.gz等压缩包。临时文件首先被解压为tar包存在缓存中,然后使用docker\layer\layer_store.go的layerStore的Register函数注册到系统中,最后临时文件被删除。
在docker\distribution\pull_v2.go:

// Download returns a ReadCloser over the downloaded temporary file together
// with its size. The part of the body that performs the download is elided
// in this excerpt; only the final return statement is shown.
func (ld *v2LayerDescriptor) Download(ctx context.Context, progressOutput progress.Output) (io.ReadCloser, int64, error) {

...

    // Wrap tmpFile so that closing the returned reader also deletes the file.
    return ioutils.NewReadCloserWrapper(tmpFile, func() error {
        // NOTE(review): the error returned by Close is discarded here.
        tmpFile.Close()
        // Remove the temporary file once it has been closed.
        err := os.RemoveAll(tmpFile.Name())
        if err != nil {
            logrus.Errorf("Failed to remove temp file: %s", tmpFile.Name())
        }
        // The removal error (if any) becomes the result of the close callback.
        return err
    }), size, nil
}

所谓注册就是将实际数据写到文件系统中。这个过程分三步:
第一,建立镜像层的独有目录
第二,将解压tar包数据写到相应镜像层的独有目录中
第三,在镜像层管理Map中插入镜像层对象

// Register adds the layer carried by the tar stream ts on top of parent.
// It is a convenience wrapper around registerWithDescriptor that passes a
// zero-value distribution.Descriptor.
func (ls *layerStore) Register(ts io.Reader, parent ChainID) (Layer, error) {
    var emptyDescriptor distribution.Descriptor

    logrus.Debugf("Register parent: %s", parent)
    return ls.registerWithDescriptor(ts, parent, emptyDescriptor)
}

// registerWithDescriptor builds a new read-only layer from the tar stream ts
// on top of parent and records it in the store.
//
// The visible steps are, in order:
//  1. resolve the parent layer (if any) and check the depth limit,
//  2. ask the graph driver to create storage for a freshly generated cache ID,
//  3. start a metadata transaction,
//  4. apply the tar stream to the new layer,
//  5. compute the chain ID and store the layer metadata,
//  6. under ls.layerL, either return a reference to an already registered
//     layer with the same chain ID or commit and insert the new layer.
//
// It returns ErrLayerDoesNotExist when the parent cannot be found and
// ErrMaxDepthExceeded when the parent is already at maxLayerDepth. Any other
// error from the driver, the transaction, applyTar or storeLayer is returned
// unchanged.
//
// Deferred functions run in reverse order of registration: the layerL unlock
// first, then the driver/transaction cleanup, then the parent release (which
// takes layerL again).
func (ls *layerStore) registerWithDescriptor(ts io.Reader, parent ChainID, descriptor distribution.Descriptor) (Layer, error) {
    // err is used to hold the error which will always trigger
    // cleanup of creates sources but may not be an error returned
    // to the caller (already exists).
    var err error
    var pid string
    var p *roLayer
    if string(parent) != "" {
        // Look the parent up through ls.get (per the original note, this reads
        // directly from the in-memory layer map).
        p = ls.get(parent)
        if p == nil {
            return nil, ErrLayerDoesNotExist
        }
        pid = p.cacheID
        // Release parent chain if error
        defer func() {
            if err != nil {
                ls.layerL.Lock()
                ls.releaseLayer(p)
                ls.layerL.Unlock()
            }
        }()
        if p.depth() >= maxLayerDepth {
            // Assign to the shared err so the deferred release above fires.
            err = ErrMaxDepthExceeded
            return nil, err
        }
    }

    // Create new roLayer
    layer := &roLayer{
        parent:         p,
        cacheID:        stringid.GenerateRandomID(),
        referenceCount: 1,
        layerStore:     ls,
        references:     map[Layer]struct{}{},
        descriptor:     descriptor,
    }
    // When there is no parent, pid is the empty string here. Per the original
    // note, the driver then creates a plain "root" directory; when the parent
    // has a root, it sets up an overlay instead (driver-specific, verify
    // against the driver's Create).
    if err = ls.driver.Create(layer.cacheID, pid, "", nil); err != nil {
        return nil, err
    }

    // err is already declared in this scope, so := only declares tx and
    // assigns to the shared err.
    // NOTE(review): if StartTransaction fails, the cleanup defer below has not
    // been registered yet, so ls.driver.Remove is not called for the layer
    // created just above.
    tx, err := ls.store.StartTransaction()
    if err != nil {
        return nil, err
    }

    // On any later failure (or the "already exists" case), remove the driver
    // storage and cancel the metadata transaction.
    defer func() {
        if err != nil {
            logrus.Debugf("Cleaning up layer %s: %v", layer.cacheID, err)
            if err := ls.driver.Remove(layer.cacheID); err != nil {
                logrus.Errorf("Error cleaning up cache layer %s: %v", layer.cacheID, err)
            }
            if err := tx.Cancel(); err != nil {
                logrus.Errorf("Error canceling metadata transaction %q: %s", tx.String(), err)
            }
        }
    }()
    // Apply the tar stream to the new layer.
    if err = ls.applyTar(tx, ts, pid, layer); err != nil {
        return nil, err
    }

    // A base layer's chain ID is its diff ID; otherwise it is derived from the
    // parent's chain ID and this layer's diff ID.
    if layer.parent == nil {
        layer.chainID = ChainID(layer.diffID)
    } else {

        layer.chainID = createChainIDFromParent(layer.parent.chainID, layer.diffID)
    }

    if err = storeLayer(tx, layer); err != nil {
        return nil, err
    }

    ls.layerL.Lock()
    defer ls.layerL.Unlock()

    if existingLayer := ls.getWithoutLock(layer.chainID); existingLayer != nil {
        // Set error for cleanup, but do not return the error
        err = errors.New("layer already exists")
        return existingLayer.getReference(), nil
    }

    if err = tx.Commit(layer.chainID); err != nil {
        return nil, err
    }
    // Register the layer in the in-memory map.
    ls.layerMap[layer.chainID] = layer

    return layer.getReference(), nil
}

这个过程的前两步是依赖驱动来完成的,对于overlay驱动来说,有两种情景:
第一,处理的镜像层有父层
第二,处理的镜像层没有父层,也就是镜像层是基础镜像层
我们先看下overlay驱动的初始化:

// Init builds the overlay graph driver rooted at home and returns it wrapped
// in a NaiveDiff driver with a custom ApplyDiff.
//
// It returns graphdriver.ErrNotSupported when supportsOverlay reports an
// error, and graphdriver.ErrIncompatibleFS when home lives on aufs, btrfs,
// overlay, zfs or ecryptfs. Errors from detecting the filesystem, resolving
// the root UID/GID, creating home or making the mount private are returned
// unchanged.
func Init(home string, options []string, uidMaps, gidMaps []idtools.IDMap) (graphdriver.Driver, error) {
    if supportsOverlay() != nil {
        return nil, graphdriver.ErrNotSupported
    }

    magic, err := graphdriver.GetFSMagic(home)
    if err != nil {
        return nil, err
    }
    // Remember the backing filesystem's name when the magic number is known.
    if name, known := graphdriver.FsNames[magic]; known {
        backingFs = name
    }

    // Refuse backing filesystems that overlay cannot be stacked on.
    switch magic {
    case graphdriver.FsMagicAufs,
        graphdriver.FsMagicBtrfs,
        graphdriver.FsMagicOverlay,
        graphdriver.FsMagicZfs,
        graphdriver.FsMagicEcryptfs:
        logrus.Errorf("'overlay' is not supported over %s", backingFs)
        return nil, graphdriver.ErrIncompatibleFS
    }

    uid, gid, err := idtools.GetRootUIDGID(uidMaps, gidMaps)
    if err != nil {
        return nil, err
    }

    // Create the driver home directory; an "already exists" error is tolerated.
    err = idtools.MkdirAllAs(home, 0700, uid, gid)
    if err != nil && !os.IsExist(err) {
        return nil, err
    }

    if err = mount.MakePrivate(home); err != nil {
        return nil, err
    }

    overlayDriver := &Driver{
        home:    home,
        uidMaps: uidMaps,
        gidMaps: gidMaps,
        ctr:     graphdriver.NewRefCounter(graphdriver.NewFsChecker(graphdriver.FsMagicOverlay)),
    }
    return NaiveDiffDriverWithApply(overlayDriver, uidMaps, gidMaps), nil
}

// NaiveDiffDriverWithApply wraps driver in a NaiveDiff driver and keeps
// driver itself as the custom ApplyDiff implementation that is tried first.
func NaiveDiffDriverWithApply(driver ApplyDiffProtoDriver, uidMaps, gidMaps []idtools.IDMap) graphdriver.Driver {
    wrapped := new(naiveDiffDriverWithApply)
    wrapped.Driver = graphdriver.NewNaiveDiffDriver(driver, uidMaps, gidMaps)
    wrapped.applyDiff = driver
    return wrapped
}

可以看到返回的是naiveDiffDriverWithApply,而naiveDiffDriverWithApply包含两个对象Driver和applyDiff。
根据go语言特性,第一步调用

    if err = ls.driver.Create(layer.cacheID, pid, "", nil); err != nil {
        return nil, err
    }

由于naiveDiffDriverWithApply内嵌了Driver,而docker\daemon\graphdriver\overlay\overlay.go中的Driver实现了Create,所以这里最终调用的是该Driver的Create函数:

// Create prepares the on-disk directory layout for the layer id.
//
//   - With no parent, the layer directory gets a single "root" directory.
//   - With a parent that has a "root" directory, the layer gets "upper"
//     (using the mode of the parent's root), "work" and "merged" directories
//     plus a "lower-id" file containing the parent ID.
//   - Otherwise the parent's "lower-id" is copied, the same three directories
//     are created ("upper" with the mode of the parent's upper), and the
//     parent's "upper" contents are copied into the new "upper".
//
// storageOpt must be empty; mountLabel is not used by the visible code.
// Once the layer directory has been created, any failure removes it again.
func (d *Driver) Create(id, parent, mountLabel string, storageOpt map[string]string) (retErr error) {
    if len(storageOpt) != 0 {
        return fmt.Errorf("--storage-opt is not supported for overlay")
    }

    layerDir := d.dir(id)

    rootUID, rootGID, err := idtools.GetRootUIDGID(d.uidMaps, d.gidMaps)
    if err != nil {
        return err
    }

    // path.Dir(layerDir) is the path without its last element: make sure the
    // containing directory exists, then create the layer's own directory.
    if err = idtools.MkdirAllAs(path.Dir(layerDir), 0700, rootUID, rootGID); err != nil {
        return err
    }
    if err = idtools.MkdirAs(layerDir, 0700, rootUID, rootGID); err != nil {
        return err
    }

    // From this point on, a failed Create removes the layer directory.
    defer func() {
        if retErr != nil {
            os.RemoveAll(layerDir)
        }
    }()

    // mkSub creates a sub-directory of the layer directory owned by the
    // remapped root user.
    mkSub := func(name string, mode os.FileMode) error {
        return idtools.MkdirAs(path.Join(layerDir, name), mode, rootUID, rootGID)
    }

    // Top-level images are just a "root" directory.
    if parent == "" {
        return mkSub("root", 0755)
    }

    // Otherwise set up the upper, work and merged directories.
    logrus.Debugf("Make layer dir")
    parentDir := d.dir(parent)

    // The parent layer directory must exist.
    if _, err = os.Lstat(parentDir); err != nil {
        return err
    }

    // If the parent has a "root" directory, overlay directly on top of it.
    parentRootInfo, rootErr := os.Lstat(path.Join(parentDir, "root"))
    if rootErr == nil {
        if err = mkSub("upper", parentRootInfo.Mode()); err != nil {
            return err
        }
        for _, name := range []string{"work", "merged"} {
            if err = mkSub(name, 0700); err != nil {
                return err
            }
        }
        return ioutil.WriteFile(path.Join(layerDir, "lower-id"), []byte(parent), 0666)
    }

    // Otherwise inherit the parent's lower-id and start from a copy of the
    // parent's upper directory.
    lowerID, err := ioutil.ReadFile(path.Join(parentDir, "lower-id"))
    if err != nil {
        return err
    }
    if err = ioutil.WriteFile(path.Join(layerDir, "lower-id"), lowerID, 0666); err != nil {
        return err
    }

    parentUpperDir := path.Join(parentDir, "upper")
    parentUpperInfo, err := os.Lstat(parentUpperDir)
    if err != nil {
        return err
    }

    if err = mkSub("upper", parentUpperInfo.Mode()); err != nil {
        return err
    }
    for _, name := range []string{"work", "merged"} {
        if err = mkSub(name, 0700); err != nil {
            return err
        }
    }

    // Copy the parent's upper contents into the new layer's upper directory.
    return copyDir(parentUpperDir, path.Join(layerDir, "upper"), 0)
}

阅读代码可以得知,如果有父层(必然也有父层目录的root目录,遇到的情况似乎都是这样),则会在本镜像层目录下建立upper、work、merged三个目录和lower-id文件,然后返回。如果没有父层,即镜像层本身是基础镜像层,则直接在本镜像层目录下建一个root子文件夹,然后返回。

根据go语言特性,第二步调用:

    //应用tar包
    if err = ls.applyTar(tx, ts, pid, layer); err != nil {
        return nil, err
    }

调用的是docker\daemon\graphdriver\overlay\overlay.go的naiveDiffDriverWithApply的成员ApplyDiff:


// ApplyDiff applies diff to the layer id. It tries the custom applyDiff
// implementation first; only when that reports ErrApplyDiffFallback does it
// delegate to the wrapped NaiveDiff driver. Any other result, success or
// error, is returned as-is.
func (d *naiveDiffDriverWithApply) ApplyDiff(id, parent string, diff archive.Reader) (int64, error) {
    size, err := d.applyDiff.ApplyDiff(id, parent, diff)
    if err != ErrApplyDiffFallback {
        return size, err
    }

    // d.Driver is the value built by graphdriver.NewNaiveDiffDriver in
    // NaiveDiffDriverWithApply; per the original note its implementation
    // lives in docker\daemon\graphdriver\fsdiff.go.
    return d.Driver.ApplyDiff(id, parent, diff)
}

可以看到naiveDiffDriverWithApply.ApplyDiff首先会尝试调用d.applyDiff.ApplyDiff,只有当它返回ErrApplyDiffFallback时才会改为调用d.Driver.ApplyDiff,其他错误则直接返回给调用者。
d.applyDiff.ApplyDiff也就是docker\daemon\graphdriver\overlay\overlay.go的Driver的成员函数ApplyDiff:

// ApplyDiff applies diff for the layer id on top of a hard-linked copy of the
// parent's "root" directory and returns the size reported by
// graphdriver.ApplyUncompressedLayer.
//
// It returns ErrApplyDiffFallback when parent is empty or when the parent has
// no "root" directory, signalling that the caller should use another
// implementation.
//
// On success the layer ends up with a "root" directory, and its "upper",
// "work", "merged" and "lower-id" entries are removed. On failure the
// temporary directory is removed instead.
func (d *Driver) ApplyDiff(id string, parent string, diff archive.Reader) (size int64, err error) {
    layerDir := d.dir(id)

    if parent == "" {
        logrus.Debugf("Applied tar on err,no parent")
        return 0, ErrApplyDiffFallback
    }
    logrus.Debugf("Applied tar on parent:%s", parent)

    // Only continue when the parent layer has a "root" directory.
    parentRootDir := path.Join(d.dir(parent), "root")
    if _, statErr := os.Stat(parentRootDir); statErr != nil {
        return 0, ErrApplyDiffFallback
    }

    // We now know there is a parent, and it has a "root" directory containing
    // the full root filesystem. We can just hardlink it and apply the
    // layer. This relies on two things:
    // 1) ApplyDiff is only run once on a clean (no writes to upper layer) container
    // 2) ApplyDiff doesn't do any in-place writes to files (would break hardlinks)
    // These are all currently true and are not expected to break

    // Build the new root under a temporary name inside the layer directory, so
    // that "root" only appears after the diff has been applied.
    tmpRootDir, err := ioutil.TempDir(layerDir, "tmproot")
    if err != nil {
        return 0, err
    }

    // The named result err decides which cleanup runs when the function exits.
    defer func() {
        if err != nil {
            os.RemoveAll(tmpRootDir)
            return
        }
        for _, name := range []string{"upper", "work", "merged", "lower-id"} {
            os.RemoveAll(path.Join(layerDir, name))
        }
    }()

    // Hard-link everything from the parent's root into the temporary root.
    // Applying the diff afterwards points same-named directory entries at new
    // inodes while the parent's inodes remain untouched.
    if err = copyDir(parentRootDir, tmpRootDir, copyHardlink); err != nil {
        return 0, err
    }

    // Per the original note, this ends up in applyLayerHandler, implemented in
    // docker\docker\pkg\chrootarchive\diff_unix.go.
    tarOptions := &archive.TarOptions{UIDMaps: d.uidMaps, GIDMaps: d.gidMaps}
    size, err = graphdriver.ApplyUncompressedLayer(tmpRootDir, diff, tarOptions)
    if err != nil {
        return 0, err
    }

    // Publish the finished tree under its final name.
    if err = os.Rename(tmpRootDir, path.Join(layerDir, "root")); err != nil {
        return 0, err
    }

    return size, nil
}

对于有父层的镜像层,会先在镜像层目录下建一个临时目录(名称以tmproot为前缀),然后将父层root目录的所有内容以硬链接的方式建立到该目录中,再把本层tar包中的新数据应用到这个目录上,覆盖其中同名的硬链接;成功后把该临时目录改名为root,最后删除之前Create建立的upper、work、merged目录和lower-id文件(说实在话这是什么鬼,建了删,建了改)。由linux硬链接的特性可知,对于同名的文件,文件名(目录项对象)将指向新的文件(子层文件的inode),其他文件名仍指向父层文件(父层inode)。这样就完成了镜像层的合并。

对于没有父层的镜像层,这个更简单,调用上述函数将出错返回,然后调用docker\daemon\graphdriver\fsdiff.go的NaiveDiffDriver的成员ApplyDiff:

// ApplyDiff extracts the changeset from the given diff into the
// layer with the specified id and parent, returning the size of the
// new layer in bytes.
//
// The layer's filesystem is obtained with ProtoDriver.Get and released with
// Put when the function returns. Errors from Get or from
// ApplyUncompressedLayer are returned unchanged. parent is not used by the
// visible code.
func (gdw *NaiveDiffDriver) ApplyDiff(id, parent string, diff archive.Reader) (size int64, err error) {
    driver := gdw.ProtoDriver

    // Mount the root filesystem so we can apply the diff/layer.
    // Per the original note, for the overlay driver this is really just a
    // path lookup: a layer that has a "root" directory gets that directory
    // back (presumably; verify against the driver's Get).
    layerFs, err := driver.Get(id, "")
    if err != nil {
        return 0, err
    }
    defer driver.Put(id)

    options := &archive.TarOptions{UIDMaps: gdw.uidMaps,
        GIDMaps: gdw.gidMaps}
    start := time.Now().UTC()
    // Debugf, not Debug: the message contains a %s verb that must be expanded.
    logrus.Debugf("ApplyUncompressedLayer to:%s", layerFs)
    if size, err = ApplyUncompressedLayer(layerFs, diff, options); err != nil {
        return size, err
    }
    logrus.Debugf("Untar time: %vs", time.Now().UTC().Sub(start).Seconds())

    return size, nil
}

对于没有父层的镜像层,直接将tar包解压到镜像层目录下的root文件夹(该文件夹在Create阶段已经建好)。可以看到,docker的overlay驱动处理镜像层合并问题,采用的是将底层镜像层的内容以硬链接方式建立到子层的方法。如果底层镜像层文件比较多,而镜像又有很多层,会出现什么问题?因为文件系统划分的元数据区大小是有限的,每一个新层都要为底层的所有文件建立硬链接;硬链接也就是目录项对象,这些目录项对象保存在目录(一种特殊文件,每个目录也占用一个inode)中,而这些信息是存储在元数据区的。这样文件系统的数据区还没有用完,大量的inode就已经把元数据区占满了——这就是inode耗尽问题。

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值