Bolt stores its data in a single db file, and every Bolt operation starts from that file.
1. Initializing and loading the db file
The Open function implements opening the db file:
func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
    var db = &DB{opened: true}

    // Set default options if no options are provided.
    if options == nil {
        options = DefaultOptions
    }
    // Copy the options; these fields were described earlier, so they are not repeated here.
    db.NoGrowSync = options.NoGrowSync
    db.MmapFlags = options.MmapFlags

    // Set default values for later DB operations.
    db.MaxBatchSize = DefaultMaxBatchSize
    db.MaxBatchDelay = DefaultMaxBatchDelay
    db.AllocSize = DefaultAllocSize

    flag := os.O_RDWR
    if options.ReadOnly { // open in read-only mode
        flag = os.O_RDONLY
        db.readOnly = true
    }

    // Open the file handle.
    db.path = path
    var err error
    if db.file, err = os.OpenFile(db.path, flag|os.O_CREATE, mode); err != nil {
        _ = db.close()
        return nil, err
    }

    // Lock the file so that other processes using Bolt in read-write mode cannot
    // use the database at the same time. In read-write mode the file is locked
    // exclusively; in read-only mode (options.ReadOnly) a shared lock is used,
    // so multiple processes may hold the lock simultaneously.
    if err := flock(db, mode, !db.readOnly, options.Timeout); err != nil {
        _ = db.close()
        return nil, err
    }

    // Default values for test hooks.
    db.ops.writeAt = db.file.WriteAt

    // Initialize the database if it doesn't exist.
    if info, err := db.file.Stat(); err != nil {
        return nil, err
    } else if info.Size() == 0 { // the db file is empty (just created)
        // Initialize the first pages.
        if err := db.init(); err != nil {
            return nil, err
        }
    } else { // the db file already exists
        // Read the first meta page to determine the page size.
        var buf [0x1000]byte
        if _, err := db.file.ReadAt(buf[:], 0); err == nil {
            m := db.pageInBuffer(buf[:], 0).meta()
            if err := m.validate(); err != nil {
                // If we can't read the page size, we can assume it's the same
                // as the OS -- since that's how the page size was chosen in the
                // first place.
                //
                // If the first page is invalid and this OS uses a different
                // page size than what the database was created with then we
                // are out of luck and cannot access the database.
                db.pageSize = os.Getpagesize()
            } else {
                db.pageSize = int(m.pageSize)
            }
        }
    }

    // Initialize the page pool.
    db.pagePool = sync.Pool{
        New: func() interface{} {
            return make([]byte, db.pageSize)
        },
    }

    // Memory-map the data file.
    if err := db.mmap(options.InitialMmapSize); err != nil {
        _ = db.close()
        return nil, err
    }

    // Read in the free pages and load them into the freelist.
    db.freelist = newFreelist()
    db.freelist.read(db.page(db.meta().freelist))

    // Mark the database as opened and return.
    return db, nil
}
If the db file did not previously exist, init is called to initialize it; otherwise the first 4KB of the file are read and the required information is extracted from them to finish loading. At the end of the function the file is memory-mapped and the free pages are loaded.
func (db *DB) init() error {
    // Set the page size to the OS page size.
    db.pageSize = os.Getpagesize()

    // Create two meta pages on a buffer; the first four pages are reserved.
    buf := make([]byte, db.pageSize*4)
    for i := 0; i < 2; i++ { // initialize the two meta pages
        p := db.pageInBuffer(buf[:], pgid(i)) // locate page i inside the buffer
        p.id = pgid(i)
        p.flags = metaPageFlag

        // Initialize the meta page.
        m := p.meta()
        m.magic = magic
        m.version = version
        m.pageSize = uint32(db.pageSize)
        m.freelist = 2           // the freelist lives at page 2
        m.root = bucket{root: 3} // the root bucket's root page is page 3
        m.pgid = 4               // high-water mark: the next page id to be allocated
        m.txid = txid(i)
        m.checksum = m.sum64()
    }

    // Write an empty freelist at page 2 (the third page).
    p := db.pageInBuffer(buf[:], pgid(2))
    p.id = pgid(2)
    p.flags = freelistPageFlag
    p.count = 0

    // Write an empty leaf page at page 3 (the fourth page), the root bucket's root.
    p = db.pageInBuffer(buf[:], pgid(3))
    p.id = pgid(3)
    p.flags = leafPageFlag
    p.count = 0

    // Write the buffer to the data file.
    if _, err := db.ops.writeAt(buf, 0); err != nil {
        return err
    }
    // Sync the file to disk.
    if err := fdatasync(db); err != nil {
        return err
    }
    return nil
}
As you can see, Open is fairly simple: on first use it opens the file and writes a fixed layout into the first four pages, namely two meta pages, an empty freelist at page 2, and an empty leaf at page 3 that becomes the root bucket's root.
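From the caller's perspective all of this sits behind a single call. A minimal usage sketch of the public API (assuming the github.com/boltdb/bolt import path; the file name and timeout are arbitrary):

package main

import (
    "log"
    "time"

    "github.com/boltdb/bolt"
)

func main() {
    // Open (and create, if needed) my.db; the Timeout option bounds how long
    // we wait for the file lock taken by flock inside Open.
    db, err := bolt.Open("my.db", 0600, &bolt.Options{Timeout: 1 * time.Second})
    if err != nil {
        log.Fatal(err)
    }
    defer db.Close()
}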
2. Looking up / creating a bucket
Looking up a bucket is the precondition for creating one: before a bucket is created, Bolt checks whether it already exists; if it does, an error is returned, and only if it does not is the bucket created. The lookup runs inside a transaction.
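Both operations are exposed on the transaction. A short usage sketch, reusing the db handle from the previous example (the bucket name is arbitrary):

func lookupOrCreate(db *bolt.DB, name []byte) error {
    // Read-only lookup: Tx.Bucket returns nil if the bucket does not exist.
    err := db.View(func(tx *bolt.Tx) error {
        if tx.Bucket(name) == nil {
            return bolt.ErrBucketNotFound
        }
        return nil
    })
    if err == nil {
        return nil // the bucket already exists
    }

    // Creation needs a writable transaction; CreateBucket returns
    // ErrBucketExists if the bucket is already there.
    return db.Update(func(tx *bolt.Tx) error {
        _, err := tx.CreateBucket(name)
        return err
    })
}

Internally, Tx.Bucket simply delegates to the transaction's root bucket: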
func (tx *Tx) Bucket(name []byte) *Bucket {
    return tx.root.Bucket(name)
}

func (b *Bucket) Bucket(name []byte) *Bucket {
    // Return the cached child bucket if it has already been opened in this tx.
    if b.buckets != nil {
        if child := b.buckets[string(name)]; child != nil {
            return child
        }
    }

    // Create a cursor for the lookup.
    c := b.Cursor()
    // Search the current bucket for the value stored under name.
    k, v, flags := c.seek(name)

    // The key does not exist, or it is not a bucket: stop here.
    if !bytes.Equal(name, k) || (flags&bucketLeafFlag) == 0 {
        return nil
    }

    // Otherwise open the bucket and cache it.
    var child = b.openBucket(v)
    if b.buckets != nil {
        b.buckets[string(name)] = child
    }
    return child
}
seek is the core of the lookup: it binary-searches the B+ tree for the key. Because parts of the B+ tree may already be cached in memory, the traversal switches between in-memory nodes and on-disk (mmap'd) pages.
func (c *Cursor) seek(seek []byte) (key []byte, value []byte, flags uint32) {
    _assert(c.bucket.tx.db != nil, "tx closed")

    // Start from the root page/node and traverse down to the correct page.
    c.stack = c.stack[:0]
    c.search(seek, c.bucket.root)   // search starting at the bucket's root page
    ref := &c.stack[len(c.stack)-1] // the last position reached by the search

    // Nothing found: the cursor points past the end of the page/node.
    if ref.index >= ref.count() {
        return nil, nil, 0
    }

    // Return the key/value at the cursor position.
    return c.keyValue()
}
func (c *Cursor) search(key []byte, pgid pgid) {
    p, n := c.bucket.pageNode(pgid) // fetch the page/node from memory or from the mmap
    if p != nil && (p.flags&(branchPageFlag|leafPageFlag)) == 0 {
        panic(fmt.Sprintf("invalid page type: %d: %x", p.id, p.flags))
    }
    e := elemRef{page: p, node: n}
    c.stack = append(c.stack, e) // record the search path

    // Leaf node: look up the key (and its value, if any) here.
    if e.isLeaf() {
        c.nsearch(key)
        return
    }

    if n != nil {
        c.searchNode(key, n) // descend through the in-memory node
        return
    }
    c.searchPage(key, p) // descend through the on-disk page
}
search is recursive: it prefers the in-memory node and falls back to the page when no node has been materialized, descending until it reaches a leaf, where the final value is looked up. With seek covered, let's return to the bucket lookup: once the value has been found, the bucket is opened with openBucket():
func (b *Bucket) openBucket(value []byte) *Bucket {
    var child = newBucket(b.tx)

    // If unaligned load/stores are broken on this arch and value is
    // unaligned simply clone to an aligned byte array.
    unaligned := brokenUnaligned && uintptr(unsafe.Pointer(&value[0]))&3 != 0
    if unaligned {
        value = cloneBytes(value)
    }

    // If this is a writable transaction we need to copy the bucket entry;
    // read-only transactions can point directly into the mmap'd data. The
    // bucket header's root field records the bucket's root page.
    if b.tx.writable && !unaligned {
        child.bucket = &bucket{}
        *child.bucket = *(*bucket)(unsafe.Pointer(&value[0]))
    } else {
        child.bucket = (*bucket)(unsafe.Pointer(&value[0]))
    }

    // If the bucket is inline (root == 0), keep a reference to its inline page,
    // which sits right after the bucket header inside the value.
    if child.root == 0 {
        child.page = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
    }
    return &child
}
The first half of CreateBucket is the lookup plus a few checks, which we skip here; let's go straight to the key part:
func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
    // ... existence check and cursor positioning elided; c is the cursor
    // already positioned at key by c.seek(key) ...

    // Create an empty, inline bucket.
    var bucket = Bucket{
        bucket:      &bucket{},
        rootNode:    &node{isLeaf: true},
        FillPercent: DefaultFillPercent,
    }
    var value = bucket.write() // serialize the bucket into a value

    // Insert it into the node, flagged as a bucket leaf element.
    key = cloneBytes(key)
    c.node().put(key, key, value, 0, bucketLeafFlag)

    // Since subbuckets are not allowed on inline buckets, we need to
    // dereference the inline page, if it exists. This will cause the bucket
    // to be treated as a regular, non-inline bucket for the rest of the tx.
    b.page = nil

    return b.Bucket(key), nil
}
First, let's look at how the bucket's value is generated. The value layout is:
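The value is a bucket header, optionally followed by an inline page. For reference, the header is bolt's small bucket struct (a sketch mirroring bucket.go; bucketHeaderSize is simply its size):

// The bucket header that Bucket.write places at value[0] (mirrors bolt's bucket.go).
type bucket struct {
    root     pgid   // page id of the bucket's root page; 0 means the bucket is inline
    sequence uint64 // monotonically incrementing counter used by NextSequence()
}

// For an inline bucket, the serialized root node (written as a fake leaf page)
// follows immediately after this header, at value[bucketHeaderSize].

Bucket.write fills exactly this layout: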
func (b *Bucket) write() []byte {
    // Allocate a buffer of the appropriate size.
    var n = b.rootNode
    var value = make([]byte, bucketHeaderSize+n.size())

    // Write the bucket header.
    var bucket = (*bucket)(unsafe.Pointer(&value[0]))
    *bucket = *b.bucket // copy the bucket header data

    // Treat the rest of the byte slice as a fake page and write the root node into it.
    var p = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
    n.write(p) // fill in the page portion
    return value
}
node.write serializes the node into that fake page:
func (n *node) write(p *page) {
    // Initialize the page flags.
    if n.isLeaf {
        p.flags |= leafPageFlag // mark as a leaf page
    } else {
        p.flags |= branchPageFlag // mark as a branch page
    }

    if len(n.inodes) >= 0xFFFF {
        panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
    }
    p.count = uint16(len(n.inodes)) // number of elements on the page

    // Stop here if there are no items to write.
    if p.count == 0 {
        return
    }

    // Loop over each inode of the node and write it into the page: the element
    // headers come first, the key/value bytes are appended after them.
    b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
    for i, item := range n.inodes {
        _assert(len(item.key) > 0, "write: zero-length inode key")

        // Write the page element header.
        if n.isLeaf {
            elem := p.leafPageElement(uint16(i))
            elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
            elem.flags = item.flags
            elem.ksize = uint32(len(item.key))
            elem.vsize = uint32(len(item.value))
        } else {
            elem := p.branchPageElement(uint16(i))
            elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
            elem.ksize = uint32(len(item.key))
            elem.pgid = item.pgid
            _assert(elem.pgid != p.id, "write: circular dependency occurred")
        }

        // If there is not enough remaining length in b for the key and value,
        // re-derive the slice from its current start so the copies below stay in range.
        klen, vlen := len(item.key), len(item.value)
        if len(b) < klen+vlen {
            b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:]
        }

        // Write data for the element to the end of the page.
        copy(b[0:], item.key)
        b = b[klen:]
        copy(b[0:], item.value)
        b = b[vlen:]
    }

    // DEBUG ONLY: n.dump()
}
The bucket has now been serialized into a value. Back in CreateBucket, Cursor.node() determines which node the key/value pair should be inserted into: as mentioned earlier, seek records the search path for the key, and that path tells us exactly which node the pair belongs in.
func (c *Cursor) node() *node {
    _assert(len(c.stack) > 0, "accessing a node with a zero-length cursor stack")

    // If the top of the stack is already a materialized leaf node, return it directly.
    if ref := &c.stack[len(c.stack)-1]; ref.node != nil && ref.isLeaf() {
        return ref.node
    }

    // If the root node has not been materialized, load it from the root page.
    var n = c.stack[0].node
    if n == nil {
        // For a fresh database the root page id is 3, as set up by init() above.
        n = c.bucket.node(c.stack[0].page.id, nil)
    }

    // Walk down from the root, materializing every node along the recorded path.
    for _, ref := range c.stack[:len(c.stack)-1] {
        _assert(!n.isLeaf, "expected branch node")
        n = n.childAt(int(ref.index))
    }
    _assert(n.isLeaf, "expected leaf node")
    return n
}
The node loading itself is done by Bucket.node():
func (b *Bucket) node(pgid pgid, parent *node) *node {
    _assert(b.nodes != nil, "nodes map expected")

    // Return the node if it is already cached.
    if n := b.nodes[pgid]; n != nil {
        return n
    }

    // Otherwise create a new node and wire up the parent/child relationship.
    n := &node{bucket: b, parent: parent}
    if parent == nil {
        b.rootNode = n
    } else {
        parent.children = append(parent.children, n)
    }

    // Use the inline page if this is an inline bucket; otherwise look the page
    // up via the transaction (which reads from the db's mmap'd data).
    var p = b.page
    if p == nil {
        p = b.tx.page(pgid)
    }

    // Read the page data into the node and its inodes, then cache the node.
    n.read(p)
    b.nodes[pgid] = n

    // Update statistics.
    b.tx.stats.NodeCount++
    return n
}
At this point everything is in place for the key/value insertion, which is performed by node.put():
func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
    if pgid >= n.bucket.tx.meta.pgid {
        panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
    } else if len(oldKey) <= 0 {
        panic("put: zero-length old key")
    } else if len(newKey) <= 0 {
        panic("put: zero-length new key")
    }

    // Find the insertion index with a binary search.
    index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })

    // Add capacity and shift inodes if we don't have an exact match and need to insert.
    exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
    if !exact {
        n.inodes = append(n.inodes, inode{})
        copy(n.inodes[index+1:], n.inodes[index:])
    }

    inode := &n.inodes[index]
    inode.flags = flags
    inode.key = newKey
    inode.value = value
    inode.pgid = pgid
    _assert(len(inode.key) > 0, "put: zero-length inode key")
}
This function is straightforward: it finds the right position and inserts the key/value pair there. With that, bucket creation is complete.
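Putting it together from the public API side, a minimal sketch that creates a bucket and inserts a key (the names are arbitrary); the Put call below walks the same cursor/seek path and ends in node.put as described above:

func createAndFill(db *bolt.DB) error {
    return db.Update(func(tx *bolt.Tx) error {
        // CreateBucketIfNotExists wraps the lookup-then-create flow shown above.
        b, err := tx.CreateBucketIfNotExists([]byte("widgets"))
        if err != nil {
            return err
        }
        // Bucket.Put seeks to the key's position and calls node.put.
        return b.Put([]byte("answer"), []byte("42"))
    })
}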