英文原文链接:https://cstack.github.io/db_tutorial/parts/part8.html
我们将把表的格式从无序的行数组改成B-tree。这个改动比较大,需要用几节文章来完成。在这一节,我们定义叶子节点的布局,并支持把key/value对插入到单节点树中。
可选的表格式
当前的格式,每页只存储了rows(不包括元数据),因此从空间上讲是非常高效的。数据插入操作也非常快,因为我们只需要在表尾追加。但是,想要查找某一行数据,只能扫描全表。如果我们想要删除一行,需要把该行后面的所有数据向前移动,以填补删除留下的空洞。
如果我们把表存储为一个按ID排序的有序数组,就可以基于ID使用二分法进行查询。但这样插入会很慢,因为我们不得不挪动很多行来腾出空间。
如果使用tree结构呢?树中每个节点保存的行数是不固定的,所以我们需要在每个节点中保存额外的信息来记录它包含多少行。此外,所有的内部节点都不存储任何行数据,这些也是额外的空间开销。以数据库文件变大为代价,我们换来了快速的插入、删除和查找。
归纳如下:
对比项 | Unsorted Array of rows | Sorted Array of rows | Tree of nodes |
Pages contain | only data | only data | metadata, primary keys, and data |
Rows per page | more | more | fewer |
Insertion | O(1) | O(n) | O(log(n)) |
Deletion | O(n) | O(n) | O(log(n)) |
Lookup by id | O(n) | O(log(n)) | O(log(n)) |
节点头format
叶节点和内部节点具有不同的布局。让我们创建一个枚举来跟踪节点类型:
/* Kind of B-tree node held in a page: an internal node or a leaf node. */
typedef enum NodeType_t {
  NODE_INTERNAL,
  NODE_LEAF
} NodeType;
每个节点将对应一个页面。内部节点存储了孩子节点的页码。btree向Pager询问特定页码,并得到指向页面缓存的指针。页面按页码顺序依次存储在数据库文件中。
节点需要在页面头中存储一些元数据。包括节点类型,是否是根节点,以及指向其父节点的指针(用来查找兄弟节点)。我为每个头字段的大小和偏移量定义几个常量:
/*
 * Common Node Header Layout
 *
 * Fields are laid out back to back: node type, is-root flag, parent pointer.
 * Each offset is the sum of the sizes of the fields that precede it, and the
 * header size is the sum of all three field sizes.
 */
const uint32_t NODE_TYPE_SIZE = sizeof(uint8_t);
const uint32_t NODE_TYPE_OFFSET = 0;
const uint32_t IS_ROOT_SIZE = sizeof(uint8_t);
const uint32_t IS_ROOT_OFFSET = NODE_TYPE_SIZE;
const uint32_t PARENT_POINTER_SIZE = sizeof(uint32_t);
const uint32_t PARENT_POINTER_OFFSET = IS_ROOT_OFFSET + IS_ROOT_SIZE;
const uint8_t COMMON_NODE_HEADER_SIZE =
    NODE_TYPE_SIZE + IS_ROOT_SIZE + PARENT_POINTER_SIZE;
叶子节点格式
除了这些公共的头字段外,叶子节点还需要存储它包含了多少个cell,每个cell是一个键值对。
/*
 * Leaf Node Header Layout
 *
 * The cell count is stored right after the common node header, so the leaf
 * header size is the common header size plus the size of the cell count.
 */
const uint32_t LEAF_NODE_NUM_CELLS_SIZE = sizeof(uint32_t);
const uint32_t LEAF_NODE_NUM_CELLS_OFFSET = COMMON_NODE_HEADER_SIZE;
const uint32_t LEAF_NODE_HEADER_SIZE =
    COMMON_NODE_HEADER_SIZE + LEAF_NODE_NUM_CELLS_SIZE;
归纳一下,叶子节点的格式如下:
在头部用一整个字节来存放一个布尔值(例如是否是根节点),空间上有一点浪费,但是访问这些值的代码写起来更加方便。
注意到,页尾会有一些空间被浪费。我们在header之后尽可能多地存储cell,但是剩下的空间可能存放不下一个完整的cell,这种情况下就不再使用这些空间,避免一个cell被拆分到两个node中。
如何访问叶子节点字段
+/* Returns a pointer to the cell-count field of a leaf node.
+ * NOTE(review): the field sits at LEAF_NODE_NUM_CELLS_OFFSET bytes into the
+ * page; whether that offset is suitably aligned for uint32_t depends on the
+ * header constants - confirm on strict-alignment targets. */
+uint32_t* leaf_node_num_cells(void* node) {
+  /* Do the byte arithmetic on char*, then convert explicitly: char* does not
+   * convert implicitly to uint32_t*. */
+  return (uint32_t*)((char*)node + LEAF_NODE_NUM_CELLS_OFFSET);
+}
+/* Returns a pointer to the start of cell number cell_num in a leaf node. */
+void* leaf_node_cell(void* node, uint32_t cell_num) {
+  return (char*)node + LEAF_NODE_HEADER_SIZE + cell_num * LEAF_NODE_CELL_SIZE;
+}
+/* Returns a pointer to the key of cell cell_num; the key is at the very
+ * start of the cell. */
+uint32_t* leaf_node_key(void* node, uint32_t cell_num) {
+  return leaf_node_cell(node, cell_num);
+}
+/* Returns a pointer to the value of cell cell_num, which begins
+ * LEAF_NODE_KEY_SIZE bytes into the cell. */
+void* leaf_node_value(void* node, uint32_t cell_num) {
+  /* Cast to char* first: arithmetic on void* is a compiler extension. */
+  return (char*)leaf_node_cell(node, cell_num) + LEAF_NODE_KEY_SIZE;
+}
+/* Initializes a leaf node by setting its cell count to zero. */
+void initialize_leaf_node(void* node) { *leaf_node_num_cells(node) = 0; }
+
调整Pager和Table对象的实现
-void pager_flush(Pager* pager, uint32_t page_num, uint32_t size) {
+void pager_flush(Pager* pager, uint32_t page_num) {
if (pager->pages[page_num] == NULL) {
printf("Tried to flush null page\n");
exit(EXIT_FAILURE);
@@ -242,7 +337,7 @@ void pager_flush(Pager* pager, uint32_t page_num, uint32_t size) {
}
ssize_t bytes_written =
- write(pager->file_descriptor, pager->pages[page_num], size);
+ write(pager->file_descriptor, pager->pages[page_num], PAGE_SIZE);
if (bytes_written == -1) {
printf("Error writing: %d\n", errno);
void db_close(Table* table) {
Pager* pager = table->pager;
- uint32_t num_full_pages = table->num_rows / ROWS_PER_PAGE;
- for (uint32_t i = 0; i < num_full_pages; i++) {
+ for (uint32_t i = 0; i < pager->num_pages; i++) {
if (pager->pages[i] == NULL) {
continue;
}
- pager_flush(pager, i, PAGE_SIZE);
+ pager_flush(pager, i);
free(pager->pages[i]);
pager->pages[i] = NULL;
}
- // There may be a partial page to write to the end of the file
- // This should not be needed after we switch to a B-tree
- uint32_t num_additional_rows = table->num_rows % ROWS_PER_PAGE;
- if (num_additional_rows > 0) {
- uint32_t page_num = num_full_pages;
- if (pager->pages[page_num] != NULL) {
- pager_flush(pager, page_num, num_additional_rows * ROW_SIZE);
- free(pager->pages[page_num]);
- pager->pages[page_num] = NULL;
- }
- }
-
int result = close(pager->file_descriptor);
if (result == -1) {
printf("Error closing db file.\n");
现在,在数据库中记录page的数量显然比记录row的数量更加合理。page的数量应该同Pager对象关联,而不是同表对象关联,因为它是整个数据库使用的page数,而不是某张特定的表使用的。每棵B-tree都用它的根节点的page编号来唯一标识,所以表对象需要记录这个编号。
const uint32_t PAGE_SIZE = 4096;
const uint32_t TABLE_MAX_PAGES = 100;
-const uint32_t ROWS_PER_PAGE = PAGE_SIZE / ROW_SIZE;
-const uint32_t TABLE_MAX_ROWS = ROWS_PER_PAGE * TABLE_MAX_PAGES;
/* Page cache and bookkeeping for the open database file. */
struct Pager_t {
int file_descriptor; /* descriptor handed to write() and close() */
uint32_t file_length;
+ uint32_t num_pages; /* file_length / PAGE_SIZE at open; raised by get_page for new pages */
void* pages[TABLE_MAX_PAGES]; /* in-memory pages; NULL where no page is held */
};
typedef struct Pager_t Pager;
/* A table: its pager plus the page number of its root node. */
struct Table_t {
Pager* pager;
- uint32_t num_rows;
+ uint32_t root_page_num; /* page fetched as the root node by table_start */
};
typedef struct Table_t Table;
@@ -127,6 +200,10 @@ void* get_page(Pager* pager, uint32_t page_num) {
}
pager->pages[page_num] = page;
+
+ if (page_num >= pager->num_pages) {
+ pager->num_pages = page_num + 1;
+ }
}
return pager->pages[page_num];
@@ -184,6 +269,12 @@ Pager* pager_open(const char* filename) {
Pager* pager = malloc(sizeof(Pager));
pager->file_descriptor = fd;
pager->file_length = file_length;
+ pager->num_pages = (file_length / PAGE_SIZE);
+
+ if (file_length % PAGE_SIZE != 0) {
+ printf("Db file is not a whole number of pages. Corrupt file.\n");
+ exit(EXIT_FAILURE);
+ }
for (uint32_t i = 0; i < TABLE_MAX_PAGES; i++) {
pager->pages[i] = NULL;
调整Cursor对象的实现
一个cursor代表了数据在表中的位置。当我们的表使用一个简单的数组存储时,只需要row编号就可以访问某一行。现在我们改用了B-tree,这个位置就需要用节点所在的page编号加上节点内的cell编号来唯一标识。
/* A position in the table: a page number plus a cell number within that page. */
struct Cursor_t {
Table* table;
- uint32_t row_num;
+ uint32_t page_num;
+ uint32_t cell_num;
bool end_of_table; // Indicates a position one past the last element
};
typedef struct Cursor_t Cursor;
/* Allocates a cursor placed at cell 0 of the table's root page.
 * The cursor comes from malloc; this function does not free it. */
Cursor* table_start(Table* table) {
Cursor* cursor = malloc(sizeof(Cursor));
cursor->table = table;
- cursor->row_num = 0;
- cursor->end_of_table = (table->num_rows == 0);
+ cursor->page_num = table->root_page_num;
+ cursor->cell_num = 0;
+
+ void* root_node = get_page(table->pager, table->root_page_num);
+ uint32_t num_cells = *leaf_node_num_cells(root_node);
+ cursor->end_of_table = (num_cells == 0); /* no cells in the root: already at the end */
return cursor;
}
/* Returns a pointer to the value of the cell the cursor points at,
 * looked up in the cursor's page via leaf_node_value. */
void* cursor_value(Cursor* cursor) {
- uint32_t row_num = cursor->row_num;
- uint32_t page_num = row_num / ROWS_PER_PAGE;
+ uint32_t page_num = cursor->page_num;
void* page = get_page(cursor->table->pager, page_num);
- uint32_t row_offset = row_num % ROWS_PER_PAGE;
- uint32_t byte_offset = row_offset * ROW_SIZE;
- return page + byte_offset;
+ return leaf_node_value(page, cursor->cell_num);
}
/* Moves the cursor to the next cell of its page and sets end_of_table
 * once the cell number reaches the node's cell count. */
void cursor_advance(Cursor* cursor) {
- cursor->row_num += 1;
- if (cursor->row_num >= cursor->table->num_rows) {
+ uint32_t page_num = cursor->page_num;
+ void* node = get_page(cursor->table->pager, page_num);
+
+ cursor->cell_num += 1;
+ if (cursor->cell_num >= (*leaf_node_num_cells(node))) {
cursor->end_of_table = true;
}
}