简介
本文以使用 InnoDB 引擎的表为例,展示从 *.ibd 文件中解析索引数据的方法。代码实现主要参考 https://blog.jcole.us/innodb/ ,该博客对 InnoDB *.ibd 文件中不同类型 page 的组织方式、page header 信息的格式、主键与二级索引的格式、记录的格式等做了比较详细的描述。
从 *.ibd 文件中可以解析得到 page、index 等的属性信息,但是具体的数据是连在一起的二进制串(主键 record 中包含其他 fields 的数据,当然,长度过长的 fields 的数据不会直接存放在这里,参见 https://blog.jcole.us/2013/01/10/the-physical-structure-of-records-in-innodb/ ),如果不知道每个 field 的属性信息,就无法解析。这时可以使用 SDI(Serialized Dictionary Information):SDI 同样存放在 *.ibd 文件中,在解析 *.ibd 文件时可以根据 page 的类型找到它。SDI 中的数据以 zlib 压缩后的 JSON 格式存放,所以在解析时需要使用 zlib 解压。在 SDI 中可以拿到每个列、索引以及表的信息等,这样就可以用于解析二进制的 fields 数据。这部分的实现参考了 MySQL 中的 ibd2sdi。
代码示例
以下代码仅是简单的功能演示:仅支持 int 类型的字段,且没有做文件校验等操作。如果要从二进制数据中正确划分其他类型的字段,可以在第 509 行之后加上设置对应字段长度的代码。
#include <stdio.h>
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <zlib.h>
#include <rapidjson/document.h>
#include <vector>
// Fixed-width integer aliases used by the rest of the file.
using ulint = uint64_t;
using u8 = uint8_t;
using u16 = uint16_t;
using u32 = uint32_t;
using u64 = uint64_t;
using byte = uint8_t;

// Raw buffer helpers: ALLOC assigns a malloc'ed buffer of n bytes to buf
// (the result is not checked for NULL); FREE hands the buffer back to free().
#define ALLOC(buf, n) buf = (byte *)malloc(n)
#define FREE(buf) free(buf)
#define PAGE_SIZE (1 << 14) // 16KB
#define U16_MAX 0xffff
#define DATA_INT 4 /* integer: can be any size 1 - 8 bytes */
#define DATA_INT_TYPE 4

// SDI-related constants: one page-type code followed by byte offsets/lengths.
// NOTE(review): names appear to mirror MySQL's ibd2sdi; what each offset is
// relative to is decided at the use sites -- confirm there.
constexpr u16 FIL_PAGE_SDI = 17853;
constexpr u16 SDI_OFFSET_PAGE_LEVEL = 26;
constexpr u16 SDI_OFFSET_PAGE_DATA = 38;
constexpr u16 SDI_OFFSET_N_RECS = 16;
constexpr u16 SDI_OFFSET_N_NEW_BYTES = 5;
constexpr u16 SDI_OFFSET_REC_TYPE = 3;
constexpr u16 SDI_OFFSET_REC_NEXT = 2;
constexpr u16 SDI_OFFSET_DATA_TYPE_LEN = 4;
constexpr u16 SDI_OFFSET_DATA_TYPE = 0;
constexpr u16 SDI_OFFSET_DATA_ID_LEN = 8;
constexpr u16 SDI_OFFSET_DATA_ID = 4;
constexpr u16 SDI_OFFSET_BLOB_ALLOWED = 4;

// Record type codes (presumably ordinary / node pointer / infimum / supremum,
// going by the names).
constexpr u16 REC_TYPE_ORDINARY = 0;
constexpr u16 REC_TYPE_NODE_PTR = 1;
constexpr u16 REC_TYPE_INF = 2;
constexpr u16 REC_TYPE_SUP = 3;

// Assorted page/record header sizes, offsets and flag masks.
constexpr u16 REC_OFF_DATA_VARCHAR = 33;
constexpr u16 PAGE_N_HEAP = 4;
constexpr u16 PAGE_HEADER = 38;
constexpr u16 FSEG_HEADER_SIZE = 10;
constexpr u16 REC_NEW_INFO_BITS = 5;
constexpr u16 REC_OLD_INFO_BITS = 6;
// Evaluates to 38 + 36 + 2 * 10 + 5 = 99.
constexpr u16 PAGE_NEW_INFIMUM = PAGE_HEADER + 36 + 2 * FSEG_HEADER_SIZE + SDI_OFFSET_N_NEW_BYTES;
constexpr u16 REC_INFO_DELETED_FLAG = 0x20;
constexpr u16 REC_INFO_BITS_SHIFT = 0;
constexpr u16 REC_NEXT = 2;
constexpr u16 REC_OFF_TYPE = 3;
constexpr u16 BTR_EXTERN_LEN = 12;
constexpr u16 REC_OFF_DATA_UNCOMP_LEN = 25;
constexpr u16 REC_OFF_DATA_COMP_LEN = 29;

// Index-page constants: the page-type code, then hard-coded byte offsets.
// NOTE(review): the offsets look like positions from the start of a page
// (file header fields, index header fields, infimum/supremum record headers,
// first user record) -- confirm against the code that reads them.
constexpr u16 FIL_PAGE_INDEX = 17855;
constexpr u16 OFFSET_FILE_PAGE_NO = 4;
constexpr u16 OFFSET_FILE_SPACE_NO = 34;
constexpr u16 OFFSET_FILE_PAGE_TYPE = 24;
constexpr u16 OFFSET_INDEX_ID = 66;
constexpr u16 OFFSET_PAGE_LEVEL = 64;
constexpr u16 OFFSET_NUM_RECORDS = 54;
constexpr u16 OFFSET_INFIMUM_OFFSET = 97;
constexpr u16 OFFSET_INFIMUM_REC_TYPE = 95;
constexpr u16 OFFSET_INFIMUM_N_OWN = 94;
constexpr u16 OFFSET_SUP_N_OWN = 107;
constexpr u16 OFFSET_SUP_REC_TYPE = 108;
constexpr u16 OFFSET_SUP_NEXT_OFFSET = 110;
constexpr u16 OFFSET_USER_RECS = 120;
struct index_t {
byte *str_;
bool m_is_pk_;
u16 m_page_level_;
u16 m_num_records_;
u8 m_inf_n_own_;
u16 m_inf_rec_type_;
u16 m_inf_next_offset_;
u8 m_sup_n_own_;
u16 m_sup_rec_type_;
u16 m_sup_next_offset_;
u32 m_space_no_;
u32 m_page_no_;
u64 m_index_id_;
std::vector<u16> fields_offsets_;
std::vector<byte *> recs_;
index_t(bool is_pk, u16 page_level, u16 num_records,
u8 inf_n_own,
u16 inf_rec_type, u16 inf_next_offset,
u8 sup_n_own, u8 sup_rec_type,
u16 sup_next_offset,
u32 space_no, u32 page_no, u64 index_id): index_t(){
m_is_pk_ = is_pk;
m_page_level_ = page_level;
m_num_records_ = num_records;
m_inf_n_own_ = inf_n_own;
m_inf_rec_type_ = inf_rec_type;
m_inf_next_offset_ = inf_next_offset;
m_sup_n_own_ = sup_n_own;
m_sup_rec_type_ = sup_rec_type;
m_sup_next_offset_ = sup_next_offset;
m_space_no_ = space_no;
m_page_no_ = page_no;
m_index_id_ = index_id;
}
index_t(const index_t &other):index_t() {
m_is_pk_ = other.m_is_pk_;
m_page_level_ = other.m_page_level_;
m_num_records_ = other.m_num_records_;
m_inf_n_own_ = other.m_inf_n_own_;
m_inf_rec_type_ = other.m_inf_rec_type_;
m_inf_next_offset_ = other.m_inf_next_offset_;
m_sup_n_own_ = other.m_sup_n_own_;
m_sup_rec_type_ = other.m_sup_rec_type_;
m_sup_next_offset_ = other.m_sup_next_offset_;
m_space_no_ = other.m_space_no_;
m_page_no_ = other.m_page_no_;
m_index_id_ = other.m_index_id_;
}
const char *str() {
assert(str_ != NULL);
memset(str_, 0, 1024);
sprintf((char*)str_, "[%s Index space=%u, page=%u,\
level=%hu, num_records=%hu, index_id=%lu]\n\
\tInfimum Record:\n\
\t rec_type=%hu, next_offset=%hu, n_own=%hhu\n",
(m_is_pk_ ? "Clust" : "Secondary"),
m_space_no_, m_page_no_, m_page_level_,
m_num_records_, m_index_id_,
m_inf_rec_type_, m_inf_next_offset_, m_inf_n_own_);
for (u32 i = 0; i < recs_.size(); ++i) {
sprintf((char *)str_ + strlen((char *)str_), "\t Entry [");
for (u32 j = 1; j < fields_offsets_.size(); ++j) {
sprintf((char *)str_ + strlen((char *)str_), "0x");
for (u32 k = 0; k < fields_offsets_[j] - fields_offsets_[j-1]; ++k) {
sprintf((char