MySQL Source Code Analysis: InnoDB On-Disk Structures

1. Basic Introduction

The previous article gave an initial overview of InnoDB's on-disk structures: tablespaces, the data dictionary, the doublewrite buffer, and the logs, building an overall conceptual picture of the InnoDB disk layout. With that picture in mind, the source code analysis below can match each feature to the code that implements it, so we know what to look for and are not lost when reading the code.

2. Tablespaces

A tablespace is InnoDB's layer of logical storage-space management on top of file I/O. It divides roughly into four levels: space, segment inode, extent, and page. In the source code it relies on an on-disk linked list data structure:

/** innobase/include/fil0fil.h: File space address */
struct fil_addr_t {
  /* Default constructor */
  fil_addr_t() : page(FIL_NULL), boffset(0) {}

  /** Constructor
  @param[in]	p	Logical page number
  @param[in]	boff	Offset within the page */
  fil_addr_t(page_no_t p, uint32_t boff) : page(p), boffset(boff) {}

  /** Compare two instances
  @param[in]	rhs	Instance to compare with
  @return true if the page number and page offset are equal */
  bool is_equal(const fil_addr_t &rhs) const {
    return (page == rhs.page && boffset == rhs.boffset);
  }

  /** Check if the file address is null.
  @return true if null */
  bool is_null() const { return (page == FIL_NULL && boffset == 0); }

  /** Print a string representation.
  @param[in,out]	out		Stream to write to */
  std::ostream &print(std::ostream &out) const {
    out << "[fil_addr_t: page=" << page << ", boffset=" << boffset << "]";

    return (out);
  }

  /** Page number within a space */
  page_no_t page;

  /** Byte offset within the page */
  uint32_t boffset;
};
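
The on-disk linked list mentioned above is built directly out of pairs of these file addresses: each list node stores the fil_addr_t of its predecessor and its successor. Below is a minimal sketch (not the actual fut0lst code; all names are hypothetical), assuming the on-disk encoding of a file address as a 4-byte page number followed by a 2-byte byte offset (FIL_ADDR_SIZE = 6):

#include <cstddef>
#include <cstdint>

/* Sketch only: the names below are hypothetical, not InnoDB's. */
constexpr uint32_t SKETCH_FIL_NULL = 0xFFFFFFFF; /* same value as FIL_NULL */
constexpr size_t SKETCH_FIL_ADDR_SIZE = 6;       /* 4-byte page + 2-byte offset */

/* Serialize an address at buf in big-endian order, as InnoDB stores
   integers on disk. */
inline void sketch_write_fil_addr(unsigned char *buf, uint32_t page,
                                  uint16_t boffset) {
  buf[0] = static_cast<unsigned char>(page >> 24);
  buf[1] = static_cast<unsigned char>(page >> 16);
  buf[2] = static_cast<unsigned char>(page >> 8);
  buf[3] = static_cast<unsigned char>(page);
  buf[4] = static_cast<unsigned char>(boffset >> 8);
  buf[5] = static_cast<unsigned char>(boffset);
}

/* An on-disk list node is just two consecutive addresses: prev, next.
   Traversal reads 'next', fetches that page through the buffer pool,
   and jumps to boffset within it; SKETCH_FIL_NULL marks the list end. */
struct sketch_flst_node {
  unsigned char prev[SKETCH_FIL_ADDR_SIZE];
  unsigned char next[SKETCH_FIL_ADDR_SIZE];
};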
/** File node of a tablespace or the log data space */
struct fil_node_t {
  using List_node = UT_LIST_NODE_T(fil_node_t);

  /** tablespace containing this file */
  fil_space_t *space;

  /** file name; protected by Fil_shard::m_mutex and log_sys->mutex. */
  char *name;

  /** whether this file is open. Note: We set the is_open flag only after
  we write the MLOG_FILE_OPEN record to the redo log. Therefore
  we increment the in_use reference count before setting the OPEN flag. */
  bool is_open;

  /** file handle (valid if is_open) */
  pfs_os_file_t handle;

  /** event that groups and serializes calls to fsync */
  os_event_t sync_event;

  /** whether the file actually is a raw device or disk partition */
  bool is_raw_disk;

  /** size of the file in database pages (0 if not known yet);
  the possible last incomplete megabyte may be ignored
  if space->id == 0 */
  page_no_t size;

  /** Size of the file when last flushed, used to force the flush when file
  grows to keep the filesystem metadata synced when using O_DIRECT_NO_FSYNC */
  page_no_t flush_size;

  /** initial size of the file in database pages;
  FIL_IBD_FILE_INITIAL_SIZE by default */
  page_no_t init_size;

  /** maximum size of the file in database pages */
  page_no_t max_size;

  /** count of pending i/o's; is_open must be true if nonzero */
  size_t n_pending;

  /** count of pending flushes; is_open must be true if nonzero */
  size_t n_pending_flushes;

  /** e.g., when a file is being extended or just opened. */
  size_t in_use;

  /** number of writes to the file since the system was started */
  int64_t modification_counter;

  /** the modification_counter of the latest flush to disk */
  int64_t flush_counter;

  /** link to the fil_system->LRU list (keeping track of open files) */
  List_node LRU;

  /** whether the file system of this file supports PUNCH HOLE */
  bool punch_hole;

  /** block size to use for punching holes */
  size_t block_size;

  /** whether atomic write is enabled for this file */
  bool atomic_write;

  /** FIL_NODE_MAGIC_N */
  size_t magic_n;
};

/** Tablespace or log data space */
struct fil_space_t {
  using List_node = UT_LIST_NODE_T(fil_space_t);
  using Files = std::vector<fil_node_t, ut_allocator<fil_node_t>>;

  /** Release the reserved free extents.
  @param[in]	n_reserved	number of reserved extents */
  void release_free_extents(ulint n_reserved);

  /** @return true if the instance is queued for deletion. Guarantees the space
  is not deleted as long as the fil_shard mutex is not released. */
  bool is_deleted() const;

  /** @return true if the instance was not queued for deletion. It does not
  guarantee it is not queued for deletion at the moment. */
  bool was_not_deleted() const;

  /** Marks the space object for deletion. It will bump the space object version
  and cause all pages in the buffer pool that reference the current space
  object version to be stale and be freed on first encounter. */
  void set_deleted();

#ifndef UNIV_HOTBACKUP
  /** Returns current version of the space object. It is being bumped when the
   space is truncated or deleted. Guarantees the version returned is up to date
   as long as fil_shard mutex is not released.*/
  uint32_t get_current_version() const;

  /** Returns current version of the space object. It is being bumped when the
   space is truncated or deleted. It does not guarantee the version is current
   one.*/
  uint32_t get_recent_version() const;

  /** Bumps the space object version and causes all pages in the buffer pool that
  reference the current space object version to be stale and be freed on
  first encounter. */
  void bump_version();

  /** @return true if this space does not have any more references. Guarantees
  the result only if true was returned. */
  bool has_no_references() const;

  /** @return Current number of references to the space. This method
  should be called only while shutting down the server. Only when there is no
  background nor user session activity the returned value will be valid. */
  size_t get_reference_count() const;

  /** Increment the page reference count. */
  void inc_ref() noexcept {
    const auto o = m_n_ref_count.fetch_add(1);
    ut_a(o != std::numeric_limits<size_t>::max());
  }

  /** Decrement the page reference count. */
  void dec_ref() noexcept {
    const auto o = m_n_ref_count.fetch_sub(1);
    ut_a(o >= 1);
  }
#endif /* !UNIV_HOTBACKUP */

#ifdef UNIV_DEBUG
  /** Print the extent descriptor pages of this tablespace into
  the given output stream.
  @param[in]	out	the output stream.
  @return	the output stream. */
  std::ostream &print_xdes_pages(std::ostream &out) const;

  /** Print the extent descriptor pages of this tablespace into
  the given file.
  @param[in]	filename	the output file name. */
  void print_xdes_pages(const char *filename) const;
#endif /* UNIV_DEBUG */

 public:
  using Observer = FlushObserver;
  using FlushObservers = std::vector<Observer *, ut_allocator<Observer *>>;

  /** When the tablespace was extended last. */
  ib::Timer m_last_extended{};

  /** Extend undo tablespaces by so many pages. */
  page_no_t m_undo_extend{};

  /** When an undo tablespace has been initialized with required header pages,
  that size is recorded here.  Auto-truncation happens when the file size
  becomes bigger than both this and srv_max_undo_log_size. */
  page_no_t m_undo_initial{};

  /** Tablespace name */
  char *name{};

  /** Tablespace ID */
  space_id_t id;

  /** Initializes fields. This could be replaced by a constructor if SunPro is
  compiling it correctly. */
  void initialize() noexcept {
    new (&m_last_extended) ib::Timer;
    new (&files) fil_space_t::Files();

#ifndef UNIV_HOTBACKUP
    new (&m_version) std::atomic<uint32_t>;
    new (&m_n_ref_count) std::atomic_size_t;
    new (&m_deleted) std::atomic<bool>;
#endif /* !UNIV_HOTBACKUP */
  }

 private:
#ifndef UNIV_HOTBACKUP
  /** All pages in the buffer pool that reference this fil_space_t instance with
  version before this version can be lazily freed or reused as free pages.
  They should be rejected if there is an attempt to write them to disk.

  Writes to m_version are guarded by the exclusive MDL/table lock latches
  acquired by the caller, as stated in docs. Note that the Fil_shard mutex seems
  to be latched in 2 of 3 usages only, so is not really an alternative.

  Existence of the space object during reads is assured during these operations:
  1. when read by the buf_page_init_low on page read/creation - the caller must
  have acquired shared MDL/table lock latches.
  2. when read on buf_page_t::is_stale() on page access for a query or for purge
  operation. The caller must have acquired shared MDL/table lock latches.
  3. when read on buf_page_t::is_stale() on page access from LRU list, flush
  list or whatever else. Here, the fact that the page has latched the space
  using the reference counting system is what guards the space existence.

  When reading the value for the page being created with buf_page_init_low we
  have the MDL latches on table that is in tablespace or the tablespace alone,
  so we won't be able to bump m_version until they are released, so we will
  read the current value of the version. When reading the value for the page
  validation with buf_page_t::is_stale(), we will either:
  a) have the MDL latches required in at least S mode in case we need to be
  certain if the page is stale, to use it in a query or in purge operation, or
  b) in case we do not have the MDL latches, we may read an outdated value.
  This happens for pages that are seen during, for example, LRU or flush page
  scans. These pages are not needed for the query itself. The read is to decide
  if the page can be safely discarded. Reading incorrect value can lead to no
  action being executed. Reading incorrect value can't lead to page being
  incorrectly evicted.
  */
  std::atomic<uint32_t> m_version{};

  /** Number of buf_page_t entries that point to this instance.

  This field is guarded by the Fil_shard mutex and the "reference
  count system". The reference count system here is allowing to take a "latch"
  on the space by incrementing the reference count, and release it by
  decrementing it.

  The increments are possible from two places:
  1. buf_page_init_low is covered by the existing MDL/table lock latches only
  and the fact that the space it is using is a current version of the space
  (the version itself is also guarded by these MDL/table lock latches). It
  implicitly acquires the "reference count system" latch after this operation.
  2. buf_page_t::buf_page_t(const buf_page_t&) copy constructor - increases the
  value, but it assumes the page being copied from has "reference count system"
  latch so the reference count is greater than 0 during this constructor call.

  For decrementing the reference count is itself a latch allowing for the safe
  decrement.

  The value is checked for being 0 in Fil_shard::checkpoint under the Fil_shard
  mutex, and only if the space is deleted.
  Observing m_n_ref_count==0 might trigger freeing the object. No other thread
  can be during the process of incrementing m_n_ref_count from 0 to 1 in
  parallel to this check. This is impossible for following reasons. Recall the
  only two places where we do increments listed above:
  1. If the space is deleted, then MDL/table lock latches guarantee there are
  no users that would be able to see it as the current version of space and thus
  will not attempt to increase the reference value from 0.
  2. The buf_page_t copy constructor can increase it, but it assumes the page
  being copied from has "reference count system" latch so the reference count is
  greater than 0 during this constructor call.

  There is also an opposite race possible: while we check for ref count being
  zero, another thread may be decrementing it in parallel, and we might miss
  that if we check too soon. This is benign, as it will result in us not
  reclaiming the memory we could (but not have to) free, and will return to the
  check on next checkpoint.
  */
  std::atomic_size_t m_n_ref_count{};
#endif /* !UNIV_HOTBACKUP */

  /** true if the tablespace is marked for deletion. */
  std::atomic_bool m_deleted{};

 public:
  /** true if we want to rename the .ibd file of tablespace and
  want to stop temporarily posting of new i/o requests on the file */
  bool stop_ios{};

  /** We set this true when we start deleting a single-table
  tablespace.  When this is set following new ops are not allowed:
  * read IO request
  * ibuf merge
  * file flush
  Note that we can still possibly have new write operations because we
  don't check this flag when doing flush batches. */
  bool stop_new_ops{};

#ifdef UNIV_DEBUG
  /** Reference count for operations who want to skip redo log in
  the file space in order to make fsp_space_modify_check pass. */
  ulint redo_skipped_count{};
#endif /* UNIV_DEBUG */

  /** Purpose */
  fil_type_t purpose;

  /** Files attached to this tablespace. Note: Only the system tablespace
  can have multiple files, this is a legacy issue. */
  Files files{};

  /** Tablespace file size in pages; 0 if not known yet */
  page_no_t size{};

  /** FSP_SIZE in the tablespace header; 0 if not known yet */
  page_no_t size_in_header{};

  /** Autoextend size */
  uint64_t autoextend_size_in_bytes{};

  /** Length of the FSP_FREE list */
  uint32_t free_len{};

  /** Contents of FSP_FREE_LIMIT */
  page_no_t free_limit{};

  /** Tablespace flags; see fsp_flags_is_valid() and
  page_size_t(ulint) (constructor).
  This is protected by space->latch and tablespace MDL */
  uint32_t flags{};

  /** Number of reserved free extents for ongoing operations like
  B-tree page split */
  uint32_t n_reserved_extents{};

  /** This is positive when flushing the tablespace to disk;
  dropping of the tablespace is forbidden if this is positive */
  uint32_t n_pending_flushes{};

  /** This is positive when we have pending operations against this
  tablespace. The pending operations can be ibuf merges or lock
  validation code trying to read a block.  Dropping of the tablespace
  is forbidden if this is positive.  Protected by Fil_shard::m_mutex. */
  uint32_t n_pending_ops{};

#ifndef UNIV_HOTBACKUP
  /** Latch protecting the file space storage allocation */
  rw_lock_t latch;
#endif /* !UNIV_HOTBACKUP */

  /** List of spaces with at least one unflushed file we have
  written to */
  List_node unflushed_spaces;

  /** true if this space is currently in unflushed_spaces */
  bool is_in_unflushed_spaces{};

  /** Compression algorithm */
  Compression::Type compression_type;

  /** Encryption algorithm */
  Encryption::Type encryption_type;

  /** Encrypt key */
  byte encryption_key[Encryption::KEY_LEN];

  /** Encrypt key length*/
  ulint encryption_klen{};

  /** Encrypt initial vector */
  byte encryption_iv[Encryption::KEY_LEN];

  /** Encryption is in progress */
  encryption_op_type encryption_op_in_progress;

  /** Flush lsn of header page. It is used only during recovery */
  lsn_t m_header_page_flush_lsn;

  /** FIL_SPACE_MAGIC_N */
  ulint magic_n;

  /** System tablespace */
  static fil_space_t *s_sys_space;

  /** Redo log tablespace */
  static fil_space_t *s_redo_space;

  /** Check if the tablespace is compressed.
  @return true if compressed, false otherwise. */
  bool is_compressed() const noexcept MY_ATTRIBUTE((warn_unused_result)) {
    return compression_type != Compression::NONE;
  }

  /** Check if the tablespace is encrypted.
  @return true if encrypted, false otherwise. */
  bool is_encrypted() const noexcept MY_ATTRIBUTE((warn_unused_result)) {
    return FSP_FLAGS_GET_ENCRYPTION(flags);
  }

  /** Check if the encryption details, like the encryption key, type and
  other details, that are needed to carry out encryption are available.
  @return true if encryption can be done, false otherwise. */
  bool can_encrypt() const noexcept MY_ATTRIBUTE((warn_unused_result)) {
    return encryption_type != Encryption::Type::NONE;
  }

  /** Copy the encryption info from this object to the provided
  Encryption object.
  @param[in]    en   Encryption object to which info is copied. */
  void get_encryption_info(Encryption &en) noexcept {
    en.set_type(encryption_type);
    en.set_key(encryption_key);
    en.set_key_length(encryption_klen);
    en.set_initial_vector(encryption_iv);
  }

 public:
  /** Get the file node corresponding to the given page number of the
  tablespace.
  @param[in,out]  page_no   Caller passes the page number within a tablespace.
                            After return, it contains the page number within
                            the returned file node. For tablespaces containing
                            only one file, the given page_no will not change.
  @return the file node object. */
  fil_node_t *get_file_node(page_no_t *page_no) noexcept
      MY_ATTRIBUTE((warn_unused_result));
};

Inside InnoDB, data is managed in units of pages, which mainly fall into two categories: FSP_HDR/XDES pages and fseg inode pages. Pages are allocated at extent granularity, at least 64 pages at a time. In fil_addr_t there are clearly only two members that matter, page and boffset; seeing those two, one should recognize that this is the concrete mechanism for locating a position on disk.
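
As a back-of-the-envelope illustration (a hypothetical helper, not an InnoDB API; the real code resolves pages through fil_space_t::get_file_node() and the buffer pool), locating the byte a fil_addr_t points at in a single-file tablespace with the default 16 KiB page size is just:

#include <cstdint>

constexpr uint64_t SKETCH_PAGE_SIZE = 16 * 1024; /* assumed default page size */

/* page_no selects the page, boffset the byte within it. */
inline uint64_t sketch_addr_to_file_offset(uint32_t page_no, uint32_t boffset) {
  return static_cast<uint64_t>(page_no) * SKETCH_PAGE_SIZE + boffset;
}
/* e.g. {page = 3, boffset = 38} -> byte 49190 of the .ibd file */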

3. The Data Dictionary

The data dictionary is closely tied to metadata. Put loosely, it is the configuration data plus the database information generated from that configuration. Its code lives mainly in include/dict0dd.h and dict0dd.ic.

/** Hard-coded data dictionary information */
struct innodb_dd_table_t {
  /** Data dictionary table name */
  const char *name;
  /** Number of indexes */
  const uint n_indexes;
};
/** The hard-coded data dictionary tables. */
const innodb_dd_table_t innodb_dd_table[] = {
    INNODB_DD_TABLE("dd_properties", 1),

    INNODB_DD_TABLE("innodb_dynamic_metadata", 1),
    INNODB_DD_TABLE("innodb_table_stats", 1),
    INNODB_DD_TABLE("innodb_index_stats", 1),
    INNODB_DD_TABLE("innodb_ddl_log", 2),

    INNODB_DD_TABLE("catalogs", 2),
    INNODB_DD_TABLE("character_sets", 3),
    INNODB_DD_TABLE("check_constraints", 3),
    INNODB_DD_TABLE("collations", 3),
    INNODB_DD_TABLE("column_statistics", 3),
    INNODB_DD_TABLE("column_type_elements", 1),
    INNODB_DD_TABLE("columns", 5),
    INNODB_DD_TABLE("events", 6),
    INNODB_DD_TABLE("foreign_key_column_usage", 3),
    INNODB_DD_TABLE("foreign_keys", 4),
    INNODB_DD_TABLE("index_column_usage", 3),
    INNODB_DD_TABLE("index_partitions", 3),
    INNODB_DD_TABLE("index_stats", 1),
    INNODB_DD_TABLE("indexes", 3),
    INNODB_DD_TABLE("parameter_type_elements", 1),
    INNODB_DD_TABLE("parameters", 3),
    INNODB_DD_TABLE("resource_groups", 2),
    INNODB_DD_TABLE("routines", 7),
    INNODB_DD_TABLE("schemata", 3),
    INNODB_DD_TABLE("st_spatial_reference_systems", 3),
    INNODB_DD_TABLE("table_partition_values", 1),
    INNODB_DD_TABLE("table_partitions", 7),
    INNODB_DD_TABLE("table_stats", 1),
    INNODB_DD_TABLE("tables", 10),
    INNODB_DD_TABLE("tablespace_files", 2),
    INNODB_DD_TABLE("tablespaces", 2),
    INNODB_DD_TABLE("triggers", 7),
    INNODB_DD_TABLE("view_routine_usage", 2),
    INNODB_DD_TABLE("view_table_usage", 2)};


    /** Data structure for a database table.  Most fields will be
    initialized to 0, NULL or FALSE in dict_mem_table_create(). */
    struct dict_table_t {
      /** Check if the table is compressed.
      @return true if compressed, false otherwise. */
      bool is_compressed() const { return (DICT_TF_GET_ZIP_SSIZE(flags) != 0); }

      /** Get reference count.
      @return current value of n_ref_count */
      inline uint64_t get_ref_count() const;

      /** Acquire the table handle. */
      inline void acquire();

      /** Acquire the table handle, with lock() and unlock() the table.
      This function needs to be called for opening table when the table
      is in memory and later the stats information would be initialized */
      inline void acquire_with_lock();

      /** Release the table handle. */
      inline void release();

      /** Lock the table handle. */
      inline void lock();

      /** Unlock the table handle. */
      inline void unlock();

    #ifndef UNIV_HOTBACKUP
      /** Get schema and table name in system character set.
      @param[out]	schema	schema name
      @param[out]	table	table name */
      void get_table_name(std::string &schema, std::string &table);

      /** Mutex of the table for concurrency access. */
      ib_mutex_t *mutex;

      /** Creation state of mutex. */
      std::atomic<os_once::state_t> mutex_created;
    #endif /* !UNIV_HOTBACKUP */

      /** Id of the table. */
      table_id_t id;

      /** Memory heap. If you allocate from this heap after the table has
      been created then be sure to account the allocation into
      dict_sys->size. When closing the table we do something like
      dict_sys->size -= mem_heap_get_size(table->heap) and if that is going
      to become negative then we would assert. Something like this should do:
      old_size = mem_heap_get_size()
      mem_heap_alloc()
      new_size = mem_heap_get_size()
      dict_sys->size += new_size - old_size. */
      mem_heap_t *heap;

      /** Table name. */
      table_name_t name;

      /** Truncate name. */
      table_name_t trunc_name;

      /** NULL or the directory path specified by DATA DIRECTORY. */
      char *data_dir_path;

      /** NULL or the tablespace name that this table is assigned to,
      specified by the TABLESPACE option.*/
      id_name_t tablespace;

      /** Space where the clustered index of the table is placed. */
      space_id_t space;

      /** dd::Tablespace::id of the table */
      dd::Object_id dd_space_id;

      /** Stores information about:
      1 row format (redundant or compact),
      2 compressed page size (zip shift size),
      3 whether using atomic blobs,
      4 whether the table has been created with the option DATA DIRECTORY.
      Use DICT_TF_GET_COMPACT(), DICT_TF_GET_ZIP_SSIZE(),
      DICT_TF_HAS_ATOMIC_BLOBS() and DICT_TF_HAS_DATA_DIR() to parse this
      flag. */
      unsigned flags : DICT_TF_BITS;

      /** Stores information about:
      1 whether the table has been created using CREATE TEMPORARY TABLE,
      2 whether the table has an internally defined DOC ID column,
      3 whether the table has a FTS index,
      4 whether DOC ID column need to be added to the FTS index,
      5 whether the table is being created its own tablespace,
      6 whether the table has been DISCARDed,
      7 whether the aux FTS tables names are in hex.
      8 whether the table is an intrinsic table.
      9 whether the table has encryption setting.
      Use DICT_TF2_FLAG_IS_SET() to parse this flag. */
      unsigned flags2 : DICT_TF2_BITS;

      /** TRUE if the table is an intermediate table during copy alter
      operation or a partition/subpartition which is required for copying
      data and skip the undo log for insertion of row in the table.
      This variable will be set and unset during extra(), or during the
      process of altering partitions */
      unsigned skip_alter_undo : 1;

      /** TRUE if this is in a single-table tablespace and the .ibd file is
      missing. Then we must return in ha_innodb.cc an error if the user
      tries to query such an orphaned table. */
      unsigned ibd_file_missing : 1;

      /** TRUE if the table object has been added to the dictionary cache. */
      unsigned cached : 1;

      /** TRUE if the table is to be dropped, but not yet actually dropped
      (could in the background drop list). It is turned on at the beginning
      of row_drop_table_for_mysql() and turned off just before we start to
      update system tables for the drop. It is protected by
      dict_operation_lock. */
      unsigned to_be_dropped : 1;

      /** Number of non-virtual columns defined so far. */
      unsigned n_def : 10;

      /** Number of non-virtual columns. */
      unsigned n_cols : 10;

      /** Number of non-virtual columns before first instant ADD COLUMN,
      including the system columns like n_cols. */
      unsigned n_instant_cols : 10;

      /** Number of total columns (include virtual and non-virtual) */
      unsigned n_t_cols : 10;

      /** Number of total columns defined so far. */
      unsigned n_t_def : 10;

      /** Number of virtual columns defined so far. */
      unsigned n_v_def : 10;

      /** Number of virtual columns. */
      unsigned n_v_cols : 10;

      /** Number of multi-value virtual columns. */
      unsigned n_m_v_cols : 10;

      /** TRUE if this table is expected to be kept in memory. This table
      could be a table that has FK relationships or is undergoing DDL */
      unsigned can_be_evicted : 1;

      /** TRUE if this table is not evictable(can_be_evicted) and this is
      because of DDL operation */
      unsigned ddl_not_evictable : 1;

      /** TRUE if some indexes should be dropped after ONLINE_INDEX_ABORTED
      or ONLINE_INDEX_ABORTED_DROPPED. */
      unsigned drop_aborted : 1;

      /** Array of column descriptions. */
      dict_col_t *cols;

      /** Array of virtual column descriptions. */
      dict_v_col_t *v_cols;

      /** List of stored column descriptions. It is used only for foreign key
      check during create table and copy alter operations.
      During copy alter, s_cols list is filled during create table operation
      and need to preserve till rename table operation. That is the
      reason s_cols is a part of dict_table_t */
      dict_s_col_list *s_cols;

      /** Column names packed in a character string
      "name1\0name2\0...nameN\0". Until the string contains n_cols, it will
      be allocated from a temporary heap. The final string will be allocated
      from table->heap. */
      const char *col_names;

      /** Virtual column names */
      const char *v_col_names;

      /** True if the table belongs to a system database (mysql, information_schema
      or performance_schema) */
      bool is_system_table;

      /** Hash chain node. */
      hash_node_t name_hash;

      /** Hash chain node. */
      hash_node_t id_hash;

      /** The FTS_DOC_ID_INDEX, or NULL if no fulltext indexes exist */
      dict_index_t *fts_doc_id_index;

      /** List of indexes of the table. */
      UT_LIST_BASE_NODE_T(dict_index_t) indexes;

      /** List of foreign key constraints in the table. These refer to
      columns in other tables. */
      UT_LIST_BASE_NODE_T(dict_foreign_t) foreign_list;

      /** List of foreign key constraints which refer to this table. */
      UT_LIST_BASE_NODE_T(dict_foreign_t) referenced_list;

      /** Node of the LRU list of tables. */
      UT_LIST_NODE_T(dict_table_t) table_LRU;

      /** metadata version number of dd::Table::se_private_data() */
      uint64_t version;

      /** table dynamic metadata status, protected by dict_persist->mutex */
      std::atomic<table_dirty_status> dirty_status;

    #ifndef UNIV_HOTBACKUP
      /** Node of the dirty table list of tables, which is protected
      by dict_persist->mutex */
      UT_LIST_NODE_T(dict_table_t) dirty_dict_tables;
    #endif /* !UNIV_HOTBACKUP */

    #ifdef UNIV_DEBUG
      /** This field is used to mark if a table is in the
      dirty_dict_tables_list. if the dirty_status is not of
      METADATA_CLEAN, the table should be in the list, otherwise not.
      This field should be protected by dict_persist->mutex too. */
      bool in_dirty_dict_tables_list;
    #endif /* UNIV_DEBUG */

      /** Maximum recursive level we support when loading tables chained
      together with FK constraints. If this level is exceeded, we stop
      loading the child table into memory along with its parent table. */
      unsigned fk_max_recusive_level : 8;

      /** Count of how many foreign key check operations are currently being
      performed on the table. We cannot drop the table while there are
      foreign key checks running on it. */
      std::atomic<ulint> n_foreign_key_checks_running;

      /** Transaction id that last touched the table definition. Either when
      loading the definition or CREATE TABLE, or ALTER TABLE (prepare,
      commit, and rollback phases). */
      trx_id_t def_trx_id;

      /*!< set of foreign key constraints in the table; these refer to
      columns in other tables */
      dict_foreign_set foreign_set;

      /*!< set of foreign key constraints which refer to this table */
      dict_foreign_set referenced_set;

    #ifdef UNIV_DEBUG
      /** This field is used to specify in simulations tables which are so
      big that disk should be accessed. Disk access is simulated by putting
      the thread to sleep for a while. NOTE that this flag is not stored to
      the data dictionary on disk, and the database will forget about value
      TRUE if it has to reload the table definition from disk. */
      ibool does_not_fit_in_memory;
    #endif /* UNIV_DEBUG */

      /** TRUE if the maximum length of a single row exceeds BIG_ROW_SIZE.
      Initialized in dict_table_add_to_cache(). */
      unsigned big_rows : 1;

    #ifndef UNIV_HOTBACKUP
      /** Statistics for query optimization. @{ */

      /** Creation state of 'stats_latch'. */
      std::atomic<os_once::state_t> stats_latch_created;

      /** This latch protects:
      "dict_table_t::stat_initialized",
      "dict_table_t::stat_n_rows (*)",
      "dict_table_t::stat_clustered_index_size",
      "dict_table_t::stat_sum_of_other_index_sizes",
      "dict_table_t::stat_modified_counter (*)",
      "dict_table_t::indexes*::stat_n_diff_key_vals[]",
      "dict_table_t::indexes*::stat_index_size",
      "dict_table_t::indexes*::stat_n_leaf_pages".
      (*) Those are not always protected for
      performance reasons. */
      rw_lock_t *stats_latch;

      /** TRUE if statistics have been calculated the first time after
      database startup or table creation. */
      unsigned stat_initialized : 1;

      /** Timestamp of last recalc of the stats. */
      ib_time_monotonic_t stats_last_recalc;

    /** The two bits below are set in the 'stat_persistent' member. They
    have the following meaning:
    1. _ON=0, _OFF=0, no explicit persistent stats setting for this table,
    the value of the global srv_stats_persistent is used to determine
    whether the table has persistent stats enabled or not
    2. _ON=0, _OFF=1, persistent stats are explicitly disabled for this
    table, regardless of the value of the global srv_stats_persistent
    3. _ON=1, _OFF=0, persistent stats are explicitly enabled for this
    table, regardless of the value of the global srv_stats_persistent
    4. _ON=1, _OFF=1, not allowed, we assert if this ever happens. */
    #define DICT_STATS_PERSISTENT_ON (1 << 1)
    #define DICT_STATS_PERSISTENT_OFF (1 << 2)

      /** Indicates whether the table uses persistent stats or not. See
      DICT_STATS_PERSISTENT_ON and DICT_STATS_PERSISTENT_OFF. */
      ib_uint32_t stat_persistent;

    /** The two bits below are set in the 'stats_auto_recalc' member. They
    have the following meaning:
    1. _ON=0, _OFF=0, no explicit auto recalc setting for this table, the
    value of the global srv_stats_persistent_auto_recalc is used to
    determine whether the table has auto recalc enabled or not
    2. _ON=0, _OFF=1, auto recalc is explicitly disabled for this table,
    regardless of the value of the global srv_stats_persistent_auto_recalc
    3. _ON=1, _OFF=0, auto recalc is explicitly enabled for this table,
    regardless of the value of the global srv_stats_persistent_auto_recalc
    4. _ON=1, _OFF=1, not allowed, we assert if this ever happens. */
    #define DICT_STATS_AUTO_RECALC_ON (1 << 1)
    #define DICT_STATS_AUTO_RECALC_OFF (1 << 2)

      /** Indicates whether the table uses automatic recalc for persistent
      stats or not. See DICT_STATS_AUTO_RECALC_ON and
      DICT_STATS_AUTO_RECALC_OFF. */
      ib_uint32_t stats_auto_recalc;

      /** The number of pages to sample for this table during persistent
      stats estimation. If this is 0, then the value of the global
      srv_stats_persistent_sample_pages will be used instead. */
      ulint stats_sample_pages;

      /** Approximate number of rows in the table. We periodically calculate
      new estimates. */
      ib_uint64_t stat_n_rows;

      /** Approximate clustered index size in database pages. */
      ulint stat_clustered_index_size;

      /** Approximate size of other indexes in database pages. */
      ulint stat_sum_of_other_index_sizes;

      /** If FTS AUX table, parent table id */
      table_id_t parent_id;

      /** How many rows are modified since last stats recalc. When a row is
      inserted, updated, or deleted, we add 1 to this number; we calculate
      new estimates for the table and the indexes if the table has changed
      too much, see row_update_statistics_if_needed(). The counter is reset
      to zero at statistics calculation. This counter is not protected by
      any latch, because this is only used for heuristics. */
      ib_uint64_t stat_modified_counter;

    /** Background stats thread is not working on this table. */
    #define BG_STAT_NONE 0

    /** Set in 'stats_bg_flag' when the background stats code is working
    on this table. The DROP TABLE code waits for this to be cleared before
    proceeding. */
    #define BG_STAT_IN_PROGRESS (1 << 0)

    /** Set in 'stats_bg_flag' when DROP TABLE starts waiting on
    BG_STAT_IN_PROGRESS to be cleared. The background stats thread will
    detect this and will eventually quit sooner. */
    #define BG_STAT_SHOULD_QUIT (1 << 1)

      /** The state of the background stats thread wrt this table.
      See BG_STAT_NONE, BG_STAT_IN_PROGRESS and BG_STAT_SHOULD_QUIT.
      Writes are covered by dict_sys->mutex. Dirty reads are possible. */
      byte stats_bg_flag;

      /** @} */
    #endif /* !UNIV_HOTBACKUP */

      /** AUTOINC related members. @{ */

      /* The actual collection of tables locked during AUTOINC read/write is
      kept in trx_t. In order to quickly determine whether a transaction has
      locked the AUTOINC lock we keep a pointer to the transaction here in
      the 'autoinc_trx' member. This is to avoid acquiring lock_sys latches and
      scanning the vector in trx_t.
      When an AUTOINC lock has to wait, the corresponding lock instance is
      created on the trx lock heap rather than use the pre-allocated instance
      in autoinc_lock below. */

      /** A buffer for an AUTOINC lock for this table. We allocate the
      memory here so that individual transactions can get it and release it
      without a need to allocate space from the lock heap of the trx:
      otherwise the lock heap would grow rapidly if we do a large insert
      from a select. */
    #ifndef UNIV_HOTBACKUP
      lock_t *autoinc_lock;

      /** Creation state of autoinc_mutex member */
      std::atomic<os_once::state_t> autoinc_mutex_created;
    #endif /* !UNIV_HOTBACKUP */

      /** Mutex protecting the autoincrement counter. */
      ib_mutex_t *autoinc_mutex;

      /** Autoinc counter value to give to the next inserted row. */
      ib_uint64_t autoinc;

      /** Mutex protecting the persisted autoincrement counter. */
      ib_mutex_t *autoinc_persisted_mutex;

      /** Autoinc counter value that has been persisted in redo logs or
      DDTableBuffer. It's mainly used when we want to write counter back
      to DDTableBuffer.
      This is different from the 'autoinc' above, which could be bigger
      than this one, because 'autoinc' will get updated right after
      some counters are allocated, but we will write the counter to redo
      logs and update this counter later. Once all allocated counters
      have been written to redo logs, 'autoinc' should be exactly the next
      counter after this persisted one.
      We want this counter because when we need to write the counter back
      to DDTableBuffer, we had better keep it consistent with the counter
      that has been written to redo logs. Besides, we can't read the 'autoinc'
      directly easily, because the autoinc_lock is required and there could
      be a deadlock.
      This variable is protected by autoinc_persisted_mutex. */
      ib_uint64_t autoinc_persisted;

      /** The position of autoinc counter field in clustered index. This would
      be set when CREATE/ALTER/OPEN TABLE and IMPORT TABLESPACE, and used in
      modifications to clustered index, such as INSERT/UPDATE. There should
      be no conflict to access it, so no protection is needed. */
      ulint autoinc_field_no;

      /** The transaction that currently holds the AUTOINC lock on this table.
      Protected by lock_sys table shard latch. To "peek" the current value one
      can read it without any latch, understanding that in general it may change.
      Such an access pattern is correct if the trx thread wants to check whether
      it has the lock granted, as the field can only change to another value when
      the lock is released, which cannot happen concurrently with the thread
      executing the trx. */
      std::atomic<const trx_t *> autoinc_trx;

      /** @} */

    #ifndef UNIV_HOTBACKUP
      /** FTS specific state variables. */
      fts_t *fts;
    #endif /* !UNIV_HOTBACKUP */

      /** Quiescing states, protected by the dict_index_t::lock. ie. we can
      only change the state if we acquire all the latches (dict_index_t::lock)
      in X mode of this table's indexes. */
      ib_quiesce_t quiesce;

      /** Count of the number of record locks on this table. We use this to
      determine whether we can evict the table from the dictionary cache.
      Writes (atomic increments and decrements) are performed when holding a shared
      latch on lock_sys. (Note that the table's shard latch is NOT required,
      as this field counts *record* locks, so a page shard is latched instead.)
      Reads should be performed when holding exclusive lock_sys latch, however:
      - Some places assert this field is zero without holding any latch.
      - Some places assert this field is positive holding only shared latch. */
      std::atomic<size_t> n_rec_locks;

    #ifndef UNIV_DEBUG
     private:
    #endif
      /** Count of how many handles are opened to this table. Dropping of the
      table is NOT allowed until this count gets to zero. MySQL does NOT
      itself check the number of open handles at DROP. */
      std::atomic<uint64_t> n_ref_count;

     public:
    #ifndef UNIV_HOTBACKUP
      /** List of locks on the table. Protected by lock_sys shard latch. */
      table_lock_list_t locks;
      /** count_by_mode[M] = number of locks in this->locks with
      lock->type_mode&LOCK_MODE_MASK == M.
      Used to quickly verify that there are no LOCK_S or LOCK_X, which are the only
      modes incompatible with LOCK_IS and LOCK_IX, to avoid costly iteration over
      this->locks when adding LOCK_IS or LOCK_IX.
      We use count_by_mode[LOCK_AUTO_INC] to track the number of granted and pending
      autoinc locks on this table. This value is set after acquiring the lock_sys
      table shard latch, but we peek the contents to determine whether other
      transactions have acquired the AUTOINC lock or not. Of course only one
      transaction can be granted the lock but there can be multiple
      waiters.
      Protected by lock_sys table shard latch. */
      ulong count_by_mode[LOCK_NUM];
    #endif /* !UNIV_HOTBACKUP */

      /** Timestamp of the last modification of this table. */
      time_t update_time;

      /** row-id counter for use by intrinsic table for getting row-id.
      Given intrinsic table semantics, row-id can be locally maintained
      instead of getting it from central generator which involves mutex
      locking. */
      ib_uint64_t sess_row_id;

      /** trx_id counter for use by intrinsic table for getting trx-id.
      Intrinsic table are not shared so don't need a central trx-id
      but just need a increased counter to track consistent view while
      proceeding SELECT as part of UPDATE. */
      ib_uint64_t sess_trx_id;

    #ifdef UNIV_DEBUG
    /** Value of 'magic_n'. */
    #define DICT_TABLE_MAGIC_N 76333786

      /** Magic number. */
      ulint magic_n;
    #endif /* UNIV_DEBUG */
      /** mysql_row_templ_t for base columns used for compute the virtual
      columns */
      dict_vcol_templ_t *vc_templ;

      /** encryption key, it's only for export/import */
      byte *encryption_key;

      /** encryption iv, it's only for export/import */
      byte *encryption_iv;

      /** remove the dict_table_t from cache after DDL operation */
      bool discard_after_ddl;

      /** refresh/reload FK info */
      bool refresh_fk;

    #ifndef UNIV_HOTBACKUP
      /** multiple cursors can be active on this temporary table */
      temp_prebuilt_vec *temp_prebuilt;
    #endif /* !UNIV_HOTBACKUP */

      /** TRUE only for dictionary tables like mysql/tables,
      mysql/columns, mysql/tablespaces, etc. This flag is used
      to do non-locking reads on DD tables. */
      bool is_dd_table;

      /** true if this table is explicitly put to non-LRU list
      during table creation */
      bool explicitly_non_lru;

      /** @return the clustered index */
      const dict_index_t *first_index() const {
        ut_ad(magic_n == DICT_TABLE_MAGIC_N);
        const dict_index_t *first = UT_LIST_GET_FIRST(indexes);
        return (first);
      }
      /** @return the clustered index */
      dict_index_t *first_index() {
        return (const_cast<dict_index_t *>(
            const_cast<const dict_table_t *>(this)->first_index()));
      }

      /** @return if there was any instantly added column.
      This will be true after one or more instant ADD COLUMN, however,
      it would become false after ALTER TABLE which rebuilds or copies
      the old table.
      If this is true, all instantly added columns should have default
      values, and records in the table may have REC_INFO_INSTANT_FLAG set. */
      bool has_instant_cols() const {
        ut_ad(n_instant_cols <= n_cols);

        return (n_instant_cols < n_cols);
      }

      /** Set the number of columns when the first instant ADD COLUMN happens.
      @param[in]	instant_cols	number of fields when first instant
                                      ADD COLUMN happens, without system
                                      columns */
      void set_instant_cols(uint16_t instant_cols) {
        n_instant_cols = static_cast<unsigned>(instant_cols) + get_n_sys_cols();
      }

      /** Get the number of user columns when the first instant ADD COLUMN
      happens.
      @return	the number of user columns as described above */
      uint16_t get_instant_cols() const {
        return static_cast<uint16_t>(n_instant_cols - get_n_sys_cols());
      }

      /** Check whether the table is corrupted.
      @return true if the table is corrupted, otherwise false */
      bool is_corrupted() const {
        ut_ad(magic_n == DICT_TABLE_MAGIC_N);

        const dict_index_t *index = first_index();

        /* It is possible that this table is only half created, in which case
        the clustered index may be NULL.  If the clustered index is corrupted,
        the table is corrupt.  We do not consider the table corrupt if only
        a secondary index is corrupt. */
        ut_ad(index == nullptr || index->is_clustered());

        return (index != nullptr && index->type & DICT_CORRUPT);
      }

      /** Returns a column's name.
      @param[in] col_nr	column number
      @return column name. NOTE: not guaranteed to stay valid if table is
      modified in any way (columns added, etc.). */
      const char *get_col_name(ulint col_nr) const {
        ut_ad(col_nr < n_def);
        ut_ad(magic_n == DICT_TABLE_MAGIC_N);

        const char *s = col_names;
        if (s) {
          for (ulint i = 0; i < col_nr; i++) {
            s += strlen(s) + 1;
          }
        }

        return (s);
      }

      /** Gets the nth column of a table.
      @param[in] pos	position of column
      @return pointer to column object */
      dict_col_t *get_col(ulint pos) const {
        ut_ad(pos < n_def);
        ut_ad(magic_n == DICT_TABLE_MAGIC_N);

        return (cols + pos);
      }

      /** Gets the number of user-defined non-virtual columns in a table
      in the dictionary cache.
      @return number of user-defined (e.g., not ROW_ID) non-virtual columns
      of a table */
      uint16_t get_n_user_cols() const {
        ut_ad(magic_n == DICT_TABLE_MAGIC_N);

        return (static_cast<uint16_t>(n_cols) - get_n_sys_cols());
      }

      /** Gets the number of system columns in a table.
      For intrinsic tables only the ROW_ID column is added; for all other
      tables TRX_ID and ROLL_PTR are also appended.
      @return number of system (e.g., ROW_ID) columns of a table */
      uint16_t get_n_sys_cols() const {
        ut_ad(magic_n == DICT_TABLE_MAGIC_N);

        return (is_intrinsic() ? DATA_ITT_N_SYS_COLS : DATA_N_SYS_COLS);
      }

      /** Gets the number of all non-virtual columns (also system) in a table
      in the dictionary cache.
      @return number of non-virtual columns of a table */
      ulint get_n_cols() const {
        ut_ad(magic_n == DICT_TABLE_MAGIC_N);

        return (n_cols);
      }

      /** Gets the given system column of a table.
      @param[in] sys DATA_ROW_ID, ...
      @return pointer to column object */
      dict_col_t *get_sys_col(ulint sys) const {
        dict_col_t *col;

        ut_ad(sys < get_n_sys_cols());
        ut_ad(magic_n == DICT_TABLE_MAGIC_N);

        col = get_col(n_cols - get_n_sys_cols() + sys);
        ut_ad(col->mtype == DATA_SYS);
        ut_ad(col->prtype == (sys | DATA_NOT_NULL));

        return (col);
      }

      /** Determine if this is a temporary table. */
      bool is_temporary() const {
        ut_ad(magic_n == DICT_TABLE_MAGIC_N);
        return (flags2 & DICT_TF2_TEMPORARY);
      }

      /** Determine if this is a FTS AUX table. */
      bool is_fts_aux() const {
        ut_ad(magic_n == DICT_TABLE_MAGIC_N);
        return (flags2 & DICT_TF2_AUX);
      }

      /** Determine whether the table is intrinsic.
      An intrinsic table is a special kind of temporary table that
      is invisible to the end user. It can be created internally by InnoDB,
      the MySQL server layer or other modules connected to InnoDB in order
      to gather and use data as part of a larger task. Since access to it
      must be as fast as possible, it does not need UNDO semantics, system
      fields DB_TRX_ID & DB_ROLL_PTR, doublewrite, checksum, insert buffer,
      use of the shared data dictionary, locking, or even a transaction.
      In short, these are not ACID tables at all, just temporary data stored
      and manipulated during a larger process.*/
      bool is_intrinsic() const {
        if (flags2 & DICT_TF2_INTRINSIC) {
          ut_ad(is_temporary());
          return (true);
        }

        return (false);
      }

      /* GAP locks are skipped for DD tables and SDI tables
      @return true if table is DD table or SDI table, else false */
      inline bool skip_gap_locks() const;

      /** Determine if the table can support instant ADD COLUMN */
      inline bool support_instant_add() const;
    };
    struct dict_vcol_templ_t {
      /** number of regular columns */
      ulint n_col;

      /** number of virtual columns */
      ulint n_v_col;

      /** array of templates for virtual col and their base columns */
      mysql_row_templ_t **vtempl;

      /** table's database name */
      std::string db_name;

      /** table name */
      std::string tb_name;

      /** share->table_name */
      std::string share_name;

      /** MySQL record length */
      ulint rec_len;

      /** default column value if any */
      byte *default_rec;
    };

There are several more related data structures here; each is worth reading carefully and analyzing.
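
One piece of bookkeeping in dict_table_t that deserves a worked example is instant ADD COLUMN. A sketch with assumed numbers, mirroring the logic of set_instant_cols() and has_instant_cols() shown above:

#include <cassert>

/* Assumed numbers: a normal (non-intrinsic) table carries
   DATA_N_SYS_COLS = 3 hidden system columns (ROW_ID, TRX_ID, ROLL_PTR). */
void sketch_instant_cols_demo() {
  const unsigned n_sys = 3;
  unsigned n_cols = 2 + n_sys;      /* CREATE TABLE t (a INT, b INT) -> 5 */
  unsigned n_instant_cols = n_cols; /* equal counters: no instant columns */

  assert(!(n_instant_cols < n_cols)); /* has_instant_cols() is false */

  /* First instant ADD COLUMN: n_instant_cols keeps the pre-ADD count
     (set_instant_cols(2) -> 2 + n_sys = 5) while n_cols grows to 6. */
  n_cols += 1;

  assert(n_instant_cols < n_cols); /* has_instant_cols() is now true; a
                                      rebuilding ALTER TABLE resets this */
}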

4. The Doublewrite Buffer

A doublewrite buffer, true to its name, must actually contain a buffer:

struct Buffer {
  /** Constructor
  @param[in]	n_pages		        Number of pages to create */
  explicit Buffer(size_t n_pages) noexcept
      : m_n_bytes(n_pages * univ_page_size.physical()) {
    ut_a(n_pages > 0);

    auto n_bytes = m_n_bytes + univ_page_size.physical();

    m_ptr_unaligned = static_cast<byte *>(ut_zalloc_nokey(n_bytes));

    m_ptr = static_cast<byte *>(ut_align(m_ptr_unaligned, UNIV_PAGE_SIZE));

    ut_a(ptrdiff_t(m_ptr - m_ptr_unaligned) <=
         (ssize_t)univ_page_size.physical());

    m_next = m_ptr;
  }

  /** Destructor */
  ~Buffer() noexcept {
    if (m_ptr_unaligned != nullptr) {
      ut_free(m_ptr_unaligned);
    }
    m_ptr_unaligned = nullptr;
  }

  /** Add the contents of ptr, up to n_bytes, to the buffer.
  @return false if it won't fit. Nothing is copied if it won't fit. */
  bool append(const void *ptr, size_t n_bytes) noexcept {
    ut_a(m_next >= m_ptr && m_next <= m_ptr + m_n_bytes);

    if (m_next + univ_page_size.physical() > m_ptr + m_n_bytes) {
      return false;
    }

    memcpy(m_next, ptr, n_bytes);
    m_next += univ_page_size.physical();

    return true;
  }

  /** @return the start of the buffer to write from. */
  byte *begin() noexcept { return m_ptr; }

  /** @return the start of the buffer to write from. */
  const byte *begin() const noexcept { return m_ptr; }

  /** @return the size of the buffer to write. */
  size_t size() const noexcept {
    ut_a(m_next >= m_ptr);
    return std::ptrdiff_t(m_next - m_ptr);
  }

  /** @return the capacity of the buffer in bytes. */
  size_t capacity() const noexcept { return m_n_bytes; }

  /** @return true if the buffer is empty. */
  bool empty() const noexcept { return size() == 0; }

  /** Empty the buffer. */
  void clear() noexcept { m_next = m_ptr; }

  /** Write buffer used in writing to the doublewrite buffer,
  aligned to an address divisible by UNIV_PAGE_SIZE (which is
  required by Windows AIO) */
  byte *m_ptr{};

  /** Start of next write to the buffer. */
  byte *m_next{};

  /** Pointer to m_ptr, but unaligned */
  byte *m_ptr_unaligned{};

  /** Size of the unaligned (raw) buffer. */
  const size_t m_n_bytes{};

  // Disable copying
  Buffer(const Buffer &) = delete;
  Buffer(const Buffer &&) = delete;
  Buffer &operator=(Buffer &&) = delete;
  Buffer &operator=(const Buffer &) = delete;
};
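
A usage sketch for this Buffer (assuming univ_page_size.physical() is 16 KiB): note that append() copies n_bytes but always advances m_next by one full physical page, so every call consumes one page slot:

/* Sketch only: stage a batch of page images, then drain the buffer. */
void sketch_dblwr_buffer_demo() {
  Buffer buf(128);                  /* room for 128 page images */
  static byte page[16 * 1024] = {}; /* one (dummy) page image to stage */

  while (buf.append(page, sizeof(page))) {
    /* staged one more page; buf.size() grew by one physical page */
  }
  /* Here buf.size() == buf.capacity(): write buf.begin()..buf.size() to
     the doublewrite file, fsync it, then write the pages to their data
     files; only then may the staged copies be discarded. */
  buf.clear();                      /* ready for the next batch */
}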

// Forward declaration.
class Segment;
class Batch_segment;

/** Doublewrite implementation. Assumes it can use DBLWR_PAGES. */
class Double_write {
  /** Maximum wait in micro-seconds for new write events. */
  static constexpr auto MAX_WAIT_FOR_EVENTS = 10000000;

 public:
  /** Number of instances. */
  static uint32_t s_n_instances;

  /** For collecting pages to write. */
  struct Buf_pages {
    /** Constructor.
    @param[in] size             Number of pages to reserve. */
    explicit Buf_pages(uint32_t size) : m_pages(size) {
      ut_a(size > 0);
      ut_a(m_pages.capacity() == size);
      ut_a(m_pages.size() == m_pages.capacity());
    }

    /** Add a page to the collection.
    @param[in] bpage     Page to write.
    @param[in] e_block   encrypted block.
    @param[in] e_len     length of data in e_block. */
    void push_back(buf_page_t *bpage, const file::Block *e_block,
                   uint32_t e_len) noexcept {
      ut_a(m_size < m_pages.capacity());
#ifdef UNIV_DEBUG
      {
        byte *e_frame =
            (e_block == nullptr) ? nullptr : os_block_get_frame(e_block);
        if (e_frame != nullptr) {
          ut_ad(mach_read_from_4(e_frame + FIL_PAGE_OFFSET) ==
                bpage->page_no());
          ut_ad(mach_read_from_4(e_frame + FIL_PAGE_SPACE_ID) ==
                bpage->space());
        }
      }
#endif /* UNIV_DEBUG */
      m_pages[m_size++] = std::make_tuple(bpage, e_block, e_len);
    }

    /** Clear the collection. */
    void clear() noexcept { m_size = 0; }

    /** @return check if collection is empty. */
    bool empty() const noexcept { return size() == 0; }

    /** @return number of active elements. */
    uint32_t size() const noexcept { return m_size; }

    /** @return the capacity of the collection. */
    uint32_t capacity() const noexcept MY_ATTRIBUTE((warn_unused_result)) {
      return m_pages.capacity();
    }

    typedef std::tuple<buf_page_t *, const file::Block *, uint32_t> Dblwr_tuple;
    using Pages = std::vector<Dblwr_tuple, ut_allocator<Dblwr_tuple>>;

    /** Collection of pages. */
    Pages m_pages{};

    /** Number of live elements. */
    uint32_t m_size{};
  };

  /** Constructor
  @param[in] id                 Instance ID
  @param[in] n_pages            Number of pages handled by this instance. */
  Double_write(uint16_t id, uint32_t n_pages) noexcept;

  /** Destructor */
  ~Double_write() noexcept;

  /** @return instance ID */
  uint16_t id() const noexcept MY_ATTRIBUTE((warn_unused_result)) {
    return m_id;
  }

  /** Process the requests in the flush queue, write the blocks to the
  double write file, sync the file if required and then write to the
  data files. */
  void write(buf_flush_t flush_type) noexcept;

  /** @return the double write instance to use for flushing.
  @param[in] buf_pool_index     Buffer pool instance number.
  @param[in] flush_type         LRU or Flush list write.
  @return instance that will handle the flush to disk. */
  static Double_write *instance(buf_flush_t flush_type,
                                uint32_t buf_pool_index) noexcept
      MY_ATTRIBUTE((warn_unused_result)) {
    ut_a(buf_pool_index < srv_buf_pool_instances);

    auto midpoint = s_instances->size() / 2;
    auto i = midpoint > 0 ? buf_pool_index % midpoint : 0;

    if (flush_type == BUF_FLUSH_LIST) {
      i += midpoint;
    }

    return s_instances->at(i);
  }

  /** Wait for any pending batch to complete.
  @return true if the thread had to wait for another batch. */
  bool wait_for_pending_batch() noexcept {
    ut_ad(mutex_own(&m_mutex));

    auto sig_count = os_event_reset(m_event);

    std::atomic_thread_fence(std::memory_order_acquire);

    if (m_batch_running.load(std::memory_order_acquire)) {
      mutex_exit(&m_mutex);

      MONITOR_INC(MONITOR_DBLWR_FLUSH_WAIT_EVENTS);
      os_event_wait_low(m_event, sig_count);
      sig_count = os_event_reset(m_event);
      return true;
    }

    return false;
  }

  /** Flush buffered pages to disk, clear the buffers.
  @param[in] flush_type           FLUSH LIST or LRU LIST flush.
  @return false if there was a write batch already in progress. */
  bool flush_to_disk(buf_flush_t flush_type) noexcept {
    ut_ad(mutex_own(&m_mutex));

    /* Wait for any batch writes that are in progress. */
    if (wait_for_pending_batch()) {
      ut_ad(!mutex_own(&m_mutex));
      return false;
    }

    MONITOR_INC(MONITOR_DBLWR_FLUSH_REQUESTS);

    /* Write the pages to disk and free up the buffer. */
    write_pages(flush_type);

    ut_a(m_buffer.empty());
    ut_a(m_buf_pages.empty());

    return true;
  }

  /** Process the requests in the flush queue, write the blocks to the
  double write file, sync the file if required and then write to the
  data files.
  @param[in] flush_type         LRU or FLUSH request. */
  void write_pages(buf_flush_t flush_type) noexcept;

  /** Force a flush of the page queue.
  @param[in] flush_type           FLUSH LIST or LRU LIST flush. */
  void force_flush(buf_flush_t flush_type) noexcept {
    for (;;) {
      mutex_enter(&m_mutex);
      if (!m_buf_pages.empty() && !flush_to_disk(flush_type)) {
        ut_ad(!mutex_own(&m_mutex));
        continue;
      }
      break;
    }
    mutex_exit(&m_mutex);
  }

  /** Add a page to the flush batch. If the flush batch is full then write
  the batch to disk.
  @param[in] flush_type     Flush type.
  @param[in] bpage          Page to flush to disk.
  @param[in] e_block        Encrypted block frame or nullptr.
  @param[in] e_len          Encrypted data length if e_block is valid. */
  void enqueue(buf_flush_t flush_type, buf_page_t *bpage,
               const file::Block *e_block, uint32_t e_len) noexcept {
    ut_ad(buf_page_in_file(bpage));

    void *frame{};
    uint32_t len{};
    byte *e_frame =
        (e_block == nullptr) ? nullptr : os_block_get_frame(e_block);

    if (e_frame != nullptr) {
      frame = e_frame;
      len = e_len;
    } else {
      prepare(bpage, &frame, &len);
    }

    ut_a(len <= univ_page_size.physical());

    for (;;) {
      mutex_enter(&m_mutex);

      if (m_buffer.append(frame, len)) {
        break;
      }

      if (flush_to_disk(flush_type)) {
        auto success = m_buffer.append(frame, len);
        ut_a(success);
        break;
      }

      ut_ad(!mutex_own(&m_mutex));
    }

    m_buf_pages.push_back(bpage, e_block, e_len);

    mutex_exit(&m_mutex);
  }

  /** Note that the IO batch has started. */
  void batch_started() noexcept {
    m_batch_running.store(true, std::memory_order_release);
  }

  /** Wake up all the threads that were waiting for the batch to complete. */
  void batch_completed() noexcept {
    m_batch_running.store(false, std::memory_order_release);
    std::atomic_thread_fence(std::memory_order_release);
    os_event_set(m_event);
  }

  /** Create the batch write segments.
  @param[in] segments_per_file  Number of configured segments per file.
  @return DB_SUCCESS or error code. */
  static dberr_t create_batch_segments(uint32_t segments_per_file) noexcept
      MY_ATTRIBUTE((warn_unused_result));

  /** Create the single page flush segments.
  @param[in] segments_per_file  Number of configured segments per file.
  @return DB_SUCCESS or error code. */
  static dberr_t create_single_segments(uint32_t segments_per_file) noexcept
      MY_ATTRIBUTE((warn_unused_result));

  /** Get the instance that handles a particular page's IO. Submit the
  write request to a double write queue that is empty.
  @param[in]  flush_type        Flush type.
  @param[in]	bpage             Page from the buffer pool.
  @param[in]  e_block    compressed + encrypted frame contents or nullptr.
  @param[in]  e_len      encrypted data length. */
  static void submit(buf_flush_t flush_type, buf_page_t *bpage,
                     const file::Block *e_block, uint32_t e_len) noexcept {
    if (s_instances == nullptr) {
      return;
    }

    auto dblwr = instance(flush_type, bpage);
    dblwr->enqueue(flush_type, bpage, e_block, e_len);
  }

  /** Writes a single page to the doublewrite buffer on disk, syncs it,
  then writes the page to the datafile.
  @param[in]	bpage             Data page to write to disk.
  @param[in]	e_block           Encrypted data block.
  @param[in]	e_len             Encrypted data length.
  @return DB_SUCCESS or error code */
  static dberr_t sync_page_flush(buf_page_t *bpage, file::Block *e_block,
                                 uint32_t e_len) noexcept
      MY_ATTRIBUTE((warn_unused_result));

  // clang-format off
  /** @return the double write instance to use for flushing.
  @param[in] flush_type         LRU or Flush list write.
  @param[in] bpage              Page to write to disk.
  @return instance that will handle the flush to disk. */
  static Double_write *instance(buf_flush_t flush_type, const buf_page_t *bpage)
      noexcept MY_ATTRIBUTE((warn_unused_result)) {
    return instance(flush_type, buf_pool_index(buf_pool_from_bpage(bpage)));
  }

  /** Updates the double write buffer when a write request is completed.
  @param[in,out] bpage          Block that has just been written to disk.
  @param[in] flush_type         Flush type that triggered the write. */
  static void write_complete(buf_page_t *bpage, buf_flush_t flush_type)
      noexcept;

  /** Read the V1 doublewrite buffer extents boundaries.
  @param[in,out] block1         Starting block number for the first extent.
  @param[in,out] block2         Starting block number for the second extent.
  @return true if successful, false if not. */
  static bool init_v1(page_no_t &block1, page_no_t &block2) noexcept
      MY_ATTRIBUTE((warn_unused_result));

  /** Creates the V1 doublewrite buffer extents. The header of the
  doublewrite buffer is placed on the trx system header page.
  @param[in,out] block1         Starting block number for the first extent.
  @param[in,out] block2         Starting block number for the second extent.
  @return true if successful, false if not. */
  static bool create_v1(page_no_t &block1, page_no_t &block2) noexcept
      MY_ATTRIBUTE((warn_unused_result));

  /** Writes a page that has already been written to the
  doublewrite buffer to the data file. It is the job of the
  caller to sync the datafile.
  @param[in]  in_bpage          Page to write.
  @param[in]  sync              true if it's a synchronous write.
  @param[in]  e_block           block containing encrypted data frame.
  @param[in]  e_len             encrypted data length.
  @return DB_SUCCESS or error code */
  static dberr_t write_to_datafile(const buf_page_t *in_bpage, bool sync,
      const file::Block* e_block, uint32_t e_len)
      noexcept MY_ATTRIBUTE((warn_unused_result));

  /** Force a flush of the page queue.
  @param[in] flush_type           FLUSH LIST or LRU LIST flush.
  @param[in] buf_pool_index       Buffer pool instance for which called. */
  static void force_flush(buf_flush_t flush_type, uint32_t buf_pool_index)
      noexcept {
    if (s_instances == nullptr) {
      return;
    }
    auto dblwr = instance(flush_type, buf_pool_index);

    dblwr->force_flush(flush_type);
  }

  /** Load the doublewrite buffer pages from an external file.
  @param[in,out]	file		      File handle
  @param[in,out]	pages		      For storing the doublewrite pages
                                read from the file
  @return DB_SUCCESS or error code */
  static dberr_t load(dblwr::File &file, recv::Pages *pages) noexcept
      MY_ATTRIBUTE((warn_unused_result));

  /** Write zeros to the file if it is "empty"
  @param[in]	file		          File instance.
  @param[in]	n_pages           Size in physical pages.
  @return DB_SUCCESS or error code */
  static dberr_t init_file(dblwr::File &file, uint32_t n_pages) noexcept
      MY_ATTRIBUTE((warn_unused_result));

  /** Reset the size in bytes to the configured size.
  @param[in,out] file						File to reset.
  @param[in] truncate           Truncate the file to configured size if true. */
  static void reset_file(dblwr::File &file, bool truncate) noexcept;

  /** Reset the size in bytes to the configured size of all files. */
  static void reset_files() noexcept {
    for (auto &file : Double_write::s_files) {
      /* Physically truncate the file: true. */
      Double_write::reset_file(file, true);
    }
  }

  /** Create the v2 data structures
  @return DB_SUCCESS or error code */
  static dberr_t create_v2() noexcept MY_ATTRIBUTE((warn_unused_result));

#ifndef _WIN32
  /** @return true if we need to fsync to disk */
  static bool is_fsync_required() noexcept MY_ATTRIBUTE((warn_unused_result)) {
    /* srv_unix_file_flush_method is a dynamic variable. */
    return srv_unix_file_flush_method != SRV_UNIX_O_DIRECT &&
           srv_unix_file_flush_method != SRV_UNIX_O_DIRECT_NO_FSYNC;
  }
#endif /* !_WIN32 */

  /** Extract the data and length to write to the doublewrite file
  @param[in]	bpage		          Page to write
  @param[out]	ptr		            Start of buffer to write
  @param[out]	len		            Length of the data to write */
  static void prepare(const buf_page_t *bpage, void **ptr, uint32_t *len)
      noexcept;

  /** Free the data structures. */
  static void shutdown() noexcept;

  /** Toggle the doublewrite buffer dynamically
  @param[in]	value		          Current value */
  static void toggle(bool value) noexcept {
    if (s_instances == nullptr) {
      return;
    }

    if (value) {
      ib::info(ER_IB_MSG_DBLWR_1304) << "Atomic write enabled";
    } else {
      ib::info(ER_IB_MSG_DBLWR_1305) << "Atomic write disabled";
    }
  }

  // clang-format on

  /** Write the data to disk synchronously.
  @param[in]    segment      Segment to write to.
  @param[in]	bpage        Page to write.
  @param[in]    e_block      Encrypted block.  Can be nullptr.
  @param[in]    e_len        Encrypted data length in e_block. */
  static void single_write(Segment *segment, const buf_page_t *bpage,
                           file::Block *e_block, uint32_t e_len) noexcept;

 private:
  /** Create the singleton instance, start the flush thread
  @return DB_SUCCESS or error code */
  static dberr_t start() noexcept MY_ATTRIBUTE((warn_unused_result));

  /** Asserts when a corrupt block is found during writing out
  data to the disk.
  @param[in]	block		          Block that was corrupt */
  static void croak(const buf_block_t *block) noexcept;

  /** Check the LSN values on the page with which this block
  is associated.  Also validate the page if the option is set.
  @param[in]	block		          Block to check */
  static void check_block(const buf_block_t *block) noexcept;

  /** Check the LSN values on the page.
  @param[in]	page		          Page to check */
  static void check_page_lsn(const page_t *page) noexcept;

  /** Calls buf_page_get() on the TRX_SYS_PAGE and returns
  a pointer to the doublewrite buffer within it.
  @param[in,out]	mtr		        To manage the page latches
  @return pointer to the doublewrite buffer within the filespace
          header page. */
  static byte *get(mtr_t *mtr) noexcept MY_ATTRIBUTE((warn_unused_result));

 private:
  using Segments = mpmc_bq<Segment *>;
  using Instances = std::vector<Double_write *>;
  using Batch_segments = mpmc_bq<Batch_segment *>;

  /** Instance ID */
  uint16_t m_id{};

  /** Protects m_buf_pages. */
  ib_mutex_t m_mutex;

  /** Wait for IO batch to complete. */
  os_event_t m_event;

  /** true if the batch hasn't completed yet. */
  std::atomic_bool m_batch_running{false};

  /** The copy of the page frame; the page must be in m_buf_pages. */
  Buffer m_buffer;

  /** Pages that should be written to the data files. */
  Buf_pages m_buf_pages;

  /** File segments to use for LRU batched writes. */
  static Batch_segments *s_LRU_batch_segments;

  /** File segments to use for flush list batched writes. */
  static Batch_segments *s_flush_list_batch_segments;

  /** File segments to use for single page writes. */
  static Segments *s_single_segments;

  /** For indexing batch segments by ID. */
  static std::vector<Batch_segment *> s_segments;

 public:
  /** Files to use for atomic writes. */
  static std::vector<dblwr::File> s_files;

  /** The global instances */
  static Instances *s_instances;

  // Disable copying
  Double_write(const Double_write &) = delete;
  Double_write(const Double_write &&) = delete;
  Double_write &operator=(Double_write &&) = delete;
  Double_write &operator=(const Double_write &) = delete;
};

/** File segment of a double write file. */
class Segment {
 public:
  /** Constructor.
  @param[in] file               File that owns the segment.
  @param[in] start              Offset (page number) of segment in the file.
  @param[in] n_pages            Number of pages in the segment. */
  Segment(dblwr::File &file, page_no_t start, uint32_t n_pages)
      : m_file(file),
        m_start(start * univ_page_size.physical()),
        m_end(m_start + (n_pages * univ_page_size.physical())) {}

  /** Destructor. */
  virtual ~Segment() {}

  /** Write to the segment.
  @param[in] ptr                Start writing from here.
  @param[in] len                Number of bytes to write. */
  void write(const void *ptr, uint32_t len) noexcept {
    ut_a(len <= m_end - m_start);
    IORequest req(IORequest::WRITE | IORequest::DO_NOT_WAKE);

    req.dblwr();

    auto err = os_file_write_retry(req, m_file.m_name.c_str(), m_file.m_pfs,
                                   ptr, m_start, len);
    ut_a(err == DB_SUCCESS);
  }

  /** Flush the segment to disk. */
  void flush() noexcept { os_file_flush(m_file.m_pfs); }

  /** File that owns the segment. */
  dblwr::File &m_file;

  /** Physical offset in the file for the segment. */
  os_offset_t m_start{};

  /** Physical offset up to which this segment is responsible for. */
  os_offset_t m_end{};

  // Disable copying
  Segment(Segment &&) = delete;
  Segment(const Segment &) = delete;
  Segment &operator=(Segment &&) = delete;
  Segment &operator=(const Segment &) = delete;
};

struct File {
  /** ID of the file. */
  uint32_t m_id{};

  /** File name. */
  std::string m_name{};

  /** File handle. */
  pfs_os_file_t m_pfs{};

  /** Number of batched pages per doublewrite file. */
  static uint32_t s_n_pages;

  /** Serialize the object into JSON format.
  @return the object in JSON format. */
  std::string to_json() const noexcept MY_ATTRIBUTE((warn_unused_result)) {
    std::ostringstream out;
    out << "{";
    out << "\"className\": \"dblwr::File\",";
    out << "\"m_id\": \"" << m_id << "\",";
    out << "\"m_name\": \"" << m_name << "\",";
    out << "\"s_n_pages\": \"" << s_n_pages << "\"";
    out << "}";

    return out.str();
  }

  /** Print this object into the given stream.
  @param[in]  out  output stream into which the current object is printed.
  @return the output stream. */
  std::ostream &print(std::ostream &out) const noexcept {
    out << to_json();
    return out;
  }
};

/** Pages recovered from the doublewrite buffer */
class Pages {
 public:
  using Buffers = std::vector<Page *, ut_allocator<Page *>>;

  /** Default constructor */
  Pages() : m_pages() {}

  /** Destructor */
  ~Pages() noexcept {
    for (auto &page : m_pages) {
      UT_DELETE(page);
    }

    m_pages.clear();
  }

  /** Add a page frame to the doublewrite recovery buffer.
  @param[in]	page_no		        Page number in the doublewrite buffer
  @param[in]	page		          Page contents
  @param[in]	n_bytes		        Size in bytes */
  void add(page_no_t page_no, const byte *page, uint32_t n_bytes) noexcept;

  /** Find a doublewrite copy of a page.
  @param[in]	page_id		        Page number to lookup
  @return	page frame
  @retval nullptr if no page was found */
  const byte *find(const page_id_t &page_id) const noexcept;

  /** Recover double write buffer pages
  @param[in]	space		          Tablespace pages to recover, if set
                                to nullptr then try to recover all. */
  void recover(fil_space_t *space) noexcept;

  /** Check if some pages could be restored because of missing
  tablespace IDs */
  void check_missing_tablespaces() const noexcept;

  /** Obtain the vector of pages.
  @return the vector of pages. */
  Buffers &get_pages() noexcept MY_ATTRIBUTE((warn_unused_result)) {
    return m_pages;
  }

 private:
  /** Recovered doublewrite buffer page frames */
  Buffers m_pages;

  // Disable copying
  Pages(const Pages &) = delete;
  Pages(const Pages &&) = delete;
  Pages &operator=(Pages &&) = delete;
  Pages &operator=(const Pages &) = delete;
};

Having seen the page and segment data structures here, the picture should now be clear: the doublewrite buffer actually operates on data through exactly these objects. To make the enqueue()/flush_to_disk() flow easier to hold in mind, a simplified sketch follows.

五、Logs

Earlier posts analyzed the log at the upper application layer; now let's look at the concrete undo log part of the InnoDB engine (innobase/include/trx0purge.h and related headers):


// Forward declaration
struct TrxUndoRsegsIterator;

/** This is the purge pointer/iterator. We need both the undo no and the
transaction no up to which purge has parsed and applied the records. */
struct purge_iter_t {
  purge_iter_t() : trx_no(), undo_no(), undo_rseg_space(SPACE_UNKNOWN) {
    // Do nothing
  }

  /** Purge has advanced past all transactions whose number
  is less than this */
  trx_id_t trx_no;

  /** Purge has advanced past all records whose undo number
  is less than this. */
  undo_no_t undo_no;

  /** The last undo record resided in this space id */
  space_id_t undo_rseg_space;

  /** The transaction that created the undo log record,
  the Modifier trx id */
  trx_id_t modifier_trx_id;
};

/** An undo::Tablespace object is used to easily convert between
undo_space_id and undo_space_num and to create the automatic file_name
and space name.  In addition, it is used in undo::Tablespaces to track
the trx_rseg_t objects in an Rsegs vector. So we do not allocate the
Rsegs vector for each object, only when requested by the constructor. */
struct Tablespace {
  /** Constructor
  @param[in]  id    tablespace id */
  explicit Tablespace(space_id_t id)
      : m_id(id),
        m_num(undo::id2num(id)),
        m_implicit(true),
        m_new(false),
        m_space_name(),
        m_file_name(),
        m_log_file_name(),
        m_rsegs() {}

  /** Copy Constructor
  @param[in]  other    undo tablespace to copy */
  Tablespace(Tablespace &other)
      : m_id(other.id()),
        m_num(undo::id2num(other.id())),
        m_implicit(other.is_implicit()),
        m_new(other.is_new()),
        m_space_name(),
        m_file_name(),
        m_log_file_name(),
        m_rsegs() {
    ut_ad(m_id == 0 || is_reserved(m_id));

    set_space_name(other.space_name());
    set_file_name(other.file_name());

    /* When the copy constructor is used, add an Rsegs
    vector. This constructor is only used in the global
    undo::Tablespaces object where rollback segments are
    tracked. */
    m_rsegs = UT_NEW_NOKEY(Rsegs());
  }

  /** Destructor */
  ~Tablespace() {
    if (m_space_name != nullptr) {
      ut_free(m_space_name);
      m_space_name = nullptr;
    }

    if (m_file_name != nullptr) {
      ut_free(m_file_name);
      m_file_name = nullptr;
    }

    if (m_log_file_name != nullptr) {
      ut_free(m_log_file_name);
      m_log_file_name = nullptr;
    }

    /* Clear the cached rollback segments.  */
    if (m_rsegs != nullptr) {
      UT_DELETE(m_rsegs);
      m_rsegs = nullptr;
    }
  }

  /** Determine if this undo space needs to be truncated.
  @return true if it should be truncated, false if not. */
  bool needs_truncation();

  /** Change the space_id from its current value.
  @param[in]  space_id  The new undo tablespace ID */
  void set_space_id(space_id_t space_id);

  /** Replace the standard undo space name if it exists with a copy
  of the undo tablespace name provided.
  @param[in]  new_space_name  non-standard undo space name */
  void set_space_name(const char *new_space_name);

  /** Get the undo tablespace name. Make it if not yet made.
  NOTE: This is only called from stack objects so there is no
  race condition. If it is ever called from a shared object
  like undo::spaces, then it must be protected by the caller.
  @return tablespace name created from the space_id */
  char *space_name() {
    if (m_space_name == nullptr) {
#ifndef UNIV_HOTBACKUP
      m_space_name = make_space_name(m_id);
#endif /* !UNIV_HOTBACKUP */
    }

    return (m_space_name);
  }

  /** Replace the standard undo file name if it exists with a copy
  of the file name provided. This name can come in three forms:
  absolute path, relative path, and basename.  Undo ADD DATAFILE
  does not accept a relative path.  So if that comes in here, it
  was the scanned name and is relative to the datadir.
  If this is just a basename, add it to srv_undo_dir.
  @param[in]  file_name  explicit undo file name */
  void set_file_name(const char *file_name);

  /** Get the undo space filename. Make it if not yet made.
  NOTE: This is only called from stack objects so there is no
  race condition. If it is ever called from a shared object
  like undo::spaces, then it must be protected by the caller.
  @return tablespace filename created from the space_id */
  char *file_name() {
    if (m_file_name == nullptr) {
      m_file_name = make_file_name(m_id);
    }

    return (m_file_name);
  }

  /** Build a log file name based on space_id
  @param[in]	space_id	id of the undo tablespace.
  @return the log file name */
  char *make_log_file_name(space_id_t space_id);

  /** Get the undo log filename. Make it if not yet made.
  NOTE: This is only called from stack objects so there is no
  race condition. If it is ever called from a shared object
  like undo::spaces, then it must be protected by the caller.
  @return tablespace filename created from the space_id */
  char *log_file_name() {
    if (m_log_file_name == nullptr) {
      m_log_file_name = make_log_file_name(m_id);
    }

    return (m_log_file_name);
  }

  /** Get the undo tablespace ID.
  @return tablespace ID */
  space_id_t id() { return (m_id); }

  /** Get the undo tablespace number.  This is the same as m_id
  if m_id is 0 or this is a v5.6-5.7 undo tablespace. v8+ undo
  tablespaces use a space_id from the reserved range.
  @return undo tablespace number */
  space_id_t num() {
    ut_ad(m_num < FSP_MAX_ROLLBACK_SEGMENTS);

    return (m_num);
  }

  /** Get a reference to the List of rollback segments within
  this undo tablespace.
  @return a reference to the Rsegs vector. */
  Rsegs *rsegs() { return (m_rsegs); }

  /** Report whether this undo tablespace was explicitly created
  by an SQL statement.
  @return true if the tablespace was created explicitly. */
  bool is_explicit() { return (!m_implicit); }

  /** Report whether this undo tablespace was implicitly created.
  @return true if the tablespace was created implicitly. */
  bool is_implicit() { return (m_implicit); }

  /** Report whether this undo tablespace was created at startup.
  @retval true if created at startup.
  @retval false if pre-existed at startup. */
  bool is_new() { return (m_new); }

  /** Note that this undo tablespace is being created. */
  void set_new() { m_new = true; }

  /** Return whether the undo tablespace is active.
  @return true if active */
  bool is_active() {
    if (m_rsegs == nullptr) {
      return (false);
    }
    m_rsegs->s_lock();
    bool ret = m_rsegs->is_active();
    m_rsegs->s_unlock();
    return (ret);
  }

  /** Return whether the undo tablespace is active. For optimization purposes,
  do not take a latch.
  @return true if active */
  bool is_active_no_latch() {
    if (m_rsegs == nullptr) {
      return (false);
    }
    return (m_rsegs->is_active());
  }

  /** Return the rseg at the requested rseg slot if the undo space is active.
  @param[in] slot   The slot of the rseg.  1 to 127
  @return Rseg pointer, or nullptr if the space is not active. */
  trx_rseg_t *get_active(ulint slot) {
    m_rsegs->s_lock();
    if (!m_rsegs->is_active()) {
      m_rsegs->s_unlock();
      return (nullptr);
    }

    /* Mark the chosen rseg so that it will not be selected
    for UNDO truncation. */
    trx_rseg_t *rseg = m_rsegs->at(slot);
    rseg->trx_ref_count++;

    m_rsegs->s_unlock();

    return (rseg);
  }

  /** Return whether the undo tablespace is inactive due to
  implicit selection by the purge thread.
  @return true if marked for truncation by the purge thread */
  bool is_inactive_implicit() {
    if (m_rsegs == nullptr) {
      return (false);
    }
    m_rsegs->s_lock();
    bool ret = m_rsegs->is_inactive_implicit();
    m_rsegs->s_unlock();
    return (ret);
  }

  /** Return whether the undo tablespace was made inactive by
  ALTER TABLESPACE.
  @return true if altered inactive */
  bool is_inactive_explicit() {
    if (m_rsegs == nullptr) {
      return (false);
    }
    m_rsegs->s_lock();
    bool ret = m_rsegs->is_inactive_explicit();
    m_rsegs->s_unlock();
    return (ret);
  }

  /** Return whether the undo tablespace is empty and ready
  to be dropped.
  @return true if empty */
  bool is_empty() {
    if (m_rsegs == nullptr) {
      return (true);
    }
    m_rsegs->s_lock();
    bool ret = m_rsegs->is_empty();
    m_rsegs->s_unlock();
    return (ret);
  }

  /** Set the undo tablespace active for use by transactions. */
  void set_active() {
    m_rsegs->x_lock();
    m_rsegs->set_active();
    m_rsegs->x_unlock();
  }

  /** Set the state of the rollback segments in this undo tablespace to
  inactive_implicit if currently active.  If the state is inactive_explicit,
  leave as is. Then put the space_id into the callers marked_space_id.
  This is done when marking a space for truncate.  It will not be used
  for new transactions until it becomes active again. */
  void set_inactive_implicit(space_id_t *marked_space_id) {
    m_rsegs->x_lock();
    if (m_rsegs->is_active()) {
      m_rsegs->set_inactive_implicit();
    }
    *marked_space_id = m_id;

    m_rsegs->x_unlock();
  }

  /** Make the undo tablespace inactive so that it will not be
  used for new transactions.  The purge thread will clear out
  all the undo logs, truncate it, and then mark it empty. */
  void set_inactive_explicit() {
    m_rsegs->x_lock();
    m_rsegs->set_inactive_explicit();
    m_rsegs->x_unlock();
  }

  /** Make the undo tablespace active again so that it will
  be used for new transactions.
  If the current state is ___ then do:
  empty:            Set active.
  active_implicit:  Ignore.  It was not altered inactive. When it is done
                    being truncated it will go back to active.
  active_explicit:  Depends if it is marked for truncation.
    marked:         Set to inactive_implicit. the next state will be active.
    not yet:        Set to active so that it does not get truncated.  */
  void alter_active();

  /** Set the state of the undo tablespace to empty so that it
  can be dropped. */
  void set_empty() {
    m_rsegs->x_lock();
    m_rsegs->set_empty();
    m_rsegs->x_unlock();
  }

 private:
  /** Undo Tablespace ID. */
  space_id_t m_id;

  /** Undo Tablespace number, from 1 to 127. This is the
  7-bit number that is used in a rollback pointer.
  Use id2num() to get this number from a space_id. */
  space_id_t m_num;

  /** True if this is an implicit undo tablespace */
  bool m_implicit;

  /** True if this undo tablespace was implicitly created when
  this instance started up. False if it pre-existed. */
  bool m_new;

  /** The tablespace name, auto-generated when needed from
  the space number. */
  char *m_space_name;

  /** The tablespace file name, auto-generated when needed
  from the space number. */
  char *m_file_name;

  /** The tablespace log file name, auto-generated when needed
  from the space number. */
  char *m_log_file_name;

  /** List of rollback segments within this tablespace.
  This is not always used. Must call init_rsegs to use it. */
  Rsegs *m_rsegs;
};

Next, let's look at the data structures of the redo log (log0types.h):

struct Log_handle {
  lsn_t start_lsn;

  lsn_t end_lsn;
};

/** Redo log - single data structure with state of the redo log system.
In future, one could consider splitting this to multiple data structures. */
struct alignas(ut::INNODB_CACHE_LINE_SIZE) log_t {
  /**************************************************/ /**

   @name Users writing to log buffer

   *******************************************************/

  /** @{ */

#ifndef UNIV_HOTBACKUP
  /** Event used for locking sn */
  os_event_t sn_lock_event;

#ifdef UNIV_PFS_RWLOCK
  /** The instrumentation hook */
  struct PSI_rwlock *pfs_psi;
#endif /* UNIV_PFS_RWLOCK */
#ifdef UNIV_DEBUG
  /** The rw_lock instance only for the debug info list */
  /* NOTE: Just "rw_lock_t sn_lock_inst;" and direct minimum initialization
  seem to hit the bug of Sun Studio of Solaris. */
  rw_lock_t *sn_lock_inst;
#endif /* UNIV_DEBUG */

  /** Current sn value. Used to reserve space in the redo log,
  and used to acquire an exclusive access to the log buffer.
  Represents number of data bytes that have ever been reserved.
  Bytes of headers and footers of log blocks are not included.
  Its highest bit is used for locking the access to the log buffer. */
  MY_COMPILER_DIAGNOSTIC_PUSH()
  MY_COMPILER_CLANG_WORKAROUND_REF_DOCBUG()
  /**
  @see @ref subsect_redo_log_sn */
  MY_COMPILER_DIAGNOSTIC_POP()
  alignas(ut::INNODB_CACHE_LINE_SIZE) atomic_sn_t sn;

  /** Intended sn value while x-locked. */
  atomic_sn_t sn_locked;

  /** Mutex which can be used for x-lock sn value */
  mutable ib_mutex_t sn_x_lock_mutex;

  /** Padding after the _sn to avoid false sharing issues for
  constants below (due to changes of sn). */
  alignas(ut::INNODB_CACHE_LINE_SIZE)

      /** Pointer to the log buffer, aligned up to OS_FILE_LOG_BLOCK_SIZE.
      The alignment is to ensure that buffer parts specified for file IO write
      operations will be aligned to sector size, which is required e.g. on
      Windows when doing unbuffered file access.
      Protected by: locking sn not to add. */
      aligned_array_pointer<byte, OS_FILE_LOG_BLOCK_SIZE> buf;

  /** Size of the log buffer expressed in number of data bytes,
  that is excluding bytes for headers and footers of log blocks. */
  atomic_sn_t buf_size_sn;

  /** Size of the log buffer expressed in number of total bytes,
  that is including bytes for headers and footers of log blocks. */
  size_t buf_size;

  alignas(ut::INNODB_CACHE_LINE_SIZE)

      /** The recent written buffer.
      Protected by: locking sn not to add. */
      Link_buf<lsn_t> recent_written;

  /** Used for pausing the log writer threads.
  When paused, each user thread should write log as in the former version. */
  std::atomic_bool writer_threads_paused;

  /** Some threads waiting for the ready for write lsn by closer_event. */
  lsn_t current_ready_waiting_lsn;

  /** current_ready_waiting_lsn is waited using this sig_count. */
  int64_t current_ready_waiting_sig_count;

  alignas(ut::INNODB_CACHE_LINE_SIZE)

      /** The recent closed buffer.
      Protected by: locking sn not to add. */
      Link_buf<lsn_t> recent_closed;

  alignas(ut::INNODB_CACHE_LINE_SIZE)

      /** @} */

      /**************************************************/ /**

       @name Users <=> writer

       *******************************************************/

      /** @{ */

      /** Maximum sn up to which there is free space in both the log buffer
      and the log files. This is the limit for the end of any write to the
      log buffer. Threads which hit the limit need to wait, and they possibly
      hold latches of dirty pages, making a deadlock possible.
      Protected by: writer_mutex (writes). */
      atomic_sn_t buf_limit_sn;

  /** Up to this lsn, data has been written to disk (fsync not required).
  Protected by: writer_mutex (writes). */
  MY_COMPILER_DIAGNOSTIC_PUSH()
  MY_COMPILER_CLANG_WORKAROUND_REF_DOCBUG()
  /*
  @see @ref subsect_redo_log_write_lsn */
  MY_COMPILER_DIAGNOSTIC_POP()
  alignas(ut::INNODB_CACHE_LINE_SIZE) atomic_lsn_t write_lsn;

  alignas(ut::INNODB_CACHE_LINE_SIZE)

      /** Unaligned pointer to array with events, which are used for
      notifications sent from the log write notifier thread to user threads.
      The notifications are sent when write_lsn is advanced. User threads
      wait for write_lsn >= lsn, for some lsn. Log writer advances the
      write_lsn and notifies the log write notifier, which notifies all users
      interested in nearby lsn values (lsn belonging to the same log block).
      Note that false wake-ups are possible, in which case user threads
      simply retry waiting. */
      os_event_t *write_events;

  /** Number of entries in the array with write_events. */
  size_t write_events_size;

  /** Approx. number of requests to write/flush redo since startup. */
  alignas(ut::INNODB_CACHE_LINE_SIZE)
      std::atomic<uint64_t> write_to_file_requests_total;

  /** How often redo write/flush is requested on average.
  Measured in microseconds. Log threads do not spin when
  the write/flush requests are not frequent. */
  alignas(ut::INNODB_CACHE_LINE_SIZE)
      std::atomic<uint64_t> write_to_file_requests_interval;

  /** This padding is probably not needed, left for convenience. */
  alignas(ut::INNODB_CACHE_LINE_SIZE)

      /** @} */

      /**************************************************/ /**

       @name Users <=> flusher

       *******************************************************/

      /** @{ */

      /** Unaligned pointer to array with events, which are used for
      notifications sent from the log flush notifier thread to user threads.
      The notifications are sent when flushed_to_disk_lsn is advanced.
      User threads wait for flushed_to_disk_lsn >= lsn, for some lsn.
      Log flusher advances the flushed_to_disk_lsn and notifies the
      log flush notifier, which notifies all users interested in nearby lsn
      values (lsn belonging to the same log block). Note that false
      wake-ups are possible, in which case user threads simply retry
      waiting. */
      os_event_t *flush_events;

  /** Number of entries in the array with events. */
  size_t flush_events_size;

  /** This event is in the reset state when a flush is running;
  a thread should wait for this without owning any of redo mutexes,
  but NOTE that to reset this event, the thread MUST own the writer_mutex */
  os_event_t old_flush_event;

  /** Padding before the frequently updated flushed_to_disk_lsn. */
  alignas(ut::INNODB_CACHE_LINE_SIZE)

      /** Up to this lsn data has been flushed to disk (fsynced). */
      atomic_lsn_t flushed_to_disk_lsn;

  /** Padding after the frequently updated flushed_to_disk_lsn. */
  alignas(ut::INNODB_CACHE_LINE_SIZE)

      /** @} */

      /**************************************************/ /**

       @name Log flusher thread

       *******************************************************/

      /** @{ */

      /** Last flush start time. Updated just before fsync starts. */
      Log_clock_point last_flush_start_time;

  /** Last flush end time. Updated just after fsync is finished.
  If smaller than start time, then flush operation is pending. */
  Log_clock_point last_flush_end_time;

  /** Flushing average time (in microseconds). */
  double flush_avg_time;

  /** Mutex which can be used to pause log flusher thread. */
  mutable ib_mutex_t flusher_mutex;

  alignas(ut::INNODB_CACHE_LINE_SIZE)

      os_event_t flusher_event;

  /** Padding to avoid any dependency between the log flusher
  and the log writer threads. */
  alignas(ut::INNODB_CACHE_LINE_SIZE)

      /** @} */

      /**************************************************/ /**

       @name Log writer thread

       *******************************************************/

      /** @{ */

      /** Space id for pages with log blocks. */
      space_id_t files_space_id;

  /** Size of buffer used for the write-ahead (in bytes). */
  uint32_t write_ahead_buf_size;

  /** Aligned pointer to buffer used for the write-ahead. It is aligned to
  system page size (why?) and is currently limited by constant 64KB. */
  aligned_array_pointer<byte, 64 * 1024> write_ahead_buf;

  /** Up to this file offset in the log files, the write-ahead
  has been done or is not required (for any other reason). */
  uint64_t write_ahead_end_offset;

  /** Aligned buffers for file headers. */
  aligned_array_pointer<byte, OS_FILE_LOG_BLOCK_SIZE> *file_header_bufs;
#endif /* !UNIV_HOTBACKUP */

  /** Some lsn value within the current log file. */
  lsn_t current_file_lsn;

  /** File offset for the current_file_lsn. */
  uint64_t current_file_real_offset;

  /** Up to this file offset we are within the same current log file. */
  uint64_t current_file_end_offset;

  /** Number of performed IO operations (only for printing stats). */
  uint64_t n_log_ios;

  /** Size of each single log file (expressed in bytes, including
  file header). */
  uint64_t file_size;

  /** Number of log files. */
  uint32_t n_files;

  /** Total capacity of all the log files (file_size * n_files),
  including headers of the log files. */
  uint64_t files_real_capacity;

  /** Capacity of redo log files for the log writer thread. The log writer
  must not exceed this value. If space is not reclaimed after a 1 sec
  wait, it writes only as much as fits in the free space, or crashes if
  there is no free space at all (checkpoint did not advance for 1 sec). */
  lsn_t lsn_capacity_for_writer;

  /** When this margin is being used, the log writer decides to increase
  the concurrency_margin to stop new incoming mini-transactions earlier,
  on a bigger margin. This provides an adaptive concurrency margin
  calculation, which we need because the thread concurrency setting might
  be unlimited or we could miss some log_free_check() calls.
  It is just a best effort to help get out of trouble. */
  lsn_t extra_margin;

  /** True if we haven't increased the concurrency_margin since we entered
  (lsn_capacity_for_margin_inc..lsn_capacity_for_writer] range. This allows
  to increase the margin only once per issue and wait until the issue becomes
  resolved, still having an option to increase margin even more, if new issue
  comes later. */
  bool concurrency_margin_ok;

  /** Maximum allowed concurrency_margin. We never set higher, even when we
  increase the concurrency_margin in the adaptive solution. */
  lsn_t max_concurrency_margin;

#ifndef UNIV_HOTBACKUP
  /** Mutex which can be used to pause log writer thread. */
  mutable ib_mutex_t writer_mutex;

  alignas(ut::INNODB_CACHE_LINE_SIZE)

      os_event_t writer_event;

  /** Padding after section for the log writer thread, to avoid any
  dependency between the log writer and the log closer threads. */
  alignas(ut::INNODB_CACHE_LINE_SIZE)

      /** @} */

      /**************************************************/ /**

       @name Log closer thread

       *******************************************************/

      /** @{ */

      /** Event used by the log closer thread to wait for tasks. */
      os_event_t closer_event;

  /** Mutex which can be used to pause log closer thread. */
  mutable ib_mutex_t closer_mutex;

  /** Padding after the log closer thread and before the memory used
  for communication between the log flusher and notifier threads. */
  alignas(ut::INNODB_CACHE_LINE_SIZE)

      /** @} */

      /**************************************************/ /**

       @name Log flusher <=> flush_notifier

       *******************************************************/

      /** @{ */

      /** Event used by the log flusher thread to notify the log flush
      notifier thread, that it should proceed with notifying user threads
      waiting for the advanced flushed_to_disk_lsn (because it has been
      advanced). */
      os_event_t flush_notifier_event;

  /** The next flushed_to_disk_lsn can be waited using this sig_count. */
  int64_t current_flush_sig_count;

  /** Mutex which can be used to pause log flush notifier thread. */
  mutable ib_mutex_t flush_notifier_mutex;

  /** Padding. */
  alignas(ut::INNODB_CACHE_LINE_SIZE)

      /** @} */

      /**************************************************/ /**

       @name Log writer <=> write_notifier

       *******************************************************/

      /** @{ */

      /** Mutex which can be used to pause log write notifier thread. */
      mutable ib_mutex_t write_notifier_mutex;

  alignas(ut::INNODB_CACHE_LINE_SIZE)

      /** Event used by the log writer thread to notify the log write
      notifier thread, that it should proceed with notifying user threads
      waiting for the advanced write_lsn (because it has been advanced). */
      os_event_t write_notifier_event;

  alignas(ut::INNODB_CACHE_LINE_SIZE)

      /** @} */

      /**************************************************/ /**

       @name Maintenance

       *******************************************************/

      /** @{ */

      /** Used for stopping the log background threads. */
      std::atomic_bool should_stop_threads;

  /** Event used for pausing the log writer threads. */
  os_event_t writer_threads_resume_event;

  /** Used for resuming write notifier thread */
  atomic_lsn_t write_notifier_resume_lsn;

  /** Used for resuming flush notifier thread */
  atomic_lsn_t flush_notifier_resume_lsn;

  /** Number of total I/O operations performed when we printed
  the statistics last time. */
  mutable uint64_t n_log_ios_old;

  /** Wall time when we printed the statistics last time. */
  mutable time_t last_printout_time;

  /** @} */

  /**************************************************/ /**

   @name Recovery

   *******************************************************/

  /** @{ */

  /** Lsn from which recovery has been started. */
  lsn_t recovered_lsn;

  /** Format of the redo log: e.g., LOG_HEADER_FORMAT_CURRENT. */
  uint32_t format;

  /** Corruption status. */
  log_state_t state;

  /** Used only in recovery: recovery scan succeeded up to this lsn. */
  lsn_t scanned_lsn;

#ifdef UNIV_DEBUG

  /** When this is set, writing to the redo log should be disabled.
  We check for this in functions that write to the redo log. */
  bool disable_redo_writes;

  /** DEBUG only - if we copied or initialized the first block in buffer,
  this is set to lsn for which we did that. We later ensure that we start
  the redo log at the same lsn. Else it is zero and we would crash when
  trying to start redo then. */
  lsn_t first_block_is_correct_for_lsn;

#endif /* UNIV_DEBUG */

  alignas(ut::INNODB_CACHE_LINE_SIZE)

      /** @} */

      /**************************************************/ /**

       @name Fields protected by the log_limits mutex.
             Related to free space in the redo log.

       *******************************************************/

      /** @{ */

      /** Mutex which protects fields: available_for_checkpoint_lsn,
      requested_checkpoint_lsn. It also synchronizes updates of:
      free_check_limit_sn, concurrency_margin and dict_persist_margin.
      It also protects the srv_checkpoint_disabled (together with the
      checkpointer_mutex). */
      mutable ib_mutex_t limits_mutex;

  /** A new checkpoint could be written for this lsn value.
  Up to this lsn value, all dirty pages have been added to flush
  lists and flushed. Updated in the log checkpointer thread by
  taking minimum oldest_modification out of the last dirty pages
  from each flush list. However it will not be bigger than the
  current value of log.buf_dirty_pages_added_up_to_lsn.
  Read by: user threads when requesting fuzzy checkpoint
  Read by: log_print() (printing status of redo)
  Updated by: log_checkpointer
  Protected by: limits_mutex. */
  MY_COMPILER_DIAGNOSTIC_PUSH()
  MY_COMPILER_CLANG_WORKAROUND_REF_DOCBUG()
  /**
  @see @ref subsect_redo_log_available_for_checkpoint_lsn */
  MY_COMPILER_DIAGNOSTIC_POP()
  lsn_t available_for_checkpoint_lsn;

  /** When this is larger than the latest checkpoint, the log checkpointer
  thread will be forced to write a new checkpoint (unless the new latest
  checkpoint lsn would still be smaller than this value).
  Read by: log_checkpointer
  Updated by: user threads (log_free_check() or for sharp checkpoint)
  Protected by: limits_mutex. */
  lsn_t requested_checkpoint_lsn;

  /** Maximum lsn allowed for checkpoint by dict_persist or zero.
  This will be set by dict_persist_to_dd_table_buffer(), which should
  be always called before really making a checkpoint.
  If non-zero, up to this lsn value, dynamic metadata changes have been
  written back to mysql.innodb_dynamic_metadata under dict_persist->mutex
  protection. All dynamic metadata changes after this lsn have to
  be kept in redo logs, but not discarded. If zero, just ignore it.
  Updated by: DD (when persisting dynamic meta data)
  Updated by: log_checkpointer (reset when checkpoint is written)
  Protected by: limits_mutex. */
  lsn_t dict_max_allowed_checkpoint_lsn;

  /** Whether checkpoints should be performed every innodb_log_checkpoint_every ms.
  Disabled during startup / shutdown. Enabled in srv_start_threads.
  Updated by: starting thread (srv_start_threads)
  Read by: log_checkpointer */
  bool periodical_checkpoints_enabled;

  /** Maximum sn up to which there is free space in the redo log.
  Threads check this limit and compare to current log.sn, when they
  are outside mini-transactions and hold no latches. The formula used
  to compute the limitation takes into account maximum size of mtr and
  thread concurrency to include proper margins and avoid issues with
  race condition (in which all threads check the limitation and then
  all proceed with their mini-transactions). Also extra margin is
  there for dd table buffer cache (dict_persist_margin).
  Read by: user threads (log_free_check())
  Updated by: log_checkpointer (after update of checkpoint_lsn)
  Updated by: log_writer (after increasing concurrency_margin)
  Updated by: DD (after update of dict_persist_margin)
  Protected by (updates only): limits_mutex. */
  atomic_sn_t free_check_limit_sn;

  /** Margin used in calculation of @see free_check_limit_sn.
  Read by: page_cleaners, log_checkpointer
  Updated by: log_writer
  Protected by (updates only): limits_mutex. */
  atomic_sn_t concurrency_margin;

  /** Margin used in calculation of @see free_check_limit_sn.
  Read by: page_cleaners, log_checkpointer
  Updated by: DD
  Protected by (updates only): limits_mutex. */
  atomic_sn_t dict_persist_margin;

  alignas(ut::INNODB_CACHE_LINE_SIZE)

      /** @} */

      /**************************************************/ /**

       @name Log checkpointer thread

       *******************************************************/

      /** @{ */

      /** Event used by the log checkpointer thread to wait for requests. */
      os_event_t checkpointer_event;

  /** Mutex which can be used to pause log checkpointer thread.
  This is used by log_position_lock() together with log_buffer_x_lock(),
  to pause any changes to current_lsn or last_checkpoint_lsn. */
  mutable ib_mutex_t checkpointer_mutex;

  /** Latest checkpoint lsn.
  Read by: user threads, log_print (no protection)
  Read by: log_writer (under writer_mutex)
  Updated by: log_checkpointer (under both mutexes)
  Protected by (updates only): checkpointer_mutex + writer_mutex. */
  MY_COMPILER_DIAGNOSTIC_PUSH()
  MY_COMPILER_CLANG_WORKAROUND_REF_DOCBUG()
  /**
  @see @ref subsect_redo_log_last_checkpoint_lsn */
  MY_COMPILER_DIAGNOSTIC_POP()
  atomic_lsn_t last_checkpoint_lsn;

  /** Next checkpoint number.
  Read by: log_get_last_block (no protection)
  Read by: log_writer (under writer_mutex)
  Updated by: log_checkpointer (under both mutexes)
  Protected by: checkpointer_mutex + writer_mutex. */
  std::atomic<checkpoint_no_t> next_checkpoint_no;

  /** Latest checkpoint wall time.
  Used by (private): log_checkpointer. */
  Log_clock_point last_checkpoint_time;

  /** Aligned buffer used for writing a checkpoint header. It is aligned
  similarly to log.buf.
  Used by (private): log_checkpointer, recovery code */
  aligned_array_pointer<byte, OS_FILE_LOG_BLOCK_SIZE> checkpoint_buf;

  /** @} */

  /**************************************************/ /**

   @name Fields considered constant, updated when log system
         is initialized (log_sys_init()) and not assigned to
         particular log thread.

   *******************************************************/

  /** @{ */

  /** Capacity of the log files available for log_free_check(). */
  lsn_t lsn_capacity_for_free_check;

  /** Capacity of log files excluding headers of the log files.
  If the checkpoint age exceeds this, it is a serious error,
  because in such case we have already overwritten redo log. */
  lsn_t lsn_real_capacity;

  /** When the oldest dirty page age exceeds this value, we start
  an asynchronous preflush of dirty pages. */
  lsn_t max_modified_age_async;

  /** When the oldest dirty page age exceeds this value, we start
  a synchronous flush of dirty pages. */
  lsn_t max_modified_age_sync;

  /** When checkpoint age exceeds this value, we write checkpoints
  if lag between oldest_lsn and checkpoint_lsn exceeds max_checkpoint_lag. */
  lsn_t max_checkpoint_age_async;

  /** @} */

  /** true if redo logging is disabled. Read and write with writer_mutex  */
  bool m_disable;

  /** true, if server is not recoverable. Read and write with writer_mutex */
  bool m_crash_unsafe;

  /** start LSN of first redo log file. */
  lsn_t m_first_file_lsn;

#endif /* !UNIV_HOTBACKUP */
};

The log code has been analyzed at length in earlier posts, so it will not be repeated here. In fact, once you locate the source and trace the main threads of control, there is not much left that can trip you up. One detail worth pinning down from the struct above is the relation between sn and lsn; see the sketch below.

六、Summary

The unfriendly thing about reading these foundational data structures is that they are dry, yet they cannot be skipped: without these basics, the applications and analysis that come later will make no sense. Every beginning is hard, and the hard part is not the act of starting itself but the amount of dry, tedious groundwork the start demands, groundwork that nonetheless has to be done because it determines long-term growth. It is like the founding of a dynasty: how is the troika that pulls it assembled? Look at history; every success was backed by a group of outstanding people. Yet few ever ask how those people were gathered together in the first place.
Now you see it: that gathering is the beginning.
