Ext4文件系统解析(二)

1、前言

想要了解EXT文件系统的工作原理,那了解文件系统在磁盘上的分布就是必不可少的。这一节主要介绍EXT文件系统硬盘存储的物理结构。
由于当前主流的CPU架构均采用小端模式,因此下文介绍均以小端模式为准。

2、超级块

2.1 属性

下表列举出超级块中相对重要的属性。

| 属性名 | 含义 |
| --- | --- |
| s_log_block_size | 块大小,计算公式 = 2 ^ (10 + s_log_block_size) |
| s_blocks_per_group | 每个块组中块的个数 |
| s_inodes_per_group | 每个块组中索引的个数 |
| s_magic | 魔数(0xEF53) |
| s_inode_size | 索引大小,单位:byte |
| s_feature_compat | 兼容特性 |
| s_feature_incompat | 不兼容特性 |
| s_feature_ro_compat | 只读兼容特性 |
| s_backup_bgs | 包含超级块备份的块组号。 |
| s_desc_size | 块组描述符大小 |

2.2 特性

一些默认开启或者常用的文件系统特性。
在这里插入图片描述

| 属性名 | 含义 |
| --- | --- |
| COMPAT_HAS_JOURNAL | 开启日志。 |
| COMPAT_EXT_ATTR | 支持扩展属性。 |
| COMPAT_RESIZE_INODE | 保留块组描述符。需要开启RO_COMPAT_SPARSE_SUPER特性。 |
| COMPAT_SPARSE_SUPER2 | 稀疏超级块V2。开启本特性后,仅s_backup_bgs 属性指向的2个块组备份超级块。 |
| INCOMPAT_FILETYPE | app_ext4_dir_entry结构中包含文件类型。 |
| INCOMPAT_META_BG | 开启元块组属性。与COMPAT_RESIZE_INODE特性互斥。 |
| INCOMPAT_64BIT | 支持超过2^32个块。 |
| INCOMPAT_FLEX_BG | 开启弹性块组。 |
| INCOMPAT_INLINE_DATA | 支持内联文件和目录。 |
| RO_COMPAT_SPARSE_SUPER | 稀疏超级块。 |

2.3 参考代码
/*
 * On-disk ext4 superblock; members are listed in on-disk order.
 * NOTE(review): the ubN/bN typedefs are declared elsewhere. Assuming they are
 * exact-width N-bit integers, the members add up to 1024 bytes with no
 * implicit padding -- confirm against the project's type header.
 */
typedef struct {
  ub32 s_inodes_count;       /* Inodes count */
  ub32 s_blocks_count;       /* Blocks count */
  ub32 s_r_blocks_count;     /* Reserved blocks count */
  ub32 s_free_blocks_count;  /* Free blocks count */
  ub32 s_free_inodes_count;  /* Free inodes count */
  ub32 s_first_data_block;   /* First Data Block */
  ub32 s_log_block_size;     /* Block size = 2 ^ (10 + s_log_block_size) */
  ub32 s_log_cluster_size;   /* Allocation cluster size */
  ub32 s_blocks_per_group;   /* # Blocks per group */
  ub32 s_clusters_per_group; /* # Clusters per group */
  ub32 s_inodes_per_group;   /* # Inodes per group */
  ub32 s_mtime;              /* Mount time */
  ub32 s_wtime;              /* Write time */
  ub16 s_mnt_count;          /* Mount count */
  ub16 s_max_mnt_count;      /* Maximal mount count */
  ub16 s_magic;              /* Magic signature (0xEF53) */
  ub16 s_state;              /* File system state */
  ub16 s_errors;             /* Behaviour when detecting errors */
  ub16 s_minor_rev_level;    /* minor revision level */
  ub32 s_lastcheck;          /* time of last check */
  ub32 s_checkinterval;      /* max. time between checks */
  ub32 s_creator_os;         /* OS */
  ub32 s_rev_level;          /* Revision level */
  ub16 s_def_resuid;         /* Default uid for reserved blocks */
  ub16 s_def_resgid;         /* Default gid for reserved blocks */
  /*
   * These fields are for EXT2_DYNAMIC_REV superblocks only.
   *
   * Note: the difference between the compatible feature set and
   * the incompatible feature set is that if there is a bit set
   * in the incompatible feature set that the kernel doesn't
   * know about, it should refuse to mount the filesystem.
   *
   * e2fsck's requirements are more strict; if it doesn't know
   * about a feature in either the compatible or incompatible
   * feature set, it must abort and not try to meddle with
   * things it doesn't understand...
   */
  ub32 s_first_ino;              /* First non-reserved inode */
  ub16 s_inode_size;             /* size of inode structure, in bytes */
  ub16 s_block_group_nr;         /* block group # of this superblock */
  ub32 s_feature_compat;         /* compatible feature set */
  ub32 s_feature_incompat;       /* incompatible feature set */
  ub32 s_feature_ro_compat;      /* readonly-compatible feature set */
  ub8 s_uuid[16];                /* 128-bit uuid for volume */
  b8 s_volume_name[16];          /* volume name */
  b8 s_last_mounted[64];         /* directory where last mounted */
  ub32 s_algorithm_usage_bitmap; /* For compression */
  /*
   * Performance hints.  Directory preallocation should only
   * happen if the EXT2_FEATURE_COMPAT_DIR_PREALLOC flag is on.
   */
  ub8 s_prealloc_blocks;      /* Nr of blocks to try to preallocate*/
  ub8 s_prealloc_dir_blocks;  /* Nr to preallocate for dirs */
  ub16 s_reserved_gdt_blocks; /* Per group table for online growth */
  /*
   * Journaling support valid if EXT2_FEATURE_COMPAT_HAS_JOURNAL set.
   */
  ub8 s_journal_uuid[16]; /* uuid of journal superblock */
  ub32 s_journal_inum;    /* inode number of journal file */
  ub32 s_journal_dev;     /* device number of journal file */
  ub32 s_last_orphan;     /* start of list of inodes to delete */
  ub32 s_hash_seed[4];    /* HTREE hash seed */
  ub8 s_def_hash_version; /* Default hash version to use */
  ub8 s_jnl_backup_type;  /* Default type of journal backup */
  ub16 s_desc_size;       /* Group desc. size: INCOMPAT_64BIT */
  ub32 s_default_mount_opts;
  ub32 s_first_meta_bg;       /* First metablock group */
  ub32 s_mkfs_time;           /* When the filesystem was created */
  ub32 s_jnl_blocks[17];      /* Backup of the journal inode */
  ub32 s_blocks_count_hi;     /* Blocks count high 32 bits */
  ub32 s_r_blocks_count_hi;   /* Reserved blocks count high 32 bits */
  ub32 s_free_blocks_hi;      /* Free blocks count high 32 bits */
  ub16 s_min_extra_isize;     /* All inodes have at least # bytes */
  ub16 s_want_extra_isize;    /* New inodes should reserve # bytes */
  ub32 s_flags;               /* Miscellaneous flags */
  ub16 s_raid_stride;         /* RAID stride */
  ub16 s_mmp_update_interval; /* # seconds to wait in MMP checking */
  ub64 s_mmp_block;           /* Block for multi-mount protection */
  ub32 s_raid_stripe_width;   /* blocks on all data disks (N*stride)*/
  ub8 s_log_groups_per_flex;  /* FLEX_BG group size */
  ub8 s_reserved_char_pad;
  ub16 s_reserved_pad;            /* Padding to next 32bits */
  ub64 s_kbytes_written;          /* nr of lifetime kilobytes written */
  ub32 s_snapshot_inum;           /* Inode number of active snapshot */
  ub32 s_snapshot_id;             /* sequential ID of active snapshot */
  ub64 s_snapshot_r_blocks_count; /* reserved blocks for active
                                     snapshot's future use */
  ub32 s_snapshot_list;     /* inode number of the head of the on-disk
                               snapshot list */
  ub32 s_error_count;       /* number of fs errors */
  ub32 s_first_error_time;  /* first time an error happened */
  ub32 s_first_error_ino;   /* inode involved in first error */
  ub64 s_first_error_block; /* block involved in first error */
  ub8 s_first_error_func[32]; /* function where the error happened */
  ub32 s_first_error_line;    /* line number where error happened */
  ub32 s_last_error_time;     /* most recent time of an error */
  ub32 s_last_error_ino;      /* inode involved in last error */
  ub32 s_last_error_line;     /* line number where error happened */
  ub64 s_last_error_block;    /* block involved in last error */
  ub8 s_last_error_func[32];  /* function where the error happened */
  ub8 s_mount_opts[64];
  ub32 s_usr_quota_inum;  /* inode number of user quota file */
  ub32 s_grp_quota_inum;  /* inode number of group quota file */
  ub32 s_overhead_blocks; /* overhead blocks/clusters in fs */
  ub32 s_backup_bgs[2];   /* Block groups holding superblock backups,
                             if sparse_super2 enabled */
  ub32 s_reserved[106];   /* Padding to the end of the block */
  ub32 s_checksum;        /* crc32c(superblock) */
} app_ext4_super_block;

3、组描述符

3.1 属性

下表列举出组描述符的关键属性。

| 属性名 | 含义 |
| --- | --- |
| bg_inode_table | 索引表的物理偏移(块号)的低32位。 |
| bg_inode_table_hi | 索引表的物理偏移(块号)的高32位。 |

3.2 索引表计算

已知目标文件的Inode号 inode_no = 357,每个块组的Inode数 inode_count_ = 8192,块大小 block_size_ = 4096,组描述符大小 gdt_size_ = 32,Inode大小 inode_size_ = 256,该如何找到文件对应的组描述符和Inode呢?

首先,计算出文件所在的块组,bg_no = (inode_no - 1) / inode_count_ = 356 / 8192 = 0, 即文件属于第一个块组。

接着,计算文件所在的组描述符的位置。每个块可容纳的组描述符个数 gdt_count_ = block_size_ / gdt_size_ = 4096 / 32 = 128,因此 gdt_block_no = bg_no / gdt_count_ = 0 / 128 = 0,即文件所在的组描述符位于组描述符表的第一个块中。

然后,计算文件所在的组描述符在块中的位置,gdt_index = bg_no % gdt_count_ = 0,块中的第一个组描述符即文件所在的组描述符。

其次,计算文件在所在块组中的索引,inode_partition = (inode_no - 1) % inode_count_ = 356 % 8192 = 356,即文件是块组中下标为356(从0开始计数,也就是第357个)的inode节点。

再次,计算文件在索引表中的位置。每个块可容纳的索引个数 it_inode_count = block_size_ / inode_size_ = 4096 / 256 = 16,因此 inode_block_no = inode_partition / it_inode_count = 356 / 16 = 22,即文件的索引位于索引表中下标为22(从0开始计数)的块中。

最后,从组描述符的bg_inode_table和bg_inode_table_hi获取inode_table_no,计算出该索引所在块的偏移位置file_offset = (inode_table_no + inode_block_no) * 4096。索引在该块内的偏移为 (inode_partition % it_inode_count) * inode_size_ = (356 % 16) * 256 = 1024。

默认情况下,所有的组描述符在第一个块组中都存在备份,因此从第一个块组中读取对应的组描述符即可。

// Locate the on-disk inode for inode_no: read the group-descriptor (GDT) block
// that describes its block group, then read the inode-table block holding it.
// Any seek/read failure jumps to IOErr (label is outside this fragment).
// NOTE(review): the new[] buffers are not released in this fragment; confirm
// that IOErr and the normal exit path free gdt_record_ / inode_record_.

// Inode 0 is defined but never exists, so real inode numbers start at 1.
// Block group that owns inode_no.
b32 bg_no = (inode_no - 1) / volume_->inode_count_;
// Index of the GDT block holding that group's descriptor
// (gdt_count_ is the number of descriptors per block).
b32 gdt_block_no = bg_no / volume_->gdt_count_;
// Index of the descriptor inside that GDT block.
b32 gdt_index = bg_no % volume_->gdt_count_;
// Zero-based index of the inode inside its block group.
b32 inode_partition = (inode_no - 1) % volume_->inode_count_;
// Number of inodes stored in one inode-table (IT) block.
b32 it_inode_count = volume_->block_size_ / volume_->inode_size_;
// Index of the IT block, within the group, that holds the inode.
b32 inode_block_no = inode_partition / it_inode_count;

// Move the file pointer to the GDT block.
b64 file_offset = 0;
if (volume_->meta_group_) {
  // presumably GetGDTOffset() takes a block-group number; the argument is
  // the first group covered by this GDT block -- verify against its definition
  file_offset = GetGDTOffset(gdt_block_no * (b64)volume_->gdt_count_);
} else {
  // use gdt in first bg
  file_offset = GetGDTOffset(0) + gdt_block_no * (b64)volume_->block_size_;
}
if (lseek64(volume_->fd_, file_offset, SEEK_SET) != file_offset) goto IOErr;

gdt_record_ = (app_ext4_group_desc *)new char[volume_->block_size_];
if (volume_->block_size_ !=
    read(volume_->fd_, gdt_record_, volume_->block_size_))
  goto IOErr;

// Byte offset of the IT block which the inode lives in.
if (!volume_->extend64_) {
  // Widen to 64 bits before adding so the block number cannot wrap.
  b64 inode_table_no = gdt_record_[gdt_index].bg_inode_table;
  file_offset = (inode_table_no + inode_block_no) * (b64)volume_->block_size_;
} else {
  // Descriptors are gdt_size_ bytes apart here, so step by bytes instead of
  // indexing with the smaller app_ext4_group_desc stride. gdt_record_ is used
  // as a plain pointer everywhere else in this fragment, so no .get() here.
  app_ext4_group_desc64 *gdt_record =
      (app_ext4_group_desc64 *)((char *)gdt_record_ +
                                volume_->gdt_size_ * gdt_index);
  b64 inode_table_no =
      gdt_record->bg_inode_table | ((b64)gdt_record->bg_inode_table_hi << 32);
  file_offset = (inode_table_no + inode_block_no) * (b64)volume_->block_size_;
}
if (lseek64(volume_->fd_, file_offset, SEEK_SET) != file_offset) goto IOErr;

// Read the whole IT block; the inode itself sits at index
// (inode_partition % it_inode_count) within it.
inode_record_ = (app_ext4_inode *)new char[volume_->block_size_];
if (volume_->block_size_ !=
    read(volume_->fd_, inode_record_, volume_->block_size_))
  goto IOErr;
3.3 参考代码
/*
 * On-disk block group descriptor, short form (used when the 64-bit layout is
 * not enabled). NOTE(review): assuming ubN is an exact-width N-bit integer,
 * the members add up to 32 bytes -- confirm against the project's type header.
 */
typedef struct {
  ub32 bg_block_bitmap;         /* Blocks bitmap block */
  ub32 bg_inode_bitmap;         /* Inodes bitmap block */
  ub32 bg_inode_table;          /* Inodes table block (block number) */
  ub16 bg_free_blocks_count;    /* Free blocks count */
  ub16 bg_free_inodes_count;    /* Free inodes count */
  ub16 bg_used_dirs_count;      /* Directories count */
  ub16 bg_flags;                /* EXT4_BG_flags (INODE_UNINIT, etc) */
  ub32 bg_exclude_bitmap_lo;    /* Exclude bitmap for snapshots */
  ub16 bg_block_bitmap_csum_lo; /* crc32c(s_uuid+grp_num+bitmap) LSB */
  ub16 bg_inode_bitmap_csum_lo; /* crc32c(s_uuid+grp_num+bitmap) LSB */
  ub16 bg_itable_unused;        /* Unused inodes count */
  ub16 bg_checksum;             /* crc16(sb_uuid+group+desc) */
} app_ext4_group_desc;

/*
 * On-disk block group descriptor, long form: the short-form members followed
 * by the high halves of the same values. NOTE(review): assuming ubN is an
 * exact-width N-bit integer, the members add up to 64 bytes -- confirm.
 */
typedef struct {
  ub32 bg_block_bitmap;         /* Blocks bitmap block */
  ub32 bg_inode_bitmap;         /* Inodes bitmap block */
  ub32 bg_inode_table;          /* Inodes table block, low 32 bits */
  ub16 bg_free_blocks_count;    /* Free blocks count */
  ub16 bg_free_inodes_count;    /* Free inodes count */
  ub16 bg_used_dirs_count;      /* Directories count */
  ub16 bg_flags;                /* EXT4_BG_flags (INODE_UNINIT, etc) */
  ub32 bg_exclude_bitmap_lo;    /* Exclude bitmap for snapshots */
  ub16 bg_block_bitmap_csum_lo; /* crc32c(s_uuid+grp_num+bitmap) LSB */
  ub16 bg_inode_bitmap_csum_lo; /* crc32c(s_uuid+grp_num+bitmap) LSB */
  ub16 bg_itable_unused;        /* Unused inodes count */
  ub16 bg_checksum;             /* crc16(sb_uuid+group+desc) */
  ub32 bg_block_bitmap_hi;      /* Blocks bitmap block MSB */
  ub32 bg_inode_bitmap_hi;      /* Inodes bitmap block MSB */
  ub32 bg_inode_table_hi;       /* Inodes table block MSB (high 32 bits) */
  ub16 bg_free_blocks_count_hi; /* Free blocks count MSB */
  ub16 bg_free_inodes_count_hi; /* Free inodes count MSB */
  ub16 bg_used_dirs_count_hi;   /* Directories count MSB */
  ub16 bg_itable_unused_hi;     /* Unused inodes count MSB */
  ub32 bg_exclude_bitmap_hi;    /* Exclude bitmap block MSB */
  ub16 bg_block_bitmap_csum_hi; /* crc32c(s_uuid+grp_num+bitmap) MSB */
  ub16 bg_inode_bitmap_csum_hi; /* crc32c(s_uuid+grp_num+bitmap) MSB */
  ub32 bg_reserved;
} app_ext4_group_desc64;

4、索引节点

4.1 属性

下表列举出Inode中相对重要的属性。

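| 属性名 | 含义 |
| --- | --- |
| i_mode | 文件属性和文件类型。 |
| i_size_lo | 文件大小低32位(参考代码中字段名为 i_size)。 |
| i_links_count | 硬链接数量。 |
| i_flags | 标志位。 |
| i_block | 块图或者扩展树,存储文件内容或者目录索引。 |
| i_size_high | 文件大小高32位。 |
| i_extra_isize | 索引节点扩展字段的大小(从 i_extra_isize 字段起算,其后紧跟索引内的扩展属性区)。 |
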
4.2 文件标识
| 值 | 含义 |
| --- | --- |
| 0x1000 | S_IFIFO (FIFO) |
| 0x2000 | S_IFCHR (Character device) |
| 0x4000 | S_IFDIR (Directory) |
| 0x6000 | S_IFBLK (Block device) |
| 0x8000 | S_IFREG (Regular file) |
| 0xA000 | S_IFLNK (Symbolic link) |
| 0xC000 | S_IFSOCK (Socket) |

4.3 文件内容

通常情况下,i_block中用于存储文件所有块的索引信息。某些特殊场景下,会用于其它情况。

  • 软链接(Symbolic Links)
    当链接的目标路径长度小于60时, 会将目标路径存储在i_block中。

  • 内联数据(Inline Data)
    当文件系统开启Inline Data特性,且数据长度小于156(目前)时,用于存储内容的前60个字节。

  • 直接/间接块索引(Direct/Indirect Block Addressing)
    i_block[0:11]:存储数据内容的块号。
    i_block[12] :指向间接数据块(存储数据块号的数据块)。
    i_block[13]:指向双重间接数据块(存储间接数据块的数据块)。
    i_block[14]:指向三重间接数据块(存储双重间接数据块的数据块)。
    在这里插入图片描述

  • 扩展树索引(Extent Tree)
    通过树的形式管理文件或者文件夹的数据块。扩展树的详细介绍请参考最后一节。

4.4 参考代码
/* Number of 32-bit slots in i_block: 12 direct + 1 indirect + 1 double
 * indirect + 1 triple indirect (15 * 4 = 60 bytes). */
#define EXT4_N_BLOCKS 15
/*
 * On-disk ext4 inode. NOTE(review): assuming ubN is an exact-width N-bit
 * integer, the members up to and including osd2 take 128 bytes, so
 * i_extra_isize sits at byte offset 128 -- confirm against the type header.
 */
typedef struct {
  ub16 i_mode;        /* File mode: file type and permission bits */
  ub16 i_uid;         /* Low 16 bits of Owner Uid */
  ub32 i_size;        /* Size in bytes, low 32 bits (see i_size_high) */
  ub32 i_atime;       /* Access time */
  ub32 i_ctime;       /* Inode Change time */
  ub32 i_mtime;       /* Modification time */
  ub32 i_dtime;       /* Deletion Time */
  ub16 i_gid;         /* Low 16 bits of Group Id */
  ub16 i_links_count; /* Links count */
  ub32 i_blocks;      /* Blocks count */
  ub32 i_flags;       /* File flags */
  union {
    struct {
      ub32 l_i_version; /* was l_i_reserved1 */
    } linux1;
    struct {
      ub32 h_i_translator;
    } hurd1;
  } osd1;                      /* OS dependent 1 */
  ub32 i_block[EXT4_N_BLOCKS]; /* Block map or extent tree; may instead hold
                                  a short symlink target or the first 60
                                  bytes of inline data */
  ub32 i_generation;           /* File version (for NFS) */
  ub32 i_file_acl;             /* File ACL */
  ub32 i_size_high;            /* High 32 bits of file size (formerly
                                  i_dir_acl, directory ACL) */
  ub32 i_faddr;                /* Fragment address */
  union {
    struct {
      ub16 l_i_blocks_hi;
      ub16 l_i_file_acl_high;
      ub16 l_i_uid_high;    /* these 2 fields    */
      ub16 l_i_gid_high;    /* were reserved2[0] */
      ub16 l_i_checksum_lo; /* crc32c(uuid+inum+inode) */
      ub16 l_i_reserved;
    } linux2;
    struct {
      ub8 h_i_frag;  /* Fragment number */
      ub8 h_i_fsize; /* Fragment size */
      ub16 h_i_mode_high;
      ub16 h_i_uid_high;
      ub16 h_i_gid_high;
      ub32 h_i_author;
    } hurd2;
  } osd2; /* OS dependent 2 */
  ub16 i_extra_isize;  /* Size of the extra fields counted from this member;
                          the in-inode xattr header follows them */
  ub16 i_checksum_hi;  /* crc32c(uuid+inum+inode) */
  ub32 i_ctime_extra;  /* extra Change time (nsec << 2 | epoch) */
  ub32 i_mtime_extra;  /* extra Modification time (nsec << 2 | epoch) */
  ub32 i_atime_extra;  /* extra Access time (nsec << 2 | epoch) */
  ub32 i_crtime;       /* File creation time */
  ub32 i_crtime_extra; /* extra File creation time (nsec << 2 | epoch)*/
  ub32 i_version_hi;   /* high 32 bits for 64-bit version */
} app_ext4_inode;

5、扩展属性

扩展属性通常用于存储文件的ACLs访问权限和一些其他的安全属性,例如selinux等。因此通常情况下,使用文件系统时并不需要关注文件的扩展属性。
但有一种例外情况,那就是开启了内联数据特性后,文件的一部分数据内容会存储到扩展属性中。
我们可以在2个地方找到文件的扩展属性,其一,2个索引信息的中间;其二,i_file_acl指向的块。而内联数据则存在于第一个地方。
扩展属性块以app_ext4_attr_header结构开始;但当扩展属性存放在索引信息之后时,头部只存在第一个字段h_magic(4字节),其值为0xEA020000。
实际的扩展属性用app_ext4_attr_entry管理,当e_name_index = 7且e_name = data时,则代表内联数据。

/*
 * Header at the start of an extended-attribute block. When attributes are
 * stored inside the inode, only the leading h_magic word is present.
 */
typedef struct {
  ub32 h_magic;       /* magic number for identification (0xEA020000) */
  ub32 h_refcount;    /* reference count */
  ub32 h_blocks;      /* number of disk blocks used */
  ub32 h_hash;        /* hash value of all attributes */
  ub32 h_reserved[4]; /* zero right now */
} app_ext4_attr_header;

/*
 * One extended-attribute entry. The attribute name (e_name_len bytes) is
 * stored immediately after this fixed part.
 */
typedef struct {
  ub8 e_name_len;     /* length of name */
  ub8 e_name_index;   /* attribute name index (7 with name "data" marks
                         inline data) */
  ub16 e_value_offs;  /* offset in disk block of value; for in-inode
                         attributes, relative to the first entry */
  ub32 e_value_block; /* disk block attribute is stored on (n/i) */
  ub32 e_value_size;  /* size of attribute value */
  ub32 e_hash;        /* hash value of name and value */
} app_ext4_attr_entry;

// 获取扩展内联数据
app_ext4_attr_header *attr_header =
      (app_ext4_attr_header *)((b8 *)&inode_info_->i_extra_isize +
                               inode_info_->i_extra_isize);
  if (attr_header->h_magic != kExtAttrMagic) return false;

  // Extended attributes, when stored after the inode,
  // have a header ext4_xattr_ibody_header that is 4 bytes long
  app_ext4_attr_entry *attr_data =
      (app_ext4_attr_entry *)((b8 *)attr_header + sizeof(attr_header->h_magic));
  while (attr_data->e_name_index != kExtAttrDataIdx ||
         attr_data->e_name_len != sizeof(kExtAttrDataName)) {
    attr_data =
        (app_ext4_attr_entry *)((b8 *)attr_data + sizeof(app_ext4_attr_entry) +
                                (attr_data->e_name_len + 3) / 4 * 4);
  }

  // For an inode attribute e_value_offs is relative to the first entry
  if (*(b32 *)((b8 *)attr_data + sizeof(app_ext4_attr_entry)) ==
      kExtAttrDataName) {
    memcpy(inline_data_,
           (b8 *)attr_header + sizeof(attr_header->h_magic) +
               attr_data->e_value_offs,
           attr_data->e_value_size);
  }

6、扩展树

由于直接/间接块索引的种种缺陷,在EXT4中推出了扩展树取而代之。扩展树,顾名思义,通过树的形式管理数据块。
其中每个节点以app_ext4_extent_header开始,非叶子节点时,后接app_ext4_extent_idx结构;叶子节点则紧跟app_ext4_extent结构。
app_ext4_extent_header用于存储当前节点的信息。

| 变量 | 含义 |
| --- | --- |
| eh_magic | 魔数,0xF30A。 |
| eh_entries | 当前节点存储的数据个数。 |
| eh_depth | 当前节点的深度,0则代表当前是叶子节点。 |

app_ext4_extent存储实际的数据块信息。

| 变量 | 含义 |
| --- | --- |
| ee_block | 起始的逻辑块地址。 |
| ee_len | 当前extent管理的实际物理块个数。ee_len = ee_len > 32768 ? ee_len - 32768 : ee_len |
| ee_start_hi / ee_start | 将 ee_start_hi 左移32位后与 ee_start 按位或,即可得出起始的物理块地址。 |

app_ext4_extent_idx存储下一层节点的信息。

| 变量 | 含义 |
| --- | --- |
| ei_block | 起始的逻辑块地址。 |
| ei_leaf / ei_leaf_hi | 将 ei_leaf_hi 左移32位后与 ei_leaf 按位或,即可得出下一层节点的物理块地址。 |

/* Header found at the start of every extent-tree node. */
typedef struct {
  ub16 eh_magic;      /* magic number, 0xF30A (probably will support
                         different formats) */
  ub16 eh_entries;    /* number of valid entries */
  ub16 eh_max;        /* capacity of store in entries */
  ub16 eh_depth;      /* depth of this node; 0 means it is a leaf */
  ub32 eh_generation; /* generation of the tree */
} app_ext4_extent_header;

/* Leaf entry of the extent tree: maps a run of logical blocks to physical
 * blocks. */
typedef struct {
  ub32 ee_block;    /* first logical block extent covers */
  ub16 ee_len;      /* number of blocks covered by extent; values above
                       32768 encode (length + 32768) */
  ub16 ee_start_hi; /* high 16 bits of physical block */
  ub32 ee_start;    /* low 32 bits of physical block */
} app_ext4_extent;

/* Index entry of the extent tree: points at the node one level down. */
typedef struct {
  ub32 ei_block;   /* index covers logical blocks from 'block' */
  ub32 ei_leaf;    /* pointer to the physical block of the next *
                    * level. leaf or next index could be here;  *
                    * this is the low 32 bits                   */
  ub16 ei_leaf_hi; /* high 16 bits of physical block */
  ub16 ei_unused;
} app_ext4_extent_idx;
  • 26
    点赞
  • 26
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值