元数据
MDS用于保存cephfs的元数据信息,mds使用动态申请的缓存空间来存储元数据信息,其记录的元数据除了文件在磁盘中的位置,还包括文件名、文件属性、归属目录、子树分割、扩展、配额等等。cephfs里面元数据首先写入日志mdlog,然后再把日志里的元数据刷新到rados层。
测试时可以通过如下命令强制将日志刷新到元数据池
ceph daemon /var/run/ceph/ceph-mds.node1.asok flush journal
目录元数据
每个目录在元数据池中为一个对象(对象名为"inode.偏移值",其中偏移值是在内核里切割文件时给出的),根目录默认有一个对象,inode值为1
目录对象的omap中保存了当前目录下的文件和子目录条目(即它们的元数据,而不是文件内容)
root@node1[/mnt/ceph]# rados listomapkeys 1.00000000 -p meta
file1_head
test1_head
test2_head
test3_head
root@node1[/mnt/ceph]# ls -al
total 6
drwxr-xr-x 5 root root 0 May 17 16:46 .
drwxr-xr-x. 5 root root 42 May 10 14:11 ..
-rw-r--r-- 1 root root 4096 May 15 13:59 file1
drwxr-xr-x 3 root root 1048576 May 15 15:22 test1
drwxr-xr-x 2 root root 0 May 15 13:58 test2
drwxr-xr-x 2 root root 0 May 17 16:46 test3
root@node1[/mnt/ceph]# ls -li
total 6
1099511627778 -rw-r--r-- 1 root root 4096 May 15 13:59 file1
1099511627776 drwxr-xr-x 3 root root 1048576 May 15 15:22 test1
1099511627777 drwxr-xr-x 2 root root 0 May 15 13:58 test2
1099511627781 drwxr-xr-x 2 root root 0 May 17 16:46 test3
# test1的inode是1099511627776,这个是10进制数,转16进制为10000000000,如下可以找到test1目录的对象
root@node1[/mnt/ceph]# rados -p meta ls | grep "10000000000."
10000000000.00000000
文件元数据
文件的元数据保存在它的上级目录中,也就是每个目录的元数据都保存了下级目录和文件的名称,以及inode号(文件没有下级目录,所以没必要单独弄一个对象来保存元数据,只需要保存在上级目录的元数据里面)
# 以test1目录为例
root@node1[/mnt/ceph]# ls -al test1
total 1026
drwxr-xr-x 3 root root 1048576 May 15 15:22 .
drwxr-xr-x 5 root root 0 May 17 16:46 ..
-rw-r--r-- 1 root root 1048576 May 16 17:22 file1
drwxr-xr-x 2 root root 0 May 15 15:22 test11
root@node1[/mnt/ceph]# rados listomapvals 10000000000.00000000 -p meta
file1_2
value (462 bytes) :
00000000 02 00 00 00 00 00 00 00 49 0f 06 a3 01 00 00 03 |........I.......|
00000010 00 00 00 00 01 00 00 00 00 00 00 ea 61 44 66 8c |............aDf.|
00000020 d3 ec 3a a4 81 00 00 00 00 00 00 00 00 00 00 01 |..:.............|
00000030 00 00 00 00 00 00 00 00 00 00 00 00 02 02 18 00 |................|
00000040 00 00 00 00 40 00 01 00 00 00 00 00 40 00 0a 00 |....@.......@...|
00000050 00 00 00 00 00 00 00 00 00 00 00 a0 00 00 00 00 |................|
00000060 00 00 01 00 00 00 ff ff ff ff ff ff ff ff 00 00 |................|
00000070 00 00 00 00 00 00 00 00 00 00 ea 61 44 66 8c d3 |...........aDf..|
00000080 ec 3a ea 61 44 66 34 45 c8 3a 00 00 00 00 00 00 |.:.aDf4E.:......|
00000090 00 00 03 02 28 00 00 00 00 00 00 00 00 00 00 00 |....(...........|
000000a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
000000c0 03 02 38 00 00 00 00 00 00 00 00 00 00 00 00 a0 |..8.............|
000000d0 00 00 00 00 00 00 01 00 00 00 00 00 00 00 00 00 |................|
000000e0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
000000f0 00 00 00 00 00 00 ea 61 44 66 8c d3 ec 3a 03 02 |.......aDf...:..|
00000100 38 00 00 00 00 00 00 00 00 00 00 00 00 a0 00 00 |8...............|
00000110 00 00 00 00 01 00 00 00 00 00 00 00 00 00 00 00 |................|
00000120 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
00000130 00 00 00 00 ea 61 44 66 8c d3 ec 3a 0a 00 00 00 |.....aDf...:....|
00000140 00 00 00 00 00 00 00 00 00 00 00 00 01 00 00 00 |................|
00000150 00 00 00 00 02 00 00 00 00 00 00 00 00 00 00 00 |................|
00000160 00 00 00 00 00 00 00 00 ff ff ff ff ff ff ff ff |................|
00000170 00 00 00 00 01 01 10 00 00 00 00 00 00 00 00 00 |................|
00000180 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
00000190 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ea 61 |...............a|
000001a0 44 66 34 45 c8 3a 0a 00 00 00 00 00 00 00 ff ff |Df4E.:..........|
000001b0 ff ff 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
000001c0 00 00 fe ff ff ff ff ff ff ff 00 00 00 00 |..............|
000001ce
file1_head
value (462 bytes) :
00000000 03 00 00 00 00 00 00 00 49 0f 06 a3 01 00 00 03 |........I.......|
00000010 00 00 00 00 01 00 00 00 00 00 00 45 d0 45 66 f4 |...........E.Ef.|
00000020 ef e3 23 a4 81 00 00 00 00 00 00 00 00 00 00 01 |..#.............|
00000030 00 00 00 00 00 00 00 00 00 00 00 00 02 02 18 00 |................|
00000040 00 00 00 00 40 00 01 00 00 00 00 00 40 00 0a 00 |....@.......@...|
00000050 00 00 00 00 00 00 00 00 00 00 00 00 10 00 00 00 |................|
00000060 00 00 02 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
00000070 00 00 00 00 00 00 00 00 00 00 45 d0 45 66 f4 ef |..........E.Ef..|
00000080 e3 23 ea 61 44 66 34 45 c8 3a 00 00 00 00 00 00 |.#.aDf4E.:......|
00000090 00 00 03 02 28 00 00 00 00 00 00 00 00 00 00 00 |....(...........|
000000a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
000000c0 03 02 38 00 00 00 00 00 00 00 00 00 00 00 00 00 |..8.............|
000000d0 10 00 00 00 00 00 01 00 00 00 00 00 00 00 00 00 |................|
000000e0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
000000f0 00 00 00 00 00 00 45 d0 45 66 f4 ef e3 23 03 02 |......E.Ef...#..|
00000100 38 00 00 00 00 00 00 00 00 00 00 00 00 00 10 00 |8...............|
00000110 00 00 00 00 01 00 00 00 00 00 00 00 00 00 00 00 |................|
00000120 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
00000130 00 00 00 00 45 d0 45 66 f4 ef e3 23 0d 00 00 00 |....E.Ef...#....|
00000140 00 00 00 00 00 00 00 00 00 00 00 00 01 00 00 00 |................|
00000150 00 00 00 00 02 00 00 00 00 00 00 00 00 00 00 00 |................|
00000160 00 a0 00 00 00 00 00 00 ff ff ff ff ff ff ff ff |................|
00000170 00 00 00 00 01 01 10 00 00 00 00 00 00 00 00 00 |................|
00000180 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
00000190 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ea 61 |...............a|
000001a0 44 66 34 45 c8 3a 13 00 00 00 00 00 00 00 ff ff |Df4E.:..........|
000001b0 ff ff 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
000001c0 00 00 02 00 00 00 00 00 00 00 00 00 00 00 |..............|
000001ce
test11_head
value (462 bytes) :
00000000 02 00 00 00 00 00 00 00 49 0f 06 a3 01 00 00 04 |........I.......|
00000010 00 00 00 00 01 00 00 00 00 00 00 9c 62 44 66 25 |............bDf%|
00000020 19 b4 05 ed 41 00 00 00 00 00 00 00 00 00 00 01 |....A...........|
00000030 00 00 00 00 02 00 00 00 00 00 00 00 02 02 18 00 |................|
00000040 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ff ff |................|
00000050 ff ff ff ff ff ff 00 00 00 00 00 00 00 00 00 00 |................|
00000060 00 00 01 00 00 00 ff ff ff ff ff ff ff ff 00 00 |................|
00000070 00 00 00 00 00 00 00 00 00 00 9c 62 44 66 25 19 |...........bDf%.|
00000080 b4 05 9c 62 44 66 25 19 b4 05 00 00 00 00 00 00 |...bDf%.........|
00000090 00 00 03 02 28 00 00 00 00 00 00 00 00 00 00 00 |....(...........|
000000a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
*
000000c0 03 02 38 00 00 00 00 00 00 00 00 00 00 00 00 00 |..8.............|
000000d0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 01 00 |................|
000000e0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
000000f0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 03 02 |................|
00000100 38 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |8...............|
00000110 00 00 00 00 00 00 00 00 00 00 00 00 01 00 00 00 |................|
00000120 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
00000130 00 00 00 00 00 00 00 00 00 00 00 00 06 00 00 00 |................|
00000140 00 00 00 00 00 00 00 00 00 00 00 00 01 00 00 00 |................|
00000150 00 00 00 00 06 00 00 00 00 00 00 00 00 00 00 00 |................|
00000160 00 00 00 00 00 00 00 00 ff ff ff ff ff ff ff ff |................|
00000170 00 00 00 00 01 01 10 00 00 00 00 00 00 00 00 00 |................|
00000180 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
00000190 00 00 00 00 00 00 00 00 00 00 00 00 00 00 9c 62 |...............b|
000001a0 44 66 25 19 b4 05 00 00 00 00 00 00 00 00 ff ff |Df%.............|
000001b0 ff ff 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
000001c0 00 00 fe ff ff ff ff ff ff ff 00 00 00 00 |..............|
000001ce
如上可以看出,test1下的test11目录inode是10000000004,file1的inode为10000000003(Ceph编码时整数按小端字节序存放,十六进制转储中的字节需要倒着读,例如 file1_head 中的 03 00 00 00 00 01 00 00 即 0x10000000003)
业务数据
文件中的数据默认会被切分为4M大小的对象,保存在数据池中
# 写入两个不同大小文件
root@node1[/mnt/ceph]# dd if=/dev/zero of=1M_file bs=1M count=1
1+0 records in
1+0 records out
1048576 bytes (1.0 MB, 1.0 MiB) copied, 0.00314067 s, 334 MB/s
root@node1[/mnt/ceph]# dd if=/dev/zero of=21M_file bs=1M count=21
21+0 records in
21+0 records out
22020096 bytes (22 MB, 21 MiB) copied, 0.0436599 s, 504 MB/s
# 查看inode,并转16进制,为10000000006 和 10000000007
root@node1[/mnt/ceph]# ls -li 1M_file
1099511627782 -rw-r--r-- 1 root root 1048576 May 17 17:48 1M_file
root@node1[/mnt/ceph]# ls -li 21M_file
1099511627783 -rw-r--r-- 1 root root 22020096 May 17 17:49 21M_file
# 查看对象,对象的名称为"文件的inode号(16进制).偏移值",其中偏移值即对象序号(16进制,从00000000开始)
root@node1[/mnt/ceph]# rados -p data ls | grep "10000000006."
10000000006.00000000
root@node1[/mnt/ceph]# rados -p data ls | grep "10000000007."
10000000007.00000004
10000000007.00000001
10000000007.00000005
10000000007.00000003
10000000007.00000002
10000000007.00000000