A Brief Analysis of the dm-thin-provision Architecture and Implementation

Preface:

    I recently became interested in snapshots and took a preliminary look at the dm-thin-provision code. My first impression is that the implementation approach is quite good; its weak point is performance, since a fairly large amount of data is written to the metadata device.

Still, the overall approach is well worth studying.

Introduction to dm-thin-provision

    thin-provision is one kind of device mapper target; it implements a particular mapping for storage devices, with the following characteristics:

(1) Multiple virtual devices can be stored in the same data volume, sharing data and saving space;

(2) Snapshots of arbitrary depth are supported. The previous implementation was O(depth); the new one uses a single data structure, so performance no longer degrades as snapshot depth grows.

(3) Metadata can be stored on a separate device, which makes it possible to put the metadata on a mirrored device or a faster SSD.

 Creating a thin-provisioned device

    There are two ways to create a dm thin-provision device: one is with the dmsetup tool, the other is with the lvm management tools.

    Creating dm-thin-provision with dmsetup

    a: Create the pool

          # dmsetup create pool \

               --table "0 20971520 thin-pool $metadata_dev $data_dev \

                   $data_block_size $low_water_mark"

         # dmsetup create yy_thin_pool --table '0 409600 thin-pool /dev/loop6 /dev/loop7 128 0'

         # dmsetup table /dev/mapper/yy_thin_pool

         0 409600 thin-pool 7:6 7:7 128 0 0

    b: Create a thin volume

         # dmsetup message /dev/mapper/yy_thin_pool 0 "create_thin 0"

         # dmsetup create thin --table "0 40960 thin /dev/mapper/yy_thin_pool 0"

         # dmsetup table /dev/mapper/thin

         0 40960 thin 253:3 0

    c: Create a snapshot

         # dmsetup suspend /dev/mapper/thin

         # dmsetup message /dev/mapper/yy_thin_pool 0 "create_snap 1 0"

         # dmsetup resume /dev/mapper/thin

         # dmsetup create snap --table "0 40960 thin /dev/mapper/yy_thin_pool 1"

   Creating thin provision with lvm

     a: Create the thin pool

         # dd if=/dev/zero of=lvm0.img bs=1024k count=256

         # losetup /dev/loop7 lvm0.img

         # pvcreate /dev/loop7

           Physical volume "/dev/loop7" successfully created

         # vgcreate vg_test /dev/loop7

           Volume group "vg_test" successfully created

         # lvcreate -L 200M -T vg_test/mythinpool

           Logical volume "lvol0" created

           Logical volume "mythinpool" created

         # ls /dev/mapper/* |grep mythin

          /dev/mapper/vg_test-mythinpool

          /dev/mapper/vg_test-mythinpool_tdata

          /dev/mapper/vg_test-mythinpool_tmeta

          /dev/mapper/vg_test-mythinpool-tpool

     b: Create a thin volume

        # lvcreate -T vg_test/mythinpool -V 300M -n lvol1

          Logical volume "lvol1" created

     c: Create a snapshot

        # lvcreate -s --name mysnapshot1 vg_test/lvol1

          Logical volume "mysnapshot1" created

The dm-thin-provision architecture


  thin-provision records the mapping for every block that is written; the actual mappings are stored on the metadata dev. The block size is between 64 KB and 1 GB, is passed in when the pool is created, and is validated in pool_ctr:

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
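As a rough, standalone illustration of this constraint (a simplified user-space sketch, not the kernel code: block_size_is_valid is my own helper, block sizes are given in 512-byte sectors, and as far as I recall pool_ctr also requires the size to be a multiple of 64 KB):

#include <stdbool.h>
#include <stdio.h>

#define SECTOR_SHIFT 9
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)          /* 128 sectors = 64 KB */
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) /* 2097152 sectors = 1 GB */

/* Accept a data block size (in sectors) only if it lies in [64 KB, 1 GB]
 * and is a multiple of 64 KB. */
static bool block_size_is_valid(unsigned long block_size)
{
        return block_size >= DATA_DEV_BLOCK_SIZE_MIN_SECTORS &&
               block_size <= DATA_DEV_BLOCK_SIZE_MAX_SECTORS &&
               !(block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1));
}

int main(void)
{
        printf("128: %d\n", block_size_is_valid(128));         /* 64 KB -> valid */
        printf("192: %d\n", block_size_is_valid(192));         /* 96 KB, not a 64 KB multiple -> invalid */
        printf("4194304: %d\n", block_size_is_valid(4194304)); /* 2 GB -> invalid */
        return 0;
}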

  For both reads and writes, the first step is to look the block up in the metadata, e.g. thin_bio_map -> dm_thin_find_block. If a read finds no block, nothing was ever written there, so the read should return all zeros.

If a write finds no block, a new entry can be inserted into the metadata tree, so that subsequent lookups return the correct block information.

A snapshot is itself a thin device, just like the one it was taken from, except that part of it can be shared. What happens when a snapshot is taken of a thin device?

a: Creating a snapshot creates a new device with its own thin_id; each device carries a time value, snapshotted_time. If the creation time recorded for a looked-up block is older than the device's snapshotted_time, the block is shared, i.e. it still holds the older, pre-snapshot version of the data.

b: If the block is shared, the block number has already been found, so a read can simply be remapped to it; a write, however, must first break the sharing by allocating a fresh block and copying the old contents.

c: If the block is not shared, the bio is remapped to it directly; and if a write finds no mapping at all, a new block is allocated, mapped to the current bio, and the mapping result is written to the metadata dev.
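To make the lookup and sharing rules concrete, here is a small user-space toy model (purely illustrative: toy_find_block, toy_write, toy_snapshot and the array-based "metadata" are my own inventions and do not exist in the kernel, and the real snapshot shares the btree instead of copying it). It mimics the behaviour just described: a read of an unmapped block yields zeros, a write allocates a block stamped with the current time, and a write to a block created before the last snapshot has to break the sharing first.

#include <stdbool.h>
#include <stdio.h>

#define NR_VIRT_BLOCKS 8

struct mapping {
        bool     valid;
        unsigned data_block;   /* block number in the shared data volume */
        unsigned time;         /* "creation time" stamped on the mapping */
};

struct toy_thin_dev {
        struct mapping map[NR_VIRT_BLOCKS]; /* stands in for the metadata btree */
        unsigned snapshotted_time;          /* bumped whenever a snapshot is taken */
};

static unsigned global_time = 1; /* pool-wide time, advanced per snapshot */
static unsigned next_data_block; /* trivial allocator for the data volume */

/* Loosely analogous to dm_thin_find_block: report whether a mapping exists,
 * and whether the mapped block is shared (older than the last snapshot). */
static bool toy_find_block(struct toy_thin_dev *td, unsigned virt,
                           unsigned *data_block, bool *shared)
{
        if (!td->map[virt].valid)
                return false;
        *data_block = td->map[virt].data_block;
        *shared = td->map[virt].time < td->snapshotted_time; /* cf. __snapshotted_since */
        return true;
}

static void toy_read(struct toy_thin_dev *td, unsigned virt)
{
        unsigned blk = 0;
        bool shared = false;

        if (toy_find_block(td, virt, &blk, &shared))
                printf("read  virt %u -> data block %u%s\n",
                       virt, blk, shared ? " (shared)" : "");
        else
                printf("read  virt %u -> unmapped, return zeros\n", virt);
}

static void toy_write(struct toy_thin_dev *td, unsigned virt)
{
        unsigned blk = 0;
        bool shared = false;
        bool found = toy_find_block(td, virt, &blk, &shared);

        if (!found || shared) {
                /* Unmapped or shared: allocate a fresh data block; a shared
                 * block additionally needs its old contents copied first
                 * (copy-on-write, i.e. break sharing). */
                blk = next_data_block++;
                td->map[virt] = (struct mapping){ true, blk, global_time };
                printf("write virt %u -> new data block %u%s\n",
                       virt, blk, (found && shared) ? " (broke sharing, copied old data)" : "");
        } else {
                printf("write virt %u -> in place, data block %u\n", virt, blk);
        }
}

static void toy_snapshot(struct toy_thin_dev *origin, struct toy_thin_dev *snap)
{
        global_time++;             /* advance pool time */
        *snap = *origin;           /* the real code shares the btree rather than copying it */
        origin->snapshotted_time = global_time;
        snap->snapshotted_time = global_time;
}

int main(void)
{
        struct toy_thin_dev thin = { .snapshotted_time = 1 };
        struct toy_thin_dev snap;

        toy_read(&thin, 0);         /* unmapped -> zeros */
        toy_write(&thin, 0);        /* allocates data block 0 */
        toy_snapshot(&thin, &snap);
        toy_read(&snap, 0);         /* shared with the origin */
        toy_write(&thin, 0);        /* breaks sharing, allocates data block 1 */
        toy_read(&snap, 0);         /* the snapshot still sees data block 0 */
        return 0;
}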

Key data structures

struct pool_c, representing the pool created at the very beginning:

struct pool_c {
        struct dm_target *ti;
        struct pool *pool;
        struct dm_dev *data_dev;
        struct dm_dev *metadata_dev;
        struct dm_target_callbacks callbacks;

        dm_block_t low_water_blocks;
        struct pool_features requested_pf; /* Features requested during table load */
        struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
};

struct thin_c, representing a thin dev that has already been created:

struct thin_c {
        struct list_head list;
        struct dm_dev *pool_dev;
        struct dm_dev *origin_dev;
        sector_t origin_size;
        dm_thin_id dev_id;

        struct pool *pool;
        struct dm_thin_device *td;
        struct mapped_device *thin_md;

        bool requeue_mode:1;
        spinlock_t lock;
        struct list_head deferred_cells;
        struct bio_list deferred_bio_list;
        struct bio_list retry_on_resume_list;
        struct rb_root sort_bio_list; /* sorted list of deferred bios */

        /*
         * Ensures the thin is not destroyed until the worker has finished
         * iterating the active_thins list.
         */
        atomic_t refcount;
        struct completion can_destroy;
};

The btrees described in the metadata (the dm_btree_info fields of struct dm_pool_metadata in dm-thin-metadata.c)

/*
 * Two-level btree.
 * First level holds thin_dev_t.
 * Second level holds mappings.
 */
struct dm_btree_info info;         /* describes the two-level tree: devices on top, mappings below */

/*
 * Non-blocking version of the above.
 */
struct dm_btree_info nb_info;      /* version that does not block on I/O */

/*
 * Just the top level for deleting whole devices.
 */
struct dm_btree_info tl_info;      /* top-level device tree */

/*
 * Just the bottom level for creating new devices.
 */
struct dm_btree_info bl_info;      /* bottom-level block-mapping tree */

/*
 * Describes the device details btree.
 */
struct dm_btree_info details_info; /* tree holding per-device details */
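A detail worth noting, from my reading of dm-thin-metadata.c (so treat the specifics as an approximation rather than a quote of the source): a mapping lookup uses two keys, keys[0] = the device's thin id for the top level and keys[1] = the virtual block for the bottom level, and the bottom-level value packs the data block number together with the creation time that the sharing check discussed later relies on, roughly like this:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t dm_block_t;

/* Approximation of the bottom-level btree value: the data block number in the
 * high bits and the creation time in the low 24 bits. */
static uint64_t pack_block_time(dm_block_t b, uint32_t t)
{
        return (b << 24) | t;
}

static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
{
        *b = v >> 24;
        *t = v & ((1 << 24) - 1);
}

int main(void)
{
        dm_block_t b;
        uint32_t t;
        uint64_t v = pack_block_time(12345, 7); /* data block 12345, created at time 7 */

        unpack_block_time(v, &b, &t);
        printf("block=%llu time=%u\n", (unsigned long long)b, (unsigned)t);
        return 0;
}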

Initialization and the I/O flow

a: The ioctl command entry point
static struct target_type pool_target = {
        .name = "thin-pool",
        .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
                    DM_TARGET_IMMUTABLE,
        .version = {1, 16, 0},
        .module = THIS_MODULE,
        .ctr = pool_ctr,
        .dtr = pool_dtr,
        .map = pool_map,
        .presuspend = pool_presuspend,
        .presuspend_undo = pool_presuspend_undo,
        .postsuspend = pool_postsuspend,
        .preresume = pool_preresume,
        .resume = pool_resume,
        .message = pool_message,
        .status = pool_status,
        .merge = pool_merge,
        .iterate_devices = pool_iterate_devices,
        .io_hints = pool_io_hints,
};
b: Creating a thin device

pool_message -> process_create_thin_mesg -> dm_pool_create_thin -> __create_thin -> dm_btree_insert -> insert -> btree_insert_raw

c: Creating a snapshot device

pool_message -> process_create_snap_mesg -> dm_pool_create_snap -> __create_snap -> dm_btree_insert -> __set_snapshot_details

It feels like the snapshot creation path should reuse the thin creation path, ideally by just passing an extra parameter; many of the functions are identical, which is wasted code.

d: map, the entry point for I/O handling

This starts from the block device layer.

For request-based I/O, the path is:

table_load -> dm_setup_md_queue -> dm_init_request_based_queue -> dm_request_fn -> queue_kthread_work ->

   kthread_worker_fn -> work->func() -> 

This tio work is set up in init_tio -> init_kthread_work(&tio->work, map_tio_request);

map_tio_request -> ti->type->map_rq(ti, clone, &tio->info);

When mapping a request, the clone is found first and then the target's map_rq is called.

For bio-based I/O (which is the case for thin targets), the path is as follows:

table_load -> dm_setup_md_queue -> blk_queue_make_request(md->queue, dm_make_request);

dm_make_request ->  __split_and_process_bio -> __split_and_process_non_flush -> __clone_and_map_data_bio

__map_bio -> ti->type->map(ti, clone);

e: Mapping I/O for thin devices and snapshot devices

thin_bio_map -> dm_thin_find_block -> dm_btree_lookup 

Check whether the block is found; if it is, set the state of the current block:

                result->block = exception_block;

                result->shared = __snapshotted_since(td, exception_time);

__snapshotted_since compares the block's creation time with the device's time; every snapshot operation increments the device's time value by one.

/*
 * Check whether @time (of block creation) is older than @td's last snapshot.
 * If so then the associated block is shared with the last snapshot device.
 * Any block on a device created *after* the device last got snapshotted is
 * necessarily not shared.
 */
static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
{
        return td->snapshotted_time > time;
}

The shared state is critical: if the lookup result comes back shared, the two devices are sharing that data block, so a write cannot modify it in place and is deferred to the pool worker, which breaks the sharing by allocating a new block and copying the old data.

If the result is not shared, the existing block can be used directly; the bio is detained on the data cell and then remapped:

build_data_key(tc->td, result.block, &key);
if (bio_detain(tc->pool, &key, bio, &data_cell)) {
        cell_defer_no_holder(tc, virt_cell);
        return DM_MAPIO_SUBMITTED;
}

The remap operation takes the block's base address plus the request's offset within the block to form the new address, and sends the bio down:

static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
        struct pool *pool = tc->pool;
        sector_t bi_sector = bio->bi_sector;

        bio->bi_bdev = tc->pool_dev->bdev;
        if (block_size_is_power_of_two(pool))
                bio->bi_sector = (block << pool->sectors_per_block_shift) |
                                (bi_sector & (pool->sectors_per_block - 1));
        else
                bio->bi_sector = (block * pool->sectors_per_block) +
                                 sector_div(bi_sector, pool->sectors_per_block);
}
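For example, with the 128-sector (64 KB) block size used in the dmsetup example above, sectors_per_block_shift is 7, so a bio arriving at sector 300 of the thin device that maps to data block 5 is redirected to sector (5 << 7) | (300 & 127) = 640 + 44 = 684 of the pool device.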

If no block is found, then depending on the situation a new block may need to be allocated before the bio is sent down:

case -ENODATA:
case -EWOULDBLOCK:
        thin_defer_cell(tc, virt_cell);
        return DM_MAPIO_SUBMITTED;

thin_defer_cell -> wake_worker -> do_worker

static void do_worker(struct work_struct *ws)
{
        struct pool *pool = container_of(ws, struct pool, worker);

        throttle_work_start(&pool->throttle);
        dm_pool_issue_prefetches(pool->pmd);
        throttle_work_update(&pool->throttle);
        process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
        throttle_work_update(&pool->throttle);
        process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
        throttle_work_update(&pool->throttle);
        process_deferred_bios(pool);
        throttle_work_complete(&pool->throttle);
}

What do_worker does is set up the new mappings and then send the bios down:

process_prepared_mapping -> dm_thin_insert_block -> remap_and_issue

The core is the remap function shown earlier; the issue step is fairly simple:

static void issue(struct thin_c *tc, struct bio *bio)
{
        struct pool *pool = tc->pool;
        unsigned long flags;

        if (!bio_triggers_commit(tc, bio)) {
                generic_make_request(bio);
                return;
        }

        /*
         * Complete bio with an error if earlier I/O caused changes to
         * the metadata that can't be committed e.g, due to I/O errors
         * on the metadata device.
         */
        if (dm_thin_aborted_changes(tc->td)) {
                bio_io_error(bio);
                return;
        }

        /*
         * Batch together any bios that trigger commits and then issue a
         * single commit for them in process_deferred_bios().
         */
        spin_lock_irqsave(&pool->lock, flags);
        bio_list_add(&pool->deferred_flush_bios, bio);
        spin_unlock_irqrestore(&pool->lock, flags);
}

 Conclusion

dm-thin-provision records the mapping of requests in a two-level btree on the metadata device, keyed by (thin_id, LBA). When a thin device is snapshotted, a time value records the snapshot, and during remapping this time value is compared to decide whether the current block is shared. This maximizes the utilization of block space and is very well suited to container-like scenarios.

The downside is that exporting incremental changes could be rather troublesome.

 
