I. Basic Principles
The Linux block layer uses a plug/unplug mechanism to improve IO throughput. The basic idea: when an IO request is submitted (file data is generally submitted in BIO form, via the submit_bio function), it is not handed straight to the block device driver (e.g. the UFS or MMC driver) but is first placed on a plug queue, which can be thought of as a pool that accumulates BIOs. Only after some trigger, condition, or period are the queued requests sent down in a batch to the IO scheduler layer, and everything sent to the IO scheduler is in the form of a Request. Putting requests into the plug queue is the plug ("pooling") phase; sending the batch of Requests down to the IO scheduler is the unplug ("draining") phase. No request waits on the plug queue for long, typically on the order of milliseconds.
This design increases the opportunities for IO merging and sorting, which in turn improves disk access efficiency.
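As a hedged illustration (not from the original article), this is how a submitter drives the mechanism through the real blk_start_plug()/blk_finish_plug() API; the function name submit_batch and its parameters are made up for the example:

	#include <linux/bio.h>
	#include <linux/blkdev.h>

	/*
	 * Sketch: batch several bios under one plug.  Every bio submitted
	 * between blk_start_plug() and blk_finish_plug() is first parked on
	 * the per-task plug list; blk_finish_plug() then unplugs in one go.
	 */
	static void submit_batch(struct bio **bios, int nr)
	{
		struct blk_plug plug;
		int i;

		blk_start_plug(&plug);		/* install &plug as current->plug */
		for (i = 0; i < nr; i++)
			submit_bio(bios[i]);	/* accumulates on the plug list */
		blk_finish_plug(&plug);		/* unplug: flush the plug list */
	}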
II. Plug and Unplug Flow
1. Plug
(1) Basic flow
An IO request sent down from the filesystem to the block layer is called a BIO. In the block layer, BIOs are merged and turned into new Requests, which pass through the IO scheduler layer (where they are sorted and merged) on their way to the block device driver. When the block layer sends a Request toward the IO scheduler, it does so through the request queue's make_request_fn hook (the assignment happens in blk_queue_make_request(q, blk_queue_bio)), whose job is to park the request on the submitting task's plug queue. When the plug queue is full, or when the task is scheduled out (in the schedule function), the requests on that task's plug queue are flushed, depending on the current process state, to the dispatch queue (the IO scheduler's dispatch queue), triggering the unplug flow.
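A minimal sketch of where that assignment happens, assuming a legacy (single-queue) driver; my_request_fn and my_create_queue are hypothetical names. blk_init_queue() creates the request_queue and internally installs blk_queue_bio as its make_request_fn via blk_queue_make_request():

	/* The driver's strategy routine (hypothetical name). */
	static void my_request_fn(struct request_queue *q)
	{
		/* fetch requests with blk_fetch_request() and drive the hardware */
	}

	static struct request_queue *my_create_queue(spinlock_t *my_lock)
	{
		/* blk_init_queue() -> blk_queue_make_request(q, blk_queue_bio) */
		return blk_init_queue(my_request_fn, my_lock);
	}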
Per-task plug queue: a mechanism implemented in newer kernel versions. Submitted IO requests are first linked into this plug queue; when the queue is full (> BLK_MAX_REQUEST_COUNT entries) or an IO request is larger than BLK_PLUG_FLUSH_SIZE, the queued requests are flushed to the corresponding device's request queue (request_queue).
Advantage: keeping the plug queue per task avoids the lock contention that frequent operations on the device's request queue would otherwise cause, improving efficiency.
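The plug queue itself is tiny; in 4.x-era kernels struct blk_plug looks roughly like this (quoted from memory, so treat it as approximate):

	struct blk_plug {
		struct list_head list;		/* plugged legacy requests */
		struct list_head mq_list;	/* plugged blk-mq requests */
		struct list_head cb_list;	/* unplug callbacks (e.g. for md) */
	};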
Plug framework diagram (original figure not reproduced here)
(2) Code flow
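The plug-side path, as can be traced in the blk_queue_bio() listing in the synchronous-unplug section below:
submit_bio -->
generic_make_request -->
make_request_fn -->
blk_queue_bio -->
list_add_tail(&req->queuelist, &plug->list) --> Note: the request is parked on the per-task plug list instead of being dispatched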
2. Unplug
(1) Basic flow
Unplug comes in two flavors: synchronous and asynchronous.
Synchronous unplug dispatches the requests on the request queue (request_queue) immediately, by calling __blk_run_queue.
Asynchronous unplug dispatches the requests on the request queue by waking the kblockd workqueue.
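Both flavors converge on queue_unplugged(), whose from_schedule flag picks the path. In 4.x-era blk-core.c it reads roughly as follows (approximate, abridged quotation):

	static void queue_unplugged(struct request_queue *q, unsigned int depth,
				    bool from_schedule)
		__releases(q->queue_lock)
	{
		trace_block_unplug(q, depth, !from_schedule);

		if (from_schedule)
			blk_run_queue_async(q);	/* asynchronous: hand off to kblockd */
		else
			__blk_run_queue(q);	/* synchronous: run request_fn now */
		spin_unlock(q->queue_lock);
	}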
(2) Synchronous unplug
Unplug at IO submission time: the request is first put on the task's plug queue; when the queue fills up (request_count >= BLK_MAX_REQUEST_COUNT) or the request size exceeds the plug flush size (blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE), the queue is flushed to the corresponding device's request queue (request_queue). With the defaults below, a plug therefore holds at most 16 requests, and once the most recently plugged request reaches 128 KiB the next submission flushes the queue.
#define BLK_MAX_REQUEST_COUNT 16
#define BLK_PLUG_FLUSH_SIZE (128 * 1024)

/* blk_rq_bytes: total data length of the request, in bytes */
static inline unsigned int blk_rq_bytes(const struct request *rq)
{
	return rq->__data_len;
}
Call chain:
submit_bio -->
generic_make_request -->
make_request_fn -->
blk_queue_bio -->
blk_flush_plug_list(plug, false) --> Note: from_schedule is passed as false here, so this is a synchronous unplug that dispatches the requests immediately
queue_unplugged -->
__blk_run_queue
static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
{
	struct blk_plug *plug;
	int where = ELEVATOR_INSERT_SORT;
	struct request *req, *free;
	unsigned int request_count = 0;

	/*
	 * low level driver can indicate that it wants pages above a
	 * certain limit bounced to low memory (ie for highmem, or even
	 * ISA dma in theory)
	 */
	blk_queue_bounce(q, &bio);

	blk_queue_split(q, &bio);

	if (!bio_integrity_prep(bio))
		return BLK_QC_T_NONE;

	if (op_is_flush(bio->bi_opf)) {
		spin_lock_irq(q->queue_lock);
		where = ELEVATOR_INSERT_FLUSH;
		goto get_rq;
	}

	/*
	 * Check if we can merge with the plugged list before grabbing
	 * any locks.
	 */
	if (!blk_queue_nomerges(q)) {
		/* try to merge the bio into a plugged request */
		if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
			return BLK_QC_T_NONE;
	} else
		request_count = blk_plug_queued_count(q);

	spin_lock_irq(q->queue_lock);

	blk_update_tw_state(q,
			    blk_io_vol_rqs(q, REQ_OP_WRITE),
			    blk_io_vol_bytes(q, REQ_OP_WRITE));

	switch (elv_merge(q, &req, bio)) {
	/* back merge: append the bio to an existing request */
	case ELEVATOR_BACK_MERGE:
		if (!bio_attempt_back_merge(q, req, bio))
			break;
		/*
		 * elv_bio_merged() invokes the elevator_bio_merged_fn hook
		 * registered by the elevator so the scheduler can react to
		 * the merge; for the deadline scheduler this hook is NULL.
		 */
		elv_bio_merged(q, req, bio);
		blk_queue_io_vol_merge(q, bio->bi_opf, 0, bio->bi_iter.bi_size);
		free = attempt_back_merge(q, req);
		if (free)
			__blk_put_request(q, free);
		else
			elv_merged_request(q, req, ELEVATOR_BACK_MERGE);
		goto out_unlock;
	/* front merge: prepend the bio to an existing request */
	case ELEVATOR_FRONT_MERGE:
		if (!bio_attempt_front_merge(q, req, bio))
			break;
		elv_bio_merged(q, req, bio);
		blk_queue_io_vol_merge(q, bio->bi_opf, 0, bio->bi_iter.bi_size);
		free = attempt_front_merge(q, req);
		if (free)
			__blk_put_request(q, free);
		else
			elv_merged_request(q, req, ELEVATOR_FRONT_MERGE);
		goto out_unlock;
	default:
		break;
	}

get_rq:
	rq_qos_throttle(q, bio, q->queue_lock);

	/*
	 * Grab a free request. This is might sleep but can not fail.
	 * Returns with the queue unlocked.
	 */
	blk_queue_enter_live(q);
	req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO);
	if (IS_ERR(req)) {
		blk_queue_exit(q);
		rq_qos_cleanup(q, bio);
		if (PTR_ERR(req) == -ENOMEM)
			bio->bi_status = BLK_STS_RESOURCE;
		else
			bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
		goto out_unlock;
	}

	rq_qos_track(q, req, bio);

	/*
	 * After dropping the lock and possibly sleeping here, our request
	 * may now be mergeable after it had proven unmergeable (above).
	 * We don't worry about that case for efficiency. It won't happen
	 * often, and the elevators are able to handle it.
	 */
	blk_init_request_from_bio(req, bio);

	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
		req->cpu = raw_smp_processor_id();

	/*
	 * Each task has its own plug queue; current is a macro naming the
	 * task running on this CPU.
	 */
	plug = current->plug;
	if (plug) {
		/*
		 * If this is the first request added after a plug, fire
		 * of a plug trace.
		 *
		 * @request_count may become stale because of schedule
		 * out, so check plug list again.
		 */
		if (!request_count || list_empty(&plug->list))
			trace_block_plug(q);
		else {
			struct request *last = list_entry_rq(plug->list.prev);
			/* unplug if request_count >= 16 or the last request is >= 128K */
			if (request_count >= BLK_MAX_REQUEST_COUNT ||
			    blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE) {
				blk_flush_plug_list(plug, false);
				trace_block_plug(q);
			}
		}
		/* link the request's queuelist into the plug queue */
		list_add_tail(&req->queuelist, &plug->list);
		blk_account_io_start(req, true);
	} else {
		spin_lock_irq(q->queue_lock);
		add_acct_request(q, req, where);
		/*
		 * No plug in effect: run the device queue directly, calling
		 * the request_fn provided by the block device driver to
		 * process the requests on the request queue.
		 */
		__blk_run_queue(q);
out_unlock:
		spin_unlock_irq(q->queue_lock);
	}

	return BLK_QC_T_NONE;
}
(3) Asynchronous unplug
When a kernel schedule occurs, the IO requests on the current task's plug queue are first flushed to the dispatch queue, and an unplug is performed, before the process goes to sleep.
Asynchronous unplug flow:
schedule-->
sched_submit_work-->
blk_schedule_flush_plug-->
blk_flush_plug_list(plug, true) --> Note: from_schedule is passed as true here, triggering an asynchronous unplug: the kblockd workqueue is woken to perform the unplug, and the subsequent kblockd wakeup period is configured in the block device driver
queue_unplugged-->
blk_run_queue_async
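For reference, blk_run_queue_async() (approximate 4.x code) simply schedules the queue's delayed work on the kblockd workqueue, which is what "waking kblockd" means here:

	void blk_run_queue_async(struct request_queue *q)
	{
		if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q)))
			mod_delayed_work(kblockd_workqueue, &q->delay_work, 0);
	}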
kernel/sched/core.c:
asmlinkage __visible void __sched schedule(void)
{
	struct task_struct *tsk = current;

	sched_submit_work(tsk);
	do {
		preempt_disable();
		__schedule(false);
		sched_preempt_enable_no_resched();
	} while (need_resched());
}

static inline void sched_submit_work(struct task_struct *tsk)
{
	if (!tsk->state || tsk_is_pi_blocked(tsk))
		return;
	/*
	 * If we are going to sleep and we have plugged IO queued,
	 * make sure to submit it to avoid deadlocks.
	 */
	if (blk_needs_flush_plug(tsk))
		blk_schedule_flush_plug(tsk);
}
static inline void blk_schedule_flush_plug(struct task_struct *tsk)
{
	struct blk_plug *plug = tsk->plug;

	if (plug)
		blk_flush_plug_list(plug, true);
}
void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
	struct request_queue *q;
	struct request *rq;
	LIST_HEAD(list);
	unsigned int depth;

	flush_plug_callbacks(plug, from_schedule);

	if (!list_empty(&plug->mq_list))
		blk_mq_flush_plug_list(plug, from_schedule);

	if (list_empty(&plug->list))
		return;

	list_splice_init(&plug->list, &list);
	list_sort(NULL, &list, plug_rq_cmp);

	q = NULL;
	depth = 0;
	while (!list_empty(&list)) {
		rq = list_entry_rq(list.next);
		list_del_init(&rq->queuelist);
		BUG_ON(!rq->q);
		if (rq->q != q) {
			/*
			 * This drops the queue lock
			 */
			if (q)
				queue_unplugged(q, depth, from_schedule);
			q = rq->q;
			depth = 0;
			spin_lock_irq(q->queue_lock);
		}

		/*
		 * Short-circuit if @q is dead
		 */
		if (unlikely(blk_queue_dying(q))) {
			__blk_end_request_all(rq, BLK_STS_IOERR);
			continue;
		}

		/*
		 * rq is already accounted, so use raw insert
		 */
		if (op_is_flush(rq->cmd_flags))
			__elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
		else
			__elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);

		depth++;
	}

	/*
	 * This drops the queue lock
	 */
	if (q)
		queue_unplugged(q, depth, from_schedule);
}
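The list_sort() above uses plug_rq_cmp(), which orders requests first by request_queue and then by start sector, so the while loop can unplug each queue exactly once with all of its requests batched together (approximate 4.x code):

	static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
	{
		struct request *rqa = container_of(a, struct request, queuelist);
		struct request *rqb = container_of(b, struct request, queuelist);

		return !(rqa->q < rqb->q ||
			 (rqa->q == rqb->q && blk_rq_pos(rqa) < blk_rq_pos(rqb)));
	}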