I. Basic Principles
The Linux block layer uses a plug/unplug mechanism to improve IO throughput. The basic idea: when an IO request is submitted (file data is generally submitted in BIO form, via the submit_bio function), it is not handed straight to the block device driver (e.g. the UFS or MMC driver) but is first placed on a plug queue, which can be thought of as a pool that accumulates BIOs. Only after some trigger, condition, or period are the queued requests sent down in a batch to the IO scheduler layer, and everything sent to the IO scheduler is in the form of a Request. Putting requests into the plug queue is the plug ("pooling") phase; sending the batch of Requests down to the IO scheduler is the unplug ("draining") phase. No request waits on the plug queue for long, typically on the order of milliseconds.
This design increases the opportunities for IO merging and sorting, which in turn improves disk access efficiency.
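As a hedged illustration (not from the original article), this is how a submitter drives the mechanism through the real blk_start_plug()/blk_finish_plug() API; the function name submit_batch and its parameters are made up for the example:

	#include <linux/bio.h>
	#include <linux/blkdev.h>

	/*
	 * Sketch: batch several bios under one plug.  Every bio submitted
	 * between blk_start_plug() and blk_finish_plug() is first parked on
	 * the per-task plug list; blk_finish_plug() then unplugs in one go.
	 */
	static void submit_batch(struct bio **bios, int nr)
	{
		struct blk_plug plug;
		int i;

		blk_start_plug(&plug);		/* install &plug as current->plug */
		for (i = 0; i < nr; i++)
			submit_bio(bios[i]);	/* accumulates on the plug list */
		blk_finish_plug(&plug);		/* unplug: flush the plug list */
	}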
II. Plug and Unplug Flow
1. Plug
(1) Basic flow
An IO request sent down from the filesystem to the block layer is called a BIO. In the block layer, BIOs are merged and turned into new Requests, which pass through the IO scheduler layer (where they are sorted and merged) on their way to the block device driver. When the block layer sends a Request toward the IO scheduler, it does so through the request queue's make_request_fn hook (the assignment happens in blk_queue_make_request(q, blk_queue_bio)), whose job is to park the request on the submitting task's plug queue. When the plug queue is full, or when the task is scheduled out (in the schedule function), the requests on that task's plug queue are flushed, depending on the current process state, to the dispatch queue (the IO scheduler's dispatch queue), triggering the unplug flow.
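A minimal sketch of where that assignment happens, assuming a legacy (single-queue) driver; my_request_fn and my_create_queue are hypothetical names. blk_init_queue() creates the request_queue and internally installs blk_queue_bio as its make_request_fn via blk_queue_make_request():

	/* The driver's strategy routine (hypothetical name). */
	static void my_request_fn(struct request_queue *q)
	{
		/* fetch requests with blk_fetch_request() and drive the hardware */
	}

	static struct request_queue *my_create_queue(spinlock_t *my_lock)
	{
		/* blk_init_queue() -> blk_queue_make_request(q, blk_queue_bio) */
		return blk_init_queue(my_request_fn, my_lock);
	}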
Per-task plug queue: a mechanism implemented in newer kernel versions. Submitted IO requests are first linked into this plug queue; when the queue is full (> BLK_MAX_REQUEST_COUNT entries) or an IO request is larger than BLK_PLUG_FLUSH_SIZE, the queued requests are flushed to the corresponding device's request queue (request_queue).
Advantage: keeping the plug queue per task avoids the lock contention that frequent operations on the device's request queue would otherwise cause, improving efficiency.
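The plug queue itself is tiny; in 4.x-era kernels struct blk_plug looks roughly like this (quoted from memory, so treat it as approximate):

	struct blk_plug {
		struct list_head list;		/* plugged legacy requests */
		struct list_head mq_list;	/* plugged blk-mq requests */
		struct list_head cb_list;	/* unplug callbacks (e.g. for md) */
	};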
Plug framework diagram (original figure not reproduced here)
(2) Code flow
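The plug-side path, as can be traced in the blk_queue_bio() listing in the synchronous-unplug section below:
submit_bio -->
generic_make_request -->
make_request_fn -->
blk_queue_bio -->
list_add_tail(&req->queuelist, &plug->list) --> Note: the request is parked on the per-task plug list instead of being dispatched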
2. Unplug
(1) Basic flow
Unplug comes in two flavors: synchronous and asynchronous.
Synchronous unplug dispatches the requests on the request queue (request_queue) immediately, by calling __blk_run_queue.
Asynchronous unplug dispatches the requests on the request queue by waking the kblockd workqueue.
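Both flavors converge on queue_unplugged(), whose from_schedule flag picks the path. In 4.x-era blk-core.c it reads roughly as follows (approximate, abridged quotation):

	static void queue_unplugged(struct request_queue *q, unsigned int depth,
				    bool from_schedule)
		__releases(q->queue_lock)
	{
		trace_block_unplug(q, depth, !from_schedule);

		if (from_schedule)
			blk_run_queue_async(q);	/* asynchronous: hand off to kblockd */
		else
			__blk_run_queue(q);	/* synchronous: run request_fn now */
		spin_unlock(q->queue_lock);
	}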
(2) Synchronous unplug
Unplug at IO submission time: the request is first put on the task's plug queue; when the queue fills up (request_count >= BLK_MAX_REQUEST_COUNT) or the request size exceeds the plug flush size (blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE), the queue is flushed to the corresponding device's request queue (request_queue). With the defaults below, a plug therefore holds at most 16 requests, and once the most recently plugged request reaches 128 KiB the next submission flushes the queue.
#define BLK_MAX_REQUEST_COUNT 16
#define BLK_PLUG_FLUSH_SIZE (128 * 1024)

/* blk_rq_bytes: total data length of the request, in bytes */
static inline unsigned int blk_rq_bytes(const struct request *rq)
{
	return rq->__data_len;
}
Call chain:
submit_bio -->
generic_make_request -->
make_request_fn -->
blk_queue_bio -->
blk_flush_plug_list(plug, false) --> Note: from_schedule is passed as false here, so this is a synchronous unplug that dispatches the requests immediately
queue_unplugged -->
__blk_run_queue
static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
{
	struct blk_plug *plug;
	int where = ELEVATOR_INSERT_SORT;
	struct request *req, *free;
	unsigned int request_count = 0;

	/*
	 * low level driver can indicate that it wants pages above a
	 * certain limit bounced to low memory (ie for highmem, or even
	 * ISA dma in theory)
	 */
	blk_queue_bounce(q, &bio);

	blk_queue_split(q, &bio);

	if (!bio_integrity_prep(bio))
		return BLK_QC_T_NONE;

	if (op_is_flush(bio->bi_opf)) {
		spin_lock_irq(q->queue_lock);
		where = ELEVATOR_INSERT_FLUSH;
		goto get_rq;
	}

	/*
	 * Check if we can merge with the plugged list before grabbing
	 * any locks.
	 */
	if (!blk_queue_nomerges(q)) {
		/* try to merge the bio into a plugged request */
		if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
			return BLK_QC_T_NONE;
	} else
		request_count = blk_plug_queued_count(q);

	spin_lock_irq(q->queue_lock);

	blk_update_tw_state(q,
			    blk_io_vol_rqs(q, REQ_OP_WRITE),
			    blk_io_vol_bytes(q, REQ_OP_WRITE));

	switch (elv_merge(q, &req, bio)) {
	/* back merge: append the bio to an existing request */
	case ELEVATOR_BACK_MERGE:
		if (!bio_attempt_back_merge(q, req, bio))
			break;
		/*
		 * elv_bio_merged() invokes the elevator_bio_merged_fn hook
		 * registered by the elevator so the scheduler can react to
		 * the merge; for the deadline scheduler this hook is NULL.
		 */
		elv_bio_merged(q, req, bio);
		blk_queue_io_vol_merge(q, bio->bi_opf, 0, bio->bi_iter.bi_size);
		free = attempt_back_merge(q, req);
		if (free)
			__blk_put_request(q, free);
		else
			elv_merged_request(q, req, ELEVATOR_BACK_MERGE);
		goto out_unlock;
	/* front merge: prepend the bio to an existing request */
	case ELEVATOR_FRONT_MERGE:
		if (!bio_attempt_front_merge(q, req, bio))
			break;
		elv_bio_merged(q, req, bio);
		blk_queue_io_vol_merge(q, bio->bi_opf, 0, bio->bi_iter.bi_size);
		free = attempt_front_merge(q, req);
		if (free)
			__blk_put_request(q, free);
		else
			elv_merged_request(q, req, ELEVATOR_FRONT_MERGE);
		goto out_unlock;
	default:
		break;
	}

get_rq:
	rq_qos_throttle(q, bio, q->queue_lock);

	/*
	 * Grab a free request. This is might sleep but can not fail.
	 * Returns with the queue unlocked.
	 */
	blk_queue_enter_live(q);
	req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO);
	if (IS_ERR(req)) {
		blk_queue_exit(q);
		rq_qos_cleanup(q, bio);
		if (PTR_ERR(req) == -ENOMEM)
			bio->bi_status = BLK_STS_RESOURCE;
		else
			bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
		goto out_unlock;
	}

	rq_qos_track(q, req, bio);

	/*
	 * After dropping the lock and possibly sleeping here, our request
	 * may now be mergeable after it had proven unmergeable (above).
	 * We don't worry about that case for efficiency. It won't happen
	 * often, and the elevators are able to handle it.
	 */
	blk_init_request_from_bio(req, bio);

	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
		req->cpu = raw_smp_processor_id();

	/*
	 * Each task has its own plug queue; current is a macro naming the
	 * task running on this CPU.
	 */
	plug = current->plug;
	if (plug) {
		/*
		 * If this is the first request added after a plug, fire
		 * of a plug trace.
		 *
		 * @request_count may become stale because of schedule
		 * out, so check plug list again.
		 */
		if (!request_count || list_empty(&plug->list))
			trace_block_plug(q);
		else {
			struct request *last = list_entry_rq(plug->list.prev);
			/* unplug if request_count >= 16 or the last request is >= 128K */
			if (request_count >= BLK_MAX_REQUEST_COUNT ||
			    blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE) {
				blk_flush_plug_list(plug, false);
				trace_block_plug(q);
			}
		}
		/* link the request's queuelist into the plug queue */
		list_add_tail(&req->queuelist, &plug->list);
		blk_account_io_start(req, true);
	} else {
		spin_lock_irq(q->queue_lock);
		add_acct_request(q, req, where);
		/*
		 * No plug in effect: run the device queue directly, calling
		 * the request_fn provided by the block device driver to
		 * process the requests on the request queue.
		 */
		__blk_run_queue(q);
out_unlock:
		spin_unlock_irq(q->queue_lock);
	}

	return BLK_QC_T_NONE;
}
(3) Asynchronous unplug
When a kernel schedule occurs, the IO requests on the current task's plug queue are first flushed to the dispatch queue, and an unplug is performed, before the process goes to sleep.
Asynchronous unplug flow:
schedule-->
sched_submit_work-->
blk_schedule_flush_plug-->
blk_flush_plug_list(plug, true) --> Note: from_schedule is passed as true here, triggering an asynchronous unplug: the kblockd workqueue is woken to perform the unplug, and the subsequent kblockd wakeup period is configured in the block device driver
queue_unplugged-->
blk_run_queue_async
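For reference, blk_run_queue_async() (approximate 4.x code) simply schedules the queue's delayed work on the kblockd workqueue, which is what "waking kblockd" means here:

	void blk_run_queue_async(struct request_queue *q)
	{
		if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q)))
			mod_delayed_work(kblockd_workqueue, &q->delay_work, 0);
	}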
kernel/sched/core.c:
asmlinkage __visible void __sched schedule(void)
{
	struct task_struct *tsk = current;

	sched_submit_work(tsk);
	do {
		preempt_disable();
		__schedule(false);
		sched_preempt_enable_no_resched();
	} while (need_resched());
}

static inline void sched_submit_work(struct task_struct *tsk)
{
	if (!tsk->state || tsk_is_pi_blocked(tsk))
		return;
	/*
	 * If we are going to sleep and we have plugged IO queued,
	 * make sure to submit it to avoid deadlocks.
	 */
	if (blk_needs_flush_plug(tsk))
		blk_schedule_flush_plug(tsk);
}
static inline void blk_schedule_flush_plug(struct task_struct *tsk)
{
	struct blk_plug *plug = tsk->plug;

	if (plug)
		blk_flush_plug_list(plug, true);
}
void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
	struct request_queue *q;
	struct request *rq;
	LIST_HEAD(list);
	unsigned int depth;

	flush_plug_callbacks(plug, from_schedule);

	if (!list_empty(&plug->mq_list))
		blk_mq_flush_plug_list(plug, from_schedule);

	if (list_empty(&plug->list))
		return;

	list_splice_init(&plug->list, &list);
	list_sort(NULL, &list, plug_rq_cmp);

	q = NULL;
	depth = 0;
	while (!list_empty(&list)) {
		rq = list_entry_rq(list.next);
		list_del_init(&rq->queuelist);
		BUG_ON(!rq->q);
		if (rq->q != q) {
			/*
			 * This drops the queue lock
			 */
			if (q)
				queue_unplugged(q, depth, from_schedule);
			q = rq->q;
			depth = 0;
			spin_lock_irq(q->queue_lock);
		}

		/*
		 * Short-circuit if @q is dead
		 */
		if (unlikely(blk_queue_dying(q))) {
			__blk_end_request_all(rq, BLK_STS_IOERR);
			continue;
		}

		/*
		 * rq is already accounted, so use raw insert
		 */
		if (op_is_flush(rq->cmd_flags))
			__elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
		else
			__elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);

		depth++;
	}

	/*
	 * This drops the queue lock
	 */
	if (q)
		queue_unplugged(q, depth, from_schedule);
}
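The list_sort() above uses plug_rq_cmp(), which orders requests first by request_queue and then by start sector, so the while loop can unplug each queue exactly once with all of its requests batched together (approximate 4.x code):

	static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
	{
		struct request *rqa = container_of(a, struct request, queuelist);
		struct request *rqb = container_of(b, struct request, queuelist);

		return !(rqa->q < rqb->q ||
			 (rqa->q == rqb->q && blk_rq_pos(rqa) < blk_rq_pos(rqb)));
	}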