Learning the Linux Operating System Step by Step: 32. The Input/Output System, Block Devices (Part 2): Direct I/O, Buffered I/O, and Block Device Write Requests

Direct I/O and Buffered I/O

See also https://blog.csdn.net/leacock1991/article/details/108035136

For the ext4 filesystem, a write ultimately reaches ext4_file_write_iter, which splits the I/O call into two cases (a minimal userspace sketch contrasting them follows this list):

  • The first case is direct I/O.
    It ends up calling generic_file_direct_write, which invokes mapping->a_ops->direct_IO; for ext4 this resolves to ext4_direct_IO, which writes data down to the device layer.

  • The second case is buffered I/O.
    The data is copied from the application into the in-memory page cache, but no real I/O is performed at that moment; the whole page, or part of it, is merely marked dirty. The actual write is triggered later by a timer, at which point wb_workfn is called to write the pages to disk.

    • The call chain after wb_workfn:
      wb_workfn->wb_do_writeback->wb_writeback->writeback_sb_inodes->__writeback_single_inode->do_writepages. In do_writepages, mapping->a_ops->writepages is invoked, which for ext4 resolves to ext4_writepages, writing data down to the device layer.
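
To make the two paths concrete, here is a minimal userspace sketch (not part of the kernel code in this article) that writes the same buffer once with O_DIRECT and once through the page cache. The file names and the 4096-byte alignment are assumptions for illustration; O_DIRECT generally requires the buffer, offset, and length to be aligned to the device's logical block size.

#define _GNU_SOURCE             /* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
  const size_t len = 4096;      /* assumed block-aligned length */
  void *buf;

  /* O_DIRECT needs an aligned buffer; 4096 is a common requirement */
  if (posix_memalign(&buf, 4096, len))
    return 1;
  memset(buf, 'A', len);

  /* Direct I/O: bypasses the page cache, reaches ext4_direct_IO */
  int dfd = open("direct.dat", O_WRONLY | O_CREAT | O_DIRECT, 0644);
  if (dfd < 0 || write(dfd, buf, len) != (ssize_t)len)
    perror("direct write");
  if (dfd >= 0)
    close(dfd);

  /* Buffered I/O: only dirties the page cache; fsync forces writeback
   * immediately instead of waiting for the timer-driven wb_workfn */
  int bfd = open("buffered.dat", O_WRONLY | O_CREAT, 0644);
  if (bfd < 0 || write(bfd, buf, len) != (ssize_t)len)
    perror("buffered write");
  if (bfd >= 0) {
    fsync(bfd);
    close(bfd);
  }

  free(buf);
  return 0;
}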

How does direct I/O access the block device?

Direct I/O ends up calling ext4_direct_IO.

The ext4_direct_IO and ext4_direct_IO_write functions

\linux-4.13.16\fs\ext4\inode.c


static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
  struct file *file = iocb->ki_filp;
  struct inode *inode = file->f_mapping->host;
  size_t count = iov_iter_count(iter);
  loff_t offset = iocb->ki_pos;
  ssize_t ret;
......
  ret = ext4_direct_IO_write(iocb, iter);
......
}


static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
{
  struct file *file = iocb->ki_filp;
  struct inode *inode = file->f_mapping->host;
  struct ext4_inode_info *ei = EXT4_I(inode);
  ssize_t ret;
  loff_t offset = iocb->ki_pos;
  size_t count = iov_iter_count(iter);
......
  ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
           get_block_func, ext4_end_io_dio, NULL,
           dio_flags);


......
}


When ext4_direct_IO_write calls __blockdev_direct_IO, one of the arguments is inode->i_sb->s_bdev: through the current file's inode we can reach the super_block, and the s_bdev in that super_block is exactly the block_device that was filled in back in https://blog.csdn.net/leacock1991/article/details/108308446.

__blockdev_direct_IO then calls do_blockdev_direct_IO.

The do_blockdev_direct_IO function

\linux-4.13.16\fs\direct-io.c


static inline ssize_t
do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
          struct block_device *bdev, struct iov_iter *iter,
          get_block_t get_block, dio_iodone_t end_io,
          dio_submit_t submit_io, int flags)
{
  unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
  unsigned blkbits = i_blkbits;
  unsigned blocksize_mask = (1 << blkbits) - 1;
  ssize_t retval = -EINVAL;
  size_t count = iov_iter_count(iter);
  loff_t offset = iocb->ki_pos;
  loff_t end = offset + count;
  struct dio *dio;
  struct dio_submit sdio = { 0, };
  struct buffer_head map_bh = { 0, };
......
  dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
  dio->flags = flags;
  dio->i_size = i_size_read(inode);
  dio->inode = inode;
  if (iov_iter_rw(iter) == WRITE) {
    dio->op = REQ_OP_WRITE;
    dio->op_flags = REQ_SYNC | REQ_IDLE;
    if (iocb->ki_flags & IOCB_NOWAIT)
      dio->op_flags |= REQ_NOWAIT;
  } else {
    dio->op = REQ_OP_READ;
  }
  sdio.blkbits = blkbits;
  sdio.blkfactor = i_blkbits - blkbits;
  sdio.block_in_file = offset >> blkbits;


  sdio.get_block = get_block;
  dio->end_io = end_io;
  sdio.submit_io = submit_io;
  sdio.final_block_in_bio = -1;
  sdio.next_block_for_io = -1;


  dio->iocb = iocb;
  dio->refcount = 1;


  sdio.iter = iter;
  sdio.final_block_in_request =
    (offset + iov_iter_count(iter)) >> blkbits;
......
  sdio.pages_in_io += iov_iter_npages(iter, INT_MAX);


  retval = do_direct_IO(dio, &sdio, &map_bh);
......
}


Inside this function, the struct dio and struct dio_submit structures are used to describe the write that is about to happen.

The struct dio structure

\linux-4.13.16\fs\direct-io.c

/* dio_state communicated between submission path and end_io */
struct dio {
	int flags;			/* doesn't change */
	int op;
	int op_flags;
	blk_qc_t bio_cookie;
	struct block_device *bio_bdev;
	struct inode *inode;
	loff_t i_size;			/* i_size when submitted */
	dio_iodone_t *end_io;		/* IO completion function */

	void *private;			/* copy from map_bh.b_private */

	/* BIO completion state */
	spinlock_t bio_lock;		/* protects BIO fields below */
	int page_errors;		/* errno from get_user_pages() */
	int is_async;			/* is IO async ? */
	bool defer_completion;		/* defer AIO completion to workqueue? */
	bool should_dirty;		/* if pages should be dirtied */
	int io_error;			/* IO error in completion path */
	unsigned long refcount;		/* direct_io_worker() and bios */
	struct bio *bio_list;		/* singly linked via bi_private */
	struct task_struct *waiter;	/* waiting task (NULL if none) */

	/* AIO related stuff */
	struct kiocb *iocb;		/* kiocb */
	ssize_t result;                 /* IO result */

	/*
	 * pages[] (and any fields placed after it) are not zeroed out at
	 * allocation time.  Don't add new fields after pages[] unless you
	 * wish that they not be zeroed.
	 */
	union {
		struct page *pages[DIO_PAGES];	/* page buffer */
		struct work_struct complete_work;/* deferred AIO completion */
	};
} ____cacheline_aligned_in_smp;


struct bio is the generic transfer object that carries data to the block device.

The struct dio_submit structure

\linux-4.13.16\fs\direct-io.c

struct dio_submit {
	struct bio *bio;		/* bio under assembly */
	unsigned blkbits;		/* doesn't change */
	unsigned blkfactor;		/* When we're using an alignment which
					   is finer than the filesystem's soft
					   blocksize, this specifies how much
					   finer.  blkfactor=2 means 1/4-block
					   alignment.  Does not change */
	unsigned start_zero_done;	/* flag: sub-blocksize zeroing has
					   been performed at the start of a
					   write */
	int pages_in_io;		/* approximate total IO pages */
	sector_t block_in_file;		/* Current offset into the underlying
					   file in dio_block units. */
	unsigned blocks_available;	/* At block_in_file.  changes */
	int reap_counter;		/* rate limit reaping */
	sector_t final_block_in_request;/* doesn't change */
	int boundary;			/* prev block is at a boundary */
	get_block_t *get_block;		/* block mapping function */
	dio_submit_t *submit_io;	/* IO submition function */

	loff_t logical_offset_in_bio;	/* current first logical block in bio */
	sector_t final_block_in_bio;	/* current final block in bio + 1 */
	sector_t next_block_for_io;	/* next block to be put under IO,
					   in dio_blocks units */

	/*
	 * Deferred addition of a page to the dio.  These variables are
	 * private to dio_send_cur_page(), submit_page_section() and
	 * dio_bio_add_page().
	 */
	struct page *cur_page;		/* The page */
	unsigned cur_page_offset;	/* Offset into it, in bytes */
	unsigned cur_page_len;		/* Nr of bytes at cur_page_offset */
	sector_t cur_page_block;	/* Where it starts */
	loff_t cur_page_fs_offset;	/* Offset in file */

	struct iov_iter *iter;
	/*
	 * Page queue.  These variables belong to dio_refill_pages() and
	 * dio_get_page().
	 */
	unsigned head;			/* next page to process */
	unsigned tail;			/* last valid page + 1 */
	size_t from, to;
};

do_blockdev_direct_IO then calls do_direct_IO.

The do_direct_IO function

\linux-4.13.16\fs\direct-io.c


static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
      struct buffer_head *map_bh)
{
  const unsigned blkbits = sdio->blkbits;
  const unsigned i_blkbits = blkbits + sdio->blkfactor;
  int ret = 0;


  while (sdio->block_in_file < sdio->final_block_in_request) {
    struct page *page;
    size_t from, to;


    page = dio_get_page(dio, sdio);
    from = sdio->head ? 0 : sdio->from;
    to = (sdio->head == sdio->tail - 1) ? sdio->to : PAGE_SIZE;
    sdio->head++;


    while (from < to) {
      unsigned this_chunk_bytes;  /* # of bytes mapped */
      unsigned this_chunk_blocks;  /* # of blocks */
......
      ret = submit_page_section(dio, sdio, page,
                from,
                this_chunk_bytes,
                sdio->next_block_for_io,
                map_bh);
......
      sdio->next_block_for_io += this_chunk_blocks;
      sdio->block_in_file += this_chunk_blocks;
      from += this_chunk_bytes;
      dio->result += this_chunk_bytes;
      sdio->blocks_available -= this_chunk_blocks;
      if (sdio->block_in_file == sdio->final_block_in_request)
        break;
......
    }
  }
}

do_direct_IO contains two levels of loops:

  • The outer loop iterates over all the blocks to be written in this request.
    For each block, it fetches the corresponding in-memory page; within that page, from and to mark the start and end of the region to write.
  • The inner loop processes the data from from to to (a simplified chunking sketch follows this list).
    It calls submit_page_section to submit the data to the block device layer for writing.
    submit_page_section calls dio_bio_submit, which in turn calls submit_bio to submit the data to the block device layer.
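
The arithmetic of the inner loop can be sketched in isolation. The following hypothetical userspace fragment splits one page's [from, to) byte range into block-sized chunks and advances the same counters the kernel loop does; the 512-byte block size and the concrete numbers are assumptions, and it is simplified to one block per iteration, whereas the real do_direct_IO can cover several mapped blocks in one chunk.

#include <stdio.h>

#define BLKBITS 9                     /* assumed 512-byte blocks */
#define BLKSIZE (1u << BLKBITS)

int main(void)
{
  unsigned from = 1024, to = 3584;        /* hypothetical range in the page */
  unsigned long next_block_for_io = 80;   /* next device block to write */

  while (from < to) {
    unsigned this_chunk_bytes = to - from > BLKSIZE ? BLKSIZE : to - from;
    unsigned this_chunk_blocks = this_chunk_bytes >> BLKBITS;

    printf("submit bytes [%u, %u) -> device block %lu\n",
           from, from + this_chunk_bytes, next_block_for_io);

    next_block_for_io += this_chunk_blocks;
    from += this_chunk_bytes;
  }
  return 0;
}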

How does buffered I/O access the block device?

Buffered I/O ends up calling ext4_writepages.

The ext4_writepages function

\linux-4.13.16\fs\ext4\inode.c


static int ext4_writepages(struct address_space *mapping,
         struct writeback_control *wbc)
{
......
  struct mpage_da_data mpd;
  struct inode *inode = mapping->host;
  struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
......
  mpd.do_map = 0;
  mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
  ret = mpage_prepare_extent_to_map(&mpd);
  /* Submit prepared bio */
  ext4_io_submit(&mpd.io_submit);
......
}

An important data structure here is struct mpage_da_data.

The struct mpage_da_data structure

\linux-4.13.16\fs\ext4\inode.c


struct mpage_da_data {
  struct inode *inode;
......
  pgoff_t first_page;  /* The first page to write */
  pgoff_t next_page;  /* Current page to examine */
  pgoff_t last_page;  /* Last page to examine */
  struct ext4_map_blocks map;
  struct ext4_io_submit io_submit;  /* IO submission data */
  unsigned int do_map:1;
};


struct ext4_io_submit {
......
  struct bio    *io_bio;
  ext4_io_end_t    *io_end;
  sector_t    io_next_block;
};


It holds the file's inode, the offsets of the pages to be written, and an important struct ext4_io_submit, which contains the generic transfer object bio; both structures are shown above.

In ext4_writepages, mpage_prepare_extent_to_map initializes this struct mpage_da_data structure.

The call chain from there is: mpage_prepare_extent_to_map->mpage_process_page_bufs->mpage_submit_page->ext4_bio_write_page->io_submit_add_bh.

The io_submit_add_bh function

\linux-4.13.16\fs\ext4\page-io.c

static int io_submit_add_bh(struct ext4_io_submit *io,
			    struct inode *inode,
			    struct page *page,
			    struct buffer_head *bh)
{
	int ret;

	if (io->io_bio && bh->b_blocknr != io->io_next_block) {
submit_and_retry:
		ext4_io_submit(io);
	}
	if (io->io_bio == NULL) {
		ret = io_submit_init_bio(io, bh);
		if (ret)
			return ret;
		io->io_bio->bi_write_hint = inode->i_write_hint;
	}
	ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
	if (ret != bh->b_size)
		goto submit_and_retry;
	wbc_account_io(io->io_wbc, page, bh->b_size);
	io->io_next_block++;
	return 0;
}


In io_submit_add_bh, the bio is still empty at this point, so io_submit_init_bio is called to initialize it.

The io_submit_init_bio function

\linux-4.13.16\fs\ext4\page-io.c


static int io_submit_init_bio(struct ext4_io_submit *io,
            struct buffer_head *bh)
{
  struct bio *bio;


  bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
  if (!bio)
    return -ENOMEM;
  wbc_init_bio(io->io_wbc, bio);
  bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
  bio->bi_bdev = bh->b_bdev;
  bio->bi_end_io = ext4_end_bio;
  bio->bi_private = ext4_get_io_end(io->io_end);
  io->io_bio = bio;
  io->io_next_block = bh->b_blocknr;
  return 0;
}

Back in ext4_writepages: after the bio has been initialized, ext4_io_submit is called to submit the I/O.

The ext4_io_submit function

\linux-4.13.16\fs\ext4\page-io.c


void ext4_io_submit(struct ext4_io_submit *io)
{
  struct bio *bio = io->io_bio;


  if (bio) {
    int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ?
          REQ_SYNC : 0;
    io->io_bio->bi_write_hint = io->io_end->inode->i_write_hint;
    bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags);
    submit_bio(io->io_bio);
  }
  io->io_bio = NULL;
}


ext4_io_submit again calls submit_bio to send data to the block device layer, just like the direct I/O path, where submit_page_section calls dio_bio_submit, which in turn calls submit_bio.

How is a request submitted to the block device layer?

Whether the I/O is direct or buffered, it all ends up in submit_bio.

submit_bio in turn calls generic_make_request (\linux-4.13.16\block\blk-core.c).

The generic_make_request function

The two most important pieces of logic in generic_make_request are: obtaining a request queue, request_queue [this section], and calling that queue's make_request_fn function [see "Request submission and scheduling" below].

\linux-4.13.16\block\blk-core.c


blk_qc_t generic_make_request(struct bio *bio)
{
  /*
   * bio_list_on_stack[0] contains bios submitted by the current
   * make_request_fn.
   * bio_list_on_stack[1] contains bios that were submitted before
   * the current make_request_fn, but that haven't been processed
   * yet.
   */
  struct bio_list bio_list_on_stack[2];
  blk_qc_t ret = BLK_QC_T_NONE;
......
  if (current->bio_list) {
    bio_list_add(&current->bio_list[0], bio);
    goto out;
  }


  bio_list_init(&bio_list_on_stack[0]);
  current->bio_list = bio_list_on_stack;
  do {
    struct request_queue *q = bdev_get_queue(bio->bi_bdev);


    if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) {
      struct bio_list lower, same;


      /* Create a fresh bio_list for all subordinate requests */
      bio_list_on_stack[1] = bio_list_on_stack[0];
      bio_list_init(&bio_list_on_stack[0]);
      ret = q->make_request_fn(q, bio);


      blk_queue_exit(q);


      /* sort new bios into those for a lower level
       * and those for the same level
       */
      bio_list_init(&lower);
      bio_list_init(&same);
      while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
        if (q == bdev_get_queue(bio->bi_bdev))
          bio_list_add(&same, bio);
        else
          bio_list_add(&lower, bio);
      /* now assemble so we handle the lowest level first */
      bio_list_merge(&bio_list_on_stack[0], &lower);
      bio_list_merge(&bio_list_on_stack[0], &same);
      bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
    } 
......
    bio = bio_list_pop(&bio_list_on_stack[0]);
  } while (bio);
  current->bio_list = NULL; /* deactivate */
out:
  return ret;
}

Inside the do-while loop, it first obtains the request_queue, then calls that queue's make_request_fn function.

The request_queue structure

\linux-4.13.16\include\linux\blkdev.h


struct request_queue {
  /*
   * Together with queue_head for cacheline sharing
   */
  struct list_head  queue_head;
  struct request    *last_merge;
  struct elevator_queue  *elevator;
......
  request_fn_proc    *request_fn;
  make_request_fn    *make_request_fn;
......
}

From struct block_device and struct gendisk we can see that every block device has a request queue, struct request_queue, for handling requests sent down from the upper layers.

A request_queue is created when each block device's driver is initialized.

The linked list queue_head (a list_head) holds the request objects.

The request structure

\linux-4.13.16\include\linux\blkdev.h


struct request {
  struct list_head queuelist;
......
  struct request_queue *q;
......
  struct bio *bio;
  struct bio *biotail;
......
}

Each request contains a linked list of struct bio, with bio and biotail pointing to its head and tail.
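
The queue/request/bio relationship can be made concrete with a short kernel-style sketch. The helper below is hypothetical, written only to show how the lists chain together; real code would also hold the queue lock while walking them.

#include <linux/blkdev.h>

/* Hypothetical helper: report how many bios each queued request carries */
static void dump_queue_shape(struct request_queue *q)
{
  struct request *rq;

  /* requests hang off q->queue_head via their queuelist member */
  list_for_each_entry(rq, &q->queue_head, queuelist) {
    unsigned int nbios = 0;
    struct bio *bio;

    /* bi_next chains the bios belonging to one request */
    for (bio = rq->bio; bio; bio = bio->bi_next)
      nbios++;

    printk(KERN_DEBUG "request with %u bios\n", nbios);
  }
}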

The struct bio structure

\linux-4.13.16\include\linux\blk_types.h


struct bio {
  struct bio    *bi_next;  /* request queue link */
  struct block_device  *bi_bdev;
  blk_status_t    bi_status;
......
    struct bvec_iter  bi_iter;
  unsigned short    bi_vcnt;  /* how many bio_vec's */
  unsigned short    bi_max_vecs;  /* max bvl_vecs we can hold */
  atomic_t    __bi_cnt;  /* pin count */
  struct bio_vec    *bi_io_vec;  /* the actual vec list */
......
};


struct bio_vec {
  struct page  *bv_page;
  unsigned int  bv_len;
  unsigned int  bv_offset;
};


Within a bio, bi_next links to the next bio in the list, and bi_io_vec points to an array of struct bio_vec, each of which refers to a page of data.

(Figures from the Geek Time course 趣谈Linux操作系统)
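
Since a bio carries its payload as an array of struct bio_vec, kernel code usually walks the segments with the bio_for_each_segment helper from include/linux/bio.h. A hedged kernel-style sketch (the surrounding function is hypothetical, written only to show the iteration):

#include <linux/bio.h>

/* Hypothetical helper: count the payload bytes carried by one bio */
static unsigned int count_bio_bytes(struct bio *bio)
{
  struct bio_vec bvec;
  struct bvec_iter iter;
  unsigned int bytes = 0;

  /* visits each (page, offset, len) segment in bi_io_vec */
  bio_for_each_segment(bvec, bio, iter)
    bytes += bvec.bv_len;     /* bvec.bv_page is the backing page */

  return bytes;
}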
The request_queue also carries two important functions: make_request_fn, which generates requests, and request_fn, which processes them.

Block device initialization

Taking the SCSI driver as an example: when the device driver is initialized, scsi_alloc_queue sets request_fn to scsi_request_fn, and blk_init_allocated_queue->blk_queue_make_request sets make_request_fn to blk_queue_bio.

scsi_alloc_sdev calls scsi_alloc_queue.

The scsi_alloc_sdev function

\linux-4.13.16\drivers\scsi\scsi_scan.c


/**
 * scsi_alloc_sdev - allocate and setup a scsi_Device
 * @starget: which target to allocate a &scsi_device for
 * @lun: which lun
 * @hostdata: usually NULL and set by ->slave_alloc instead
 *
 * Description:
 *     Allocate, initialize for io, and return a pointer to a scsi_Device.
 *     Stores the @shost, @channel, @id, and @lun in the scsi_Device, and
 *     adds scsi_Device to the appropriate list.
 *
 * Return value:
 *     scsi_Device pointer, or NULL on failure.
 **/
static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
             u64 lun, void *hostdata)
{
  struct scsi_device *sdev;
  struct Scsi_Host *shost = dev_to_shost(starget->dev.parent);

  sdev = kzalloc(sizeof(*sdev) + shost->transportt->device_size,
           GFP_ATOMIC);
......
  sdev->request_queue = scsi_alloc_queue(sdev);
......
}


The scsi_alloc_queue function

\linux-4.13.16\drivers\scsi\scsi_lib.c

struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
{
  struct Scsi_Host *shost = sdev->host;
  struct request_queue *q;


  q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE);
  if (!q)
    return NULL;
  q->cmd_size = sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
  q->rq_alloc_data = shost;
  q->request_fn = scsi_request_fn;
  q->init_rq_fn = scsi_init_rq;
  q->exit_rq_fn = scsi_exit_rq;
  q->initialize_rq_fn = scsi_initialize_rq;


    // calls blk_queue_make_request(q, blk_queue_bio)
  if (blk_init_allocated_queue(q) < 0) {
    blk_cleanup_queue(q);
    return NULL;
  }


  __scsi_init_queue(shost, q);
......
  return q;
}


scsi_alloc_queue calls blk_init_allocated_queue.

The blk_init_allocated_queue function

\linux-4.13.16\block\blk-core.c


int blk_init_allocated_queue(struct request_queue *q)
{
  q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size);
......
  blk_queue_make_request(q, blk_queue_bio);
......
  /* init elevator */
  if (elevator_init(q, NULL)) {
......
  }
......
}


Besides setting up the make_request_fn function, it does something else very important: it initializes the I/O elevator algorithm.

blk_init_allocated_queue calls blk_queue_make_request to set make_request_fn to blk_queue_bio.

The blk_queue_make_request function

\linux-4.13.16\block\blk-settings.c

void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
{
	/*
	 * set defaults
	 */
	q->nr_requests = BLKDEV_MAX_RQ;

	q->make_request_fn = mfn;
	blk_queue_dma_alignment(q, 511);
	blk_queue_congestion_threshold(q);
	q->nr_batching = BLK_BATCH_REQ;

	blk_set_default_limits(&q->limits);
}


A brief look at elevator algorithms

There are several types of elevator algorithm, each defined by a struct elevator_type.

The struct elevator_type structure

\linux-4.13.16\include\linux\elevator.h


/*
 * identifies an elevator type, such as AS or deadline
 */
struct elevator_type
{
	/* managed by elevator core */
	struct kmem_cache *icq_cache;

	/* fields provided by elevator implementation */
	union {
		struct elevator_ops sq;
		struct elevator_mq_ops mq;
	} ops;
	size_t icq_size;	/* see iocontext.h */
	size_t icq_align;	/* ditto */
	struct elv_fs_entry *elevator_attrs;
	char elevator_name[ELV_NAME_MAX];
	struct module *elevator_owner;
	bool uses_mq;
#ifdef CONFIG_BLK_DEBUG_FS
	const struct blk_mq_debugfs_attr *queue_debugfs_attrs;
	const struct blk_mq_debugfs_attr *hctx_debugfs_attrs;
#endif

	/* managed by elevator core */
	char icq_cache_name[ELV_NAME_MAX + 6];	/* elvname + "_io_cq" */
	struct list_head list;
};


  • struct elevator_type elevator_noop
    Noop is the simplest I/O scheduling algorithm: it places I/O requests in a FIFO queue and executes them one by one.
  • struct elevator_type iosched_deadline
    The Deadline algorithm guarantees that every I/O request is serviced within a certain time, preventing any request from starving.
  • struct elevator_type iosched_cfq
    CFQ, the Completely Fair Queueing algorithm. Requests are sorted into multiple queues; requests from the same process always go to the same queue. Time slices are allocated to each queue and serviced round-robin, so I/O bandwidth is shared fairly between the queues.

elevator_init selects an elevator algorithm by name; if none is specified, the default iosched_cfq is used.
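
Which elevator a queue is using can be read from sysfs, where the active scheduler is shown in brackets. A small userspace sketch (the device name sda is an assumption):

#include <stdio.h>

int main(void)
{
  /* prints e.g. "noop deadline [cfq]" for a single-queue device */
  FILE *f = fopen("/sys/block/sda/queue/scheduler", "r");
  char line[256];

  if (f && fgets(line, sizeof(line), f))
    fputs(line, stdout);
  if (f)
    fclose(f);
  return 0;
}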

Request submission and scheduling

Back in the generic_make_request function:
Whether the I/O is direct or buffered, it all ends up in submit_bio, and submit_bio calls generic_make_request.

Taking the SCSI driver as the example, blk_init_allocated_queue has called blk_queue_make_request to set make_request_fn to blk_queue_bio.

So when generic_make_request calls the queue's make_request_fn, it is in fact calling blk_queue_bio.

The blk_queue_bio function

\linux-4.13.16\block\blk-core.c


static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
{
  struct request *req, *free;
  unsigned int request_count = 0;
......
  switch (elv_merge(q, &req, bio)) {
  case ELEVATOR_BACK_MERGE:
    if (!bio_attempt_back_merge(q, req, bio))
      break;
    elv_bio_merged(q, req, bio);
    free = attempt_back_merge(q, req);
    if (free)
      __blk_put_request(q, free);
    else
      elv_merged_request(q, req, ELEVATOR_BACK_MERGE);
    goto out_unlock;
  case ELEVATOR_FRONT_MERGE:
    if (!bio_attempt_front_merge(q, req, bio))
      break;
    elv_bio_merged(q, req, bio);
    free = attempt_front_merge(q, req);
    if (free)
      __blk_put_request(q, free);
    else
      elv_merged_request(q, req, ELEVATOR_FRONT_MERGE);
    goto out_unlock;
  default:
    break;
  }


get_rq:
  req = get_request(q, bio->bi_opf, bio, GFP_NOIO);
......
  blk_init_request_from_bio(req, bio);
......
  add_acct_request(q, req, where);
  __blk_run_queue(q);
out_unlock:
......
  return BLK_QC_T_NONE;
}


blk_queue_bio calls elv_merge to determine whether the current bio can be merged into an existing request, so that they become one batch of I/O operations, improving read and write performance.

The merge criteria are based on the struct bvec_iter member of struct bio, which holds two fields: the starting sector bi_sector and the size bi_size.

The elv_merge function

\linux-4.13.16\block\elevator.c


enum elv_merge elv_merge(struct request_queue *q, struct request **req,
    struct bio *bio)
{
  struct elevator_queue *e = q->elevator;
  struct request *__rq;
......
  if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) {
    enum elv_merge ret = blk_try_merge(q->last_merge, bio);


    if (ret != ELEVATOR_NO_MERGE) {
      *req = q->last_merge;
      return ret;
    }
  }
......
  __rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);
  if (__rq && elv_bio_merge_ok(__rq, bio)) {
    *req = __rq;
    return ELEVATOR_BACK_MERGE;
  }


  if (e->uses_mq && e->type->ops.mq.request_merge)
    return e->type->ops.mq.request_merge(q, req, bio);
  else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn)
    return e->type->ops.sq.elevator_merge_fn(q, req, bio);


  return ELEVATOR_NO_MERGE;
}

elv_merge attempts a merge three times:

  • First, it checks whether the bio can merge once more with the request merged last time (q->last_merge): can it catch the elevator that is just about to leave?

blk_try_merge mainly makes the following checks (a worked numeric example follows the function below):

If blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector, that is, the request's starting address plus its size (in other words, the request's end address) lines up with the bio's starting address, then the bio is placed at the end of the request: ELEVATOR_BACK_MERGE.

If blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector, that is, the request's starting address minus the bio's size equals the bio's starting address, then the bio fits right in front of the request and is placed at its front: ELEVATOR_FRONT_MERGE. Otherwise, no merge happens: ELEVATOR_NO_MERGE.

The blk_try_merge function
\linux-4.13.16\block\blk-merge.c


enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
{
......
    if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
    return ELEVATOR_BACK_MERGE;
  else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
    return ELEVATOR_FRONT_MERGE;
  return ELEVATOR_NO_MERGE;
}

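As a worked example with hypothetical numbers: suppose an existing request covers sectors [100, 108), i.e. blk_rq_pos(rq) = 100 and blk_rq_sectors(rq) = 8. The following sketch mirrors blk_try_merge's arithmetic with stand-in values:

#include <stdio.h>

int main(void)
{
  unsigned long rq_pos = 100, rq_sectors = 8;       /* request: [100, 108) */
  unsigned long bio_sector = 108, bio_sectors = 4;  /* bio:     [108, 112) */

  if (rq_pos + rq_sectors == bio_sector)
    printf("ELEVATOR_BACK_MERGE: request grows to [100, 112)\n");
  else if (rq_pos - bio_sectors == bio_sector)
    printf("ELEVATOR_FRONT_MERGE: bio goes in front of the request\n");
  else
    printf("ELEVATOR_NO_MERGE\n");
  return 0;
}

A bio covering [96, 100) would instead satisfy the second test (100 - 4 == 96) and be a front merge.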

  • Second, if it cannot merge with the previously merged request, elv_merge calls elv_rqhash_find.
    This looks up a request by the bio's starting address to see whether any can be merged. If one is found, then because the lookup was by starting address, the bio must go behind it, so this is ELEVATOR_BACK_MERGE.

  • Third, it calls the elevator's elevator_merge_fn to attempt a merge.

For iosched_cfq, this is cfq_merge. Inside it, cfq_find_rq_fmerge calls elv_rb_find, passing the bio's end address as the key.

If a mergeable request is found, then because the lookup was by end address, the bio must go in front of it, so this is ELEVATOR_FRONT_MERGE.

\linux-4.13.16\block\cfq-iosched.c


static enum elv_merge cfq_merge(struct request_queue *q, struct request **req,
         struct bio *bio)
{
  struct cfq_data *cfqd = q->elevator->elevator_data;
  struct request *__rq;


  __rq = cfq_find_rq_fmerge(cfqd, bio);
  if (__rq && elv_bio_merge_ok(__rq, bio)) {
    *req = __rq;
    return ELEVATOR_FRONT_MERGE;
  }


  return ELEVATOR_NO_MERGE;
}


static struct request *
cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
{
  struct task_struct *tsk = current;
  struct cfq_io_cq *cic;
  struct cfq_queue *cfqq;


  cic = cfq_cic_lookup(cfqd, tsk->io_context);
  if (!cic)
    return NULL;


  cfqq = cic_to_cfqq(cic, op_is_sync(bio->bi_opf));
  if (cfqq)
    return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio));


  return NULL;
}


By the time elv_merge returns to blk_queue_bio, the caller knows which type of merge to perform, and the actual merge is carried out next.

If no merge is possible, get_request is called to create a new request, blk_init_request_from_bio puts the bio into the new request, and add_acct_request adds the new request to the request_queue.

This completes the two most important pieces of logic in generic_make_request: obtaining a request queue, request_queue [How is a request submitted to the block device layer?], and calling that queue's make_request_fn function [Request submission and scheduling].

Request processing

When the device driver writes to the device itself, it uses the request queue's other function, request_fn.
For a SCSI device, that is scsi_request_fn.

The scsi_request_fn function

\linux-4.13.16\drivers\scsi\scsi_lib.c


static void scsi_request_fn(struct request_queue *q)
  __releases(q->queue_lock)
  __acquires(q->queue_lock)
{
  struct scsi_device *sdev = q->queuedata;
  struct Scsi_Host *shost;
  struct scsi_cmnd *cmd;
  struct request *req;


  /*
   * To start with, we keep looping until the queue is empty, or until
   * the host is no longer able to accept any more requests.
   */
  shost = sdev->host;
  for (;;) {
    int rtn;
    /*
     * get next queueable request.  We do this early to make sure
     * that the request is fully prepared even if we cannot
     * accept it.
     */
    req = blk_peek_request(q);
......
    /*
     * Remove the request from the request list.
     */
    if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))
      blk_start_request(req);
.....
    cmd = req->special;
......
    /*
     * Dispatch the command to the low-level driver.
     */
    cmd->scsi_done = scsi_done;
    rtn = scsi_dispatch_cmd(cmd);
......
  }
  return;
......
}

This is an unbounded for loop that takes requests from the request_queue, wraps them into lower-level commands, and issues those commands to the device controller to perform the real I/O.

Summary

Block device I/O operations come in two kinds: direct I/O and buffered I/O.

Whichever kind it is, submit_bio is eventually called to submit the block device I/O request.

In the buffered case, what we call writing to a block device really means writing the data in the page cache out to the disk.

Every block device is represented by a gendisk, which has a request queue consisting of a series of request objects. Each request contains multiple bio objects that point into the page cache.

The request queue also has two functions:

  • One is the make_request_fn function,
    which puts requests into the queue. submit_bio calls generic_make_request, which then calls this function.
  • The other, implemented in the device driver, is the request_fn function,
    which takes requests off the queue and writes them to the external device.

(Figure from the Geek Time course 趣谈Linux操作系统)

References:

趣谈Linux操作系统 (Geek Time): http://gk.link/a/10iXZ
Everyone is welcome to discuss and learn together.
