操作系统学习-3.Linux文件系统学习2-IO过程分析

最新推荐文章于 2023-12-11 17:22:38 发布

沈万三djh

最新推荐文章于 2023-12-11 17:22:38 发布

阅读量435

点赞数

分类专栏：操作系统文章标签： linux 运维服务器

本文链接：https://blog.csdn.net/djhsws/article/details/120883409

版权

操作系统专栏收录该内容

9 篇文章 2 订阅

订阅专栏

IO过程分析

io调度框架
- 整体流程
- 细节展开
io的提交过程
- 从pagecache到bio到request
- submit_bio

io调度框架

整体流程

在这里插入图片描述

细节展开

在这里插入图片描述

io的提交过程

从pagecache到bio到request

我们从read流程和write流程中都知道，最后都会到readpages或者writepages中来

我们就从readpage开始分析，write的基本上和《Linux-块设备驱动之框架详细分析(详解)》讲的差不多，

也就是会到ll_rw_block函数中，至于怎么到这个函数中，就不求甚解了。

我们来看readpage流程。
static int read_page

s(struct address_space *mapping, struct file *filp,
    struct list_head *pages, unsigned nr_pages)
{
  struct blk_plug plug;
  unsigned page_idx;
  int ret;

  blk_start_plug(&plug);

  if (mapping->a_ops->readpages) {
    put_pages_list(pages);
    goto out;
  }

  for (page_idx = 0; page_idx < nr_pages; page_idx++) {
    struct page *page = list_to_page(pages);
    list_del(&page->lru);
    if (!add_to_page_cache_lru(page, mapping, page->index,
        mapping_gfp_constraint(mapping, GFP_KERNEL))) {
      mapping->a_ops->readpage(filp, page);
    }
    page_cache_release(page);
  }
  ret = 0;

out:
  blk_finish_plug(&plug);

  return ret;
}

假设我们读取的文件还没有在lru中。如果文件系统实现了readpages，就会走mapping->a_ops->readpages；只有没有实现readpages时，才会逐页走mapping->a_ops->readpage

我们假设是ext4文件系统。

/*
 * Address-space operations table for ext4.
 * Designated initializers, so the order of the entries is irrelevant.
 */
static const struct address_space_operations ext4_aops = {
  /* read side */
  .readpage = ext4_readpage,
  .readpages = ext4_readpages,

  /* write side */
  .writepage = ext4_writepage,
  .writepages = ext4_writepages,
  .write_begin = ext4_write_begin,
  .write_end = ext4_write_end,
  .direct_IO = ext4_direct_IO,

  /* mapping and page lifetime */
  .bmap = ext4_bmap,
  .invalidatepage = ext4_invalidatepage,
  .releasepage = ext4_releasepage,

  /* handlers that are not ext4-specific functions */
  .migratepage = buffer_migrate_page,
  .is_partially_uptodate = block_is_partially_uptodate,
  .error_remove_page = generic_error_remove_page,
};

/*
 * ext4_readpages - ->readpages entry point for ext4.
 *
 * Forwards the page list to ext4_mpage_readpages() (with no single page),
 * unless the inode behind @mapping has inline data, in which case nothing
 * is done and 0 is returned. @file is not used here.
 */
static int
ext4_readpages(struct file *file, struct address_space *mapping,
    struct list_head *pages, unsigned nr_pages)
{
  if (!ext4_has_inline_data(mapping->host))
    return ext4_mpage_readpages(mapping, pages, NULL, nr_pages);

  return 0;
}

没有做什么，就是你要多少pages，就转到ext4_mpage_readpages中来。

/*
 * ext4_mpage_readpages - build and submit read bios for one page or a list
 * of pages.
 * @mapping:  address space being read; its host inode supplies block size,
 *            file size and the backing block device
 * @pages:    list of pages to read, consumed from the tail (pages->prev);
 *            may be NULL, in which case @page is the only page source
 * @page:     page to read when @pages is NULL (ext4_readpages passes NULL
 *            here together with a list)
 * @nr_pages: number of pages to process; the loop counts it down to zero
 *
 * For every page the file blocks behind it are resolved with
 * ext4_map_blocks(). Pages whose blocks are physically contiguous are added
 * to the current bio; anything the fast path cannot express goes to the
 * "confused" fallback, which reads the page with block_read_full_page().
 *
 * Always returns 0. A mapping or allocation failure is recorded on the
 * affected page with SetPageError() instead of being returned.
 */
int ext4_mpage_readpages(struct address_space *mapping,
       struct list_head *pages, struct page *page,
       unsigned nr_pages)
{
  struct bio *bio = NULL;
  unsigned page_idx;
  sector_t last_block_in_bio = 0;

  struct inode *inode = mapping->host;
  const unsigned blkbits = inode->i_blkbits;
  const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
  const unsigned blocksize = 1 << blkbits;
  sector_t block_in_file;
  sector_t last_block;
  sector_t last_block_in_file;
  /* Physical block number of each block of the page being processed. */
  sector_t blocks[MAX_BUF_PER_PAGE];
  unsigned page_block;
  struct block_device *bdev = inode->i_sb->s_bdev;
  int length;
  unsigned relative_block = 0;
  struct ext4_map_blocks map;

  /*
   * Start with an empty mapping: m_flags == 0 makes the "previous result"
   * test below fail for the first page.
   */
  map.m_pblk = 0;
  map.m_lblk = 0;
  map.m_len = 0;
  map.m_flags = 0;

  /*
   * nr_pages is decremented per iteration, so inside the body it is the
   * number of pages still to process, including the current one.
   */
  for (page_idx = 0; nr_pages; page_idx++, nr_pages--) {
    int fully_mapped = 1;
    /* blocks_per_page doubles as the "no hole found" sentinel. */
    unsigned first_hole = blocks_per_page;

    prefetchw(&page->flags);
    if (pages) {
      /*
       * List mode: take the page at the tail of the list and insert it
       * into the page cache. If insertion fails, skip the page; the
       * next_page label then only releases it.
       */
      page = list_entry(pages->prev, struct page, lru);
      list_del(&page->lru);
      if (add_to_page_cache_lru(page, mapping, page->index,
          mapping_gfp_constraint(mapping, GFP_KERNEL)))
        goto next_page;
    }

    /* Pages that already carry buffers are left to the fallback path. */
    if (page_has_buffers(page))
      goto confused;

    /*
     * block_in_file: first file block covered by this page.
     * last_block: end of the remaining request, clamped to the number of
     * blocks implied by the inode size.
     */
    block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
    last_block = block_in_file + nr_pages * blocks_per_page;
    last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
    if (last_block > last_block_in_file)
      last_block = last_block_in_file;
    page_block = 0;

    /*
     * Map blocks using the previous result first.
     */
    if ((map.m_flags & EXT4_MAP_MAPPED) &&
        block_in_file > map.m_lblk &&
        block_in_file < (map.m_lblk + map.m_len)) {
      unsigned map_offset = block_in_file - map.m_lblk;
      unsigned last = map.m_len - map_offset;

      for (relative_block = 0; ; relative_block++) {
        if (relative_block == last) {
          /* needed? */
          map.m_flags &= ~EXT4_MAP_MAPPED;
          break;
        }
        if (page_block == blocks_per_page)
          break;
        blocks[page_block] = map.m_pblk + map_offset +
          relative_block;
        page_block++;
        block_in_file++;
      }
    }

    /*
     * Then do more ext4_map_blocks() calls until we are
     * done with this page.
     */
    while (page_block < blocks_per_page) {
      if (block_in_file < last_block) {
        map.m_lblk = block_in_file;
        map.m_len = last_block - block_in_file;

        if (ext4_map_blocks(NULL, inode, &map, 0) < 0) {
          /*
           * Error exit for this page. Also the target of gotos from the
           * bio setup code further down (crypto context or bio
           * allocation failure).
           */
        set_error_page:
          SetPageError(page);
          zero_user_segment(page, 0,
                PAGE_CACHE_SIZE);
          unlock_page(page);
          goto next_page;
        }
      }
      if ((map.m_flags & EXT4_MAP_MAPPED) == 0) {
        /* Unmapped block: remember where the first hole starts. */
        fully_mapped = 0;
        if (first_hole == blocks_per_page)
          first_hole = page_block;
        page_block++;
        block_in_file++;
        continue;
      }
      if (first_hole != blocks_per_page)
        goto confused;    /* hole -> non-hole */

      /* Contiguous blocks? */
      if (page_block && blocks[page_block-1] != map.m_pblk-1)
        goto confused;
      for (relative_block = 0; ; relative_block++) {
        if (relative_block == map.m_len) {
          /* needed? */
          map.m_flags &= ~EXT4_MAP_MAPPED;
          break;
        } else if (page_block == blocks_per_page)
          break;
        blocks[page_block] = map.m_pblk+relative_block;
        page_block++;
        block_in_file++;
      }
    }
    if (first_hole != blocks_per_page) {
      /*
       * The page has a hole: zero from the first hole to the end of the
       * page. If the hole starts at block 0 the page needs no I/O at all.
       */
      zero_user_segment(page, first_hole << blkbits,
            PAGE_CACHE_SIZE);
      if (first_hole == 0) {
        SetPageUptodate(page);
        unlock_page(page);
        goto next_page;
      }
    } else if (fully_mapped) {
      SetPageMappedToDisk(page);
    }
    /*
     * If cleancache_get_page() returns 0 the page is marked up to date;
     * the confused path then just unlocks it instead of reading it.
     */
    if (fully_mapped && blocks_per_page == 1 &&
        !PageUptodate(page) && cleancache_get_page(page) == 0) {
      SetPageUptodate(page);
      goto confused;
    }

    /*
     * This page will go to BIO.  Do we need to send this
     * BIO off first?
     */
    if (bio && (last_block_in_bio != blocks[0] - 1)) {
      /*
       * Also reached by goto when bio_add_page() below could not take
       * the whole length: submit the current bio and start a new one.
       */
    submit_and_realloc:
      ext4_submit_bio_read(bio);
      bio = NULL;
    }
    if (bio == NULL) {
      struct ext4_crypto_ctx *ctx = NULL;

      /* A crypto context is only obtained for encrypted regular files. */
      if (ext4_encrypted_inode(inode) &&
          S_ISREG(inode->i_mode)) {
        ctx = ext4_get_crypto_ctx(inode);
        if (IS_ERR(ctx))
          goto set_error_page;
      }
      bio = bio_alloc(GFP_KERNEL,
        min_t(int, nr_pages, BIO_MAX_PAGES));
      if (!bio) {
        if (ctx)
          ext4_release_crypto_ctx(ctx);
        goto set_error_page;
      }
      bio->bi_bdev = bdev;
      /* Shift by (blkbits - 9): block number scaled to 2^9-byte units. */
      bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
      bio->bi_end_io = mpage_end_io;
      bio->bi_private = ctx;
    }

    /*
     * Only the bytes before the first hole are read; with no hole,
     * first_hole == blocks_per_page and this is the whole page.
     */
    length = first_hole << blkbits;
    if (bio_add_page(bio, page, length, 0) < length)
      goto submit_and_realloc;

    /*
     * Submit right away if the mapping was flagged EXT4_MAP_BOUNDARY and
     * fully consumed, or if the page had a hole. Otherwise remember the
     * last block so the next page can be checked for contiguity.
     */
    if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
         (relative_block == map.m_len)) ||
        (first_hole != blocks_per_page)) {
      ext4_submit_bio_read(bio);
      bio = NULL;
    } else
      last_block_in_bio = blocks[blocks_per_page - 1];
    goto next_page;
    /*
     * Fallback: flush any pending bio, then read the page with
     * block_read_full_page() unless it is already up to date.
     */
  confused:
    if (bio) {
      ext4_submit_bio_read(bio);
      bio = NULL;
    }
    if (!PageUptodate(page))
      block_read_full_page(page, ext4_get_block);
    else
      unlock_page(page);
    /* In list mode, release the page taken off the list above. */
  next_page:
    if (pages)
      page_cache_release(page);
  }
  /* In list mode every page must have been taken off the list. */
  BUG_ON(pages && !list_empty(pages));
  /* Submit whatever is still pending after the last page. */
  if (bio)
    ext4_submit_bio_read(bio);
  return 0;
}

1、前面是做一些结构体的初始化

2、后面就是for循环遍历每个page

3、然后建立一个bio结构体

4、然后对bio结构体的数据进行初始化

5、建立page和bio的关联。bio_add_page

6、然后是提交这个bio去申请数据ext4_submit_bio_read

这里我们要注意关联的部分

/*
 * bio_add_page - attach @len bytes of @page, starting at @offset, to @bio.
 *
 * Returns @len on success. Returns 0 if @bio is flagged BIO_CLONED (with a
 * one-time warning) or if its vector table is already full.
 *
 * If the new range directly continues the last vector entry (same page,
 * offset equal to the end of that entry), the entry is extended instead of
 * using a new slot.
 */
int bio_add_page(struct bio *bio, struct page *page,
     unsigned int len, unsigned int offset)
{
  struct bio_vec *tail;
  struct bio_vec *slot;

  /* A cloned bio must not have its vector list modified. */
  if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
    return 0;

  if (bio->bi_vcnt > 0) {
    tail = &bio->bi_io_vec[bio->bi_vcnt - 1];

    /* Merge case: grow the last entry, no new slot needed. */
    if (tail->bv_page == page &&
        tail->bv_offset + tail->bv_len == offset) {
      tail->bv_len += len;
      bio->bi_iter.bi_size += len;
      return len;
    }
  }

  /* No free slot left in the vector table. */
  if (bio->bi_vcnt >= bio->bi_max_vecs)
    return 0;

  slot = &bio->bi_io_vec[bio->bi_vcnt];
  slot->bv_page = page;
  slot->bv_len = len;
  slot->bv_offset = offset;
  bio->bi_vcnt++;

  bio->bi_iter.bi_size += len;
  return len;
}

关联也很简单就是数据指针。我们通过之前的《2.Linux 块设备驱动代码编写—简单驱动》这个程序里，

pRHdata = pdev->data + (bio->bi_sector * RAMHD_SECTOR_SIZE); 
  bio_for_each_segment(bvec, bio, i){ 
    pBuffer = kmap(bvec->bv_page) + bvec->bv_offset; 
    switch(bio_data_dir(bio)){ 
      case READ: 
        memcpy(pBuffer, pRHdata, bvec->bv_len); 
        flush_dcache_page(bvec->bv_page); 
        break;

我们知道，到了驱动，它会把磁盘或者块设备中数据拷贝到bv_page，而这个指针就是我们前面关联的。

另外一种方式的《4.Linux-块设备驱动(详解)》是只有request的，其实也是一样的，只是封装格式不同而已。

至于最后的

然后是提交这个bio去申请数据ext4_submit_bio_read

最后会到submit_bio这个函数，后面我们会分析。

但是这里是有疑问的：这里每个page进行一次submit_bio，这个就不对了吧？每个bio是有链表的，很明显这里没有连接在一块。

其实应该猜到了，会在submit_bio 的过程中连接在一块的。

submit_bio

/*
 * ext4_submit_bio_read - submit @bio as a READ, emitting the
 * android_fs_dataread_start trace event first when that tracepoint is
 * enabled and the bio's first vector entry has a page.
 */
static void
ext4_submit_bio_read(struct bio *bio)
{
  struct page *head = NULL;

  /* The bio's vector is only looked at when tracing is enabled. */
  if (trace_android_fs_dataread_start_enabled())
    head = bio->bi_io_vec[0].bv_page;

  if (head != NULL) {
    trace_android_fs_dataread_start(
      head->mapping->host,
      page_offset(head),
      bio->bi_iter.bi_size,
      current->pid,
      current->comm);
  }

  submit_bio(READ, bio);
}

上次我们分析到这里ext4_submit_bio_read
后面就到了submit_bio.

block/blk-core.c
/*
 * submit_bio - account for a bio and hand it to generic_make_request().
 * @rw:  request flags, OR-ed into bio->bi_rw before anything else
 * @bio: the bio to submit
 *
 * For bios that carry data, the sector count is added to the PGPGOUT
 * (write) or PGPGIN (read) VM event counters; reads are also charged to the
 * task via task_io_account_read(). When block_dump is set, a debug line
 * describing the request is printed.
 *
 * Returns whatever generic_make_request() returns.
 */
blk_qc_t submit_bio(int rw, struct bio *bio)
{
  unsigned int nr_sectors;

  bio->bi_rw |= rw;

  /* Nothing to account for when the bio has no data. */
  if (!bio_has_data(bio))
    return generic_make_request(bio);

  /* WRITE_SAME is counted as one logical block, in 2^9-byte units. */
  nr_sectors = unlikely(rw & REQ_WRITE_SAME) ?
    bdev_logical_block_size(bio->bi_bdev) >> 9 :
    bio_sectors(bio);

  if (!(rw & WRITE)) {
    task_io_account_read(bio->bi_iter.bi_size);
    count_vm_events(PGPGIN, nr_sectors);
  } else {
    count_vm_events(PGPGOUT, nr_sectors);
  }

  mtk_btag_pidlog_submit_bio(bio);

  if (unlikely(block_dump)) {
    char name[BDEVNAME_SIZE];
    const char *direction = (rw & WRITE) ? "WRITE" : "READ";

    printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
      current->comm, task_pid_nr(current),
      direction,
      (unsigned long long)bio->bi_iter.bi_sector,
      bdevname(bio->bi_bdev, name),
      nr_sectors);
  }

  return generic_make_request(bio);
}

这里就直接到generic_make_request，

blk_qc_t generic_make_request(struct bio *bio)
{
  struct bio_list bio_list_on_stack;
  blk_qc_t ret = BLK_QC_T_NONE;

  if (!generic_make_request_checks(bio))
    goto out;

  if (current->bio_list) {
    bio_list_add(current->bio_list, bio);
    goto out;
  }

  BUG_ON(bio->bi_next);
  bio_list_init(&bio_list_on_stack);
  current->bio_list = &bio_list_on_stack;
  do {
    struct request_queue *q = bdev_get_queue(bio->bi_bdev);

    if (likely(blk_queue_enter(q, __GFP_DIRECT_RECLAIM) == 0)) {

      ret = q->make_request_fn(q, bio);

      blk_queue_exit(q);

      bio = bio_list_pop(current->bio_list);
    } else {
      struct bio *bio_next = bio_list_pop(current->bio_list);

      bio_io_error(bio);
      bio = bio_next;
    }
  } while (bio);
  current->bio_list = NULL; /* deactivate */

out:
  return ret;
}