A Brief Look at How the Linux Kernel Reads a File

Author: YouChuang

Study notes on the basic principles of the block I/O layer, organized around the kernel's file-read path from the system call down to the driver.



Commonly used data structures

The file structure

913 struct file {
914 /*
915 * fu_list becomes invalid after file_free is called and queued via
916 * fu_rcuhead for RCU freeing
917 */
918 union {
919 struct list_head fu_list;
920 struct rcu_head fu_rcuhead;
921 } f_u;
922 struct path f_path;
923 #define f_dentry f_path.dentry
924 #define f_vfsmnt f_path.mnt
925 const struct file_operations *f_op; // table of methods that operate on this file
926 spinlock_t f_lock; /* f_ep_links, f_flags, no IRQ */
927 atomic_long_t f_count;
928 unsigned int f_flags;
929 fmode_t f_mode; // access mode (FMODE_READ/FMODE_WRITE, ...)
930 loff_t f_pos;
931 struct fown_struct f_owner;
932 const struct cred *f_cred;
933 struct file_ra_state f_ra;
934
935 u64 f_version;
936 #ifdef CONFIG_SECURITY
937 void *f_security;
938 #endif
939 /* needed for tty driver, and maybe others */
940 void *private_data;
941
942 #ifdef CONFIG_EPOLL
943 /* Used by fs/eventpoll.c to link all the hooks to this file */
944 struct list_head f_ep_links;
945 #endif /* #ifdef CONFIG_EPOLL */
946 struct address_space *f_mapping;
947 #ifdef CONFIG_DEBUG_WRITECOUNT
948 unsigned long f_mnt_write_state;
949 #endif
950 };
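
Two fields matter most for the rest of this walkthrough: f_op is the table of filesystem-specific methods (read, aio_read, ...) and f_pos is the current offset of this open file, which sys_read() reads and writes back. As a quick illustration (my own user-space sketch, not from the kernel source; the file path is just an example), descriptors created by dup() share one struct file and therefore one f_pos:

/* Two descriptors from dup() share a struct file, hence a single f_pos. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char c;
	int fd = open("/etc/hostname", O_RDONLY);  /* any readable file works */
	int fd2 = dup(fd);                         /* same struct file, same f_pos */

	read(fd, &c, 1);                           /* advances the shared offset to 1 */
	printf("offset seen through fd2: %lld\n",
	       (long long)lseek(fd2, 0, SEEK_CUR));

	close(fd2);
	close(fd);
	return 0;
}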

The read path, step by step

VFS

1. The read begins with the sys_read() system call.
Its declaration:
asmlinkage long sys_read(unsigned int fd, char __user *buf, size_t count);
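
Every read() in user space ends up here. A minimal sketch that enters the kernel through the raw system call number (glibc's read() wrapper normally does this for us):

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	/* fd 0 is stdin; this lands in sys_read(0, buf, sizeof(buf)) */
	long n = syscall(SYS_read, 0, buf, sizeof(buf));
	printf("sys_read returned %ld\n", n);
	return 0;
}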

2. After expansion of the SYSCALL_DEFINE3 macro:
/* Why the extra commas? SYSCALL_DEFINE3 receives each parameter as a separate (type, name) pair of macro arguments, from which it rebuilds the parameter list and the syscall metadata. */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
struct file *file;
ssize_t ret = -EBADF;
int fput_needed;
/* Lightweight file lookup - no refcnt increment if fd table isn't shared.
 * Look up the struct file for fd in the process's file table; if there is none, return -EBADF.
 */
file = fget_light(fd, &fput_needed);
if (file) {
	loff_t pos = file_pos_read(file);       // read the current file offset (f_pos)
	ret = vfs_read(file, buf, count, &pos); // read the file contents; see below
	file_pos_write(file, pos);              // write the updated offset back
	fput_light(file, fput_needed);
}

 return ret;

}

3. vfs_read() is called
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
279 {
280 ssize_t ret;
281
282 if (!(file->f_mode & FMODE_READ))
283 return -EBADF; // the file was not opened for reading
284 if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
285 return -EINVAL; // the file has no read method
286 if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
287 return -EFAULT;
288
289 ret = rw_verify_area(READ, file, pos, count); // check that the range may be read (permissions, mandatory locks)
if (ret >= 0) {
      count = ret;
      /*
       * file->f_op is the function-pointer table the concrete filesystem registers with the VFS;
       * it comes from the corresponding inode->i_fop, which the filesystem type set when the inode
       * was created. For ext3, .read is do_sync_read.
       */
      if (file->f_op->read)
           ret = file->f_op->read(file, buf, count, pos);
      else
           ret = do_sync_read(file, buf, count, pos);
      if (ret > 0) {
           fsnotify_access(file->f_path.dentry); // notify that the file was accessed
           add_rchar(current, ret);              // update the task's I/O accounting
      }
      inc_syscr(current);
 }

 return ret;

}

EXPORT_SYMBOL(vfs_read);
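
vfs_read() is also the entry point for the old "read a file from kernel context" trick used by many 2.6-era modules. A minimal sketch under that assumption (the function name is mine; the set_fs(KERNEL_DS) interface has since been removed from mainline):

#include <linux/err.h>
#include <linux/fcntl.h>
#include <linux/fs.h>
#include <linux/uaccess.h>

/* Sketch: read the first bytes of a file from kernel context (2.6-era API).
 * set_fs(KERNEL_DS) makes vfs_read() accept a kernel buffer where it expects
 * a __user pointer. */
static ssize_t read_file_head(const char *path, char *buf, size_t len)
{
	struct file *filp;
	mm_segment_t old_fs;
	loff_t pos = 0;
	ssize_t ret;

	filp = filp_open(path, O_RDONLY, 0);
	if (IS_ERR(filp))
		return PTR_ERR(filp);

	old_fs = get_fs();
	set_fs(KERNEL_DS);
	ret = vfs_read(filp, (char __user *)buf, len, &pos);
	set_fs(old_fs);

	filp_close(filp, NULL);
	return ret;
}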

* The ext3 filesystem

The file operation table defined by the concrete filesystem, here ext3:
55 const struct file_operations ext3_file_operations = {
56 .llseek = generic_file_llseek,
57 .read = do_sync_read,
58 .write = do_sync_write,
59 .aio_read = generic_file_aio_read,
60 .aio_write = generic_file_aio_write,
61 .unlocked_ioctl = ext3_ioctl,
62 #ifdef CONFIG_COMPAT
63 .compat_ioctl = ext3_compat_ioctl,
64 #endif
65 .mmap = generic_file_mmap,
66 .open = dquot_file_open,
67 .release = ext3_release_file,
68 .fsync = ext3_sync_file,
69 .splice_read = generic_file_splice_read,
70 .splice_write = generic_file_splice_write,
71 };

4. do_sync_read() is called next
ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
for (;;) {
/* For most filesystems ->aio_read is generic_file_aio_read, the asynchronous counterpart of the .read path above */
ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
if (ret != -EIOCBRETRY)
break;
wait_on_retry_sync_kiocb(&kiocb);
}
269
270 if (-EIOCBQUEUED == ret)
271 ret = wait_on_sync_kiocb(&kiocb);
272 *ppos = kiocb.ki_pos;
273 return ret;
274 }

5. generic_file_aio_read() is called

ssize_t
generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
1264 struct file *filp = iocb->ki_filp;
1265 ssize_t retval;
1266 unsigned long seg; /* index over the iovec segments */
1267 size_t count;
1268 loff_t *ppos = &iocb->ki_pos;
1269
1270 count = 0;
/* Performs necessary checks before doing a write:
 * verify that each user-space buffer segment is writable
 */
1271 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1272 if (retval)
1273 return retval;
1274
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT:
 * O_DIRECT bypasses the page cache and performs I/O directly against the device
 */
if (filp->f_flags & O_DIRECT) {
1277 loff_t size;
1278 struct address_space *mapping;
1279 struct inode *inode;
1280
1281 mapping = filp->f_mapping;
1282 inode = mapping->host;
1283 if (!count)
1284 goto out; /* skip atime */
1285 size = i_size_read(inode);
if (pos < size) {
/* flush any pages cached for this range back to disk first, to avoid inconsistency */
retval = filemap_write_and_wait_range(mapping, pos,
pos + iov_length(iov, nr_segs) - 1);
if (!retval) {
/* issue the direct read */
retval = mapping->a_ops->direct_IO(READ, iocb,
iov, pos, nr_segs);
}
if (retval > 0)
*ppos = pos + retval;
if (retval) {
file_accessed(filp);
goto out;
}
}
}

 for (seg = 0; seg < nr_segs; seg++) {

1303 read_descriptor_t desc;
1304
1305 desc.written = 0;
1306 desc.arg.buf = iov[seg].iov_base;
1307 desc.count = iov[seg].iov_len;
1308 if (desc.count == 0)
1309 continue;
1310 desc.error = 0;
do_generic_file_read(filp, ppos, &desc, file_read_actor);
1312 retval += desc.written;
1313 if (desc.error) {
1314 retval = retval ?: desc.error;
1315 break;
1316 }
1317 if (desc.count > 0)
1318 break;
1319 }
1320 out:
1321 return retval;
}
EXPORT_SYMBOL(generic_file_aio_read);
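
The O_DIRECT branch above goes straight to ->direct_IO and bypasses the page cache entirely; the price is that the user buffer, offset and length generally have to be aligned to the device's logical block size. A user-space sketch (the 4096-byte alignment is an assumption that holds for most disks):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* Read 4 KB with O_DIRECT: the request is served by ->direct_IO, not the page cache. */
int main(int argc, char **argv)
{
	void *buf;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY | O_DIRECT);
	if (fd < 0 || posix_memalign(&buf, 4096, 4096) != 0)
		return 1;

	printf("read %zd bytes\n", read(fd, buf, 4096));

	free(buf);
	close(fd);
	return 0;
}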

6. do_generic_file_read() is called
http://blog.chinaunix.net/uid-28236237-id-4030381.html
965 static void do_generic_file_read(struct file *filp, loff_t *ppos,
966 read_descriptor_t *desc, read_actor_t actor)
967 {

984 for (;;) {
985 struct page *page;
986 pgoff_t end_index;
987 loff_t isize;
988 unsigned long nr, ret;
989
990 cond_resched();
991 find_page:
/* check whether the page is already in the page cache; if so, return its descriptor */
992 page = find_get_page(mapping, index);
993 if (!page) {
/* page not cached: issue a synchronous readahead request; readahead very likely pulls the data into the cache, but the find_get_page() retry below may still miss */
994 page_cache_sync_readahead(mapping,
995 ra, filp,
996 index, last_index - index);

997 page = find_get_page(mapping, index);
998 if (unlikely(page == NULL))
// unlikely(): the page is most likely cached by now; if it is still missing, jump to no_cached_page
999 goto no_cached_page;
1000 }
/* the page is in the cache now; if it carries the readahead marker, kick off asynchronous readahead */
1001 if (PageReadahead(page)) {
1002 page_cache_async_readahead(mapping,
1003 ra, filp, page,
1004 index, last_index - index);
1005 }
/* next, check whether the page contents are up to date; if not, handle it via page_not_up_to_date */
1006 if (!PageUptodate(page)) {
1007 if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
1008 !mapping->a_ops->is_partially_uptodate)
1009 goto page_not_up_to_date;
1010 if (!trylock_page(page))
1011 goto page_not_up_to_date;
1012 if (!mapping->a_ops->is_partially_uptodate(page,
1013 desc, offset))
1014 goto page_not_up_to_date_locked;
1015 unlock_page(page);
1016 }

1017 page_ok: // copy the page contents to user space

1027 isize = i_size_read(inode);
1028 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1029 if (unlikely(!isize || index > end_index)) {
1030 page_cache_release(page);
1031 goto out;
1032 }
1033
1034 /* nr is the maximum number of bytes to copy from this page */
1035 nr = PAGE_CACHE_SIZE; //4KB or less
1036 if (index == end_index) {
1037 nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1038 if (nr <= offset) {
1039 page_cache_release(page);
1040 goto out;
1041 }
1042 }
1043 nr = nr - offset;
1044
1045 /* If users can be writing to this page using arbitrary
1046 * virtual addresses, take care about potential aliasing
1047 * before reading the page on the kernel side.
1048 */
1049 if (mapping_writably_mapped(mapping))
1050 flush_dcache_page(page);
1051
1052 /*
1053 * When a sequential read accesses a page several times,
1054 * only mark it as accessed the first time.
1055 */
1056 if (prev_index != index || offset != prev_offset)
1057 mark_page_accessed(page);
1058 prev_index = index;

1070 ret = actor(desc, page, offset, nr);
1071 offset += ret;
1072 index += offset >> PAGE_CACHE_SHIFT;
1073 offset &= ~PAGE_CACHE_MASK;
1074 prev_offset = offset;
1075
1076 page_cache_release(page);
1077 if (ret == nr && desc->count)
1078 continue;
1079 goto out;
1080
1081 page_not_up_to_date:
1082 /* Get exclusive access to the page ... */
1083 error = lock_page_killable(page);
1084 if (unlikely(error))
1085 goto readpage_error;
1086
1087 page_not_up_to_date_locked:
1088 /* Did it get truncated before we got the lock? */
1089 if (!page->mapping) {
1090 unlock_page(page);
1091 page_cache_release(page);
1092 continue;
1093 }
1094
1095 /* Did somebody else fill it already? */
1096 if (PageUptodate(page)) {
1097 unlock_page(page);
1098 goto page_ok;
1099 }
1100
1101 readpage: // the page data is missing or stale: read it from disk
1102 /* Start the actual read. The read will unlock the page. */
/* ->readpage issues the disk read and is filesystem-specific; see the analysis below */
1103 error = mapping->a_ops->readpage(filp, page);
1104
1105 if (unlikely(error)) {
1106 if (error == AOP_TRUNCATED_PAGE) {
1107 page_cache_release(page);
1108 goto find_page;
1109 }
1110 goto readpage_error;
1111 }
1112
1113 if (!PageUptodate(page)) {
1114 error = lock_page_killable(page);
1115 if (unlikely(error))
1116 goto readpage_error;
1117 if (!PageUptodate(page)) {
1118 if (page->mapping == NULL) {
1119 /*
1120 * invalidate_mapping_pages got it
1121 */
1122 unlock_page(page);
1123 page_cache_release(page);
1124 goto find_page;
1125 }
1126 unlock_page(page);
1127 shrink_readahead_size_eio(filp, ra);
1128 error = -EIO;
1129 goto readpage_error;
1130 }
1131 unlock_page(page);
1132 }
1133
1134 goto page_ok;
1135
1136 readpage_error:
1137 /* UHHUH! A synchronous read error occurred. Report it */
1138 desc->error = error;
1139 page_cache_release(page);
1140 goto out;
1141
1142 no_cached_page: // the page is not in the cache: allocate a new page first, then read it from disk
1143 /*
1144 * Ok, it wasn't cached, so we need to create a new
1145 * page: allocate one, read it in from disk, and insert it into the page cache
1146 */
1147 page = page_cache_alloc_cold(mapping); // allocate a page that is about to join the page cache
1148 if (!page) {
1149 desc->error = -ENOMEM;
1150 goto out;
1151 }
1152 error = add_to_page_cache_lru(page, mapping,
1153 index, GFP_KERNEL);
1154 if (error) {
1155 page_cache_release(page);
1156 if (error == -EEXIST)
1157 goto find_page;
1158 desc->error = error;
1159 goto out;
1160 }
1161 goto readpage; // go back and read the page from disk
1162 }
1163
1164 out:
1165 ra->prev_pos = prev_index;
1166 ra->prev_pos <<= PAGE_CACHE_SHIFT;
1167 ra->prev_pos |= prev_offset;
1168
1169 *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
1170 file_accessed(filp);
1171 }
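
The page_cache_sync_readahead()/page_cache_async_readahead() calls above are why sequential reads usually find their pages already cached. User space can drive the same machinery explicitly; a small sketch using posix_fadvise() (the function name prefetch() is mine):

#include <fcntl.h>
#include <unistd.h>

/* Hint the kernel to populate the page cache for a file we are about to read,
 * so the find_get_page() lookups in do_generic_file_read() hit instead of miss. */
int prefetch(const char *path, off_t len)
{
	int fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;

	posix_fadvise(fd, 0, len, POSIX_FADV_SEQUENTIAL); /* widen the readahead window */
	posix_fadvise(fd, 0, len, POSIX_FADV_WILLNEED);   /* start reading pages now */

	close(fd);
	return 0;
}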

find_get_page() is called
616 struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
617 {
618 void **pagep;
619 struct page *page;
620
621 rcu_read_lock();
622 repeat:
623 page = NULL;
624 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
    /* ... (slot dereference and speculative page refcount handling elided) ... */
642 }
643 rcu_read_unlock();
644
645 return page;
646 }
647 EXPORT_SYMBOL(find_get_page);

page_cache_alloc_cold() is called
216 static inline struct page *page_cache_alloc_cold(struct address_space *x)
217 {
218 return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD);
219 }

461 #ifdef CONFIG_NUMA
462 struct page *__page_cache_alloc(gfp_t gfp)
463 {
/*
 * If the current task's cpuset has page spreading enabled, the new page may be spread
 * to other NUMA nodes; otherwise alloc_pages() allocates on the node chosen by
 * current's memory policy, with no spreading.
 */
464 if (cpuset_do_page_mem_spread()) {
465 int n = cpuset_mem_spread_node();
466 return alloc_pages_exact_node(n, gfp, 0);
467 }
468 return alloc_pages(gfp, 0);
469 }
470 EXPORT_SYMBOL(__page_cache_alloc);
471 #endif

alloc_pages() is called
302 static inline struct page *
303 alloc_pages(gfp_t gfp_mask, unsigned int order)
304 {
305 return alloc_pages_current(gfp_mask, order);
306 }

alloc_pages_current() is called

1729 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1730 {
1731 struct mempolicy *pol = current->mempolicy;
1732
1733 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1734 pol = &default_policy;
1735
1736 /*
1737 * No reference counting needed for current->mempolicy
1738 * nor system default_policy
1739 */
1740 if (pol->mode == MPOL_INTERLEAVE)
1741 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1742 return __alloc_pages_nodemask(gfp, order,
1743 policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1744 }
1745 EXPORT_SYMBOL(alloc_pages_current);
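
alloc_pages() is the generic page allocator that everything above funnels into. A hedged kernel-module-style sketch of using it directly (the demo function name is mine):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/string.h>

/* Allocate one order-0 page the same way __page_cache_alloc() ultimately does,
 * touch it through its kernel mapping, then free it. */
static int page_alloc_demo(void)
{
	struct page *page = alloc_pages(GFP_KERNEL, 0);

	if (!page)
		return -ENOMEM;

	memset(page_address(page), 0, PAGE_SIZE); /* GFP_KERNEL pages are in lowmem, directly mapped */
	__free_pages(page, 0);
	return 0;
}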

add_to_page_cache_lru() is called
436 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
437 pgoff_t offset, gfp_t gfp_mask)
438 {
439 int ret;
440
441 /*
442 * Splice_read and readahead add shmem/tmpfs pages into the page cache
443 * before shmem_readpage has a chance to mark them as SwapBacked: they
444 * need to go on the anon lru below, and mem_cgroup_cache_charge
445 * (called in add_to_page_cache) needs to know where they're going too.
446 */
447 if (mapping_cap_swap_backed(mapping))
448 SetPageSwapBacked(page);
449
450 ret = add_to_page_cache(page, mapping, offset, gfp_mask);
451 if (ret == 0) {
452 if (page_is_file_cache(page))
453 lru_cache_add_file(page);
454 else
455 lru_cache_add_anon(page);
456 }
457 return ret;
458 }
459 EXPORT_SYMBOL_GPL(add_to_page_cache_lru);

add_to_page_cache() is called
http://lxr.free-electrons.com/source/include/linux/pagemap.h?v=2.6.34#L443
439 /*
440 * Like add_to_page_cache_locked, but used to add newly allocated pages:
441 * the page is new, so we can just run __set_page_locked() against it.
442 */
443 static inline int add_to_page_cache(struct page *page,
444 struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
445 {
446 int error;
447
448 __set_page_locked(page); // sets PG_locked via __set_bit()
449 error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
450 if (unlikely(error))
451 __clear_page_locked(page);
452 return error;
453 }
454
455 #endif /* _LINUX_PAGEMAP_H */

add_to_page_cache_locked() is called

396 int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
397 pgoff_t offset, gfp_t gfp_mask)
398 {
399 int error;
400
401 VM_BUG_ON(!PageLocked(page));
402
403 error = mem_cgroup_cache_charge(page, current->mm,
404 gfp_mask & GFP_RECLAIM_MASK);
405 if (error)
406 goto out;
407
408 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
409 if (error == 0) {
410 page_cache_get(page);
411 page->mapping = mapping;
412 page->index = offset;
413
414 spin_lock_irq(&mapping->tree_lock);
415 error = radix_tree_insert(&mapping->page_tree, offset, page);
416 if (likely(!error)) {
417 mapping->nrpages++;
418 __inc_zone_page_state(page, NR_FILE_PAGES);
419 if (PageSwapBacked(page))
420 __inc_zone_page_state(page, NR_SHMEM);
421 spin_unlock_irq(&mapping->tree_lock);
422 } else {
423 page->mapping = NULL;
424 spin_unlock_irq(&mapping->tree_lock);
425 mem_cgroup_uncharge_cache_page(page);
426 page_cache_release(page);
427 }
428 radix_tree_preload_end();
429 } else
430 mem_cgroup_uncharge_cache_page(page);
431 out:
432 return error;
433 }
434 EXPORT_SYMBOL(add_to_page_cache_locked);
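
The page cache index behind all of this is a per-address_space radix tree keyed by the page offset; add_to_page_cache_locked() shows the canonical pattern of preloading tree nodes while sleeping is still allowed and then inserting under mapping->tree_lock. A standalone sketch of that pattern (the demo_ names are mine):

#include <linux/radix-tree.h>
#include <linux/spinlock.h>

static RADIX_TREE(demo_tree, GFP_ATOMIC);   /* plays the role of mapping->page_tree */
static DEFINE_SPINLOCK(demo_lock);          /* plays the role of mapping->tree_lock */

/* Preallocate radix-tree nodes while we may still sleep, then insert under the lock. */
static int demo_insert(unsigned long index, void *item)
{
	int err = radix_tree_preload(GFP_KERNEL);

	if (err)
		return err;

	spin_lock_irq(&demo_lock);
	err = radix_tree_insert(&demo_tree, index, item);
	spin_unlock_irq(&demo_lock);

	radix_tree_preload_end();
	return err;
}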

ext3 handles data pages in three ways, so there are three address_space_operations tables: ordered (the default), writeback, and journal.
http://lxr.oss.org.cn/source/fs/ext3/inode.c?v=2.6.34#L1848
1848 static const struct address_space_operations ext3_ordered_aops = {
1849 .readpage = ext3_readpage,
1850 .readpages = ext3_readpages,
1851 .writepage = ext3_ordered_writepage,
1852 .sync_page = block_sync_page,
1853 .write_begin = ext3_write_begin,
1854 .write_end = ext3_ordered_write_end,
1855 .bmap = ext3_bmap,
1856 .invalidatepage = ext3_invalidatepage,
1857 .releasepage = ext3_releasepage,
1858 .direct_IO = ext3_direct_IO,
1859 .migratepage = buffer_migrate_page,
1860 .is_partially_uptodate = block_is_partially_uptodate,
1861 .error_remove_page = generic_error_remove_page,
1862 };

1864 static const struct address_space_operations ext3_writeback_aops = {
1865 .readpage = ext3_readpage,
1866 .readpages = ext3_readpages,
1867 .writepage = ext3_writeback_writepage,
1868 .sync_page = block_sync_page,
1869 .write_begin = ext3_write_begin,
1870 .write_end = ext3_writeback_write_end,
1871 .bmap = ext3_bmap,
1872 .invalidatepage = ext3_invalidatepage,
1873 .releasepage = ext3_releasepage,
1874 .direct_IO = ext3_direct_IO,
1875 .migratepage = buffer_migrate_page,
1876 .is_partially_uptodate = block_is_partially_uptodate,
1877 .error_remove_page = generic_error_remove_page,
1878 };
1879

1880 static const struct address_space_operations ext3_journalled_aops = {
1881 .readpage = ext3_readpage,
1882 .readpages = ext3_readpages,
1883 .writepage = ext3_journalled_writepage,
1884 .sync_page = block_sync_page,
1885 .write_begin = ext3_write_begin,
1886 .write_end = ext3_journalled_write_end,
1887 .set_page_dirty = ext3_journalled_set_page_dirty,
1888 .bmap = ext3_bmap,
1889 .invalidatepage = ext3_invalidatepage,
1890 .releasepage = ext3_releasepage,
1891 .is_partially_uptodate = block_is_partially_uptodate,
1892 .error_remove_page = generic_error_remove_page,
1893 };
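
Which of the three tables an ext3 inode ends up with is decided by the data journaling mode chosen at mount time: data=ordered (the default), data=writeback, or data=journal. A user-space sketch selecting the journal mode via mount(2) (device and mount point are placeholders):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* data=journal -> ext3_journalled_aops; data=writeback -> ext3_writeback_aops;
	 * the default data=ordered -> ext3_ordered_aops */
	if (mount("/dev/sdb1", "/mnt/data", "ext3", 0, "data=journal") != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}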

* The page cache layer

ext3_readpage() simply wraps mpage_readpage():
http://lxr.oss.org.cn/source/fs/ext3/inode.c?v=2.6.34#L1706
1706 static int ext3_readpage(struct file *file, struct page *page)
1707 {
1708 return mpage_readpage(page, ext3_get_block);
1709 }

7. mpage_readpage() is called

410 int mpage_readpage(struct page *page, get_block_t get_block)
411 {
412 struct bio *bio = NULL;
413 sector_t last_block_in_bio = 0;
414 struct buffer_head map_bh;
415 unsigned long first_logical_block = 0;
416
417 map_bh.b_state = 0;
418 map_bh.b_size = 0;
419 bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
420 &map_bh, &first_logical_block, get_block);
421 if (bio)
/* submit the bio assembled for this page to the block layer */
422 mpage_bio_submit(READ, bio);
423 return 0;
424 }
425 EXPORT_SYMBOL(mpage_readpage);

8. do_mpage_readpage() is called

168 static struct bio *
169 do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
170 sector_t *last_block_in_bio, struct buffer_head *map_bh,
171 unsigned long *first_logical_block, get_block_t get_block)
172 {

189 if (page_has_buffers(page))
190 goto confused;
191
192 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
193 last_block = block_in_file + nr_pages * blocks_per_page;
194 last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
195 if (last_block > last_block_in_file)
196 last_block = last_block_in_file;
197 page_block = 0;
198
199 /*
200 * Map blocks using the result from the previous get_blocks call first.
201 */
202 nblocks = map_bh->b_size >> blkbits;
203 if (buffer_mapped(map_bh) && block_in_file > *first_logical_block &&
204 block_in_file < (*first_logical_block + nblocks)) {
205 unsigned map_offset = block_in_file - *first_logical_block;
206 unsigned last = nblocks - map_offset;
207
208 for (relative_block = 0; ; relative_block++) {
209 if (relative_block == last) {
210 clear_buffer_mapped(map_bh);
211 break;
212 }
213 if (page_block == blocks_per_page)
214 break;
215 blocks[page_block] = map_bh->b_blocknr + map_offset +
216 relative_block;
217 page_block++;
218 block_in_file++;
219 }
220 bdev = map_bh->b_bdev;
221 }
222
223 /*
224 * Then do more get_blocks calls until we are done with this page.
225 */
226 map_bh->b_page = page;
227 while (page_block < blocks_per_page) {
228 map_bh->b_state = 0;
229 map_bh->b_size = 0;
230
231 if (block_in_file < last_block) {
232 map_bh->b_size = (last_block-block_in_file) << blkbits;
233 if (get_block(inode, block_in_file, map_bh, 0))
234 goto confused;
235 *first_logical_block = block_in_file;
236 }
237
238 if (!buffer_mapped(map_bh)) {
239 fully_mapped = 0;
240 if (first_hole == blocks_per_page)
241 first_hole = page_block;
242 page_block++;
243 block_in_file++;
244 continue;
245 }

253 if (buffer_uptodate(map_bh)) {
254 map_buffer_to_page(page, map_bh, page_block);
255 goto confused;
256 }
257
258 if (first_hole != blocks_per_page)
259 goto confused; /* hole -> non-hole */
260
/* Contiguous blocks? Contiguous blocks are batched into a single bio */
262 if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
263 goto confused;
264 nblocks = map_bh->b_size >> blkbits;
265 for (relative_block = 0; ; relative_block++) {
266 if (relative_block == nblocks) {
267 clear_buffer_mapped(map_bh);
268 break;
269 } else if (page_block == blocks_per_page)
270 break;
271 blocks[page_block] = map_bh->b_blocknr+relative_block;
272 page_block++;
273 block_in_file++;
274 }
275 bdev = map_bh->b_bdev;
276 }
277
278 if (first_hole != blocks_per_page) {
279 zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE);
280 if (first_hole == 0) {
281 SetPageUptodate(page);
282 unlock_page(page);
283 goto out;
284 }
285 } else if (fully_mapped) {
286 SetPageMappedToDisk(page);
287 }
288
289 /*
290 * This page will go to BIO. Do we need to send this BIO off first?
291 */
292 if (bio && (*last_block_in_bio != blocks[0] - 1))
293 bio = mpage_bio_submit(READ, bio);
294
295 alloc_new:
296 if (bio == NULL) {
/* allocate a new bio whose starting sector is derived from blocks[0] */
297 bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
298 min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
299 GFP_KERNEL);
300 if (bio == NULL)
301 goto confused;
302 }
303
304 length = first_hole << blkbits;
305 if (bio_add_page(bio, page, length, 0) < length) {
/* the page does not fit into this bio: submit it and build a new one */
306 bio = mpage_bio_submit(READ, bio);
307 goto alloc_new;
308 }
309
310 relative_block = block_in_file - *first_logical_block;
311 nblocks = map_bh->b_size >> blkbits;
312 if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
313 (first_hole != blocks_per_page))
314 bio = mpage_bio_submit(READ, bio);
315 else
316 *last_block_in_bio = blocks[blocks_per_page - 1];
317 out:
318 return bio;
319 /* non-contiguous blocks fall back to block_read_full_page(), which reads the page block by block via buffer heads */
320 confused:
321 if (bio)
322 bio = mpage_bio_submit(READ, bio);
323 if (!PageUptodate(page))
324 block_read_full_page(page, get_block);
325 else
326 unlock_page(page);
327 goto out;
328 }

9. mpage_bio_submit() is called

86 static struct bio *mpage_bio_submit(int rw, struct bio *bio)
87 {
88 bio->bi_end_io = mpage_end_io_read;
89 if (rw == WRITE)
90 bio->bi_end_io = mpage_end_io_write;
91 submit_bio(rw, bio);
92 return NULL;
93 }
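
mpage_bio_submit() boils down to: attach a completion callback and hand the bio to submit_bio(). For comparison, here is a hedged sketch (2.6.3x-era API, demo_ names are mine) of building and submitting a single-page read bio by hand; locking conventions and error handling are simplified:

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* Completion callback: runs when the device finishes the I/O. */
static void demo_end_io(struct bio *bio, int error)
{
	struct page *page = bio->bi_io_vec[0].bv_page;

	if (!error)
		SetPageUptodate(page);
	unlock_page(page);
	bio_put(bio);
}

/* Read one page starting at 'sector' from 'bdev' into 'page' (page must be locked). */
static int demo_read_page(struct block_device *bdev, sector_t sector, struct page *page)
{
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);

	if (!bio)
		return -ENOMEM;

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_end_io = demo_end_io;

	submit_bio(READ, bio);	/* continues into generic_make_request(), as analysed below */
	return 0;
}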

10. submit_bio() is called

1539 /**
1540 * submit_bio - submit a bio to the block device layer for I/O
1541 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
1542 * @bio: The &struct bio which describes the I/O
1543 *
1544 * submit_bio() is very similar in purpose to generic_make_request(), and
1545 * uses that function to do most of the work. Both are fairly rough
1546 * interfaces; @bio must be presetup and ready for I/O.
1547 *
1548 */
1549 void submit_bio(int rw, struct bio *bio)
1550 {
1551 int count = bio_sectors(bio);
1552
1553 bio->bi_rw |= rw;
1554
1555 /*
1556 * If it's a regular read/write or a barrier with data attached,
1557 * go through the normal accounting stuff before submission.
1558 */
1559 if (bio_has_data(bio)) {
1560 if (rw & WRITE) {
1561 count_vm_events(PGPGOUT, count);
1562 } else {
1563 task_io_account_read(bio->bi_size);
1564 count_vm_events(PGPGIN, count);
1565 }
1566
1567 if (unlikely(block_dump)) {
1568 char b[BDEVNAME_SIZE];
1569 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
1570 current->comm, task_pid_nr(current),
1571 (rw & WRITE) ? "WRITE" : "READ",
1572 (unsigned long long)bio->bi_sector,
1573 bdevname(bio->bi_bdev, b));
1574 }
1575 }
1576
/*
 * generic_make_request() hands the bio over to the request queue maintained for the block device.
 */
1577 generic_make_request(bio);
1578 }
1579 EXPORT_SYMBOL(submit_bio);

* The block layer

11. generic_make_request() is called

1490 /*
1491 * We only want one ->make_request_fn to be active at a time,
1492 * else stack usage with stacked devices could be a problem.
1493 * So use current->bio_list to keep a list of requests
1494 * submitted by a make_request_fn function.
1495 * current->bio_list is also used as a flag to say if
1496 * generic_make_request is currently active in this task or not.
1497 * If it is NULL, then no make_request is active. If it is non-NULL,
1498 * then a make_request is active, and new requests should be added
1499 * at the tail
1500 */
1501 void generic_make_request(struct bio *bio)
1502 {
1503 struct bio_list bio_list_on_stack;
1504
1505 if (current->bio_list) {
1506 /* make_request is active */
1507 bio_list_add(current->bio_list, bio);
1508 return;
1509 }
1510 /* following loop may be a bit non-obvious, and so deserves some
1511 * explanation.
1512 * Before entering the loop, bio->bi_next is NULL (as all callers
1513 * ensure that) so we have a list with a single bio.
1514 * We pretend that we have just taken it off a longer list, so
1515 * we assign bio_list to a pointer to the bio_list_on_stack,
1516 * thus initialising the bio_list of new bios to be
1517 * added. __generic_make_request may indeed add some more bios
1518 * through a recursive call to generic_make_request. If it
1519 * did, we find a non-NULL value in bio_list and re-enter the loop
1520 * from the top. In this case we really did just take the bio
1521 * of the top of the list (no pretending) and so remove it from
1522 * bio_list, and call into __generic_make_request again.
1523 *
1524 * The loop was structured like this to make only one call to
1525 * __generic_make_request (which is important as it is large and
1526 * inlined) and to keep the structure simple.
1527 */
1528 BUG_ON(bio->bi_next);
1529 bio_list_init(&bio_list_on_stack);
1530 current->bio_list = &bio_list_on_stack;
1531 do {
1532 __generic_make_request(bio); // process this bio
1533 bio = bio_list_pop(current->bio_list);
1534 } while (bio);
1535 current->bio_list = NULL; /* deactivate */
1536 }
1537 EXPORT_SYMBOL(generic_make_request);

12. __generic_make_request() is called

1380 /**
1381 * generic_make_request - hand a buffer to its device driver for I/O
1382 * @bio: The bio describing the location in memory and on the device.
1383 *
1384 * generic_make_request() is used to make I/O requests of block
1385 * devices. It is passed a &struct bio, which describes the I/O that needs
1386 * to be done.
1387 *
1388 * generic_make_request() does not return any status. The
1389 * success/failure status of the request, along with notification of
1390 * completion, is delivered asynchronously through the bio->bi_end_io
1391 * function described (one day) else where.
1392 *
1393 * The caller of generic_make_request must make sure that bi_io_vec
1394 * are set to describe the memory buffer, and that bi_dev and bi_sector are
1395 * set to describe the device address, and the
1396 * bi_end_io and optionally bi_private are set to describe how
1397 * completion notification should be signaled.
1398 *
1399 * generic_make_request and the drivers it calls may use bi_next if this
1400 * bio happens to be merged with someone else, and may change bi_dev and
1401 * bi_sector for remaps as it sees fit. So the values of these fields
1402 * should NOT be depended on after the call to generic_make_request.
1403 */
1404 static inline void __generic_make_request(struct bio *bio)
1405 {
1406 struct request_queue *q;
1407 sector_t old_sector;
1408 int ret, nr_sectors = bio_sectors(bio);
1409 dev_t old_dev;
1410 int err = -EIO;
1411
1412 might_sleep();
1413
1414 if (bio_check_eod(bio, nr_sectors))
1415 goto end_io;
1416
1417 /*
1418 * Resolve the mapping until finished. (drivers are
1419 * still free to implement/resolve their own stacking
1420 * by explicitly returning 0)
1421 *
1422 * NOTE: we don't repeat the blk_size check for each new device.
1423 * Stacking drivers are expected to know what they are doing.
1424 */
1425 old_sector = -1;
1426 old_dev = 0;
1427 do {
1428 char b[BDEVNAME_SIZE];
1429
1430 q = bdev_get_queue(bio->bi_bdev);
1431 if (unlikely(!q)) {
1432 printk(KERN_ERR
1433 "generic_make_request: Trying to access "
1434 "nonexistent block-device %s (%Lu)\n",
1435 bdevname(bio->bi_bdev, b),
1436 (long long) bio->bi_sector);
1437 goto end_io;
1438 }
1439
1440 if (unlikely(!bio_rw_flagged(bio, BIO_RW_DISCARD) &&
1441 nr_sectors > queue_max_hw_sectors(q))) {
1442 printk(KERN_ERR "bio too big device %s (%u > %u)\n",
1443 bdevname(bio->bi_bdev, b),
1444 bio_sectors(bio),
1445 queue_max_hw_sectors(q));
1446 goto end_io;
1447 }
1448
1449 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
1450 goto end_io;
1451
1452 if (should_fail_request(bio))
1453 goto end_io;
1454
1455 /*
1456 * If this device has partitions, remap block n
1457 * of partition p to block n+start(p) of the disk.
1458 */
1459 blk_partition_remap(bio);
1460
1461 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
1462 goto end_io;
1463
1464 if (old_sector != -1)
1465 trace_block_remap(q, bio, old_dev, old_sector);
1466
1467 old_sector = bio->bi_sector;
1468 old_dev = bio->bi_bdev->bd_dev;
1469
1470 if (bio_check_eod(bio, nr_sectors))
1471 goto end_io;
1472
1473 if (bio_rw_flagged(bio, BIO_RW_DISCARD) &&
1474 !blk_queue_discard(q)) {
1475 err = -EOPNOTSUPP;
1476 goto end_io;
1477 }
1478
1479 trace_block_bio_queue(q, bio);
1480
1481 ret = q->make_request_fn(q, bio); // invoke the queue's make_request_fn (by default __make_request)
1482 } while (ret);
1483
1484 return;
1485
1486 end_io:
1487 bio_endio(bio, err);
1488 }

261 typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
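
Drivers that do not want the elevator at all (ramdisks, MD/DM stacking drivers) allocate a bare queue and install their own make_request_fn instead of the default __make_request. A hedged 2.6.3x-era sketch (the demo_ names are mine):

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Bio-based driver: no request structs, no elevator; every bio is handled here. */
static int demo_make_request(struct request_queue *q, struct bio *bio)
{
	/* ... move data between the bio's pages and the device here ... */
	bio_endio(bio, 0);	/* complete the bio immediately, with success */
	return 0;		/* 0 = consumed, nothing left for lower layers */
}

static struct request_queue *demo_create_queue(void)
{
	struct request_queue *q = blk_alloc_queue(GFP_KERNEL);

	if (q)
		blk_queue_make_request(q, demo_make_request);
	return q;
}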

* The I/O scheduler layer

__make_request() is called (the default make_request_fn)

40 static int __make_request(struct request_queue *q, struct bio *bio);

1153 static int __make_request(struct request_queue *q, struct bio *bio)
1154 {
1155 struct request *req;
1156 int el_ret;
1157 unsigned int bytes = bio->bi_size;
1158 const unsigned short prio = bio_prio(bio);
1159 const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
1160 const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG);
1161 const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
1162 int rw_flags;
1163
1164 if (bio_rw_flagged(bio, BIO_RW_BARRIER) &&
1165 (q->next_ordered == QUEUE_ORDERED_NONE)) {
1166 bio_endio(bio, -EOPNOTSUPP);
1167 return 0;
1168 }
1169 /*
1170 * low level driver can indicate that it wants pages above a
1171 * certain limit bounced to low memory (ie for highmem, or even
1172 * ISA dma in theory)
1173 */
1174 blk_queue_bounce(q, &bio);
1175
1176 spin_lock_irq(q->queue_lock);
1177
1178 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q))
1179 goto get_rq;
1180
1181 el_ret = elv_merge(q, &req, bio);
1182 switch (el_ret) {
1183 case ELEVATOR_BACK_MERGE:
1184 BUG_ON(!rq_mergeable(req));
1185
1186 if (!ll_back_merge_fn(q, req, bio))
1187 break;
1188
1189 trace_block_bio_backmerge(q, bio);
1190
1191 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1192 blk_rq_set_mixed_merge(req);
1193
1194 req->biotail->bi_next = bio;
1195 req->biotail = bio;
1196 req->__data_len += bytes;
1197 req->ioprio = ioprio_best(req->ioprio, prio);
1198 if (!blk_rq_cpu_valid(req))
1199 req->cpu = bio->bi_comp_cpu;
1200 drive_stat_acct(req, 0);
1201 if (!attempt_back_merge(q, req))
1202 elv_merged_request(q, req, el_ret);
1203 goto out;
1204
1205 case ELEVATOR_FRONT_MERGE:
1206 BUG_ON(!rq_mergeable(req));
1207
1208 if (!ll_front_merge_fn(q, req, bio))
1209 break;
1210
1211 trace_block_bio_frontmerge(q, bio);
1212
1213 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) {
1214 blk_rq_set_mixed_merge(req);
1215 req->cmd_flags &= ~REQ_FAILFAST_MASK;
1216 req->cmd_flags |= ff;
1217 }
1218
1219 bio->bi_next = req->bio;
1220 req->bio = bio;
1221
1222 /*
1223 * may not be valid. if the low level driver said
1224 * it didn't need a bounce buffer then it better
1225 * not touch req->buffer either...
1226 */
1227 req->buffer = bio_data(bio);
1228 req->__sector = bio->bi_sector;
1229 req->__data_len += bytes;
1230 req->ioprio = ioprio_best(req->ioprio, prio);
1231 if (!blk_rq_cpu_valid(req))
1232 req->cpu = bio->bi_comp_cpu;
1233 drive_stat_acct(req, 0);
1234 if (!attempt_front_merge(q, req))
1235 elv_merged_request(q, req, el_ret);
1236 goto out;
1237
1238 /* ELV_NO_MERGE: elevator says don't/can't merge. */
1239 default:
1240 ;
1241 }
1242
1243 get_rq:
1244 /*
1245 * This sync check and mask will be re-done in init_request_from_bio(),
1246 * but we need to set it earlier to expose the sync flag to the
1247 * rq allocator and io schedulers.
1248 */
1249 rw_flags = bio_data_dir(bio);
1250 if (sync)
1251 rw_flags |= REQ_RW_SYNC;
1252
1253 /*
1254 * Grab a free request. This is might sleep but can not fail.
1255 * Returns with the queue unlocked.
1256 */
1257 req = get_request_wait(q, rw_flags, bio);
1258
1259 /*
1260 * After dropping the lock and possibly sleeping here, our request
1261 * may now be mergeable after it had proven unmergeable (above).
1262 * We don't worry about that case for efficiency. It won't happen
1263 * often, and the elevators are able to handle it.
1264 */
1265 init_request_from_bio(req, bio);
1266
1267 spin_lock_irq(q->queue_lock);
1268 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
1269 bio_flagged(bio, BIO_CPU_AFFINE))
1270 req->cpu = blk_cpu_to_group(smp_processor_id());
1271 if (queue_should_plug(q) && elv_queue_empty(q))
1272 blk_plug_device(q);
1273 add_request(q, req);
1274 out:
1275 if (unplug || !queue_should_plug(q))
1276 __generic_unplug_device(q);
1277 spin_unlock_irq(q->queue_lock);
1278 return 0;
1279 }

* The driver layer

blk_init_queue() is called when the block driver sets up its request queue; this is what installs __make_request above as the queue's make_request_fn

527 /**
528 * blk_init_queue - prepare a request queue for use with a block device
529 * @rfn: The function to be called to process requests that have been
530 * placed on the queue.
531 * @lock: Request queue spin lock
532 *
533 * Description:
534 * If a block device wishes to use the standard request handling procedures,
535 * which sorts requests and coalesces adjacent requests, then it must
536 * call blk_init_queue(). The function @rfn will be called when there
537 * are requests on the queue that need to be processed. If the device
538 * supports plugging, then @rfn may not be called immediately when requests
539 * are available on the queue, but may be called at some time later instead.
540 * Plugged queues are generally unplugged when a buffer belonging to one
541 * of the requests on the queue is needed, or due to memory pressure.
542 *
543 * @rfn is not required, or even expected, to remove all requests off the
544 * queue, but only as many as it can handle at a time. If it does leave
545 * requests on the queue, it is responsible for arranging that the requests
546 * get dealt with eventually.
547 *
548 * The queue spin lock must be held while manipulating the requests on the
549 * request queue; this lock will be taken also from interrupt context, so irq
550 * disabling is needed for it.
551 *
552 * Function returns a pointer to the initialized request queue, or %NULL if
553 * it didn't succeed.
554 *
555 * Note:
556 * blk_init_queue() must be paired with a blk_cleanup_queue() call
557 * when the block device is deactivated (such as at module unload).
558 **/
559
560 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
561 {
562 return blk_init_queue_node(rfn, lock, -1);
563 }
564 EXPORT_SYMBOL(blk_init_queue);

blk_init_queue_node() is called
566 struct request_queue *
567 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
568 {
569 struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
570
571 if (!q)
572 return NULL;
573
574 q->node = node_id;
575 if (blk_init_free_list(q)) {
576 kmem_cache_free(blk_requestq_cachep, q);
577 return NULL;
578 }
579
580 q->request_fn = rfn;
581 q->prep_rq_fn = NULL;
582 q->unplug_fn = generic_unplug_device;
583 q->queue_flags = QUEUE_FLAG_DEFAULT;
584 q->queue_lock = lock;
585
586 /*
587 * This also sets hw/phys segments, boundary and size
588 */
589 blk_queue_make_request(q, __make_request);
590
591 q->sg_reserved_size = INT_MAX;
592
593 /*
594 * all done
595 */
596 if (!elevator_init(q, NULL)) {
597 blk_queue_congestion_threshold(q);
598 return q;
599 }
600
601 blk_put_queue(q);
602 return NULL;
603 }
604 EXPORT_SYMBOL(blk_init_queue_node);
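
For an ordinary disk driver the other half of the contract is the request_fn passed to blk_init_queue(): bios are merged and sorted by the elevator through __make_request, and the driver later pulls finished struct requests off the queue. A hedged 2.6.3x-era sketch of such a request_fn (the demo_ names are mine; the hardware programming step is omitted):

#include <linux/blkdev.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_queue_lock);

/* Called with demo_queue_lock held whenever the block layer wants the driver to
 * process requests that __make_request and the elevator have queued. */
static void demo_request_fn(struct request_queue *q)
{
	struct request *req = blk_fetch_request(q);

	while (req) {
		/* ... program the hardware using blk_rq_pos(req) / blk_rq_bytes(req) ... */
		__blk_end_request_all(req, 0);	/* complete the whole request with success */
		req = blk_fetch_request(q);
	}
}

static struct request_queue *demo_init_queue(void)
{
	/* blk_init_queue() installs __make_request as make_request_fn and
	 * demo_request_fn as the queue's request_fn */
	return blk_init_queue(demo_request_fn, &demo_queue_lock);
}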

References
Linux Kernel Development, Robert Love (《Linux内核设计与实现》)
