10.2 文件预读
10.3 文件锁
10.4 文件读
/*
 * sys_read - read(2) entry point.
 *
 * Resolves fd to its struct file, reads through vfs_read() starting at the
 * file's current position, then stores the updated position back.
 *
 * Returns whatever vfs_read() returns, or -EBADF when fd does not resolve
 * to a file.
 */
asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
{
    int needs_fput;
    loff_t offset;
    ssize_t nread;
    /*
     * Look the file up by fd (per the original note: via the current
     * process's descriptor table, taking a reference on the file).
     */
    struct file *filp = fget_light(fd, &needs_fput);

    if (!filp)
        return -EBADF;

    /* Start from the file's current position (file->f_pos). */
    offset = file_pos_read(filp);
    nread = vfs_read(filp, buf, count, &offset);
    /* Write the position back to the file after the read. */
    file_pos_write(filp, offset);
    /* Drop the reference obtained by fget_light(). */
    fput_light(filp, needs_fput);

    return nread;
}
/*
 * vfs_read - validate a read request and hand it to the file's read method.
 *
 * @file:  file to read from
 * @buf:   user-space destination buffer
 * @count: number of bytes requested
 * @pos:   in/out file position
 *
 * Returns -EBADF if the file is not open for reading, -EINVAL if it has no
 * read method, -EFAULT if the user buffer fails access_ok(), a negative
 * value from rw_verify_area() or a non-zero value from
 * security_file_permission() if either check rejects the request, otherwise
 * the result of the read method.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
    ssize_t nbytes;

    /* The file must have been opened with read mode. */
    if (!(file->f_mode & FMODE_READ))
        return -EBADF;

    /* Either ->read or ->aio_read has to be provided. */
    if (!file->f_op)
        return -EINVAL;
    if (!file->f_op->read && !file->f_op->aio_read)
        return -EINVAL;

    /* Reading a file means writing into the user buffer. */
    if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
        return -EFAULT;

    /* Per the original note: checks file locks and permissions for the range. */
    nbytes = rw_verify_area(READ, file, pos, count);
    if (nbytes < 0)
        return nbytes;
    /* A non-negative result replaces the requested byte count. */
    count = nbytes;

    /* Security hook check for MAY_READ. */
    nbytes = security_file_permission (file, MAY_READ);
    if (nbytes)
        return nbytes;

    /* The actual read: prefer ->read, otherwise go through do_sync_read(). */
    nbytes = file->f_op->read
        ? file->f_op->read(file, buf, count, pos)
        : do_sync_read(file, buf, count, pos);

    if (nbytes > 0) {
        fsnotify_access(file->f_dentry);
        /* Add the bytes read to the task's rchar counter. */
        current->rchar += nbytes;
    }
    /* Count the read call in the task's syscr counter, successful or not. */
    current->syscr++;

    return nbytes;
}
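文件的读:file->f_op->read。以ext2文件系统为例,该方法对应的实现是generic_file_read()。
generic_file_read()通过struct kiocb统一处理同步和异步读:先用init_sync_kiocb()初始化一个同步kiocb,再调用__generic_file_aio_read();如果返回-EIOCBQUEUED,则调用wait_on_sync_kiocb()等待该操作完成。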
/*
 * generic_file_read - synchronous read built on the aio read path.
 *
 * Wraps the user buffer in a single-element iovec, issues the read through
 * __generic_file_aio_read() with a synchronous kiocb, and waits for the
 * kiocb if the request was queued (-EIOCBQUEUED).
 *
 * Returns the result of __generic_file_aio_read(), or of
 * wait_on_sync_kiocb() when the request was queued.
 */
ssize_t
generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
    struct kiocb sync_iocb;
    struct iovec single_seg = { .iov_base = buf, .iov_len = count };
    ssize_t status;

    init_sync_kiocb(&sync_iocb, filp);
    status = __generic_file_aio_read(&sync_iocb, &single_seg, 1, ppos);
    if (status != -EIOCBQUEUED)
        return status;

    /* Request was queued: block until the kiocb completes. */
    return wait_on_sync_kiocb(&sync_iocb);
}
EXPORT_SYMBOL(generic_file_read);
__generic_file_aio_read()分为三部分:其中最重要的部分是buffer I/O的处理过程。direct I/O只是业务逻辑,和buffer I/O基本流程大部分相同。
/*
 * __generic_file_aio_read - read into an iovec array, via direct I/O or the
 * page cache.
 *
 * Part 1 sums the requested length and validates each user segment.
 * Part 2 handles O_DIRECT files through generic_file_direct_IO().
 * Part 3 handles buffered I/O, one segment at a time, through
 * do_generic_file_read().
 *
 * Returns the number of bytes read, 0 when nothing was requested, or a
 * negative error (-EINVAL, -EFAULT, -EIOCBQUEUED, or the error reported by
 * the lower layers).
 */
ssize_t
__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
        unsigned long nr_segs, loff_t *ppos)
{
    struct file *filp = iocb->ki_filp;
    ssize_t retval = 0;
    unsigned long seg;
    size_t count = 0;

    /* Part 1: total up the requested bytes and validate the user buffers. */
    for (seg = 0; seg < nr_segs; seg++) {
        const struct iovec *iv = &iov[seg];

        /*
         * If any segment has a negative length, or the cumulative
         * length ever wraps negative then return -EINVAL.
         */
        count += iv->iov_len;
        if (unlikely((ssize_t)(count|iv->iov_len) < 0))
            return -EINVAL;

        if (!access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len)) {
            /* A bad first segment fails the whole request. */
            if (seg == 0)
                return -EFAULT;
            /* Otherwise truncate the request to the segments before it. */
            nr_segs = seg;
            count -= iv->iov_len;    /* This segment is no good */
            break;
        }
    }

    /* Part 2: direct I/O. */
    /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
    if (filp->f_flags & O_DIRECT) {
        loff_t pos = *ppos, size;
        struct address_space *mapping = filp->f_mapping;
        struct inode *inode = mapping->host;

        if (!count)
            return 0; /* skip atime */

        size = i_size_read(inode);
        if (pos < size) {
            retval = generic_file_direct_IO(READ, iocb,
                        iov, pos, nr_segs);
            if (retval > 0 && !is_sync_kiocb(iocb))
                retval = -EIOCBQUEUED;
            if (retval > 0)
                *ppos = pos + retval;
        }
        file_accessed(filp);
        return retval;
    }

    /* Part 3: buffered I/O. */
    if (!count)
        return retval;

    for (seg = 0; seg < nr_segs; seg++) {
        read_descriptor_t desc;

        desc.written = 0;
        desc.arg.buf = iov[seg].iov_base;
        desc.count = iov[seg].iov_len;
        /* Empty segments are skipped. */
        if (desc.count == 0)
            continue;
        desc.error = 0;

        do_generic_file_read(filp,ppos,&desc,file_read_actor);
        retval += desc.written;
        if (desc.error) {
            /* Report the error only if nothing at all was read. */
            if (!retval)
                retval = desc.error;
            break;
        }
    }

    return retval;
}
static inline void do_generic_file_read(struct file * filp, loff_t *ppos,
____________________read_descriptor_t * desc,
____________________read_actor_t actor)
{
____do_generic_mapping_read(filp->f_mapping,
________________&filp->f_ra,
________________filp,
________________ppos,
________________desc,
________________actor);
}
do_generic_mapping_read()函数,从名字可以看出,它处理的是通用的mapping读操作;而mapping在file中代表了文件的page cache部分,所以(对于使用page cache的buffer I/O)整个函数对文件的操作主要就是对page cache的操作。如果文件的内容已经在page cache里面,就不需要读磁盘,直接复制内存即可;如果page cache中没有文件内容,则需要申请page cache页面,然后从磁盘把文件内容读到page cache中。
以下共分七个部分来说明:
1)为了便于查找文件读的位置,首先把文件读位置及字节数转换为在page cache中的页面索引值及页内的偏移量,并预读一部分页面。
/*
 * do_generic_mapping_read - read from a file's address_space (excerpt).
 *
 * This first part converts the read position and length into page indexes
 * and an in-page offset, then enters the per-page loop and triggers
 * readahead. The excerpt stops inside the loop.
 */
void do_generic_mapping_read(struct address_space *mapping,
____________ struct file_ra_state *_ra,
____________ struct file *filp,
____________ loff_t *ppos,
____________ read_descriptor_t *desc,
____________ read_actor_t actor)
{
____struct inode *inode = mapping->host;
____unsigned long index;
____unsigned long end_index;
____unsigned long offset;
____unsigned long last_index;
____unsigned long next_index;
____unsigned long prev_index;
____loff_t isize;
____struct page *cached_page;
____int error;
/* Work on a local copy of the caller's readahead state. */
____struct file_ra_state ra = *_ra;
____cached_page = NULL;
____index = *ppos >> PAGE_CACHE_SHIFT; /* page-cache index of the page holding the starting read position */
____next_index = index;
____prev_index = ra.prev_page; /* previous page index recorded in the readahead state */
/* Page-cache index at which this read ends (end position rounded up to a page boundary). */
____last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
/* Byte offset within the first page to be read. */
____offset = *ppos & ~PAGE_CACHE_MASK;
/* Total size of the file. */
____isize = i_size_read(inode);
____if (!isize)
________goto out;
/* Index of the file's last page. */
____end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
____for (;;) {
________struct page *page;
________unsigned long nr, ret;
________/* nr is the maximum number of bytes to copy from this page */
________nr = PAGE_CACHE_SIZE;
________if (index >= end_index) {
/* Past the last page: nothing left to read. */
____________if (index > end_index)
________________goto out;
/* On the last page: only the bytes up to the end of the file are valid. */
____________nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
____________if (nr <= offset) {
________________goto out;
____________}
________}
________nr = nr - offset;
________cond_resched();
/* Ask for readahead covering the pages remaining up to last_index. */
________if (index == next_index)
____________next_index = page_cache_readahead(mapping, &ra, filp,
____________________index, last_index - index);
......
2)检查page cache中是否存在我们需要的页面。(find_page:分为三种情况:1. no_cached_page;2. page_ok;3. page_not_up_to_date)
3)处理页面是最新的情况(page_ok:)
4)页面在page cache中,但不是最新的页面。(page_not_up_to_date)
5)处理读页面(no_cached_page,readpage:)
6)申请一个页面,然后插入page cache中。(对应下文代码中标号为1的注释)
7)更新文件位置。(对应下文代码中标号为7的注释)
find_page:
________page = find_get_page(mapping, index); /* look up the page in the page cache */
________if (unlikely(page == NULL)) {
____________handle_ra_miss(mapping, &ra, index);
/* 1. The page is not in the page cache: take this branch to allocate a page and read in up-to-date contents. */
____________goto no_cached_page;
________}
/* 2. The page is in the page cache and up to date: read from it directly. */
/* 3. The page is in the page cache but not up to date: refresh the cached page, then continue reading. */
________if (!PageUptodate(page))
____________goto page_not_up_to_date;
/*2.*/
page_ok:
...
ret = actor(desc, page, offset, nr); /* hand the data found in the page cache to the actor, which (per the original note) copies it to user space */
...
/* actor -> file_read_actor */
/*3.*/
page_not_up_to_date:
...
________/* Get exclusive access to the page ... */
________lock_page(page);
________/* Did it get unhashed before we got the lock? */
________if (!page->mapping) {
____________unlock_page(page);
____________page_cache_release(page);
____________continue;
________}
________/* Did somebody else fill it already? */
________if (PageUptodate(page)) {
____________unlock_page(page);
____________goto page_ok;
________}
readpage:
________/* Start the actual read. The read will unlock the page. */
________error = mapping->a_ops->readpage(filp, page);
________if (!PageUptodate(page)) {
____________lock_page(page);
/* Lock the page, i.e. wait for the read to complete (original note: wait for the read interrupt to return). */
...
____________unlock_page(page);
________}
......
/**/
/* 1. Allocate a page and insert it into the page cache. */
no_cached_page:
________/*
________ * Ok, it wasn't cached, so we need to create a new
________ * page..
________ */
________if (!cached_page) {
____________cached_page = page_cache_alloc_cold(mapping);
____________if (!cached_page) {
________________desc->error = -ENOMEM;
________________goto out;
____________}
________}
________error = add_to_page_cache_lru(cached_page, mapping,
________________________index, GFP_KERNEL);
________if (error) {
/* -EEXIST: retry the lookup from find_page. */
____________if (error == -EEXIST)
________________goto find_page;
____________desc->error = error;
____________goto out;
________}
________page = cached_page;
________cached_page = NULL;
________goto readpage;
____}
/* 7. Update the file position. */
out:
/* Copy the local readahead state back to the caller's structure. */
____*_ra = ra;
____*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
/* Release a pre-allocated page that was never inserted into the page cache. */
____if (cached_page)
________page_cache_release(cached_page);
____if (filp)
________file_accessed(filp);
}
page cache的操作小结:do_generic_mapping_read(filp->f_mapping, &filp->f_ra, filp, ppos, desc, actor);整体上来看分为三个部分:
1)在page cache中能找到对应的page。(find_get_page(mapping, index);->radix_tree_lookup)
2)在page cache中没有对应的page,则申请page。(page_cache_alloc_cold->alloc_pages)
3)添加page到page cache中。(add_to_page_cache_lru->add_to_page_cache->radix_tree_insert)
以上三部分最终转换为对radix tree及内存的操作,由此得出page cache是由radix tree结构来管理的。