Linux读文件赋值,Linux文件系统之文件的读写(续二)

八:VFS层的I/O操作

VFS层是与用户界面直接交互的接口,在这一节里,我们将分为读写两部份来介绍VFS层的操作以及跟上层应用的交互.

8.1:文件的读操作

在用户空间,读文件操作的常用函数为read()。对应在系统空间的调用入口是sys_read().它的代码如下:

/*
 * sys_read() - kernel entry point for the read(2) system call.
 * Looks up the struct file for fd, reads up to count bytes into the
 * user buffer starting at the file's current position, then writes
 * the advanced position back.  Returns bytes read or -EBADF.
 */
asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)

{

struct file *file;

ssize_t ret = -EBADF;

int fput_needed;

// Look up the file object for fd in the current process's fd table

file = fget_light(fd, &fput_needed);

if (file) {

loff_t pos = file_pos_read(file);

// pos: current file position

ret = vfs_read(file, buf, count, &pos);

// Store the updated position back into the file

file_pos_write(file, pos);

fput_light(file, fput_needed);

}

return ret;

}

从进程中取得文件描述符对应的file对象和文件当前的操作位置后,会调用vfs_read()执行具体的操作过程.它的代码如下:

/*
 * vfs_read() - generic VFS-level read.
 * Validates the open mode and the presence of a read method, checks
 * mandatory locks and the security hook, then dispatches to either
 * f_op->read or the synchronous wrapper around f_op->aio_read.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)

{

struct inode *inode = file->f_dentry->d_inode;

ssize_t ret;

if (!(file->f_mode & FMODE_READ))

return -EBADF;

if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))

return -EINVAL;

// Check that mandatory locking allows reading this region

ret = locks_verify_area(FLOCK_VERIFY_READ, inode, file, *pos, count);

if (!ret) {

// Security-module permission check

ret = security_file_permission (file, MAY_READ);

if (!ret) {

// If a synchronous read method exists, call it

if (file->f_op->read)

ret = file->f_op->read(file, buf, count, pos);

else

// Otherwise fall back to aio_read via the sync wrapper

ret = do_sync_read(file, buf, count, pos);

// ret > 0: number of bytes actually read

if (ret > 0)

// Fire a dnotify access event on the parent directory

dnotify_parent(file->f_dentry, DN_ACCESS);

}

}

return ret;

}

从上面看到,会最终调用file的相关操作完成文件的读操作.曾记得我们在文件的打开一节中分析了文件的打开过程。在打开文件过程中,文件描述符的相关操作会被赋值为inode->i_fop.对于ext2文件系统,inode的相关信息如下:

inode->i_fop = &ext2_file_operations;

struct file_operations ext2_file_operations = {

.llseek       = generic_file_llseek,

.read         = generic_file_read,

.write        = generic_file_write,

.aio_read = generic_file_aio_read,

.aio_write    = generic_file_aio_write,

.ioctl        = ext2_ioctl,

.mmap         = generic_file_mmap,

.open         = generic_file_open,

.release = ext2_release_file,

.fsync        = ext2_sync_file,

.readv        = generic_file_readv,

.writev       = generic_file_writev,

.sendfile = generic_file_sendfile,

}

相应文件读操作入口为generic_file_read():

/*
 * generic_file_read() - synchronous page-cache read entry point.
 * Wraps the single user buffer in an iovec and a sync kiocb, then
 * drives __generic_file_aio_read(), waiting if the request was
 * queued asynchronously.
 */
ssize_t

generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)

{

// User-space destination address and length as a single iovec

struct iovec local_iov = { .iov_base = buf, .iov_len = count };

// Completion-state tracking object

struct kiocb kiocb;

ssize_t ret;

// init_sync_kiocb: ki_key=KIOCB_SYNC_KEY, ki_filp=filp, ki_obj=current

init_sync_kiocb(&kiocb, filp);

// Returns the number of bytes transferred (or an error/queued status)

ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos);

// Request was queued asynchronously: wait for it to complete

if (-EIOCBQUEUED == ret)

ret = wait_on_sync_kiocb(&kiocb);

// Number of bytes completed

return ret;

}

__generic_file_aio_read()是一个很重要的函数,它是读操作的入口。代码如下:

/*
 * __generic_file_aio_read() - core of the generic read path.
 * Validates the iovec array, then either performs direct I/O
 * (O_DIRECT, bypassing the page cache) or iterates the segments
 * through do_generic_file_read() with file_read_actor as the
 * kernel-to-user copy routine.
 */
ssize_t

__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,

unsigned long nr_segs, loff_t *ppos)

{

struct file *filp = iocb->ki_filp;

ssize_t retval;

unsigned long seg;

size_t count;

count = 0;

for (seg = 0; seg < nr_segs; seg++) {

const struct iovec *iv = &iov[seg];

/*

* If any segment has a negative length, or the cumulative

* length ever wraps negative then return -EINVAL.

*/

count += iv->iov_len;

if (unlikely((ssize_t)(count|iv->iov_len) < 0))

return -EINVAL;

// Verify that [iv->iov_base, iv->iov_base + iov_len) is writable user memory

if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))

continue;

if (seg == 0)

return -EFAULT;

// nr_segs: number of valid segments found so far

nr_segs = seg;

// Drop the bad segment's length from the running total

count -= iv->iov_len;  /* This segment is no good */

break;

}

/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */

// O_DIRECT: transfer data directly, bypassing the page cache

if (filp->f_flags & O_DIRECT) {

loff_t pos = *ppos, size;

struct address_space *mapping;

struct inode *inode;

mapping = filp->f_mapping;

inode = mapping->host;

retval = 0;

if (!count)

goto out; /* skip atime */

size = i_size_read(inode);

if (pos < size) {

retval = generic_file_direct_IO(READ, iocb,

iov, pos, nr_segs);

if (retval >= 0 && !is_sync_kiocb(iocb))

retval = -EIOCBQUEUED;

if (retval > 0)

*ppos = pos + retval;

}

file_accessed(filp);

goto out;

}

// count: total number of bytes requested

retval = 0;

if (count) {

for (seg = 0; seg < nr_segs; seg++) {

// read_descriptor_t: per-segment read descriptor, records progress/state

read_descriptor_t desc;

desc.written = 0;

desc.arg.buf = iov[seg].iov_base;

desc.count = iov[seg].iov_len;

// Nothing to transfer for this segment: skip to the next iovec

if (desc.count == 0)

continue;

desc.error = 0;

// Run the page-cache read loop for this segment

do_generic_file_read(filp,ppos,&desc,file_read_actor,0);

// desc.written: bytes copied to user space for this segment

// Accumulate into retval

retval += desc.written;

if (!retval) {

retval = desc.error;

break;

}

}

}

out:

return retval;

}

这里有种特殊情况,当文件是用直接I/O模式打开时(文件描述符带有O_DIRECT标志),就会采用直接I/O而跳过了页高速缓区。这样的情况我们在之后再讨论.

对于普通模式的情况。将会对每一个段调用do_generic_file_read()来完成I/O操作。这个函数的代码如下:

do_generic_file_read() -> do_generic_mapping_read():

/*

mapping:      the file's page cache (address_space)

_ra:          the file_ra_state (readahead state) for filp

filp:         the open file object

ppos:         current read position (updated on return)

desc:         read descriptor (progress, remaining count, error)

actor:        kernel-to-user copy routine

nonblock: if nonzero, never block: readahead is skipped and

          -EWOULDBLOCKIO is returned when a page is missing or

          not yet up to date (readahead runs only when nonblock == 0)

*/

void do_generic_mapping_read(struct address_space *mapping,

struct file_ra_state *_ra,

struct file *filp,

loff_t *ppos,

read_descriptor_t *desc,

read_actor_t actor,

int nonblock)

{

struct inode *inode = mapping->host;

unsigned long index, end_index, offset;

loff_t isize;

struct page *cached_page;

int error;

struct file_ra_state ra = *_ra;

cached_page = NULL;

// Page index: which cached page the current position lives in

index = *ppos >> PAGE_CACHE_SHIFT;

// Offset of the first requested byte within that page

offset = *ppos & ~PAGE_CACHE_MASK;

// Current file size for this inode

isize = i_size_read(inode);

if (!isize)

goto out;

// Index of the last cached page covering the file

end_index = (isize - 1) >> PAGE_CACHE_SHIFT;

for (;;) {

struct page *page;

unsigned long nr, ret;

/* nr is the maximum number of bytes to copy from this page */

// nr: start with a full page worth of bytes

nr = PAGE_CACHE_SIZE;

if (index >= end_index) {

// index > end_index: position is beyond the file, nothing to read

if (index > end_index)

goto out;

// Here index == end_index: last page of the file

// nr becomes the number of valid bytes in that last page

nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;

// offset is our position within the page; if nr <= offset

// we have consumed the whole file

if (nr <= offset) {

goto out;

}

}

// nr - offset: bytes remaining in this page from the current offset

nr = nr - offset;

// Give the scheduler a chance if a reschedule is pending

cond_resched();

// Kick off readahead (only in the blocking case)

if (!nonblock)

page_cache_readahead(mapping, &ra, filp, index);

find_page:

// Look up the cache page for the current index

page = find_get_page(mapping, index);

if (unlikely(page == NULL)) {

// No cached page exists for this index

if (nonblock) {

desc->error = -EWOULDBLOCKIO;

break;

}

handle_ra_miss(mapping, &ra, index);

goto no_cached_page;

}

// Found the page in the cache

// If PG_uptodate is set the page content is valid and no device

// read is needed

if (!PageUptodate(page)) {

// Page content is stale/invalid: must read it from the filesystem

if (nonblock) {

page_cache_release(page);

desc->error = -EWOULDBLOCKIO;

break;

}

goto page_not_up_to_date;

}

page_ok:

/* If users can be writing to this page using arbitrary

* virtual addresses, take care about potential aliasing

* before reading the page on the kernel side.

*/

if (mapping_writably_mapped(mapping))

flush_dcache_page(page);

/*

* Mark the page accessed if we read the beginning.

*/

if (!offset)

mark_page_accessed(page);

/*

* Ok, we have the page, and it's up-to-date, so

* now we can copy it to user space...

*

* The actor routine returns how many bytes were actually used..

* NOTE! This may not be the same as how much of a user buffer

* we filled up (we may be padding etc), so we can only update

* "pos" here (the actor routine has to update the user buffer

* pointers and the remaining count).

*/

// Copy page data to user space; returns bytes consumed

ret = actor(desc, page, offset, nr);

offset += ret;

index += offset >> PAGE_CACHE_SHIFT;

offset &= ~PAGE_CACHE_MASK;

page_cache_release(page);

// ret == nr: the whole remaining chunk of this page was copied

// without error; if the descriptor still wants more, keep going

if (ret == nr && desc->count)

continue;

// Otherwise we are done (short copy or request satisfied)

goto out;

page_not_up_to_date:

/* Get exclusive access to the page ... */

// We will transfer data from the filesystem into this page: lock it

lock_page(page);

/* Did it get unhashed before we got the lock? */

// Another task may have removed the page from the cache while we

// waited for the lock: unlock, drop our reference and loop again.

// The retry will miss in the cache and allocate a fresh page.

if (!page->mapping) {

unlock_page(page);

page_cache_release(page);

continue;

}

/* Did somebody else fill it already? */

// Someone else completed the read while we waited for the lock:

// go straight back to page_ok and copy it out

if (PageUptodate(page)) {

unlock_page(page);

goto page_ok;

}

// Read the page from the filesystem

readpage:

/* Start the actual read. The read will unlock the page. */

// The actual device read starts here

error = mapping->a_ops->readpage(filp, page);

// Read submission failed: bail out

if (unlikely(error))

goto readpage_error;

// If PG_uptodate is still clear, wait for the page to be unlocked.

// The page stays locked while the read is in flight; I/O completion

// unlocks it and wakes waiters.

if (!PageUptodate(page)) {

wait_on_page_locked(page);

// Still not up to date after the I/O finished: a read error occurred

if (!PageUptodate(page)) {

error = -EIO;

goto readpage_error;

}

}

/*

* i_size must be checked after we have done ->readpage.

*

* Checking i_size after the readpage allows us to calculate

* the correct value for "nr", which means the zero-filled

* part of the page is not copied back to userspace (unless

* another truncate extends the file - this is desired though).

*/

isize = i_size_read(inode);

end_index = (isize - 1) >> PAGE_CACHE_SHIFT;

// File shrank to zero or our position is now past the end

if (unlikely(!isize || index > end_index)) {

page_cache_release(page);

goto out;

}

/* nr is the maximum number of bytes to copy from this page */

// Recompute nr: bytes available to copy from this page

nr = PAGE_CACHE_SIZE;

if (index == end_index) {

nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;

if (nr <= offset) {

page_cache_release(page);

goto out;

}

}

nr = nr - offset;

goto page_ok;

readpage_error:

/* UHHUH! A synchronous read error occurred. Report it */

desc->error = error;

page_cache_release(page);

goto out;

no_cached_page:

/*

* Ok, it wasn't cached, so we need to create a new

* page..

*/

// No cached page for this index:

// allocate a fresh one

if (!cached_page) {

cached_page = page_cache_alloc_cold(mapping);

if (!cached_page) {

desc->error = -ENOMEM;

goto out;

}

}

// Insert the new page into the page cache and the LRU list

// NOTE: insertion leaves the page in the PG_locked state

error = add_to_page_cache_lru(cached_page, mapping,

index, GFP_KERNEL);

if (error) {

if (error == -EEXIST)

goto find_page;

desc->error = error;

goto out;

}

page = cached_page;

cached_page = NULL;

goto readpage;

}

out:

*_ra = ra;

// ppos: final read position

*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;

if (cached_page)

page_cache_release(cached_page);

if (filp)

file_accessed(filp);

}

如果参数nonblock为1,则不会阻塞:不进行预读,在页面缺失或内容未更新时直接返回-EWOULDBLOCKIO。在这里的调用中nonblock为零,会正常触发预读。关于预读的操作,我们之后再给出分析.

在这个操作中,有这样几种可能的情况:

1:如果要访问的页面在页高速缓存中,而且已经被更新(含有PG_uptodata标志).只需要直接将其copy到用户空间即可.

2:序号对应的页面不在高速缓存中,那就需要在页高速缓存中增加序号对应的页面。然后从文件系统中读取数据到这个页面上.再拷贝到用户空间。

3:序号对应的页面在高速缓存中,但数据不是最新的.这就需要缓存页与文件系统进行同步.再将页面拷贝到用户空间.

对于2和3。它们有一部份是相同的,即从文件系统中读数据的过程。我们只需要分析第2种情况。对应的代码片段如下:

void do_generic_mapping_read(struct address_space *mapping,

struct file_ra_state *_ra,

struct file *filp,

loff_t *ppos,

read_descriptor_t *desc,

read_actor_t actor,

int nonblock)

{

……

page = find_get_page(mapping, index);

if (unlikely(page == NULL)) {

//没有找到对应的缓存页,说明在页缓存区中不存在此页面对应的缓存页

if (nonblock) {

desc->error = -EWOULDBLOCKIO;

break;

}

handle_ra_miss(mapping, &ra, index);

goto no_cached_page;

}

……

……

}

handle_ra_miss()主要对文件的预读进行调整,在这里不进行分析,待分析预读机制的时候再来详细分析.

如果页面高速缓存中不存在此页面就会跳转到no_cached_page:

no_cached_page:

/*

* Ok, it wasn't cached, so we need to create a new

* page..

*/

//在页缓区中没有相关的缓存页

//新分匹一个页面

if (!cached_page) {

cached_page = page_cache_alloc_cold(mapping);

if (!cached_page) {

desc->error = -ENOMEM;

goto out;

}

}

//将分得的页加到页缓存区和LRU

// TODO:在将新页面插入页缓存区域中,会将页面标志设置为PG_locked

error = add_to_page_cache_lru(cached_page, mapping,

index, GFP_KERNEL);

if (error) {

if (error == -EEXIST)

goto find_page;

desc->error = error;

goto out;

}

page = cached_page;

cached_page = NULL;

goto readpage;

在这里,会首先调用page_cache_alloc_cold()分配一个页面。然后调用add_to_page_cache_lru()将页面插入页高速缓存并加入lru.然后跳转到readpage。这也是第3种情况对应的处理:

//读取页面

readpage:

/* Start the actual read. The read will unlock the page. */

//到这里的话,实际的读取过程开始了 ^_^

error = mapping->a_ops->readpage(filp, page);

在这里会看到,最终会调用页高速缓存的readpage方法进行读取操作。

文件页高速缓存的readpage操作

同理,还是以ext2文件系统为例来分析。在open的时候,它将页高速缓存对应的各项操作设置如下:

inode->i_mapping->a_ops = &ext2_aops;

/*
 * Address-space operations for ext2 regular files.  All entries are
 * thin wrappers over the generic mpage/block helpers, parameterized
 * with ext2's block-mapping routine (ext2_get_block).
 */
struct address_space_operations ext2_aops = {

.readpage     = ext2_readpage,

.readpages         = ext2_readpages,

.writepage         = ext2_writepage,

.sync_page         = block_sync_page,

.prepare_write         = ext2_prepare_write,

.commit_write      = generic_commit_write,

.bmap              = ext2_bmap,

.direct_IO         = ext2_direct_IO,

.writepages        = ext2_writepages,

};

对应的入口为ext2_readpage:

/*
 * ext2's ->readpage hook: delegate to the generic multi-page reader,
 * supplying ext2_get_block() to map file block numbers to on-disk
 * logical block numbers.
 */
static int ext2_readpage(struct file *file, struct page *page)
{
	int rc;

	rc = mpage_readpage(page, ext2_get_block);
	return rc;
}

这是一个封装的函数,采用一个回调函数做为参数.该回调函数将相对于文件起始的块号转换为文件系统的逻辑块号.

Mpage_readpage()的代码如下:

/*
 * mpage_readpage() - read one page via the mpage machinery.
 * Builds a bio describing the blocks backing the page (using
 * get_block for the mapping) and submits it for READ.
 */
int mpage_readpage(struct page *page, get_block_t get_block)

{

struct bio *bio = NULL;

sector_t last_block_in_bio = 0;

// Translate the read request for this page into a bio

bio = do_mpage_readpage(bio, page, 1,

&last_block_in_bio, get_block);

// Submit the bio, if one was built

if (bio)

mpage_bio_submit(READ, bio);

return 0;

}

mpage_bio_submit()这个操作中有一部份代码在之前已经分析过了。剩余的代码很简单。这里不做分析.

do_mpage_readpage()的代码如下:

/*
 * do_mpage_readpage() - map one page's blocks and batch them into a bio.
 * If all blocks backing the page are contiguous on disk they are added
 * to a single bio (possibly shared across pages via *last_block_in_bio);
 * otherwise we fall back ("confused") to per-buffer reads through
 * block_read_full_page().  Holes are zero-filled.
 */
static struct bio *

do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,

sector_t *last_block_in_bio, get_block_t get_block)

{

struct inode *inode = page->mapping->host;

const unsigned blkbits = inode->i_blkbits;

// Number of filesystem blocks per page

const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;

// Block size in bytes

const unsigned blocksize = 1 << blkbits;

sector_t block_in_file;

sector_t last_block;

sector_t blocks[MAX_BUF_PER_PAGE];

unsigned page_block;

unsigned first_hole = blocks_per_page;

struct block_device *bdev = NULL;

struct buffer_head bh;

int length;

int fully_mapped = 1;

// Page already has buffers attached: take the buffer-based path

if (page_has_buffers(page))

goto confused;

// page index * blocks per page = first file block covered by this page

block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);

// Number of blocks in the file, rounded up

last_block = (i_size_read(inode) + blocksize - 1) >> blkbits;

bh.b_page = page;

// Walk each block position within the page

for (page_block = 0; page_block < blocks_per_page;

page_block++, block_in_file++) {

bh.b_state = 0;

if (block_in_file < last_block) {

// Map the file-relative block number into bh

if (get_block(inode, block_in_file, &bh, 0))

// Mapping error: fall back to the buffer path

goto confused;

}

// bh not mapped: likely a hole in the file

if (!buffer_mapped(&bh)) {

fully_mapped = 0;

if (first_hole == blocks_per_page)

first_hole = page_block;

continue;

}

/* some filesystems will copy data into the page during

* the get_block call, in which case we don't want to

* read it again.  map_buffer_to_page copies the data

* we just collected from get_block into the page's buffers

* so readpage doesn't have to repeat the get_block call

*/

// Buffer already up to date: copy its state into the page's buffers

if (buffer_uptodate(&bh)) {

map_buffer_to_page(page, &bh, page_block);

goto confused;

}

if (first_hole != blocks_per_page)

goto confused;         /* hole -> non-hole */

/* Contiguous blocks? */

// If this block is not adjacent to the previous one, bail out

if (page_block && blocks[page_block-1] != bh.b_blocknr-1)

goto confused;

blocks[page_block] = bh.b_blocknr;

bdev = bh.b_bdev;

}

if (first_hole != blocks_per_page) {

char *kaddr = kmap_atomic(page, KM_USER0);

memset(kaddr + (first_hole << blkbits), 0,

PAGE_CACHE_SIZE - (first_hole << blkbits));

flush_dcache_page(page);

kunmap_atomic(kaddr, KM_USER0);

if (first_hole == 0) {

SetPageUptodate(page);

unlock_page(page);

goto out;

}

} else if (fully_mapped) {

// Every block mapped: set PG_mappedtodisk

SetPageMappedToDisk(page);

}

/*

* This page will go to BIO.  Do we need to send this BIO off first?

*/

if (bio && (*last_block_in_bio != blocks[0] - 1))

bio = mpage_bio_submit(READ, bio);

alloc_new:

if (bio == NULL) {

// Allocate a new bio sized for up to nr_pages pages

bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),

min_t(int, nr_pages, bio_get_nr_vecs(bdev)),

GFP_KERNEL);

if (bio == NULL)

goto confused;

}

length = first_hole << blkbits;

// Attach this page (offset 0, `length` bytes) to the bio

if (bio_add_page(bio, page, length, 0) < length) {

bio = mpage_bio_submit(READ, bio);

goto alloc_new;

}

if (buffer_boundary(&bh) || (first_hole != blocks_per_page))

bio = mpage_bio_submit(READ, bio);

else

*last_block_in_bio = blocks[blocks_per_page - 1];

out:

return bio;

confused:

if (bio)

bio = mpage_bio_submit(READ, bio);

if (!PageUptodate(page))

block_read_full_page(page, get_block);

else

unlock_page(page);

goto out;

}

这段代码实际上做了一个小小的优化。它会判断要提交的块缓存区是不是连续的。如果是连续的就可以将它们放一个bio中。然后提交到通用块设备层。如果不是连续的,对于每一个块缓存区都要提交一次.

对于连续条件的bio提交很好理解,代码也很容易.重点分析对于不连续的块的处理。

在上面的代码中可以看到,对于不连续块是通过block_read_full_page()来处理的.代码如下:

/*
 * block_read_full_page() - read a page one buffer_head at a time.
 * Used when the page's blocks are not contiguous (or the page already
 * has buffers).  Maps each buffer, zero-fills holes, then locks and
 * submits a READ for every buffer that still needs I/O.
 */
int block_read_full_page(struct page *page, get_block_t *get_block)

{

struct inode *inode = page->mapping->host;

sector_t iblock, lblock;

struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];

unsigned int blocksize;

int nr, i;

int fully_mapped = 1;

// The caller must hand us a locked page

if (!PageLocked(page))

PAGE_BUG(page);

// Block size in bytes

blocksize = 1 << inode->i_blkbits;

// If the page has no buffers yet, create empty ones

if (!page_has_buffers(page))

create_empty_buffers(page, blocksize, 0);

// Head of the page's buffer ring

head = page_buffers(page);

// First file-relative block number covered by this page

iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);

// Number of blocks in the file, rounded up

lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;

bh = head;

nr = 0;

i = 0;

do {

// Buffer already up to date: no I/O needed, move on

if (buffer_uptodate(bh))

continue;

// Buffer not yet mapped to a disk block

if (!buffer_mapped(bh)) {

fully_mapped = 0;

if (iblock < lblock) {

// Map the file block number into bh

if (get_block(inode, iblock, bh, 0))

SetPageError(page);

}

// Still unmapped after get_block: this is a hole,

// zero-fill the corresponding region of the page

if (!buffer_mapped(bh)) {

void *kaddr = kmap_atomic(page, KM_USER0);

memset(kaddr + i * blocksize, 0, blocksize);

flush_dcache_page(page);

kunmap_atomic(kaddr, KM_USER0);

set_buffer_uptodate(bh);

continue;

}

/*

* get_block() might have updated the buffer

* synchronously

*/

// Buffer became up to date during mapping: skip the I/O

if (buffer_uptodate(bh))

continue;

}

// Collect buffers that need to be read into arr[]

arr[nr++] = bh;

} while (i++, iblock++, (bh = bh->b_this_page) != head);

// All blocks mapped: set PG_mappedtodisk

if (fully_mapped)

SetPageMappedToDisk(page);

// Nothing to submit

if (!nr) {

/*

* All buffers are uptodate - we can set the page uptodate

* as well. But not if get_block() returned an error.

*/

if (!PageError(page))

SetPageUptodate(page);

unlock_page(page);

return 0;

}

/* Stage two: lock the buffers */

// Lock each buffer we are about to submit and mark it for async read

for (i = 0; i < nr; i++) {

bh = arr[i];

lock_buffer(bh);

mark_buffer_async_read(bh);

}

/*

* Stage 3: start the IO.  Check for uptodateness

* inside the buffer lock in case another process reading

* the underlying blockdev brought it uptodate (the sct fix).

*/

// Submit each buffer (or complete it immediately if already uptodate)

for (i = 0; i < nr; i++) {

bh = arr[i];

if (buffer_uptodate(bh))

end_buffer_async_read(bh, 1);

else

submit_bh(READ, bh);

}

return 0;

}

从上面的代码中看了.对于不连续的读操作,会反复调用submit_bh()来完成.

8.2:文件的写操作

在用户空间中,用户的写操作接口为write.对应系统调用的入口为sys_write().

代码如下:

/*
 * sys_write() - kernel entry point for the write(2) system call.
 * Mirrors sys_read(): resolve fd to a struct file, write from the
 * user buffer at the current position via vfs_write(), then store
 * the updated position back.
 */
asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t count)

{

struct file *file;

ssize_t ret = -EBADF;

int fput_needed;

// Look up the file object for this descriptor.

// fget_light() is an optimized fget(): if the file table is not

// shared, no reference/lock overhead is needed for the lookup.

file = fget_light(fd, &fput_needed);

if (file) {

// Current file position

loff_t pos = file_pos_read(file);

ret = vfs_write(file, buf, count, &pos);

// Store the updated position

file_pos_write(file, pos);

// Drop the reference taken in the shared case

fput_light(file, fput_needed);

}

return ret;

}

上面的代码与读操作差不多,都是取文件描述符和当前文件,操作完后,更新文件指针位置.

vfs_write()代码如下:

/*
 * vfs_write() - generic VFS-level write.
 * Checks write mode, the presence of a write method, mandatory locks
 * and the security hook, then dispatches to f_op->write or the
 * synchronous wrapper around f_op->aio_write.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)

{

struct inode *inode = file->f_dentry->d_inode;

ssize_t ret;

// File not opened for writing?

if (!(file->f_mode & FMODE_WRITE))

return -EBADF;

// No operations table, or no write/aio_write method: error out

if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))

return -EINVAL;

// Mandatory-lock check on the region being written

ret = locks_verify_area(FLOCK_VERIFY_WRITE, inode, file, *pos, count);

if (!ret) {

ret = security_file_permission (file, MAY_WRITE);

if (!ret) {

if (file->f_op->write)

ret = file->f_op->write(file, buf, count, pos);

else

ret = do_sync_write(file, buf, count, pos);

if (ret > 0)

dnotify_parent(file->f_dentry, DN_MODIFY);

}

}

return ret;

}

对于大部份情况,写操作会由file->f_op->write完成.在ext2文件系统中,此接口对应的函数为:

/*
 * generic_file_write() - synchronous page-cache write entry point.
 * Wraps the user buffer in an iovec, performs the buffered write
 * under i_sem, and for O_SYNC files (or sync-mounted filesystems)
 * flushes the written range back to the device before returning.
 */
ssize_t generic_file_write(struct file *file, const char __user *buf,

size_t count, loff_t *ppos)

{

struct address_space *mapping = file->f_mapping;

struct inode *inode = mapping->host;

ssize_t  ret;

struct iovec local_iov = { .iov_base = (void __user *)buf,

.iov_len = count };

down(&inode->i_sem);

// Returns the number of bytes actually written

ret = generic_file_write_nolock(file, &local_iov, 1, ppos);

up(&inode->i_sem);

// O_SYNC on the file, or the inode/filesystem is MS_SYNCHRONOUS

if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {

ssize_t err;

// Flush the just-written range from the cache to the device

err = sync_page_range(inode, mapping, *ppos - ret, ret);

if (err < 0)

ret = err;

}

return ret;

}

如果打开文件时带有O_SYNC标志,或者文件系统带有SYNC标志,都会将缓存中的数据直接写到文件系统上.

转入generic_file_write_nolock(),它最终会调用generic_file_aio_write_nolock():

/*
 * generic_file_aio_write_nolock() - core write path (caller holds i_sem).
 * Validates the iovecs, runs generic write checks (limits, append mode),
 * strips suid/sgid, updates timestamps, then performs either direct I/O
 * (O_DIRECT) or a buffered write via generic_file_buffered_write().
 */
ssize_t

generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,

unsigned long nr_segs, loff_t *ppos)

{

struct file *file = iocb->ki_filp;

struct address_space * mapping = file->f_mapping;

size_t ocount;         /* original count */

size_t count;      /* after file limit checks */

struct inode *inode = mapping->host;

unsigned long seg;

loff_t        pos;

ssize_t       written;

ssize_t       err;

ocount = 0;

for (seg = 0; seg < nr_segs; seg++) {

const struct iovec *iv = &iov[seg];

/*

* If any segment has a negative length, or the cumulative

* length ever wraps negative then return -EINVAL.

*/

ocount += iv->iov_len;

if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))

return -EINVAL;

// Verify the user-supplied source region is readable

if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))

continue;

if (seg == 0)

return -EFAULT;

nr_segs = seg;

ocount -= iv->iov_len; /* This segment is no good */

break;

}

// count: total number of bytes to write

count = ocount;

// ppos: current position

pos = *ppos;

/* We can write back this queue in page reclaim */

// backing_dev_info: per-device writeback/readahead info

current->backing_dev_info = mapping->backing_dev_info;

written = 0;

// Detailed write checks (rlimits, O_APPEND, overflow, ...)

err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));

if (err)

goto out;

if (count == 0)

goto out;

err = remove_suid(file->f_dentry);

if (err)

goto out;

// Update the inode's timestamps (mtime/ctime)

inode_update_time(inode, 1);

/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */

if (unlikely(file->f_flags & O_DIRECT)) {

written = generic_file_direct_write(iocb, iov,

&nr_segs, pos, ppos, count, ocount);

if (written < 0 || written == count)

goto out;

/*

* direct-io write to a hole: fall through to buffered I/O

* for completing the rest of the request.

*/

pos += written;

count -= written;

}

written = generic_file_buffered_write(iocb, iov, nr_segs,

pos, ppos, count, written);

out:

current->backing_dev_info = NULL;

return written ? written : err;

}

如果文件打开时带有了O_DIRECT标志,则会跳过文件缓存直接将数据写到文件系统中。对于O_DIRECT的操作我们在之后再做总结。对于一般的情况,都会转入到generic_file_buffered_write():

/*
 * generic_file_buffered_write() - write through the page cache.
 * For each chunk: look up (or create) the cache page, call
 * a_ops->prepare_write(), copy the user data into the page, then
 * a_ops->commit_write().  Handles O_SYNC and the O_DIRECT fallback
 * at the end.
 */
ssize_t

generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,

unsigned long nr_segs, loff_t pos, loff_t *ppos,

size_t count, ssize_t written)

{

struct file *file = iocb->ki_filp;

struct address_space * mapping = file->f_mapping;

struct address_space_operations *a_ops = mapping->a_ops;

struct inode *inode = mapping->host;

long     status = 0;

struct page   *page;

struct page   *cached_page = NULL;

size_t        bytes;

struct pagevec     lru_pvec;

const struct iovec *cur_iov = iov; /* current iovec */

size_t        iov_base = 0;    /* offset in the current iovec */

char __user   *buf;

pagevec_init(&lru_pvec, 0);

buf = iov->iov_base + written;   /* handle partial DIO write */

do {

unsigned long index;

unsigned long offset;

size_t copied;

// offset: byte offset within the page

offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */

// index: page index in the cache

index = pos >> PAGE_CACHE_SHIFT;

// Space remaining in this page

bytes = PAGE_CACHE_SIZE - offset;

// Clamp to the amount of data left to write

if (bytes > count)

bytes = count;

/*

* Bring in the user page that we will copy from _first_.

* Otherwise there's a nasty deadlock on copying from the

* same page as we're writing to, without it being marked

* up-to-date.

*/

fault_in_pages_readable(buf, bytes);

// Find the cache page for index, allocating one if needed

page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);

if (!page) {

status = -ENOMEM;

break;

}

// prepare_write: map/initialize the buffers backing this range

status = a_ops->prepare_write(file, page, offset, offset+bytes);

if (unlikely(status)) {

loff_t isize = i_size_read(inode);

/*

* prepare_write() may have instantiated a few blocks

* outside i_size.  Trim these off again.

*/

unlock_page(page);

page_cache_release(page);

if (pos + bytes > isize)

vmtruncate(inode, isize);

break;

}

// Copy the user data into the page

if (likely(nr_segs == 1))

copied = filemap_copy_from_user(page, offset,

buf, bytes);

else

copied = filemap_copy_from_user_iovec(page, offset,

cur_iov, iov_base, bytes);

flush_dcache_page(page);

// commit_write: mark the written buffers dirty for writeback

status = a_ops->commit_write(file, page, offset, offset+bytes);

if (likely(copied > 0)) {

if (!status)

status = copied;

if (status >= 0) {

written += status;

count -= status;

pos += status;

buf += status;

if (unlikely(nr_segs > 1))

filemap_set_next_iovec(&cur_iov,

&iov_base, status);

}

}

if (unlikely(copied != bytes))

if (status >= 0)

status = -EFAULT;

unlock_page(page);

mark_page_accessed(page);

page_cache_release(page);

if (status < 0)

break;

balance_dirty_pages_ratelimited(mapping);

cond_resched();

} while (count);

*ppos = pos;

if (cached_page)

page_cache_release(cached_page);

/*

* For now, when the user asks for O_SYNC, we'll actually give O_DSYNC

*/

if (likely(status >= 0)) {

if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {

if (!a_ops->writepage || !is_sync_kiocb(iocb))

status = generic_osync_inode(inode, mapping,

OSYNC_METADATA|OSYNC_DATA);

}

}

/*

* If we get here for O_DIRECT writes then we must have fallen through

* to buffered writes (block instantiation inside i_size).  So we sync

* the file data here, to try to honour O_DIRECT expectations.

*/

if (unlikely(file->f_flags & O_DIRECT) && written)

status = filemap_write_and_wait(mapping);

pagevec_lru_add(&lru_pvec);

return written ? written : status;

}

从上面的代码可以看出:对于写操作,会先到高速缓存中取对应的page。然后调用a_ops->prepare_write()。然后将要写的数据拷贝到缓存区页上,接着调用a_ops->commit_write()。下来我们分别分析这两个操作.

8.2.1:页高速缓存的prepare_write()操作

Ext2系统对应的入口为:

/*
 * ext2's ->prepare_write hook: delegate to the generic block-level
 * helper, supplying ext2_get_block() as the block mapper.
 */
static int
ext2_prepare_write(struct file *file, struct page *page,
		unsigned from, unsigned to)
{
	int rc;

	rc = block_prepare_write(page, from, to, ext2_get_block);
	return rc;
}

这里是一个封装函数。对于块设备来说,不同的只是后面所带的函数指针,这样的函数结构我们在读操作中也见过。Ext_get_block()函数的操作为,将对应文件的块号转换为文件系统的逻辑块号.

转入block_prepare_write():

/*
 * block_prepare_write() - prepare a page range for writing.
 * Thin wrapper: on failure, clears the page's uptodate flag so a
 * later read will refetch the page from disk.
 */
int block_prepare_write(struct page *page, unsigned from, unsigned to,

get_block_t *get_block)

{

struct inode *inode = page->mapping->host;

int err = __block_prepare_write(inode, page, from, to, get_block);

// On failure, clear the page's uptodate flag

if (err)

ClearPageUptodate(page);

return err;

}

__block_prepare_write()的操作为:

/*
 * __block_prepare_write() - get the buffers for [from, to) ready.
 * Creates buffers if needed, maps each one (allocating blocks via
 * get_block, which may extend the file), zeroes the parts of newly
 * allocated blocks outside the write range, and reads in any buffer
 * that is only partially overwritten.  On error, zeroes any freshly
 * allocated blocks so stale data is never exposed.
 */
static int __block_prepare_write(struct inode *inode, struct page *page,

unsigned from, unsigned to, get_block_t *get_block)

{

unsigned block_start, block_end;

sector_t block;

int err = 0;

unsigned blocksize, bbits;

struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

BUG_ON(!PageLocked(page));

BUG_ON(from > PAGE_CACHE_SIZE);

BUG_ON(to > PAGE_CACHE_SIZE);

BUG_ON(from > to);

// Block size in bytes

blocksize = 1 << inode->i_blkbits;

if (!page_has_buffers(page))

create_empty_buffers(page, blocksize, 0);

head = page_buffers(page);

bbits = inode->i_blkbits;

// First file-relative block number covered by this page

block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);

for(bh = head, block_start = 0; bh != head || !block_start;

block++, block_start=block_end, bh = bh->b_this_page) {

block_end = block_start + blocksize;

// Buffers that do not intersect the [from, to) range:

// nothing to prepare; just propagate the page's uptodate state

if (block_end <= from || block_start >= to) {

if (PageUptodate(page)) {

if (!buffer_uptodate(bh))

set_buffer_uptodate(bh);

}

continue;

}

if (buffer_new(bh))

clear_buffer_new(bh);

if (!buffer_mapped(bh)) {

// May allocate a new block here, growing the file

err = get_block(inode, block, bh, 1);

if (err)

goto out;

// BH_New: the buffer's block was just allocated by

// get_block() and has never been accessed

if (buffer_new(bh)) {

clear_buffer_new(bh);

unmap_underlying_metadata(bh->b_bdev,

bh->b_blocknr);

// If the whole page is uptodate, mark the buffer so too

if (PageUptodate(page)) {

set_buffer_uptodate(bh);

continue;

}

// Only part of this new block will be written:

// zero the parts outside [from, to)

if (block_end > to || block_start < from) {

void *kaddr;

kaddr = kmap_atomic(page, KM_USER0);

if (block_end > to)

memset(kaddr+to, 0,

block_end-to);

if (block_start < from)

memset(kaddr+block_start,

0, from-block_start);

flush_dcache_page(page);

kunmap_atomic(kaddr, KM_USER0);

}

continue;

}

}

if (PageUptodate(page)) {

if (!buffer_uptodate(bh))

set_buffer_uptodate(bh);

continue;

}

// Buffer not uptodate and only partially overwritten:

// read it in from the filesystem first

if (!buffer_uptodate(bh) && !buffer_delay(bh) &&

(block_start < from || block_end > to)) {

ll_rw_block(READ, 1, &bh);

*wait_bh++=bh;

}

}

/*

* If we issued read requests - let them complete.

*/

// Wait for any reads submitted above to finish

while(wait_bh > wait) {

wait_on_buffer(*--wait_bh);

if (!buffer_uptodate(*wait_bh))

return -EIO;

}

return 0;

out:

/*

* Zero out any newly allocated blocks to avoid exposing stale

* data.  If BH_New is set, we know that the block was newly

* allocated in the above loop.

*/

bh = head;

block_start = 0;

do {

block_end = block_start+blocksize;

if (block_end <= from)

goto next_bh;

if (block_start >= to)

break;

if (buffer_new(bh)) {

void *kaddr;

clear_buffer_new(bh);

kaddr = kmap_atomic(page, KM_USER0);

memset(kaddr+block_start, 0, bh->b_size);

kunmap_atomic(kaddr, KM_USER0);

set_buffer_uptodate(bh);

mark_buffer_dirty(bh);

}

next_bh:

block_start = block_end;

bh = bh->b_this_page;

} while (bh != head);

return err;

}

相对于读操作,写操作可能更加复杂,因为写操作要动态调整文件的大小。文件大小的调整过程是在ext2_get_block()这个回调函数中完成的。

prepare_write操作完成了对缓冲页必要的初始化和文件大小的扩充.

真正将数据写到文件系统上是在commit_write()中完成的:

/*
 * generic_commit_write() - finish a prepare_write/commit_write cycle.
 * Marks the written buffers dirty via __block_commit_write(), and if
 * the write extended the file, updates i_size and dirties the inode.
 */
int generic_commit_write(struct file *file, struct page *page,

unsigned from, unsigned to)

{

struct inode *inode = page->mapping->host;

loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;

__block_commit_write(inode,page,from,to);

/*

* No need to use i_size_read() here, the i_size

* cannot change under us because we hold i_sem.

*/

// If the file grew, record the new size in the inode

if (pos > inode->i_size) {

i_size_write(inode, pos);

mark_inode_dirty(inode);

}

return 0;

}

经过上面的分析,我们知道,在调用commit_write()之前,已经将要写的数据拷贝到了页缓冲区.

__block_commit_write()的代码如下:

/*
 * __block_commit_write() - mark the buffers in [from, to) dirty.
 * Data was already copied into the page by the caller; this only
 * flags the modified buffers uptodate+dirty so writeback will pick
 * them up later.  If every buffer in the page ends up uptodate,
 * the whole page is marked uptodate as well.
 */
static int __block_commit_write(struct inode *inode, struct page *page,

unsigned from, unsigned to)

{

unsigned block_start, block_end;

int partial = 0;

unsigned blocksize;

struct buffer_head *bh, *head;

blocksize = 1 << inode->i_blkbits;

// Mark the modified buffers dirty

for(bh = head = page_buffers(page), block_start = 0;

bh != head || !block_start;

block_start=block_end, bh = bh->b_this_page) {

block_end = block_start + blocksize;

if (block_end <= from || block_start >= to) {

if (!buffer_uptodate(bh))

partial = 1;

} else {

set_buffer_uptodate(bh);

mark_buffer_dirty(bh);

}

}

/*

* If this is a partial write which happened to make all buffers

* uptodate then we can optimize away a bogus readpage() for

* the next read(). Here we 'discover' whether the page went

* uptodate as a result of this (potentially partial) write.

*/

// If every buffer in the page is now uptodate, set PG_uptodate

if (!partial)

SetPageUptodate(page);

return 0;

}

在上面的代码中,我们看到,只是把块缓存区置为了“脏”,并没有真正地将数据写到文件系统中,那是什么时候完成这个写的过程的呢?

记得我们在分析pdflush线程组的时候,曾经介绍过 “回写陈旧的页面”。没错,就是在那里,旧页面被回写到了文件系统.

在那一节,我们遗留下了两个问题。即mapping->a_ops->writepages和mapping->a_ops->writepage的操作。我们在这一节里详细的分析一下.

8.2.2:mapping->a_ops->writepages()操作

对于ext2来说,它的mapping各项操作赋值为:

struct address_space_operations ext2_aops = {

……

.writepage         = ext2_writepage,

.writepages        = ext2_writepages,

……

}

相应的,writepages入口为ext2_writepages():

/*
 * address_space writepages hook for ext2: delegate the whole job to
 * the generic mpage writeback engine, supplying ext2's block mapper.
 */
static int
ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	int ret;

	ret = mpage_writepages(mapping, wbc, ext2_get_block);
	return ret;
}

mpage_writepages()就是我们在pdflush线程组中曾经分析过的子函数.在这里不再赘述.

8.2.2: mapping->a_ops->writepage()操作

相应的入口为ext2_writepage():

/*
 * address_space writepage hook for ext2: hand the page to the generic
 * buffer-based writeout path, passing ext2's block mapper.
 */
static int ext2_writepage(struct page *page, struct writeback_control *wbc)
{
	int ret = block_write_full_page(page, ext2_get_block, wbc);

	return ret;
}

转入block_write_full_page(),它的核心工作由__block_write_full_page()完成:

/*
 * Write out the dirty buffers of a locked page.  Unmapped dirty buffers
 * are first mapped to disk via get_block(); each dirty buffer is then
 * flagged BH_Async_Write, the page is put under writeback, and the
 * flagged buffers are submitted to the block layer.  If get_block()
 * fails we jump to the recovery path, which still submits the buffers
 * that are already mapped and dirty, to avoid exposing stale data.
 * Returns 0 on success, or the get_block() error.
 */
static int __block_write_full_page(struct inode *inode, struct page *page,
			get_block_t *get_block, struct writeback_control *wbc)
{
	int err;
	sector_t block;
	sector_t last_block;
	struct buffer_head *bh, *head;
	int nr_underway = 0;	/* number of buffers submitted for I/O */

	BUG_ON(!PageLocked(page));

	/* File-relative number of the file's last block. */
	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;

	/* If the page has no buffers yet, attach dirty+uptodate ones. */
	if (!page_has_buffers(page)) {
		create_empty_buffers(page, 1 << inode->i_blkbits,
					(1 << BH_Dirty)|(1 << BH_Uptodate));
	}

	/*
	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
	 * here, and the (potentially unmapped) buffers may become dirty at
	 * any time.  If a buffer becomes dirty here after we've inspected it
	 * then we just miss that fact, and the page stays dirty.
	 *
	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
	 * handle that here by just cleaning them.
	 */

	/* File-relative block number of the first block in this page. */
	block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
	/* Head of the page's circular buffer_head list. */
	head = page_buffers(page);
	bh = head;

	/*
	 * Get all the dirty buffers mapped to disk addresses and
	 * handle any aliases from the underlying blockdev's mapping.
	 */
	do {
		/* Block lies beyond the end of the file. */
		if (block > last_block) {
			/*
			 * mapped buffers outside i_size will occur, because
			 * this page can be outside i_size when there is a
			 * truncate in progress.
			 */
			/*
			 * The buffer was zeroed by block_write_full_page()
			 */
			clear_buffer_dirty(bh);
			set_buffer_uptodate(bh);
		} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
			/* Map this file-relative block to an on-disk block. */
			err = get_block(inode, block, bh, 1);
			if (err)
				goto recover;
			if (buffer_new(bh)) {
				/* blockdev mappings never come here */
				clear_buffer_new(bh);
				unmap_underlying_metadata(bh->b_bdev,
							bh->b_blocknr);
			}
		}
		bh = bh->b_this_page;
		block++;
	} while (bh != head);

	do {
		get_bh(bh);
		/* Skip buffers that never got mapped to disk. */
		if (!buffer_mapped(bh))
			continue;
		/*
		 * If it's a fully non-blocking write attempt and we cannot
		 * lock the buffer then redirty the page.  Note that this can
		 * potentially cause a busy-wait loop from pdflush and kswapd
		 * activity, but those code paths have their own higher-level
		 * throttling.
		 */
		/* Lock the buffer before operating on it. */
		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
			lock_buffer(bh);
		} else if (test_set_buffer_locked(bh)) {
			/*
			 * Non-blocking WB_SYNC_NONE writeback and the buffer
			 * is already locked: redirty the page and move on.
			 */
			redirty_page_for_writepage(wbc, page);
			continue;
		}
		/* If the buffer is dirty, flag it for async writeout. */
		if (test_clear_buffer_dirty(bh)) {
			mark_buffer_async_write(bh);
		} else {
			unlock_buffer(bh);
		}
	} while ((bh = bh->b_this_page) != head);

	/*
	 * The page and its buffers are protected by PageWriteback(), so we can
	 * drop the bh refcounts early.
	 */
	BUG_ON(PageWriteback(page));
	/* Mark the page as under writeback, then drop the page lock. */
	set_page_writeback(page);
	unlock_page(page);

	/* Submit every buffer flagged BH_Async_Write to the block layer. */
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			submit_bh(WRITE, bh);
			nr_underway++;
		}
		put_bh(bh);
		bh = next;
	} while (bh != head);
	err = 0;
done:
	if (nr_underway == 0) {
		/*
		 * The page was marked dirty, but the buffers were
		 * clean.  Someone wrote them back by hand with
		 * ll_rw_block/submit_bh.  A rare case.
		 */
		int uptodate = 1;
		do {
			if (!buffer_uptodate(bh)) {
				uptodate = 0;
				break;
			}
			bh = bh->b_this_page;
		} while (bh != head);
		if (uptodate)
			SetPageUptodate(page);
		end_page_writeback(page);
		/*
		 * The page and buffer_heads can be released at any time from
		 * here on.
		 */
		wbc->pages_skipped++;  /* We didn't write this page */
	}
	return err;

recover:
	/*
	 * ENOSPC, or some other error.  We may already have added some
	 * blocks to the file, so we need to write these out to avoid
	 * exposing stale data.
	 * The page is currently locked and not marked for writeback
	 */
	bh = head;
	/* Recovery: lock and submit the mapped buffers */
	do {
		get_bh(bh);
		if (buffer_mapped(bh) && buffer_dirty(bh)) {
			lock_buffer(bh);
			mark_buffer_async_write(bh);
		} else {
			/*
			 * The buffer may have been set dirty during
			 * attachment to a dirty page.
			 */
			clear_buffer_dirty(bh);
		}
	} while ((bh = bh->b_this_page) != head);
	SetPageError(page);
	BUG_ON(PageWriteback(page));
	set_page_writeback(page);
	unlock_page(page);
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			clear_buffer_dirty(bh);
			submit_bh(WRITE, bh);
			nr_underway++;
		}
		put_bh(bh);
		bh = next;
	} while (bh != head);
	goto done;
}

该函数会遍历页面中的块缓存区,然后将脏的块缓存区写回文件系统.

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值