许多文件系统都是通过generic_file_write()函数来实现文件对象的write方法,即write(库函数)->sys_write()->generic_file_write():
ssize_t generic_file_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
ssize_t ret;
struct iovec local_iov = { .iov_base = (void __user *)buf,
.iov_len = count };
down(&inode->i_sem);
ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
up(&inode->i_sem);
if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
ssize_t err;
err = sync_page_range(inode, mapping, *ppos - ret, ret);
if (err < 0)
ret = err;
}
return ret;
}
generic_file_write会调用__generic_file_write_nolock(),即write(库函数)->sys_write()->generic_file_write()->__generic_file_write_nolock:
ssize_t
__generic_file_write_nolock(struct file *file, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos)
{
struct kiocb kiocb;
ssize_t ret;
init_sync_kiocb(&kiocb, file);
ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
if (ret == -EIOCBQUEUED)
ret = wait_on_sync_kiocb(&kiocb);
return ret;
}
write(库函数)->sys_write()->generic_file_write()->__generic_file_write_nolock()->__generic_file_aio_write_nolock():
ssize_t
__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t *ppos)
{
struct file *file = iocb->ki_filp;
struct address_space * mapping = file->f_mapping;
size_t ocount; /* original count */
size_t count; /* after file limit checks */
struct inode *inode = mapping->host;
unsigned long seg;
loff_t pos;
ssize_t written;
ssize_t err;
ocount = 0;
for (seg = 0; seg < nr_segs; seg++) {
const struct iovec *iv = &iov[seg];
/*
* If any segment has a negative length, or the cumulative
* length ever wraps negative then return -EINVAL.
*/
ocount += iv->iov_len;
if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
return -EINVAL;
if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
continue;
if (seg == 0)
return -EFAULT;
nr_segs = seg;
ocount -= iv->iov_len; /* This segment is no good */
break;
}
count = ocount;
pos = *ppos;
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
written = 0;
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
goto out;
if (count == 0)
goto out;
err = remove_suid(file->f_dentry);
if (err)
goto out;
inode_update_time(inode, 1);
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (unlikely(file->f_flags & O_DIRECT)) {
written = generic_file_direct_write(iocb, iov,
&nr_segs, pos, ppos, count, ocount);
if (written < 0 || written == count)
goto out;
/*
* direct-io write to a hole: fall through to buffered I/O
* for completing the rest of the request.
*/
pos += written;
count -= written;
}
written = generic_file_buffered_write(iocb, iov, nr_segs,
pos, ppos, count, written);
out:
current->backing_dev_info = NULL;
return written ? written : err;
}
write(库函数)->sys_write()->generic_file_write()->__generic_file_write_nolock()->__generic_file_aio_write_nolock()->generic_file_buffered_write():
ssize_t
generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos, loff_t *ppos,
size_t count, ssize_t written)
{
struct file *file = iocb->ki_filp;
struct address_space * mapping = file->f_mapping;
struct address_space_operations *a_ops = mapping->a_ops;
struct inode *inode = mapping->host;
long status = 0;
struct page *page;
struct page *cached_page = NULL;
size_t bytes;
struct pagevec lru_pvec;
const struct iovec *cur_iov = iov; /* current iovec */
size_t iov_base = 0; /* offset in the current iovec */
char __user *buf;
pagevec_init(&lru_pvec, 0);
buf = iov->iov_base + written; /* handle partial DIO write */
do {
unsigned long index;
unsigned long offset;
size_t copied;
offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
//获取要写的缓冲页面索引(如何根据页索引在radix树中获取到指定页描述符,ULK-PAGE600)
index = pos >> PAGE_CACHE_SHIFT;
bytes = PAGE_CACHE_SIZE - offset;
//最后剩一点写入内容的处理
if (bytes > count)
bytes = count;
/*
* Bring in the user page that we will copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*/
fault_in_pages_readable(buf, bytes);
//在radix树里面查找要被写的page,如果不存在则创建一个,见下面分析
page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
if (!page) {
status = -ENOMEM;
break;
}
//为这个page准备一组buffer_head结构,用于描述组成这个page的数据块,见下面分析
status = a_ops->prepare_write(file, page, offset, offset+bytes);
if (unlikely(status)) {
loff_t isize = i_size_read(inode);
/*
* prepare_write() may have instantiated a few blocks
* outside i_size. Trim these off again.
*/
unlock_page(page);
page_cache_release(page);
if (pos + bytes > isize)
vmtruncate(inode, isize);
break;
}
if (likely(nr_segs == 1))
copied = (page, offset,
buf, bytes);
else
copied = filemap_copy_from_user_iovec(page, offset,
cur_iov, iov_base, bytes);
flush_dcache_page(page);
//把基础缓冲区标记为脏,以便随后把他们都写到磁盘。
status = a_ops->commit_write(file, page, offset, offset+bytes);
if (likely(copied > 0)) {
if (!status)
status = copied;
if (status >= 0) {
written += status;
count -= status;
pos += status;
buf += status;
if (unlikely(nr_segs > 1))
filemap_set_next_iovec(&cur_iov,
&iov_base, status);
}
}
if (unlikely(copied != bytes))
if (status >= 0)
status = -EFAULT;
unlock_page(page);
mark_page_accessed(page);
page_cache_release(page);
if (status < 0)
break;
balance_dirty_pages_ratelimited(mapping);
cond_resched();
} while (count);
*ppos = pos;
if (cached_page)
page_cache_release(cached_page);
/*
* For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
*/
if (likely(status >= 0)) {
if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
if (!a_ops->writepage || !is_sync_kiocb(iocb))
status = generic_osync_inode(inode, mapping,
OSYNC_METADATA|OSYNC_DATA);
}
}
/*
* If we get here for O_DIRECT writes then we must have fallen through
* to buffered writes (block instantiation inside i_size). So we sync
* the file data here, to try to honour O_DIRECT expectations.
*/
if (unlikely(file->f_flags & O_DIRECT) && written)
status = filemap_write_and_wait(mapping);
pagevec_lru_add(&lru_pvec);
return written ? written : status;
}
static inline struct page *
__grab_cache_page(struct address_space *mapping, unsigned long index,
struct page **cached_page, struct pagevec *lru_pvec)
{
int err;
struct page *page;
repeat:
//根据address_space地址和缓冲页的索引,获取缓冲页面的描述符(ULK-PAGE602)
page = find_lock_page(mapping, index);
if (!page) {
if (!*cached_page) {
*cached_page = page_cache_alloc(mapping);
if (!*cached_page)
return NULL;
}
//把一个新页的描述符插入到页高速缓存--在radix树中出入新节点
err = add_to_page_cache(*cached_page, mapping,
index, GFP_KERNEL);
if (err == -EEXIST)
goto repeat;
if (err == 0) {
page = *cached_page;
page_cache_get(page);
if (!pagevec_add(lru_pvec, page))
__pagevec_lru_add(lru_pvec);
*cached_page = NULL;
}
}
return page;
}
//prepare_write分析
address_space对象的prepare_write和commit_write方法专用于由generic_file_write()实现的通用写操作,这个函数适用于普通文件和块设备文件。每个磁盘文件系统都定义了自己的prepare_write方法。与读操作类似,这个方法只是普通函数的封装。例如,Ext2文件系统通过下列函数实现prepare_write方法
//在fs/buffer.c目录中
int ext2_prepare_write(struct file *file,struct page *page,unsigned from ,unsigned to)
{
return block_prepare_write(page,from,to,ext2_get_block);
}
一旦prepare_write,generic_file_write()函数就用存放在用户地址空间中的数据更新高速缓存页面。接下来,调用address_space对象的commit_write方法。这个方法由generic_commit_write()函数实现。generic_commit_write()函数执行如下步骤:
1.调用__block_commit_write()函数,执行如下步骤:
A.考虑页中受写操作影响的所有缓冲区;对于其中的每个缓冲区,将对应缓冲区首部的BH_Uptodate和BH_Dirty标志置位。
B.标记相应索引节点为脏,将索引节点加入超级块脏的索引节点连接
C.如果缓冲区页中的所有缓冲区是最新的,则将PG_uptodate标志置位
D.将页的PG_Dirty标志置位,并在基树中将页标记成脏