write对应的系统调用是sys_write,代码如下:
/*
 * write(2) entry point: resolve fd to its open file and hand the request
 * to that file's f_op->write method.
 *
 * fd    - descriptor of the file to write to
 * buf   - data to write; passed through unchanged to f_op->write
 * count - number of bytes requested
 *
 * Returns the value of f_op->write when it is invoked, otherwise:
 *   -EBADF  if fd is not open or the file was not opened for writing,
 *   -EINVAL if the file has no write method,
 *   or the non-zero result of locks_verify_area().
 */
asmlinkage ssize_t sys_write(unsigned int fd, const char * buf, size_t count)
{
	ssize_t (*writer)(struct file *, const char *, size_t, loff_t *);
	struct file *filp;
	ssize_t result = -EBADF;

	filp = fget(fd);
	if (!filp)
		return result;

	/* Not opened for writing: result is still -EBADF. */
	if (!(filp->f_mode & FMODE_WRITE))
		goto release;

	result = locks_verify_area(FLOCK_VERIFY_WRITE, filp->f_dentry->d_inode,
				   filp, filp->f_pos, count);
	if (result)
		goto release;

	result = -EINVAL;
	if (filp->f_op) {
		/* Read the method pointer once, then call through the copy. */
		writer = filp->f_op->write;
		if (writer != NULL)
			result = writer(filp, buf, count, &filp->f_pos);
	}

release:
	/* Only a write that transferred data triggers a DN_MODIFY notification
	 * on the parent directory's inode. */
	if (result > 0)
		inode_dir_notify(filp->f_dentry->d_parent->d_inode, DN_MODIFY);
	fput(filp);
	return result;
}
这里假设fd就是《Linux内核源代码情景分析-文件的打开》一文中刚刚打开的文件/usr/local/hello.c的文件号。fget(fd)根据打开文件号fd找到该已打开文件的file结构,代码如下:
struct file * fget(unsigned int fd)
{
struct file * file;
struct files_struct *files = current->files;
read_lock(&files->file_lock);
file = fcheck(fd);
if (file)
get_file(file);
read_unlock(&files->file_lock);
return file;
}
/*
 * Return the entry for descriptor fd in the current task's fd array,
 * or NULL when fd is not below the table's max_fds.
 *
 * No locking is done here; fget() calls this under files->file_lock.
 */
static inline struct file * fcheck(unsigned int fd)
{
	struct files_struct *table = current->files;

	if (fd >= table->max_fds)
		return NULL;

	return table->fd[fd];
}
/*
 * generic_file_write - write user data to a file through the page cache.
 *
 * file:  file structure of the file being written
 * buf:   user-space source buffer
 * count: number of bytes to write
 * ppos:  file position; read on entry and stored back once the copy loop
 *        has run (not on the early-error paths that jump to "out")
 *
 * The data is copied one page-cache page at a time: each pass finds or
 * creates the page, calls a_ops->prepare_write, copies from user space,
 * then calls a_ops->commit_write.
 *
 * Returns the number of bytes written if any were written, otherwise the
 * last status (0 or a negative errno). Early failures return -EINVAL for a
 * negative position, a pending file->f_error, or -EFBIG when the position
 * is already at or beyond RLIMIT_FSIZE.
 */
ssize_t
generic_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
{
struct inode *inode = file->f_dentry->d_inode;
struct address_space *mapping = inode->i_mapping;
unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
loff_t pos;
struct page *page, *cached_page;
unsigned long written;
long status;
int err;
cached_page = NULL;
/* The inode semaphore is held for the whole write; released at "out". */
down(&inode->i_sem);
pos = *ppos;
err = -EINVAL;
if (pos < 0)
goto out;
/* Report, and clear, an error previously recorded on this file. */
err = file->f_error;
if (err) {
file->f_error = 0;
goto out;
}
written = 0;
/* In append mode the write always starts at the current end of file. */
if (file->f_flags & O_APPEND)
pos = inode->i_size;
/*
* Check whether we've reached the file size limit.
* Already at/over the limit: SIGXFSZ and fail with -EFBIG.
* Request crosses the limit: SIGXFSZ and shorten count to what fits.
*/
err = -EFBIG;
if (limit != RLIM_INFINITY) {
if (pos >= limit) {
send_sig(SIGXFSZ, current, 0);
goto out;
}
if (count > limit - pos) {
send_sig(SIGXFSZ, current, 0);
count = limit - pos;
}
}
status = 0;
/* Something will be written: call remove_suid(), stamp ctime/mtime and
* mark the inode dirty before touching any data. */
if (count) {
remove_suid(inode);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
mark_inode_dirty_sync(inode);
}
while (count) {
unsigned long bytes, index, offset;
char *kaddr;
/* Stays 1 only when this pass writes up to the end of the page; in that
* case deactivate_page() is called on the page after the copy. */
int deactivate = 1;
/*
* Try to find the page in the cache. If it isn't there,
* allocate a free page.
*/
/* From pos derive the page-cache index of the page written in this pass,
* the starting offset inside that page, and the number of bytes to copy. */
offset = (pos & (PAGE_CACHE_SIZE -1));
index = pos >> PAGE_CACHE_SHIFT;
bytes = PAGE_CACHE_SIZE - offset;
if (bytes > count) {
bytes = count;
deactivate = 0;
}
/*
* Bring in the user page that we will copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*/
{ volatile unsigned char dummy;
__get_user(dummy, buf);
__get_user(dummy, buf+bytes-1);
}
status = -ENOMEM; /* we'll assign it later anyway */
/* Find the page in the page cache (returned locked); if it is not there,
* allocate one and insert it. cached_page carries a spare page between
* passes. */
page = __grab_cache_page(mapping, index, &cached_page);
if (!page)
break;
/* We have exclusive IO access to the page.. */
if (!PageLocked(page)) {
PAGE_BUG(page);
}
/* Prepare the range [offset, offset+bytes) for writing. For ext2 this ends
* up in __block_prepare_write, which maps the blocks and reads in blocks
* that are only partially overwritten and not yet up to date. */
status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
if (status)
goto unlock;
kaddr = page_address(page);
/* Copy the data from user space into the page-cache page. */
status = copy_from_user(kaddr+offset, buf, bytes);
flush_dcache_page(page);
if (status)
goto fail_write;
/* Commit the range. For ext2 this is generic_commit_write, which marks the
* covered buffers up to date and dirty and grows i_size if needed; it does
* not itself issue the device write. */
status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
/* A zero return from commit_write means all `bytes` were accepted. */
if (!status)
status = bytes;
if (status >= 0) {
written += status;
count -= status;
pos += status;
buf += status;
}
unlock:
/* Mark it unlocked again and drop the page.. */
UnlockPage(page);
if (deactivate)
deactivate_page(page);
page_cache_release(page);
if (status < 0)
break;
}
*ppos = pos;
/* Free the spare page if __grab_cache_page left one unused. */
if (cached_page)
page_cache_free(cached_page);
/* For now, when the user asks for O_SYNC, we'll actually
* provide O_DSYNC. */
if ((status >= 0) && (file->f_flags & O_SYNC))
status = generic_osync_inode(inode, 1); /* 1 means datasync */
/* A partial write reports the byte count, not the error that stopped it. */
err = written ? written : status;
out:
up(&inode->i_sem);
return err;
fail_write:
/* copy_from_user failed after a successful prepare_write. commit_write is
* skipped on this path, so the kunmap() that generic_commit_write would
* have done (balancing the kmap() in __block_prepare_write) is done here. */
status = -EFAULT;
ClearPageUptodate(page);
kunmap(page);
goto unlock;
}
inode结构中有个指针i_mapping,指向一个address_space数据结构,其定义如下:
/*
 * Page-cache bookkeeping for one object, reached through inode->i_mapping.
 * Pages are kept on one of three lists (clean, dirty, locked), and a_ops
 * supplies the methods used to read and write them; generic_file_write
 * calls a_ops->prepare_write and a_ops->commit_write.
 */
struct address_space {
struct list_head clean_pages; /* list of clean pages */
struct list_head dirty_pages; /* list of dirty pages */
struct list_head locked_pages; /* list of locked pages */
unsigned long nrpages; /* number of total pages */
struct address_space_operations *a_ops; /* methods */
struct inode *host; /* owner: inode, block_device */
struct vm_area_struct *i_mmap; /* list of private mappings */
struct vm_area_struct *i_mmap_shared; /* list of shared mappings */
spinlock_t i_shared_lock; /* and spinlock protecting it */
};
其中a_ops指向一个address_space_operations数据结构;就ext2文件系统来说,这个指针指向ext2_aops。address_space_operations的定义如下:
/*
 * Method table reached through address_space->a_ops.
 *
 * generic_file_write calls prepare_write and then commit_write for each
 * page it writes, passing (file, page, offset, offset + bytes), i.e. the
 * byte range inside the page as a start and an exclusive end.
 */
struct address_space_operations {
int (*writepage)(struct page *);
int (*readpage)(struct file *, struct page *);
int (*sync_page)(struct page *);
int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
int (*bmap)(struct address_space *, long);
};
/*
 * Return the locked page-cache page for (mapping, index), creating it if
 * it is not cached yet.
 *
 * *cached_page is a spare page carried between calls: it is allocated
 * here when needed, consumed (and reset to NULL) when it gets inserted
 * into the cache, and otherwise left for the caller to reuse or free.
 *
 * Returns NULL only when a new page is needed and page_cache_alloc() fails.
 */
static inline struct page * __grab_cache_page(struct address_space *mapping,
				unsigned long index, struct page **cached_page)
{
	struct page **bucket = page_hash(mapping, index);
	struct page *candidate;

	for (;;) {
		/* Fast path: the page is already in the hash table. */
		candidate = __find_lock_page(mapping, index, bucket);
		if (candidate)
			return candidate;

		/* Make sure a spare page is available for insertion. */
		if (!*cached_page) {
			*cached_page = page_cache_alloc();
			if (!*cached_page)
				return NULL;
		}

		candidate = *cached_page;
		if (!add_to_page_cache_unique(candidate, mapping, index, bucket)) {
			/* Inserted: the spare page now belongs to the cache. */
			*cached_page = NULL;
			return candidate;
		}

		/* A page for this index appeared in the meantime; look it up again
		 * and keep the spare page for later. */
	}
}
/* Address of the page_hash_table slot selected by _page_hashfn(mapping, index). */
#define page_hash(mapping, index) \
	(&page_hash_table[_page_hashfn(mapping, index)])
add_to_page_cache_unique,加入到page_hash_table中,代码如下:
static int add_to_page_cache_unique(struct page * page,
struct address_space *mapping, unsigned long offset,
struct page **hash)
{
int err;
struct page *alias;
spin_lock(&pagecache_lock);
alias = __find_page_nolock(mapping, offset, *hash);
err = 1;
if (!alias) {
__add_to_page_cache(page,mapping,offset,hash);
err = 0;
}
spin_unlock(&pagecache_lock);
return err;
}
/*
 * Link page into the page cache at (mapping, offset).
 *
 * The page must not be locked on entry (BUG() otherwise). Its
 * uptodate/error/dirty/referenced/arch_1 flag bits are cleared and
 * PG_locked is set in one store, so the page is handed back locked.
 * page_cache_get() is called on it, its index is set to offset, and it
 * is added to the mapping's page queue, the hash chain and the LRU.
 */
static inline void __add_to_page_cache(struct page * page,
	struct address_space *mapping, unsigned long offset,
	struct page **hash)
{
	if (PageLocked(page)) {
		BUG();
	}

	/* Single assignment: drop the stale state bits and lock the page. */
	page->flags = (page->flags & ~((1 << PG_uptodate) | (1 << PG_error) |
				       (1 << PG_dirty) | (1 << PG_referenced) |
				       (1 << PG_arch_1)))
		      | (1 << PG_locked);

	page_cache_get(page);

	/* This is the page-cache index originally passed down by the caller. */
	page->index = offset;

	add_page_to_inode_queue(mapping, page);
	add_page_to_hash_queue(page, hash);
	lru_cache_add(page);
}
mapping->a_ops->prepare_write开始执行,指向了ext2_prepare_write,代码如下:
/*
 * ext2's prepare_write method: delegate to block_prepare_write() with
 * ext2_get_block as the block-mapping callback. The file argument is
 * not used.
 */
static int ext2_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to)
{
	int rc;

	rc = block_prepare_write(page, from, to, ext2_get_block);
	return rc;
}
/*
 * Prepare the byte range [from, to) of page for writing, using get_block
 * to map file blocks.
 *
 * The work is done by __block_prepare_write() on the inode that owns the
 * page's mapping. On failure the page is marked not up to date and
 * kunmap()ed before the error is returned; on success the page is left
 * as __block_prepare_write() left it.
 *
 * Returns 0 on success or the error from __block_prepare_write().
 */
int block_prepare_write(struct page *page, unsigned from, unsigned to,
			get_block_t *get_block)
{
	int rc = __block_prepare_write(page->mapping->host, page, from, to,
				       get_block);

	if (!rc)
		return 0;

	ClearPageUptodate(page);
	kunmap(page);
	return rc;
}
/*
 * Prepare the buffers of page that overlap the byte range [from, to) for
 * a write.
 *
 * For each overlapping buffer: map it to a block with get_block if it is
 * not mapped yet; for a newly allocated block, zero the part outside
 * [from, to); for an existing block that is only partially overwritten
 * and not up to date, read it in from the device first.
 *
 * The page is kmap()ed here and not unmapped in this function: on success
 * generic_commit_write() does the kunmap(), on failure
 * block_prepare_write() does.
 *
 * Returns 0 on success, the error from get_block, or -EIO if a read did
 * not leave its buffer up to date.
 */
static int __block_prepare_write(struct inode *inode, struct page *page,
unsigned from, unsigned to, get_block_t *get_block)
{
unsigned block_start, block_end;
unsigned long block;
int err = 0;
unsigned blocksize, bbits;
/* wait[] collects buffers with a read in flight. Two slots suffice because
* a read is only issued for a buffer that sticks out of [from, to), which
* can only be the first and the last overlapping buffer. */
struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
char *kaddr = kmap(page);
blocksize = inode->i_sb->s_blocksize;
/* No buffer_heads attached to the page yet (e.g. a freshly allocated page):
* create them and link them into the page's buffer ring. */
if (!page->buffers)
create_empty_buffers(page, inode->i_dev, blocksize);
head = page->buffers;
bbits = inode->i_sb->s_blocksize_bits;
/* Block number of the page's first buffer, derived from page->index. */
block = page->index << (PAGE_CACHE_SHIFT - bbits);
/* Walk the circular b_this_page list once; "!block_start" lets the first
* iteration in even though bh == head at that point. */
for(bh = head, block_start = 0; bh != head || !block_start;
block++, block_start=block_end, bh = bh->b_this_page) {
if (!bh)
BUG();
block_end = block_start+blocksize;
/* Buffer lies entirely before the range: skip it. */
if (block_end <= from)
continue;
/* Buffer starts at or after the end of the range: done. */
if (block_start >= to)
break;
if (!buffer_mapped(bh)) {
/* get_block records the block mapping in bh, which ll_rw_block uses later.
* NOTE(review): the final argument 1 presumably requests allocation of a
* missing block - confirm against the get_block implementation. */
err = get_block(inode, block, bh, 1);
if (err)
goto out;
if (buffer_new(bh)) {
/* Newly allocated block: there is nothing to read from the device. */
unmap_underlying_metadata(bh);
if (Page_Uptodate(page)) {
set_bit(BH_Uptodate, &bh->b_state);
continue;
}
/* Zero the parts of the block that this write will not cover. */
if (block_end > to)
memset(kaddr+to, 0, block_end-to);
if (block_start < from)
memset(kaddr+block_start, 0, from-block_start);
if (block_end > to || block_start < from)
flush_dcache_page(page);
continue;
}
}
/* The whole page is already up to date, so this buffer is too. */
if (Page_Uptodate(page)) {
set_bit(BH_Uptodate, &bh->b_state);
continue;
}
/* The buffer is not up to date and the write covers only part of it:
* read the block from the device so the untouched bytes are valid.
* A fully overwritten buffer needs no read. */
if (!buffer_uptodate(bh) &&
(block_start < from || block_end > to)) {
ll_rw_block(READ, 1, &bh);
*wait_bh++=bh;
}
}
/*
* If we issued read requests - let them complete.
*/
while(wait_bh > wait) {
wait_on_buffer(*--wait_bh);
err = -EIO;
if (!buffer_uptodate(*wait_bh))
goto out;
}
return 0;
out:
return err;
}
create_empty_buffers,为该页面配备好相应的buffer_head结构,并建立起这个队列
/*
 * Attach a fresh ring of buffer_heads to page.
 *
 * create_buffers() supplies a NULL-terminated chain; every element gets
 * its device set to dev, its block number reset to 0 and its b_end_io
 * cleared. The chain is then closed into a ring (last->b_this_page =
 * first) and hung on page->buffers, and page_cache_get() is called on
 * the page.
 *
 * The page must not already have buffers (BUG() otherwise).
 */
static void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
{
	struct buffer_head *first, *cur, *last;

	first = create_buffers(page, blocksize, 1);
	if (page->buffers) {
		BUG();
	}

	cur = first;
	do {
		last = cur;
		cur->b_dev = dev;
		cur->b_blocknr = 0;
		cur->b_end_io = NULL;
		cur = cur->b_this_page;
	} while (cur != NULL);

	/* Close the chain into a ring and publish it on the page. */
	last->b_this_page = first;
	page->buffers = first;
	page_cache_get(page);
}
/*
 * Build a chain of buffer_heads describing page in pieces of `size` bytes.
 *
 * The loop walks offset downwards from PAGE_SIZE and pushes each new
 * buffer_head at the front of the chain, so the chain is linked through
 * b_this_page in ascending offset order and ends in NULL (the caller
 * create_empty_buffers() closes it into a ring).
 *
 * Returns the head of the chain.
 * NOTE(review): the no_grow failure path and the use of the try_again
 * label are elided ("......") in this excerpt and are not documented here.
 */
static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
{
struct buffer_head *bh, *head;
long offset;
try_again:
head = NULL;
offset = PAGE_SIZE;
while ((offset -= size) >= 0) {
bh = get_unused_buffer_head(async);
if (!bh)
goto no_grow;
bh->b_dev = B_FREE; /* Flag as unused */
bh->b_this_page = head;
head = bh;
bh->b_state = 0;
bh->b_next_free = NULL;
bh->b_pprev = NULL;
atomic_set(&bh->b_count, 0);
/* Key point: each buffer covers `size` bytes (the block size passed in). */
bh->b_size = size;
/* Ties the buffer to the page and sets b_data for this offset. */
set_bh_page(bh, page, offset);
bh->b_list = BUF_CLEAN;
bh->b_end_io = NULL;
}
return head;
......
}
/*
 * Bind buffer_head bh to page at byte offset `offset` within the page.
 *
 * Sets bh->b_page, then bh->b_data: for an ordinary page this is the
 * page's address plus offset; for a highmem page only the offset itself
 * is stored. offset must be below PAGE_SIZE (BUG() otherwise).
 */
void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
{
	bh->b_page = page;

	if (offset >= PAGE_SIZE) {
		BUG();
	}

	if (!PageHighMem(page)) {
		/* Real location of the data inside the page. */
		bh->b_data = page_address(page) + offset;
		return;
	}

	/*
	 * This catches illegal uses and preserves the offset:
	 */
	bh->b_data = (char *)(0 + offset);
}
返回到generic_file_write,继续执行mapping->a_ops->commit_write,对ext2来说这个指针指向generic_commit_write。需要注意的是,它并不立即把数据写到设备上,而只是把涉及的记录块缓冲区标记为"脏",真正写回设备要等到以后的"冲刷"(例如由bdflush完成)。代码如下:
int generic_commit_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
struct inode *inode = page->mapping->host;
loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
__block_commit_write(inode,page,from,to);
kunmap(page);
if (pos > inode->i_size) {
inode->i_size = pos;
mark_inode_dirty(inode);
}
return 0;
}
/*
 * Mark the buffers of page that overlap [from, to) as up to date and dirty.
 *
 * A buffer that was not dirty before is passed to __mark_dirty() and put
 * on the inode's buffer queue; if that happened at least once,
 * balance_dirty() is called afterwards. Nothing is written to the device
 * here.
 *
 * If, after this, no buffer outside the range is stale, the whole page is
 * marked up to date.
 *
 * Always returns 0.
 */
static int __block_commit_write(struct inode *inode, struct page *page,
		unsigned from, unsigned to)
{
	struct buffer_head *first, *cur;
	unsigned blocksize;
	unsigned start, end;
	int some_stale = 0;
	int newly_dirtied = 0;

	blocksize = inode->i_sb->s_blocksize;
	first = page->buffers;
	cur = first;
	start = 0;

	/* One full trip around the page's circular buffer list. */
	do {
		end = start + blocksize;

		if (end > from && start < to) {
			/* Buffer overlaps the written range. */
			set_bit(BH_Uptodate, &cur->b_state);
			if (!atomic_set_buffer_dirty(cur)) {
				/* Clean -> dirty transition. */
				__mark_dirty(cur);
				buffer_insert_inode_queue(cur, inode);
				newly_dirtied = 1;
			}
		} else if (!buffer_uptodate(cur)) {
			/* Untouched buffer that is not up to date. */
			some_stale = 1;
		}

		start = end;
		cur = cur->b_this_page;
	} while (cur != first || !start);

	/* At least one buffer went from clean to dirty; cur is back at the
	 * first buffer here. */
	if (newly_dirtied)
		balance_dirty(cur->b_dev);

	/*
	 * is this a partial write that happened to make all buffers
	 * uptodate then we can optimize away a bogus readpage() for
	 * the next read(). Here we 'discover' whether the page went
	 * uptodate as a result of this (potentially partial) write.
	 */
	if (!some_stale)
		SetPageUptodate(page);

	return 0;
}
至此,文件写就分析完了,page和buffer_head同时管理页面,page->buffers指向了buffer_head,bh->b_page指向了page。