Kernel源码笔记之VFS：7.写文件

苟浩

已于 2022-05-29 19:41:13 修改

阅读量427

点赞数

分类专栏： Kernel 文章标签： kernel

于 2022-03-30 08:49:27 首次发布

本文链接：https://blog.csdn.net/jakelylll/article/details/123741908

版权

Kernel 专栏收录该内容

46 篇文章 6 订阅

订阅专栏

Kernel源码笔记目录

写文件

源码基于stable-5.10.102

写文件的系统调用，以及前面几个函数和读文件差不多，我们直接从generic_perform_write开始看

ssize_t generic_perform_write(struct file *file,
				struct iov_iter *i, loff_t pos)
{
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	long status = 0;
	ssize_t written = 0;
	unsigned int flags = 0;

	do {
		struct page *page;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */
		void *fsdata;

        // 算出pos在页内的偏移，相当于 offset = pos % PAGE_SIZE
		offset = (pos & (PAGE_SIZE - 1));

        // 可写的数据量
		bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_count(i));

again:
		// 检查用户空间地址可读? todo: 没太看懂
		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}

        // 有信号要处理
		if (fatal_signal_pending(current)) {
			status = -EINTR;
			break;
		}

        // 为写做准备,主要是把需要写的页面准备好,如果内存里还没有,就要从磁盘上读
		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
						&page, &fsdata);
		if (unlikely(status < 0))
			break;

		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

        // 从用户空间向页里写数据
		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
		flush_dcache_page(page);

        // 写结束
		status = a_ops->write_end(file, mapping, pos, bytes, copied,
						page, fsdata);
		if (unlikely(status < 0))
			break;

        // 写入的数量
		copied = status;

        // 检查是否需要调度,则让出cpu
		cond_resched();

        // 从count里减去已经复制的数据,里面写的比较复杂,不知道为啥要这么写
		iov_iter_advance(i, copied);
		if (unlikely(copied == 0)) {
			// 如果没有复制到数据,重新计算bytes?
			bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_single_seg_count(i));
			goto again;
		}

        // 累加相关计数器
		pos += copied;
		written += copied;

        // 刷新脏页,如果脏页太多,就会写磁盘
		balance_dirty_pages_ratelimited(mapping);
	} while (iov_iter_count(i));

	return written ? written : status;
}

write_begin函数, 大多数文件系统都会调用到block_write_begin.

int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
		unsigned flags, struct page **pagep, get_block_t *get_block)
{
    // 计算pos对应的页序号,相当于: index = pos / PAGE_SIZE
	pgoff_t index = pos >> PAGE_SHIFT;
	struct page *page;
	int status;

    // 获取index对应的页面
	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;

    // 与块设备上的页面同步
	status = __block_write_begin(page, pos, len, get_block);
	if (unlikely(status)) {
		unlock_page(page);
		put_page(page);
		page = NULL;
	}

    // 用pagep返回找到的page
	*pagep = page;
	return status;
}
struct page *grab_cache_page_write_begin(struct address_space *mapping,
					pgoff_t index, unsigned flags)
{
	struct page *page;
	int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;

	if (flags & AOP_FLAG_NOFS)
		fgp_flags |= FGP_NOFS;

    // pagecache_get_page用来查找/创建对应的页面，如果缓存里有页面，就
    // 直接返回，否则就创建一个并加入对缓存里
    // 这个代码详见读文件的解释
	page = pagecache_get_page(mapping, index, fgp_flags,
			mapping_gfp_mask(mapping));
	if (page)
        // 如果需要就等待页写回完成
		wait_for_stable_page(page);

	return page;
}

// __block_write_begin直接调用 __block_write_begin_int(page, pos, len, get_block, NULL)
int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
		get_block_t *get_block, struct iomap *iomap)
{
    // 算出页内偏移, 相当于: from = pos % PAGE_SIZE
	unsigned from = pos & (PAGE_SIZE - 1);

    // 数据终点
	unsigned to = from + len;
	struct inode *inode = page->mapping->host;
	unsigned block_start, block_end;
	sector_t block;
	int err = 0;
	unsigned blocksize, bbits;
	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

	BUG_ON(!PageLocked(page));
	BUG_ON(from > PAGE_SIZE);
	BUG_ON(to > PAGE_SIZE);
	BUG_ON(from > to);

    // 获取/创建页的buffer_head, 存在page->private里
	head = create_page_buffers(page, inode, 0);

    // 块大小
	blocksize = head->b_size;

    // 块大小对应的位
	bbits = block_size_bits(blocksize);

    // 对应的块号
	block = (sector_t)page->index << (PAGE_SHIFT - bbits);

    // 遍历页缓冲区所有的页,bh->b_this_page存储着下一页的指针
	for(bh = head, block_start = 0; bh != head || !block_start;
	    block++, block_start=block_end, bh = bh->b_this_page) {
        
        // 一块数据的终点
		block_end = block_start + blocksize;

        // 如果这块数据不在页的范围内,继续循环
		if (block_end <= from || block_start >= to) {
            // 如果页和buffer_head的最新标志不同,则把buffer_head改成和页相同的
            // todo: 为什么要在这里判断??
			if (PageUptodate(page)) {
				if (!buffer_uptodate(bh))
					set_buffer_uptodate(bh);
			}
			continue;
		}
        // 清除new标志
		if (buffer_new(bh))
			clear_buffer_new(bh);

        // 如果bh还没映射,则映射之
		if (!buffer_mapped(bh)) {
			WARN_ON(bh->b_size != blocksize);

            // get_block是各个文件系统传进来用于获取块的函数,
            // 将块上的内容,填到页面上
			if (get_block) {
				err = get_block(inode, block, bh, 1);
				if (err)
					break;
			} else {
				iomap_to_bh(inode, block, bh, iomap);
			}

			if (buffer_new(bh)) { // 如果是新映射的
				clean_bdev_bh_alias(bh);

				if (PageUptodate(page)) {
                    // 如果页内容是新的,则清除相应标志,继续循环
					clear_buffer_new(bh);
					set_buffer_uptodate(bh);
					mark_buffer_dirty(bh);
					continue;
				}

                // 如果块的起始点和页的起始点重合,则把不需要的位置都清0
				if (block_end > to || block_start < from)
					zero_user_segments(page,
						to, block_end,
						block_start, from);
				continue;
			}
		}
        // 同步bh和页面的uptodate标志, 继续循环
		if (PageUptodate(page)) {
			if (!buffer_uptodate(bh))
				set_buffer_uptodate(bh);
			continue; 
		}

        // 如果数据不是最新的,发起读操作
		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
		    !buffer_unwritten(bh) &&
		     (block_start < from || block_end > to)) {
            // 这个从磁盘上直接读
			ll_rw_block(REQ_OP_READ, 0, 1, &bh);
			*wait_bh++=bh;
		}
	}
	
    // 有等待读盘的buffer,要等他们完成
	while(wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
        // 如果读完,数据还不是最新的,那就出错了
		if (!buffer_uptodate(*wait_bh))
			err = -EIO;
	}
    // 如果有错误,就把各个页面清0
	if (unlikely(err))
		page_zero_new_buffers(page, from, to);
	return err;
}

int generic_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	loff_t old_size = inode->i_size;
	bool i_size_changed = false;

    // 向磁盘提交写请求
	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

    // 如果写入的数据超过了原来的大小,则修改inode->i_size
	if (pos + copied > inode->i_size) {
		i_size_write(inode, pos + copied);
		i_size_changed = true;
	}

	unlock_page(page);
	put_page(page);

    // 把页标脏
	if (old_size < pos)
		pagecache_isize_extended(inode, old_size, pos);
	
    // 如果大小改变了,把inode标脏
	if (i_size_changed)
		mark_inode_dirty(inode);
	return copied;
}

int block_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	unsigned start;

    // 数据起点
	start = pos & (PAGE_SIZE - 1);

    // 如果复制数据失败,则把已经复制的清0
	if (unlikely(copied < len)) {
		if (!PageUptodate(page))
			copied = 0;

		page_zero_new_buffers(page, start+copied, start+len);
	}

    // 刷新dcache缓存??
	flush_dcache_page(page);

	__block_commit_write(inode, page, start, start+copied);

	return copied;
}

static int __block_commit_write(struct inode *inode, struct page *page,
		unsigned from, unsigned to)
{
	unsigned block_start, block_end;
	int partial = 0;
	unsigned blocksize;
	struct buffer_head *bh, *head;

	bh = head = page_buffers(page);
	blocksize = bh->b_size;

	block_start = 0;
	do {
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (!buffer_uptodate(bh))
				partial = 1;
		} else {
			set_buffer_uptodate(bh);
			mark_buffer_dirty(bh);
		}
		clear_buffer_new(bh);

		block_start = block_end;
		bh = bh->b_this_page;
	} while (bh != head);

	/*
	 * If this is a partial write which happened to make all buffers
	 * uptodate then we can optimize away a bogus readpage() for
	 * the next read(). Here we 'discover' whether the page went
	 * uptodate as a result of this (potentially partial) write.
	 */
	if (!partial)
		SetPageUptodate(page);
	return 0;
}

苟浩

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Kernel源码笔记之VFS：7.写文件

写文件写文件的系统调用，以及前面几个函数和读文件差不多，我们直接从generic_perform_write开始看ssize_t generic_perform_write(struct file *file, struct iov_iter *i, loff_t pos){ struct address_space *mapping = file->f_mapping; const struct address_space_operations *a_ops = mapping-&
复制链接

扫一扫