Cache有两种:Page Cache 和Buffer Cache;
应用层----》VFS----》Page Cache ===Buffer Cache ----》具体文件系统----》Disk
Page Cache: 虚拟文件系统用Page Cache与用户态buf交换数据。
Buffer Cache: 具体文件系统用buffer cache与磁盘块交换数据。
数据结构:
/*
 * Excerpt of struct page as used in these notes: only the member that
 * matters for the buffer cache is listed.
 */
struct page {
	unsigned long private; /* per these notes: head pointer of the page's buffer_head list */
	/* ... remaining members omitted ... */
};
/*
 * Excerpt of struct buffer_head as used in these notes; members that the
 * notes do not discuss are omitted.
 */
struct buffer_head {
	char *b_data;                /* data pointer; the original note only says it is related to the buffer_head's state */
	atomic_t b_count;            /* reference count of this buffer */
	size_t b_size;               /* block size */
	struct block_device *b_bdev;
	struct page *b_page;
	unsigned long b_state;       /* state of the buffer */
	/* ... remaining members omitted ... */
};
/* buffer_head state flags (presumably bit numbers within b_state). */
#define BH_Uptodate 0 /* buffer contains valid data */
#define BH_Dirty 1 /* buffer is dirty */
#define BH_Lock 2 /* buffer is locked */
#define BH_Req 3 /* buffer is not usable (wording of the original note) */
#define BH_Mapped 4 /* buffer is mapped to disk */
#define BH_New 5 /* buffer is new and not yet written out */
#define BH_Protected 6 /* buffer is protected */
buffer_head对象管理:
bh_cachep=kmem_cache_create("buffer_head",sizeof(struct buffer_head),0,
(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD),NULL);
max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)),其中nrpages为可用于缓冲区的空闲页数的10%,PAGE_SIZE通常为4K
struct buffer_head* alloc_buffer_head(gfp_t gfp_flags); // 分配buffer_head
-->kmem_cache_alloc(bh_cachep,gfp_flags|__GFP_ZERO);
void free_buffer_head(struct buffer_head*bh); //释放buffer_head
-->kmem_cache_free(bh_cachep,bh);
文件系统IO流程:(写过程跟踪)
接下来调用unlock_page()将锁定的页面解锁(即清除PG_locked标志位)。
如果文件大小发生了改变,则还需要更新索引节点(inode)中的文件大小成员变量i_size。
应用层----》VFS----》Page Cache ===Buffer Cache ----》具体文件系统----》Disk
Page Cache: 虚拟文件系统用Page Cache与用户态buf交换数据。
Buffer Cache: 具体文件系统用buffer cache与磁盘块交换数据。
数据结构:
/*
 * Excerpt of struct page as used in these notes: only the member that
 * matters for the buffer cache is listed.
 */
struct page {
	unsigned long private; /* per these notes: head pointer of the page's buffer_head list */
	/* ... remaining members omitted ... */
};
struct buffer_head{
char *data; //buffer_head状态相关
(1,未使用,该对象可用,b_data为NULL,
2,空闲,b_data指向一个空闲的缓存区,即缓存区没与与块设备中的数据块对应,
3,正在使用状态,b_data指向一个正在使用中的缓存区,
4,异步状态,b_data指向一个用来实现page I/O的临时缓存区)sector_t b_blocknr;//本缓存区对应的块号
atomic_t b_count;//本缓存区的引用计数
size_t b_size;//块大小
struct block_device *b_bdev;
struct page*b_page;
unsigned long b_state;//缓存区的状态
...
}
/* buffer_head state flags (presumably bit numbers within b_state). */
#define BH_Uptodate 0 /* buffer contains valid data */
#define BH_Dirty 1 /* buffer is dirty */
#define BH_Lock 2 /* buffer is locked */
#define BH_Req 3 /* buffer is not usable (wording of the original note) */
#define BH_Mapped 4 /* buffer is mapped to disk */
#define BH_New 5 /* buffer is new and not yet written out */
#define BH_Protected 6 /* buffer is protected */
buffer_head对象管理:
bh_cachep=kmem_cache_create("buffer_head",sizeof(struct buffer_head),0,
(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD),NULL);
max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)),其中nrpages为可用于缓冲区的空闲页数的10%,PAGE_SIZE通常为4K
struct buffer_head* alloc_buffer_head(gfp_t gfp_flags); // 分配buffer_head
-->kmem_cache_alloc(bh_cachep,gfp_flags|__GFP_ZERO);
void free_buffer_head(struct buffer_head*bh); //释放buffer_head
-->kmem_cache_free(bh_cachep,bh);
文件系统IO流程:(写过程跟踪)
应用层系统调用:read/write
// write system-call entry point as quoted in these notes.
// Resolves fd to a struct file, calls vfs_write() at the file's current
// position and stores the updated position back. Returns -EBADF when no
// file is found for fd, otherwise whatever vfs_write() returned.
// The bracketed tags in the code ([数据] = data, [大小] = size) are the note
// author's inline annotations, not C syntax.
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf[数据],size_t, count[大小])
{
struct file *file;
ssize_t ret = -EBADF;
int fput_needed;
file = fget_light(fd, &fput_needed);// look up the struct file that corresponds to fd
if (file) {
loff_t pos = file_pos_read(file);// fetch the current file position
ret = vfs_write(file, buf[数据], count[大小], &pos);
file_pos_write(file, pos);// store the updated position back
fput_light(file, fput_needed);
}
return ret;
}
// VFS-level write as quoted in these notes (checks and bookkeeping elided).
// Calls the file's own f_op->write method when one is set and falls back to
// do_sync_write() otherwise; the result of that call is returned.
// The bracketed tags ([数据] = data, [大小] = size) are the note author's
// inline annotations, not C syntax.
ssize_t vfs_write(struct file *file, const char __user *buf[数据], size_t count[大小], loff_t *pos)
{
ssize_t ret;
...// various checks (elided)
if (file->f_op->write)
ret = file->f_op->write(file, buf, count, pos); // author's note: concrete filesystems (ext2/3/4, xfs) all set this to do_sync_write/read
else
ret = do_sync_write(file, buf, count, pos);
...
return ret;
}
// Synchronous write helper as quoted in these notes (parts elided).
// Wraps the user buffer in a single-element iovec, sets up a kiocb for the
// file with the position and length, and hands both to the file's
// f_op->aio_write method.
// The bracketed tags ([数据] = data, [大小] = size) are the note author's
// inline annotations, not C syntax.
ssize_t do_sync_write(struct file *filp, const char __user *buf[数据], size_t len[大小], loff_t *ppos)
{
struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };// data, size
struct kiocb kiocb;
ssize_t ret;
init_sync_kiocb(&kiocb, filp);// author's note: sets kiocb.ki_filp=filp (the file)
kiocb.ki_pos = *ppos;
kiocb.ki_left = len;
kiocb.ki_nbytes = len;
...
ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
// Filesystem-specific asynchronous I/O entry point; ext2/3/4 set it to generic_file_aio_read/write.
// NOTE(review): the next line is the continuation of the note above
// ("xfs sets it to xfs_file_aio_read/write") that was left outside the comment.
xfs实例化为xfs_file_aio_read/write.
...
}
// Author's note: concrete filesystems all set their aio_write/aio_read to generic_file_aio_write/read.
// NOTE(review): elsewhere these notes say xfs uses xfs_file_aio_read/write instead — confirm.
//
// Performs the write via __generic_file_aio_write() while holding
// inode->i_mutex, then calls generic_write_sync() when the write returned a
// positive count or -EIOCBQUEUED. A sync error replaces a positive result.
// The bracketed tags ([文件] = file, [数据] = data, [数据块] = data block,
// [数据块个数] = number of data blocks) are the note author's inline
// annotations, not C syntax.
ssize_t generic_file_aio_write(struct kiocb *iocb[文件], const struct iovec *iov[数据],unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
ssize_t ret;
......
mutex_lock(&inode->i_mutex);
ret = __generic_file_aio_write(iocb[文件], iov[数据块], nr_segs[数据块个数], &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
if (ret > 0 || ret == -EIOCBQUEUED) {
ssize_t err;
err = generic_write_sync(file, pos, ret);// perform the sync; per the author this wraps vfs_fsync_range
if (err < 0 && ret > 0)
ret = err;
}
return ret;
}
// The function that actually writes the data.
// iocb: I/O state structure (file, offset, ...); iov: array of data vectors;
// nr_segs: number of entries in that array; ppos: offset.
// Only the dispatch is shown here: files opened with O_DIRECT go through
// generic_file_direct_write(), everything else through
// generic_file_buffered_write(). Declarations and the return are elided.
ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,unsigned long nr_segs, loff_t *ppos)
{
......
pos = *ppos;
......
/* With O_DIRECT set, write the data straight to disk, bypassing the filesystem buffer */
if (unlikely(file->f_flags & O_DIRECT)) {
loff_t endbyte;
ssize_t written_buffered;
// write directly to disk
written = generic_file_direct_write(iocb, iov, &nr_segs, pos,ppos, count, ocount);
......
} else { // write into the buffer cache
written = generic_file_buffered_write(iocb, iov, nr_segs,pos, ppos, count, written);
}
......
}
// Regular buffered write.
// Builds an iov_iter over the caller's iovec array and passes it to
// generic_perform_write(); the handling of the result is elided.
// The bracketed tags ([要写入的字节数] = bytes to write, [已写入的字节数] =
// bytes already written) are the note author's inline annotations, not C syntax.
ssize_t generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos, loff_t *ppos,size_t count[要写入的字节数], ssize_t written[已写入的字节数])
{
struct file *file = iocb->ki_filp;// the file
ssize_t status;
struct iov_iter i;// I/O data vector iterator
iov_iter_init(&i, iov, nr_segs, count, written); // author's note: i->iov=iov;i->nr_segs=nr_segs;i->count=count+written;
status = generic_perform_write(file, &i, pos);
......
}
// Core loop of the buffered write path as quoted in these notes (parts elided).
// For each piece of at most one page: ask the filesystem for the target page
// via a_ops->write_begin(), copy the user data into it, hand the page back
// via a_ops->write_end(), then advance the iterator. The loop runs while
// iov_iter_count(i) is non-zero and stops early on a write_begin/write_end
// error. Returns written when it is non-zero, otherwise status.
// The bracketed tags ([文件] = file, [IO数据] = I/O data) are the note
// author's inline annotations, not C syntax.
static ssize_t generic_perform_write(struct file *file[文件],
struct iov_iter *i[IO数据], loff_t pos)
{
struct address_space *mapping = file->f_mapping; // address space: the structure that organises and manages the cached pages
const struct address_space_operations *a_ops = mapping->a_ops;
long status = 0;
ssize_t written = 0;
unsigned int flags = 0;
......
do {
struct page *page;
pgoff_t index; /* Pagecache index for current page */
unsigned long offset; /* Offset into pagecache page */
unsigned long bytes; /* Bytes to write to page */
size_t copied; /* Bytes copied from user */
void *fsdata;
offset = (pos & (PAGE_CACHE_SIZE - 1)); // (page size: 4K)
index = pos >> PAGE_CACHE_SHIFT; // (page shift: 12 bits)
bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, iov_iter_count(i));
again:
......
status = a_ops->write_begin(file, mapping, pos, bytes, flags,
&page, &fsdata); // the concrete filesystem looks up or allocates the cache page
if (unlikely(status))
break;
if (mapping_writably_mapped(mapping)) // author's note: mapping->i_mmap_writable!=0, i.e. whether the page may have been modified from user space
flush_dcache_page(page); // author's note: flush caches (cache, buffer) related to this page back to the page
pagefault_disable(); // disable page faults
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); // copy the data into the cache page
pagefault_enable(); // re-enable page faults
flush_dcache_page(page);
mark_page_accessed(page);// mark the page as accessed
status = a_ops->write_end(file, mapping, pos, bytes, copied,
page, fsdata); // author's note: tells the concrete filesystem to commit the cached data page to disk
// NOTE(review): the closing notes of this file describe write_end as marking buffers dirty, not writing to disk — confirm
if (unlikely(status < 0))
break;
copied = status;
cond_resched();// reschedule if needed
iov_iter_advance(i, copied); // advance the iterator by the number of bytes completed
......
balance_dirty_pages_ratelimited(mapping);// dirty-page handling
} while (iov_iter_count(i));//i->count
return written ? written : status;
}
write_begin()-->ext2_write_begin()/xfs_vm_write_begin()[都是block_write_begin的封装]
write_end()--->ext2_write_end()/xfs_vm_write_end()[都是generic_write_end()-->block_write_end()的封装]
// Prepares a page-cache page for a write of len bytes at pos (excerpt; the
// tail of the function, including the `out` label and the return, is elided).
// If the caller passed no page in *pagep, one is grabbed from the mapping,
// stored in *pagep and ownpage is set; -ENOMEM is the status when that fails.
// A caller-supplied page must already be locked (BUG_ON otherwise).
// __block_prepare_write() is then called for the in-page range [start, end).
int block_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata,
get_block_t *get_block)
{
struct inode *inode = mapping->host;
int status = 0;
struct page *page;
pgoff_t index;
unsigned start, end;
int ownpage = 0;
index = pos >> PAGE_CACHE_SHIFT; // author's note: shift is 12
start = pos & (PAGE_CACHE_SIZE - 1); // author's note: page size is 4k
end = start + len;
page = *pagep;
if (page == NULL) {
ownpage = 1;
// find or allocate a page in the address space
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
status = -ENOMEM;
goto out;
}
*pagep = page;
} else
BUG_ON(!PageLocked(page));
// allocate and initialise a set of buffer_heads for the page
status = __block_prepare_write(inode, page, start, end, get_block);
......
}
generic_write_end()是通用的页面写入完成处理函数,它首先调用block_write_end()将页面中刚刚写入的缓冲区标记为脏(BH_Dirty)。
接下来调用unlock_page()将锁定的页面解锁(即清除PG_locked标志位)。
如果文件大小发生了改变,则还需要更新索引节点(inode)中的文件大小成员变量i_size。