我们都知道Linux为了加速读写速度,采用了pagecache机制,用内存缓存磁盘内容,而buffer_head正是连接page和磁盘块的关键结构.
1.buffer head的作用
1. buffer_head是磁盘块的一个抽象,一个buffer_head对应一个磁盘块,buffer_head中保存对应的磁盘块号(b_blocknr)
2. buffer_head把page与磁盘块联系起来,由于page和磁盘块的大小可能不一样,所以一个page可能管理多个buffer_head
这里假设page大小4K,块大小为1K, buffer_head,page和磁盘块关系如下:
2.page与磁盘块映射的建立
这里以写文件为例说明page cache,buffer_head和磁盘块的映射
采用缓冲IO(buffered IO,即非direct IO)方式写文件时,数据先写入page cache,之后再异步回写到磁盘,此时会调用到generic_perform_write函数
2.1 文件写流程
/*
 * Buffered-write loop (abridged excerpt: local declarations, iterator
 * advancement and most error handling are omitted).
 *
 * For each chunk of user data:
 *   1. ->write_begin() obtains the page and sets up the page / buffer_head /
 *      disk-block mapping,
 *   2. the user data is copied into that page,
 *   3. ->write_end() finishes the chunk (marks the buffers dirty so that the
 *      data is written back to disk later).
 *
 * Returns the number of bytes written if non-zero, otherwise the last status.
 */
static ssize_t generic_perform_write(struct file *file,
				struct iov_iter *i, loff_t pos)
{
	do {
		/* Set up the page <-> buffer_head <-> disk block mapping */
		status = a_ops->write_begin(file, mapping, pos, bytes, flags,
						&page, &fsdata);
		if (unlikely(status))
			break;
		/* Copy the user data into the page */
		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
		/*
		 * Mark the buffers dirty; the actual disk write happens
		 * asynchronously. The same page and fsdata obtained from
		 * ->write_begin() are handed back here.
		 */
		status = a_ops->write_end(file, mapping, pos, bytes, copied,
						page, fsdata);
	} while (iov_iter_count(i));
	return written ? written : status;
}
write_begin和write_end会调用到具体文件系统的实现,这里以ext4为例,ext4_write_begin:
/*
 * ext4 implementation of the ->write_begin address-space operation
 * (abridged excerpt: local declarations such as page, index and ret, and
 * any other steps, are not shown here).
 *
 * Obtains the page-cache page for the write, has __block_write_begin() set
 * up the page's buffer heads and their block mappings, and hands the page
 * back to the caller through *pagep.
 *
 * Returns the value returned by __block_write_begin().
 */
static int ext4_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
/* Get the page-cache page that backs this write */
page = grab_cache_page_write_begin(mapping, index, flags);
/*
 * Tie the page to its buffer heads and disk blocks; ext4_get_block is the
 * callback that allocates the actual disk space.
 */
ret = __block_write_begin(page, pos, len, ext4_get_block);
*pagep = page;
return ret;
}
每个inode都有一个address_space结构,不仅提供了文件系统层操作,还用一棵radix tree来管理inode所有page cache.
grab_cache_page_write_begin:首先会用index在mapping的radix tree中查找对应的page cache,找不到,创建新的页面.
而index表示page在文件中的偏移,单位是page_size。这里重点看__block_write_begin和ext4_get_block函数:
__block_write_begin:
1.给page分配buffer_head,由create_page_buffers完成
2.buffer_head与磁盘块的映射,由ext4_get_block完成
3.这里可能涉及到读磁盘,因为向page写入数据时,先要保证page已有的buffer数据与磁盘一致,否则会出现数据覆盖
/*
 * Prepare the buffers of @page for a write of @len bytes at @pos (abridged
 * excerpt: the declarations of head, bh, inode, from, to, wait, wait_bh,
 * err etc. and the computation of from/to are not shown here).
 *
 * Walks every buffer_head attached to the page and, for those overlapping
 * the write range, maps unmapped buffers through @get_block and starts a
 * disk read for buffers that are only partially overwritten and not yet
 * uptodate. Finally waits for all reads that were started.
 *
 * Returns err, which is set from a failing get_block() call or to -EIO when
 * a buffer that was read is still not uptodate after waiting.
 */
int __block_write_begin(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block)
{
/* Make sure the page has buffer heads attached */
head = create_page_buffers(page, inode, 0);
blocksize = head->b_size;
bbits = block_size_bits(blocksize);
/*
 * Convert the page index into the block number, within the file, of the
 * page's first block (this is NOT a disk block number).
 */
block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
/*
 * The buffers form a circular list through b_this_page; "!block_start"
 * lets the first iteration run even though bh == head at that point.
 */
for(bh = head, block_start = 0; bh != head || !block_start;
block++, block_start=block_end, bh = bh->b_this_page) {
block_end = block_start + blocksize;
/*
 * This buffer lies completely outside the write range [from, to):
 * no mapping or read is needed; only propagate the page's uptodate
 * state to the buffer.
 */
if (block_end <= from || block_start >= to) {
if (PageUptodate(page)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
}
continue;
}
if (buffer_new(bh))
clear_buffer_new(bh);
/* Buffer has no disk block yet: ask the filesystem to map/allocate one */
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
err = get_block(inode, block, bh, 1);
if (err)
break;
}
/* The whole page is already uptodate, so no read is needed for this buffer */
if (PageUptodate(page)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
continue;
}
/*
 * The buffer is not uptodate and the write covers only part of it
 * (and it is not flagged delay or unwritten): its current contents
 * must be read from disk first.
 */
if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
!buffer_unwritten(bh) &&
(block_start < from || block_end > to)) {
ll_rw_block(READ, 1, &bh);/* refresh the page cache; otherwise the part of the block not being written would be overwritten with stale data */
*wait_bh++=bh;
}
}
/* Wait for every read started above to complete */
while(wait_bh > wait) {
wait_on_buffer(*--wait_bh);
if (!buffer_uptodate(*wait_bh))
err = -EIO;
}
return err;
}
2.2 buffer head分配
create_page_buffers判断当前page是否已分配buffer_head,若尚未分配,则调用create_empty_buffers创建buffer_head
此时创建完成的buffer_head并没有映射到具体的磁盘块
/*
 * Allocate a set of buffer heads for @page, each covering @blocksize bytes,
 * OR @b_state into every buffer's state, close the b_this_page chain into a
 * ring, and attach the ring to the page. The new buffers are not mapped to
 * any disk block here.
 */
void create_empty_buffers(struct page *page,
			unsigned long blocksize, unsigned long b_state)
{
	/* Chain of new buffer heads, terminated by a NULL b_this_page */
	struct buffer_head *head = alloc_page_buffers(page, blocksize, 1);
	struct buffer_head *cur = head;
	struct buffer_head *last;

	/* Apply the initial state bits and remember the final element */
	for (;;) {
		cur->b_state |= b_state;
		last = cur;
		cur = cur->b_this_page;
		if (!cur)
			break;
	}

	/* Link the last buffer back to the first to make the list circular */
	last->b_this_page = head;

	/* Associate the buffer ring with the page */
	attach_page_buffers(page, head);
}
2.3 buffer head 磁盘空间分配
ext4_get_block主要分配磁盘空间,并调用map_bh建立buffer_head与磁盘块的映射
/*
 * Bind buffer head @bh to disk block number @block on the block device of
 * superblock @sb: sets the buffer's mapped flag and records the device,
 * the block number and the filesystem block size in the buffer head.
 */
static inline void
map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block)
{
set_buffer_mapped(bh);
bh->b_bdev = sb->s_bdev;
bh->b_blocknr = block;
bh->b_size = sb->s_blocksize;
}
到这里,page,buffer_head和磁盘块的映射关系建立完成,之后的流程就是等待write数据异步写到磁盘
3 BIO提交
当buffer_head建立好后,就可以直接发起bio操作,这里以读流程来说明:
bh_submit_read是同步读函数,提交读请求后会一直等待,直到buffer_head变为unlock状态(即IO完成):
/*
 * Synchronously read the block behind @bh. The caller must pass the buffer
 * locked (enforced with BUG_ON).
 *
 * Returns 0 if the buffer is uptodate (either already, or after the read),
 * -EIO otherwise.
 */
int bh_submit_read(struct buffer_head *bh)
{
	BUG_ON(!buffer_locked(bh));

	if (!buffer_uptodate(bh)) {
		/* Hold a reference across the I/O */
		get_bh(bh);
		/* Completion callback invoked when the read finishes */
		bh->b_end_io = end_buffer_read_sync;
		submit_bh(READ, bh);
		/* Sleep until the buffer is unlocked again */
		wait_on_buffer(bh);
		return buffer_uptodate(bh) ? 0 : -EIO;
	}

	/* Contents already match the disk: no bio needed, just drop the lock */
	unlock_buffer(bh);
	return 0;
}
submit_bh直接调用submit_bh_wbc函数发起bio
可以看到,到了bio层,就没有buffer head这个概念了,直接用page和bi_sector来操作对应的块
/*
 * Build a single-segment bio describing buffer head @bh and submit it.
 *
 * @rw:        read/write request flags passed on to submit_bio()
 * @bh:        buffer head to transfer; supplies device, block number, page,
 *             in-page offset and size
 * @bio_flags: extra flags OR-ed into bio->bi_flags
 * @wbc:       optional writeback control; when non-NULL the bio is
 *             initialised from it and the I/O is accounted to it
 *
 * Returns ret, which is always 0 in this excerpt.
 */
static int submit_bh_wbc(int rw, struct buffer_head *bh,
			unsigned long bio_flags, struct writeback_control *wbc)
{
	struct bio *bio;
	int ret = 0;

	/* Allocate a bio with room for one segment */
	bio = bio_alloc(GFP_NOIO, 1);
	if (wbc) {
		wbc_init_bio(wbc, bio);
		wbc_account_io(wbc, bh->b_page, bh->b_size);
	}

	/* Block number -> 512-byte sector number (b_size >> 9 sectors per block) */
	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio_set_dev(bio, bh->b_bdev);
	bio->bi_io_vec[0].bv_page = bh->b_page;
	bio->bi_io_vec[0].bv_len = bh->b_size;
	/* Offset of this buffer within its page */
	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
	bio->bi_vcnt = 1;
	bio->bi_size = bh->b_size;
	/* Completion handler; it gets the buffer head back through bi_private */
	bio->bi_end_io = end_bio_bh_io_sync;
	bio->bi_private = bh;
	bio->bi_flags |= bio_flags;

	/*
	 * Safety check on the request; judging by the helper's name this
	 * guards against I/O past the end of the device (its body is not
	 * shown here - confirm against fs/buffer.c).
	 */
	guard_bh_eod(rw, bio, bh);

	if (buffer_meta(bh))
		rw |= REQ_META;
	if (buffer_prio(bh))
		rw |= REQ_PRIO;

	/* Hold an extra reference on the bio across the submission */
	bio_get(bio);
	/* Hand the bio to the block layer */
	submit_bio(rw, bio);
	bio_put(bio);
	return ret;
}
4.buffer_head的状态
BH_Uptodate:表示BH的数据是最新的,甚至比磁盘还新(uptodate|dirty)
BH_Dirty: BH数据是脏的,需要回刷到磁盘块
BH_Lock: BH正在进行IO操作
BH_Mapped: BH建立了磁盘映射