6: The page cache layer
The core structure of the page cache is struct address_space, shown below:
struct address_space {
//owner of this page cache
struct inode *host; /* owner: inode, block_device */
//radix tree holding all of the owner's pages
struct radix_tree_root page_tree; /* radix tree of all pages */
//spinlock protecting the tree
spinlock_t tree_lock; /* and spinlock protecting it */
//number of shared memory mappings in this address space
unsigned int i_mmap_writable;/* count VM_SHARED mappings */
//root of the priority search tree of mappings
struct prio_tree_root i_mmap; /* tree of private and shared mappings */
//list of non-linear memory regions in this address space
struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
//spinlock protecting the priority search tree
spinlock_t i_mmap_lock; /* protect tree, count, list */
//counter used when truncating the file
atomic_t truncate_count; /* Cover race condition with truncate */
//total number of the owner's pages
unsigned long nrpages; /* number of total pages */
//page index where the last writeback left off
pgoff_t writeback_index;/* writeback starts here */
//methods of this page cache
struct address_space_operations *a_ops; /* methods */
//error bits and memory allocator flags
unsigned long flags; /* error bits/gfp mask */
//backing_dev_info of the owner's block device
struct backing_dev_info *backing_dev_info; /* device readahead, etc */
//spinlock protecting private_list
spinlock_t private_lock; /* for use by the address_space */
//private list of the page cache
struct list_head private_list; /* ditto */
//address_space the private buffers are associated with
struct address_space *assoc_mapping; /* ditto */
}
Several fields of the page descriptor relate to the page cache: page->mapping points to the address_space that owns the page, and page->index gives the page-sized offset of the page within the owner's data.
Unlike the 2.4 kernel, which kept all cached pages in one global hash table, 2.6 organizes them in a radix tree. Looking up, inserting, updating, and searching for pages in a particular state are very frequent operations, and the radix tree is a more efficient structure for them, so the page cache operations deserve a closer look.
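To illustrate the two page-descriptor fields just described, here is a hypothetical helper (illustrative only, not kernel code) that computes the byte offset of a cached page inside its owning file; PAGE_CACHE_SHIFT is the log2 of the page size:
/* hypothetical helper, for illustration only */
static inline loff_t page_file_offset(struct page *page)
{
        /* page->index is a page-sized offset within page->mapping's data */
        return (loff_t)page->index << PAGE_CACHE_SHIFT;
}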
6.1: Structure of the radix tree
radix_tree_root is defined as follows:
//root of the radix tree
struct radix_tree_root {
//depth of the tree
unsigned int height;
//memory allocation flags
int gfp_mask;
//first node hanging below the root
struct radix_tree_node *rnode;
}
radix_tree_node is defined as follows:
struct radix_tree_node {
//number of non-empty slots
unsigned int count;
//RADIX_TREE_MAP_SIZE slots
void *slots[RADIX_TREE_MAP_SIZE];
//tag array: two bitmaps of 64 bits each, one for PG_dirty and one for PG_writeback
unsigned long tags[RADIX_TREE_TAGS][RADIX_TREE_TAG_LONGS];
}
The relationship between these structures is illustrated by a figure in <<Understanding the Linux Kernel, 3rd edition>> [figure not reproduced here].
In short, every node has 64 slots. In an intermediate node each slot points to another radix_tree_node; in a leaf node each slot points to a page structure.
Now consider: given a page index, how do we find the corresponding node in the radix tree?
Look at it from the bottom up. Each leaf node holds 64 entries; entry 64 spills over into the second slot of its parent, entry 64*2 into the third slot of the parent, and so on. Once all 64 slots of a parent are exhausted, we move on to the second slot of the grandparent, and so forth. The following rule falls out naturally:
split the index into 6-bit groups from the least significant bit upwards (2^6 = 64). The lowest group selects the slot at the deepest level, the next group selects the slot one level up, and so on. For example, in the two-level case shown on the right of the figure, the low 6 bits of the index select the slot in the second (bottom) level and the next 6 bits select the slot in the first (top) level.
From this it also follows that, since 32 = 6*5 + 2, a tree over 32-bit indices has at most 6 levels, and in the 6-level case the topmost level uses only 2 bits of the index.
Each node has 64 slots and each slot leads to one child, so a radix tree of depth height can address indices up to 2^(6*height) - 1 (the -1 is because indices start at 0), i.e. at most 2^(6*height) pages; any larger index forces the tree to grow another level.
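To make this arithmetic concrete, here is a small sketch (assuming RADIX_TREE_MAP_SHIFT is 6 as described above; illustrative code, not the kernel's exact implementation, which precomputes a lookup table and clamps the result to ~0UL near the word size):
#define RADIX_TREE_MAP_SHIFT 6
#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) /* 64 slots */
#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE - 1)

/* largest index a tree of the given height can address */
static unsigned long maxindex(unsigned int height)
{
        return height ? (1UL << (RADIX_TREE_MAP_SHIFT * height)) - 1 : 0;
}

/* slot offset consumed at the level whose remaining shift is `shift` */
static int slot_offset(unsigned long index, unsigned int shift)
{
        return (index >> shift) & RADIX_TREE_MAP_MASK;
}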
6.2: Radix tree tags
The kernel frequently needs to walk a file owner's dirty pages and write them back to disk, so it needs an equally efficient way of finding dirty pages in the radix tree. The same applies to pages currently under writeback. Traversing the whole radix tree the ordinary way would be unbearably slow, so the two-dimensional tags array was added to radix_tree_node to summarize the state of the nodes below. Note that, as analyzed above, tags can be viewed as two 64-bit bitmaps in which each bit describes one of the 64 child slots; one bitmap corresponds to PG_dirty, the other to PG_writeback.
In the kernel, radix_tree_tag_set()/radix_tree_tag_clear() set and clear the tag of a given node in the radix tree. Both are analyzed below to deepen the understanding of how tags are maintained.
The code of radix_tree_tag_set() is as follows:
//root: root of the radix tree
//index: page index
//tag: the tag to set (PAGECACHE_TAG_DIRTY/PAGECACHE_TAG_WRITEBACK)
void *radix_tree_tag_set(struct radix_tree_root *root,
unsigned long index, int tag)
{
unsigned int height, shift;
struct radix_tree_node **slot;
height = root->height;
//if the page index exceeds the maximum index for this depth, it is invalid: bail out
if (index > radix_tree_maxindex(height))
return NULL;
//compute the index shift for the topmost level
shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
slot = &root->rnode;
while (height > 0) {
int offset;
//extract the slot offset for the next level
offset = (index >> shift) & RADIX_TREE_MAP_MASK;
//set the tag bit for that slot
tag_set(*slot, tag, offset);
//descend into the child at that offset
slot = (struct radix_tree_node **)((*slot)->slots + offset);
BUG_ON(*slot == NULL);
//update the shift for the next level down
shift -= RADIX_TREE_MAP_SHIFT;
height--;
}
return *slot;
}
To set the tag for a given index, the tag must be set on every node along the path from the root down to that index.
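What tag_set() amounts to is plain bit manipulation on the node's bitmap; a sketch (illustrative, not the kernel's exact code, which uses __set_bit()):
static void tag_set_demo(struct radix_tree_node *node, int tag, int offset)
{
        /* one bit per slot: word offset/BITS_PER_LONG, bit offset%BITS_PER_LONG */
        node->tags[tag][offset / BITS_PER_LONG] |= 1UL << (offset % BITS_PER_LONG);
}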
The code of radix_tree_tag_clear() is shown below:
void *radix_tree_tag_clear(struct radix_tree_root *root,
unsigned long index, int tag)
{
struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
unsigned int height, shift;
void *ret = NULL;
//height of the tree
height = root->height;
//check whether the page index exceeds the maximum allowed for this depth
if (index > radix_tree_maxindex(height))
goto out;
//index shift corresponding to the root level
shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
//record the nodes from the root down to the leaf in path
pathp->node = NULL;
pathp->slot = &root->rnode;
while (height > 0) {
int offset;
if (*pathp->slot == NULL)
goto out;
offset = (index >> shift) & RADIX_TREE_MAP_MASK;
//the node at this step is the one the previous level's slot points to
pathp[1].offset = offset;
pathp[1].node = *pathp[0].slot;
pathp[1].slot = (struct radix_tree_node **)
(pathp[1].node->slots + offset);
pathp++;
shift -= RADIX_TREE_MAP_SHIFT;
height--;
}
//after the loop, pathp points at the bottommost node
ret = *pathp[0].slot;
//if the bottom slot has no page mapped, there is nothing to clear
if (ret == NULL)
goto out;
do {
int idx;
//clear the tag at this level
tag_clear(pathp[0].node, tag, pathp[0].offset);
//check whether any other slot at this level still has the tag set;
//if so, the tag in the levels above must stay set
for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
if (pathp[0].node->tags[tag][idx])
goto out;
}
//move up one level
pathp--;
} while (pathp[0].node);
out:
return ret;
}
Clearing a tag is slightly more involved than setting one. Setting only requires walking from the root downwards and setting the tag at each level; clearing must check whether the other slots of the same level still carry the tag, and only when every slot of a level has been cleared may the tag at the level above be cleared as well.
radix_tree_tagged() checks whether the tree contains any page with the given tag. Its code is:
int radix_tree_tagged(struct radix_tree_root *root, int tag)
{
int idx;
if (!root->rnode)
return 0;
for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
if (root->rnode->tags[tag][idx])
return 1;
}
return 0;
}
Only the root node needs to be examined, because the state of the lower levels always propagates up into the root.
6.3: Page lookup, removal, and update in the radix tree
find_get_page() looks up the page at a given index in the page cache. Its code is:
struct page * find_get_page(struct address_space *mapping, unsigned long offset)
{
struct page *page;
//take the lock
spin_lock_irq(&mapping->tree_lock);
page = radix_tree_lookup(&mapping->page_tree, offset);
//if the page was found, take a reference on it
if (page)
page_cache_get(page);
//release the lock
spin_unlock_irq(&mapping->tree_lock);
return page;
}
The code of radix_tree_lookup() is shown below:
void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
{
unsigned int height, shift;
struct radix_tree_node **slot;
height = root->height;
//validate the index
if (index > radix_tree_maxindex(height))
return NULL;
//walk down level by level, using the corresponding bit field of index at each level
shift = (height-1) * RADIX_TREE_MAP_SHIFT;
slot = &root->rnode;
while (height > 0) {
if (*slot == NULL)
return NULL;
slot = (struct radix_tree_node **)
((*slot)->slots +
((index >> shift) & RADIX_TREE_MAP_MASK));
shift -= RADIX_TREE_MAP_SHIFT;
height--;
}
//the slot in the last node holds the page itself; if nothing is mapped there, it is NULL
return *slot;
}
Recall the lookup scheme described above and this code is easy to follow.
find_get_pages() looks up a batch of pages. Its code is:
/*
mapping: address of the address_space
start: starting page index
nr_pages: number of pages to look for
pages: page array used to return the pages found
returns the number of pages found
*/
unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
unsigned int nr_pages, struct page **pages)
{
unsigned int i;
unsigned int ret;
//take the lock
spin_lock_irq(&mapping->tree_lock);
ret = radix_tree_gang_lookup(&mapping->page_tree,
(void **)pages, start, nr_pages);
//take a reference on every page found
for (i = 0; i < ret; i++)
page_cache_get(pages[i]);
//release the lock
spin_unlock_irq(&mapping->tree_lock);
return ret;
}
The code of radix_tree_gang_lookup() is shown below:
unsigned int
radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
unsigned long first_index, unsigned int max_items)
{
const unsigned long max_index = radix_tree_maxindex(root->height);
unsigned long cur_index = first_index;
unsigned int ret = 0;
while (ret < max_items) {
unsigned int nr_found;
unsigned long next_index; /* Index of next search */
if (cur_index > max_index)
break;
nr_found = __lookup(root, results + ret, cur_index,
max_items - ret, &next_index);
ret += nr_found;
if (next_index == 0)
break;
cur_index = next_index;
}
return ret;
}
__lookup() is the core of this operation. Its code is shown below:
static unsigned int
__lookup(struct radix_tree_root *root, void **results, unsigned long index,
unsigned int max_items, unsigned long *next_index)
{
unsigned int nr_found = 0;
unsigned int shift;
unsigned int height = root->height;
struct radix_tree_node *slot;
shift = (height-1) * RADIX_TREE_MAP_SHIFT;
slot = root->rnode;
while (height > 0) {
unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK;
//scan the slots of this level
for ( ; i < RADIX_TREE_MAP_SIZE; i++) {
//a non-empty slot has a child below it: descend into it to fetch pages
if (slot->slots[i] != NULL)
break;
//an empty slot means skipping every index it covers:
//clear the low bits of index, aligning it to the start of this slot,
index &= ~((1UL << shift) - 1);
//then advance by one slot's worth of indices
index += 1UL << shift;
if (index == 0)
goto out; /* 32-bit wraparound */
}
//if i reached the slot count (RADIX_TREE_MAP_SIZE), this level is empty:
//no pages are mapped below it, so return
if (i == RADIX_TREE_MAP_SIZE)
goto out;
//descend one level
height--;
//once we reach a leaf node, we can collect the actual pages
if (height == 0) { /* Bottom level: grab some items */
unsigned long j = index & RADIX_TREE_MAP_MASK;
//scan the slots from the starting position
//nr_found: number of pages found so far
for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
index++;
if (slot->slots[j]) {
results[nr_found++] = slot->slots[j];
if (nr_found == max_items)
goto out;
}
}
}
//update shift
shift -= RADIX_TREE_MAP_SHIFT;
//make slot point at the next level down
slot = slot->slots[i];
}
out:
*next_index = index;
return nr_found;
}
As the code shows, each call collects the pages held in the 64 slots of (at most) one leaf node; the caller loops, harvesting pages until the requested count is reached.
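A caller-side sketch of how find_get_pages() is typically used to sweep an entire mapping in batches (hypothetical code; the batch size of 16 is an arbitrary choice):
static void walk_mapping(struct address_space *mapping)
{
        struct page *pages[16];
        pgoff_t next = 0;
        unsigned int i, nr;

        while ((nr = find_get_pages(mapping, next, 16, pages)) != 0) {
                for (i = 0; i < nr; i++) {
                        /* ... examine pages[i] here ... */
                        next = pages[i]->index + 1;
                        //drop the reference find_get_pages() took for us
                        page_cache_release(pages[i]);
                }
        }
}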
add_to_page_cache() adds a page to the page cache. Its code is:
int add_to_page_cache(struct page *page, struct address_space *mapping,
pgoff_t offset, int gfp_mask)
{
//fill radix_tree_preloads, a per-CPU reserve of radix tree nodes
//kernel preemption is disabled in here
int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error == 0) {
spin_lock_irq(&mapping->tree_lock);
//insert the page into the radix tree
error = radix_tree_insert(&mapping->page_tree, offset, page);
if (!error) {
//the page was inserted successfully: take a reference on it
page_cache_get(page);
//the new page holds no contents yet, so lock it
SetPageLocked(page);
//set the mapping and index fields of the page descriptor
page->mapping = mapping;
page->index = offset;
//update the page cache's total page count
mapping->nrpages++;
pagecache_acct(1);
}
spin_unlock_irq(&mapping->tree_lock);
//re-enable kernel preemption
radix_tree_preload_end();
}
return error;
}
radix_tree_preload() and radix_tree_preload_end() are used as a pair; the latter simply re-enables kernel preemption. Note that in the code above, if radix_tree_preload() fails, preemption is left enabled, so radix_tree_preload_end() is only called after radix_tree_preload() has succeeded.
Their code is shown below:
int radix_tree_preload(int gfp_mask)
{
struct radix_tree_preload *rtp;
struct radix_tree_node *node;
int ret = -ENOMEM;
//disable kernel preemption
preempt_disable();
//get this CPU's radix_tree_preloads (a per-CPU variable)
rtp = &__get_cpu_var(radix_tree_preloads);
//while the array in radix_tree_preloads is not yet full
while (rtp->nr < ARRAY_SIZE(rtp->nodes)) {
//re-enable preemption while allocating a radix_tree_node
preempt_enable();
node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
//if allocation fails, bail out and return -ENOMEM
if (node == NULL)
goto out;
//allocation done: disable preemption again
preempt_disable();
//re-fetch radix_tree_preloads and re-check whether its array is full,
//in case another control path touched it while preemption was enabled
rtp = &__get_cpu_var(radix_tree_preloads);
//if it is not full, add the new radix_tree_node to it; otherwise free the node
if (rtp->nr < ARRAY_SIZE(rtp->nodes))
rtp->nodes[rtp->nr++] = node;
else
kmem_cache_free(radix_tree_node_cachep, node);
}
ret = 0;
out:
return ret;
}
static inline void radix_tree_preload_end(void)
{
preempt_enable();
}
Note how radix_tree_preload() works: the kernel keeps one radix_tree_preload variable per CPU, with the following structure:
struct radix_tree_preload {
int nr;
struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH];
};
That is, each CPU holds a small pool of pre-allocated radix_tree_nodes, which guarantees that nodes are available during the subsequent insertion.
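The resulting usage pattern, already visible in add_to_page_cache() above, condenses into the following sketch (a hypothetical wrapper, for illustration):
static int insert_into_cache(struct address_space *mapping,
                             struct page *page, pgoff_t offset)
{
        //preload outside the spinlock, where the allocation may sleep
        int error = radix_tree_preload(GFP_KERNEL);

        if (error)
                return error;
        spin_lock_irq(&mapping->tree_lock);
        //under the lock, only the per-CPU reserve can be consumed
        error = radix_tree_insert(&mapping->page_tree, offset, page);
        spin_unlock_irq(&mapping->tree_lock);
        radix_tree_preload_end(); //re-enables preemption
        return error;
}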
Returning to the code of add_to_page_cache():
radix_tree_insert() is the core of this operation. Its code is shown below:
int radix_tree_insert(struct radix_tree_root *root,
unsigned long index, void *item)
{
struct radix_tree_node *node = NULL, *tmp, **slot;
unsigned int height, shift;
int offset;
int error;
/* Make sure the tree is high enough. */
//check whether the tree is tall enough; if not, extend it first
if ((!index && !root->rnode) ||
//radix_tree_maxindex: maximum index this height can hold
index > radix_tree_maxindex(root->height)) {
error = radix_tree_extend(root, index);
if (error)
return error;
}
//as in the lookup above, walk down level by level according to index;
//whenever the slot for a level is empty, allocate a new node for it
slot = &root->rnode;
height = root->height;
shift = (height-1) * RADIX_TREE_MAP_SHIFT;
offset = 0; /* uninitialised var warning */
while (height > 0) {
if (*slot == NULL) {
/* Have to add a child node. */
if (!(tmp = radix_tree_node_alloc(root)))
return -ENOMEM;
*slot = tmp;
if (node)
node->count++;
}
/* Go a level down */
offset = (index >> shift) & RADIX_TREE_MAP_MASK;
node = *slot;
slot = (struct radix_tree_node **)(node->slots + offset);
shift -= RADIX_TREE_MAP_SHIFT;
height--;
}
//if a page is already mapped at this index, return -EEXIST
if (*slot != NULL)
return -EEXIST;
//otherwise bump the leaf node's count
if (node) {
node->count++;
BUG_ON(tag_get(node, 0, offset));
BUG_ON(tag_get(node, 1, offset));
}
//hang the page into the leaf node's slot
*slot = item;
return 0;
}
If the tree's current depth cannot hold index, the radix tree must be extended. The extension is done by radix_tree_extend(); its code is:
static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
{
struct radix_tree_node *node;
unsigned int height;
char tags[RADIX_TREE_TAGS];
int tag;
/* Figure out what the height should be. */
//compute the depth the tree must grow to
height = root->height + 1;
while (index > radix_tree_maxindex(height))
height++;
//height is now the required depth of the tree
//if root->rnode == NULL, the tree has no child nodes at all, i.e. maps no pages:
//simply record the new depth and return;
//the insertion path will allocate nodes for any empty slot it encounters
if (root->rnode == NULL) {
root->height = height;
goto out;
}
/*
* Prepare the tag status of the top-level node for propagation
* into the newly-pushed top-level node(s)
*/
//check whether any tag is set in the tree; if so, set the matching entry in tags
for (tag = 0; tag < RADIX_TREE_TAGS; tag++) {
int idx;
tags[tag] = 0;
for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
if (root->rnode->tags[tag][idx]) {
tags[tag] = 1;
break;
}
}
}
//grow the tree to the required height
do {
//allocate a radix_tree_node
if (!(node = radix_tree_node_alloc(root)))
return -ENOMEM;
/* Increase the height. */
//push a new node on top
node->slots[0] = root->rnode;
/* Propagate the aggregated tag info into the new root */
//the old root now sits in slot 0 of the new node;
//if the old root carried a tag, set the same tag on slot 0 of the new node
for (tag = 0; tag < RADIX_TREE_TAGS; tag++) {
if (tags[tag])
tag_set(node, tag, 0);
}
node->count = 1;
root->rnode = node;
root->height++;
} while (height > root->height);
out:
return 0;
}
The code above is fairly straightforward; work through it with the help of the added comments.
The allocation of radix_tree_nodes deserves a closer look, since the kernel handles it specially:
static struct radix_tree_node *
radix_tree_node_alloc(struct radix_tree_root *root)
{
struct radix_tree_node *ret;
//first try the slab allocator
ret = kmem_cache_alloc(radix_tree_node_cachep, root->gfp_mask);
if (ret == NULL && !(root->gfp_mask & __GFP_WAIT)) {
//if that fails, fall back to the per-CPU radix_tree_preloads pool
struct radix_tree_preload *rtp;
rtp = &__get_cpu_var(radix_tree_preloads);
if (rtp->nr) {
ret = rtp->nodes[rtp->nr - 1];
rtp->nodes[rtp->nr - 1] = NULL;
rtp->nr--;
}
}
return ret;
}
This is where radix_tree_preloads pays off.
remove_from_page_cache() removes a page from the page cache. Its code is:
void remove_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
//the page must be locked before it can be removed
if (unlikely(!PageLocked(page)))
PAGE_BUG(page);
//take the spinlock
spin_lock_irq(&mapping->tree_lock);
//remove the page from the page cache
__remove_from_page_cache(page);
//release the spinlock
spin_unlock_irq(&mapping->tree_lock);
}
The code of __remove_from_page_cache() is:
void __remove_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
//delete the page from the radix tree
radix_tree_delete(&mapping->page_tree, page->index);
//reset the page descriptor's mapping field to NULL
page->mapping = NULL;
//decrement the page cache's page count
mapping->nrpages--;
pagecache_acct(-1);
}
The code of radix_tree_delete() is:
void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
{
struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
struct radix_tree_path *orig_pathp;
unsigned int height, shift;
void *ret = NULL;
char tags[RADIX_TREE_TAGS];
int nr_cleared_tags;
height = root->height;
//validate index
if (index > radix_tree_maxindex(height))
goto out;
shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
pathp->node = NULL;
pathp->slot = &root->rnode;
//record the nodes from the root down to the leaf in path
while (height > 0) {
int offset;
if (*pathp->slot == NULL)
goto out;
offset = (index >> shift) & RADIX_TREE_MAP_MASK;
pathp[1].offset = offset;
pathp[1].node = *pathp[0].slot;
pathp[1].slot = (struct radix_tree_node **)
(pathp[1].node->slots + offset);
pathp++;
shift -= RADIX_TREE_MAP_SHIFT;
height--;
}
//pathp now refers to the last (leaf) node
ret = *pathp[0].slot;
//if no page exists at this index, just bail out
if (ret == NULL)
goto out;
orig_pathp = pathp;
/*
* Clear all tags associated with the just-deleted item
*/
memset(tags, 0, sizeof(tags));
do {
int tag;
nr_cleared_tags = RADIX_TREE_TAGS;
for (tag = 0; tag < RADIX_TREE_TAGS; tag++) {
int idx;
//unlike radix_tree_tag_clear(), both tags must be handled here:
//at the leaf the tag is always cleared,
//at an interior node it is cleared only if none of the slots below still carry it
if (!tags[tag])
tag_clear(pathp[0].node, tag, pathp[0].offset);
for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
//check whether this level still has the tag set anywhere
if (pathp[0].node->tags[tag][idx]) {
tags[tag] = 1;
nr_cleared_tags--;
break;
}
}
}
pathp--;
} while (pathp[0].node && nr_cleared_tags);
pathp = orig_pathp;
//empty the leaf slot that held the page
*pathp[0].slot = NULL;
//walk from the leaf back up to the root, decrementing each node's count;
//whenever a count drops to zero, free that node
while (pathp[0].node && --pathp[0].node->count == 0) {
pathp--;
BUG_ON(*pathp[0].slot == NULL);
*pathp[0].slot = NULL;
radix_tree_node_free(pathp[1].node);
}
//if the root node became empty, reset the tree's height to 0
if (root->rnode == NULL)
root->height = 0;
out:
return ret;
}
read_cache_page() reads (and if necessary updates) a page in the page cache. Its code is:
struct page *read_cache_page(struct address_space *mapping,
unsigned long index,
int (*filler)(void *,struct page*),
void *data)
{
struct page *page;
int err;
retry:
//get the page at index from the page cache; if it does not exist, a new one is created
page = __read_cache_page(mapping, index, filler, data);
//on error, bail out
if (IS_ERR(page))
goto out;
//record that the page has been accessed
mark_page_accessed(page);
//if the page is up to date, its contents match the disk: we are done
if (PageUptodate(page))
goto out;
//from here on the page is not up to date
//lock the page first
lock_page(page);
//if the page has meanwhile been removed from the page cache, unlock and release it, then retry
if (!page->mapping) {
unlock_page(page);
page_cache_release(page);
goto retry;
}
//locking the page may sleep, so re-check whether the page became up to date
if (PageUptodate(page)) {
unlock_page(page);
goto out;
}
//if it is still not up to date, call filler to read the data in from the filesystem
err = filler(data, page);
//on read failure, release the page
if (err < 0) {
page_cache_release(page);
page = ERR_PTR(err);
}
out:
return page;
}
The code of __read_cache_page() is:
static inline struct page *__read_cache_page(struct address_space *mapping,
unsigned long index,
int (*filler)(void *,struct page*),
void *data)
{
struct page *page, *cached_page = NULL;
int err;
repeat:
//look up the page at the given index in the page cache
page = find_get_page(mapping, index);
//if it does not exist, allocate a new page and insert it into the page cache;
//otherwise simply return the page that was found
if (!page) {
if (!cached_page) {
//allocate a fresh page
cached_page = page_cache_alloc_cold(mapping);
if (!cached_page)
return ERR_PTR(-ENOMEM);
}
//add the new page to the page cache and to the LRU lists
err = add_to_page_cache_lru(cached_page, mapping,
index, GFP_KERNEL);
if (err == -EEXIST)
goto repeat;
//on failure, release the page we allocated
if (err < 0) {
/* Presumably ENOMEM for radix tree node */
page_cache_release(cached_page);
return ERR_PTR(err);
}
page = cached_page;
cached_page = NULL;
//the page is brand new, so it certainly holds no valid data yet:
//call filler() to read the data in from disk
err = filler(data, page);
//on failure, release the page
if (err < 0) {
page_cache_release(page);
page = ERR_PTR(err);
}
}
if (cached_page)
page_cache_release(cached_page);
return page;
}
7: The buffer cache
A block device reads and writes in units of blocks rather than pages, so alongside the page cache the kernel keeps an in-memory cache of device blocks, the buffer cache. In the 2.2 kernel the page cache and the buffer cache coexisted independently of each other: changing a flag in one cache also required changing it in the other, which was very inefficient. Since 2.4, block buffers are kept inside dedicated buffer pages, and those buffer pages live in the page cache. A detailed analysis follows; first, the structures involved:
7.1: The buffer_head structure
Each block buffer is described by a buffer_head, whose structure is:
struct buffer_head {
/* First cache line: */
//state flags of the buffer
unsigned long b_state; /* buffer state bitmap (see above) */
//next buffer in the same page
struct buffer_head *b_this_page;/* circular list of page's buffers */
//the page this buffer lives in
struct page *b_page; /* the page this bh is mapped to */
//reference count of this buffer head
atomic_t b_count; /* users using this block */
//block size
u32 b_size; /* block size */
//the corresponding physical block on the device
sector_t b_blocknr; /* block number */
//pointer to the data block;
//the buffer thus occupies memory from b_data to b_data + b_size
char *b_data; /* pointer to data block */
//device this buffer belongs to
struct block_device *b_bdev;
//I/O completion method
bh_end_io_t *b_end_io; /* I/O completion */
//data for the completion method
void *b_private; /* reserved for b_end_io */
//list of associated mappings
struct list_head b_assoc_buffers; /* associated with another mapping */
}
Note the b_data member: if the page that owns the buffer_head is a highmem page, b_data holds only the offset from the start of the page; for a normal (lowmem) page it holds the linear address of the buffer itself.
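This is why the kernel's bh_offset() macro can recover the in-page offset from b_data in both cases, since the low bits of a linear address are exactly the in-page offset (definition quoted from memory, treat as a sketch):
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)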
7.2: How block buffers relate to the page cache
The relationship between buffer pages and the page cache is illustrated by a figure in <<Understanding the Linux Kernel, 3rd edition>> [figure not reproduced here]. It shows the following:
every block buffer within a page has the same size, and the page descriptor's private field points to the first buffer_head;
buffer_head->b_data points to the buffer's data;
buffer_head->b_page points back to the page descriptor of the page holding the buffer;
buffer_head->b_this_page points to the next buffer_head in the same page;
the last buffer_head's b_this_page points back to the first buffer_head of the buffer page, closing the ring.
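The canonical way to walk this ring, visiting every buffer_head of a buffer page exactly once, looks like this (sketch; the page must already be known to be a buffer page):
static void for_each_buffer_in_page(struct page *page)
{
        struct buffer_head *head = page_buffers(page); //i.e. page->private
        struct buffer_head *bh = head;

        do {
                /* ... operate on bh ... */
                bh = bh->b_this_page;
        } while (bh != head);
}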
7.3: Adding a buffer page
grow_buffers() adds a page of block buffers to the page cache. Its code is:
//bdev: the block device
//block: logical block number
//size: block size
static inline int
grow_buffers(struct block_device *bdev, sector_t block, int size)
{
struct page *page;
pgoff_t index;
int sizebits;
//compute the page's offset within the block device;
//the number of block buffers per page is always a power of two:
//on Linux the sector size is a multiple of 1<<9 (512), and the block size is a
//multiple of the sector size and must itself be a power of two
sizebits = -1;
// 1UL << sizebits: number of block buffers per page
do {
sizebits++;
} while ((size << sizebits) < PAGE_SIZE);
//block number / buffers per page = the page's index in the page cache
index = block >> sizebits;
//page index * buffers per page = logical block number of the first buffer in that page
block = index << sizebits;
/* Create a page with the proper size buffers.. */
//build the buffer page
page = grow_dev_page(bdev, block, index, size);
//on allocation failure, bail out
if (!page)
return 0;
//on success, unlock the page and drop its reference count
unlock_page(page);
page_cache_release(page);
return 1;
}
The do/while at the top, which determines how many buffers of the given size fit in a page, may look puzzling. In Linux the default sector size is 512; users may choose another sector size, but it must be a multiple of 512. The block size is a multiple of the sector size, must be a power of two, and may not exceed the page size. On a 32-bit system the block size can therefore only be 512, 1024, 2048, or 4096, i.e. 1<<9, 1<<10, 1<<11, 1<<12. With two-level paging the page size is 1<<12, so the corresponding buffer counts per page are 1<<3, 1<<2, 1<<1, and 1<<0. That is exactly what the do/while loop computes: after the loop, a page holds 1 << sizebits buffers.
block >> sizebits equals block / (1 << sizebits), i.e. block number divided by buffers per page, which is the block's offset measured in pages.
index << sizebits equals index * (1 << sizebits), i.e. page index times buffers per page, which is the first block stored in that buffer page.
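A worked example may help (assuming PAGE_SIZE = 4096 and size = 1024):
/*
 * the do/while stops as soon as (1024 << sizebits) >= 4096, i.e. sizebits = 2,
 * so each page holds 1 << 2 = 4 buffers;
 * for block = 4097: index = 4097 >> 2 = 1024  -> page index in the cache
 *                   block = 1024 << 2 = 4096  -> first block stored in that page
 */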
The code of grow_dev_page() is shown below:
static struct page *
grow_dev_page(struct block_device *bdev, sector_t block,
pgoff_t index, int size)
{
struct inode *inode = bdev->bd_inode;
struct page *page;
struct buffer_head *bh;
//look up the page with index `index` in the page cache; create it if it does not exist
page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
if (!page)
return NULL;
//the page must be locked at this point, otherwise BUG
if (!PageLocked(page))
BUG();
//if this page is already a buffer page
if (page_has_buffers(page)) {
//get the first buffer_head of the buffer page
bh = page_buffers(page);
//if its buffers already have the requested size, return this page
if (bh->b_size == size)
return page;
//otherwise free the buffer_heads currently attached to the page
if (!try_to_free_buffers(page))
goto failed;
}
/*
* Allocate some buffers for this page
*/
//create buffer_heads for this page
bh = create_buffers(page, size, 0);
if (!bh)
goto failed;
/*
* Link the page to the buffers and initialise them. Take the
* lock to be atomic wrt __find_get_block(), which does not
* run under the page lock.
*/
spin_lock(&inode->i_mapping->private_lock);
//initialize the freshly allocated buffer heads
link_dev_buffers(page, bh);
init_page_buffers(page, bdev, block, size);
spin_unlock(&inode->i_mapping->private_lock);
return page;
failed:
BUG();
unlock_page(page);
page_cache_release(page);
return NULL;
}
This function involves several important helpers; they are analyzed one by one below.
The code of find_or_create_page() is:
struct page *find_or_create_page(struct address_space *mapping,
unsigned long index, unsigned int gfp_mask)
{
struct page *page, *cached_page = NULL;
int err;
repeat:
//find and lock the page in the page cache
page = find_lock_page(mapping, index);
//if no such page was found
if (!page) {
if (!cached_page) {
//allocate a page
cached_page = alloc_page(gfp_mask);
if (!cached_page)
return NULL;
}
//add the page to the page cache and to the LRU lists;
//adding it to the page cache also locks it
err = add_to_page_cache_lru(cached_page, mapping,
index, gfp_mask);
if (!err) {
page = cached_page;
cached_page = NULL;
} else if (err == -EEXIST)
goto repeat;
}
//if cached_page is still set, release it
if (cached_page)
page_cache_release(cached_page);
return page;
}
This function is simple; most of its operations were already covered in the page cache analysis.
try_to_free_buffers() frees the block buffers of a buffer page. Its code is:
int try_to_free_buffers(struct page *page)
{
struct address_space * const mapping = page->mapping;
struct buffer_head *buffers_to_free = NULL;
int ret = 0;
BUG_ON(!PageLocked(page));
if (PageWriteback(page))
return 0;
if (mapping == NULL) { /* can this still happen? */
//if the page is not in the page cache, it suffices to detach its buffers;
//no radix tree tag needs updating
ret = drop_buffers(page, &buffers_to_free);
goto out;
}
spin_lock(&mapping->private_lock);
//detach the buffer heads from the buffer page
ret = drop_buffers(page, &buffers_to_free);
if (ret) {
/*
* If the filesystem writes its buffers by hand (eg ext3)
* then we can have clean buffers against a dirty page. We
* clean the page here; otherwise later reattachment of buffers
* could encounter a non-uptodate page, which is unresolvable.
* This only applies in the rare case where try_to_free_buffers
* succeeds but the page is not freed.
*/
//if the buffers were detached successfully, clear the page's PG_dirty flag
//and the dirty tag of the corresponding radix tree node
clear_page_dirty(page);
}
spin_unlock(&mapping->private_lock);
out:
//walk the page's buffers and free each buffer head
if (buffers_to_free) {
struct buffer_head *bh = buffers_to_free;
do {
struct buffer_head *next = bh->b_this_page;
free_buffer_head(bh);
bh = next;
} while (bh != buffers_to_free);
}
return ret;
}
try_to_free_buffers() -> drop_buffers() does the per-buffer work on the buffer page. Its code is:
static int
drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
{
//get the first buffer head of the page
struct buffer_head *head = page_buffers(page);
struct buffer_head *bh;
bh = head;
//walk the list of buffer heads
do {
//if a buffer head carries an I/O error flag, set AS_EIO on the page cache
if (buffer_write_io_error(bh))
set_bit(AS_EIO, &page->mapping->flags);
//if the buffer is dirty or locked, it must not be freed
if (buffer_busy(bh))
goto failed;
bh = bh->b_this_page;
} while (bh != head);
//detach each buffer head from its b_assoc_buffers list
do {
struct buffer_head *next = bh->b_this_page;
if (!list_empty(&bh->b_assoc_buffers))
__remove_assoc_queue(bh);
bh = next;
} while (bh != head);
//buffers_to_free points at the first buffer head
*buffers_to_free = head;
//the page's buffers are about to be freed: clear the page's PG_private flag
//and its private member, and, since private is cleared, drop the page's reference count
__clear_page_buffers(page);
return 1;
failed:
return 0;
}
create_buffers() creates the block buffers of a buffer page and returns the first buffer head. Its code is:
static struct buffer_head *
create_buffers(struct page * page, unsigned long size, int retry)
{
struct buffer_head *bh, *head;
long offset;
try_again:
head = NULL;
offset = PAGE_SIZE;
//note: the buffer heads are allocated back to front; the last bh's b_this_page is NULL
while ((offset -= size) >= 0) {
bh = alloc_buffer_head(GFP_NOFS);
if (!bh)
goto no_grow;
bh->b_bdev = NULL;
bh->b_this_page = head;
bh->b_blocknr = -1;
head = bh;
bh->b_state = 0;
atomic_set(&bh->b_count, 0);
bh->b_size = size;
/* Link the buffer to its page */
//set the bh's b_page and b_data fields
set_bh_page(bh, page, offset);
bh->b_end_io = NULL;
}
return head;
/*
* In case anything failed, we just free everything we got.
*/
no_grow:
if (head) {
do {
bh = head;
head = head->b_this_page;
free_buffer_head(bh);
} while (head);
}
/*
* Return failure for non-async IO requests. Async IO requests
* are not allowed to fail, so we have to wait until buffer heads
* become available. But we don't want tasks sleeping with
* partially complete buffers, so all were released above.
*/
if (!retry)
return NULL;
/* We're _really_ low on memory. Now we just
* wait for old buffer heads to become free due to
* finishing IO. Since this is an async request and
* the reserve list is empty, we're sure there are
* async buffer heads in use.
*/
free_more_memory();
goto try_again;
}
Note that the buffer heads are allocated from the tail of the page towards its head, and the very last bh's b_this_page is left NULL for now.
Setting the bh's b_page and b_data is done in set_bh_page(). Its code is:
void set_bh_page(struct buffer_head *bh,
struct page *page, unsigned long offset)
{
//bh->b_page: points at the buffer page that holds this buffer
bh->b_page = page;
if (offset >= PAGE_SIZE)
BUG();
//if the buffer lives in a highmem page, b_data holds just the in-page offset
if (PageHighMem(page))
/*
* This catches illegal uses and preserves the offset:
*/
bh->b_data = (char *)(0 + offset);
else
//otherwise it holds the buffer's linear address
bh->b_data = page_address(page) + offset;
}
Here we see the different handling of highmem and normal memory.
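To actually touch the data of a buffer that may live in a highmem page, the page has to be mapped first; a hedged sketch of the access pattern (hypothetical helper; the caller must kunmap() the page when done):
static char *bh_data_address(struct buffer_head *bh)
{
        if (PageHighMem(bh->b_page))
                //map the highmem page, then add the offset that b_data encodes
                return (char *)kmap(bh->b_page) + bh_offset(bh);
        //lowmem: b_data already is the linear address
        return bh->b_data;
}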
As seen in create_buffers(), the last buffer's b_this_page has not been set yet, and neither has the buffer page's private field. The following link_dev_buffers() completes both. Its code is:
static inline void
link_dev_buffers(struct page *page, struct buffer_head *head)
{
struct buffer_head *bh, *tail;
//find the last buffer head
bh = head;
do {
tail = bh;
bh = bh->b_this_page;
} while (bh);
//point the last buffer head's b_this_page back at the first one, closing the ring
tail->b_this_page = head;
//set up the buffer page itself
__set_page_buffers(page, head);
}
link_dev_buffers() -> __set_page_buffers() is:
static void
__set_page_buffers(struct page *page, struct buffer_head *head)
{
//take a reference on the page
page_cache_get(page);
//set the page's PG_private flag
SetPagePrivate(page);
//page->private points at the first buffer head
page->private = (unsigned long)head;
}
init_page_buffers() performs the remaining initialization of the buffer heads:
static void
init_page_buffers(struct page *page, struct block_device *bdev,
sector_t block, int size)
{
struct buffer_head *head = page_buffers(page);
struct buffer_head *bh = head;
unsigned int b_state;
//base flags for every buffer head: BH_Mapped, meaning the buffer is mapped to a disk block
b_state = 1 << BH_Mapped;
//if the page has PG_uptodate set, its buffer heads get BH_Uptodate as well
if (PageUptodate(page))
b_state |= 1 << BH_Uptodate;
do {
if (!(bh->b_state & (1 << BH_Mapped))) {
init_buffer(bh, NULL, NULL);
bh->b_bdev = bdev;
bh->b_blocknr = block;
bh->b_state = b_state;
}
//advance the logical block number
block++;
bh = bh->b_this_page;
} while (bh != head);
}
7.4: Releasing a buffer page
try_to_release_page() releases a given buffer page. Its code is:
int try_to_release_page(struct page *page, int gfp_mask)
{
struct address_space * const mapping = page->mapping;
//if the page is not locked: BUG
BUG_ON(!PageLocked(page));
//if the page is currently under writeback, do nothing
if (PageWriteback(page))
return 0;
//if the page cache defines a releasepage operation, invoke it
if (mapping && mapping->a_ops->releasepage)
return mapping->a_ops->releasepage(page, gfp_mask);
//otherwise perform the generic release
return try_to_free_buffers(page);
}
If the page cache defines its own release operation, calling that interface is all that is needed; otherwise try_to_free_buffers() is called, which was analyzed above and needs no repetition here.
7.5: Looking up a given block buffer
For efficiency, Linux maintains a small per-CPU disk cache, bh_lrus. Each CPU's cache holds 8 pointers to the most recently accessed buffer_heads, with the most recently used one at index 0.
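For reference, the per-CPU structure looks roughly like this in 2.6 (quoted from memory, treat as a sketch):
#define BH_LRU_SIZE 8

struct bh_lru {
        struct buffer_head *bhs[BH_LRU_SIZE];
};

static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};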
The kernel offers several APIs for looking up block buffers; they are analyzed one by one below.
7.5.1: Lookup function one: __find_get_block()
//bdev: the block device
//block: logical block number
//size: block size
struct buffer_head *
__find_get_block(struct block_device *bdev, sector_t block, int size)
{
struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
//the bh was not found in the per-CPU cache
if (bh == NULL) {
//the LRU does not hold this bh, so search the address_space for a matching buffer page
bh = __find_get_block_slow(bdev, block, size);
//if found, install the bh into the LRU
if (bh)
bh_lru_install(bh);
}
if (bh)
touch_buffer(bh);
return bh;
}
lookup_bh_lru() searches bh_lrus for the buffer. Its code is:
static inline struct buffer_head *
lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
{
struct buffer_head *ret = NULL;
struct bh_lru *lru;
int i;
check_irqs_on();
bh_lru_lock();
//every CPU maintains its own bh_lru
lru = &__get_cpu_var(bh_lrus);
//walk the bh array of the bh_lru;
//each CPU thus caches up to 8 buffer heads
for (i = 0; i < BH_LRU_SIZE; i++) {
struct buffer_head *bh = lru->bhs[i];
if (bh && bh->b_bdev == bdev &&
bh->b_blocknr == block && bh->b_size == size) {
if (i) {
//i is non-zero, so this is not the head of the LRU; since it was
//just accessed, move it to lru->bhs[0]:
//shift the preceding entries back one position
while (i) {
lru->bhs[i] = lru->bhs[i - 1];
i--;
}
//and put the buffer we found in the first position
lru->bhs[0] = bh;
}
//take a reference
get_bh(bh);
ret = bh;
break;
}
}
bh_lru_unlock();
return ret;
}
__find_get_block_slow() searches the buffer pages for the block. Its code is:
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block, int unused)
{
struct inode *bd_inode = bdev->bd_inode;
struct address_space *bd_mapping = bd_inode->i_mapping;
struct buffer_head *ret = NULL;
pgoff_t index;
struct buffer_head *bh;
struct buffer_head *head;
struct page *page;
//inode->i_blkbits: log2 of the block size
//index: the page index corresponding to block
index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
//look up the page with this index in the page cache
page = find_get_page(bd_mapping, index);
if (!page)
goto out;
spin_lock(&bd_mapping->private_lock);
//if the page is not a buffer page, give up
if (!page_has_buffers(page))
goto out_unlock;
//page->private:
//the first buffer head of the page
head = page_buffers(page);
bh = head;
//walk the ring, looking for a buffer with the requested logical block number
do {
if (bh->b_blocknr == block) {
ret = bh;
//if found, take a reference on it
get_bh(bh);
goto out_unlock;
}
//bh->b_this_page: the next bh in the page
bh = bh->b_this_page;
} while (bh != head);
printk("__find_get_block_slow() failed. "
"block=%llu, b_blocknr=%llu\n",
(unsigned long long)block, (unsigned long long)bh->b_blocknr);
printk("b_state=0x%08lx, b_size=%u\n", bh->b_state, bh->b_size);
printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
out_unlock:
spin_unlock(&bd_mapping->private_lock);
//drop the page reference that find_get_page() took
page_cache_release(page);
out:
return ret;
}
Once the buffer has been found, it must be inserted into the LRU; bh_lru_install() does this. Its code is:
static void bh_lru_install(struct buffer_head *bh)
{
struct buffer_head *evictee = NULL;
struct bh_lru *lru;
check_irqs_on();
bh_lru_lock();
lru = &__get_cpu_var(bh_lrus);
//if bh is not already in the first position of bh_lrus, put it there
//and shift the remaining entries back by one
if (lru->bhs[0] != bh) {
struct buffer_head *bhs[BH_LRU_SIZE];
int in;
int out = 0;
//take a reference on bh
get_bh(bh);
bhs[out++] = bh;
for (in = 0; in < BH_LRU_SIZE; in++) {
struct buffer_head *bh2 = lru->bhs[in];
//if bh_lrus already holds the same buffer head, drop that duplicate's reference
if (bh2 == bh) {
__brelse(bh2);
} else {
//if we ran past the end, make evictee point at the surplus
//buffer head; it will be removed from the LRU
if (out >= BH_LRU_SIZE) {
BUG_ON(evictee != NULL);
evictee = bh2;
} else {
//copy each distinct bh into the bhs array
bhs[out++] = bh2;
}
}
}
while (out < BH_LRU_SIZE)
bhs[out++] = NULL;
//copy bhs back into lru->bhs
memcpy(lru->bhs, bhs, sizeof(bhs));
}
bh_lru_unlock();
//if a buffer head fell off the end, drop its reference count
if (evictee)
__brelse(evictee);
}
7.5.2: Lookup function two: __getblk()
Most of __getblk() matches __find_get_block(); the difference is that if the requested buffer is not present in any buffer page, it is created. Its code is:
struct buffer_head *
__getblk(struct block_device *bdev, sector_t block, int size)
{
//look for the bh in the caches
struct buffer_head *bh = __find_get_block(bdev, block, size);
//this may block
might_sleep();
//if it was not found, one has to be created
if (bh == NULL)
bh = __getblk_slow(bdev, block, size);
//return the bh
return bh;
}
__find_get_block() was analyzed above. The code of __getblk_slow() is:
struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block, int size)
{
/* Size must be multiple of hard sectorsize */
//validate the parameters
if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
(size < 512 || size > PAGE_SIZE))) {
printk(KERN_ERR "getblk(): invalid block size %d requested\n",
size);
printk(KERN_ERR "hardsect size: %d\n",
bdev_hardsect_size(bdev));
dump_stack();
return NULL;
}
for (;;) {
struct buffer_head * bh;
//look up the buffer first; it may already have been created while we slept
bh = __find_get_block(bdev, block, size);
if (bh)
return bh;
//if not, create the buffer page; the next loop iteration will then find the buffer
if (!grow_buffers(bdev, block, size))
free_more_memory();
}
}
7.5.3: Lookup function three: __bread()
__bread() resembles __getblk(): both look up the buffer in the page cache and create it if it does not exist. The difference is that __getblk() may return a clean buffer containing no valid data, whereas __bread() additionally reads the data in from the filesystem.
struct buffer_head *
__bread(struct block_device *bdev, sector_t block, int size)
{
struct buffer_head *bh = __getblk(bdev, block, size);
//if the buffer is not up to date, its contents are stale and
//the data must be read in from disk
if (!buffer_uptodate(bh))
//read the actual data from the filesystem
bh = __bread_slow(bh);
return bh;
}
The code of __bread_slow() is:
static struct buffer_head *__bread_slow(struct buffer_head *bh)
{
//lock the buffer first
lock_buffer(bh);
//if the buffer is already up to date, unlock and return
if (buffer_uptodate(bh)) {
unlock_buffer(bh);
return bh;
} else {
//take a reference
get_bh(bh);
//set bh->b_end_io
bh->b_end_io = end_buffer_read_sync;
//submit the request to the generic block layer
submit_bh(READ, bh);
//sleep until the buffer is unlocked
wait_on_buffer(bh);
//check again whether it is now up to date
if (buffer_uptodate(bh))
return bh;
}
brelse(bh);
return NULL;
}
This is where the interaction with the generic block layer begins, namely the submit_bh() call; that path is analyzed in detail in the next subsection.
When submitting the request, the process first locks the buffer and then waits for the I/O to be scheduled. Whether the I/O succeeds or fails, the buffer is unlocked on completion and the sleeping process is woken up.
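For completeness, the completion callback installed above looks roughly like this in 2.6 (quoted from memory, treat as a sketch): on I/O completion it records whether the buffer now matches the disk, unlocks the buffer, which wakes the wait_on_buffer() sleeper, and drops the reference taken before submission:
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
        if (uptodate)
                set_buffer_uptodate(bh);
        else
                clear_buffer_uptodate(bh);
        unlock_buffer(bh); //wakes processes sleeping in wait_on_buffer()
        put_bh(bh);
}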
7.6: Submitting block buffers to the generic block layer
As the previous subsection showed, block buffers submit their requests to the generic block layer through submit_bh(). Its code is:
int submit_bh(int rw, struct buffer_head * bh)
{
struct bio *bio;
int ret = 0;
//BUG if the buffer head is not locked, not mapped, or has no b_end_io
BUG_ON(!buffer_locked(bh));
BUG_ON(!buffer_mapped(bh));
BUG_ON(!bh->b_end_io);
if (buffer_ordered(bh) && (rw == WRITE))
rw = WRITE_BARRIER;
/*
* Only clear out a write error when rewriting, should this
* include WRITE_SYNC as well?
*/
//set the buffer head's BH_Req flag, marking that it has been accessed at least once
if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
//clear the BH_Write_EIO flag
clear_buffer_write_io_error(bh);
/*
* from here on down, it's all bio -- do the initial mapping,
* submit_bio -> generic_make_request may further map this bio around
*/
//allocate a bio with a single bio_vec
bio = bio_alloc(GFP_NOIO, 1);
//initialize the bio's fields:
//starting sector = block number * sectors per block
bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
//the page the buffer belongs to
bio->bi_io_vec[0].bv_page = bh->b_page;
//data length
bio->bi_io_vec[0].bv_len = bh->b_size;
//offset within the page
bio->bi_io_vec[0].bv_offset = bh_offset(bh);
//number of bio_vecs
bio->bi_vcnt = 1;
bio->bi_idx = 0;
//block size
bio->bi_size = bh->b_size;
bio->bi_end_io = end_bio_bh_io_sync;
//bio->bi_private points back at this buffer_head
bio->bi_private = bh;
//take a reference on the bio
bio_get(bio);
//submit the bio
submit_bio(rw, bio);
if (bio_flagged(bio, BIO_EOPNOTSUPP))
ret = -EOPNOTSUPP;
//the submission is done: drop the reference
bio_put(bio);
return ret;
}
As the generic block layer analysis showed, a block buffer never takes part in I/O directly: it must first be wrapped in a bio, which is then submitted via submit_bio().
The code of submit_bio() is:
void submit_bio(int rw, struct bio *bio)
{
//number of sectors
int count = bio_sectors(bio);
//neither bio->bi_size nor bio->bi_io_vec may be empty
BIO_BUG_ON(!bio->bi_size);
BIO_BUG_ON(!bio->bi_io_vec);
//Read/Write
bio->bi_rw = rw;
//bump the appropriate per-CPU page-state counter
if (rw & WRITE)
mod_page_state(pgpgout, count);
else
mod_page_state(pgpgin, count);
if (unlikely(block_dump)) {
char b[BDEVNAME_SIZE];
printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
current->comm, current->pid,
(rw & WRITE) ? "WRITE" : "READ",
(unsigned long long)bio->bi_sector,
bdevname(bio->bi_bdev,b));
}
//the entry point of the generic block layer
generic_make_request(bio);
}
At this point it is clear how the page cache layer interacts with the Generic Block Layer.
The kernel also provides another interface, ll_rw_block(), which handles I/O for several buffer_heads at once; the buffers need not be contiguous blocks. Its code is:
void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
{
int i;
for (i = 0; i < nr; i++) {
struct buffer_head *bh = bhs[i];
//lock the buffer; if it is already locked, skip it
if (test_set_buffer_locked(bh))
continue;
//take a reference
get_bh(bh);
if (rw == WRITE) {
bh->b_end_io = end_buffer_write_sync;
//clear the buffer's dirty flag; if it was not dirty, there is nothing to submit
if (test_clear_buffer_dirty(bh)) {
submit_bh(WRITE, bh);
continue;
}
} else {
bh->b_end_io = end_buffer_read_sync;
//if the buffer is not up to date, submit the read request
if (!buffer_uptodate(bh)) {
submit_bh(rw, bh);
continue;
}
}
//unlock any buffer head that was not submitted
unlock_buffer(bh);
//and drop its reference
put_bh(bh);
}
}
This function amounts to calling submit_bh() in a loop.
7.7: The pdflush thread pool
Much of the I/O data sits in the page cache and is only written back to the filesystem once the I/O is scheduled. If the system crashes before the data is written back, the data is lost; moreover, dirty data that lingers in memory occupies it for a long time, which is an inefficient use of memory. For these reasons the kernel needs a mechanism that periodically writes dirty data back to disk. In Linux 2.4 this job was done by the bdflush and kupdated threads; in Linux 2.6 it is done by a pool of pdflush threads.
The kernel resizes the pdflush pool dynamically according to load, but never below 2 threads and never above 8. The implementation of the pdflush pool and the work it performs are analyzed below.
First, the data structure pdflush uses:
struct pdflush_work {
//task descriptor of the pdflush thread
struct task_struct *who; /* The thread */
//callback to run
void (*fn)(unsigned long); /* A callback function */
//argument for the callback
unsigned long arg0; /* An argument to the callback */
//links the structure into pdflush_list
struct list_head list; /* On pdflush_list, when idle */
//timestamp of when the thread went to sleep
unsigned long when_i_went_to_sleep;
}
pdflush initialization:
static int __init pdflush_init(void)
{
int i;
for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
start_one_pdflush_thread();
return 0;
}
The init routine starts two pdflush threads. Starting a thread is done by start_one_pdflush_thread():
static void start_one_pdflush_thread(void)
{
kthread_run(pdflush, NULL, "pdflush");
}
Every thread's entry point is therefore pdflush(). Its code is:
static int pdflush(void *dummy)
{
struct pdflush_work my_work;
/*
* pdflush can spend a lot of time doing encryption via dm-crypt. We
* don't want to do that at keventd's priority.
*/
//set this process's nice value
set_user_nice(current, 0);
return __pdflush(&my_work);
}
static int __pdflush(struct pdflush_work *my_work)
{
current->flags |= PF_FLUSHER;
my_work->fn = NULL;
my_work->who = current;
//initialize this thread's pdflush_work
INIT_LIST_HEAD(&my_work->list);
spin_lock_irq(&pdflush_lock);
//update the pdflush thread count
nr_pdflush_threads++;
for ( ; ; ) {
struct pdflush_work *pdf;
//after initializing, put this thread's pdflush_work on pdflush_list, then sleep
set_current_state(TASK_INTERRUPTIBLE);
list_move(&my_work->list, &pdflush_list);
my_work->when_i_went_to_sleep = jiffies;
spin_unlock_irq(&pdflush_lock);
schedule();
//execution resumes here after the thread is woken up
if (current->flags & PF_FREEZE) {
refrigerator(PF_FREEZE);
spin_lock_irq(&pdflush_lock);
continue;
}
spin_lock_irq(&pdflush_lock);
//if this thread's pdflush_work is still on pdflush_list, loop around and sleep again;
//a genuine waker removes the pdflush_work from the list
if (!list_empty(&my_work->list)) {
printk("pdflush: bogus wakeup!\n");
my_work->fn = NULL;
continue;
}
//if the work function is NULL, loop around and sleep again
if (my_work->fn == NULL) {
printk("pdflush: NULL work function\n");
continue;
}
spin_unlock_irq(&pdflush_lock);
//run the work function
(*my_work->fn)(my_work->arg0);
/*
* Thread creation: For how long have there been zero
* available threads?
*/
//if the pool has been without idle threads for more than 1 second
//last_empty_jifs: the last time the pool became empty
if (jiffies - last_empty_jifs > 1 * HZ) {
/* unlocked list_empty() test is OK here */
if (list_empty(&pdflush_list)) {
/* unlocked test is OK here */
//if the maximum thread count has not been reached, start another pdflush
if (nr_pdflush_threads < MAX_PDFLUSH_THREADS)
start_one_pdflush_thread();
}
}
spin_lock_irq(&pdflush_lock);
my_work->fn = NULL;
/*
* Thread destruction: For how long has the sleepiest
* thread slept?
*/
//if pdflush_list is empty, loop around and sleep again
if (list_empty(&pdflush_list))
continue;
//if the thread count is at the minimum, loop around and sleep again
if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
continue;
//if the sleepiest thread has slept for more than 1 second, break out of
//the loop and let this thread exit
pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) {
/* Limit exit rate */
pdf->when_i_went_to_sleep = jiffies;
break; /* exeunt */
}
}
nr_pdflush_threads--;
spin_unlock_irq(&pdflush_lock);
return 0;
}
That is the life cycle of a pdflush thread: after creation it goes to sleep until another process wakes it up. The wakeup path looks like this:
//fn, arg0: the function the woken pdflush should run, and its argument
int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
{
unsigned long flags;
int ret = 0;
if (fn == NULL)
BUG(); /* Hard to diagnose if it's deferred */
spin_lock_irqsave(&pdflush_lock, flags);
//no idle threads are left
if (list_empty(&pdflush_list)) {
spin_unlock_irqrestore(&pdflush_lock, flags);
ret = -1;
} else {
struct pdflush_work *pdf;
//take an idle thread's pdflush_work
pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
//unlink and re-initialize it
list_del_init(&pdf->list);
//if the list now holds no other idle threads, update last_empty_jifs
if (list_empty(&pdflush_list))
last_empty_jifs = jiffies;
//set the pdflush_work's function and argument
pdf->fn = fn;
pdf->arg0 = arg0;
wake_up_process(pdf->who);
spin_unlock_irqrestore(&pdflush_lock, flags);
}
return ret;
}
This is the entry point for waking a pdflush thread to run a specific function. Next we analyze what work pdflush actually performs in the kernel.
7.7.1: Finding and flushing a given number of dirty pages
The call site is:
pdflush_operation(background_writeout, nr_pages)
The function pdflush is asked to run is background_writeout(), and the argument is the number of pages to flush.
The code of background_writeout() is:
static void background_writeout(unsigned long _min_pages)
{
long min_pages = _min_pages;
struct writeback_control wbc = {
.bdi = NULL,
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
.nr_to_write = 0,
.nonblocking = 1,
};
for ( ; ; ) {
struct writeback_state wbs;
long background_thresh;
long dirty_thresh;
//background_thresh: the background threshold
get_dirty_limits(&wbs, &background_thresh, &dirty_thresh);
//if the number of dirty pages is below the threshold and the requested
//number of pages has been flushed, we are done
if (wbs.nr_dirty + wbs.nr_unstable < background_thresh
&& min_pages <= 0)
break;
wbc.encountered_congestion = 0;
wbc.nr_to_write = MAX_WRITEBACK_PAGES;
wbc.pages_skipped = 0;
//try to write back MAX_WRITEBACK_PAGES (1024) dirty pages
writeback_inodes(&wbc);
//update min_pages: the number of pages still to flush
min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
//if not all pages were written, or some were skipped, the request queue may be congested
if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
/* Wrote less than expected */
//sleep up to 100 ms, or until the queue becomes uncongested
blk_congestion_wait(WRITE, HZ/10);
if (!wbc.encountered_congestion)
break;
}
}
}
The key routine here is writeback_inodes():
void
writeback_inodes(struct writeback_control *wbc)
{
struct super_block *sb;
might_sleep();
spin_lock(&sb_lock);
restart:
//walk the list of superblocks
sb = sb_entry(super_blocks.prev);
for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) {
//sb->s_dirty: dirty inodes of this filesystem
//sb->s_io: inodes waiting for I/O
//if both lists are empty, nothing in this filesystem needs writeback
if (!list_empty(&sb->s_dirty) || !list_empty(&sb->s_io)) {
/* we're making our own get_super here */
//take a reference (note the lock is already held)
sb->s_count++;
spin_unlock(&sb_lock);
/*
* If we can't get the readlock, there's no sense in
* waiting around, most of the time the FS is going to
* be unmounted by the time it is released.
*/
if (down_read_trylock(&sb->s_umount)) {
if (sb->s_root) {
spin_lock(&inode_lock);
sync_sb_inodes(sb, wbc);
spin_unlock(&inode_lock);
}
up_read(&sb->s_umount);
}
spin_lock(&sb_lock);
//drop the reference; if the superblock list changed under us, restart
if (__put_super_and_need_restart(sb))
goto restart;
}
//if the requested number of pages has been written back, stop
if (wbc->nr_to_write <= 0)
break;
}
spin_unlock(&sb_lock);
}
As the earlier filesystem sections showed, each super_block represents one mounted filesystem, and all superblocks are kept on the super_blocks list. This function moves the inodes on sb->s_dirty over to sb->s_io and then processes the inodes on sb->s_io in one batch.
As the code above shows, the per-superblock work is done in sync_sb_inodes(). Its code is:
static void
sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
{
const unsigned long start = jiffies; /* livelock avoidance */
//move s_dirty onto s_io
if (!wbc->for_kupdate || list_empty(&sb->s_io))
list_splice_init(&sb->s_dirty, &sb->s_io);
//while sb->s_io is not empty
while (!list_empty(&sb->s_io)) {
//take an inode from sb->s_io
struct inode *inode = list_entry(sb->s_io.prev,
struct inode, i_list);
struct address_space *mapping = inode->i_mapping;
struct backing_dev_info *bdi = mapping->backing_dev_info;
long pages_skipped;
//this backing device does not allow its pages to be written back to disk
if (bdi->memory_backed) {
list_move(&inode->i_list, &sb->s_dirty);
if (sb == blockdev_superblock) {
/*
* Dirty memory-backed blockdev: the ramdisk
* driver does this. Skip just this inode
*/
continue;
}
/*
* Dirty memory-backed inode against a filesystem other
* than the kernel-internal bdev filesystem. Skip the
* entire superblock.
*/
break;
}
//blocking is not allowed, but the request queue is congested
if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
if (sb != blockdev_superblock)
break; /* Skip a congested fs */
list_move(&inode->i_list, &sb->s_dirty);
continue; /* Skip a congested blockdev */
}
//the device to operate on is not this one
if (wbc->bdi && bdi != wbc->bdi) {
if (sb != blockdev_superblock)
break; /* fs has the wrong queue */
list_move(&inode->i_list, &sb->s_dirty);
continue; /* blockdev has wrong queue */
}
/* Was this inode dirtied after sync_sb_inodes was called? */
//if the inode was dirtied after sync_sb_inodes() started, skip it
if (time_after(inode->dirtied_when, start))
break;
/* Was this inode dirtied too recently? */
//skip inodes dirtied later than the given timestamp
if (wbc->older_than_this && time_after(inode->dirtied_when,
*wbc->older_than_this))
break;
/* Is another pdflush already flushing this queue? */
//another pdflush is already handling the inodes of this super_block
if (current_is_pdflush() && !writeback_acquire(bdi))
break;
BUG_ON(inode->i_state & I_FREEING);
//take a reference on the inode
__iget(inode);
pages_skipped = wbc->pages_skipped;
//process the inode
__writeback_single_inode(inode, wbc);
if (wbc->sync_mode == WB_SYNC_HOLD) {
inode->dirtied_when = jiffies;
list_move(&inode->i_list, &sb->s_dirty);
}
//done: clear the device's BDI_pdflush flag
if (current_is_pdflush())
writeback_release(bdi);
if (wbc->pages_skipped != pages_skipped) {
/*
* writeback is not making progress due to locked
* buffers. Skip this inode for now.
*/
list_move(&inode->i_list, &sb->s_dirty);
}
spin_unlock(&inode_lock);
//yield the CPU if preemption is due
cond_resched();
//drop the reference
iput(inode);
spin_lock(&inode_lock);
if (wbc->nr_to_write <= 0)
break;
}
return; /* Leave any unwritten inodes on s_io */
}
Handling a single inode is done in __writeback_single_inode(); its code is:
static int
__writeback_single_inode(struct inode *inode,
struct writeback_control *wbc)
{
//if the inode is locked and the sync mode is not WB_SYNC_ALL,
//move the inode back onto the s_dirty list
if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_LOCK)) {
list_move(&inode->i_list, &inode->i_sb->s_dirty);
return 0;
}
/*
* It's a data-integrity sync. We must wait.
*/
//in WB_SYNC_ALL mode, wait for the inode to be unlocked
while (inode->i_state & I_LOCK) {
__iget(inode);
spin_unlock(&inode_lock);
__wait_on_inode(inode);
iput(inode);
spin_lock(&inode_lock);
}
return __sync_single_inode(inode, wbc);
}
Which leads into __sync_single_inode():
static int
__sync_single_inode(struct inode *inode, struct writeback_control *wbc)
{
unsigned dirty;
struct address_space *mapping = inode->i_mapping;
struct super_block *sb = inode->i_sb;
int wait = wbc->sync_mode == WB_SYNC_ALL;
int ret;
//this operation is not allowed while the inode is still locked
BUG_ON(inode->i_state & I_LOCK);
/* Set I_LOCK, reset I_DIRTY */
//dirty is non-zero if the inode is dirty
dirty = inode->i_state & I_DIRTY;
//set the I_LOCK flag
inode->i_state |= I_LOCK;
//clear I_DIRTY
inode->i_state &= ~I_DIRTY;
spin_unlock(&inode_lock);
//write back this inode's dirty pages
ret = do_writepages(mapping, wbc);
/* Don't write the inode if only I_DIRTY_PAGES was set */
if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
//if the inode itself is dirty, write it back to the filesystem
//via the super_block's write_inode method
int err = write_inode(inode, wait);
if (ret == 0)
ret = err;
}
if (wait) {
int err = filemap_fdatawait(mapping);
if (ret == 0)
ret = err;
}
spin_lock(&inode_lock);
inode->i_state &= ~I_LOCK;
if (!(inode->i_state & I_FREEING)) {
if (!(inode->i_state & I_DIRTY) &&
mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
/*
* We didn't write back all the pages. nfs_writepages()
* sometimes bales out without doing anything. Redirty
* the inode. It is still on sb->s_io.
*/
if (wbc->for_kupdate) {
/*
* For the kupdate function we leave the inode
* at the head of sb_dirty so it will get more
* writeout as soon as the queue becomes
* uncongested.
*/
inode->i_state |= I_DIRTY_PAGES;
list_move_tail(&inode->i_list, &sb->s_dirty);
} else {
/*
* Otherwise fully redirty the inode so that
* other inodes on this superblock will get some
* writeout. Otherwise heavy writing to one
* file would indefinitely suspend writeout of
* all the other files.
*/
inode->i_state |= I_DIRTY_PAGES;
inode->dirtied_when = jiffies;
list_move(&inode->i_list, &sb->s_dirty);
}
} else if (inode->i_state & I_DIRTY) {
/*
* Someone redirtied the inode while were writing back
* the pages.
*/
//the inode still carries dirty state: put it back on sb->s_dirty
list_move(&inode->i_list, &sb->s_dirty);
} else if (atomic_read(&inode->i_count)) {
/*
* The inode is clean, inuse
*/
//the reference count is non-zero: move it onto inode_in_use
list_move(&inode->i_list, &inode_in_use);
} else {
/*
* The inode is clean, unused
*/
//otherwise move it onto inode_unused
list_move(&inode->i_list, &inode_unused);
inodes_stat.nr_unused++;
}
}
//the inode is no longer locked: wake up any process waiting on it
wake_up_inode(inode);
return ret;
}
The actual page writing is done by do_writepages():
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
if (wbc->nr_to_write <= 0)
return 0;
if (mapping->a_ops->writepages)
return mapping->a_ops->writepages(mapping, wbc);
return generic_writepages(mapping, wbc);
}
Here, assume a_ops->writepages is NULL, so control passes to generic_writepages().
generic_writepages() is in fact a thin wrapper around mpage_writepages().
So let us focus on the implementation of mpage_writepages():
int
mpage_writepages(struct address_space *mapping,
struct writeback_control *wbc, get_block_t get_block)
{
struct backing_dev_info *bdi = mapping->backing_dev_info;
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
int ret = 0;
int done = 0;
int (*writepage)(struct page *page, struct writeback_control *wbc);
struct pagevec pvec;
int nr_pages;
pgoff_t index;
pgoff_t end = -1; /* Inclusive */
int scanned = 0;
int is_range = 0;
//if blocking is not allowed but the request queue is congested, give up now
if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
return 0;
}
writepage = NULL;
//if the get_block parameter is NULL, use the page cache's writepage method
if (get_block == NULL)
writepage = mapping->a_ops->writepage;
//initialize pvec
pagevec_init(&pvec, 0);
//in WB_SYNC_NONE mode, start from the index saved in mapping->writeback_index;
//otherwise start from 0
if (wbc->sync_mode == WB_SYNC_NONE) {
index = mapping->writeback_index; /* Start from prev offset */
} else {
index = 0; /* whole-file sweep */
scanned = 1;
}
if (wbc->start || wbc->end) {
index = wbc->start >> PAGE_CACHE_SHIFT;
end = wbc->end >> PAGE_CACHE_SHIFT;
is_range = 1;
scanned = 1;
}
retry:
//fetch dirty pages from the page cache
while (!done && (index <= end) &&
(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
PAGECACHE_TAG_DIRTY,
min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
unsigned i;
scanned = 1;
//walk the pages just fetched
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
/*
* At this point we hold neither mapping->tree_lock nor
* lock on the page itself: the page may be truncated or
* invalidated (changing page->mapping to NULL), or even
* swizzled back from swapper_space to tmpfs file
* mapping
*/
//lock the page
lock_page(page);
//the page no longer belongs to the page cache we are writing
if (unlikely(page->mapping != mapping)) {
unlock_page(page);
continue;
}
//is the page index still within range?
if (unlikely(is_range) && page->index > end) {
done = 1;
unlock_page(page);
continue;
}
//if the mode is not WB_SYNC_NONE and the page is under writeback, wait for it to finish
if (wbc->sync_mode != WB_SYNC_NONE)
wait_on_page_writeback(page);
//if the page is under writeback, or clearing its dirty flag fails, skip it
if (PageWriteback(page) ||
!clear_page_dirty_for_io(page)) {
unlock_page(page);
continue;
}
//if a writepage operation was selected, call it
if (writepage) {
ret = (*writepage)(page, wbc);
if (ret) {
if (ret == -ENOSPC)
set_bit(AS_ENOSPC,
&mapping->flags);
else
set_bit(AS_EIO,
&mapping->flags);
}
} else {
//otherwise call mpage_writepage()
bio = mpage_writepage(bio, page, get_block,
&last_block_in_bio, &ret, wbc);
}
//if an error occurred, or the requested number of pages has been written, stop
if (ret || (--(wbc->nr_to_write) <= 0))
done = 1;
//check again whether the request queue became congested
if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
done = 1;
}
}
//release pvec
pagevec_release(&pvec);
//yield the CPU if preemption is due
cond_resched();
}
if (!scanned && !done) {
/*
* We hit the last page and there is more work to be done: wrap
* back to the start of the file
*/
scanned = 1;
index = 0;
goto retry;
}
if (!is_range)
mapping->writeback_index = index;
//finally, submit the bio
if (bio)
mpage_bio_submit(WRITE, bio);
return ret;
}
pagevec_lookup_tag() searches the page cache for pages carrying the given tag. Its implementation closely resembles the find_get_pages() analyzed earlier and can be studied the same way.
When no writepage method is supplied, the work falls to mpage_writepage(). The process is as follows:
static struct bio *
mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
sector_t *last_block_in_bio, int *ret, struct writeback_control *wbc)
{
struct address_space *mapping = page->mapping;
struct inode *inode = page->mapping->host;
const unsigned blkbits = inode->i_blkbits;
unsigned long end_index;
const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
sector_t last_block;
sector_t block_in_file;
sector_t blocks[MAX_BUF_PER_PAGE];
unsigned page_block;
unsigned first_unmapped = blocks_per_page;
struct block_device *bdev = NULL;
int boundary = 0;
sector_t boundary_block = 0;
struct block_device *boundary_bdev = NULL;
int length;
struct buffer_head map_bh;
loff_t i_size = i_size_read(inode);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
struct buffer_head *bh = head;
/* If they're all mapped and dirty, do it */
page_block = 0;
do {
BUG_ON(buffer_locked(bh));
//there may be a hole here
if (!buffer_mapped(bh)) {
/*
* unmapped dirty buffers are created by
* __set_page_dirty_buffers -> mmapped data
*/
if (buffer_dirty(bh))
goto confused;
if (first_unmapped == blocks_per_page)
first_unmapped = page_block;
continue;
}
//not contiguous
if (first_unmapped != blocks_per_page)
goto confused; /* hole -> non-hole */
//non-contiguous buffers are not submitted this way
if (!buffer_dirty(bh) || !buffer_uptodate(bh))
goto confused;
//check whether this buffer is contiguous with the previous one
if (page_block) {
if (bh->b_blocknr != blocks[page_block-1] + 1)
goto confused;
}
//blocks[]: records the block numbers being submitted
blocks[page_block++] = bh->b_blocknr;
boundary = buffer_boundary(bh);
if (boundary) {
boundary_block = bh->b_blocknr;
boundary_bdev = bh->b_bdev;
}
bdev = bh->b_bdev;
} while ((bh = bh->b_this_page) != head);
if (first_unmapped)
goto page_is_mapped;
/*
* Page has buffers, but they are all unmapped. The page was
* created by pagein or read over a hole which was handled by
* block_read_full_page(). If this address_space is also
* using mpage_readpages then this can rarely happen.
*/
goto confused;
}
/*
* The page has no buffers: map it to disk
*/
BUG_ON(!PageUptodate(page));
block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits);
last_block = (i_size - 1) >> blkbits;
map_bh.b_page = page;
for (page_block = 0; page_block < blocks_per_page; ) {
map_bh.b_state = 0;
if (get_block(inode, block_in_file, &map_bh, 1))
goto confused;
if (buffer_new(&map_bh))
unmap_underlying_metadata(map_bh.b_bdev,
map_bh.b_blocknr);
if (buffer_boundary(&map_bh)) {
boundary_block = map_bh.b_blocknr;
boundary_bdev = map_bh.b_bdev;
}
if (page_block) {
if (map_bh.b_blocknr != blocks[page_block-1] + 1)
goto confused;
}
blocks[page_block++] = map_bh.b_blocknr;
boundary = buffer_boundary(&map_bh);
bdev = map_bh.b_bdev;
if (block_in_file == last_block)
break;
block_in_file++;
}
BUG_ON(page_block == 0);
first_unmapped = page_block;
page_is_mapped:
end_index = i_size >> PAGE_CACHE_SHIFT;
if (page->index >= end_index) {
/*
* The page straddles i_size. It must be zeroed out on each
* and every writepage invokation because it may be mmapped.
* "A file is mapped in multiples of the page size. For a file
* that is not a multiple of the page size, the remaining memory
* is zeroed when mapped, and writes to that region are not
* written out to the file."
*/
unsigned offset = i_size & (PAGE_CACHE_SIZE - 1);
char *kaddr;
if (page->index > end_index || !offset)
goto confused;
kaddr = kmap_atomic(page, KM_USER0);
memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
flush_dcache_page(page);
kunmap_atomic(kaddr, KM_USER0);
}
/*
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (bio && *last_block_in_bio != blocks[0] - 1)
bio = mpage_bio_submit(WRITE, bio);
//contiguous buffers within the page are submitted as segments of one bio
alloc_new:
if (bio == NULL) {
bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH);
if (bio == NULL)
goto confused;
}
/*
* Must try to add the page before marking the buffer clean or
* the confused fail path above (OOM) will be very confused when
* it finds all bh marked clean (i.e. it will not write anything)
*/
length = first_unmapped << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
bio = mpage_bio_submit(WRITE, bio);
goto alloc_new;
}
/*
* OK, we have our BIO, so we can now mark the buffers clean. Make
* sure to only clean buffers which we know we'll be writing.
*/
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
struct buffer_head *bh = head;
unsigned buffer_counter = 0;
do {
if (buffer_counter++ == first_unmapped)
break;
clear_buffer_dirty(bh);
bh = bh->b_this_page;
} while (bh != head);
/*
* we cannot drop the bh if the page is not uptodate
* or a concurrent readpage would fail to serialize with the bh
* and it would read from disk before we reach the platter.
*/
if (buffer_heads_over_limit && PageUptodate(page))
try_to_free_buffers(page);
}
BUG_ON(PageWriteback(page));
set_page_writeback(page);
unlock_page(page);
if (boundary || (first_unmapped != blocks_per_page)) {
bio = mpage_bio_submit(WRITE, bio);
if (boundary_block) {
write_boundary_block(boundary_bdev,
boundary_block, 1 << blkbits);
}
} else {
*last_block_in_bio = blocks[blocks_per_page - 1];
}
goto out;
//non-contiguous blocks are handled through a_ops->writepage
confused:
if (bio)
bio = mpage_bio_submit(WRITE, bio);
*ret = page->mapping->a_ops->writepage(page, wbc);
/*
* The caller has a ref on the inode, so *mapping is stable
*/
if (*ret) {
if (*ret == -ENOSPC)
set_bit(AS_ENOSPC, &mapping->flags);
else
set_bit(AS_EIO, &mapping->flags);
}
out:
return bio;
}
This code is not as complicated as it looks: if the blocks to be written are contiguous on disk, they are packed into a bio and submitted to the generic block layer; if they are not contiguous, a_ops->writepage() is called instead.
The a_ops->writepage and a_ops->writepages operations themselves will be analyzed in detail together with the VFS layer.
To summarize background_writeout(): it walks the kernel's superblocks, then walks each superblock's dirty and I/O inode lists, and performs the write operation on every inode obtained, until the number of dirty pages falls below the given threshold and the requested number of pages has been written back. The process itself is simple, but the path is long and takes patience to trace.
7.7.2: Writing back stale pages
To keep dirty pages from lingering in the cache so long that they are starved of writeback, the kernel performs a flush at regular intervals. Note the following code:
static struct timer_list wb_timer =
TIMER_INITIALIZER(wb_timer_fn, 0, 0);
void __init page_writeback_init(void)
{
...
mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100);
...
}
As this code shows, the kernel arms the wb_timer timer at initialization with a timeout of (dirty_writeback_centisecs * HZ) / 100 jiffies. dirty_writeback_centisecs defaults to 500 in the kernel (i.e. 5 seconds) and can be reconfigured by the user at runtime.
When the timer expires, the timer function wb_timer_fn() runs:
static void wb_timer_fn(unsigned long unused)
{
//if waking a pdflush thread fails, re-arm the timer to fire 1 second later
if (pdflush_operation(wb_kupdate, 0) < 0)
mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
}
The handler run by the pdflush thread is wb_kupdate(). Its code is:
static void wb_kupdate(unsigned long arg)
{
unsigned long oldest_jif;
unsigned long start_jif;
unsigned long next_jif;
long nr_to_write;
struct writeback_state wbs;
struct writeback_control wbc = {
.bdi = NULL,
.sync_mode = WB_SYNC_NONE,
.older_than_this = &oldest_jif,
.nr_to_write = 0,
.nonblocking = 1,
.for_kupdate = 1,
};
//write dirty superblocks back to disk
sync_supers();
get_writeback_state(&wbs);
oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
start_jif = jiffies;
next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
nr_to_write = wbs.nr_dirty + wbs.nr_unstable +
(inodes_stat.nr_inodes - inodes_stat.nr_unused);
while (nr_to_write > 0) {
wbc.encountered_congestion = 0;
wbc.nr_to_write = MAX_WRITEBACK_PAGES;
writeback_inodes(&wbc);
if (wbc.nr_to_write > 0) {
if (wbc.encountered_congestion)
blk_congestion_wait(WRITE, HZ/10);
else
break; /* All the old data is written */
}
nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
}
//if next_jif < jiffies + HZ, set next_jif = jiffies + HZ
if (time_before(next_jif, jiffies + HZ))
next_jif = jiffies + HZ;
//re-arm the timer
if (dirty_writeback_centisecs)
mod_timer(&wb_timer, next_jif);
}
As the code shows, this flush path uses the same interface, writeback_inodes(), as the background_writeout() analyzed earlier.
So, at regular intervals, the pdflush pool performs one round of page writeback.
Unlike the pdflush run of background_writeout(), this one is started by the timer at a fixed period, whereas the former is started only when the amount of dirty memory crosses its threshold.