/proc/meminfo中有统计buffer和cache,这两个具体代表什么含义,最近分析了一下内核代码,希望可以解释清楚。
[root@rhevm ~]# cat /proc/meminfo
MemTotal: 5845796 kB
MemFree: 3648956 kB
Buffers: 42812 kB
Cached: 677024 kB
SwapCached: 0 kB
Active: 1620200 kB
Inactive: 413544 kB
Active(anon): 1316896 kB
Inactive(anon): 13576 kB
Active(file): 303304 kB
Inactive(file): 399968 kB
Unevictable: 0 kB
Mlocked: 0 kB
SwapTotal: 3071992 kB
SwapFree: 3071992 kB
Dirty: 1828 kB
Writeback: 0 kB
AnonPages: 1313984 kB
Mapped: 84048 kB
Shmem: 16496 kB
Slab: 70456 kB
SReclaimable: 41976 kB
SUnreclaim: 28480 kB
KernelStack: 3792 kB
PageTables: 22216 kB
NFS_Unstable: 0 kB
Bounce: 0 kB
WritebackTmp: 0 kB
CommitLimit: 5994888 kB
Committed_AS: 2613668 kB
VmallocTotal: 34359738367 kB
VmallocUsed: 26560 kB
VmallocChunk: 34359707596 kB
HardwareCorrupted: 0 kB
AnonHugePages: 864256 kB
HugePages_Total: 0
HugePages_Free: 0
HugePages_Rsvd: 0
HugePages_Surp: 0
Hugepagesize: 2048 kB
DirectMap4k: 8180 kB
DirectMap2M: 6135808 kB
meminfo的内容是通过meminfo_proc_show函数处理的。
static int meminfo_proc_show(struct seq_file *m, void *v) 在/fs/proc/meminfo.c中。
cache的计算如下:
cached = global_page_state(NR_FILE_PAGES) -
total_swapcache_pages - i.bufferram;
buffer的计算如下:
val->bufferram = nr_blockdev_pages();
首先分析buffer的统计:
/*
 * nr_blockdev_pages - total number of page-cache pages held by all
 * block-device inodes.  This is the value reported as "Buffers:" in
 * /proc/meminfo (via si_meminfo -> val->bufferram).
 */
long nr_blockdev_pages(void)
{
struct block_device *bdev;
long ret = 0;
/* bdev_lock protects the global all_bdevs list while we walk it. */
spin_lock(&bdev_lock);
list_for_each_entry(bdev, &all_bdevs, bd_list) {
/* Sum the pages cached in each device inode's address_space. */
ret += bdev->bd_inode->i_mapping->nrpages;
}
spin_unlock(&bdev_lock);
return ret;
}
buffer的统计实际上是把所有块设备文件中的mapping进行了汇总。
内核什么时候会对块设备的mapping进行处理?
struct buffer_head *
__getblk(struct block_device *bdev, sector_t block, unsigned size)
struct buffer_head *
__bread(struct block_device *bdev, sector_t block, unsigned size)
实际上很多资料提到linux有块缓存和页缓存,块缓存针对的是块设备,根据请求的块设备、偏移量、大小,在缓存中查找,如果存在的话,就不用再向通用块层提交bio了。
页缓存针对的是文件内容的缓存,根据请求的文件inode、偏移量、大小进行查找;在读文件或者写文件的时候,把文件内容缓存在内存中,以提高性能。
但是,分析内核的代码发现,块缓存已经基于页缓存机制来实现,只不过定位的文件是设备文件的inode.
因此遍历所有块设备的bd_inode->i_mapping->nrpages,就可以计算出块缓存的大小。
list_for_each_entry(bdev, &all_bdevs, bd_list) {
ret += bdev->bd_inode->i_mapping->nrpages;
}
cached = global_page_state(NR_FILE_PAGES) -
total_swapcache_pages - i.bufferram;
cache的计算需要减去buffer,是因为buffer块缓存也是基于cache页缓存实现的,同样使用NR_FILE_PAGES标识,因此需要减掉buffer。
再减掉swap缓存(total_swapcache_pages),得到的才是实际内存中文件内容cache的大小。
那么什么情况下会使用cache,什么情况下会使用buffer?
根据前面的分析,如果调用的是__bread,则会统计到buffer上,如果是普通文件内容的缓存会被统计到cache上。
分析内核代码,调用__bread的地方,基本上是文件系统代码中获取元数据的部分。这部分会被统计到buffer上。
vfs在普通文件内容读写流程中,都会使用缓存机制,都会被统计到cache上。
/*
 * __getblk - get the buffer_head for (bdev, block, size), from the block
 * cache if present, otherwise via the slow path that allocates and grows
 * the cache.  (Return type "struct buffer_head *" is on the preceding
 * prototype; this is quoted kernel source.)
 */
__getblk(struct block_device *bdev, sector_t block, unsigned size)
{
struct buffer_head *bh = __find_get_block(bdev, block, size); // first look the block up in the block cache; return it directly on a hit
might_sleep();
if (bh == NULL)
bh = __getblk_slow(bdev, block, size); // on a miss, take the slow path which performs the actual allocation/IO setup
return bh;
}
/*
 * __find_get_block - look up a buffer_head in the block cache.
 * Tries the per-CPU buffer_head LRU first, then falls back to searching
 * the block device's page cache; a hit found the slow way is installed
 * into the LRU for future fast lookups.
 */
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
{
struct buffer_head *bh = lookup_bh_lru(bdev, block, size); // fast lookup in the per-CPU LRU
if (bh == NULL) {
bh = __find_get_block_slow(bdev, block); // LRU miss: search via the page cache
if (bh)
bh_lru_install(bh); // cache the hit in the LRU
}
if (bh)
touch_buffer(bh);
return bh;
}
/*
 * __find_get_block_slow - search the block device's page cache for the
 * buffer_head covering @block.  This demonstrates that the "block cache"
 * is layered on the page cache: the lookup goes through the device
 * inode's i_mapping, which is exactly what nr_blockdev_pages() sums up
 * for the Buffers: line in /proc/meminfo.
 */
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
{
struct inode *bd_inode = bdev->bd_inode;
struct address_space *bd_mapping = bd_inode->i_mapping; // address_space of the block device's inode
struct buffer_head *ret = NULL;
pgoff_t index;
struct buffer_head *bh;
struct buffer_head *head;
struct page *page;
int all_mapped = 1;
/* Convert the block number to a page-cache index. */
index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
page = find_get_page(bd_mapping, index); // if the page is cached, walk it for the matching buffer_head
if (!page)
goto out;
/* private_lock serializes against buffer attach/detach on this page. */
spin_lock(&bd_mapping->private_lock);
if (!page_has_buffers(page))
goto out_unlock;
head = page_buffers(page);
bh = head;
/* Walk the circular list of buffers attached to the page. */
do {
if (!buffer_mapped(bh))
all_mapped = 0;
else if (bh->b_blocknr == block) {
ret = bh;
get_bh(bh); /* take a reference before returning it */
goto out_unlock;
}
bh = bh->b_this_page;
} while (bh != head);
/* we might be here because some of the buffers on this page are
 * not mapped. This is due to various races between
 * file io on the block device and getblk. It gets dealt with
 * elsewhere, don't buffer_error if we had some unmapped buffers
 */
if (all_mapped) {
printk("__find_get_block_slow() failed. "
"block=%llu, b_blocknr=%llu\n",
(unsigned long long)block,
(unsigned long long)bh->b_blocknr);
printk("b_state=0x%08lx, b_size=%zu\n",
bh->b_state, bh->b_size);
printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
}
out_unlock:
spin_unlock(&bd_mapping->private_lock);
page_cache_release(page);
out:
return ret;
}
/* If we *know* page->private refers to buffer_heads */
/* Returns the head of the buffer_head list attached to @page;
 * BUG_ON fires if the page has no private data (PagePrivate clear). */
#define page_buffers(page) \
({ \
BUG_ON(!PagePrivate(page)); \
((struct buffer_head *)page_private(page)); \
})
查找失败处理:
/*
 * __getblk_slow - slow path of __getblk: validate the requested block
 * size, then loop retrying the lookup, growing the block cache
 * (grow_buffers) until the buffer exists or allocation hard-fails.
 */
static struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block, int size)
{
/* Size must be multiple of hard sectorsize */
if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
(size < 512 || size > PAGE_SIZE))) {
printk(KERN_ERR "getblk(): invalid block size %d requested\n",
size);
printk(KERN_ERR "logical block size: %d\n",
bdev_logical_block_size(bdev));
dump_stack();
return NULL;
}
for (;;) {
struct buffer_head *bh;
int ret;
/* Retry the lookup: another CPU (or our grow below) may have
 * created the buffer since the fast path missed. */
bh = __find_get_block(bdev, block, size);
if (bh)
return bh;
/* Extend the block cache with a page covering this block. */
ret = grow_buffers(bdev, block, size);
if (ret < 0)
return NULL;
if (ret == 0)
free_more_memory(); /* no memory: reclaim and loop again */
}
}
调用grow_buffers进行块缓存的扩展。