自kernel 2.6.32起,脏数据的下刷不再使用pdflush,而是把这部分功能并入BDI(backing device info)机制:内核为每个设备创建一个名为“flush-主设备号:次设备号”的线程,专门负责该设备脏数据的下刷。
backing_dev_info结构体
/*
 * Per-device writeback/readahead bookkeeping (the "BDI").
 * In the code quoted in this file one instance is embedded in each
 * request_queue (q->backing_dev_info) and a standalone
 * default_backing_dev_info is also defined.
 */
struct backing_dev_info {
struct list_head bdi_list; /* node on the global bdi_list; linked in by bdi_register() */
unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
unsigned long state; /* bit numbers from enum bdi_state; always use atomic bitops on this */
unsigned int capabilities; /* Device capabilities */
congested_fn *congested_fn; /* Function pointer if device is md/dm */
void *congested_data; /* Pointer to aux data for congested func */
char *name; /* short label, e.g. "block" for request queues, "default" for the default bdi */
struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; /* one per-cpu counter per enum bdi_stat_item */
unsigned long bw_time_stamp; /* last time write bw is updated */
unsigned long dirtied_stamp; /* NOTE(review): presumably pages dirtied at bw_time_stamp, like written_stamp - confirm */
unsigned long written_stamp; /* pages written at bw_time_stamp */
unsigned long write_bandwidth; /* the estimated write bandwidth */
unsigned long avg_write_bandwidth; /* further smoothed write bw */
/*
* The base dirty throttle rate, re-calculated on every 200ms.
* All the bdi tasks' dirty rate will be curbed under it.
* @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
* in small steps and is much more smooth/stable than the latter.
*/
unsigned long dirty_ratelimit;
unsigned long balanced_dirty_ratelimit;
struct fprop_local_percpu completions; /* NOTE(review): looks like a per-bdi completion proportion tracker - confirm */
int dirty_exceeded;
unsigned int min_ratio; /* NOTE(review): presumably a lower bound on this bdi's share of dirty memory - confirm units */
unsigned int max_ratio, max_prop_frac;
struct bdi_writeback wb; /* default writeback info for this bdi */
spinlock_t wb_lock; /* protects work_list */
struct list_head work_list; /* NOTE(review): presumably pending struct wb_writeback_work items, linked via their ->list - confirm */
struct device *dev; /* device created by bdi_register(); non-NULL means already registered */
struct timer_list laptop_mode_wb_timer; /* set up with laptop_mode_timer_fn in blk_alloc_queue_node() */
#ifdef CONFIG_DEBUG_FS
/* Only present when CONFIG_DEBUG_FS is enabled; presumably filled in by bdi_debug_register(). */
struct dentry *debug_dir;
struct dentry *debug_stats;
#endif
};
对应的state状态:
/*
 * Bit numbers for backing_dev_info.state.
 *
 * Each enumerator is a bit index (not a mask) and is meant to be used with
 * the atomic bit operations, e.g. set_bit(BDI_REGISTERED, &bdi->state).
 */
enum bdi_state {
	/*
	 * Default embedded wb allocated.
	 * Author's note: a writeback task has already been allocated for
	 * this device.
	 */
	BDI_wb_alloc,
	/* The async (write) queue is getting full */
	BDI_async_congested,
	/* The sync queue is getting full */
	BDI_sync_congested,
	/*
	 * bdi_register() was done.
	 * Author's note: the device's flush thread has been created.
	 * NOTE(review): bdi_register() as quoted in these notes creates no
	 * thread itself - confirm against the kernel version in use.
	 */
	BDI_REGISTERED,
	/* Writeback is in progress */
	BDI_writeback_running,
	/* Available bits start here */
	BDI_unused,
};
用于sys接口计数统计的数值:
/*
 * Indices into backing_dev_info.bdi_stat[].
 *
 * NR_BDI_STAT_ITEMS is not a counter: it must stay last because it is the
 * number of counters and sizes the bdi_stat[] array.
 */
enum bdi_stat_item {
	BDI_RECLAIMABLE = 0,
	BDI_WRITEBACK   = 1,
	BDI_DIRTIED     = 2,
	BDI_WRITTEN     = 3,
	NR_BDI_STAT_ITEMS	/* one past the last counter */
};
初始化函数blk_alloc_queue_node负责初始化ra_pages、state、capabilities、name(较早的内核还会初始化unplug_io_fn、unplug_io_data,上面列出的结构体中已没有这两个字段),其余字段由bdi_init函数初始化。
struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
{
struct request_queue *q;
int err;
q = kmem_cache_alloc_node(blk_requestq_cachep,
gfp_mask | __GFP_ZERO, node_id);
if (!q)
return NULL;
q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
if (q->id < 0)
goto fail_q;
q->backing_dev_info.ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
q->backing_dev_info.state = 0;
q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
q->backing_dev_info.name = "block";
q->node = node_id;
err = bdi_init(&q->backing_dev_info);
setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
laptop_mode_timer_fn, (unsigned long) q); /* 注:初始化laptop mode回写定时器,到期回调为laptop_mode_timer_fn;setup_timer只绑定回调,并不启动定时器(原笔记写“设置5秒超时执行下刷函数”,具体超时时间待确认) */
setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
backing_dev_info结构描述了每个设备的BDI状态信息。BDI还通过debugfs提供了一个查看接口,位于/sys/kernel/debug/bdi目录下,每个设备以设备号(主:次)区分,可以查看各设备与BDI有关的状态等。
下面是CentOS的信息:
海思3520的3.10.y内核下没有此信息;但内核配置中是有开启CONFIG_DEBUG_FS宏的,却仍然没有对应的输出,原因待查(可能是debugfs没有挂载到/sys/kernel/debug,待确认)。
内核启动
1、创建名为sync_supers的线程。此线程只由定时器唤醒,没有其他唤醒方式:每5秒被唤醒一次并执行函数sync_supers,用来下刷系统super_blocks链表中所有超级块的元数据信息。(疑问:在当前代码中没有找到对应实现;较新的内核似乎已经移除了sync_supers线程,待确认。)
2、定义一个默认的backing_dev_info结构default_backing_dev_info,并在default_bdi_init中完成初始化和注册。旧内核此时还会创建一个线程bdi-default;下面引用的代码中没有创建该线程,而是创建了名为writeback的工作队列。
/*
 * The system-wide default BDI. It is initialised and registered under the
 * name "default" by default_bdi_init(). Fields not listed here start out
 * zeroed, as for any object with static storage.
 */
struct backing_dev_info default_backing_dev_info = {
	.ra_pages	= (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
	.state		= 0,
	.capabilities	= BDI_CAP_MAP_COPY,
	.name		= "default",
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);
/*
 * Boot-time setup for writeback: allocate the "writeback" workqueue, then
 * initialise the default and the no-op BDIs.
 *
 * Returns -ENOMEM if the workqueue cannot be allocated; otherwise the result
 * of bdi_init() on noop_backing_dev_info. A failure to initialise the
 * default BDI only skips its registration - it is not reflected in the
 * return value, and the result of bdi_register() is ignored as well.
 */
static int __init default_bdi_init(void)
{
	bdi_wq = alloc_workqueue("writeback",
				 WQ_MEM_RECLAIM | WQ_FREEZABLE |
				 WQ_UNBOUND | WQ_SYSFS,
				 0);
	if (bdi_wq == NULL)
		return -ENOMEM;

	/* Register the default BDI only when its initialisation succeeded. */
	if (bdi_init(&default_backing_dev_info) == 0)
		bdi_register(&default_backing_dev_info, NULL, "default");

	/* The no-op BDI is initialised unconditionally; its status is returned. */
	return bdi_init(&noop_backing_dev_info);
}
subsys_initcall(default_bdi_init);
3、有新的磁盘设备添加时,add_disk取出该设备请求队列中内嵌的backing_dev_info结构(disk->queue->backing_dev_info),通过bdi_register_dev注册,并由bdi_register将此结构挂到bdi_list链表尾;
void add_disk(struct gendisk *disk)
{
struct backing_dev_info *bdi;
/* Register BDI before referencing it from bdev */
bdi = &disk->queue->backing_dev_info;
bdi_register_dev(bdi, disk_devt(disk));
/*
 * Register @bdi under a name derived from the device number @dev, formatted
 * as "<major>:<minor>". The device is created without a parent.
 *
 * Returns whatever bdi_register() returns.
 */
int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
{
	unsigned int major = MAJOR(dev);
	unsigned int minor = MINOR(dev);

	return bdi_register(bdi, NULL, "%u:%u", major, minor);
}
EXPORT_SYMBOL(bdi_register_dev);
int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...)
{
va_list args;
struct device *dev;
if (bdi->dev) /* The driver needs to use separate queues per device */
return 0;
va_start(args, fmt);
dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
va_end(args);
if (IS_ERR(dev))
return PTR_ERR(dev);
bdi->dev = dev;
bdi_debug_register(bdi, dev_name(dev));
set_bit(BDI_REGISTERED, &bdi->state);
spin_lock_bh(&bdi_lock);
list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
spin_unlock_bh(&bdi_lock);
trace_writeback_bdi_register(bdi);
return 0;
}
EXPORT_SYMBOL(bdi_register);
下刷动作
wb_writeback_work用来描述一次下刷任务的信息,如该次下刷任务以何种模式下刷,下刷多少数据等,多用在唤醒下刷线程之前定义。
/*
 * Describes one writeback request: which mode to write back in, how much to
 * write, why it was started, and how the requester is told it has finished.
 */
struct wb_writeback_work {
long nr_pages; /* amount of data to write back, in pages */
struct super_block *sb; /* NOTE(review): presumably limits the work to one superblock when set - confirm */
unsigned long *older_than_this; /* NOTE(review): presumably a time cutoff for which dirty data qualifies - confirm */
enum writeback_sync_modes sync_mode; /* writeback mode for this request */
/* One-bit flags; NOTE(review): meanings inferred from names only - confirm against the writeback code. */
unsigned int tagged_writepages:1;
unsigned int for_kupdate:1;
unsigned int range_cyclic:1;
unsigned int for_background:1;
enum wb_reason reason; /* why was writeback initiated? */
struct list_head list; /* pending work list */
struct completion *done; /* set if the caller waits */
};
触发下刷操作时,有两种用法:可以将此work挂到工作队列中,由下刷线程来完成任务;也可以根据此结构中的信息填充writeback_control结构中的字段,然后直接调用下刷函数,完成工作任务。