上一篇文章《设备文件与设备号》当然不是突然穿插而来的自言自语,而是理解本文的前提,下面来看。flush-x:y是一类进程,这在系列的上一篇文章里已经讲到过,系统的绝大部分的bdi设备都会有对应的flush-x:y内核进程,而这个x:y是对应bdi设备的设备号。
先看一下系统当前挂载的文件系统:
[root@localhost lenky]# cat /proc/mounts
rootfs / rootfs rw 0 0
/proc /proc proc rw,relatime 0 0
/sys /sys sysfs rw,seclabel,relatime 0 0
udev /dev devtmpfs rw,seclabel,relatime,size=502568k,nr_inodes=125642,mode=755 0 0
devpts /dev/pts devpts rw,seclabel,relatime,gid=5,mode=620,ptmxmode=000 0 0
tmpfs /dev/shm tmpfs rw,seclabel,relatime 0 0
/dev/mapper/VolGroup-lv_root / ext4 rw,seclabel,relatime,barrier=1,data=ordered 0 0
none /selinux selinuxfs rw,relatime 0 0
udev /dev devtmpfs rw,seclabel,relatime,size=502568k,nr_inodes=125642,mode=755 0 0
/proc/bus/usb /proc/bus/usb usbfs rw,relatime 0 0
/dev/sda1 /boot ext4 rw,seclabel,relatime,barrier=1,data=ordered 0 0
/dev/mapper/VolGroup-lv_home /home ext4 rw,seclabel,relatime,barrier=1,data=ordered 0 0
none /proc/sys/fs/binfmt_misc binfmt_misc rw,relatime 0 0
cgroup /cgroup/cpuset cgroup rw,relatime,cpuset 0 0
cgroup /cgroup/cpu cgroup rw,relatime,cpu 0 0
cgroup /cgroup/cpuacct cgroup rw,relatime,cpuacct 0 0
cgroup /cgroup/memory cgroup rw,relatime,memory 0 0
cgroup /cgroup/devices cgroup rw,relatime,devices 0 0
cgroup /cgroup/freezer cgroup rw,relatime,freezer 0 0
cgroup /cgroup/net_cls cgroup rw,relatime,net_cls 0 0
cgroup /cgroup/blkio cgroup rw,relatime,blkio 0 0
sunrpc /var/lib/nfs/rpc_pipefs rpc_pipefs rw,relatime 0 0
/etc/auto.misc /misc autofs rw,relatime,fd=7,pgrp=1393,timeout=300,minproto=5,maxproto=5,indirect 0 0
-hosts /net autofs rw,relatime,fd=13,pgrp=1393,timeout=300,minproto=5,maxproto=5,indirect 0 0
/dev/sdb1 /home/lenky/sdb/sdb1 ext4 rw,seclabel,relatime,barrier=1,data=ordered 0 0
/dev/sdc1 /home/lenky/sdc/sdc1 ext4 rw,seclabel,relatime,barrier=1,data=ordered 0 0
/dev/sdc2 /home/lenky/sdc/sdc2 ext4 rw,seclabel,relatime,barrier=1,data=ordered 0 0
[root@localhost lenky]#
注意需要关注的重点:
/dev/mapper/VolGroup-lv_root / ext4
/dev/mapper/VolGroup-lv_home /home ext4
/dev/sdb1 /home/lenky/sdb/sdb1 ext4
/dev/sdc1 /home/lenky/sdc/sdc1 ext4
/dev/sdc2 /home/lenky/sdc/sdc2 ext4
对应的设备号分别为:
[root@localhost lenky]# ls -l /dev/dm-*
brw-rw----. 1 root disk 253, 0 Jan 12 06:24 /dev/dm-0
brw-rw----. 1 root disk 253, 1 Jan 12 06:24 /dev/dm-1
brw-rw----. 1 root disk 253, 2 Jan 12 06:24 /dev/dm-2
[root@localhost lenky]#
[root@localhost lenky]# ls -l /dev/mapper/*
crw-rw----. 1 root root 10, 236 Jan 12 06:24 /dev/mapper/control
lrwxrwxrwx. 1 root root 7 Jan 12 06:24 /dev/mapper/VolGroup-lv_home -> ../dm-2
lrwxrwxrwx. 1 root root 7 Jan 12 06:24 /dev/mapper/VolGroup-lv_root -> ../dm-0
lrwxrwxrwx. 1 root root 7 Jan 12 06:24 /dev/mapper/VolGroup-lv_swap -> ../dm-1
[root@localhost lenky]#
[root@localhost lenky]# ls -l /dev/sda*
brw-rw----. 1 root disk 8, 0 Jan 12 06:24 /dev/sda
brw-rw----. 1 root disk 8, 1 Jan 12 06:24 /dev/sda1
brw-rw----. 1 root disk 8, 2 Jan 12 06:24 /dev/sda2
[root@localhost lenky]#
[root@localhost lenky]# ls -l /dev/sdb*
brw-rw----. 1 root disk 8, 16 Jan 12 06:25 /dev/sdb
brw-rw----. 1 root disk 8, 17 Jan 12 06:25 /dev/sdb1
[root@localhost lenky]#
[root@localhost lenky]# ls -l /dev/sdc*
brw-rw----. 1 root disk 8, 32 Jan 12 06:29 /dev/sdc
brw-rw----. 1 root disk 8, 33 Jan 12 06:39 /dev/sdc1
brw-rw----. 1 root disk 8, 34 Jan 12 06:29 /dev/sdc2
brw-rw----. 1 root disk 8, 35 Jan 12 06:29 /dev/sdc3
[root@localhost lenky]#
在任意时刻,我们能看到的flush-x:y内核进程并不固定,原因之前已经说过:
[root@localhost lenky]# ps aux | grep flush-
root 1250 0.0 0.0 0 0 ? S 06:24 0:00 [flush-253:0]
root 2180 0.0 0.0 0 0 ? S 06:39 0:00 [flush-253:2]
root 2186 2.0 0.0 0 0 ? S 06:39 0:07 [flush-8:32]
root 2329 0.0 0.0 103204 800 pts/3 S+ 06:45 0:00 grep flush-
[root@localhost lenky]#
调用sync命令,强制同步操作会创建所有对应的flush-x:y内核进程:
[root@localhost lenky]# sync
[root@localhost lenky]# ps aux | grep flush-
root 1250 0.0 0.0 0 0 ? S 06:24 0:00 [flush-253:0]
root 2180 0.0 0.0 0 0 ? S 06:39 0:00 [flush-253:2]
root 2186 2.0 0.0 0 0 ? S 06:39 0:07 [flush-8:32]
root 2331 0.0 0.0 0 0 ? S 06:45 0:00 [flush-8:0]
root 2332 0.0 0.0 0 0 ? S 06:45 0:00 [flush-8:16]
root 2334 0.0 0.0 103204 800 pts/3 S+ 06:45 0:00 grep flush-
[root@localhost lenky]#
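可以看到flush-x:y内核进程是对应bdi整个设备的,比如这里的单个磁盘,而不是各个磁盘分区。例如,挂载的分区/dev/sdb1(8:17)、/dev/sdc1(8:33)、/dev/sdc2(8:34)并没有各自的flush进程,出现的是flush-8:16(/dev/sdb)和flush-8:32(/dev/sdc),flush-8:0则对应/dev/sda;而flush-253:0和flush-253:2分别对应dm-0(VolGroup-lv_root)和dm-2(VolGroup-lv_home)这两个逻辑卷设备。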
最后来看代码,flush-x:y内核进程的主体函数是bdi_writeback_thread(…):
/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * wakes up periodically and does kupdated style flushing.
 *
 * This is the thread function of the per-device flusher kernel thread
 * (the one that shows up as "flush-x:y").
 *
 * @data: the struct bdi_writeback this thread services; its ->bdi member
 *        supplies the backing device whose work_list is checked below.
 *
 * Loops until kthread_should_stop() reports a stop request, then drains
 * any work still queued. Always returns 0.
 */
int
bdi_writeback_thread(
void
*data)
{
struct
bdi_writeback *wb = data;
struct
backing_dev_info *bdi = wb->bdi;
long
pages_written;
/*
 * NOTE(review): the effect of PF_SWAPWRITE is not visible in this
 * excerpt. set_freezable() goes together with the try_to_freeze() call
 * at the bottom of the loop - presumably so the thread can be frozen;
 * confirm.
 */
current->flags |= PF_SWAPWRITE;
set_freezable();
/*
 * Timestamp of the most recent activity; refreshed in the loop whenever
 * a writeback pass actually wrote pages.
 */
wb->last_active = jiffies;
/*
 * Our parent may run at a different priority, just set us to normal
 */
set_user_nice(current, 0);
trace_writeback_thread_start(bdi);
while
(!kthread_should_stop()) {
/*
 * Remove own delayed wake-up timer, since we are already awake
 * and we'll take care of the periodic write-back.
 */
del_timer(&wb->wakeup_timer);
/*
 * One writeback pass; the result is treated as the number of pages
 * written. NOTE(review): the second argument is 0 here and 1 in the
 * exit path below - its meaning is defined by wb_do_writeback(), which
 * is not shown here.
 */
pages_written = wb_do_writeback(wb, 0);
trace_writeback_pages_written(pages_written);
/* Only a pass that wrote something counts as activity. */
if
(pages_written)
wb->last_active = jiffies;
/*
 * Enter the sleeping state *before* re-checking for work - presumably
 * so that a wake-up arriving between the check and the sleep below is
 * not lost (TODO confirm against the waker side).
 */
set_current_state(TASK_INTERRUPTIBLE);
/*
 * Work is already queued, or a stop was requested: revert to running
 * and start the next iteration without sleeping.
 */
if
(!list_empty(&bdi->work_list) || kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
continue
;
}
/*
 * Dirty I/O remains and dirty_writeback_interval is non-zero: sleep
 * with a timeout. The interval is multiplied by 10 before being passed
 * to msecs_to_jiffies(), i.e. it is kept in units of 10 ms.
 */
if
(wb_has_dirty_io(wb) && dirty_writeback_interval)
schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
else
{
/*
 * We have nothing to do, so can go sleep without any
 * timeout and save power. When a work is queued or
 * something is made dirty - we will be woken up.
 */
schedule();
}
try_to_freeze();
}
/* Flush any work that raced with us exiting */
if
(!list_empty(&bdi->work_list))
wb_do_writeback(wb, 1);
trace_writeback_thread_stop(bdi);
return
0;
}
函数主体是一个while循环,while语句调用一个判断函数决定是否该结束循环:
/**
 * kthread_should_stop - has the calling kthread been asked to stop?
 *
 * Reports the should_stop flag kept in the current task's kthread
 * structure. kthread_stop() sets that flag and wakes the thread, so once
 * this returns true the thread function should return; the value it
 * returns is what kthread_stop() hands back to its caller.
 */
int kthread_should_stop(void)
{
	int stop_requested = to_kthread(current)->should_stop;

	return stop_requested;
}
EXPORT_SYMBOL(kthread_should_stop);
而这个should_stop标记字段会在bdi-default内核进程的KILL_THREAD动作里进行修改(上一篇文章提到过),也就是通过这个字段实现bdi-default内核进程对flush-x:y内核进程的控制:
case
KILL_THREAD:
__set_current_state(TASK_RUNNING);
kthread_stop(task);
break
;
/**
 * kthread_stop - stop a thread created by kthread_create().
 * @k: thread created by kthread_create().
 *
 * Sets kthread_should_stop() for @k to return true, wakes it, and
 * waits for it to exit. This can also be called after kthread_create()
 * instead of calling wake_up_process(): the thread will exit without
 * calling threadfn().
 *
 * If threadfn() may call do_exit() itself, the caller must ensure
 * task_struct can't go away.
 *
 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
 * was never called.
 */
int
kthread_stop(
struct
task_struct *k)
{
struct
kthread *kthread;
int
ret;
trace_sched_kthread_stop(k);
/*
 * Paired with put_task_struct() below; presumably keeps @k valid so its
 * exit_code can still be read after the thread has exited - confirm.
 */
get_task_struct(k);
kthread = to_kthread(k);
/* NOTE(review): the reason for this barrier is not evident from this excerpt. */
barrier();
/*
 * It might have exited: a non-NULL vfork_done is used here as the
 * "still running" test, and only then is the stop requested.
 */
if
(k->vfork_done != NULL) {
/* This is the flag that kthread_should_stop() reads in the target thread. */
kthread->should_stop = 1;
wake_up_process(k);
/* Block until the thread's 'exited' completion is signalled. */
wait_for_completion(&kthread->exited);
}
/* The task's exit code is what gets handed back to the caller. */
ret = k->exit_code;
put_task_struct(k);
trace_sched_kthread_stop_ret(ret);
return
ret;
}
EXPORT_SYMBOL(kthread_stop);
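while循环内的工作,除去其它细节,值得关注的主要有三点:第一,更新最后活动时间(语句:wb->last_active = jiffies;,注意在循环内只有当本轮确实写出了页面、即pages_written非0时才会更新),这样bdi-default内核进程才能通过last_active这个字段来判断flush-x:y内核进程的活动状态,如果很久没有活动(比较的就是last_active字段)则把它kill掉;第二,当然就是进程的主要工作,调用函数wb_do_writeback(…)进行同步操作;第三,在进行一次同步操作之后,如果work_list上已经有新的工作或者进程被要求停止,则不睡眠,直接进入下一轮循环;否则,如果仍有脏数据需要同步(wb_has_dirty_io(wb)为真)并且dirty_writeback_interval不为0,那么先睡眠,等间隔时间(dirty_writeback_interval * 10毫秒,默认5秒)后超时醒来继续工作;如果已经没有脏数据需要同步(或者dirty_writeback_interval为0),那么直接schedule()调度其它进程,而进程本身进入可中断睡眠状态(注意前面的语句:set_current_state(TASK_INTERRUPTIBLE);),等待后续被唤醒继续工作或被kill掉。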
整个bdi-default和flush-x:y内核进程讲完了,为什么会有这样的设计?在这里有很好的说明:http://lwn.net/Articles/396757/ 。相比以前的多个pdflush间隔醒来,改进之后只需bdi-default一个内核进程间隔醒来就行了,这在电池供电设备上明显比较省电。
转载请保留地址:http://lenky.info/2012/02/18/linux%e5%86%85%e6%a0%b8%e8%bf%9b%e7%a8%8b%e8%af%a6%e8%a7%a3%e4%b9%8b%e4%b8%89%ef%bc%9aflush-xy/ 或 http://lenky.info/?p=1138