内核在哪里注册了/proc/mdstat节点
/* Wait queue head: a spinlock protecting a linked list of waiting entries.
 * Tasks that poll() /proc/mdstat sleep on such a queue (md_event_waiters). */
struct wait_queue_head {
	spinlock_t lock;	/* guards 'head' */
	struct list_head head;	/* list of wait_queue_entry waiters */
};
/* Static initializer: unlocked spinlock plus an empty (self-pointing)
 * list head, so the queue is usable without a runtime init call. */
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \
	.lock = __SPIN_LOCK_UNLOCKED(name.lock), \
	.head = { &(name).head, &(name).head } }

/* Define and statically initialize a wait queue head in one statement. */
#define DECLARE_WAIT_QUEUE_HEAD(name) \
	struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
\linux-4.18-rc1\drivers\md\md.c
/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 * start array, stop array, error, add device, remove device,
 * start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;

/* Record a new md event: bump the global event count and wake every
 * task sleeping on the /proc/mdstat poll queue. @mddev is unused here. */
void md_new_event(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters); /* wake all tasks on the wait queue */
}
EXPORT_SYMBOL_GPL(md_new_event);
// wake_up() ultimately expands into a call to __wake_up().
/**
 * __wake_up - wake up threads blocked on a waitqueue.
 * @wq_head: the waitqueue
 * @mode: which threads
 * @nr_exclusive: how many wake-one or wake-many threads to wake up
 * @key: is directly passed to the wakeup function
 *
 * It may be assumed that this function implies a write memory barrier before
 * changing the task state if and only if any tasks are woken up.
 */
void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
	       int nr_exclusive, void *key)
{
	/* take wq_head->lock and walk the waiter list, waking entries */
	__wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
}
EXPORT_SYMBOL(__wake_up);
md_new_event函数其实就是给全局变量md_event_count加了1,并唤醒在md_event_waiters等待队列上睡眠的任务(即正在poll /proc/mdstat的进程)。
proc节点创建过程
/* Called once at md init time: registers the /proc/mdstat entry,
 * world-readable (S_IRUGO), backed by md_seq_fops. */
static void md_geninit(void)
{
	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
	proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
}
注册的md_seq_fops操作方法,涉及seq_file相关操作。
/* Per-open iteration state for a sequential (/proc-style) file. */
struct seq_file {
	char *buf;		/* output buffer (seq_read allocates on demand) */
	size_t size;		/* size of buf */
	size_t from;		/* offset of first unread byte in buf */
	size_t count;		/* bytes of valid data in buf */
	size_t pad_until;
	loff_t index;		/* iterator position fed to ->start/->next */
	loff_t read_pos;	/* file position of the next read */
	u64 version;
	struct mutex lock;	/* serializes concurrent readers */
	const struct seq_operations *op;	/* start/next/stop/show table */
	int poll_event;		/* event snapshot for poll(); md stores
				 * md_event_count here (see md_seq_open) */
	const struct file *file;
	void *private;
};
/* /proc/mdstat file operations: stock seq_file read/llseek/release, a
 * custom open that snapshots the event count, and a poll hook so readers
 * can wait for the next md event. */
static const struct file_operations md_seq_fops = {
	.owner = THIS_MODULE,
	.open = md_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
	.poll = mdstat_poll,
};
// @inode: 打开文件inode信息
// @file: 打开文件file信息
static int md_seq_open(struct inode *inode, struct file *file)
{
struct seq_file *seq;
int error;
error = seq_open(file, &md_seq_ops);
if (error)
return error;
seq = file->private_data; // 获取seq_file指针
seq->poll_event = atomic_read(&md_event_count); // 将md_event_count计数赋值给poll_event
return error;
}
看看seq_open干了啥?
/**
 * seq_open - initialize sequential file (builds a seq_file from @file and @op)
 * @file: file we initialize
 * @op: method table describing the sequence
 *
 * seq_open() sets @file, associating it with a sequence described
 * by @op. @op->start() sets the iterator up and returns the first
 * element of sequence. @op->stop() shuts it down. @op->next()
 * returns the next element of sequence. @op->show() prints element
 * into the buffer. In case of error ->start() and ->next() return
 * ERR_PTR(error). In the end of sequence they return %NULL. ->show()
 * returns 0 in case of success and negative number in case of error.
 * Returning SEQ_SKIP means "discard this element and move on".
 * Note: seq_open() will allocate a struct seq_file and store its
 * pointer in @file->private_data. This pointer should not be modified.
 */
// In short: allocate and initialize a struct seq_file for this open file.
int seq_open(struct file *file, const struct seq_operations *op)
{
	struct seq_file *p;
	WARN_ON(file->private_data);
	// allocate zeroed memory for the seq_file
	p = kmem_cache_zalloc(seq_file_cache, GFP_KERNEL);
	if (!p)
		return -ENOMEM;
	// stash the seq_file pointer in file->private_data
	file->private_data = p;
	mutex_init(&p->lock); // initialize the seq_file's mutex
	p->op = op; // record the caller's start/next/stop/show table
	// No refcounting: the lifetime of 'p' is constrained
	// to the lifetime of the file.
	p->file = file; // back-pointer to the owning file
	/*
	 * Wrappers around seq_open(e.g. swaps_open) need to be
	 * aware of this. If they set f_version themselves, they
	 * should call seq_open first and then set f_version.
	 */
	file->f_version = 0;
	/*
	 * seq_files support lseek() and pread(). They do not implement
	 * write() at all, but we clear FMODE_PWRITE here for historical
	 * reasons.
	 *
	 * If a client of seq_files a) implements file.write() and b) wishes to
	 * support pwrite() then that client will need to implement its own
	 * file.open() which calls seq_open() and then sets FMODE_PWRITE.
	 */
	file->f_mode &= ~FMODE_PWRITE;
	return 0;
}
EXPORT_SYMBOL(seq_open);
实际显示是在md_seq_open初始化的seq_operations,也就是seq file的操作方法。
/* Iterator callbacks driven by seq_read(): start positions the iterator,
 * next advances it, show formats one element, stop tears it down. */
struct seq_operations {
	void * (*start) (struct seq_file *m, loff_t *pos);
	void (*stop) (struct seq_file *m, void *v);
	void * (*next) (struct seq_file *m, void *v, loff_t *pos);
	int (*show) (struct seq_file *m, void *v);
};
/* md's seq_file iterator. seq_read() allocates a PAGE_SIZE buffer when
 * buf is NULL and then drives start/next/stop/show. */
static const struct seq_operations md_seq_ops = {
	.start = md_seq_start, // returns a header/tail marker or an mddev
	.next = md_seq_next, // advance to the next mddev
	.stop = md_seq_stop,
	.show = md_seq_show, // formats one iteration element into the buffer
};
从用户态看到的是show方法打印的raid信息,重点看md_seq_show,哪里会调用show方法?
/* pers_list is a list of registered personalities protected
 * by pers_lock.
 * pers_lock does extra service to protect accesses to
 * mddev->thread when the mutex cannot be held.
 */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);
// @v: iterator token from md_seq_start(): (void*)1 = header line,
//     (void*)2 = tail, otherwise a struct mddev pointer.
static int md_seq_show(struct seq_file *seq, void *v)
{
	struct mddev *mddev = v;
	sector_t sectors;
	struct md_rdev *rdev;
	// v == 1: no mddev yet, print the header. pers_list records which
	// raid levels (personalities) are registered with the kernel.
	if (v == (void*)1) {
		struct md_personality *pers;
		seq_printf(seq, "Personalities : ");
		spin_lock(&pers_lock);
		list_for_each_entry(pers, &pers_list, list)
			seq_printf(seq, "[%s] ", pers->name);
		spin_unlock(&pers_lock);
		seq_printf(seq, "\n");
		seq->poll_event = atomic_read(&md_event_count);
		return 0;
	}
	if (v == (void*)2) {
		status_unused(seq);
		return 0;
	}
	// A real mddev was passed: take its spinlock before reading state.
	spin_lock(&mddev->lock);
	// pers != NULL means a personality is attached (array is running);
	// raid_disks is the array width, disks links all member devices.
	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
		seq_printf(seq, "%s : %sactive", mdname(mddev), // device name, e.g. md0
			mddev->pers ? "" : "in"); // "active" if pers set, else "inactive"
		if (mddev->pers) {
			if (mddev->ro==1)
				seq_printf(seq, " (read-only)");
			if (mddev->ro==2)
				seq_printf(seq, " (auto-read-only)");
			seq_printf(seq, " %s", mddev->pers->name); // personality name (raid1, ...)
		}
		// Now print per-member-disk information.
		sectors = 0;
		rcu_read_lock();
		rdev_for_each_rcu(rdev, mddev) {
			char b[BDEVNAME_SIZE];
			seq_printf(seq, " %s[%d]",
				bdevname(rdev->bdev,b), rdev->desc_nr); // member name and descriptor index
			if (test_bit(WriteMostly, &rdev->flags)) // write-mostly member
				seq_printf(seq, "(W)");
			if (test_bit(Journal, &rdev->flags)) // used as the array's journal device
				seq_printf(seq, "(J)");
			if (test_bit(Faulty, &rdev->flags)) { // member has failed
				seq_printf(seq, "(F)");
				continue;
			}
			// raid_disk is the member's slot number in the array.
			// md_rdev_init() sets it to -1 when a device is added, so a
			// disk without an assigned slot shows as a spare (S). When a
			// member goes Faulty, a spare is resynced and flipped to
			// In_sync automatically, i.e. brought online.
			if (rdev->raid_disk < 0)
				seq_printf(seq, "(S)"); /* spare */
			if (test_bit(Replacement, &rdev->flags))
				// member is marked as a Replacement device
				seq_printf(seq, "(R)");
			sectors += rdev->sectors;
		}
		rcu_read_unlock();
		if (!list_empty(&mddev->disks)) { // capacity in 1K blocks (sectors/2)
			if (mddev->pers)
				seq_printf(seq, "\n %llu blocks",
					(unsigned long long)
					mddev->array_sectors / 2);
			else
				seq_printf(seq, "\n %llu blocks",
					(unsigned long long)sectors / 2);
		}
		if (mddev->persistent) {
			if (mddev->major_version != 0 ||
				mddev->minor_version != 90) {
				seq_printf(seq," super %d.%d",
					mddev->major_version,
					mddev->minor_version);
			}
		} else if (mddev->external)
			seq_printf(seq, " super external:%s",
				mddev->metadata_type);
		else
			seq_printf(seq, " super non-persistent");
		if (mddev->pers) {
			mddev->pers->status(seq, mddev); // personality hook, e.g. raid1_status
			seq_printf(seq, "\n ");
			if (mddev->pers->sync_request) {
				if (status_resync(seq, mddev)) // print progress if syncing
					seq_printf(seq, "\n ");
			}
		} else
			seq_printf(seq, "\n ");
		bitmap_status(seq, mddev->bitmap); // print write-intent bitmap info
		seq_printf(seq, "\n");
	}
	spin_unlock(&mddev->lock);
	return 0;
}
// raid1 personality ->status hook: prints "[total/active] [UU_...]".
static void raid1_status(struct seq_file *seq, struct mddev *mddev)
{
	struct r1conf *conf = mddev->private;
	int i;
	seq_printf(seq, " [%d/%d] [", conf->raid_disks, // disks in the array
		conf->raid_disks - mddev->degraded); // healthy disks; a freshly added disk counts as degraded
	rcu_read_lock();
	for (i = 0; i < conf->raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
		/* In_sync means this disk's data is consistent with the rest
		 * of the array -> printed as "U". Clear (or a missing rdev)
		 * means failed or still recovering -> printed as "_". */
		seq_printf(seq, "%s",
			rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
	}
	rcu_read_unlock();
	seq_printf(seq, "]");
}
// Key structure: in-memory bookkeeping for the md write-intent bitmap.
// (Excerpt — remaining fields elided by the author.)
struct bitmap {
	struct bitmap_counts {
		spinlock_t lock;
		struct bitmap_page *bp;
		unsigned long pages; /* total number of pages
					* in the bitmap */
		unsigned long missing_pages; /* number of pages
					* not yet allocated */
		unsigned long chunkshift; /* chunksize = 2^chunkshift
					* (for bitops) */
		unsigned long chunks; /* Total number of data
					* chunks for the array */
	} counts;
	......
}
// Print the write-intent bitmap summary line of /proc/mdstat.
void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
{
	unsigned long chunk_kb;
	struct bitmap_counts *counts; // in-kernel bitmap accounting info
	if (!bitmap)
		return;
	counts = &bitmap->counts;
	// chunk size: how much disk data one bitmap bit covers (configurable).
	chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10;
	seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
		"%lu%s chunk",
		counts->pages - counts->missing_pages, // pages currently allocated in memory
		counts->pages, // total pages needed to hold the bitmap
		(counts->pages - counts->missing_pages)
		<< (PAGE_SHIFT - 10), // allocated size in KB
		chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize,
		chunk_kb ? "KB" : "B");
	// If the bitmap lives in an external file, print its path.
	if (bitmap->storage.file) {
		seq_printf(seq, ", file: ");
		seq_file_path(seq, bitmap->storage.file, " \t\n");
	}
	seq_printf(seq, "\n");
}
mdstat只是打印内核bitmap计数相关数据结构在内核中占用的内存大小,并没有打印磁盘实际需要多少个bit来跟踪。
可以用mdadm --examine-bitmap来查看实际使用bitmap占用的存储空间。
# mdadm --examine-bitmap /dev/sda
Filename : /dev/sda
Magic : 6d746962
Version : 4
UUID : 6d73b57d:3a759a43:e23bd973:c3134484
Events : 2258
Events Cleared : 1
State : OK
Chunksize : 64 MB
Daemon : 5s flush period
Write Mode : Normal
Sync Size : 468851456 (447.13 GiB 480.10 GB)
Bitmap : 7155 bits (chunks), 0 dirty (0.0%)
同步状态信息查询的函数还挺大。
static int status_resync(struct seq_file *seq, struct mddev *mddev)
{
sector_t max_sectors, resync, res;
unsigned long dt, db;
sector_t rt;
int scale;
unsigned int per_milli;
// raid设备处于同步恢复或重塑(reshape)过程中,设定同步扇区数量
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sectors = mddev->resync_max_sectors;
else
max_sectors = mddev->dev_sectors;
resync = mddev->curr_resync; // 当前正在同步的扇区号
// 已经同步完成,将resync赋值为最大扇区号
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
/* Still cleaning up */
resync = max_sectors;
} else if (resync > max_sectors)
resync = max_sectors;
else
// 自resync到resync - 当前正在同步的扇区数得到当前同步的扇区号
resync -= atomic_read(&mddev->recovery_active);
if (resync == 0) {
// 理解为需要做recovery但是还没有开始,设定为PENDING。
if (mddev->recovery_cp < MaxSector) { // 恢复检查点扇区号小于MaxSector
seq_printf(seq, "\tresync=PENDING");
return 1;
}
return 0;
}
if (resync < 3) { // ?
seq_printf(seq, "\tresync=DELAYED");
return 1;
}
WARN_ON(max_sectors == 0);
/* Pick 'scale' such that (resync>>scale)*1000 will fit
* in a sector_t, and (max_sectors>>scale) will fit in a
* u32, as those are the requirements for sector_div.
* Thus 'scale' must be at least 10
*/
scale = 10;
if (sizeof(sector_t) > sizeof(unsigned long)) {
while ( max_sectors/2 > (1ULL<<(scale+32)))
scale++;
}
res = (resync>>scale)*1000;
sector_div(res, (u32)((max_sectors>>scale)+1));
per_milli = res;
{
int i, x = per_milli/50, y = 20-x;
seq_printf(seq, "[");
for (i = 0; i < x; i++)
seq_printf(seq, "=");
seq_printf(seq, ">");
for (i = 0; i < y; i++)
seq_printf(seq, ".");
seq_printf(seq, "] ");
}
seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
(test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? // 操作类型为reshape。
"reshape" :
(test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? // check 操作
"check" :
(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? // resync或recovery操作
"resync" : "recovery"))),
per_milli/10, per_milli % 10,
(unsigned long long) resync/2,
(unsigned long long) max_sectors/2);
/*
* dt: time from mark until now
* db: blocks written from mark until now
* rt: remaining time
*
* rt is a sector_t, so could be 32bit or 64bit.
* So we divide before multiply in case it is 32bit and close
* to the limit.
* We scale the divisor (db) by 32 to avoid losing precision
* near the end of resync when the number of remaining sectors
* is close to 'db'.
* We then divide rt by 32 after multiplying by db to compensate.
* The '+1' avoids division by zero if db is very small.
*/
dt = ((jiffies - mddev->resync_mark) / HZ);
if (!dt) dt++;
db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
- mddev->resync_mark_cnt;
rt = max_sectors - resync; /* number of remaining sectors */
sector_div(rt, db/32+1);
rt *= dt;
rt >>= 5;
seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
((unsigned long)rt % 60)/6); // 预计多长时间同步完成
seq_printf(seq, " speed=%ldK/sec", db/2/dt); // 当前的同步速率
return 1;
}
md_seq_show中传入的v等于(void*)1和(void*)2分别表示什么意思呢?这需要看start方法的返回值。
/* seq_file ->start: map position *pos onto the iteration sequence:
 * pos 0 -> (void*)1 (header marker), positions 1..N -> one mddev each
 * from all_mddevs, position N+1 -> (void*)2 (tail marker), then NULL. */
static void *md_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct list_head *tmp;
	loff_t l = *pos;
	struct mddev *mddev;
	if (l >= 0x10000) // sanity cap: stop iterating at position 0x10000
		return NULL;
	if (!l--) // position 0: print the Personalities header first
		/* header */
		return (void*)1;
	spin_lock(&all_mddevs_lock);
	list_for_each(tmp,&all_mddevs)
		if (!l--) {
			mddev = list_entry(tmp, struct mddev, all_mddevs);
			mddev_get(mddev); // hold a reference while showing it
			spin_unlock(&all_mddevs_lock);
			return mddev;
		}
	spin_unlock(&all_mddevs_lock);
	if (!l--)
		return (void*)2;/* tail */
	return NULL;
}