内核在哪里注册了/proc/mdstat节点
/* Wait queue head: a spinlock protecting a linked list of waiting entries.
 * Tasks that poll() /proc/mdstat sleep on such a queue (md_event_waiters). */
struct wait_queue_head {
	spinlock_t lock;	/* guards 'head' */
	struct list_head head;	/* list of wait_queue_entry waiters */
};
/* Static initializer: unlocked spinlock plus an empty (self-pointing)
 * list head, so the queue is usable without a runtime init call. */
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) { \
	.lock = __SPIN_LOCK_UNLOCKED(name.lock), \
	.head = { &(name).head, &(name).head } }

/* Define and statically initialize a wait queue head in one statement. */
#define DECLARE_WAIT_QUEUE_HEAD(name) \
	struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
\linux-4.18-rc1\drivers\md\md.c
/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 * start array, stop array, error, add device, remove device,
 * start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;

/* Record a new md event: bump the global event count and wake every
 * task sleeping on the /proc/mdstat poll queue. @mddev is unused here. */
void md_new_event(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters); /* wake all tasks on the wait queue */
}
EXPORT_SYMBOL_GPL(md_new_event);
// wake_up() ultimately expands into a call to __wake_up().
/**
 * __wake_up - wake up threads blocked on a waitqueue.
 * @wq_head: the waitqueue
 * @mode: which threads
 * @nr_exclusive: how many wake-one or wake-many threads to wake up
 * @key: is directly passed to the wakeup function
 *
 * It may be assumed that this function implies a write memory barrier before
 * changing the task state if and only if any tasks are woken up.
 */
void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
	       int nr_exclusive, void *key)
{
	/* take wq_head->lock and walk the waiter list, waking entries */
	__wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
}
EXPORT_SYMBOL(__wake_up);
md_new_event函数其实就是给全局变量md_event_count加了1,并唤醒在md_event_waiters等待队列上睡眠的任务(即正在poll /proc/mdstat的进程)。
proc节点创建过程
/* Called once at md init time: registers the /proc/mdstat entry,
 * world-readable (S_IRUGO), backed by md_seq_fops. */
static void md_geninit(void)
{
	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
	proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
}
注册的md_seq_fops操作方法,涉及seq_file相关操作。
/* Per-open iteration state for a sequential (/proc-style) file. */
struct seq_file {
	char *buf;		/* output buffer (seq_read allocates on demand) */
	size_t size;		/* size of buf */
	size_t from;		/* offset of first unread byte in buf */
	size_t count;		/* bytes of valid data in buf */
	size_t pad_until;
	loff_t index;		/* iterator position fed to ->start/->next */
	loff_t read_pos;	/* file position of the next read */
	u64 version;
	struct mutex lock;	/* serializes concurrent readers */
	const struct seq_operations *op;	/* start/next/stop/show table */
	int poll_event;		/* event snapshot for poll(); md stores
				 * md_event_count here (see md_seq_open) */
	const struct file *file;
	void *private;
};
/* /proc/mdstat file operations: stock seq_file read/llseek/release, a
 * custom open that snapshots the event count, and a poll hook so readers
 * can wait for the next md event. */
static const struct file_operations md_seq_fops = {
	.owner = THIS_MODULE,
	.open = md_seq_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = seq_release,
	.poll = mdstat_poll,
};
// @inode: 打开文件inode信息
// @file: 打开文件file信息
static int md_seq_open(struct inode *inode, struct file *file)
{
struct seq_file *seq;
int error;
error = seq_open(file, &md_seq_ops);
if (error)
return error;
seq = file->private_data; // 获取seq_file指针
seq->poll_event = atomic_read(&md_event_count); // 将md_event_count计数赋值给poll_event
return error;
}
看看seq_open干了啥?
/**
 * seq_open - initialize sequential file (builds a seq_file from @file and @op)
 * @file: file we initialize
 * @op: method table describing the sequence
 *
 * seq_open() sets @file, associating it with a sequence described
 * by @op. @op->start() sets the iterator up and returns the first
 * element of sequence. @op->stop() shuts it down. @op->next()
 * returns the next element of sequence. @op->show() prints element
 * into the buffer. In case of error ->start() and ->next() return
 * ERR_PTR(error). In the end of sequence they return %NULL. ->show()
 * returns 0 in case of success and negative number in case of error.
 * Returning SEQ_SKIP means "discard this element and move on".
 * Note: seq_open() will allocate a struct seq_file and store its
 * pointer in @file->private_data. This pointer should not be modified.
 */
// In short: allocate and initialize a struct seq_file for this open file.
int seq_open(struct file *file, const struct seq_operations *op)
{
	struct seq_file *p;
	WARN_ON(file->private_data);
	// allocate zeroed memory for the seq_file
	p = kmem_cache_zalloc(seq_file_cache, GFP_KERNEL);
	if (!p)
		return -ENOMEM;
	// stash the seq_file pointer in file->private_data
	file->private_data = p;
	mutex_init(&p->lock); // initialize the seq_file's mutex
	p->op = op; // record the caller's start/next/stop/show table
	// No refcounting: the lifetime of 'p' is constrained
	// to the lifetime of the file.
	p->file = file; // back-pointer to the owning file
	/*
	 * Wrappers around seq_open(e.g. swaps_open) need to be
	 * aware of this. If they set f_version themselves, they
	 * should call seq_open first and then set f_version.
	 */
	file->f_version = 0;
	/*
	 * seq_files support lseek() and pread(). They do not implement
	 * write() at all, but we clear FMODE_PWRITE here for historical
	 * reasons.
	 *
	 * If a client of seq_files a) implements file.write() and b) wishes to
	 * support pwrite() then that client will need to implement its own
	 * file.open() which calls seq_open() and then sets FMODE_PWRITE.
	 */
	file->f_mode &= ~FMODE_PWRITE;
	return 0;
}
EXPORT_SYMBOL(seq_open);
实际显示是在md_seq_open初始化的seq_operations,也就是seq file的操作方法。
/* Iterator callbacks driven by seq_read(): start positions the iterator,
 * next advances it, show formats one element, stop tears it down. */
struct seq_operations {
	void * (*start) (struct seq_file *m, loff_t *pos);
	void (*stop) (struct seq_file *m, void *v);
	void * (*next) (struct seq_file *m, void *v, loff_t *pos);
	int (*show) (struct seq_file *m, void *v);
};
/* md's seq_file iterator. seq_read() allocates a PAGE_SIZE buffer when
 * buf is NULL and then drives start/next/stop/show. */
static const struct seq_operations md_seq_ops = {
	.start = md_seq_start, // returns a header/tail marker or an mddev
	.next = md_seq_next, // advance to the next mddev
	.stop = md_seq_stop,
	.show = md_seq_show, // formats one iteration element into the buffer
};
从用户态看到的是show方法打印的raid信息,重点看md_seq_show,哪里会调用show方法?
/* pers_list is a list of registered personalities protected
 * by pers_lock.
 * pers_lock does extra service to protect accesses to
 * mddev->thread when the mutex cannot be held.
 */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);
// @v: iterator token from md_seq_start(): (void*)1 = header line,
//     (void*)2 = tail, otherwise a struct mddev pointer.
static int md_seq_show(struct seq_file *seq, void *v)
{
	struct mddev *mddev = v;
	sector_t sectors;
	struct md_rdev *rdev;
	// v == 1: no mddev yet, print the header. pers_list records which
	// raid levels (personalities) are registered with the kernel.
	if (v == (void*)1) {
		struct md_personality *pers;
		seq_printf(seq, "Personalities : ");
		spin_lock(&pers_lock);
		list_for_each_entry(pers, &pers_list, list)
			seq_printf(seq, "[%s] ", pers->name);
		spin_unlock(&pers_lock);
		seq_printf(seq, "\n");
		seq->poll_event = atomic_read(&md_event_count);
		return 0;
	}
	if (v == (void*)2) {
		status_unused(seq);
		return 0;
	}
	// A real mddev was passed: take its spinlock before reading state.
	spin_lock(&mddev->lock);
	// pers != NULL means a personality is attached (array is running);
	// raid_disks is the array width, disks links all member devices.
	if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
		seq_printf(seq, "%s : %sactive", mdname(mddev), // device name, e.g. md0
			mddev->pers ? "" : "in"); // "active" if pers set, else "inactive"
		if (mddev->pers) {
			if (mddev->ro==1)
				seq_printf(seq, " (read-only)");
			if (mddev->ro==2)
				seq_printf(seq, " (auto-read-only)");
			seq_printf(seq, " %s", mddev->pers->name); // personality name (raid1, ...)
		}
		// Now print per-member-disk information.
		sectors = 0;
		rcu_read_lock();
		rdev_for_each_rcu(rdev, mddev) {
			char b[BDEVNAME_SIZE];
			seq_printf(seq, " %s[%d]",
				bdevname(rdev->bdev,b), rdev->desc_nr); // member name and descriptor index
			if (test_bit(WriteMostly, &rdev->flags)) // write-mostly member
				seq_printf(seq, "(W)");
			if (test_bit(Journal, &rdev->flags)) // used as the array's journal device
				seq_printf(seq, "(J)");
			if (test_bit(Faulty, &rdev->flags)) { // member has failed
				seq_printf(seq, "(F)");
				continue;
			}
			// raid_disk is the member's slot number in the array.
			// md_rdev_init() sets it to -1 when a device is added, so a
			// disk without an assigned slot shows as a spare (S). When a
			// member goes Faulty, a spare is resynced and flipped to
			// In_sync automatically, i.e. brought online.
			if (rdev->raid_disk < 0)
				seq_printf(seq, "(S)"); /* spare */
			if (test_bit(Replacement, &rdev->flags))
				// member is marked as a Replacement device
				seq_printf(seq, "(R)");
			sectors += rdev->sectors;
		}
		rcu_read_unlock();
		if (!list_empty(&mddev->disks)) { // capacity in 1K blocks (sectors/2)
			if (mddev->pers)
				seq_printf(seq, "\n %llu blocks",
					(unsigned long long)
					mddev->array_sectors / 2);
			else
				seq_printf(seq, "\n %llu blocks",
					(unsigned long long)sectors / 2);
		}
		if (mddev->persistent) {
			if (mddev->major_version != 0 ||
				mddev->minor_version != 90) {
				seq_printf(seq," super %d.%d",
					mddev->major_version,
					mddev->minor_version);
			}
		} else if (mddev->external)
			seq_printf(seq, " super external:%s",
				mddev->metadata_type);
		else
			seq_printf(seq, " super non-persistent");
		if (mddev->pers) {
			mddev->pers->status(seq, mddev); // personality hook, e.g. raid1_status
			seq_printf(seq, "\n ");
			if (mddev->pers->sync_request) {
				if (status_resync(seq, mddev)) // print progress if syncing
					seq_printf(seq, "\n ");
			}
		} else
			seq_printf(seq, "\n ");
		bitmap_status(seq, mddev->bitmap); // print write-intent bitmap info
		seq_printf(seq, "\n");
	}
	spin_unlock(&mddev->lock);
	return 0;
}
// raid1 personality ->status hook: prints "[total/active] [UU_...]".
static void raid1_status(struct seq_file *seq, struct mddev *mddev)
{
	struct r1conf *conf = mddev->private;
	int i;
	seq_printf(seq, " [%d/%d] [", conf->raid_disks, // disks in the array
		conf->raid_disks - mddev->degraded); // healthy disks; a freshly added disk counts as degraded
	rcu_read_lock();
	for (i = 0; i < conf->raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
		/* In_sync means this disk's data is consistent with the rest
		 * of the array -> printed as "U". Clear (or a missing rdev)
		 * means failed or still recovering -> printed as "_". */
		seq_printf(seq, "%s",
			rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
	}
	rcu_read_unlock();
	seq_printf(seq, "]");
}
// Key structure: in-memory bookkeeping for the md write-intent bitmap.
// (Excerpt — remaining fields elided by the author.)
struct bitmap {
	struct bitmap_counts {
		spinlock_t lock;
		struct bitmap_page *bp;
		unsigned long pages; /* total number of pages
					* in the bitmap */
		unsigned long missing_pages; /* number of pages
					* not yet allocated */
		unsigned long chunkshift; /* chunksize = 2^chunkshift
					* (for bitops) */
		unsigned long chunks; /* Total number of data
					* chunks for the array */
	} counts;
	......
}
// Print the write-intent bitmap summary line of /proc/mdstat.
void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
{
	unsigned long chunk_kb;
	struct bitmap_counts *counts; // in-kernel bitmap accounting info
	if (!bitmap)
		return;
	counts = &bitmap->counts;
	// chunk size: how much disk data one bitmap bit covers (configurable).
	chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10;
	seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
		"%lu%s chunk",
		counts->pages - counts->missing_pages, // pages currently allocated in memory
		counts->pages, // total pages needed to hold the bitmap
		(counts->pages - counts->missing_pages)
		<< (PAGE_SHIFT - 10), // allocated size in KB
		chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize,
		chunk_kb ? "KB" : "B");
	// If the bitmap lives in an external file, print its path.
	if (bitmap->storage.file) {
		seq_printf(seq, ", file: ");
		seq_file_path(seq, bitmap->storage.file, " \t\n");
	}
	seq_printf(seq, "\n");
}
mdstat只是打印内核bitmap计数相关数据结构在内核中占用的内存大小,并没有打印磁盘实际需要多少个bit来跟踪。
可以用mdadm --examine-bitmap来查看实际使用bitmap占用的存储空间。
# mdadm --examine-bitmap /dev/sda
Filename : /dev/sda
Magic : 6d746962
Version : 4
UUID : 6d73b57d:3a759a43:e23bd973:c3134484
Events : 2258
Events Cleared : 1
State : OK
Chunksize : 64 MB
Daemon : 5s flush period
Write Mode : Normal
Sync Size : 468851456 (447.13 GiB 480.10 GB)
Bitmap : 7155 bits (chunks), 0 dirty (0.0%)
同步状态信息查询的函数还挺大。
static int status_resync(struct seq_file *seq, struct mddev *mddev)
{
sector_t max_sectors, resync, res;
unsigned long dt, db;
sector_t rt;
int scale;
unsigned int per_milli;
// raid设备处于同步恢复或重塑(reshape)过程中,设定同步扇区数量
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sectors = mddev->resync_max_sectors;
else
max_sectors = mddev->dev_sectors;
resync = mddev->curr_resync; // 当前正在同步的扇区号
// 已经同步完成,将resync赋值为最大扇区号
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
/* Still cleaning up */
resync = max_sectors;
} else if (resync > max_sectors)
resync = max_sectors;
else
// 自resync到resync - 当前正在同步的扇区数得到当前同步的扇区号
resync -= atomic_read(&mddev->recovery_active);
if (resync == 0) {
// 理解为需要做recovery但是还没有开始,设定为PENDING。
if (mddev->recovery_cp < MaxSector) { // 恢复检查点扇区号小于MaxSector
seq_printf(seq, "\tresync=PENDING");
return 1;
}
return 0;
}
if (resync < 3) { // ?
seq_printf(seq, "\tresync=DELAYED");
return 1;
}
WARN_ON(max_sectors == 0);
/* Pick 'scale' such that (resync>>scale)*1000 will fit
* in a sector_t, and (max_sectors>>scale) will fit in a
* u32, as those are the requirements for sector_div.
* Thus 'scale' must be at least 10
*/
scale = 10;
if (sizeof(sector_t) > sizeof(unsigned long)) {
while ( max_sectors/2 > (1ULL<<(scale+32)))
scale++;
}
res = (resync>>scale)*1000;
sector_div(res, (u32)((max_sectors>>scale)+1));
per_milli = res;
{
int i, x = per_milli/50, y = 20-x;
seq_printf(seq, "[");
for (i = 0; i < x; i++)
seq_printf(seq, "=");
seq_printf(seq, ">");
for (i = 0; i < y; i++)
seq_printf(seq, ".");
seq_printf(seq, "] ");
}
seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
(test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? // 操作类型为reshape。
"reshape" :
(test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? // check 操作
"check" :
(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? // resync或recovery操作
"resync" : "recovery"))),
per_milli/10, per_milli % 10,
(unsigned long long) resync/2,
(unsigned long long) max_sectors/2);
/*
* dt: time from mark until now
* db: blocks written from mark until now
* rt: remaining time
*
* rt is a sector_t, so could be 32bit or 64bit.
* So we divide before multiply in case it is 32bit and close
* to the limit.
* We scale the divisor (db) by 32 to avoid losing precision
* near the end of resync when the number of remaining sectors
* is close to 'db'.
* We then divide rt by 32 after multiplying by db to compensate.
* The '+1' avoids division by zero if db is very small.
*/
dt = ((jiffies - mddev->resync_mark) / HZ);
if (!dt) dt++;
db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
- mddev->resync_mark_cnt;
rt = max_sectors - resync; /* number of remaining sectors */
sector_div(rt, db/32+1);
rt *= dt;
rt >>= 5;
seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
((unsigned long)rt % 60)/6); // 预计多长时间同步完成
seq_printf(seq, " speed=%ldK/sec", db/2/dt); // 当前的同步速率
return 1;
}
md_seq_show中传入的v等于(void*)1和(void*)2分别表示什么意思呢?这需要看start方法的返回值。
/* seq_file ->start: map position *pos onto the iteration sequence:
 * pos 0 -> (void*)1 (header marker), positions 1..N -> one mddev each
 * from all_mddevs, position N+1 -> (void*)2 (tail marker), then NULL. */
static void *md_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct list_head *tmp;
	loff_t l = *pos;
	struct mddev *mddev;
	if (l >= 0x10000) // sanity cap: stop iterating at position 0x10000
		return NULL;
	if (!l--) // position 0: print the Personalities header first
		/* header */
		return (void*)1;
	spin_lock(&all_mddevs_lock);
	list_for_each(tmp,&all_mddevs)
		if (!l--) {
			mddev = list_entry(tmp, struct mddev, all_mddevs);
			mddev_get(mddev); // hold a reference while showing it
			spin_unlock(&all_mddevs_lock);
			return mddev;
		}
	spin_unlock(&all_mddevs_lock);
	if (!l--)
		return (void*)2;/* tail */
	return NULL;
}