Linux Kernel Adventures: Reading the md Source Code, Part 5
Please credit the source when reposting: http://blog.csdn.net/liumangxiong
If you have understood the raid1 array's run function, then raid5's run will be easy reading, because the two do largely the same things.
raid5's run function is long, but a large part of it has nothing to do with creating and running the array. In particular there is a stretch of reshape-related code, a feature most systems never use, so we can skip straight past it. The trimmed run function looks like this:
- 5307 static int run(struct mddev *mddev)
- 5308 {
- 5309 struct r5conf *conf;
- 5310 int working_disks = 0;
- 5311 int dirty_parity_disks = 0;
- 5312 struct md_rdev *rdev;
- 5313 sector_t reshape_offset = 0;
- 5314 int i;
- 5315 long long min_offset_diff = 0;
- 5316 int first = 1;
- ...
- 5426 if (mddev->private == NULL)
- 5427 conf = setup_conf(mddev);
- 5428 else
- 5429 conf = mddev->private;
- 5430
- 5431 if (IS_ERR(conf))
- 5432 return PTR_ERR(conf);
- 5433
- 5434 conf->min_offset_diff = min_offset_diff;
- 5435 mddev->thread = conf->thread;
- 5436 conf->thread = NULL;
- 5437 mddev->private = conf;
- ...
- 5491
- ...
- 5494 mddev->degraded = calc_degraded(conf);
- ...
- 5503
- 5504 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
- 5505 mddev->resync_max_sectors = mddev->dev_sectors;
- ...
- 5556 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
- 5557
- 5558 if (mddev->queue) {
- ...
- 5628 }
- 5629
- 5630 return 0;
Doesn't that look surprisingly simple? Some things seem complicated on the surface, but analyze them carefully and a pattern emerges. This run function is one of them: it does the same job as raid1's run, namely building the context in which reads and writes will flow.
Line 5427: create the struct r5conf. Stepping into setup_conf:
- 5131 static struct r5conf *setup_conf(struct mddev *mddev)
- 5132 {
- 5133 struct r5conf *conf;
- 5134 int raid_disk, memory, max_disks;
- 5135 struct md_rdev *rdev;
- 5136 struct disk_info *disk;
- 5137 char pers_name[6];
- 5138
- 5139 if (mddev->new_level != 5
- 5140 && mddev->new_level != 4
- 5141 && mddev->new_level != 6) {
- 5142 printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n",
- 5143 mdname(mddev), mddev->new_level);
- 5144 return ERR_PTR(-EIO);
- 5145 }
- 5146 if ((mddev->new_level == 5
- 5147 && !algorithm_valid_raid5(mddev->new_layout)) ||
- 5148 (mddev->new_level == 6
- 5149 && !algorithm_valid_raid6(mddev->new_layout))) {
- 5150 printk(KERN_ERR "md/raid:%s: layout %d not supported\n",
- 5151 mdname(mddev), mddev->new_layout);
- 5152 return ERR_PTR(-EIO);
- 5153 }
- 5154 if (mddev->new_level == 6 && mddev->raid_disks < 4) {
- 5155 printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n",
- 5156 mdname(mddev), mddev->raid_disks);
- 5157 return ERR_PTR(-EINVAL);
- 5158 }
- 5159
- 5160 if (!mddev->new_chunk_sectors ||
- 5161 (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
- 5162 !is_power_of_2(mddev->new_chunk_sectors)) {
- 5163 printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n",
- 5164 mdname(mddev), mddev->new_chunk_sectors << 9);
- 5165 return ERR_PTR(-EINVAL);
- 5166 }
- 5167
- 5168 conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
- 5169 if (conf == NULL)
- 5170 goto abort;
- 5171 spin_lock_init(&conf->device_lock);
- 5172 init_waitqueue_head(&conf->wait_for_stripe);
- 5173 init_waitqueue_head(&conf->wait_for_overlap);
- 5174 INIT_LIST_HEAD(&conf->handle_list);
- 5175 INIT_LIST_HEAD(&conf->hold_list);
- 5176 INIT_LIST_HEAD(&conf->delayed_list);
- 5177 INIT_LIST_HEAD(&conf->bitmap_list);
- 5178 INIT_LIST_HEAD(&conf->inactive_list);
- 5179 atomic_set(&conf->active_stripes, 0);
- 5180 atomic_set(&conf->preread_active_stripes, 0);
- 5181 atomic_set(&conf->active_aligned_reads, 0);
- 5182 conf->bypass_threshold = BYPASS_THRESHOLD;
- 5183 conf->recovery_disabled = mddev->recovery_disabled - 1;
- 5184
- 5185 conf->raid_disks = mddev->raid_disks;
- 5186 if (mddev->reshape_position == MaxSector)
- 5187 conf->previous_raid_disks = mddev->raid_disks;
- 5188 else
- 5189 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
- 5190 max_disks = max(conf->raid_disks, conf->previous_raid_disks);
- 5191 conf->scribble_len = scribble_len(max_disks);
- 5192
- 5193 conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
- 5194 GFP_KERNEL);
- 5195 if (!conf->disks)
- 5196 goto abort;
- 5197
- 5198 conf->mddev = mddev;
- 5199
- 5200 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
- 5201 goto abort;
- 5202
- 5203 conf->level = mddev->new_level;
- 5204 if (raid5_alloc_percpu(conf) != 0)
- 5205 goto abort;
- 5206
- 5207 pr_debug("raid456: run(%s) called.\n", mdname(mddev));
- 5208
- 5209 rdev_for_each(rdev, mddev) {
- 5210 raid_disk = rdev->raid_disk;
- 5211 if (raid_disk >= max_disks
- 5212 || raid_disk < 0)
- 5213 continue;
- 5214 disk = conf->disks + raid_disk;
- 5215
- 5216 if (test_bit(Replacement, &rdev->flags)) {
- 5217 if (disk->replacement)
- 5218 goto abort;
- 5219 disk->replacement = rdev;
- 5220 } else {
- 5221 if (disk->rdev)
- 5222 goto abort;
- 5223 disk->rdev = rdev;
- 5224 }
- 5225
- 5226 if (test_bit(In_sync, &rdev->flags)) {
- 5227 char b[BDEVNAME_SIZE];
- 5228 printk(KERN_INFO "md/raid:%s: device %s operational as raid"
- 5229 " disk %d\n",
- 5230 mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
- 5231 } else if (rdev->saved_raid_disk != raid_disk)
- 5232 /* Cannot rely on bitmap to complete recovery */
- 5233 conf->fullsync = 1;
- 5234 }
- 5235
- 5236 conf->chunk_sectors = mddev->new_chunk_sectors;
- 5237 conf->level = mddev->new_level;
- 5238 if (conf->level == 6)
- 5239 conf->max_degraded = 2;
- 5240 else
- 5241 conf->max_degraded = 1;
- 5242 conf->algorithm = mddev->new_layout;
- 5243 conf->max_nr_stripes = NR_STRIPES;
- 5244 conf->reshape_progress = mddev->reshape_position;
- 5245 if (conf->reshape_progress != MaxSector) {
- 5246 conf->prev_chunk_sectors = mddev->chunk_sectors;
- 5247 conf->prev_algo = mddev->layout;
- 5248 }
- 5249
- 5250 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
- 5251 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
- 5252 if (grow_stripes(conf, conf->max_nr_stripes)) {
- 5253 printk(KERN_ERR
- 5254 "md/raid:%s: couldn't allocate %dkB for buffers\n",
- 5255 mdname(mddev), memory);
- 5256 goto abort;
- 5257 } else
- 5258 printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
- 5259 mdname(mddev), memory);
- 5260
- 5261 sprintf(pers_name, "raid%d", mddev->new_level);
- 5262 conf->thread = md_register_thread(raid5d, mddev, pers_name);
- 5263 if (!conf->thread) {
- 5264 printk(KERN_ERR
- 5265 "md/raid:%s: couldn't allocate thread.\n",
- 5266 mdname(mddev));
- 5267 goto abort;
- 5268 }
- 5269
- 5270 return conf;
This function, too, closely parallels raid1's setup_conf.
Line 5139: check the array level; only raid4, raid5 and raid6 are accepted.
Lines 5146-5153: check that the raid5 (or raid6) layout is valid.
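For reference, the two layout validators are simple range checks. In raid5.c of this kernel generation they read roughly as follows (paraphrased; check your own tree for the exact bounds):

        static int algorithm_valid_raid5(int layout)
        {
                return (layout >= 0) && (layout <= 5);
        }
        static int algorithm_valid_raid6(int layout)
        {
                return (layout >= 0 && layout <= 5) ||
                       (layout >= 8 && layout <= 10) ||
                       (layout >= 16 && layout <= 20);
        }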
Line 5160: check the array chunk size; its byte size must be a multiple of PAGE_SIZE and its sector count a power of two.
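A quick worked example, assuming 4KiB pages: the common 512KiB chunk means new_chunk_sectors is 1024, so 1024 << 9 = 524288 bytes, 524288 % 4096 == 0, and 1024 is a power of two, and the check passes. A hypothetical 3KiB chunk (6 sectors) would fail both the modulo test and is_power_of_2.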
Line 5168: allocate the struct r5conf and initialize its lock, wait queues, lists and counters.
Line 5185: record the number of member disks.
Line 5193: allocate the struct disk_info array, which records the association with the member disks.
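struct disk_info itself is tiny; in raid5.h of this era it holds just the rdev plus its optional replacement, which is why the loop at 5216 can hook up either pointer:

        struct disk_info {
                struct md_rdev *rdev, *replacement;
        };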
Line 5200: allocate the hash table that holds the struct stripe_head entries, used to quickly find the stripe_head covering a given sector.
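The table is a single page of hlist heads, and a stripe's starting sector selects the bucket. Roughly, from the same raid5.c (paraphrased):

        #define NR_HASH         (PAGE_SIZE / sizeof(struct hlist_head))
        #define HASH_MASK       (NR_HASH - 1)

        /* sect >> STRIPE_SHIFT drops the sector offset inside one stripe */
        static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
        {
                return &conf->stripe_hashtbl[(sect >> STRIPE_SHIFT) & HASH_MASK];
        }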
Lines 5209-5234: walk the member devices; the key line is 5223, which associates each struct disk_info with its struct md_rdev.
Line 5236: set the chunk size.
Line 5237: set the level.
Lines 5238-5241: set max_degraded, the number of failed disks the array can tolerate: two for raid6, one for raid4/5.
Line 5252: allocate the struct stripe_head slab. Stepping into grow_stripes:
- 1501 static int grow_stripes(struct r5conf *conf, int num)
- 1502 {
- 1503 struct kmem_cache *sc;
- 1504 int devs = max(conf->raid_disks, conf->previous_raid_disks);
- 1505
- 1506 if (conf->mddev->gendisk)
- 1507 sprintf(conf->cache_name[0],
- 1508 "raid%d-%s", conf->level, mdname(conf->mddev));
- 1509 else
- 1510 sprintf(conf->cache_name[0],
- 1511 "raid%d-%p", conf->level, conf->mddev);
- 1512 sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);
- 1513
- 1514 conf->active_name = 0;
- 1515 sc = kmem_cache_create(conf->cache_name[conf->active_name],
- 1516 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
- 1517 0, 0, NULL);
- 1518 if (!sc)
- 1519 return 1;
- 1520 conf->slab_cache = sc;
- 1521 conf->pool_size = devs;
- 1522 while (num--)
- 1523 if (!grow_one_stripe(conf))
- 1524 return 1;
- 1525 return 0;
- 1526 }
Line 1504: compute the number of devices per stripe (the larger of the old and new disk counts, to cover reshape).
Lines 1506-1512: build the slab cache names.
Line 1515: create the slab cache. The object size is sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev) because struct stripe_head is declared with a one-element dev[] array at its tail and is over-allocated so that devs struct r5dev entries fit behind it.
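This trailing-array trick is worth a moment. A minimal user-space sketch of the same pattern (the _demo names are made up for illustration, not kernel code):

        #include <stdio.h>
        #include <stdlib.h>

        struct r5dev_demo { int idx; };

        struct stripe_head_demo {
                int devs;
                struct r5dev_demo dev[1];       /* really 'devs' entries live here */
        };

        static struct stripe_head_demo *alloc_demo(int devs)
        {
                /* sizeof(*sh) already contains dev[0], so add devs-1 more */
                struct stripe_head_demo *sh =
                        calloc(1, sizeof(*sh) + (devs - 1) * sizeof(struct r5dev_demo));
                if (sh)
                        sh->devs = devs;
                return sh;
        }

        int main(void)
        {
                struct stripe_head_demo *sh = alloc_demo(4);
                int i;
                for (i = 0; i < sh->devs; i++)
                        sh->dev[i].idx = i;     /* indexes past the declared bound */
                printf("last dev idx = %d\n", sh->dev[sh->devs - 1].idx);
                free(sh);
                return 0;
        }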
Line 1523: preallocate the idle struct stripe_heads. If this were nothing but allocation there would be no need to look inside, but hidden in this function is release_stripe, one of the most frequently called functions in raid5, so it is worth following:
- 1477 static int grow_one_stripe(struct r5conf *conf)
- 1478 {
- 1479 struct stripe_head *sh;
- 1480 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
- 1481 if (!sh)
- 1482 return 0;
- 1483
- 1484 sh->raid_conf = conf;
- 1485
- 1486 spin_lock_init(&sh->stripe_lock);
- 1487
- 1488 if (grow_buffers(sh)) {
- 1489 shrink_buffers(sh);
- 1490 kmem_cache_free(conf->slab_cache, sh);
- 1491 return 0;
- 1492 }
- 1493
- 1494 atomic_set(&sh->count, 1);
- 1495 atomic_inc(&conf->active_stripes);
- 1496 INIT_LIST_HEAD(&sh->lru);
- 1497 release_stripe(sh);
- 1498 return 1;
- 1499 }
Line 1480: allocate a new struct stripe_head.
Line 1484: link it to the struct r5conf.
Line 1488: grow_buffers allocates one page for each struct r5dev, used for stripe_head data copies and parity computation; the page pointers are saved in sh->dev[].page.
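grow_buffers itself is short; in this kernel generation it reads roughly like this (paraphrased):

        static int grow_buffers(struct stripe_head *sh)
        {
                int num = sh->raid_conf->pool_size;     /* one page per r5dev */
                int i;

                for (i = 0; i < num; i++) {
                        struct page *page = alloc_page(GFP_KERNEL);
                        if (!page)
                                return 1;               /* caller shrinks and frees */
                        sh->dev[i].page = page;
                }
                return 0;
        }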
Line 1494: set the struct stripe_head reference count to 1; release_stripe at line 1497 decrements it.
Line 1495: increment the array's active stripe count.
Line 1496: initialize the lru list head.
Line 1497: release the struct stripe_head onto the idle stripe list. release_stripe eventually reaches do_release_stripe, which executes the following lines:
- 228 list_add_tail(&sh->lru, &conf->inactive_list);
- 229 wake_up(&conf->wait_for_stripe);
- 230 if (conf->retry_read_aligned)
- 231 md_wakeup_thread(conf->mddev->thread);
Line 228: add the struct stripe_head to inactive_list, the idle stripe list.
Line 229: wake up requests waiting for an idle stripe; each array's pool of struct stripe_head is limited, so a request that cannot get one sleeps on this wait queue.
Lines 230-231: if an aligned chunk read is waiting to be retried, wake the array thread.
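The consumer side of this list is get_free_stripe, which pops the head of inactive_list (get_active_stripe sleeps on wait_for_stripe when the list is empty). Roughly, from the same raid5.c (paraphrased):

        static struct stripe_head *get_free_stripe(struct r5conf *conf)
        {
                struct stripe_head *sh = NULL;
                struct list_head *first;

                if (list_empty(&conf->inactive_list))
                        goto out;
                first = conf->inactive_list.next;
                sh = list_entry(first, struct stripe_head, lru);
                list_del_init(first);
                remove_hash(sh);        /* no longer findable by sector */
                atomic_inc(&conf->active_stripes);
        out:
                return sh;
        }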
Returning to setup_conf: at this point grow_stripes has preallocated NR_STRIPES (256 in this kernel generation) struct stripe_heads for the array.
Line 5262: create the raid5 main thread.
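md_register_thread spawns a kthread whose body, md_thread in md.c, is essentially a wake-and-dispatch loop: it waits for the THREAD_WAKEUP flag that md_wakeup_thread sets, then calls the handler it was registered with, here raid5d. A loose sketch with details elided:

        while (!kthread_should_stop()) {
                wait_event_interruptible_timeout(thread->wqueue,
                        test_bit(THREAD_WAKEUP, &thread->flags)
                        || kthread_should_stop(),
                        thread->timeout);
                clear_bit(THREAD_WAKEUP, &thread->flags);
                if (!kthread_should_stop())
                        thread->run(thread->mddev);     /* raid5d for raid456 */
        }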
With that, setup_conf is done; back to run.
Lines 5434-5437: link conf and mddev and hand the array thread over to mddev.
Lines 5494-5556: fill in the relevant mddev fields.
Line 5558: set up the mddev request queue, struct queue_limits, and related initialization.
To sum up, raid5's run plays the same role as raid1's: it presents a virtual block device upward, wraps the member disks below, and opens the channel for read/write requests. The difference is that all raid5 I/O is organized around struct stripe_head.
The next section looks at running a raid10 array.