在RAID未降级的场景下,读修复的逻辑为:先尝试从发生读错误的成员盘重新读取一次;如果重读仍然失败,则依次从其他成员盘读取数据,读取成功后再将数据回写到发生读错误的磁盘,并重新读取以验证修复结果。
// HEAD detached at v4.18-rc8
// drivers/md/raid1.c
2204 /*
2205 * This is a kernel thread which:
2206 *
2207 * 1. Retries failed read operations on working mirrors.
2208 * 2. Updates the raid superblock when problems encounter.
2209 * 3. Performs writes following reads for array synchronising.
2210 */
2211
// @conf: pointer to the r1conf structure of this RAID1 array.
// @read_disk: slot of the member disk the failed read came from.
// @sect: starting sector of the read to repair.
// @sectors: total number of sectors to repair.
2212 static void fix_read_error(struct r1conf *conf, int read_disk,
2213 sector_t sect, int sectors)
2214 {
2215 struct mddev *mddev = conf->mddev;
2216 while(sectors) {
2217 int s = sectors;
2218 int d = read_disk;
2219 int success = 0;
2220 int start;
2221 struct md_rdev *rdev;
2222
// 1 << 9 = 512 bytes per sector, so PAGE_SIZE >> 9 = 8 sectors per 4K page.
// Repair at most one page worth of sectors (8) per loop iteration, since
// conf->tmppage is a single page.
2223 if (s > (PAGE_SIZE>>9))
2224 s = PAGE_SIZE >> 9;
2225
// First retry the read on read_disk itself; if that fails, walk every
// other member disk in turn until one read succeeds.
2226 do {
2227 sector_t first_bad;
2228 int bad_sectors;
2229
2230 rcu_read_lock();
2231 rdev = rcu_dereference(conf->mirrors[d].rdev);
2232 if (rdev &&
2233 (test_bit(In_sync, &rdev->flags) ||
// Disk is usable if it is In_sync, or it is not Faulty and the target
// range lies below recovery_offset (i.e. already recovered); in either
// case the range must also be absent from the bad-block list.
2234 (!test_bit(Faulty, &rdev->flags) &&
2235 rdev->recovery_offset >= sect + s)) &&
2236 is_badblock(rdev, sect, s,
2237 &first_bad, &bad_sectors) == 0) {
2238 atomic_inc(&rdev->nr_pending);
2239 rcu_read_unlock();
// Synchronously read the sectors from this member into conf->tmppage.
2240 if (sync_page_io(rdev, sect, s<<9,
2241 conf->tmppage, REQ_OP_READ, 0, false))
2242 success = 1;
2243 rdev_dec_pending(rdev, mddev);
2244 if (success)
2245 break;
2246 } else
2247 rcu_read_unlock();
2248 d++;
2249 if (d == conf->raid_disks * 2)
2250 d = 0;
2251 } while (!success && d != read_disk);
2252
// No member disk could supply the data: record the range as bad, and if
// bad blocks cannot be recorded, fail the whole device.
2253 if (!success) {
2254 /* Cannot read from anywhere - mark it bad */
2255 struct md_rdev *rdev = conf->mirrors[read_disk].rdev;
2256 if (!rdev_set_badblocks(rdev, sect, s, 0))
2257 md_error(mddev, rdev);
2258 break;
2259 }
// Data was recovered from another member: write it back to each disk
// between the successful slot and read_disk, then re-read to verify.
// The write-back is also what lets a drive remap/repair the bad sector
// internally (e.g. SSD sector remapping).
2260 /* write it back and re-read */
2261 start = d; // d is now the slot that supplied good data.
2262 while (d != read_disk) {
2263 if (d==0)
2264 d = conf->raid_disks * 2;
2265 d--;
2266 rcu_read_lock();
2267 rdev = rcu_dereference(conf->mirrors[d].rdev);
2268 if (rdev &&
2269 !test_bit(Faulty, &rdev->flags)) {
2270 atomic_inc(&rdev->nr_pending);
2271 rcu_read_unlock();
// Write the recovered data back to this disk.
2272 r1_sync_page_io(rdev, sect, s,
2273 conf->tmppage, WRITE);
2274 rdev_dec_pending(rdev, mddev);
2275 } else
2276 rcu_read_unlock();
2277 }
2278 d = start; // reset d back to the slot that supplied good data.
2279 while (d != read_disk) {
2280 char b[BDEVNAME_SIZE];
2281 if (d==0)
2282 d = conf->raid_disks * 2;
2283 d--;
2284 rcu_read_lock();
2285 rdev = rcu_dereference(conf->mirrors[d].rdev);
2286 if (rdev &&
2287 !test_bit(Faulty, &rdev->flags)) {
2288 atomic_inc(&rdev->nr_pending);
2289 rcu_read_unlock();
2290 if (r1_sync_page_io(rdev, sect, s,
2291 conf->tmppage, READ)) {
2292 atomic_add(s, &rdev->corrected_errors);
2293 pr_info("md/raid1:%s: read error corrected (%d sectors at %llu on %s)\n",
2294 mdname(mddev), s,
2295 (unsigned long long)(sect +
2296 rdev->data_offset),
2297 bdevname(rdev->bdev, b));
2298 }
2299 rdev_dec_pending(rdev, mddev);
2300 } else
2301 rcu_read_unlock();
2302 }
2303 sectors -= s;
2304 sect += s;
2305 }
2306 }
// drivers/md/md.c
// @rdev: member device to issue the I/O against.
// @sector: starting sector of the I/O.
// @size: size of this I/O in bytes.
// @page: page that holds (write) or receives (read) the data.
// @op: request operation type (e.g. REQ_OP_READ).
// @op_flags: request flags for the operation.
// @metadata_op: true when this I/O targets metadata (superblock area).
// Returns nonzero on success: success means bi_status == BLK_STS_OK (0),
// so `!bi_status` evaluates to true.
850 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
851 struct page *page, int op, int op_flags, bool metadata_op)
852 {
853 struct bio *bio = md_bio_alloc_sync(rdev->mddev);
854 int ret;
855
856 if (metadata_op && rdev->meta_bdev)
857 bio_set_dev(bio, rdev->meta_bdev);
858 else
859 bio_set_dev(bio, rdev->bdev);
860 bio_set_op_attrs(bio, op, op_flags);
861 if (metadata_op)
862 bio->bi_iter.bi_sector = sector + rdev->sb_start;
863 else if (rdev->mddev->reshape_position != MaxSector &&
864 (rdev->mddev->reshape_backwards ==
865 (sector >= rdev->mddev->reshape_position)))
866 bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
867 else
868 bio->bi_iter.bi_sector = sector + rdev->data_offset;
869 bio_add_page(bio, page, size, 0);
870
// Submit the bio and block until it completes.
871 submit_bio_wait(bio);
872
// bi_status carries the bio completion status; 0 (BLK_STS_OK) is success.
873 ret = !bio->bi_status;
874 bio_put(bio);
875 return ret;
876 }
877 EXPORT_SYMBOL_GPL(sync_page_io);
// include/linux/blk_types.h
// Quoted for reference: BLK_STS_OK is 0, which is why sync_page_io's
// `ret = !bio->bi_status` is nonzero exactly when the I/O succeeded.
22 /*
23 * Block error status values. See block/blk-core:blk_errors for the details.
24 * Alpha cannot write a byte atomically, so we need to use 32-bit value.
25 */
26 #if defined(CONFIG_ALPHA) && !defined(__alpha_bwx__)
27 typedef u32 __bitwise blk_status_t;
28 #else
29 typedef u8 __bitwise blk_status_t;
30 #endif
31 #define BLK_STS_OK 0
32 #define BLK_STS_NOTSUPP ((__force blk_status_t)1)
33 #define BLK_STS_TIMEOUT ((__force blk_status_t)2)
34 #define BLK_STS_NOSPC ((__force blk_status_t)3)
35 #define BLK_STS_TRANSPORT ((__force blk_status_t)4)
36 #define BLK_STS_TARGET ((__force blk_status_t)5)
37 #define BLK_STS_NEXUS ((__force blk_status_t)6)
38 #define BLK_STS_MEDIUM ((__force blk_status_t)7)
39 #define BLK_STS_PROTECTION ((__force blk_status_t)8)
40 #define BLK_STS_RESOURCE ((__force blk_status_t)9)
41 #define BLK_STS_IOERR ((__force blk_status_t)10)
42
43 /* hack for device mapper, don't use elsewhere: */
44 #define BLK_STS_DM_REQUEUE ((__force blk_status_t)11)
45
46 #define BLK_STS_AGAIN ((__force blk_status_t)12)
47
48 /*
49 * BLK_STS_DEV_RESOURCE is returned from the driver to the block layer if
50 * device related resources are unavailable, but the driver can guarantee
51 * that the queue will be rerun in the future once resources become
52 * available again. This is typically the case for device specific
53 * resources that are consumed for IO. If the driver fails allocating these
54 * resources, we know that inflight (or pending) IO will free these
55 * resource upon completion.
56 *
57 * This is different from BLK_STS_RESOURCE in that it explicitly references
58 * a device specific resource. For resources of wider scope, allocation
59 * failure can happen without having pending IO. This means that we can't
60 * rely on request completions freeing these resources, as IO may not be in
61 * flight. Examples of that are kernel memory allocations, DMA mappings, or
62 * any other system wide resources.
63 */
64 #define BLK_STS_DEV_RESOURCE ((__force blk_status_t)13)
// drivers/md/raid1.c
// Returns 1 when the I/O succeeded, 0 on failure.
1914 static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
1915 int sectors, struct page *page, int rw)
1916 {
1917 if (sync_page_io(rdev, sector, sectors << 9, page, rw, 0, false))
1918 /* success */
1919 return 1;
1920 if (rw == WRITE) {
// A failed write marks the member device as having seen a write error.
1921 set_bit(WriteErrorSeen, &rdev->flags);
// The first write error also flags the device as wanting replacement and
// marks the array as needing recovery.
1922 if (!test_and_set_bit(WantReplacement,
1923 &rdev->flags))
1924 set_bit(MD_RECOVERY_NEEDED, &
1925 rdev->mddev->recovery);
1926 }
1927 /* need to record an error - either for the block or the device */
// NOTE(review): with v0.90 metadata the bad-block list is unsupported, so
// rdev_set_badblocks() returns 0 and the whole device is failed via
// md_error() — for both reads and writes.
1928 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
1929 md_error(rdev->mddev, rdev);
1930 return 0;
1931 }