ftrace:
0) | vfs_read() {
0) | generic_file_aio_read() {
0) | do_generic_file_read() {
0) 0.195 us | find_get_page();
0) 0.109 us | find_get_page();
0) 0.263 us | touch_atime();
0) 2.820 us | }
0) 3.559 us | }
0) 5.063 us | }
其中,大量重复的路径:
2) ! 11994.70 us | ocfs2_readpage();
2) 0.149 us | find_get_page();
2) | page_cache_sync_readahead() {
2) 0.547 us | __page_cache_alloc();
2) 0.181 us | __page_cache_alloc();
2) 0.184 us | __page_cache_alloc();
2) 0.178 us | __page_cache_alloc();
2) 4.190 us | }
2) 0.073 us | find_get_page();
2) 0.176 us | __page_cache_alloc();
2) ! 11983.20 us | ocfs2_readpage();
2) 0.150 us | find_get_page();
2) | page_cache_sync_readahead() {
2) 0.525 us | __page_cache_alloc();
2) 0.176 us | __page_cache_alloc();
2) 0.222 us | __page_cache_alloc();
2) 0.185 us | __page_cache_alloc();
2) 4.198 us | }
2) 0.068 us | find_get_page();
2) 0.182 us | __page_cache_alloc();
2) ! 11986.95 us | ocfs2_readpage();
分析do_generic_file_read()函数:
1258 /**
1259 * do_generic_file_read - generic file read routine
1260 * @filp: the file to read
1261 * @ppos: current file position
1262 * @desc: read_descriptor
1263 * @actor: read method
1264 *
1265 * This is a generic file read routine, and uses the
1266 * mapping->a_ops->readpage() function for the actual low-level stuff.
1267 *
1268 * This is really ugly. But the goto's actually try to clarify some
1269 * of the logic when it comes to error handling etc.
1270 */
1271 static void do_generic_file_read(struct file *filp, loff_t *ppos,
1272 read_descriptor_t *desc, read_actor_t actor)
1273 {
1274 struct address_space *mapping = filp->f_mapping;
1275 struct inode *inode = mapping->host;
1276 struct file_ra_state *ra = &filp->f_ra;
1277 pgoff_t index;
1278 pgoff_t last_index;
1279 pgoff_t prev_index;
1280 unsigned long offset; /* offset into pagecache page */
1281 unsigned int prev_offset;
1282 int error;
1283
1284 index = *ppos >> PAGE_CACHE_SHIFT;
//原来预读结构保存了上次读位置
1285 prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
1286 prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
//last_index是本次读请求结束位置(*ppos + desc->count)向上取整得到的页索引,即本次要读的最后一页的下一页(不含),而不是"下次"读操作的位置
1287 last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
1288 offset = *ppos & ~PAGE_CACHE_MASK;
1289
1290 for (;;) {
//一次拷贝一页,末尾页特殊处理
1291 struct page *page;
1292 pgoff_t end_index;
1293 loff_t isize;
1294 unsigned long nr, ret;
1295
1296 cond_resched();
割....
1297 find_page:
//以index为key,在address space的radix tree,即->page_tree中,查找并引用
1298 page = find_get_page(mapping, index);
1299 if (!page) {
//在并发读写同一个文件的测试中,发现总是会查找失败,不知道为什么?像这样:
// 2) | page_cache_sync_readahead() {
// 2) 0.857 us | __page_cache_alloc();
// 2) 0.205 us | __page_cache_alloc();
// 2) 0.185 us | __page_cache_alloc();
// 2) 0.180 us | __page_cache_alloc();
// 2) 5.096 us | }
1300 page_cache_sync_readahead(mapping,
1301 ra, filp,
1302 index, last_index - index);
1303 page = find_get_page(mapping, index);
//如果同步预读之后还是找不到,就跳到no_cached_page,由page_cache_alloc_cold()单独再分配一页(下面trace里的__page_cache_alloc()应该就是这次分配):
// 2) 0.088 us | find_get_page();
// 2) 0.223 us | __page_cache_alloc();
1304 if (unlikely(page == NULL))
1305 goto no_cached_page;
1306 }
//#define PageReadahead(page) test_bit(PG_readahead, &(page)->flags)
1307 if (PageReadahead(page)) {
/**
该页已经被预读到了,说明预读管用,所以可能又顺势多申请一些页面;
1) | page_cache_async_readahead() {
1) 1.114 us | __page_cache_alloc();
...
1) 0.318 us | __page_cache_alloc();
1) + 45.123 us | }
测试中,凡是进到这儿,就不会去调用ocfs2_readpage, 但是经常会在这里
浪费很多时间,比如
0) ! 2390.071 us | __lock_page_killable();
*/
1308 page_cache_async_readahead(mapping,
1309 ra, filp, page,
1310 index, last_index - index);
1311 }
1312 if (!PageUptodate(page)) {
//如果页不是最新的
1313 if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
1314 !mapping->a_ops->is_partially_uptodate)
//如果块大小等于页大小; 或者文件系统没有实现is_partially_uptodate(不支持局部uptodate检查);
1315 goto page_not_up_to_date;
1316 if (!trylock_page(page))
//trylock_page()尝试给页加锁; 加锁失败(页已被别人锁住)就走page_not_up_to_date,在那里用lock_page_killable()等锁
1317 goto page_not_up_to_date;
1318 /* Did it get truncated before we got the lock? */
//这里的lock就是上面trylock_page()刚拿到的页锁(page lock)
1319 if (!page->mapping)
1320 goto page_not_up_to_date_locked;
1321 if (!mapping->a_ops->is_partially_uptodate(page,
1322 desc, offset))
//要读的这段范围不是uptodate的(注意不是"脏",而是内容尚未读入),需要带着页锁去重新读页
1323 goto page_not_up_to_date_locked;
1324 unlock_page(page);
1325 }
割...
1326 page_ok:
//页已经在pagecache里面,并且内容是最新的(Uptodate),可以拷贝给用户了
1327 /*
1328 * i_size must be checked after we know the page is Uptodate.
1329 *
1330 * Checking i_size after the check allows us to calculate
1331 * the correct value for "nr", which means the zero-filled
1332 * part of the page is not copied back to userspace (unless
1333 * another truncate extends the file - this is desired though).
1334 */
1335
1336 isize = i_size_read(inode);
1337 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1338 if (unlikely(!isize || index > end_index)) {
1339 page_cache_release(page);
1340 goto out;
1341 }
1342
1343 /* nr is the maximum number of bytes to copy from this page */
1344 nr = PAGE_CACHE_SIZE;
1345 if (index == end_index) {
1346 nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1347 if (nr <= offset) {
1348 page_cache_release(page);
1349 goto out;
1350 }
1351 }
1352 nr = nr - offset;
1353
1354 /* If users can be writing to this page using arbitrary
1355 * virtual addresses, take care about potential aliasing
1356 * before reading the page on the kernel side.
1357 */
1358 if (mapping_writably_mapped(mapping))
1359 flush_dcache_page(page);
1360
1361 /*
1362 * When a sequential read accesses a page several times,
1363 * only mark it as accessed the first time.
1364 */
1365 if (prev_index != index || offset != prev_offset)
1366 mark_page_accessed(page);
1367 prev_index = index;
1368
1369 /*
1370 * Ok, we have the page, and it's up-to-date, so
1371 * now we can copy it to user space...
1372 *
1373 * The actor routine returns how many bytes were actually used..
1374 * NOTE! This may not be the same as how much of a user buffer
1375 * we filled up (we may be padding etc), so we can only update
1376 * "pos" here (the actor routine has to update the user buffer
1377 * pointers and the remaining count).
1378 */
1379 ret = actor(desc, page, offset, nr);
1380 offset += ret;
1381 index += offset >> PAGE_CACHE_SHIFT;
1382 offset &= ~PAGE_CACHE_MASK;
1383 prev_offset = offset;
1384
//为什么要释放这个页呢?
//数据已经从内核态page中,拷贝到用户空间; page_cache_release()只是放掉find_get_page()取得的那次引用,
//页本身仍然留在pagecache里,所以下次读还是这个页时,find_get_page()依然能找到它
1385 page_cache_release(page);
//nr代表需要拷贝的字节数, ret表示实际拷贝的字节数;
//若本次拷贝成功,并且还没有读完
1386 if (ret == nr && desc->count)
1387 continue;
1388 goto out;
割...
1390 page_not_up_to_date:
/* 有时候在此消耗很长时间:
1) ! 10082.08 us | __lock_page_killable();
*/
1391 /* Get exclusive access to the page ... */
1392 error = lock_page_killable(page);
1393 if (unlikely(error))
1394 goto readpage_error;
1395
1396 page_not_up_to_date_locked:
1397 /* Did it get truncated before we got the lock? */
1398 if (!page->mapping) {
1399 unlock_page(page);
1400 page_cache_release(page);
//页在拿到锁之前已被truncate(page->mapping为空): 解锁并放掉引用后,从循环开头重新查找
1401 continue;
1402 }
1403
1404 /* Did somebody else fill it already? */
1405 if (PageUptodate(page)) {
1406 unlock_page(page);
1407 goto page_ok;
1408 }
割...
1410 readpage:
1411 /*
1412 * A previous I/O error may have been due to temporary
1413 * failures, eg. multipath errors.
1414 * PG_error will be set again if readpage fails.
1415 */
//清除上一次I/O错误留下的PG_error标志(之前的错误可能只是暂时性的,比如multipath错误); 如果这次readpage仍然失败,该标志会被重新置位
1416 ClearPageError(page);
1417 /* Start the actual read. The read will unlock the page. */
/*
调用ocfs2_readpage, 也是读耗时最严重的,清一色的严重:
1) ! 12604.57 us | ocfs2_readpage();
readpage是pagecache层向block层发送读请求; 调用readpage之前页是加了锁的(比如上面的lock_page_killable()),
读完成后把页解锁,等在这个页锁上的其他读者才能继续,所以注释里说"The read will unlock the page"。
*/
1418 error = mapping->a_ops->readpage(filp, page);
1419
1420 if (unlikely(error)) {
1421 if (error == AOP_TRUNCATED_PAGE) {
1422 page_cache_release(page);
1423 goto find_page;
1424 }
1425 goto readpage_error;
1426 }
1427
1428 if (!PageUptodate(page)) {
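//这里的!PageUptodate不是"脏": readpage只是发起读(见上面注释"Start the actual read"),返回时I/O可能还没有完成,
//所以要先用lock_page_killable()等页被解锁,再重新检查PageUptodate; 仍然不是uptodate才算读失败(-EIO)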
//从测试来看,还没发生这种情况
1429 error = lock_page_killable(page);
1430 if (unlikely(error))
1431 goto readpage_error;
1432 if (!PageUptodate(page)) {
1433 if (page->mapping == NULL) {
1434 /*
1435 * invalidate_mapping_pages got it
1436 */
1437 unlock_page(page);
1438 page_cache_release(page);
1439 goto find_page;
1440 }
1441 unlock_page(page);
1442 shrink_readahead_size_eio(filp, ra);
1443 error = -EIO;
1444 goto readpage_error;
1445 }
1446 unlock_page(page);
1447 }
1448
//page_ok表示页准备就绪,接着从内核态往用户态拷贝; 拷贝完若还有数据要读(ret == nr && desc->count)则继续循环,否则退出
1449 goto page_ok;
割...
1451 readpage_error:
1452 /* UHHUH! A synchronous read error occurred. Report it */
1453 desc->error = error;
1454 page_cache_release(page);
1455 goto out;
1456
1457 no_cached_page:
1458 /*
1459 * Ok, it wasn't cached, so we need to create a new
1460 * page..
1461 */
1462 page = page_cache_alloc_cold(mapping);
1463 if (!page) {
1464 desc->error = -ENOMEM;
1465 goto out;
1466 }
1467 error = add_to_page_cache_lru(page, mapping,
1468 index, GFP_KERNEL);
1469 if (error) {
1470 page_cache_release(page);
1471 if (error == -EEXIST)
1472 goto find_page;
1473 desc->error = error;
1474 goto out;
1475 }
1476 goto readpage;
1477 }
割...
1479 out:
1480 ra->prev_pos = prev_index;
1481 ra->prev_pos <<= PAGE_CACHE_SHIFT;
1482 ra->prev_pos |= prev_offset;
1483
1484 *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
1485 file_accessed(filp);
1486 }