do_generic_file_read()

最新推荐文章于 2021-12-31 15:24:01 发布

weixin_34034670

最新推荐文章于 2021-12-31 15:24:01 发布

阅读量163

点赞数

文章标签： python

原文链接：https://my.oschina.net/u/2475751/blog/538320

版权

2019独角兽企业重金招聘Python工程师标准>>>

ftrace:

 0)               |  vfs_read() {
 0)               |    generic_file_aio_read() {
 0)               |      do_generic_file_read() {
 0)   0.195 us    |        find_get_page();
 0)   0.109 us    |        find_get_page();
 0)   0.263 us    |        touch_atime();
 0)   2.820 us    |      }
 0)   3.559 us    |    }
 0)   5.063 us    |  }

其中，大量重复的路径：

 2) ! 11994.70 us |          ocfs2_readpage();
 2)   0.149 us    |          find_get_page();
 2)               |          page_cache_sync_readahead() {
 2)   0.547 us    |            __page_cache_alloc();
 2)   0.181 us    |            __page_cache_alloc();
 2)   0.184 us    |            __page_cache_alloc();
 2)   0.178 us    |            __page_cache_alloc();
 2)   4.190 us    |          }
 2)   0.073 us    |          find_get_page();
 2)   0.176 us    |          __page_cache_alloc();
 2) ! 11983.20 us |          ocfs2_readpage();
 2)   0.150 us    |          find_get_page();
 2)               |          page_cache_sync_readahead() {
 2)   0.525 us    |            __page_cache_alloc();
 2)   0.176 us    |            __page_cache_alloc();
 2)   0.222 us    |            __page_cache_alloc();
 2)   0.185 us    |            __page_cache_alloc();
 2)   4.198 us    |          }
 2)   0.068 us    |          find_get_page();
 2)   0.182 us    |          __page_cache_alloc();
 2) ! 11986.95 us |          ocfs2_readpage();

分析do_generic_file_read()函数：

1258 /**
1259  * do_generic_file_read - generic file read routine
1260  * @filp:   the file to read
1261  * @ppos:   current file position
1262  * @desc:   read_descriptor
1263  * @actor:  read method
1264  *
1265  * This is a generic file read routine, and uses the
1266  * mapping->a_ops->readpage() function for the actual low-level stuff.
1267  *
1268  * This is really ugly. But the goto's actually try to clarify some
1269  * of the logic when it comes to error handling etc.
1270  */
1271 static void do_generic_file_read(struct file *filp, loff_t *ppos,
1272         read_descriptor_t *desc, read_actor_t actor)
1273 {
1274     struct address_space *mapping = filp->f_mapping;
1275     struct inode *inode = mapping->host;
1276     struct file_ra_state *ra = &filp->f_ra;
1277     pgoff_t index;
1278     pgoff_t last_index;
1279     pgoff_t prev_index;
1280     unsigned long offset;      /* offset into pagecache page */
1281     unsigned int prev_offset;
1282     int error;
1283 
1284     index = *ppos >> PAGE_CACHE_SHIFT;
//原来预读结构保存了上次读位置
1285     prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
1286     prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
//last_index是本次读请求的结束位置(*ppos + desc->count)向上取整得到的页索引，即本次要读的最后一页的下一页（不含）
1287     last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
1288     offset = *ppos & ~PAGE_CACHE_MASK;
1289 
1290     for (;;) {
//一次拷贝一页，末尾页特殊处理
1291         struct page *page;
1292         pgoff_t end_index;
1293         loff_t isize; 
1294         unsigned long nr, ret;
1295      
1296         cond_resched();

割....

1297 find_page: 
//以index为key，在address space的radix tree,即->page_tree中，查找并引用
1298         page = find_get_page(mapping, index);
1299         if (!page) {
//在并发读写同一个文件的测试中，发现总是会查找失败，不知道为什么？像这样：
// 2)               |          page_cache_sync_readahead() {
// 2)   0.857 us    |            __page_cache_alloc();
// 2)   0.205 us    |            __page_cache_alloc();
// 2)   0.185 us    |            __page_cache_alloc();
// 2)   0.180 us    |            __page_cache_alloc();
// 2)   5.096 us    |          }
1300             page_cache_sync_readahead(mapping,
1301                     ra, filp, 
1302                     index, last_index - index);
1303             page = find_get_page(mapping, index);
//同步预读之后如果还是找不到该页，就跳到no_cached_page，由page_cache_alloc_cold()再分配一页（对应下面trace里的__page_cache_alloc()）：
// 2)   0.088 us    |          find_get_page();
// 2)   0.223 us    |          __page_cache_alloc();
1304             if (unlikely(page == NULL))
1305                 goto no_cached_page;
1306         } 
//#define PageReadahead(page)     test_bit(PG_readahead, &(page)->flags)
1307         if (PageReadahead(page)) { 
/**
该页已经被预读到了，说明预读管用，所以可能又顺势多申请一些页面；
 1)               |          page_cache_async_readahead() {
 1)   1.114 us    |            __page_cache_alloc();
 ...
 1)   0.318 us    |            __page_cache_alloc();
 1) + 45.123 us   |          }
 
测试中，凡是进到这儿，就不会去调用ocfs2_readpage, 但是经常会在这里
浪费很多时间，比如
 0) ! 2390.071 us |          __lock_page_killable();
*/
1308             page_cache_async_readahead(mapping,
1309                     ra, filp, page,
1310                     index, last_index - index);
1311         }
1312         if (!PageUptodate(page)) {
//如果页不是最新的
1313             if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
1314                     !mapping->a_ops->is_partially_uptodate)
//如果块大小等于页大小，或者文件系统不支持局部最新检查（没有实现is_partially_uptodate），就直接走page_not_up_to_date
1315                 goto page_not_up_to_date;
1316             if (!trylock_page(page))
//trylock_page()非阻塞地尝试获取页锁（PG_locked）；获取失败说明页正被别人锁着，跳到page_not_up_to_date去睡眠等锁
1317                 goto page_not_up_to_date;
1318             /* Did it get truncated before we got the lock? */
//这里的lock就是上面trylock_page()刚拿到的页锁（PG_locked）；
//page->mapping为空说明在拿到锁之前该页已经被截断（truncate），不再属于这个address space
1319             if (!page->mapping)
1320                 goto page_not_up_to_date_locked;
1321             if (!mapping->a_ops->is_partially_uptodate(page,
1322                                 desc, offset))
//本次要读的范围内数据不是最新的（注意不是“脏”，而是还没有从磁盘读入），需要走readpage
1323                 goto page_not_up_to_date_locked;
1324             unlock_page(page);
1325         }

割...

1326 page_ok:
//页已经在pagecache里面了
1327         /*
1328          * i_size must be checked after we know the page is Uptodate.
1329          *
1330          * Checking i_size after the check allows us to calculate
1331          * the correct value for "nr", which means the zero-filled
1332          * part of the page is not copied back to userspace (unless
1333          * another truncate extends the file - this is desired though).
1334          */
1335 
1336         isize = i_size_read(inode);
1337         end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1338         if (unlikely(!isize || index > end_index)) {
1339             page_cache_release(page);
1340             goto out;
1341         }
1342 
1343         /* nr is the maximum number of bytes to copy from this page */
1344         nr = PAGE_CACHE_SIZE;
1345         if (index == end_index) {
1346             nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1347             if (nr <= offset) {
1348                 page_cache_release(page);
1349                 goto out;
1350             }
1351         }
1352         nr = nr - offset;
1353 
1354         /* If users can be writing to this page using arbitrary
1355          * virtual addresses, take care about potential aliasing
1356          * before reading the page on the kernel side.
1357          */
1358         if (mapping_writably_mapped(mapping))
1359             flush_dcache_page(page);
1360 
1361         /*
1362          * When a sequential read accesses a page several times,
1363          * only mark it as accessed the first time.
1364          */
1365         if (prev_index != index || offset != prev_offset)
1366             mark_page_accessed(page);
1367         prev_index = index;
1368 
1369         /*
1370          * Ok, we have the page, and it's up-to-date, so
1371          * now we can copy it to user space...
1372          *
1373          * The actor routine returns how many bytes were actually used..
1374          * NOTE! This may not be the same as how much of a user buffer
1375          * we filled up (we may be padding etc), so we can only update
1376          * "pos" here (the actor routine has to update the user buffer
1377          * pointers and the remaining count).
1378          */
1379         ret = actor(desc, page, offset, nr);
1380         offset += ret;
1381         index += offset >> PAGE_CACHE_SHIFT;
1382         offset &= ~PAGE_CACHE_MASK;
1383         prev_offset = offset;
1384 
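//为什么要释放这个页呢？ 
//数据已经从内核态page中，拷贝到用户空间；但万一下次读还是这个页呢？
//答：page_cache_release()只是放掉find_get_page()时增加的那一次引用，并不会把页从page cache中移除；
//下次再读这个页时，find_get_page()仍然可以在page cache中找到它。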
1385         page_cache_release(page);
//nr代表需要拷贝的字节数， ret表示实际拷贝的字节数；
//若本次拷贝成功，并且还没有读完
1386         if (ret == nr && desc->count)
1387             continue;
1388         goto out;

割...

1390 page_not_up_to_date:
/* 有时候在此消耗很长时间：
 1) ! 10082.08 us |          __lock_page_killable();
 */
1391         /* Get exclusive access to the page ... */
1392         error = lock_page_killable(page);
1393         if (unlikely(error))
1394             goto readpage_error;
1395 
1396 page_not_up_to_date_locked:
1397         /* Did it get truncated before we got the lock? */
1398         if (!page->mapping) {
1399             unlock_page(page);
1400             page_cache_release(page);
//该页在我们拿到锁之前已被截断（page->mapping为空），释放后回到循环开头重新查找...
1401             continue;
1402         }
1403 
1404         /* Did somebody else fill it already? */
1405         if (PageUptodate(page)) {
1406             unlock_page(page);
1407             goto page_ok;
1408         }

割...

1410 readpage:
1411         /*
1412          * A previous I/O error may have been due to temporary
1413          * failures, eg. multipath errors.
1414          * PG_error will be set again if readpage fails.
1415          */
//含义：上一次I/O失败可能只是临时故障（例如多路径错误），所以重读之前先清掉PG_error标志；
//如果这次readpage仍然失败，PG_error会被重新置位
1416         ClearPageError(page);
1417         /* Start the actual read. The read will unlock the page. */
/*
调用ocfs2_readpage, 也是读耗时最严重的，清一色的严重：
1) ! 12604.57 us |          ocfs2_readpage();
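但readpage是pagecache层向block层发送读请求， 为什么要unlock the page呢？
因为进入readpage时页是锁住的（例如上面的lock_page_killable()/trylock_page()），读I/O完成之后由readpage一侧解锁，
这样其他阻塞在lock_page_killable()上等待该页的读者才能被唤醒。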
*/
1418         error = mapping->a_ops->readpage(filp, page);
1419 
1420         if (unlikely(error)) {
1421             if (error == AOP_TRUNCATED_PAGE) {
1422                 page_cache_release(page);
1423                 goto find_page;
1424             }
1425             goto readpage_error;
1426         }
1427 
1428         if (!PageUptodate(page)) {
//刚刚向block层发出读请求，页怎么会还不是最新的(Uptodate)呢？（注意这里不是“脏”）
//因为readpage可能只是提交了读请求就返回，此时I/O尚未完成；下面的lock_page_killable()会等到页被解锁（即I/O结束）再重新检查
//从测试来看，还没发生这种情况
1429             error = lock_page_killable(page);
1430             if (unlikely(error))
1431                 goto readpage_error;
1432             if (!PageUptodate(page)) {
1433                 if (page->mapping == NULL) {
1434                     /*
1435                      * invalidate_mapping_pages got it
1436                      */
1437                     unlock_page(page);
1438                     page_cache_release(page);
1439                     goto find_page;
1440                 }
1441                 unlock_page(page);
1442                 shrink_readahead_size_eio(filp, ra);
1443                 error = -EIO;
1444                 goto readpage_error;
1445             }
1446             unlock_page(page);
1447         }
1448 
//page_ok表示页准备就绪，接着从内核态往用户态拷贝；拷贝完如果还有数据要读就继续循环处理下一页，否则退出
1449         goto page_ok;

割...

1451 readpage_error:
1452         /* UHHUH! A synchronous read error occurred. Report it */
1453         desc->error = error;
1454         page_cache_release(page);
1455         goto out;
1456 
1457 no_cached_page:
1458         /*
1459          * Ok, it wasn't cached, so we need to create a new
1460          * page..
1461          */
1462         page = page_cache_alloc_cold(mapping);
1463         if (!page) {
1464             desc->error = -ENOMEM;
1465             goto out;
1466         }
1467         error = add_to_page_cache_lru(page, mapping,
1468                         index, GFP_KERNEL);
1469         if (error) {
1470             page_cache_release(page);
1471             if (error == -EEXIST)
1472                 goto find_page;
1473             desc->error = error;
1474             goto out;
1475         }
1476         goto readpage;
1477     }

割...

1479 out:
1480     ra->prev_pos = prev_index;
1481     ra->prev_pos <<= PAGE_CACHE_SHIFT;
1482     ra->prev_pos |= prev_offset;
1483 
1484     *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
1485     file_accessed(filp);
1486 }

转载于:https://my.oschina.net/u/2475751/blog/538320