先看下ftrace。可惜ocfs2_wait_for_mask不能被ftrace跟踪,也不知道为什么(猜测是因为它是static函数、被编译器内联了,有待确认)?不过,它直接调用了wait_for_completion,所以就用这个函数代替了,这个函数浪费的时间最多了!
0) | __ocfs2_cluster_lock() {
0) | wait_for_completion() {
------------------------------------------
0) iomaker-10882 => ocfs2dc-10793
------------------------------------------
0) 0.000 us | ocfs2_dlm_lock();
------------------------------------------
0) ocfs2dc-10793 => iomaker-10882
------------------------------------------
0) ! 11609.94 us | }
0) 0.000 us | ocfs2_dlm_lock();
0) ! 443.137 us | wait_for_completion();
0) ! 12053.08 us | } /* __ocfs2_cluster_lock */
这个函数分析起来没那么容易,又长又臭。分段过一遍代码吧:
1362 static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
1363 struct ocfs2_lock_res *lockres,
1364 int level,
1365 u32 lkm_flags,
1366 int arg_flags,
1367 int l_subclass,
1368 unsigned long caller_ip)
1369 {
// lockres是->ip_inode_lockres, level=EX, lkm_flags=0, arg_flags=0, l_subclass=OI_LS_NORMAL
// caller_ip=_RET_IP_,不知道为什么需要这个参数?(这里摘录的代码没有用到它;推测是在被省略的#1529-1541行里,当CONFIG_DEBUG_LOCK_ALLOC打开时传给lockdep,用来记录加锁的调用点——待对照源码确认)
1370 struct ocfs2_mask_waiter mw;
1371 int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
1372 int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
1373 unsigned long flags;
1374 unsigned int gen;
1375 int noqueue_attempted = 0;
1376
//ocfs2_mask_waiter结构设计的挺巧妙;mask暗指标志位掩码,lockres->l_flags有许多标志位,如OCFS2_LOCK_BUSY,
//OCFS2_LOCK_BLOCKED,OCFS2_LOCK_PENDING,etc.,->mw_mask用来指示哪一个bit,->mw_goal用来表示希望这个
//bit是0或1; waiter就意味着用wait_for_completion等待这个位变成我们想要的值。
1377 ocfs2_init_mask_waiter(&mw);
1378
//ocfs2_inode_inode_lops->flags=LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB
//所以if成立
1379 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
1380 lkm_flags |= DLM_LKF_VALBLK;
割...
1382 again:
//哪些情况会goto到这儿?
//#1493行,从ocfs2_dlm_lock中成功返回,但是BUSY标记还没有清除掉,这意味着ast还没被调用或返回,因为所有类型的ast都去清除这个标记;
//#1520行,因为arg_flags=0,#1514行的if语句不成立,所以这行根本执行不到;
//#1525行,1524行if语句一定成立,即ret=0
1383 wait = 0;
1384
1385 spin_lock_irqsave(&lockres->l_lock, flags);
1386
1387 if (catch_signals && signal_pending(current)) {
1388 ret = -ERESTARTSYS;
1389 goto unlock;
1390 }
1391
1392 mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
1393 "Cluster lock called on freeing lockres %s! flags "
1394 "0x%lx\n", lockres->l_name, lockres->l_flags);
1395
1396 /* We only compare against the currently granted level
1397 * here. If the lock is blocked waiting on a downconvert,
1398 * we'll get caught below. */
1399 if (lockres->l_flags & OCFS2_LOCK_BUSY &&
1400 level > lockres->l_level) {
//BUSY表示还有dlm lock请求没有返回,必须等着...
1401 /* is someone sitting in dlm_lock? If so, wait on
1402 * them. */
1403 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1404 wait = 1;
1405 goto unlock;
//unlock处,#1502行,不能理解!!! 总之,很快就进入等待函数了...
1406 }
1407
1408 if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) {
1409 /*
1410 * We've upconverted. If the lock now has a level we can
1411 * work with, we take it. If, however, the lock is not at the
1412 * required level, we go thru the full cycle. One way this could
1413 * happen is if a process requesting an upconvert to PR is
1414 * closely followed by another requesting upconvert to an EX.
1415 * If the process requesting EX lands here, we want it to
1416 * continue attempting to upconvert and let the process
1417 * requesting PR take the lock.
1418 * If multiple processes request upconvert to PR, the first one
1419 * here will take the lock. The others will have to go thru the
1420 * OCFS2_LOCK_BLOCKED check to ensure that there is no pending
1421 * downconvert request.
1422 */
//这段注释非常清楚
1423 if (level <= lockres->l_level)
1424 goto update_holders;
1425 }
1426
1427 if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
1428 !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
//BLOCKED: blocked waiting for downconvert;
//ocfs2_may_continue_on_blocked_lock()在想要的锁级别和->l_blocking兼容时返回1; 我猜是为了避免重复等待
1429 /* is the lock is currently blocked on behalf of
1430 * another node */
1431 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0);
1432 wait = 1;
1433 goto unlock;
割...
1436 if (level > lockres->l_level) {
//申请的锁级别要高于当前granted lock level
1437 if (noqueue_attempted > 0) {
//noqueue_attempted一直等于0,所以可以无视这个if语句;
1438 ret = -EAGAIN;
1439 goto unlock;
1440 }
//lkm_flags不会将DLM_LKF_NOQUEUE置位,所以也可以无视这个if语句
1441 if (lkm_flags & DLM_LKF_NOQUEUE)
1442 noqueue_attempted = 1;
1443
//->l_action用来指示ast回调时执行哪个动作,有OCFS2_AST_ATTACH, OCFS2_AST_CONVERT, OCFS2_AST_DOWNCONVERT;
1444 if (lockres->l_action != OCFS2_AST_INVALID)
1445 mlog(ML_ERROR, "lockres %s has action %u pending\n",
1446 lockres->l_name, lockres->l_action);
1447
1448 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
//如果OCFS2_LOCK_ATTACHED为0,表示该锁资源的LVB还没有初始化,也意味着这是初次对该资源加锁;
1449 lockres->l_action = OCFS2_AST_ATTACH;
1450 lkm_flags &= ~DLM_LKF_CONVERT;
1451 } else {
//否则,一定是申请锁转换
1452 lockres->l_action = OCFS2_AST_CONVERT;
1453 lkm_flags |= DLM_LKF_CONVERT;
1454 }
1455
1456 lockres->l_requested = level;
1457 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
1458 gen = lockres_set_pending(lockres);
1459 spin_unlock_irqrestore(&lockres->l_lock, flags);
1460
1461 BUG_ON(level == DLM_LOCK_IV);
1462 BUG_ON(level == DLM_LOCK_NL);
1463
1464 mlog(ML_BASTS, "lockres %s, convert from %d to %d\n",
1465 lockres->l_name, lockres->l_level, level);
1467 /* call dlm_lock to upgrade lock now */
1468 ret = ocfs2_dlm_lock(osb->cconn,
1469 level,
1470 &lockres->l_lksb,
1471 lkm_flags,
1472 lockres->l_name,
1473 OCFS2_LOCK_ID_MAX_LEN - 1);
1474 lockres_clear_pending(lockres, gen, osb);
1475 if (ret) {
1476 if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
1477 (ret != -EAGAIN)) {
1478 ocfs2_log_dlm_error("ocfs2_dlm_lock",
1479 ret, lockres);
1480 }
1481 ocfs2_recover_from_dlm_error(lockres, 1);
1482 goto out;
1483 }
1484
1485 mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n",
1486 lockres->l_name);
1487
1488 /* At this point we've gone inside the dlm and need to
1489 * complete our work regardless. */
1490 catch_signals = 0;
1491
1492 /* wait for busy to clear and carry on */
1493 goto again;
1494 }
割...
1496 update_holders:
1497 /* Ok, if we get here then we're good to go. */
//能走到这一步,说明已经成功拿到了想要的锁
1498 ocfs2_inc_holders(lockres, level);
1499
1500 ret = 0;
1501 unlock:
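//#1502行,不清楚要干什么?(推测:OCFS2_LOCK_UPCONVERT_FINISHING是ast在升级完成时置位的,用来阻止downconvert线程在升级请求者拿到锁之前就把锁降级;一旦有加锁者走到这里,就把它清掉,放行downconvert——待对照ocfs2源码注释确认)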
1502 lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
1503
1504 spin_unlock_irqrestore(&lockres->l_lock, flags);
1505 out:
1506 /*
1507 * This is helping work around a lock inversion between the page lock
1508 * and dlm locks. One path holds the page lock while calling aops
1509 * which block acquiring dlm locks. The voting thread holds dlm
1510 * locks while acquiring page locks while down converting data locks.
1511 * This block is helping an aop path notice the inversion and back
1512 * off to unlock its page lock before trying the dlm lock again.
1513 */
//因为arg_flags=0,这个if语句不会成立,直接无视
1514 if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
1515 mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
1516 wait = 0;
1517 if (lockres_remove_mask_waiter(lockres, &mw))
1518 ret = -EAGAIN;
1519 else
1520 goto again;
1521 }
//资源被占着,而且锁不兼容,只能慢慢等了!!!
1522 if (wait) {
1523 ret = ocfs2_wait_for_mask(&mw);
1524 if (ret == 0)
1525 goto again;
1526 mlog_errno(ret);
1527 }
1528 ocfs2_update_lock_stats(lockres, level, &mw, ret);
1542 return ret;
1543 }