Judging from the ftrace output, the biggest time sink is, unsurprisingly, __ocfs2_cluster_lock():
 0)               |  ocfs2_inode_lock_full_nested() {
 0)   0.000 us    |    ocfs2_wait_for_recovery();
 0) ! 12026.56 us |    __ocfs2_cluster_lock();
 0)   0.000 us    |    ocfs2_wait_for_recovery();
 0)   0.000 us    |    ocfs2_inode_lock_update();
 0) ! 12026.56 us |  }
 0)   0.000 us    |  ocfs2_inode_unlock();
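For reference, a trace like the one above can be captured with the function_graph tracer. The snippet below is only a minimal userspace sketch, assuming tracefs is mounted at /sys/kernel/tracing (older kernels expose it under /sys/kernel/debug/tracing) and the kernel was built with CONFIG_FUNCTION_GRAPH_TRACER; it is one way to reproduce such output, not necessarily how this trace was taken:

#include <stdio.h>

/* Write a string to a tracefs control file. */
static void write_str(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                return;
        }
        fputs(val, f);
        fclose(f);
}

int main(void)
{
        /* Select the function_graph tracer and limit graphing to our entry point. */
        write_str("/sys/kernel/tracing/current_tracer", "function_graph");
        write_str("/sys/kernel/tracing/set_graph_function",
                  "ocfs2_inode_lock_full_nested");
        write_str("/sys/kernel/tracing/tracing_on", "1");

        /* ... exercise the slow path on the OCFS2 mount here ... */

        /* The per-call timings can then be read from /sys/kernel/tracing/trace. */
        return 0;
}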
As before, a quick walk through this function:
2272 /*
2273 * returns < 0 error if the callback will never be called, otherwise
2274 * the result of the lock will be communicated via the callback.
2275 */
2276 int ocfs2_inode_lock_full_nested(struct inode *inode,
2277 struct buffer_head **ret_bh,
2278 int ex,
2279 int arg_flags,
2280 int subclass)
2281 {
//At this call site, arg_flags == 0 and subclass == OI_LS_NORMAL
2282 int status, level, acquired;
2283 u32 dlm_flags;
2284 struct ocfs2_lock_res *lockres = NULL;
2285 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2286 struct buffer_head *local_bh = NULL;
2287
2288 BUG_ON(!inode);
2289
//Judging from this log message, the inode lock is also the META lock. So the question is: is there a corresponding address_space lock?
//I suspect not, since what nodes share is the inode; each node keeps its own independent page cache.
2290 mlog(0, "inode %llu, take %s META lock\n",
2291 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2292 ex ? "EXMODE" : "PRMODE");
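//(Hedged note: as far as I can tell there is no separate address_space lock. Cross-node page-cache
// consistency is handled on the downconvert path instead: before a node gives up or downgrades this
// cluster lock it writes back and invalidates its cached pages, so the next holder reads fresh data.)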
2293
2294 status = 0;
2295 acquired = 0;
2296 /* We'll allow faking a readonly metadata lock for
2297 * rodevices. */
2298 if (ocfs2_is_hard_readonly(osb)) {
2299 if (ex)
2300 status = -EROFS;
2301 goto bail;
2302 }
2303
2304 if (ocfs2_mount_local(osb))
2305 goto local;
2306
//In other words, everything from here down to the local: label is cluster-specific handling. arg_flags is 0 on this path, so the if condition always holds.
2307 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
2308 ocfs2_wait_for_recovery(osb);
2309
//->ip_inode_lockres should be the META lock; ->ip_rw_lockres covers read/write, i.e. it is the data lock.
2310 lockres = &OCFS2_I(inode)->ip_inode_lockres;
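//(For context: struct ocfs2_inode_info in fs/ocfs2/inode.h carries several per-inode cluster lock
// resources -- ip_rw_lockres for read/write (data I/O) serialization, ip_inode_lockres for metadata,
// and ip_open_lockres for open/unlink coordination -- so the META lock taken here is just one of them.)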
//level=EX
2311 level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
2312 dlm_flags = 0;
2313 if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
2314 dlm_flags |= DLM_LKF_NOQUEUE;
2315
//dlm_flags == 0, arg_flags == 0
2316 status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags,
2317 arg_flags, subclass, _RET_IP_);
2318 if (status < 0) {
2319 if (status != -EAGAIN && status != -EIOCBRETRY)
2320 mlog_errno(status);
2321 goto bail;
2322 }
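//(This is where the ~12 ms from the ftrace output is spent: __ocfs2_cluster_lock() has to wait until
// the DLM grants the requested level, which may mean waiting for other nodes to flush state and
// downconvert their locks, so the latency is dominated by cluster messaging rather than local work.)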
2323
2324 /* Notify the error cleanup path to drop the cluster lock. */
2325 acquired = 1;
2326
2327 /* We wait twice because a node may have died while we were in
2328 * the lower dlm layers. The second time though, we've
2329 * committed to owning this lock so we don't allow signals to
2330 * abort the operation. */
2331 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
2332 ocfs2_wait_for_recovery(osb);
2333
2334 local:
2335 /*
2336 * We only see this flag if we're being called from
2337 * ocfs2_read_locked_inode(). It means we're locking an inode
2338 * which hasn't been populated yet, so clear the refresh flag
2339 * and let the caller handle it.
2340 */
2341 if (inode->i_state & I_NEW) {
2342 status = 0;
//Can this not be traced by ftrace?
2343 if (lockres)
2344 ocfs2_complete_lock_res_refresh(lockres, 0);
2345 goto bail;
2346 }
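//(It probably can be traced; most likely the branch above is simply not taken in the traced path.
// The trace shows ocfs2_inode_lock_update() being called, which only happens when I_NEW is not set,
// so ocfs2_complete_lock_res_refresh() was never reached there.)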
2347
2348 /* This is fun. The caller may want a bh back, or it may
2349 * not. ocfs2_inode_lock_update definitely wants one in, but
2350 * may or may not read one, depending on what's in the
2351 * LVB. The result of all of this is that we've *only* gone to
2352 * disk if we have to, so the complexity is worthwhile. */
//With the inode lock held, this first drops the cached inode metadata and then calls ocfs2_refresh_inode_from_lvb() to update some key inode fields;
//the LVB is said to be propagated between nodes over the network.
2353 status = ocfs2_inode_lock_update(inode, &local_bh);
2354 if (status < 0) {
2355 if (status != -ENOENT)
2356 mlog_errno(status);
2357 goto bail;
2358 }
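//(The LVB here is the lock value block, a small blob of data carried along with the DLM lock grant.
// OCFS2 packs hot inode fields such as i_size, link count and timestamps into it, so a node can often
// refresh its cached inode without going back to disk -- hence the "only gone to disk if we have to"
// comment above.)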
2359
2360 if (ret_bh) {
2361 status = ocfs2_assign_bh(inode, ret_bh, local_bh);
2362 if (status < 0) {
2363 mlog_errno(status);
2364 goto bail;
2365 }
2366 }
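//(ocfs2_assign_bh() presumably hands back either the buffer_head that ocfs2_inode_lock_update()
// already read, taking an extra reference on it, or, failing that, reads the inode block itself.)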
2367
2368 bail:
2369 if (status < 0) {
2370 if (ret_bh && (*ret_bh)) {
2371 brelse(*ret_bh);
2372 *ret_bh = NULL;
2373 }
2374 if (acquired)
2375 ocfs2_inode_unlock(inode, ex);
2376 }
2377
2378 if (local_bh)
2379 brelse(local_bh);
2380
2381 return status;
2382 }
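To put this entry point in context, here is a minimal, hypothetical sketch of the usual calling pattern. The function example_read_inode_meta() is made up for illustration and assumes the in-tree ocfs2 headers; ocfs2_inode_lock() is the wrapper that passes arg_flags = 0 and subclass = OI_LS_NORMAL to the function above, and ocfs2_inode_unlock() drops the cluster lock again:

/* Hypothetical caller; error paths trimmed for brevity. */
static int example_read_inode_meta(struct inode *inode)
{
        struct buffer_head *bh = NULL;
        int status;

        /* Take the per-inode META lock in shared (PR) mode and get back a
         * validated copy of the on-disk inode block. */
        status = ocfs2_inode_lock(inode, &bh, 0);
        if (status < 0) {
                mlog_errno(status);
                return status;
        }

        /* ... inspect the struct ocfs2_dinode in bh->b_data here ... */

        brelse(bh);
        ocfs2_inode_unlock(inode, 0);
        return 0;
}

Every caller pays the __ocfs2_cluster_lock() cost shown in the trace each time it takes the lock at a level it does not already hold, which is why this path dominates the profile on a contended cluster mount.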