先来看下ftrace, 时间基本上花在了ocfs2_write_begin()函数中。
1) | generic_perform_write() {
1) ! 12184.13 us | ocfs2_write_begin();
1) 0.341 us | iov_iter_copy_from_user_atomic();
1) + 11.759 us | ocfs2_write_end();
1) ! 12198.37 us | }
再作简单分析:
2577 static ssize_t generic_perform_write(struct file *file,
2578 struct iov_iter *i, loff_t pos)
2579 {
2580 struct address_space *mapping = file->f_mapping;
2581 const struct address_space_operations *a_ops = mapping->a_ops;
2582 long status = 0;
2583 ssize_t written = 0;
2584 unsigned int flags = 0;
2585
// (please ignore this part)
2586 /*
2587 * Copies from kernel address space cannot fail (NFSD is a big user).
2588 */
2589 if (segment_eq(get_fs(), KERNEL_DS))
2590 flags |= AOP_FLAG_UNINTERRUPTIBLE;
2591
// the body of the do{}while() is bound to be a regular, repeating pattern
2592 do {
2593 struct page *page;
2594 unsigned long offset; /* Offset into pagecache page */
2595 unsigned long bytes; /* Bytes to write to page */
2596 size_t copied; /* Bytes copied from user */
2597 void *fsdata;
2598
// offset: pos is the file pointer ("pointer" here meaning "position"); viewing the file in
// page-sized units, offset is where the file's tail falls within the last page
// (more precisely: pos's byte offset inside whatever page pos lands in);
// bytes: the number of bytes this iteration intends to write -- if the free part of the
// last page cannot hold the first iovec segment, fill up the tail page first;
2599 offset = (pos & (PAGE_CACHE_SIZE - 1));
2600 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2601 iov_iter_count(i));
2602
2603 again:
2604
2605 /*
2606 * Bring in the user page that we will copy from _first_.
2607 * Otherwise there's a nasty deadlock on copying from the
2608 * same page as we're writing to, without it being marked
2609 * up-to-date.
2610 *
2611 * Not only is this an optimisation, but it is also required
2612 * to check that the address is actually valid, when atomic
2613 * usercopies are used, below.
2614 */
// skipping over all the unlikely() branches ;-)
2615 if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
2616 status = -EFAULT;
2617 break;
2618 }
2619
// write_begin() will be analyzed separately later; just a few quick notes for now. The
// write_begin callback here is ocfs2_write_begin(); the ocfs2_* path calls ocfs2_inode_lock
// to take an EX cluster lock -- a heavyweight lock that invalidates the inode pagecache on
// the other cluster nodes; it also does down_write on ip_alloc_sem to prevent ->readpage()
// and space allocation from running concurrently in the upcoming ocfs2_write().
2620 status = a_ops->write_begin(file, mapping, pos, bytes, flags,
2621 &page, &fsdata);
2622 if (unlikely(status))
2623 break;
2624
// If a user process has mmap()'d the file with a shared mapping and happens to be operating
// on this same page, flush the pagecache first; and before touching the pagecache, flush
// the CPU data cache out to memory;
2625 if (mapping_writably_mapped(mapping))
2626 flush_dcache_page(page);
2627
// copy from user ;-)
2628 pagefault_disable();
2629 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
2630 pagefault_enable();
2631 flush_dcache_page(page);
2632
2633 mark_page_accessed(page);
// The counterpart of write_begin: calls block_commit_write() to send the write request down
// to the block layer, updates the inode/dinode accounting, commits the transaction via
// ocfs2_commit_trans(), and tears down the write helper structures
2634 status = a_ops->write_end(file, mapping, pos, bytes, copied,
2635 page, fsdata);
2636 if (unlikely(status < 0))
2637 break;
2638 copied = status;
2639
2640 cond_resched();
2641
2642 iov_iter_advance(i, copied);
2643 if (unlikely(copied == 0)) {
2644 /*
2645 * If we were unable to copy any data at all, we must
2646 * fall back to a single segment length write.
2647 *
2648 * If we didn't fallback here, we could livelock
2649 * because not all segments in the iov can be copied at
2650 * once without a pagefault.
2651 */
2652 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2653 iov_iter_single_seg_count(i));
2654 goto again;
2655 }
2656 pos += copied;
2657 written += copied;
2658
// This function calls balance_dirty_pages_ratelimited_cr(mapping, 1); that "1" shows the
// while loop handles only one page per iteration; the function periodically checks how
// dirty memory has become and kicks off writeback;
2659 balance_dirty_pages_ratelimited(mapping);
2660
2661 } while (iov_iter_count(i));
2662
2663 return written ? written : status;
2664 }