As described in the trace event chapter, every kind of trace follows the same basic flow:
- Function instrumentation. Use some instrumentation mechanism to insert your trace function at the probe points to be traced;
- Trace data input. When a probe function is hit, data is stored into the ringbuffer; this stage also covers the filter and trigger features;
- Trace data output. The user or a program reads the trace data back out, either as raw data or in a human-readable form; data is parsed according to the principle that whoever stores the data provides the rules for parsing it.
The tracers, headed by the function tracer, follow this same flow.
1、function tracer
The biggest difficulty in the function tracer is managing the huge number of probe points under dynamic ftrace (dynamic ftrace is assumed throughout this article unless stated otherwise).
root:/d/tracing # cat available_filter_functions | wc -l
49099
root:/d/tracing #
root:/d/tracing # cat available_events | wc -l
987
On this ordinary system the function tracer has 49099 probe points while trace events have 987: the function tracer has almost two orders of magnitude more probe points than trace events. Moreover, every trace event probe point is controlled independently, whereas the function tracer's probe points are controlled together by default (they can also be controlled separately through set_ftrace_filter and set_ftrace_notrace).
With such a huge number of probe points, controlling each one independently would consume a lot of memory, while controlling them all together raises the question of how to implement per-function filters and commands. And if the probe functions themselves are not lean, the overhead multiplies quickly. If you are interested in this part you can jump directly to section 1.2.2, dynamic management of probe points.
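For reference, the per-probe-point bookkeeping discussed in 1.2.2 is one small record per "bl _mcount" site, roughly the following (recalled from include/linux/ftrace.h; the exact layout varies by kernel version and architecture, so treat this as a sketch):

/* Approximate per-probe-point record; ~49000 of these exist on the system above. */
struct dyn_arch_ftrace { };		/* arm64 needs no per-site arch data        */

struct dyn_ftrace {
	unsigned long		ip;	/* address of the "bl _mcount" site         */
	unsigned long		flags;	/* state bits + reference count (see 1.2.2) */
	struct dyn_arch_ftrace	arch;
};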
1.1、Instrumentation principle
1.1.1、_mcount()
As is well known, the function tracer relies on the _mcount() function for instrumentation. When gcc is given the "-pg" option, it inserts the following sequence at the entry of every function:
function:
...
mov x0, x30
bl _mcount
[function's body ...]
The default _mcount() function is a no-op:
ENTRY(_mcount)
ret
ENDPROC(_mcount)
The call to _mcount() inserted at every function entry is the instrumentation hook that gcc provides. We can redefine the body of _mcount() to run whatever we want.
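To get a feel for compiler-inserted instrumentation without touching the kernel, here is a minimal user-space sketch. It uses gcc's -finstrument-functions hooks rather than -pg/_mcount (an analogous but different mechanism), so treat it as an illustration of the idea only; __cyg_profile_func_enter/exit and the no_instrument_function attribute are standard gcc features.

/* Build with: gcc -finstrument-functions demo.c -o demo
 * A user-space analog of _mcount-style instrumentation: the compiler inserts
 * a call at every function entry/exit, and we supply the hook bodies. */
#include <stdio.h>

/* The hooks themselves must not be instrumented, or they would call
 * themselves forever - the same reason kernel notrace functions are
 * excluded from ftrace. */
__attribute__((no_instrument_function))
void __cyg_profile_func_enter(void *this_fn, void *call_site)
{
	fprintf(stderr, "enter %p (called from %p)\n", this_fn, call_site);
}

__attribute__((no_instrument_function))
void __cyg_profile_func_exit(void *this_fn, void *call_site)
{
	fprintf(stderr, "exit  %p\n", this_fn);
}

static int add(int a, int b)
{
	return a + b;
}

int main(void)
{
	return add(1, 2) == 3 ? 0 : 1;
}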
1.1.2、static ftrace instrumentation
Static ftrace implements instrumentation exactly this way, by redefining the _mcount() function:
arch/arm64/kernel/entry-ftrace.S:
#ifndef CONFIG_DYNAMIC_FTRACE
/*
* void _mcount(unsigned long return_address)
* @return_address: return address to instrumented function
*
* This function makes calls, if enabled, to:
* - tracer function to probe instrumented function's entry,
* - ftrace_graph_caller to set up an exit hook
*/
ENTRY(_mcount)
mcount_enter
adrp x0, ftrace_trace_function
ldr x2, [x0, #:lo12:ftrace_trace_function]
adr x0, ftrace_stub
cmp x0, x2 // if (ftrace_trace_function
b.eq skip_ftrace_call // != ftrace_stub) {
mcount_get_pc x0 // function's pc
mcount_get_lr x1 // function's lr (= parent's pc)
blr x2 // (*ftrace_trace_function)(pc, lr);
#ifndef CONFIG_FUNCTION_GRAPH_TRACER
skip_ftrace_call: // return;
mcount_exit // }
#else
mcount_exit // return;
// }
skip_ftrace_call:
adrp x1, ftrace_graph_return
ldr x2, [x1, #:lo12:ftrace_graph_return]
cmp x0, x2 // if ((ftrace_graph_return
b.ne ftrace_graph_caller // != ftrace_stub)
adrp x1, ftrace_graph_entry // || (ftrace_graph_entry
adrp x0, ftrace_graph_entry_stub // != ftrace_graph_entry_stub))
ldr x2, [x1, #:lo12:ftrace_graph_entry]
add x0, x0, #:lo12:ftrace_graph_entry_stub
cmp x0, x2
b.ne ftrace_graph_caller // ftrace_graph_caller();
mcount_exit
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
ENDPROC(_mcount)
#else /* CONFIG_DYNAMIC_FTRACE */
The corresponding pseudocode is:
void _mcount(void)
{
/* save any bare state needed in order to do initial checking */
/* (1) If the function pointer ftrace_trace_function is not the default stub ftrace_stub,
call the function tracer callback: ftrace_trace_function()
*/
extern void (*ftrace_trace_function)(unsigned long, unsigned long);
if (ftrace_trace_function != ftrace_stub)
goto do_trace;
/* (2) If (ftrace_trace_function == ftrace_stub) &&
(ftrace_graph_return != ftrace_stub ||
ftrace_graph_entry != ftrace_graph_entry_stub),
call the function_graph tracer callback: ftrace_graph_caller()
*/
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
extern void (*ftrace_graph_return)(...);
extern void (*ftrace_graph_entry)(...);
if (ftrace_graph_return != ftrace_stub ||
ftrace_graph_entry != ftrace_graph_entry_stub)
ftrace_graph_caller();
+#endif
/* restore any bare state */
return;
do_trace:
/* save all state needed by the ABI (see paragraph above) */
unsigned long frompc = ...;
unsigned long selfpc = <return address> - MCOUNT_INSN_SIZE;
/* (1.1) Call the function tracer callback: ftrace_trace_function() */
ftrace_trace_function(frompc, selfpc);
/* restore all state needed by the ABI */
}
Question: under static ftrace, how do we make sure that notrace functions are not traced?
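A brief note on the answer, for reference: in the kernel, notrace is a function attribute that tells gcc not to emit the _mcount call for that function at all, so the question is settled at compile time. Roughly, from include/linux/compiler.h (the exact location varies by kernel version):

/* Functions marked notrace never get the "bl _mcount" prologue,
 * so neither static nor dynamic ftrace can ever hook them. */
#define notrace __attribute__((no_instrument_function))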
1.1.3、dynamic ftrace instrumentation
Once static ftrace is enabled, every function in the kernel (apart from notrace, inline, and other special functions) is instrumented. The performance cost of this is staggering and could well drive people away from ftrace altogether.
To solve this problem, the kernel developers introduced dynamic ftrace. In practice a developer rarely needs to trace every function, only an interesting subset. Dynamic ftrace replaces the "bl _mcount" instruction at the entry of functions that are not being traced with "nop", which has essentially no performance impact; for functions that are being traced, the "bl _mcount" at the entry is replaced with a call to the desired handler.
1、When ftrace initializes, every "bl _mcount" instrumentation site recorded at build time by the "scripts/recordmcount.pl" script is replaced with a "nop" instruction;
2、When a tracer is enabled, the "nop" at the probe sites of the functions to be traced is replaced with "bl ftrace_caller".
Why not use "bl _mcount" here? Mainly because the developers did not want to clutter _mcount() with macros distinguishing the two cases, so they simply created a new function, ftrace_caller().
arch/arm64/kernel/entry-ftrace.S:
#ifndef CONFIG_DYNAMIC_FTRACE
#else /* CONFIG_DYNAMIC_FTRACE */
/*
* _mcount() is used to build the kernel with -pg option, but all the branch
* instructions to _mcount() are replaced to NOP initially at kernel start up,
* and later on, NOP to branch to ftrace_caller() when enabled or branch to
* NOP when disabled per-function base.
*/
ENTRY(_mcount)
ret
ENDPROC(_mcount)
/*
* void ftrace_caller(unsigned long return_address)
* @return_address: return address to instrumented function
*
* This function is a counterpart of _mcount() in 'static' ftrace, and
* makes calls to:
* - tracer function to probe instrumented function's entry,
* - ftrace_graph_caller to set up an exit hook
*/
ENTRY(ftrace_caller)
mcount_enter
mcount_get_pc0 x0 // function's pc
mcount_get_lr x1 // function's lr
.global ftrace_call
ftrace_call: // tracer(pc, lr);
nop // This will be replaced with "bl xxx"
// where xxx can be any kind of tracer.
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
.global ftrace_graph_call
ftrace_graph_call: // ftrace_graph_caller();
nop // If enabled, this will be replaced
// "b ftrace_graph_caller"
#endif
mcount_exit
ENDPROC(ftrace_caller)
#endif /* CONFIG_DYNAMIC_FTRACE */
The corresponding pseudocode is:
void ftrace_caller(void)
{
/* save all state needed by the ABI (see paragraph above) */
unsigned long frompc = ...;
unsigned long selfpc = <return address> - MCOUNT_INSN_SIZE;
/* This will be replaced with "bl xxx" where xxx can be any kind of tracer. */
/* (1) The function tracer probe site.
The function pointer is gone; a nop instruction holds the place and is patched at runtime into "bl xxx".
*/
ftrace_call:
nop
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/* If enabled, this will be replaced "b ftrace_graph_caller" */
/* (2) The function_graph tracer probe site.
The function pointer is gone; a nop instruction holds the place and is patched at runtime into "b ftrace_graph_caller".
*/
ftrace_graph_call:
nop
#endif
/* restore all state needed by the ABI */
return;
}
1.2、Probe point management
1.2.1、Probe point initialization
1、At build time, recordmcount.pl collects every _mcount() call site and stores all the call-site addresses in the __mcount_loc section.
include/asm-generic/vmlinux.lds.h:
#ifdef CONFIG_FTRACE_MCOUNT_RECORD
#define MCOUNT_REC() . = ALIGN(8); \
VMLINUX_SYMBOL(__start_mcount_loc) = .; \
*(__mcount_loc) \
VMLINUX_SYMBOL(__stop_mcount_loc) = .;
#else
#define MCOUNT_REC()
#endif
I have not studied its inner workings in detail; if you are interested, look at "scripts/recordmcount.pl" and "scripts/recordmcount.c".
2、At initialization time, the call-site addresses recorded in the __mcount_loc section are walked, and by default every "bl _mcount" is replaced with "nop".
kernel/trace/ftrace.c:
void __init ftrace_init(void)
{
extern unsigned long __start_mcount_loc[];
extern unsigned long __stop_mcount_loc[];
unsigned long count, flags;
int ret;
local_irq_save(flags);
ret = ftrace_dyn_arch_init();
local_irq_restore(flags);
if (ret)
goto failed;
count = __stop_mcount_loc - __start_mcount_loc;
if (!count) {
pr_info("ftrace: No functions to be traced?\n");
goto failed;
}
pr_info("ftrace: allocating %ld entries in %ld pages\n",
count, count / ENTRIES_PER_PAGE + 1);
last_ftrace_enabled = ftrace_enabled = 1;
/* Walk the __mcount_loc section and process the call-site addresses stored there */
ret = ftrace_process_locs(NULL,
__start_mcount_loc,
__stop_mcount_loc);
ret = register_module_notifier(&ftrace_module_exit_nb);
if (ret)
pr_warning("Failed to register trace ftrace module exit notifier\n");
set_ftrace_early_filters();
return;
failed:
ftrace_disabled = 1;
}
|→
static int ftrace_process_locs(struct module *mod,
unsigned long *start,
unsigned long *end)
{
struct ftrace_page *start_pg;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
unsigned long count;
unsigned long *p;
unsigned long addr;
unsigned long flags = 0; /* Shut up gcc */
int ret = -ENOMEM;
count = end - start;
if (!count)
return 0;
/* (1) Sort the addresses */
sort(start, count, sizeof(*start),
ftrace_cmp_ips, NULL);
/* (2) Allocate a new dyn_ftrace structure for each address.
The __mcount_loc section only stores the raw unsigned long call address;
besides keeping the address in ->ip, the dyn_ftrace structure also uses ->flags to hold the current state and the reference count.
*/
start_pg = ftrace_allocate_pages(count);
if (!start_pg)
return -ENOMEM;
mutex_lock(&ftrace_lock);
/*
* Core and each module needs their own pages, as
* modules will free them when they are removed.
* Force a new page to be allocated for modules.
*/
if (!mod) {
WARN_ON(ftrace_pages || ftrace_pages_start);
/* First initialization */
ftrace_pages = ftrace_pages_start = start_pg;
} else {
if (!ftrace_pages)
goto out;
if (WARN_ON(ftrace_pages->next)) {
/* Hmm, we have free pages? */
while (ftrace_pages->next)
ftrace_pages = ftrace_pages->next;
}
ftrace_pages->next = start_pg;
}
/* (3) Fill in the ->ip field of the new dyn_ftrace structures */
p = start;
pg = start_pg;
while (p < end) {
addr = ftrace_call_adjust(*p++);
/*
* Some architecture linkers will pad between
* the different mcount_loc sections of different
* object files to satisfy alignments.
* Skip any NULL pointers.
*/
if (!addr)
continue;
if (pg->index == pg->size) {
/* We should have allocated enough */
if (WARN_ON(!pg->next))
break;
pg = pg->next;
}
rec = &pg->records[pg->index++];
rec->ip = addr;
}
/* We should have used all pages */
WARN_ON(pg->next);
/* Assign the last page to ftrace_pages */
ftrace_pages = pg;
/*
* We only need to disable interrupts on start up
* because we are modifying code that an interrupt
* may execute, and the modification is not atomic.
* But for modules, nothing runs the code we modify
* until we are finished with it, and there's no
* reason to cause large interrupt latencies while we do it.
*/
if (!mod)
local_irq_save(flags);
/* (4) Fill in the ->flags field of the new dyn_ftrace structures.
By default every call site is patched to a "nop" instruction.
*/
ftrace_update_code(mod, start_pg);
if (!mod)
local_irq_restore(flags);
ret = 0;
out:
mutex_unlock(&ftrace_lock);
return ret;
}
||→
static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
{
struct ftrace_page *pg;
struct dyn_ftrace *p;
cycle_t start, stop;
unsigned long update_cnt = 0;
unsigned long ref = 0;
bool test = false;
int i;
/*
* When adding a module, we need to check if tracers are
* currently enabled and if they are set to trace all functions.
* If they are, we need to enable the module functions as well
* as update the reference counts for those function records.
*/
if (mod) {
struct ftrace_ops *ops;
for (ops = ftrace_ops_list;
ops != &ftrace_list_end; ops = ops->next) {
if (ops->flags & FTRACE_OPS_FL_ENABLED) {
if (ops_traces_mod(ops))
ref++;
else
test = true;
}
}
}
start = ftrace_now(raw_smp_processor_id());
/* (4.1) Walk the dyn_ftrace structures one by one */
for (pg = new_pgs; pg; pg = pg->next) {
for (i = 0; i < pg->index; i++) {
int cnt = ref;
/* If something went wrong, bail without enabling anything */
if (unlikely(ftrace_disabled))
return -1;
p = &pg->records[i];
/* (4.2) Based on how many filter operations reference this call-site address,
initialize the reference-count part of the dyn_ftrace->flags field.
*/
if (test)
cnt += referenced_filters(p);
p->flags = cnt;
/*
* Do the initial record conversion from mcount jump
* to the NOP instructions.
*/
/* (4.3) By default, patch every call site to "nop" */
if (!ftrace_code_disable(mod, p))
break;
update_cnt++;
/*
* If the tracing is enabled, go ahead and enable the record.
*
* The reason not to enable the record immediatelly is the
* inherent check of ftrace_make_nop/ftrace_make_call for
* correct previous instructions. Making first the NOP
* conversion puts the module to the correct state, thus
* passing the ftrace_make_call check.
*/
if (ftrace_start_up && cnt) {
/* (4.4) If tracing is already enabled and the reference count is non-zero,
enable this call site.
*/
int failed = __ftrace_replace_code(p, 1);
if (failed)
ftrace_bug(failed, p);
}
}
}
stop = ftrace_now(raw_smp_processor_id());
ftrace_update_time = stop - start;
ftrace_update_tot_cnt += update_cnt;
return 0;
}
1.2.2、Dynamic management of probe points
To deal with the problem raised at the beginning of this article, the kernel combines per-probe control lists with hash tables of commands.
The function tracer's dynamic probe-point management works as follows:
- 1、Every "bl _mcount" probe point ip has a corresponding dyn_ftrace structure. The low 26 bits of dyn_ftrace->flags hold a reference count: whenever an ftrace_ops operates on that ip, the count is incremented. Once ref_cnt is greater than 0 the probe point has to be enabled, i.e. patched to "bl ftrace_caller";
- 2、An ftrace_ops expresses its references to ips through hash tables: each ftrace_ops has two hash tables, filter_hash and notrace_hash, and their combination determines which ips it references (see the sketch after this list). In function tracer mode there are two possible ftrace_ops, global_ops and trace_probe_ops; if either of them references an ip, that ip's level-1 probe point is patched to "bl ftrace_caller";
- 3、Writing "set_ftrace_filter/set_ftrace_notrace" configures global_ops' filter_hash/notrace_hash; this is how the function tracer's filter is configured;
- 4、Writing a command to "set_ftrace_filter" configures trace_probe_ops' filter_hash; this is how a filter command is configured, and the command together with its matching ips is also added to yet another hash table, ftrace_func_hash;
- 5、ftrace_caller() contains two level-2 probe points, ftrace_call and ftrace_graph_call. In function tracer mode only ftrace_call is set, to "bl ftrace_ops_list_func";
- 6、The job of ftrace_ops_list_func() is to walk the ftrace_ops_list and call each ftrace_ops->func() in turn;
- 7、global_ops' ->func() is function_trace_call(), which records the trace data: ip + parent_ip;
- 8、trace_probe_ops' ->func() is function_trace_probe_call(), which runs, one by one, the filter commands in ftrace_func_hash registered for the current ip.
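A minimal sketch of the hash decision described in item 2, based on my reading of the ftrace code (ops_references_ip and is_hashed are made up for illustration; the real helpers are ftrace_ops_test()/hash_contains_ip()): an empty filter_hash means "all functions", and notrace_hash always wins.

/* Sketch only: does this ftrace_ops want to hook the probe point at @ip?
 * - empty filter_hash      => every function is accepted
 * - non-empty filter_hash  => ip must be listed in it
 * - notrace_hash           => an ip listed here is always rejected
 */
#include <stdbool.h>

struct ftrace_hash;					/* opaque for this sketch      */
bool is_hashed(struct ftrace_hash *h, unsigned long ip); /* hypothetical lookup */
bool hash_empty(struct ftrace_hash *h);			/* hypothetical helper        */

static bool ops_references_ip(struct ftrace_hash *filter_hash,
			      struct ftrace_hash *notrace_hash,
			      unsigned long ip)
{
	if (!hash_empty(notrace_hash) && is_hashed(notrace_hash, ip))
		return false;			/* explicitly excluded          */
	if (hash_empty(filter_hash))
		return true;			/* no filter: trace everything  */
	return is_hashed(filter_hash, ip);	/* filter present: must match   */
}
/* Conceptually, the per-probe dyn_ftrace ref count is the number of registered
 * ftrace_ops for which ops_references_ip() is true; ref > 0 means the site is
 * patched from nop to "bl ftrace_caller". */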
The following scenarios all involve dynamic management of the probe points (a small programmatic example follows the list):
- Enabling a tracer. When "echo xxx_tracer > current_tracer" is used, the old current tracer is shut down and the new one is enabled. Typical examples are the function tracer and the function_graph tracer;
- Configuring the filter. With "echo function_name > set_ftrace_filter/set_ftrace_notrace", only some functions are traced instead of all of them;
- Configuring a filter command. With something like "echo '!__schedule_bug:traceoff' > set_ftrace_filter", a command can be set up that runs once its condition is met;
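For completeness, here is a minimal sketch of driving the same interfaces from a program rather than from the shell. It only assumes tracefs is mounted at /sys/kernel/tracing (on the Android system shown earlier it is under /d/tracing) and that the program runs as root; the filter patterns are arbitrary examples.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write one value into a tracefs control file. */
static int write_tracefs(const char *file, const char *val)
{
	char path[256];
	int fd, ret = 0;

	snprintf(path, sizeof(path), "/sys/kernel/tracing/%s", file);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0)
		ret = -1;
	close(fd);
	return ret;
}

int main(void)
{
	/* filter: trace only schedule* (fills global_ops' filter_hash) */
	write_tracefs("set_ftrace_filter", "schedule*");
	/* filter command: stop tracing when schedule_timeout is hit
	 * (registered with trace_probe_ops / ftrace_func_hash) */
	write_tracefs("set_ftrace_filter", "schedule_timeout:traceoff");
	/* switching current_tracer triggers the probe-point patching */
	write_tracefs("current_tracer", "function");
	write_tracefs("tracing_on", "1");
	return 0;
}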
Let us now look at the code behind each of these scenarios:
1.2.3、Tracer registration
1、Tracer registration. A tracer is registered with register_tracer().
Taking the function tracer as an example, kernel/trace/trace_functions.c:
static struct tracer function_trace __tracer_data =
{
.name = "function",
.init = function_trace_init,
.reset = function_trace_reset,
.start = function_trace_start,
.flags = &func_flags,
.set_flag = func_set_flag,
.allow_instances = true,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_function,
#endif
};
static __init int init_function_trace(void)
{
init_func_cmd_traceon();
return register_tracer(&function_trace);
}
The implementation of register_tracer(), in kernel/trace/trace.c:
int __init register_tracer(struct tracer *type)
{
struct tracer *t;
int ret = 0;
if (!type->name) {
pr_info("Tracer must have a name\n");
return -1;
}
if (strlen(type->name) >= MAX_TRACER_SIZE) {
pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
return -1;
}
mutex_lock(&trace_types_lock);
tracing_selftest_running = true;
/* (1) If the tracer is already registered, return immediately */
for (t = trace_types; t; t = t->next) {
if (strcmp(type->name, t->name) == 0) {
/* already found */
pr_info("Tracer %s already registered\n",
type->name);
ret = -1;
goto out;
}
}
/* (2) The trace_options associated with this tracer */
if (!type->set_flag)
type->set_flag = &dummy_set_flag;
if (!type->flags)
type->flags = &dummy_tracer_flags;
else
if (!type->flags->opts)
type->flags->opts = dummy_tracer_opt;
ret = run_tracer_selftest(type);
if (ret < 0)
goto out;
/* (3) Add the new tracer to the trace_types list */
type->next = trace_types;
trace_types = type;
add_tracer_options(&global_trace, type);
out:
tracing_selftest_running = false;
mutex_unlock(&trace_types_lock);
if (ret || !default_bootup_tracer)
goto out_unlock;
if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE))
goto out_unlock;
printk(KERN_INFO "Starting tracer '%s'\n", type->name);
/* Do we want this tracer to start on bootup? */
/* (4) Set the default boot tracer */
tracing_set_tracer(&global_trace, type->name);
default_bootup_tracer = NULL;
apply_trace_boot_options();
/* disable other selftests, since this will break it. */
tracing_selftest_disabled = true;
#ifdef CONFIG_FTRACE_STARTUP_TEST
printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
type->name);
#endif
out_unlock:
return ret;
}
1.2.4、Enabling a tracer
2、Enabling a tracer. We can enable or switch tracers with "echo function > current_tracer".
Let us look at the code:
trace_create_file("current_tracer", 0644, d_tracer,
tr, &set_tracer_fops);
↓
static const struct file_operations set_tracer_fops = {
.open = tracing_open_generic,
.read = tracing_set_trace_read,
.write = tracing_set_trace_write,
.llseek = generic_file_llseek,
};
↓
static ssize_t
tracing_set_trace_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
/* (1) The trace buffer this file belongs to, global_trace by default.
At initialization time:
init_tracer_tracefs(&global_trace, d_tracer); ->
trace_create_file("current_tracer", 0644, d_tracer, tr, &set_tracer_fops);
*/
struct trace_array *tr = filp->private_data;
char buf[MAX_TRACER_SIZE+1];
int i;
size_t ret;
int err;
ret = cnt;
if (cnt > MAX_TRACER_SIZE)
cnt = MAX_TRACER_SIZE;
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
buf[cnt] = 0;
/* strip ending whitespace. */
for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
buf[i] = 0;
/* (2) Enable the new tracer */
err = tracing_set_tracer(tr, buf);
if (err)
return err;
*ppos += ret;
return ret;
}
|→
static int tracing_set_tracer(struct trace_array *tr, const char *buf)
{
struct tracer *t;
#ifdef CONFIG_TRACER_MAX_TRACE
bool had_max_tr;
#endif
int ret = 0;
mutex_lock(&trace_types_lock);
if (!ring_buffer_expanded) {
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
if (ret < 0)
goto out;
ret = 0;
}
/* (2.1) Look the tracer up by name in the trace_types list */
for (t = trace_types; t; t = t->next) {
if (strcmp(t->name, buf) == 0)
break;
}
if (!t) {
ret = -EINVAL;
goto out;
}
if (t == tr->current_trace)
goto out;
/* Some tracers are only allowed for the top level buffer */
if (!trace_ok_for_array(t, tr)) {
ret = -EINVAL;
goto out;
}
/* If trace pipe files are being read, we can't change the tracer */
if (tr->current_trace->ref) {
ret = -EBUSY;
goto out;
}
trace_branch_disable();
/* (2.2) Disable the old current tracer */
tr->current_trace->enabled--;
if (tr->current_trace->reset)
tr->current_trace->reset(tr);
/* Current trace needs to be nop_trace before synchronize_sched */
tr->current_trace = &nop_trace;
#ifdef CONFIG_TRACER_MAX_TRACE
had_max_tr = tr->allocated_snapshot;
if (had_max_tr && !t->use_max_tr) {
/*
* We need to make sure that the update_max_tr sees that
* current_trace changed to nop_trace to keep it from
* swapping the buffers after we resize it.
* The update_max_tr is called from interrupts disabled
* so a synchronized_sched() is sufficient.
*/
synchronize_sched();
free_snapshot(tr);
}
#endif
#ifdef CONFIG_TRACER_MAX_TRACE
if (t->use_max_tr && !had_max_tr) {
ret = alloc_snapshot(tr);
if (ret < 0)
goto out;
}
#endif
/* (2.3) Call the new tracer's init function */
if (t->init) {
ret = tracer_init(t, tr);
if (ret)
goto out;
}
/* (2.4) Make the new tracer the current tracer */
tr->current_trace = t;
tr->current_trace->enabled++;
trace_branch_enable(tr);
out:
mutex_unlock(&trace_types_lock);
return ret;
}
||→
int tracer_init(struct tracer *t, struct trace_array *tr)
{
tracing_reset_online_cpus(&tr->trace_buffer);
return t->init(tr);
}
1.2.5、Enabling the function tracer
3、Enabling the function tracer. In essence this means registering and then enabling global_ops.
At initialization time, global_trace's ->ops is set to global_ops.
start_kernel() -> trace_init() -> tracer_alloc_buffers:
tracer_alloc_buffers
{
ftrace_init_global_array_ops(&global_trace);
}
↓
__init void ftrace_init_global_array_ops(struct trace_array *tr)
{
tr->ops = &global_ops;
tr->ops->private = tr;
}
static struct ftrace_ops global_ops = {
.func = ftrace_stub,
.local_hash.notrace_hash = EMPTY_HASH,
.local_hash.filter_hash = EMPTY_HASH,
INIT_OPS_HASH(global_ops)
.flags = FTRACE_OPS_FL_RECURSION_SAFE |
FTRACE_OPS_FL_INITIALIZED |
FTRACE_OPS_FL_PID,
};
Continuing with t->init(): for the function tracer this ends up in function_trace_init():
static int function_trace_init(struct trace_array *tr)
{
ftrace_func_t func;
/*
* Instance trace_arrays get their ops allocated
* at instance creation. Unless it failed
* the allocation.
*/
/* (2.4.1) tr here is global_trace by default,
and tr->ops is global_ops by default
*/
if (!tr->ops)
return -ENOMEM;
/* (2.4.2) func is the function that saves the trace data into the ringbuffer */
/* Currently only the global instance can do stack tracing */
if (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
func_flags.val & TRACE_FUNC_OPT_STACK)
func = function_stack_trace_call;
else
func = function_trace_call;
/* (2.4.3) tr->ops->func = func */
ftrace_init_array_ops(tr, func);
tr->trace_buffer.cpu = get_cpu();
put_cpu();
tracing_start_cmdline_record();
/* (2.4.4) Start the function tracer:
add tr->ops to ftrace_ops_list
*/
tracing_start_function_trace(tr);
return 0;
}
↓
static void tracing_start_function_trace(struct trace_array *tr)
{
tr->function_enabled = 0;
register_ftrace_function(tr->ops);
tr->function_enabled = 1;
}
|→
int register_ftrace_function(struct ftrace_ops *ops)
{
int ret = -1;
/* (2.4.4.1) Initialize the hash tables:
ops->func_hash = &ops->local_hash;
*/
ftrace_ops_init(ops);
mutex_lock(&ftrace_lock);
/* (2.4.4.2) Add global_ops to ftrace_ops_list
and patch the various probe sites as needed
*/
ret = ftrace_startup(ops, 0);
mutex_unlock(&ftrace_lock);
return ret;
}
||→
static int ftrace_startup(struct ftrace_ops *ops, int command)
{
int ret;
if (unlikely(ftrace_disabled))
return -ENODEV;
/* (2.4.4.2.1) 1、Add global_ops to ftrace_ops_list.
2、Set the ftrace_trace_function pointer according to the state of ftrace_ops_list:
ftrace_ops_list empty: = ftrace_stub
ftrace_ops_list has one member: = ftrace_ops_get_list_func(ftrace_ops_list)
ftrace_ops_list has several members: = ftrace_ops_list_func
3、Update ftrace_graph_entry
*/
ret = __register_ftrace_function(ops);
if (ret)
return ret;
ftrace_start_up++;
command |= FTRACE_UPDATE_CALLS;
/*
* Note that ftrace probes uses this to start up
* and modify functions it will probe. But we still
* set the ADDING flag for modification, as probes
* do not have trampolines. If they add them in the
* future, then the probes will need to distinguish
* between adding and updating probes.
*/
ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING;
/* (2.4.4.2.2) Walk all the _mcount probe points (ftrace_rec)
and, based on how each ip moves between the old and new hash tables, set FTRACE_FL_IPMODIFY in rec->flags
*/
ret = ftrace_hash_ipmodify_enable(ops);
if (ret < 0) {
/* Rollback registration process */
__unregister_ftrace_function(ops);
ftrace_start_up--;
ops->flags &= ~FTRACE_OPS_FL_ENABLED;
return ret;
}
/* (2.4.4.2.3) Walk all the _mcount probe points (ftrace_rec)
and, depending on whether filter_hash/notrace_hash match the ip, increment/decrement the ref_cnt in rec->flags
*/
ftrace_hash_rec_enable(ops, 1);
/* (2.4.4.2.4) Update the probe points:
if FTRACE_UPDATE_CALLS is set, update the _mcount probe points: sites with ref_cnt > 0 are patched to ftrace_caller()
if FTRACE_UPDATE_TRACE_FUNC is set, update the ftrace_call probe point to the function ftrace_trace_function points to
if FTRACE_START_FUNC_RET is set, update the ftrace_graph_call probe point to ftrace_graph_caller()
*/
ftrace_startup_enable(command);
ops->flags &= ~FTRACE_OPS_FL_ADDING;
return 0;
}
↓
ftrace_startup_enable() -> ftrace_run_update_code() -> arch_ftrace_update_code() -> ftrace_modify_all_code() ->
void ftrace_modify_all_code(int command)
{
int update = command & FTRACE_UPDATE_TRACE_FUNC;
int err = 0;
/*
* If the ftrace_caller calls a ftrace_ops func directly,
* we need to make sure that it only traces functions it
* expects to trace. When doing the switch of functions,
* we need to update to the ftrace_ops_list_func first
* before the transition between old and new calls are set,
* as the ftrace_ops_list_func will check the ops hashes
* to make sure the ops are having the right functions
* traced.
*/
/* (2.4.4.2.4.1) If FTRACE_UPDATE_TRACE_FUNC is set, for the ftrace_call probe point:
calling one particular ftrace_ops from ftrace_ops_list directly needs care,
so to be safe the default is ftrace_ops_list_func(), which iterates over every ftrace_ops in ftrace_ops_list
*/
if (update) {
err = ftrace_update_ftrace_func(ftrace_ops_list_func);
if (FTRACE_WARN_ON(err))
return;
}
/* (2.4.4.2.4.2) If FTRACE_UPDATE_CALLS is set, for the _mcount probe points:
walk all ftrace_rec entries and patch the sites with ref_cnt > 0 to ftrace_caller()
*/
if (command & FTRACE_UPDATE_CALLS)
ftrace_replace_code(1);
else if (command & FTRACE_DISABLE_CALLS)
ftrace_replace_code(0);
/* (2.4.4.2.4.3) If FTRACE_UPDATE_TRACE_FUNC is set, for the ftrace_call probe point:
if ftrace_trace_function really is different from ftrace_ops_list_func(),
patch it to the function ftrace_trace_function points to
*/
if (update && ftrace_trace_function != ftrace_ops_list_func) {
function_trace_op = set_function_trace_op;
smp_wmb();
/* If irqs are disabled, we are in stop machine */
if (!irqs_disabled())
smp_call_function(ftrace_sync_ipi, NULL, 1);
err = ftrace_update_ftrace_func(ftrace_trace_function);
if (FTRACE_WARN_ON(err))
return;
}
/* (2.4.4.2.4.4) If FTRACE_START_FUNC_RET is set, patch the ftrace_graph_call probe point
to ftrace_graph_caller()
*/
if (command & FTRACE_START_FUNC_RET)
err = ftrace_enable_ftrace_graph_caller();
else if (command & FTRACE_STOP_FUNC_RET)
err = ftrace_disable_ftrace_graph_caller();
FTRACE_WARN_ON(err);
}
↓
void __weak ftrace_replace_code(int enable)
{
struct dyn_ftrace *rec;
struct ftrace_page *pg;
int failed;
if (unlikely(ftrace_disabled))
return;
/* Walk the ftrace_rec entries */
do_for_each_ftrace_rec(pg, rec) {
failed = __ftrace_replace_code(rec, enable);
if (failed) {
ftrace_bug(failed, rec);
/* Stop processing */
return;
}
} while_for_each_ftrace_rec();
}
↓
static int
__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
{
unsigned long ftrace_old_addr;
unsigned long ftrace_addr;
int ret;
/* (2.4.4.2.4.2.1) The target address of an enabled _mcount probe point is:
ops->trampoline or ftrace_caller
*/
ftrace_addr = ftrace_get_addr_new(rec);
/* This needs to be done before we call ftrace_update_record */
ftrace_old_addr = ftrace_get_addr_curr(rec);
/* (2.4.4.2.4.2.2) From the flags and refcnt in rec->flags, plus enable,
derive the operation: FTRACE_UPDATE_IGNORE/FTRACE_UPDATE_MAKE_CALL/FTRACE_UPDATE_MAKE_NOP
*/
ret = ftrace_update_record(rec, enable);
/* (2.4.4.2.4.2.3) Patch the instruction at the _mcount probe point into a branch to the new probe function */
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
case FTRACE_UPDATE_MAKE_CALL:
return ftrace_make_call(rec, ftrace_addr);
case FTRACE_UPDATE_MAKE_NOP:
return ftrace_make_nop(NULL, rec, ftrace_old_addr);
case FTRACE_UPDATE_MODIFY_CALL:
return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
}
return -1; /* unknow ftrace bug */
}
1.2.6、Configuring the function tracer filter
4、Setting the function tracer's filter through "set_ftrace_filter" and "set_ftrace_notrace". In essence this manipulates global_ops' filter_hash and notrace_hash.
/* global_ops is registered as inode->i_private */
ftrace_create_filter_files(&global_ops, d_tracer);
↓
void ftrace_create_filter_files(struct ftrace_ops *ops,
struct dentry *parent)
{
trace_create_file("set_ftrace_filter", 0644, parent,
ops, &ftrace_filter_fops);
trace_create_file("set_ftrace_notrace", 0644, parent,
ops, &ftrace_notrace_fops);
}
static const struct file_operations ftrace_filter_fops = {
.open = ftrace_filter_open,
.read = seq_read,
.write = ftrace_filter_write,
.llseek = tracing_lseek,
.release = ftrace_regex_release,
};
The file operations behind "set_ftrace_filter" and "set_ftrace_notrace" use a trick: on open, a temporary hash table iter->hash is allocated and the contents of global_ops' filter_hash/notrace_hash are copied into it; the write operation that actually sets the filter works on iter->hash; and on close, the new iter->hash is used to update global_ops.
The backup of the global_ops hash table in the open file operation:
static int
ftrace_filter_open(struct inode *inode, struct file *file)
{
/* (1) Get the global_ops structure */
struct ftrace_ops *ops = inode->i_private;
/* (2) */
return ftrace_regex_open(ops,
FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH,
inode, file);
}
↓
int
ftrace_regex_open(struct ftrace_ops *ops, int flag,
struct inode *inode, struct file *file)
{
struct ftrace_iterator *iter;
struct ftrace_hash *hash;
int ret = 0;
ftrace_ops_init(ops);
if (unlikely(ftrace_disabled))
return -ENODEV;
iter = kzalloc(sizeof(*iter), GFP_KERNEL);
if (!iter)
return -ENOMEM;
if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) {
kfree(iter);
return -ENOMEM;
}
iter->ops = ops;
iter->flags = flag;
mutex_lock(&ops->func_hash->regex_lock);
if (flag & FTRACE_ITER_NOTRACE)
hash = ops->func_hash->notrace_hash;
else
hash = ops->func_hash->filter_hash;
if (file->f_mode & FMODE_WRITE) {
const int size_bits = FTRACE_HASH_DEFAULT_BITS;
/* (2.1) Back up global_ops' filter_hash/notrace_hash into iter->hash */
if (file->f_flags & O_TRUNC)
iter->hash = alloc_ftrace_hash(size_bits);
else
iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash);
if (!iter->hash) {
trace_parser_put(&iter->parser);
kfree(iter);
ret = -ENOMEM;
goto out_unlock;
}
}
if (file->f_mode & FMODE_READ) {
iter->pg = ftrace_pages_start;
ret = seq_open(file, &show_ftrace_seq_ops);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = iter;
} else {
/* Failed */
free_ftrace_hash(iter->hash);
trace_parser_put(&iter->parser);
kfree(iter);
}
} else
file->private_data = iter;
out_unlock:
mutex_unlock(&ops->func_hash->regex_lock);
return ret;
}
The update of iter->hash in the write file operation:
ftrace_filter_write() -> ftrace_regex_write()
↓
static ssize_t
ftrace_regex_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos, int enable)
{
struct ftrace_iterator *iter;
struct trace_parser *parser;
ssize_t ret, read;
if (!cnt)
return 0;
if (file->f_mode & FMODE_READ) {
struct seq_file *m = file->private_data;
iter = m->private;
} else
iter = file->private_data;
if (unlikely(ftrace_disabled))
return -ENODEV;
/* iter->hash is a local copy, so we don't need regex_lock */
parser = &iter->parser;
read = trace_get_user(parser, ubuf, cnt, ppos);
if (read >= 0 && trace_parser_loaded(parser) &&
!trace_parser_cont(parser)) {
/* (1) Parse the filter configuration and apply it to iter->hash */
ret = ftrace_process_regex(iter->hash, parser->buffer,
parser->idx, enable);
trace_parser_clear(parser);
if (ret < 0)
goto out;
}
ret = read;
out:
return ret;
}
↓
static int ftrace_process_regex(struct ftrace_hash *hash,
char *buff, int len, int enable)
{
char *func, *command, *next = buff;
struct ftrace_func_command *p;
int ret = -EINVAL;
func = strsep(&next, ":");
/* (1.1) Filter configuration: update iter->hash */
if (!next) {
ret = ftrace_match_records(hash, func, len);
if (!ret)
ret = -EINVAL;
if (ret < 0)
return ret;
return 0;
}
/* command found */
command = strsep(&next, ":");
/* (1.2) Filter command configuration: this ends up operating on trace_probe_ops
and has nothing to do with iter->hash or global_ops; details in the next section
*/
mutex_lock(&ftrace_cmd_mutex);
list_for_each_entry(p, &ftrace_commands, list) {
if (strcmp(p->name, command) == 0) {
ret = p->func(hash, func, command, next, enable);
goto out_unlock;
}
}
out_unlock:
mutex_unlock(&ftrace_cmd_mutex);
return ret;
}
↓
ftrace_match_records() -> match_records()
static int
match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
{
struct ftrace_page *pg;
struct dyn_ftrace *rec;
struct ftrace_glob func_g = { .type = MATCH_FULL };
struct ftrace_glob mod_g = { .type = MATCH_FULL };
struct ftrace_glob *mod_match = (mod) ? &mod_g : NULL;
int exclude_mod = 0;
int found = 0;
int ret;
int clear_filter;
/* (1.1.1) Parse the filter command string.
clear_filter = the "!" in the command
*/
if (func) {
func_g.type = filter_parse_regex(func, len, &func_g.search,
&clear_filter);
func_g.len = strlen(func_g.search);
}
if (mod) {
mod_g.type = filter_parse_regex(mod, strlen(mod),
&mod_g.search, &exclude_mod);
mod_g.len = strlen(mod_g.search);
}
mutex_lock(&ftrace_lock);
if (unlikely(ftrace_disabled))
goto out_unlock;
/* (1.1.2) Walk the ftrace_rec entries */
do_for_each_ftrace_rec(pg, rec) {
/* If the ip matches the filter, add it to / remove it from iter->hash */
if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) {
ret = enter_record(hash, rec, clear_filter);
if (ret < 0) {
found = ret;
goto out_unlock;
}
found = 1;
}
} while_for_each_ftrace_rec();
out_unlock:
mutex_unlock(&ftrace_lock);
return found;
}
↓
static int
enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter)
{
struct ftrace_func_entry *entry;
int ret = 0;
entry = ftrace_lookup_ip(hash, rec->ip);
if (clear_filter) {
/* Do nothing if it doesn't exist */
if (!entry)
return 0;
free_hash_entry(hash, entry);
} else {
/* Do nothing if it exists */
if (entry)
return 0;
ret = add_hash_entry(hash, rec->ip);
}
return ret;
}
The update of the global_ops hash table in the close file operation:
int ftrace_regex_release(struct inode *inode, struct file *file)
{
struct seq_file *m = (struct seq_file *)file->private_data;
struct ftrace_ops_hash old_hash_ops;
struct ftrace_iterator *iter;
struct ftrace_hash **orig_hash;
struct ftrace_hash *old_hash;
struct trace_parser *parser;
int filter_hash;
int ret;
if (file->f_mode & FMODE_READ) {
iter = m->private;
seq_release(inode, file);
} else
iter = file->private_data;
parser = &iter->parser;
if (trace_parser_loaded(parser)) {
parser->buffer[parser->idx] = 0;
ftrace_match_records(iter->hash, parser->buffer, parser->idx);
}
trace_parser_put(parser);
mutex_lock(&iter->ops->func_hash->regex_lock);
if (file->f_mode & FMODE_WRITE) {
filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
if (filter_hash)
orig_hash = &iter->ops->func_hash->filter_hash;
else
orig_hash = &iter->ops->func_hash->notrace_hash;
mutex_lock(&ftrace_lock);
old_hash = *orig_hash;
old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash;
old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash;
/* (1) Use iter->hash to update global_ops' filter_hash/notrace_hash */
ret = ftrace_hash_move(iter->ops, filter_hash,
orig_hash, iter->hash);
/* (2) Update the _mcount probe points according to the new hash contents.
Walk all ftrace_rec entries:
sites with ref_cnt > 0 are patched to ftrace_caller(),
sites with ref_cnt == 0 are patched to nop
*/
if (!ret) {
ftrace_ops_update_code(iter->ops, &old_hash_ops);
free_ftrace_hash_rcu(old_hash);
}
mutex_unlock(&ftrace_lock);
}
mutex_unlock(&iter->ops->func_hash->regex_lock);
free_ftrace_hash(iter->hash);
kfree(iter);
return 0;
}
↓
static void ftrace_ops_update_code(struct ftrace_ops *ops,
struct ftrace_ops_hash *old_hash)
{
struct ftrace_ops *op;
if (!ftrace_enabled)
return;
if (ops->flags & FTRACE_OPS_FL_ENABLED) {
ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash);
return;
}
/*
* If this is the shared global_ops filter, then we need to
* check if there is another ops that shares it, is enabled.
* If so, we still need to run the modify code.
*/
if (ops->func_hash != &global_ops.local_hash)
return;
do_for_each_ftrace_op(op, ftrace_ops_list) {
if (op->func_hash == &global_ops.local_hash &&
op->flags & FTRACE_OPS_FL_ENABLED) {
ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash);
/* Only need to do this once */
return;
}
} while_for_each_ftrace_op(op);
}
1.2.7、Configuring function tracer filter commands
5、Setting a function tracer filter command through "set_ftrace_filter". In essence this registers the command with trace_probe_ops and manipulates trace_probe_ops' filter_hash and notrace_hash.
Although both go through set_ftrace_filter, configuring a filter and configuring a filter command operate on different objects (a sketch of the structures involved follows this list):
- Configuring a filter operates on the contents of global_ops' filter_hash/notrace_hash;
- Configuring a filter command registers the command with trace_probe_ops and operates on the contents of trace_probe_ops' filter_hash/notrace_hash;
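To make the second item concrete, the per-command bookkeeping looks roughly like this (field names recalled from kernel/trace/ftrace.c of this era, in kernel context, so take the exact layout as approximate): each matching ip gets an entry in the global ftrace_func_hash tying the ip to the probe ops that implement the command and to its private data (e.g. a remaining trigger count).

/* Approximate shape of the structures behind filter commands (sketch). */
struct ftrace_probe_ops_sketch {
	void (*func)(unsigned long ip, unsigned long parent_ip, void **data);
	int  (*init)(struct ftrace_probe_ops_sketch *ops, unsigned long ip, void **data);
	void (*free)(struct ftrace_probe_ops_sketch *ops, unsigned long ip, void **data);
};

struct ftrace_func_probe_sketch {
	struct hlist_node		node;	/* linked into ftrace_func_hash[key]   */
	struct ftrace_probe_ops_sketch	*ops;	/* e.g. the traceon/traceoff handlers  */
	unsigned long			ip;	/* the instrumented function           */
	void				*data;	/* command data, e.g. a countdown      */
};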
Before a filter command can be configured, the command itself must first be registered:
init_function_trace() -> init_func_cmd_traceon()
static int __init init_func_cmd_traceon(void)
{
int ret;
/* Registration: add the command to the ftrace_commands list */
ret = register_ftrace_command(&ftrace_traceoff_cmd);
if (ret)
return ret;
ret = register_ftrace_command(&ftrace_traceon_cmd);
if (ret)
goto out_free_traceoff;
ret = register_ftrace_command(&ftrace_stacktrace_cmd);
if (ret)
goto out_free_traceon;
ret = register_ftrace_command(&ftrace_dump_cmd);
if (ret)
goto out_free_stacktrace;
ret = register_ftrace_command(&ftrace_cpudump_cmd);
if (ret)
goto out_free_dump;
return 0;
}
static struct ftrace_func_command ftrace_traceon_cmd = {
.name = "traceon",
.func = ftrace_trace_onoff_callback,
};
Taking the "traceon" command as an example, let us continue the previous section's analysis of the file operations on "set_ftrace_filter":
static int ftrace_process_regex(struct ftrace_hash *hash,
char *buff, int len, int enable)
{
char *func, *command, *next = buff;
struct ftrace_func_command *p;
int ret = -EINVAL;
func = strsep(&next, ":");
/* (1.1) Filter configuration: update iter->hash */
if (!next) {
ret = ftrace_match_records(hash, func, len);
if (!ret)
ret = -EINVAL;
if (ret < 0)
return ret;
return 0;
}
/* command found */
command = strsep(&next, ":");
/* (1.2) Filter command configuration: this ends up operating on trace_probe_ops
and has nothing to do with iter->hash or global_ops
*/
mutex_lock(&ftrace_cmd_mutex);
list_for_each_entry(p, &ftrace_commands, list) {
if (strcmp(p->name, command) == 0) {
ret = p->func(hash, func, command, next, enable);
goto out_unlock;
}
}
out_unlock:
mutex_unlock(&ftrace_cmd_mutex);
return ret;
}
↓
static int
ftrace_trace_onoff_callback(struct ftrace_hash *hash,
char *glob, char *cmd, char *param, int enable)
{
struct ftrace_probe_ops *ops;
/* we register both traceon and traceoff to this callback */
/* (1.2.1) Pick the ops that implement the command */
if (strcmp(cmd, "traceon") == 0)
ops = param ? &traceon_count_probe_ops : &traceon_probe_ops;
else
ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops;
/* (1.2.2) Register the command with trace_probe_ops */
return ftrace_trace_probe_callback(ops, hash, glob, cmd,
param, enable);
}
↓
static int
ftrace_trace_probe_callback(struct ftrace_probe_ops *ops,
struct ftrace_hash *hash, char *glob,
char *cmd, char *param, int enable)
{
void *count = (void *)-1;
char *number;
int ret;
/* hash funcs only work with set_ftrace_filter */
if (!enable)
return -EINVAL;
/* (1.2.2.1) If the command starts with "!", unregister the filter command */
if (glob[0] == '!') {
unregister_ftrace_function_probe_func(glob+1, ops);
return 0;
}
if (!param)
goto out_reg;
number = strsep(¶m, ":");
if (!strlen(number))
goto out_reg;
/*
* We use the callback data field (which is a pointer)
* as our counter.
*/
/* (1.2.2.2) Parse the "count" field of the filter command */
ret = kstrtoul(number, 0, (unsigned long *)&count);
if (ret)
return ret;
out_reg:
/* (1.2.2.3) Go on to register the filter command */
ret = register_ftrace_function_probe(glob, ops, count);
return ret < 0 ? ret : 0;
}
↓
int
register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
void *data)
{
struct ftrace_ops_hash old_hash_ops;
struct ftrace_func_probe *entry;
struct ftrace_glob func_g;
struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
struct ftrace_hash *old_hash = *orig_hash;
struct ftrace_hash *hash;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
int not;
unsigned long key;
int count = 0;
int ret;
/* (1.2.2.3.1) Parse the function-name part of the filter command */
func_g.type = filter_parse_regex(glob, strlen(glob),
&func_g.search, ¬);
func_g.len = strlen(func_g.search);
/* we do not support '!' for function probes */
if (WARN_ON(not))
return -EINVAL;
mutex_lock(&trace_probe_ops.func_hash->regex_lock);
old_hash_ops.filter_hash = old_hash;
/* Probes only have filters */
old_hash_ops.notrace_hash = NULL;
/* (1.2.2.3.2) Copy trace_probe_ops' filter_hash into a temporary hash table */
hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
if (!hash) {
count = -ENOMEM;
goto out;
}
if (unlikely(ftrace_disabled)) {
count = -ENODEV;
goto out;
}
mutex_lock(&ftrace_lock);
/* Walk the ftrace_rec entries */
do_for_each_ftrace_rec(pg, rec) {
if (!ftrace_match_record(rec, &func_g, NULL, 0))
continue;
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
if (!entry) {
/* If we did not process any, then return error */
if (!count)
count = -ENOMEM;
goto out_unlock;
}
count++;
entry->data = data;
/*
* The caller might want to do something special
* for each function we find. We call the callback
* to give the caller an opportunity to do so.
*/
if (ops->init) {
if (ops->init(ops, rec->ip, &entry->data) < 0) {
/* caller does not like this func */
kfree(entry);
continue;
}
}
/* (1.2.2.3.3) If the current ip matches the function filter rule, add it to the temporary hash table */
ret = enter_record(hash, rec, 0);
if (ret < 0) {
kfree(entry);
count = ret;
goto out_unlock;
}
/* (1.2.2.3.4) Combine the command's ops and the matching ip into an entry and add it to the other hash table, ftrace_func_hash */
entry->ops = ops;
entry->ip = rec->ip;
key = hash_long(entry->ip, FTRACE_HASH_BITS);
hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
} while_for_each_ftrace_rec();
/* (1.2.2.3.5) Move the temporary hash table into trace_probe_ops' filter_hash */
ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
/* (1.2.2.3.6) If trace_probe_ops is already registered, update the _mcount probe points according to the hash change;
if trace_probe_ops is not registered yet, register it and then update the _mcount probe points
*/
__enable_ftrace_function_probe(&old_hash_ops);
if (!ret)
free_ftrace_hash_rcu(old_hash);
else
count = ret;
out_unlock:
mutex_unlock(&ftrace_lock);
out:
mutex_unlock(&trace_probe_ops.func_hash->regex_lock);
free_ftrace_hash(hash);
return count;
}
When trace_probe_ops gets called, it runs the filter commands stored in ftrace_func_hash:
ftrace_caller() -> ftrace_ops_list_func() -> __ftrace_ops_list_func()
static inline void
__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ignored, struct pt_regs *regs)
{
struct ftrace_ops *op;
int bit;
bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
if (bit < 0)
return;
/*
* Some of the ops may be dynamically allocated,
* they must be freed after a synchronize_sched().
*/
preempt_disable_notrace();
/* Walk the ftrace_ops_list and, whenever the current ip matches the ops' hashes,
call each ftrace_ops->func() in turn
*/
do_for_each_ftrace_op(op, ftrace_ops_list) {
if (ftrace_ops_test(op, ip, regs)) {
if (FTRACE_WARN_ON(!op->func)) {
pr_warn("op=%p %pS\n", op, op);
goto out;
}
op->func(ip, parent_ip, op, regs);
}
} while_for_each_ftrace_op(op);
out:
preempt_enable_notrace();
trace_clear_recursion(bit);
}
↓
static struct ftrace_ops trace_probe_ops __read_mostly =
{
.func = function_trace_probe_call,
.flags = FTRACE_OPS_FL_INITIALIZED,
INIT_OPS_HASH(trace_probe_ops)
};
static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
{
struct ftrace_func_probe *entry;
struct hlist_head *hhd;
unsigned long key;
key = hash_long(ip, FTRACE_HASH_BITS);
hhd = &ftrace_func_hash[key];
if (hlist_empty(hhd))
return;
/*
* Disable preemption for these calls to prevent a RCU grace
* period. This syncs the hash iteration and freeing of items
* on the hash. rcu_read_lock is too dangerous here.
*/
preempt_disable_notrace();
/* Run, one by one, the filter commands in ftrace_func_hash whose ip matches the current ip */
hlist_for_each_entry_rcu_notrace(entry, hhd, node) {
if (entry->ip == ip)
entry->ops->func(ip, parent_ip, &entry->data);
}
preempt_enable_notrace();
}
1.3、Storing trace data
The path along which the function tracer stores its data is:
ftrace_caller() -> ftrace_ops_list_func() -> __ftrace_ops_list_func() -> global_ops->func() -> function_trace_call():
static void
function_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
{
struct trace_array *tr = op->private;
struct trace_array_cpu *data;
unsigned long flags;
int bit;
int cpu;
int pc;
if (unlikely(!tr->function_enabled))
return;
pc = preempt_count();
preempt_disable_notrace();
bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
if (bit < 0)
goto out;
cpu = smp_processor_id();
data = per_cpu_ptr(tr->trace_buffer.data, cpu);
if (!atomic_read(&data->disabled)) {
local_save_flags(flags);
trace_function(tr, ip, parent_ip, flags, pc);
}
trace_clear_recursion(bit);
out:
preempt_enable_notrace();
}
↓
void
trace_function(struct trace_array *tr,
unsigned long ip, unsigned long parent_ip, unsigned long flags,
int pc)
{
struct trace_event_call *call = &event_function;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ring_buffer_event *event;
struct ftrace_entry *entry;
/* (1) Reserve space in the ringbuffer,
type = TRACE_FN
*/
event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
flags, pc);
if (!event)
return;
entry = ring_buffer_event_data(event);
/* (2) Store the function tracer's own trace data: ip and parent_ip */
entry->ip = ip;
entry->parent_ip = parent_ip;
if (!call_filter_check_discard(call, entry, buffer, event))
__buffer_unlock_commit(buffer, event);
}
1.3.1、Data format
The trace data the function tracer defines is very simple: ip and parent_ip.
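For reference, the record behind TRACE_FN is generated from kernel/trace/trace_entries.h and, as far as I recall, boils down to the following (approximate layout; the common trace_entry header is shared by every event type):

/* Approximate layout of a TRACE_FN record in the ring buffer. */
struct trace_entry {
	unsigned short	type;		/* TRACE_FN for the function tracer */
	unsigned char	flags;		/* irqs-off, need-resched, ...       */
	unsigned char	preempt_count;
	int		pid;
};

struct ftrace_entry {
	struct trace_entry	ent;
	unsigned long		ip;		/* instrumented function */
	unsigned long		parent_ip;	/* its caller            */
};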
1.3.2、filter
Filter and filter command configuration was covered in detail in section 1.2.2, dynamic management of probe points, and is not repeated here.
1.4、Reading trace data
The default function tracer output read from the trace file looks like this:
# cat trace
# tracer: function
#
# entries-in-buffer/entries-written: 36/36 #P:8
#
# _-----=> irqs-off
# / _----=> need-resched
# | / _---=> hardirq/softirq
# || / _--=> preempt-depth
# ||| / delay
# TASK-PID CPU# |||| TIMESTAMP FUNCTION
# | | | |||| | |
sh-18361 [001] ...1 663922.156238: schedule_hrtimeout_range <-poll_schedule_timeout
sh-18361 [001] ...1 663922.156251: schedule_hrtimeout_range_clock <-schedule_hrtimeout_range
<idle>-0 [003] .n.2 663922.156660: schedule_preempt_disabled <-cpu_startup_entry
->transport-5191 [002] ...1 663922.157592: schedule_timeout <-wait_for_common
adbd-5189 [002] ...1 663922.158219: schedule_hrtimeout_range <-poll_schedule_timeout
adbd-5189 [002] ...1 663922.158222: schedule_hrtimeout_range_clock <-schedule_hrtimeout_range
<idle>-0 [001] .n.2 663922.158342: schedule_preempt_disabled <-cpu_startup_entry
->transport-5191 [001] ...1 663922.159407: schedule_timeout <-unix_stream_read_generic
<-transport-5192 [001] ...1 663922.159904: schedule_timeout <-wait_for_common
->transport-5191 [001] ...1 663922.160413: schedule_timeout <-unix_stream_read_generic
adbd-5189 [001] ...1 663922.160895: schedule_hrtimeout_range <-poll_schedule_timeout
adbd-5189 [001] ...1 663922.160898: schedule_hrtimeout_range_clock <-schedule_hrtimeout_range
<idle>-0 [002] .n.2 663922.163694: schedule_preempt_disabled <-cpu_startup_entry
<idle>-0 [003] .n.2 663922.175958: schedule_preempt_disabled <-cpu_startup_entry
ndroid.systemu-2200 [003] ...1 663922.179945: schedule_hrtimeout_range <-SyS_epoll_wait
ndroid.systemu-2200 [003] ...1 663922.179950: schedule_hrtimeout_range_clock <-schedule_hrtimeout_range
<idle>-0 [003] .n.2 663922.181418: schedule_preempt_disabled <-cpu_startup_entry
<idle>-0 [002] .n.2 663922.181584: schedule_preempt_disabled <-cpu_startup_entry
<idle>-0 [002] .n.2 663922.200878: schedule_preempt_disabled <-cpu_startup_entry
<idle>-0 [001] .n.2 663922.203506: schedule_preempt_disabled <-cpu_startup_entry
rcu_preempt-7 [002] ...1 663922.203519: schedule_timeout <-rcu_gp_kthread
ndroid.systemu-2200 [002] ...1 663922.205619: schedule_hrtimeout_range <-SyS_epoll_wait
ndroid.systemu-2200 [002] ...1 663922.205624: schedule_hrtimeout_range_clock <-schedule_hrtimeout_range
<idle>-0 [003] .n.2 663922.208538: schedule_preempt_disabled <-cpu_startup_entry
<idle>-0 [002] .n.2 663922.213243: schedule_preempt_disabled <-cpu_startup_entry
<idle>-0 [003] .n.2 663922.217986: schedule_preempt_disabled <-cpu_startup_entry
<idle>-0 [003] .n.2 663922.227857: schedule_preempt_disabled <-cpu_startup_entry
ndroid.systemu-2200 [003] ...1 663922.231819: schedule_hrtimeout_range <-SyS_epoll_wait
ndroid.systemu-2200 [003] ...1 663922.231824: schedule_hrtimeout_range_clock <-schedule_hrtimeout_range
<idle>-0 [002] .n.2 663922.234282: schedule_preempt_disabled <-cpu_startup_entry
<idle>-0 [001] .n.2 663922.246355: schedule_preempt_disabled <-cpu_startup_entry
<idle>-0 [002] .n.2 663922.251360: schedule_preempt_disabled <-cpu_startup_entry
ndroid.systemu-2200 [002] ...1 663922.255168: schedule_hrtimeout_range <-SyS_epoll_wait
ndroid.systemu-2200 [002] ...1 663922.255172: schedule_hrtimeout_range_clock <-schedule_hrtimeout_range
<idle>-0 [003] .n.2 663922.256775: schedule_preempt_disabled <-cpu_startup_entry
<idle>-0 [001] .n.2 663922.257253: schedule_preempt_disabled <-cpu_startup_entry
In kernel/trace/trace_output.c the kernel registers several default trace_events. The function tracer uses trace_fn_event, of type TRACE_FN:
static struct trace_event *events[] __initdata = {
&trace_fn_event,
&trace_graph_ent_event,
&trace_graph_ret_event,
&trace_ctx_event,
&trace_wake_event,
&trace_stack_event,
&trace_user_stack_event,
&trace_bputs_event,
&trace_bprint_event,
&trace_print_event,
NULL
};
__init static int init_events(void)
{
struct trace_event *event;
int i, ret;
for (i = 0; events[i]; i++) {
event = events[i];
ret = register_trace_event(event);
if (!ret) {
printk(KERN_WARNING "event %d failed to register\n",
event->type);
WARN_ON_ONCE(1);
}
}
return 0;
}
|→
static struct trace_event_functions trace_fn_funcs = {
.trace = trace_fn_trace,
.raw = trace_fn_raw,
.hex = trace_fn_hex,
.binary = trace_fn_bin,
};
static struct trace_event trace_fn_event = {
.type = TRACE_FN,
.funcs = &trace_fn_funcs,
};
When data is read out, the event's event->funcs->trace() is called: seq_read() -> s_show() -> print_trace_line() -> print_trace_fmt() -> event->funcs->trace():
For TRACE_FN, event->funcs->trace() is trace_fn_trace():
/* TRACE_FN */
static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
struct ftrace_entry *field;
struct trace_seq *s = &iter->seq;
trace_assign_type(field, iter->ent);
/* (1) Print the symbol for this ip */
seq_print_ip_sym(s, field->ip, flags);
/* (2) If the trace option allows it, print the symbol for the parent ip */
if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
trace_seq_puts(s, " <-");
seq_print_ip_sym(s, field->parent_ip, flags);
}
trace_seq_putc(s, '\n');
return trace_handle_return(s);
}
2、function_graph tracer
The function_graph tracer evolved from the function tracer: with "_mcount" instrumentation the function tracer can trace the entry of every function call, while the function_graph tracer can trace both a function's entry and its return.
2.1、Instrumentation principle
As the figure above shows, the key is that when the entry probe function is called, it modifies func()'s return address: instead of returning to func's parent() and continuing there, execution returns to the return probe function return_to_handler(). After return_to_handler() has run its own return handler, it restores the return address to the one inside func's parent, and execution continues along the original path.
With the original entry-only instrumentation you could only observe function transitions. With instrumentation at both entry and exit you also obtain each function's execution time, which allows much richer analysis.
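Conceptually the entry-side hook does something like the following simplified sketch (not the arm64 source: the real work lives in prepare_ftrace_return()/ftrace_return_to_handler() and keeps the saved frames on a per-task ret_stack; now_ns() and record_exit() are hypothetical helpers):

/* Sketch of the return-address hijack used by the function_graph tracer.
 * "parent" points at the location holding func()'s return address
 * (the saved LR on arm64). */
extern void return_to_handler(void);		/* assembly stub (assumed)      */
unsigned long long now_ns(void);		/* hypothetical timestamp helper */
void record_exit(unsigned long func, unsigned long long duration); /* hypothetical */

struct ret_frame {
	unsigned long ret;			/* original return address      */
	unsigned long func;			/* instrumented function        */
	unsigned long long calltime;
};

static struct ret_frame ret_stack[64];		/* per task in the real kernel  */
static int ret_stack_idx;

void prepare_return_sketch(unsigned long *parent, unsigned long func)
{
	struct ret_frame *f = &ret_stack[ret_stack_idx++];

	f->ret = *parent;			/* remember where to really return */
	f->func = func;
	f->calltime = now_ns();

	*parent = (unsigned long)return_to_handler;	/* divert the return    */
}

unsigned long return_sketch(void)		/* called from return_to_handler */
{
	struct ret_frame *f = &ret_stack[--ret_stack_idx];

	record_exit(f->func, now_ns() - f->calltime);
	return f->ret;				/* original address to jump back to */
}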
2.2、Probe point management
The function_graph tracer's dynamic probe-point management works as follows:
- 1、Every "bl _mcount" probe point ip has a corresponding dyn_ftrace structure. The low 26 bits of dyn_ftrace->flags hold a reference count: whenever an ftrace_ops operates on that ip, the count is incremented. Once ref_cnt is greater than 0 the probe point has to be enabled, i.e. patched to "bl ftrace_caller";
- 2、An ftrace_ops expresses its references to ips through hash tables: each ftrace_ops has two hash tables, filter_hash and notrace_hash, and their combination determines which ips it references. In function_graph tracer mode there is a single ftrace_ops, graph_ops, and graph_ops directly reuses global_ops' hash tables so that the filter configured in function tracer mode is preserved. Any ip it references has its level-1 probe point patched to "bl ftrace_caller";
- 3、Writing "set_ftrace_filter/set_ftrace_notrace" configures global_ops' filter_hash/notrace_hash, which acts as the level-1 filter. In function_graph tracer mode a function must pass both filter levels before its trace data is recorded: level 1 (the function must be allowed by the "set_ftrace_filter/set_ftrace_notrace" configuration) and level 2 (the function must be allowed by the "set_graph_function/set_graph_notrace" configuration, or be called by such a function);
- 4、Writing "set_graph_function/set_graph_notrace" configures the level-2 filter; in practice this fills the ftrace_graph_funcs[]/ftrace_graph_notrace_funcs[] tables;
- 5、ftrace_caller() contains two level-2 probe points, ftrace_call and ftrace_graph_call. In function_graph tracer mode only ftrace_graph_call is set, to "b ftrace_graph_caller";
- 6、The job of ftrace_graph_caller() is to set things up so that trace_graph_entry() is called at the function's entry and trace_graph_return() at its return;
- 7、trace_graph_entry() performs the level-2 filter check: the function must be allowed by the "set_graph_function/set_graph_notrace" configuration, or be a function called by one of those, for execution to continue; otherwise it returns an error and the subsequent trace_graph_return() is never executed;
- 8、trace_graph_entry() and trace_graph_return() both record trace data: ip + timestamp.
The function_graph tracer dynamically modifies the various probe points in the following scenarios:
- Enabling the tracer. When "echo xxx_tracer > current_tracer" is used, the old current tracer is shut down and the new one is enabled. Typical examples are the function tracer and the function_graph tracer;
- Configuring the function filter (level 1). With "echo function_name > set_ftrace_filter/set_ftrace_notrace", only some functions are traced instead of all of them;
- Configuring the function_graph filter (level 2). With "echo function_name > set_graph_function/set_graph_notrace", only some functions (and their callees) are traced instead of all of them;
Let us look at the code for each of these scenarios:
2.2.1、Enabling the function_graph tracer
As described in section 1.2.5 (enabling the function tracer), enabling a tracer eventually calls tracer->init().
For the function_graph tracer that is graph_trace_init():
static struct tracer graph_trace __tracer_data = {
.name = "function_graph",
.update_thresh = graph_trace_update_thresh,
.open = graph_trace_open,
.pipe_open = graph_trace_open,
.close = graph_trace_close,
.pipe_close = graph_trace_close,
.init = graph_trace_init,
.reset = graph_trace_reset,
.print_line = print_graph_function,
.print_header = print_graph_headers,
.flags = &tracer_flags,
.set_flag = func_graph_set_flag,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_function_graph,
#endif
};
↓
static int graph_trace_init(struct trace_array *tr)
{
int ret;
set_graph_array(tr);
if (tracing_thresh)
ret = register_ftrace_graph(&trace_graph_thresh_return,
&trace_graph_thresh_entry);
else
ret = register_ftrace_graph(&trace_graph_return,
&trace_graph_entry);
if (ret)
return ret;
tracing_start_cmdline_record();
return 0;
}
↓
/* The function_graph tracer's dedicated ftrace_ops: graph_ops */
static struct ftrace_ops graph_ops = {
.func = ftrace_stub,
.flags = FTRACE_OPS_FL_RECURSION_SAFE |
FTRACE_OPS_FL_INITIALIZED |
FTRACE_OPS_FL_PID |
FTRACE_OPS_FL_STUB,
#ifdef FTRACE_GRAPH_TRAMP_ADDR
.trampoline = FTRACE_GRAPH_TRAMP_ADDR,
/* trampoline_size is only needed for dynamically allocated tramps */
#endif
ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) /* graph_ops shares global_ops' hash tables, so the "set_ftrace_filter" configuration still applies */
};
int register_ftrace_graph(trace_func_graph_ret_t retfunc,
trace_func_graph_ent_t entryfunc)
{
int ret = 0;
mutex_lock(&ftrace_lock);
/* we currently allow only one tracer registered at a time */
if (ftrace_graph_active) {
ret = -EBUSY;
goto out;
}
register_pm_notifier(&ftrace_suspend_notifier);
ftrace_graph_active++;
ret = start_graph_tracing();
if (ret) {
ftrace_graph_active--;
goto out;
}
/* (1) Set the ftrace_graph_entry and ftrace_graph_return pointers */
ftrace_graph_return = retfunc;
/*
* Update the indirect function to the entryfunc, and the
* function that gets called to the entry_test first. Then
* call the update fgraph entry function to determine if
* the entryfunc should be called directly or not.
*/
__ftrace_graph_entry = entryfunc;
ftrace_graph_entry = ftrace_graph_entry_test;
update_function_graph_func();
/* (2) Register graph_ops:
1、add graph_ops to the ftrace_ops_list;
2、update the _mcount probe points according to graph_ops' hash tables;
3、update the ftrace_graph_call probe point to ftrace_graph_caller()
*/
ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
out:
mutex_unlock(&ftrace_lock);
return ret;
}
2.2.2、Configuring the function filter (level-1 filter)
Because the function_graph tracer's graph_ops keeps sharing global_ops' hash tables, and the "set_ftrace_filter/set_ftrace_notrace" interface configures global_ops' filter_hash/notrace_hash, "set_ftrace_filter/set_ftrace_notrace" can still be used to configure the function_graph tracer's filter.
The function_graph tracer can additionally be filtered through the "set_graph_function/set_graph_notrace" interface, and a function must satisfy both filters to be traced. That is why we call the present one the level-1 filter.
For the code details see section 1.2.6, configuring the function tracer filter.
2.2.3、Configuring the function_graph filter (level-2 filter)
Writing "set_graph_function/set_graph_notrace" configures the level-2 filter; in practice it fills the ftrace_graph_funcs[]/ftrace_graph_notrace_funcs[] tables.
trace_create_file("set_graph_function", 0444, d_tracer,
NULL,
&ftrace_graph_fops);
trace_create_file("set_graph_notrace", 0444, d_tracer,
NULL,
&ftrace_graph_notrace_fops);
static const struct file_operations ftrace_graph_fops = {
.open = ftrace_graph_open,
.read = seq_read,
.write = ftrace_graph_write,
.llseek = tracing_lseek,
.release = ftrace_graph_release,
};
↓
static int
ftrace_graph_open(struct inode *inode, struct file *file)
{
struct ftrace_graph_data *fgd;
if (unlikely(ftrace_disabled))
return -ENODEV;
fgd = kmalloc(sizeof(*fgd), GFP_KERNEL);
if (fgd == NULL)
return -ENOMEM;
fgd->table = ftrace_graph_funcs;
fgd->size = FTRACE_GRAPH_MAX_FUNCS;
fgd->count = &ftrace_graph_count;
fgd->seq_ops = &ftrace_graph_seq_ops;
return __ftrace_graph_open(inode, file, fgd);
}
static ssize_t
ftrace_graph_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
struct trace_parser parser;
ssize_t read, ret = 0;
struct ftrace_graph_data *fgd = file->private_data;
if (!cnt)
return 0;
if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX))
return -ENOMEM;
read = trace_get_user(&parser, ubuf, cnt, ppos);
if (read >= 0 && trace_parser_loaded((&parser))) {
parser.buffer[parser.idx] = 0;
mutex_lock(&graph_lock);
/* we allow only one expression at a time */
/* Set up the ftrace_graph_funcs[] table according to the filter expression */
ret = ftrace_set_func(fgd->table, fgd->count, fgd->size,
parser.buffer);
mutex_unlock(&graph_lock);
}
if (!ret)
ret = read;
trace_parser_put(&parser);
return ret;
}
trace_graph_entry() performs the level-2 filter check: the function must be allowed by the "set_graph_function/set_graph_notrace" configuration, or be a function called by one of those, for execution to continue; otherwise it returns an error and the subsequent trace_graph_return() is never executed.
int trace_graph_entry(struct ftrace_graph_ent *trace)
{
struct trace_array *tr = graph_array;
struct trace_array_cpu *data;
unsigned long flags;
long disabled;
int ret;
int cpu;
int pc;
if (!ftrace_trace_task(current))
return 0;
/* trace it when it is-nested-in or is a function enabled. */
/* ftrace_graph_addr(trace->func) // the function must be allowed by the "set_graph_function/set_graph_notrace" configuration
trace->depth // or be a function called by one of those
*/
if ((!(trace->depth || ftrace_graph_addr(trace->func)) ||
ftrace_graph_ignore_irqs()) || (trace->depth < 0) ||
(max_depth && trace->depth >= max_depth))
return 0;
/*
* Do not trace a function if it's filtered by set_graph_notrace.
* Make the index of ret stack negative to indicate that it should
* ignore further functions. But it needs its own ret stack entry
* to recover the original index in order to continue tracing after
* returning from the function.
*/
if (ftrace_graph_notrace_addr(trace->func))
return 1;
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->trace_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
pc = preempt_count();
ret = __trace_graph_entry(tr, trace, flags, pc);
} else {
ret = 0;
}
atomic_dec(&data->disabled);
local_irq_restore(flags);
return ret;
}
↓
static inline int ftrace_graph_addr(unsigned long addr)
{
int i;
if (!ftrace_graph_count)
return 1;
/* Look the address up in the ftrace_graph_funcs[] table */
for (i = 0; i < ftrace_graph_count; i++) {
if (addr == ftrace_graph_funcs[i]) {
/*
* If no irqs are to be traced, but a set_graph_function
* is set, and called by an interrupt handler, we still
* want to trace it.
*/
if (in_irq())
trace_recursion_set(TRACE_IRQ_BIT);
else
trace_recursion_clear(TRACE_IRQ_BIT);
return 1;
}
}
return 0;
}
2.3、Storing trace data
Unlike the function tracer, function_graph stores trace data both when a function is entered and when it returns.
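The two payloads are, as far as I recall from include/linux/ftrace.h of this era (take the exact fields as approximate), the following; the entry record carries the call depth, while the return record also carries the timestamps from which the DURATION column is computed:

/* Approximate payloads behind TRACE_GRAPH_ENT and TRACE_GRAPH_RET. */
struct ftrace_graph_ent {
	unsigned long func;		/* address of the traced function */
	int depth;			/* current call depth             */
};

struct ftrace_graph_ret {
	unsigned long func;
	unsigned long long calltime;	/* entry timestamp                */
	unsigned long long rettime;	/* return timestamp               */
	unsigned long overrun;		/* ret_stack overruns             */
	int depth;
};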
2.3.1、trace_graph_entry()
Storing at function entry: trace_graph_entry() -> __trace_graph_entry():
int __trace_graph_entry(struct trace_array *tr,
struct ftrace_graph_ent *trace,
unsigned long flags,
int pc)
{
struct trace_event_call *call = &event_funcgraph_entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ftrace_graph_ent_entry *entry;
event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
sizeof(*entry), flags, pc);
if (!event)
return 0;
entry = ring_buffer_event_data(event);
entry->graph_ent = *trace;
if (!call_filter_check_discard(call, entry, buffer, event))
__buffer_unlock_commit(buffer, event);
return 1;
}
2.3.2、trace_graph_return()
Storing at function return: trace_graph_return() -> __trace_graph_return():
void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
unsigned long flags,
int pc)
{
struct trace_event_call *call = &event_funcgraph_exit;
struct ring_buffer_event *event;
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ftrace_graph_ret_entry *entry;
event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
sizeof(*entry), flags, pc);
if (!event)
return;
entry = ring_buffer_event_data(event);
entry->ret = *trace;
if (!call_filter_check_discard(call, entry, buffer, event))
__buffer_unlock_commit(buffer, event);
}
2.3.3、filter
See "2.2.2, configuring the function filter (level-1 filtering)" and "2.2.3, configuring the function_graph filter (level-2 filtering)".
2.4、Data readout
The default data format of the function_graph tracer, as read from the trace file, is shown below (it displays the functions configured in set_graph_function together with the child functions they call):
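A minimal sequence that produces a capture like the one below (assuming the filter has been configured as in 2.2.3, e.g. with tty_open in set_graph_function):
# echo 0 > tracing_on
# echo function_graph > current_tracer
# echo 1 > tracing_on
... open a tty device ...
# echo 0 > tracing_on
The trace file then reads: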
# cat trace
# tracer: function_graph
#
# CPU DURATION FUNCTION CALLS
# | | | | | | |
0) | tty_open() {
0) 1.563 us | nonseekable_open();
0) | tty_alloc_file() {
0) | kmem_cache_alloc_trace() {
0) | __might_sleep() {
0) 0.417 us | ___might_sleep();
0) 4.791 us | }
0) 1.302 us | preempt_count_add();
0) 1.250 us | preempt_count_sub();
0) | __slab_alloc.isra.60.constprop.62() {
0) | ___slab_alloc.constprop.63() {
0) | _raw_spin_lock() {
0) 1.250 us | preempt_count_add();
0) 0.520 us | do_raw_spin_trylock();
0) 9.531 us | }
0) 0.781 us | preempt_count_add();
0) 0.469 us | preempt_count_sub();
0) | _raw_spin_unlock() {
0) 0.521 us | do_raw_spin_unlock();
0) 1.614 us | preempt_count_sub();
0) 9.584 us | }
0) | alloc_debug_processing() {
0) | check_slab() {
0) 1.198 us | slab_pad_check.part.52();
0) 5.990 us | }
0) | check_object() {
0) 1.719 us | check_bytes_and_report();
0) 0.521 us | check_bytes_and_report();
0) 2.448 us | check_bytes_and_report();
0) 0.469 us | check_bytes_and_report();
0) 1.927 us | check_bytes_and_report();
0) + 29.843 us | }
0) | set_track() {
When data is read out through the trace file, if the tracer defines a print_line() function, print_line() is used to parse and format the entries instead of event->funcs->trace().
1、On a read, seq_read() -> s_show() -> print_trace_line() -> tracer->print_line():
static struct tracer graph_trace __tracer_data = {
.name = "function_graph",
.update_thresh = graph_trace_update_thresh,
.open = graph_trace_open,
.pipe_open = graph_trace_open,
.close = graph_trace_close,
.pipe_close = graph_trace_close,
.init = graph_trace_init,
.reset = graph_trace_reset,
.print_line = print_graph_function,
.print_header = print_graph_headers,
.flags = &tracer_flags,
.set_flag = func_graph_set_flag,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_function_graph,
#endif
};
↓
static enum print_line_t
print_graph_function(struct trace_iterator *iter)
{
return print_graph_function_flags(iter, tracer_flags.val);
}
↓
enum print_line_t
print_graph_function_flags(struct trace_iterator *iter, u32 flags)
{
struct ftrace_graph_ent_entry *field;
struct fgraph_data *data = iter->private;
struct trace_entry *entry = iter->ent;
struct trace_seq *s = &iter->seq;
int cpu = iter->cpu;
int ret;
if (flags & TRACE_GRAPH_PRINT_FLAT)
return TRACE_TYPE_UNHANDLED;
if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) {
per_cpu_ptr(data->cpu_data, cpu)->ignore = 0;
return TRACE_TYPE_HANDLED;
}
/*
* If the last output failed, there's a possibility we need
* to print out the missing entry which would never go out.
*/
if (data && data->failed) {
field = &data->ent;
iter->cpu = data->cpu;
ret = print_graph_entry(field, s, iter, flags);
if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
ret = TRACE_TYPE_NO_CONSUME;
}
iter->cpu = cpu;
return ret;
}
switch (entry->type) {
/* (1) Print a TRACE_GRAPH_ENT entry */
case TRACE_GRAPH_ENT: {
/*
* print_graph_entry() may consume the current event,
* thus @field may become invalid, so we need to save it.
* sizeof(struct ftrace_graph_ent_entry) is very small,
* it can be safely saved at the stack.
*/
struct ftrace_graph_ent_entry saved;
trace_assign_type(field, entry);
saved = *field;
return print_graph_entry(&saved, s, iter, flags);
}
/* (2) Print a TRACE_GRAPH_RET entry */
case TRACE_GRAPH_RET: {
struct ftrace_graph_ret_entry *field;
trace_assign_type(field, entry);
return print_graph_return(&field->ret, s, entry, iter, flags);
}
/* (3) TRACE_STACK and TRACE_FN entries cannot be handled by the graph tracer; return UNHANDLED so the core code prints them */
case TRACE_STACK:
case TRACE_FN:
/* dont trace stack and functions as comments */
return TRACE_TYPE_UNHANDLED;
/* (4) Print any other entry type as a comment */
default:
return print_graph_comment(s, entry, iter, flags);
}
return TRACE_TYPE_HANDLED;
}
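The flags consumed above map onto the funcgraph-* tracer options, so toggling them only changes how print_graph_*() renders the records, not what is stored. A few hedged examples (option names as they appear under trace_options on this generation of kernels):
# echo funcgraph-proc > trace_options         # add a TASK/PID column
# echo nofuncgraph-duration > trace_options   # drop the DURATION column
# echo funcgraph-abstime > trace_options      # print absolute timestamps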
2、On the first read, the header is printed: seq_read() -> s_show() -> tracer->print_header():
static void print_graph_headers(struct seq_file *s)
{
print_graph_headers_flags(s, tracer_flags.val);
}
↓
void print_graph_headers_flags(struct seq_file *s, u32 flags)
{
struct trace_iterator *iter = s->private;
struct trace_array *tr = iter->tr;
if (flags & TRACE_GRAPH_PRINT_FLAT) {
trace_default_header(s);
return;
}
if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO))
return;
if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) {
/* print nothing if the buffers are empty */
if (trace_empty(iter))
return;
print_trace_header(s, iter);
}
__print_graph_headers_flags(tr, s, flags);
}
3、At open time the tracer's open callback is also invoked: tracing_open() -> __tracing_open() -> tracer->open():
void graph_trace_open(struct trace_iterator *iter)
{
/* pid and depth on the last trace processed */
struct fgraph_data *data;
gfp_t gfpflags;
int cpu;
iter->private = NULL;
/* We can be called in atomic context via ftrace_dump() */
gfpflags = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
data = kzalloc(sizeof(*data), gfpflags);
if (!data)
goto out_err;
data->cpu_data = alloc_percpu_gfp(struct fgraph_cpu_data, gfpflags);
if (!data->cpu_data)
goto out_err_free;
for_each_possible_cpu(cpu) {
pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
*pid = -1;
*depth = 0;
*ignore = 0;
*depth_irq = -1;
}
iter->private = data;
return;
out_err_free:
kfree(data);
out_err:
pr_warning("function graph tracer: not enough memory\n");
}
3、irqsoff tracer
The irqsoff tracer tracks the longest period during which interrupts stay disabled. Its trace provides several pieces of information (a usage sketch follows this list):
- 1、the maximum irqs-off duration: the latency;
- 2、all function-trace entries recorded during that longest irqs-off window;
- 3、the call-stack information at the final irqs-on point;
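A sketch of typical usage (the files are the standard tracefs ones; the workload is whatever you want to measure):
# echo 0 > tracing_on
# echo irqsoff > current_tracer
# echo 0 > tracing_max_latency      # reset the recorded maximum
# echo 1 > tracing_on
... run the workload ...
# cat tracing_max_latency           # the longest irqs-off period, in microseconds
# cat trace                         # the function trace of that longest window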
3.1、Instrumentation
The irqsoff tracer instruments the kernel by hooking trace_hardirqs_on()/trace_hardirqs_off() directly into local_irq_enable()/local_irq_disable():
#ifdef CONFIG_TRACE_IRQFLAGS
#define local_irq_enable() \
do { trace_hardirqs_on(); raw_local_irq_enable(); } while (0)
#define local_irq_disable() \
do { raw_local_irq_disable(); trace_hardirqs_off(); } while (0)
#define local_irq_save(flags) \
do { \
raw_local_irq_save(flags); \
trace_hardirqs_off(); \
} while (0)
#define local_irq_restore(flags) \
do { \
if (raw_irqs_disabled_flags(flags)) { \
raw_local_irq_restore(flags); \
trace_hardirqs_off(); \
} else { \
trace_hardirqs_on(); \
raw_local_irq_restore(flags); \
} \
} while (0)
#define safe_halt() \
do { \
trace_hardirqs_on(); \
raw_safe_halt(); \
} while (0)
#else /* !CONFIG_TRACE_IRQFLAGS */
3.1.1、trace_hardirqs_on/off()
local_irq_disable() -> trace_hardirqs_off():
void trace_hardirqs_off(void)
{
if (!preempt_trace() && irq_trace())
start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
↓
static inline void
start_critical_timing(unsigned long ip, unsigned long parent_ip)
{
int cpu;
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
if (!tracer_enabled || !tracing_is_enabled())
return;
cpu = raw_smp_processor_id();
/* (1) If tracing has already started on this cpu, nested irqs-off operations are ignored */
if (per_cpu(tracing_cpu, cpu))
return;
data = per_cpu_ptr(tr->trace_buffer.data, cpu);
if (unlikely(!data) || atomic_read(&data->disabled))
return;
atomic_inc(&data->disabled);
/* (2) Record the start timestamp of the trace */
data->critical_sequence = max_sequence;
data->preempt_timestamp = ftrace_now(cpu);
data->critical_start = parent_ip ? : ip;
local_save_flags(flags);
/* (3) Record a function-trace entry for the current point */
__trace_function(tr, ip, parent_ip, flags, preempt_count());
/* (4) Set the per-cpu "tracing started" flag.
This is a very important switch: it enables recording of all function-trace entries
*/
per_cpu(tracing_cpu, cpu) = 1;
atomic_dec(&data->disabled);
}
local_irq_enable() -> trace_hardirqs_on():
void trace_hardirqs_on(void)
{
if (!preempt_trace() && irq_trace())
stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
↓
static inline void
stop_critical_timing(unsigned long ip, unsigned long parent_ip)
{
int cpu;
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
cpu = raw_smp_processor_id();
/* Always clear the tracing cpu on stopping the trace */
/* (1) Clear the per-cpu "tracing started" flag.
This is a very important switch: it stops recording of function-trace entries
*/
if (unlikely(per_cpu(tracing_cpu, cpu)))
per_cpu(tracing_cpu, cpu) = 0;
else
return;
if (!tracer_enabled || !tracing_is_enabled())
return;
data = per_cpu_ptr(tr->trace_buffer.data, cpu);
if (unlikely(!data) ||
!data->critical_start || atomic_read(&data->disabled))
return;
atomic_inc(&data->disabled);
/* (2) Record a function-trace entry for the current point */
local_save_flags(flags);
__trace_function(tr, ip, parent_ip, flags, preempt_count());
/* (3) Check whether the current irqs-off duration is the new maximum latency */
check_critical_timing(tr, data, parent_ip ? : ip, cpu);
data->critical_start = 0;
atomic_dec(&data->disabled);
}
↓
static void
check_critical_timing(struct trace_array *tr,
struct trace_array_cpu *data,
unsigned long parent_ip,
int cpu)
{
cycle_t T0, T1, delta;
unsigned long flags;
int pc;
/* (3.1) Duration of the current irqs-off window */
T0 = data->preempt_timestamp;
T1 = ftrace_now(cpu);
delta = T1-T0;
local_save_flags(flags);
pc = preempt_count();
/* (3.2) Check whether this is a new maximum latency */
if (!report_latency(tr, delta))
goto out;
raw_spin_lock_irqsave(&max_trace_lock, flags);
/* check if we are still the max latency */
if (!report_latency(tr, delta))
goto out_unlock;
/* (3.3) If this is the new maximum latency:
record a function-trace entry for the current point
record the current stack
*/
__trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
/* Skip 5 functions to get to the irq/preempt enable function */
__trace_stack(tr, flags, 5, pc);
if (data->critical_sequence != max_sequence)
goto out_unlock;
data->critical_end = parent_ip;
/* (3.4) --------- important -----------------
This step mainly does the following:
1、back up the current trace buffer into max_buffer
2、update the max_buf->time_start timestamp in max_buffer, which marks where the maximum-latency trace starts
*/
if (likely(!is_tracing_stopped())) {
tr->max_latency = delta;
update_max_tr_single(tr, current, cpu);
}
max_sequence++;
out_unlock:
raw_spin_unlock_irqrestore(&max_trace_lock, flags);
out:
data->critical_sequence = max_sequence;
data->preempt_timestamp = ftrace_now(cpu);
__trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
}
↓
void
update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
int ret;
if (tr->stop_count)
return;
WARN_ON_ONCE(!irqs_disabled());
if (!tr->allocated_snapshot) {
/* Only the nop tracer should hit this when disabling */
WARN_ON_ONCE(tr->current_trace != &nop_trace);
return;
}
arch_spin_lock(&tr->max_lock);
/* (3.4.1) Swap trace_buffer and max_buffer, so the trace covering the maximum-latency window is preserved in max_buffer */
ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);
if (ret == -EBUSY) {
/*
* We failed to swap the buffer due to a commit taking
* place on this CPU. We fail to record, but we reset
* the max trace buffer (no one writes directly to it)
* and flag that it failed.
*/
trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_,
"Failed to swap buffers due to commit in progress\n");
}
WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
/* (3.4.2) The trace covering the maximum-latency window has now been backed up into max_buffer,
but that buffer contains entries from several irqs-off windows. Which one is the maximum?
max_buf->time_start records the start time of the maximum-latency window:
max_buf->time_start = data->preempt_timestamp;
*/
__update_max_tr(tr, tsk, cpu);
arch_spin_unlock(&tr->max_lock);
}
3.1.2、Reading the irqsoff trace data
As described in the previous section, the maximum-latency trace is backed up into max_buffer, and max_buf->time_start records the start time of that maximum-latency window.
So when the trace file is read for the irqsoff tracer, the data has to come from max_buffer. tracing_open() -> __tracing_open():
static struct trace_iterator *
__tracing_open(struct inode *inode, struct file *file, bool snapshot)
{
#ifdef CONFIG_TRACER_MAX_TRACE
/* Currently only the top directory has a snapshot */
/* (1) If the tracer sets print_max, read data from max_buffer */
if (tr->current_trace->print_max || snapshot)
iter->trace_buffer = &tr->max_buffer;
else
#endif
iter->trace_buffer = &tr->trace_buffer;
}
static struct tracer irqsoff_tracer __read_mostly =
{
.name = "irqsoff",
.init = irqsoff_tracer_init,
.reset = irqsoff_tracer_reset,
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
.print_max = true, /* the irqsoff tracer sets print_max to true */
.print_header = irqsoff_print_header,
.print_line = irqsoff_print_line,
.flag_changed = irqsoff_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_irqsoff,
#endif
.open = irqsoff_trace_open,
.close = irqsoff_trace_close,
.allow_instances = true,
.use_max_tr = true,
};
When reading, the iterator also has to synchronize on time and only return data with timestamps after max_buf->time_start. tracing_open() -> __tracing_open() -> tracing_iter_reset():
void tracing_iter_reset(struct trace_iterator *iter, int cpu)
{
struct ring_buffer_event *event;
struct ring_buffer_iter *buf_iter;
unsigned long entries = 0;
u64 ts;
per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0;
buf_iter = trace_buffer_iter(iter, cpu);
if (!buf_iter)
return;
ring_buffer_iter_reset(buf_iter);
/*
* We could have the case with the max latency tracers
* that a reset never took place on a cpu. This is evident
* by the timestamp being before the start of the buffer.
*/
/* (1) Discard all trace entries whose timestamp is earlier than time_start;
only data after the time_start timestamp is read out
*/
while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
if (ts >= iter->trace_buffer->time_start)
break;
entries++;
ring_buffer_read(buf_iter, NULL);
}
per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries;
}
3.1.3、Enabling the irqsoff tracer
- The irqsoff tracer relies on the function tracer or the function_graph tracer to record the function calls made during the maximum-latency window, so it either registers a callback as global_ops->func or registers through graph_ops.
- Writing of trace data is gated by the per_cpu(tracing_cpu, cpu) switch manipulated in trace_hardirqs_off()/trace_hardirqs_on(): the switch is turned on in trace_hardirqs_off() and off in trace_hardirqs_on(), so exactly the call sequence of the irqs-off window gets recorded.
As described in section "1.2.5, enabling the function tracer", enabling a tracer ends up calling its tracer->init() function.
For the irqsoff tracer that is irqsoff_tracer_init():
static struct tracer irqsoff_tracer __read_mostly =
{
.name = "irqsoff",
.init = irqsoff_tracer_init,
.reset = irqsoff_tracer_reset,
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
.print_max = true,
.print_header = irqsoff_print_header,
.print_line = irqsoff_print_line,
.flag_changed = irqsoff_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_irqsoff,
#endif
.open = irqsoff_trace_open,
.close = irqsoff_trace_close,
.allow_instances = true,
.use_max_tr = true,
};
↓
static int irqsoff_tracer_init(struct trace_array *tr)
{
trace_type = TRACER_IRQS_OFF;
return __irqsoff_tracer_init(tr);
}
↓
static int __irqsoff_tracer_init(struct trace_array *tr)
{
if (irqsoff_busy)
return -EBUSY;
save_flags = tr->trace_flags;
/* non overwrite screws up the latency tracers */
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
tr->max_latency = 0;
irqsoff_trace = tr;
/* make sure that the tracer is visible */
smp_wmb();
tracing_reset_online_cpus(&tr->trace_buffer);
/* (1) If function-trace mode is used to record the trace during the maximum-latency window:
global_ops->func = irqsoff_tracer_call()
*/
ftrace_init_array_ops(tr, irqsoff_tracer_call);
/* Only toplevel instance supports graph tracing */
/* (2) Register in either function-trace mode or function_graph-trace mode;
the default is function-trace mode
*/
if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
is_graph(tr))))
printk(KERN_ERR "failed to start irqsoff tracer\n");
irqsoff_busy = true;
return 0;
}
↓
start_irqsoff_tracer() -> register_irqsoff_function()
static int register_irqsoff_function(struct trace_array *tr, int graph, int set)
{
int ret;
/* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION)))
return 0;
if (graph)
/* (2.1) Register in function_graph-trace mode */
ret = register_ftrace_graph(&irqsoff_graph_return,
&irqsoff_graph_entry);
else
/* (2.2) Register in function-trace mode */
ret = register_ftrace_function(tr->ops);
if (!ret)
function_enabled = true;
return ret;
}
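Which of the two registration paths above is taken can be chosen from user space through the display-graph tracer option; a hedged example, assuming the option is present on this kernel (it is only compiled in with CONFIG_FUNCTION_GRAPH_TRACER):
# echo irqsoff > current_tracer
# echo 1 > options/display-graph    # record and display the latency trace as a call graph
# echo 0 > options/display-graph    # back to plain function-trace mode (the default)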
When recording trace data, the per_cpu(tracing_cpu, cpu) switch has to be checked. Taking function-trace mode as an example, let us look at irqsoff_tracer_call():
static void
irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
{
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
if (!func_prolog_dec(tr, &data, &flags))
return;
trace_function(tr, ip, parent_ip, flags, preempt_count());
atomic_dec(&data->disabled);
}
↓
static int func_prolog_dec(struct trace_array *tr,
struct trace_array_cpu **data,
unsigned long *flags)
{
long disabled;
int cpu;
/*
* Does not matter if we preempt. We test the flags
* afterward, to see if irqs are disabled or not.
* If we preempt and get a false positive, the flags
* test will fail.
*/
cpu = raw_smp_processor_id();
/* Check the tracing_cpu switch, which is controlled by trace_hardirqs_off()/trace_hardirqs_on() */
if (likely(!per_cpu(tracing_cpu, cpu)))
return 0;
local_save_flags(*flags);
/*
* Slight chance to get a false positive on tracing_cpu,
* although I'm starting to think there isn't a chance.
* Leave this for now just to be paranoid.
*/
if (!irqs_disabled_flags(*flags) && !preempt_count())
return 0;
*data = per_cpu_ptr(tr->trace_buffer.data, cpu);
disabled = atomic_inc_return(&(*data)->disabled);
if (likely(disabled == 1))
return 1;
atomic_dec(&(*data)->disabled);
return 0;
}
3.2、Data storage
Its trace provides several pieces of information:
- 1、the maximum irqs-off duration: the latency;
- 2、all function-trace entries recorded during that longest irqs-off window;
- 3、the call-stack information at the final irqs-on point;
For how the data is actually stored, see section "3.1, Instrumentation".
3.3、Data readout
The default data format of the irqsoff tracer, as read from the trace file, is shown below (it displays the maximum latency together with the detailed function calls made during that window):
# cat trace
# tracer: irqsoff
#
# irqsoff latency trace v1.1.5 on 4.4.21-g6678ed0-01979-g8d8eeb5-cIcd1f2d8-dirty
# --------------------------------------------------------------------
# latency: 1703 us, #184/184, CPU#3 | (M:preempt VP:0, KP:0, SP:0 HP:0 #P:8)
# -----------------
# | task: swapper/3-0 (uid:0 nice:0 policy:0 rt_prio:0)
# -----------------
# => started at: lpm_cpuidle_enter
# => ended at: cpuidle_enter_state
#
#
# _------=> CPU#
# / _-----=> irqs-off
# | / _----=> need-resched
# || / _---=> hardirq/softirq
# ||| / _--=> preempt-depth
# |||| / delay
# cmd pid ||||| time | caller
# \ / ||||| \ | /
<idle>-0 3dn.1 3us : psci_enter_sleep <-lpm_cpuidle_enter
<idle>-0 3dn.2 9us : update_debug_pc_event <-psci_enter_sleep
<idle>-0 3dn.2 11us : _raw_spin_lock <-update_debug_pc_event
<idle>-0 3dn.2 13us : preempt_count_add <-_raw_spin_lock
<idle>-0 3dn.3 17us : do_raw_spin_trylock <-_raw_spin_lock
<idle>-0 3dn.3 20us : _raw_spin_unlock <-update_debug_pc_event
<idle>-0 3dn.3 23us : do_raw_spin_unlock <-_raw_spin_unlock
<idle>-0 3dn.3 25us : preempt_count_sub <-_raw_spin_unlock
<idle>-0 3dn.2 28us : ktime_get <-lpm_cpuidle_enter
<idle>-0 3dn.2 33us : arch_counter_read <-ktime_get
<idle>-0 3dn.2 35us : lpm_stats_cpu_exit <-lpm_cpuidle_enter
<idle>-0 3dn.2 37us : update_level_stats.part.1 <-lpm_stats_cpu_exit
<idle>-0 3dn.2 41us : cluster_unprepare <-lpm_cpuidle_enter
<idle>-0 3dn.2 43us : _raw_spin_lock <-cluster_unprepare
<idle>-0 3dn.2 44us : preempt_count_add <-_raw_spin_lock
<idle>-0 3dn.3 46us : do_raw_spin_trylock <-_raw_spin_lock
<idle>-0 3dn.3 50us : lpm_stats_cluster_exit <-cluster_unprepare
<idle>-0 3dn.3 52us : update_level_stats.part.1 <-lpm_stats_cluster_exit
<idle>-0 3dn.3 57us : update_debug_pc_event <-cluster_unprepare
<idle>-0 3dn.3 59us : _raw_spin_lock <-update_debug_pc_event
<idle>-0 3dn.3 60us : preempt_count_add <-_raw_spin_lock
<idle>-0 3dn.4 62us : do_raw_spin_trylock <-_raw_spin_lock
<idle>-0 3dn.4 65us : _raw_spin_unlock <-update_debug_pc_event
<idle>-0 3dn.4 66us : do_raw_spin_unlock <-_raw_spin_unlock
<idle>-0 3dn.4 69us : preempt_count_sub <-_raw_spin_unlock
<idle>-0 3dn.3 71us : sched_set_cluster_dstate <-cluster_unprepare
<idle>-0 3dn.3 74us : cpu_cluster_pm_exit <-cluster_unprepare
<idle>-0 3dn.3 76us : _raw_read_lock <-cpu_cluster_pm_exit
<idle>-0 3dn.3 78us : preempt_count_add <-_raw_read_lock
<idle>-0 3dn.4 80us : do_raw_read_trylock <-_raw_read_lock
<idle>-0 3dn.4 82us : cpu_pm_notify <-cpu_cluster_pm_exit
<idle>-0 3dn.4 84us : __raw_notifier_call_chain <-cpu_pm_notify
<idle>-0 3dn.4 86us : notifier_call_chain <-__raw_notifier_call_chain
<idle>-0 3dn.4 88us : gic_cpu_pm_notifier <-notifier_call_chain
<idle>-0 3dn.4 90us : arch_timer_cpu_pm_notify <-notifier_call_chain
<idle>-0 3dn.4 92us : jtag_cpu_pm_callback <-notifier_call_chain
<idle>-0 3dn.4 95us : gladiator_erp_pm_callback <-notifier_call_chain
<idle>-0 3dn.4 97us : cpu_pm_pmu_notify <-notifier_call_chain
<idle>-0 3dn.4 99us : cpu_pm_pmu_common <-cpu_pm_pmu_notify
<idle>-0 3dn.4 102us : cti_cpu_pm_callback <-notifier_call_chain
<idle>-0 3dn.4 104us : coresight_cti_ctx_restore <-cti_cpu_pm_callback
<idle>-0 3dn.4 109us : _raw_spin_lock_irqsave <-coresight_cti_ctx_restore
<idle>-0 3dn.4 111us : preempt_count_add <-_raw_spin_lock_irqsave
<idle>-0 3dn.5 113us!: do_raw_spin_trylock <-_raw_spin_lock_irqsave
<idle>-0 3dn.5 222us : _raw_spin_unlock_irqrestore <-coresight_cti_ctx_restore
<idle>-0 3dn.5 224us : do_raw_spin_unlock <-_raw_spin_unlock_irqrestore
<idle>-0 3dn.5 226us : preempt_count_sub <-_raw_spin_unlock_irqrestore
<idle>-0 3dn.4 228us : _raw_spin_lock_irqsave <-coresight_cti_ctx_restore
<idle>-0 3dn.4 230us : preempt_count_add <-_raw_spin_lock_irqsave
<idle>-0 3dn.5 232us!: do_raw_spin_trylock <-_raw_spin_lock_irqsave
<idle>-0 3dn.5 337us : _raw_spin_unlock_irqrestore <-coresight_cti_ctx_restore
<idle>-0 3dn.5 339us : do_raw_spin_unlock <-_raw_spin_unlock_irqrestore
<idle>-0 3dn.5 341us : preempt_count_sub <-_raw_spin_unlock_irqrestore
<idle>-0 3dn.4 343us : _raw_spin_lock_irqsave <-coresight_cti_ctx_restore
<idle>-0 3dn.4 346us : preempt_count_add <-_raw_spin_lock_irqsave
<idle>-0 3dn.5 348us!: do_raw_spin_trylock <-_raw_spin_lock_irqsave
<idle>-0 3dn.5 492us : _raw_spin_unlock_irqrestore <-coresight_cti_ctx_restore
<idle>-0 3dn.5 494us : do_raw_spin_unlock <-_raw_spin_unlock_irqrestore
<idle>-0 3dn.5 497us : preempt_count_sub <-_raw_spin_unlock_irqrestore
<idle>-0 3dn.4 499us : _raw_spin_lock_irqsave <-coresight_cti_ctx_restore
<idle>-0 3dn.4 501us : preempt_count_add <-_raw_spin_lock_irqsave
<idle>-0 3dn.5 503us+: do_raw_spin_trylock <-_raw_spin_lock_irqsave
<idle>-0 3dn.5 603us : _raw_spin_unlock_irqrestore <-coresight_cti_ctx_restore
<idle>-0 3dn.5 605us : do_raw_spin_unlock <-_raw_spin_unlock_irqrestore
<idle>-0 3dn.5 607us : preempt_count_sub <-_raw_spin_unlock_irqrestore
<idle>-0 3dn.4 613us : fpsimd_cpu_pm_notifier <-notifier_call_chain
<idle>-0 3dn.4 615us : _raw_read_unlock <-cpu_cluster_pm_exit
<idle>-0 3dn.4 617us : do_raw_read_unlock <-_raw_read_unlock
<idle>-0 3dn.4 619us : preempt_count_sub <-_raw_read_unlock
<idle>-0 3dn.3 621us : update_cluster_history <-cluster_unprepare
<idle>-0 3dn.3 623us : ktime_get <-update_cluster_history
<idle>-0 3dn.3 626us : arch_counter_read <-ktime_get
<idle>-0 3dn.3 629us : cluster_unprepare <-cluster_unprepare
<idle>-0 3dn.3 631us : _raw_spin_lock <-cluster_unprepare
<idle>-0 3dn.3 633us : preempt_count_add <-_raw_spin_lock
<idle>-0 3dn.4 635us : do_raw_spin_trylock <-_raw_spin_lock
<idle>-0 3dn.4 638us : _raw_spin_unlock <-cluster_unprepare
<idle>-0 3dn.4 640us : do_raw_spin_unlock <-_raw_spin_unlock
<idle>-0 3dn.4 642us : preempt_count_sub <-_raw_spin_unlock
<idle>-0 3dn.3 644us : _raw_spin_unlock <-cluster_unprepare
<idle>-0 3dn.3 645us : do_raw_spin_unlock <-_raw_spin_unlock
<idle>-0 3dn.3 647us : preempt_count_sub <-_raw_spin_unlock
<idle>-0 3dn.2 649us : tick_broadcast_oneshot_control <-lpm_cpuidle_enter
<idle>-0 3dn.2 651us : __tick_broadcast_oneshot_control <-tick_broadcast_oneshot_control
<idle>-0 3dn.2 653us : _raw_spin_lock <-__tick_broadcast_oneshot_control
<idle>-0 3dn.2 654us : preempt_count_add <-_raw_spin_lock
<idle>-0 3dn.3 657us : do_raw_spin_trylock <-_raw_spin_lock
<idle>-0 3dn.3 660us : clockevents_switch_state <-__tick_broadcast_oneshot_control
<idle>-0 3dn.3 663us : ktime_get <-__tick_broadcast_oneshot_control
<idle>-0 3dn.3 667us : arch_counter_read <-ktime_get
<idle>-0 3dn.3 668us : tick_program_event <-__tick_broadcast_oneshot_control
<idle>-0 3dn.3 670us : clockevents_program_event <-tick_program_event
<idle>-0 3dn.3 672us : ktime_get <-clockevents_program_event
<idle>-0 3dn.3 676us : arch_counter_read <-ktime_get
<idle>-0 3dn.3 678us : arch_timer_set_next_event_virt <-clockevents_program_event
<idle>-0 3dn.3 679us : _raw_spin_unlock <-__tick_broadcast_oneshot_control
<idle>-0 3dn.3 681us : do_raw_spin_unlock <-_raw_spin_unlock
<idle>-0 3dn.3 683us : preempt_count_sub <-_raw_spin_unlock
<idle>-0 3dn.2 685us : cpu_pm_exit <-lpm_cpuidle_enter
<idle>-0 3dn.2 686us : _raw_read_lock <-cpu_pm_exit
<idle>-0 3dn.2 688us : preempt_count_add <-_raw_read_lock
<idle>-0 3dn.3 689us : do_raw_read_trylock <-_raw_read_lock
<idle>-0 3dn.3 691us : cpu_pm_notify <-cpu_pm_exit
<idle>-0 3dn.3 693us : __raw_notifier_call_chain <-cpu_pm_notify
<idle>-0 3dn.3 694us : notifier_call_chain <-__raw_notifier_call_chain
<idle>-0 3dn.3 696us : gic_cpu_pm_notifier <-notifier_call_chain
<idle>-0 3dn.3 698us : gic_cpu_sys_reg_init <-gic_cpu_pm_notifier
<idle>-0 3dn.3 700us : arch_timer_cpu_pm_notify <-notifier_call_chain
<idle>-0 3dn.3 702us : jtag_cpu_pm_callback <-notifier_call_chain
<idle>-0 3dn.3 704us : msm_jtag_restore_state <-jtag_cpu_pm_callback
<idle>-0 3dn.3 708us : msm_jtag_etm_restore_state <-msm_jtag_restore_state
<idle>-0 3dn.3 716us : raw_notifier_call_chain <-msm_jtag_etm_restore_state
<idle>-0 3dn.3 717us!: notifier_call_chain <-raw_notifier_call_chain
<idle>-0 3dn.3 1524us : gladiator_erp_pm_callback <-notifier_call_chain
<idle>-0 3dn.3 1526us : cpu_pm_pmu_notify <-notifier_call_chain
<idle>-0 3dn.3 1529us : armv8pmu_reset <-cpu_pm_pmu_notify
<idle>-0 3dn.3 1531us : cpu_pm_pmu_common <-cpu_pm_pmu_notify
<idle>-0 3dn.3 1533us : cpu_pm_pmu_setup <-cpu_pm_pmu_common
<idle>-0 3dn.3 1538us : rcu_irq_enter <-cpu_pm_pmu_setup
<idle>-0 3dn.3 1542us : rcu_eqs_exit_common.isra.48 <-rcu_irq_enter
<idle>-0 3dn.3 1545us : rcu_try_advance_all_cbs <-rcu_eqs_exit_common.isra.48
<idle>-0 3dn.3 1553us : armpmu_start <-cpu_pm_pmu_setup
<idle>-0 3dn.3 1558us : armpmu_event_set_period <-armpmu_start
<idle>-0 3dn.3 1562us : __rcu_read_lock <-perf_event_update_userpage
<idle>-0 3dn.3 1566us : __rcu_read_unlock <-perf_event_update_userpage
<idle>-0 3dn.3 1568us : armv8pmu_enable_event <-armpmu_start
<idle>-0 3dn.3 1570us : _raw_spin_lock_irqsave <-armv8pmu_enable_event
<idle>-0 3dn.3 1572us : preempt_count_add <-_raw_spin_lock_irqsave
<idle>-0 3dn.4 1574us : do_raw_spin_trylock <-_raw_spin_lock_irqsave
<idle>-0 3dn.4 1576us : _raw_spin_unlock_irqrestore <-armv8pmu_enable_event
<idle>-0 3dn.4 1578us : do_raw_spin_unlock <-_raw_spin_unlock_irqrestore
<idle>-0 3dn.4 1581us : preempt_count_sub <-_raw_spin_unlock_irqrestore
<idle>-0 3dn.3 1584us : rcu_irq_exit <-cpu_pm_pmu_setup
<idle>-0 3dn.3 1586us : rcu_eqs_enter_common.isra.46 <-rcu_irq_exit
<idle>-0 3dn.3 1591us : rcu_irq_enter <-cpu_pm_pmu_setup
<idle>-0 3dn.3 1593us : rcu_eqs_exit_common.isra.48 <-rcu_irq_enter
<idle>-0 3dn.3 1596us : rcu_try_advance_all_cbs <-rcu_eqs_exit_common.isra.48
<idle>-0 3dn.3 1598us : armpmu_start <-cpu_pm_pmu_setup
<idle>-0 3dn.3 1601us : armpmu_event_set_period <-armpmu_start
<idle>-0 3dn.3 1603us : __rcu_read_lock <-perf_event_update_userpage
<idle>-0 3dn.3 1605us : __rcu_read_unlock <-perf_event_update_userpage
<idle>-0 3dn.3 1606us : armv8pmu_enable_event <-armpmu_start
<idle>-0 3dn.3 1608us : _raw_spin_lock_irqsave <-armv8pmu_enable_event
<idle>-0 3dn.3 1611us : preempt_count_add <-_raw_spin_lock_irqsave
<idle>-0 3dn.4 1615us : do_raw_spin_trylock <-_raw_spin_lock_irqsave
<idle>-0 3dn.4 1617us : _raw_spin_unlock_irqrestore <-armv8pmu_enable_event
<idle>-0 3dn.4 1619us : do_raw_spin_unlock <-_raw_spin_unlock_irqrestore
<idle>-0 3dn.4 1622us : preempt_count_sub <-_raw_spin_unlock_irqrestore
<idle>-0 3dn.3 1623us : rcu_irq_exit <-cpu_pm_pmu_setup
<idle>-0 3dn.3 1627us : rcu_eqs_enter_common.isra.46 <-rcu_irq_exit
<idle>-0 3dn.3 1629us : rcu_irq_enter <-cpu_pm_pmu_setup
<idle>-0 3dn.3 1632us : rcu_eqs_exit_common.isra.48 <-rcu_irq_enter
<idle>-0 3dn.3 1634us : rcu_try_advance_all_cbs <-rcu_eqs_exit_common.isra.48
<idle>-0 3dn.3 1637us : armpmu_start <-cpu_pm_pmu_setup
<idle>-0 3dn.3 1639us : armpmu_event_set_period <-armpmu_start
<idle>-0 3dn.3 1642us : __rcu_read_lock <-perf_event_update_userpage
<idle>-0 3dn.3 1644us : __rcu_read_unlock <-perf_event_update_userpage
<idle>-0 3dn.3 1646us : armv8pmu_enable_event <-armpmu_start
<idle>-0 3dn.3 1647us : _raw_spin_lock_irqsave <-armv8pmu_enable_event
<idle>-0 3dn.3 1649us : preempt_count_add <-_raw_spin_lock_irqsave
<idle>-0 3dn.4 1651us : do_raw_spin_trylock <-_raw_spin_lock_irqsave
<idle>-0 3dn.4 1653us : _raw_spin_unlock_irqrestore <-armv8pmu_enable_event
<idle>-0 3dn.4 1655us : do_raw_spin_unlock <-_raw_spin_unlock_irqrestore
<idle>-0 3dn.4 1657us : preempt_count_sub <-_raw_spin_unlock_irqrestore
<idle>-0 3dn.3 1659us : rcu_irq_exit <-cpu_pm_pmu_setup
<idle>-0 3dn.3 1662us : rcu_eqs_enter_common.isra.46 <-rcu_irq_exit
<idle>-0 3dn.3 1665us : armv8pmu_start <-cpu_pm_pmu_common
<idle>-0 3dn.3 1667us : _raw_spin_lock_irqsave <-armv8pmu_start
<idle>-0 3dn.3 1670us : preempt_count_add <-_raw_spin_lock_irqsave
<idle>-0 3dn.4 1672us : do_raw_spin_trylock <-_raw_spin_lock_irqsave
<idle>-0 3dn.4 1674us : _raw_spin_unlock_irqrestore <-armv8pmu_start
<idle>-0 3dn.4 1676us : do_raw_spin_unlock <-_raw_spin_unlock_irqrestore
<idle>-0 3dn.4 1679us : preempt_count_sub <-_raw_spin_unlock_irqrestore
<idle>-0 3dn.3 1680us : cti_cpu_pm_callback <-notifier_call_chain
<idle>-0 3dn.3 1683us : fpsimd_cpu_pm_notifier <-notifier_call_chain
<idle>-0 3dn.3 1684us : _raw_read_unlock <-cpu_pm_exit
<idle>-0 3dn.3 1686us : do_raw_read_unlock <-_raw_read_unlock
<idle>-0 3dn.3 1688us : preempt_count_sub <-_raw_read_unlock
<idle>-0 3dn.2 1691us : sched_set_cpu_cstate <-lpm_cpuidle_enter
<idle>-0 3dn.2 1693us : ktime_get <-lpm_cpuidle_enter
<idle>-0 3dn.2 1697us : arch_counter_read <-ktime_get
<idle>-0 3dn.1 1702us : lpm_cpuidle_enter <-cpuidle_enter_state
<idle>-0 3dn.1 1708us+: trace_hardirqs_on <-cpuidle_enter_state
<idle>-0 3dn.1 1720us : <stack trace>
=> trace_hardirqs_on
=> lpm_cpuidle_enter
=> cpuidle_enter_state
=> cpuidle_enter
=> cpu_startup_entry
=> secondary_start_kernel
=>
When the data is read out through the trace file:
- the detailed function-call entries were recorded with trace_function() or trace_graph_function(), and are parsed with the corresponding methods;
- the call stack at the final irqs-on point was recorded with __trace_stack(), and is likewise parsed with the corresponding method;
- the maximum-latency summary is rendered by tracer->print_header();
On the first read, the header is printed: seq_read() -> s_show() -> tracer->print_header():
static void irqsoff_print_header(struct seq_file *s)
{
trace_default_header(s);
}