Linux perf 1.2: tracepoint events

perf reuses all of the instrumentation points (tracepoints) of ftrace's trace_event mechanism: the trace data can either be delivered through the ftrace channel, or be wrapped as a perf_event and consumed by the perf tool.

The tracepoint events available on a system can be listed with the perf list command (simpleperf list on Android); the list matches the trace_events under /sys/kernel/debug/tracing/events/:

# simpleperf list tracepoint | more
List of tracepoint events:
  almk:almk_shrink
  almk:almk_vmpressure
  asoc:snd_soc_bias_level_done
  asoc:snd_soc_bias_level_start
  asoc:snd_soc_dapm_connected
  asoc:snd_soc_dapm_done
  asoc:snd_soc_dapm_path
  asoc:snd_soc_dapm_start
  asoc:snd_soc_dapm_walk_done
  asoc:snd_soc_dapm_widget_event_done
  asoc:snd_soc_dapm_widget_event_start
  asoc:snd_soc_dapm_widget_power

1. How it works:

When a trace_event is defined, the tracepoint, the trace_event callback, and the perf_event callback are all generated together by the defining macros. All that remains is to use the existing framework to register the perf_event callback onto the tracepoint.

include/trace/trace_event.h:

/* (1) Definition of trace_event_class */
#undef DECLARE_EVENT_CLASS
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)  \
_TRACE_PERF_PROTO(call, PARAMS(proto));                 \
static char print_fmt_##call[] = print;                 \
static struct trace_event_class __used __refdata event_class_##call = { \
    .system         = TRACE_SYSTEM_STRING,          \
    .define_fields      = trace_event_define_fields_##call, \
    .fields         = LIST_HEAD_INIT(event_class_##call.fields),\
    .raw_init       = trace_event_raw_init,         \

    /* (1.1) The ftrace trace_event callback */
    .probe          = trace_event_raw_event_##call,     \

    /* (1.2) The registration function that hooks callbacks onto the tracepoint */
    .reg            = trace_event_reg,          \

    /* (1.3) The perf_event part of the definition */
    _TRACE_PERF_INIT(call)                      \
};

/* (2) Definition of trace_event_call */
#undef DEFINE_EVENT
#define DEFINE_EVENT(template, call, proto, args)           \
                                    \
static struct trace_event_call __used event_##call = {          \

    /* (2.1) Reference to the trace_event_class */
    .class          = &event_class_##template,      \

    /* (2.2) The tracepoint itself */
    {                               \
        .tp         = &__tracepoint_##call,     \
    },                              \
    .event.funcs        = &trace_event_type_funcs_##template,   \
    .print_fmt      = print_fmt_##template,         \
    .flags          = TRACE_EVENT_FL_TRACEPOINT,        \
};                                  \
static struct trace_event_call __used                   \
__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call
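
To make these macros concrete, here is a hedged sketch of what they roughly expand to for the sched_stat_runtime event used later in this article (field handling and several members elided; illustrative, not the verbatim preprocessor output):

/* DECLARE_EVENT_CLASS(sched_stat_runtime, ...) expands to -- abridged */
static notrace void perf_trace_sched_stat_runtime(void *__data,
        struct task_struct *tsk, u64 runtime, u64 vruntime);

static struct trace_event_class __used __refdata event_class_sched_stat_runtime = {
    .system     = "sched",                                   /* TRACE_SYSTEM_STRING */
    .probe      = trace_event_raw_event_sched_stat_runtime,  /* ftrace callback */
    .reg        = trace_event_reg,
    .perf_probe = perf_trace_sched_stat_runtime,             /* perf callback */
    /* .define_fields, .fields, .raw_init elided */
};

/* DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime, ...) expands to -- abridged */
static struct trace_event_call __used event_sched_stat_runtime = {
    .class      = &event_class_sched_stat_runtime,
    {
        .tp     = &__tracepoint_sched_stat_runtime,
    },
    .flags      = TRACE_EVENT_FL_TRACEPOINT,
    /* .event.funcs, .print_fmt elided */
};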

Expansion of the _TRACE_PERF_INIT() macro:

#ifdef CONFIG_PERF_EVENTS

#define _TRACE_PERF_PROTO(call, proto)                  \
    static notrace void                     \
    perf_trace_##call(void *__data, proto);

#define _TRACE_PERF_INIT(call)                      \

    /* (1.3) The perf_event callback: perf_trace_##call() */
    .perf_probe     = perf_trace_##call,

#else
#define _TRACE_PERF_PROTO(call, proto)
#define _TRACE_PERF_INIT(call)
#endif /* CONFIG_PERF_EVENTS */

Following the perf_event callback further, in include/trace/perf.h:

#undef DECLARE_EVENT_CLASS
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)  \
static notrace void                         \
perf_trace_##call(void *__data, proto)                  \
{                                   \
    struct trace_event_call *event_call = __data;           \
    struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
    struct trace_event_raw_##call *entry;               \
    struct pt_regs *__regs;                     \
    u64 __addr = 0, __count = 1;                    \
    struct task_struct *__task = NULL;              \
    struct hlist_head *head;                    \
    int __entry_size;                       \
    int __data_size;                        \
    int rctx;                           \
                                    \
    __data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
                                    \
    head = this_cpu_ptr(event_call->perf_events);           \
    if (__builtin_constant_p(!__task) && !__task &&         \
                hlist_empty(head))          \
        return;                         \
                                    \
    /* (1) Compute the length of the trace data to store */
    __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
                 sizeof(u64));              \
    __entry_size -= sizeof(u32);                    \
                                    \

    /* (2) Grab a tmp buffer to stage the data */
    entry = perf_trace_buf_prepare(__entry_size,            \
            event_call->event.type, &__regs, &rctx);    \
    if (!entry)                         \
        return;                         \
                                    \

    /* (3) Capture the current registers */
    perf_fetch_caller_regs(__regs);                 \
                                    \

    /* (4) Fill in the placeholders of the dynamic fields: offset + size */
    tstruct                             \
                                    \

    /* (5) Record the trace data into the tmp buffer */
    { assign; }                         \
                                    \

    /* (6) Submit the trace data to every perf_event attached to this tracepoint, copying the tmp buffer into each perf_event's ringbuffer */
    perf_trace_buf_submit(entry, __entry_size, rctx, __addr,    \
        __count, __regs, head, __task);             \
}

Next, let's walk through the detailed flow and the data formats.

2. event init

The tracepoint pmu is defined as follows:

static struct pmu perf_tracepoint = {
    .task_ctx_nr    = perf_sw_context,

    .event_init = perf_tp_event_init,
    .add        = perf_trace_add,
    .del        = perf_trace_del,
    .start      = perf_swevent_start,
    .stop       = perf_swevent_stop,
    .read       = perf_swevent_read,

    .events_across_hotplug = 1,
};

When the perf_event_open() syscall creates a new perf_event, it ends up calling pmu->event_init().
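
From userspace, a tracepoint event is requested by setting attr.type = PERF_TYPE_TRACEPOINT and putting the tracepoint id into attr.config. A minimal hedged sketch (the sched_switch event and the debugfs path are just illustrative; error handling elided):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
                int cpu, int group_fd, unsigned long flags)
{
    return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int open_tracepoint_event(void)
{
    struct perf_event_attr attr;
    FILE *f;
    int id = 0;

    /* the tracepoint id exported by ftrace */
    f = fopen("/sys/kernel/debug/tracing/events/sched/sched_switch/id", "r");
    fscanf(f, "%d", &id);
    fclose(f);

    memset(&attr, 0, sizeof(attr));
    attr.type = PERF_TYPE_TRACEPOINT;   /* routes event_init to perf_tp_event_init() */
    attr.size = sizeof(attr);
    attr.config = id;                   /* matched against tp_event->event.type below */
    attr.sample_period = 1;             /* sample on every hit */
    attr.sample_type = PERF_SAMPLE_TIME | PERF_SAMPLE_RAW;

    return sys_perf_event_open(&attr, 0, -1, -1, 0);  /* this thread, any cpu */
}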

perf_event_open() -> perf_event_alloc() -> perf_init_event() -> perf_try_init_event() -> pmu->event_init(event) -> perf_tp_event_init():

static int perf_tp_event_init(struct perf_event *event)
{
    int err;

    /* (1) Wrong attr.type: bail out */
    if (event->attr.type != PERF_TYPE_TRACEPOINT)
        return -ENOENT;

    /*
     * no branch sampling for tracepoint events
     */
    /* (2) Branch sampling is not supported for tracepoint events */
    if (has_branch_stack(event))
        return -EOPNOTSUPP;

    /* (3) Continue the init */
    err = perf_trace_init(event);
    if (err)
        return err;

    event->destroy = tp_perf_event_destroy;

    return 0;
}

↓

int perf_trace_init(struct perf_event *p_event)
{
    struct trace_event_call *tp_event;
    u64 event_id = p_event->attr.config;
    int ret = -EINVAL;

    mutex_lock(&event_mutex);

    /* (3.1) Walk the trace_event_call list to find the entry whose type equals attr.config */
    list_for_each_entry(tp_event, &ftrace_events, list) {
        if (tp_event->event.type == event_id &&
            tp_event->class && tp_event->class->reg &&
            try_module_get(tp_event->mod)) {

            /* (3.2) Continue the initialization */
            ret = perf_trace_event_init(tp_event, p_event);
            if (ret)
                module_put(tp_event->mod);
            break;
        }
    }
    mutex_unlock(&event_mutex);

    return ret;
}

↓

static int perf_trace_event_init(struct trace_event_call *tp_event,
                 struct perf_event *p_event)
{
    int ret;

    /* (3.2.1) Permission check */
    ret = perf_trace_event_perm(tp_event, p_event);
    if (ret)
        return ret;

    /* (3.2.2) The register step */
    ret = perf_trace_event_reg(tp_event, p_event);
    if (ret)
        return ret;

    /* (3.2.3) The open step; a no-op for tracepoints */
    ret = perf_trace_event_open(p_event);
    if (ret) {
        perf_trace_event_unreg(p_event);
        return ret;
    }

    return 0;
}

↓

static int perf_trace_event_reg(struct trace_event_call *tp_event,
                struct perf_event *p_event)
{
    struct hlist_head __percpu *list;
    int ret = -ENOMEM;
    int cpu;

    p_event->tp_event = tp_event;
    if (tp_event->perf_refcount++ > 0)
        return 0;

    list = alloc_percpu(struct hlist_head);
    if (!list)
        goto fail;

    /* (3.2.2.1) Initialize the per-cpu perf_event list hanging off the trace_event_call: tp_event->perf_events.
        A perf_event that wants this tracepoint's data must be added to the per-cpu list of the cpu it is bound to.
     */
    for_each_possible_cpu(cpu)
        INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));

    tp_event->perf_events = list;

    /* (3.2.2.2) Allocate the tmp buffers; they are per-cpu, one set on every cpu.
        There are 4 contexts: task, softirq, hardirq, nmi.
     */
    if (!total_ref_count) {
        char __percpu *buf;
        int i;

        for (i = 0; i < PERF_NR_CONTEXTS; i++) {
            buf = (char __percpu *)alloc_percpu(perf_trace_t);
            if (!buf)
                goto fail;

            perf_trace_buf[i] = buf;
        }
    }

    /* (3.2.2.3) Register via class->reg(TRACE_REG_PERF_REGISTER) */
    ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
    if (ret)
        goto fail;

    total_ref_count++;
    return 0;

fail:
    if (!total_ref_count) {
        int i;

        for (i = 0; i < PERF_NR_CONTEXTS; i++) {
            free_percpu(perf_trace_buf[i]);
            perf_trace_buf[i] = NULL;
        }
    }

    if (!--tp_event->perf_refcount) {
        free_percpu(tp_event->perf_events);
        tp_event->perf_events = NULL;
    }

    return ret;
}

tp_event->class->reg() was already fixed to trace_event_reg() when the trace_event_class was defined:

int trace_event_reg(struct trace_event_call *call,
            enum trace_reg type, void *data)
{
    struct trace_event_file *file = data;

    WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT));
    switch (type) {

    /* (1) Register the ftrace trace_event callback onto the tracepoint */
    case TRACE_REG_REGISTER:
        return tracepoint_probe_register(call->tp,
                         call->class->probe,
                         file);
    case TRACE_REG_UNREGISTER:
        tracepoint_probe_unregister(call->tp,
                        call->class->probe,
                        file);
        return 0;

#ifdef CONFIG_PERF_EVENTS

    /* (2) Register the perf_event callback onto the tracepoint */
    case TRACE_REG_PERF_REGISTER:
        return tracepoint_probe_register(call->tp,
                         call->class->perf_probe,
                         call);
    case TRACE_REG_PERF_UNREGISTER:
        tracepoint_probe_unregister(call->tp,
                        call->class->perf_probe,
                        call);
        return 0;
    case TRACE_REG_PERF_OPEN:
    case TRACE_REG_PERF_CLOSE:
    case TRACE_REG_PERF_ADD:
    case TRACE_REG_PERF_DEL:
        return 0;
#endif
    }
    return 0;
}

After the registration above, the perf_event callback is hooked into the tracepoint. From now on the callback runs whenever the tracepoint is hit, but the trace data still has no receiver, because no perf_event has been added to the tp_event->perf_events lists yet.

3. event add/del

Continuing from the previous step: once registered, the perf_event callback can be invoked by the tracepoint, yet the perf_event still receives no data. At this point it is merely in the enabled/inactive state; it has to be active to receive data.

One tracepoint can feed multiple perf_events, but a perf_event only receives data after it is added to the tp_event->perf_events per-cpu lists. Starting/stopping a tracepoint perf_event therefore boils down to manipulating these per-cpu lists.

As explained in the previous chapter on the perf_event kernel framework, a task-scoped perf_event is scheduled in and out together with its task; the callbacks below are what ultimately start and stop the perf_event.

context_switch() -> finish_task_switch() -> perf_event_task_sched_in() -> __perf_event_task_sched_in() -> perf_event_context_sched_in() -> perf_event_sched_in() -> ctx_sched_in() -> ctx_pinned_sched_in()/ctx_flexible_sched_in() -> group_sched_in() -> event_sched_in() -> pmu->add(event, PERF_EF_START) -> perf_trace_add():

int perf_trace_add(struct perf_event *p_event, int flags)
{
    struct trace_event_call *tp_event = p_event->tp_event;
    struct hlist_head __percpu *pcpu_list;
    struct hlist_head *list;

    pcpu_list = tp_event->perf_events;
    if (WARN_ON_ONCE(!pcpu_list))
        return -EINVAL;

    if (!(flags & PERF_EF_START))
        p_event->hw.state = PERF_HES_STOPPED;

    /* (1) Add the perf_event to the current cpu's list in tp_event->perf_events */
    list = this_cpu_ptr(pcpu_list);
    hlist_add_head_rcu(&p_event->hlist_entry, list);

    /* (2) A no-op for tracepoints */
    return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
}

context_switch() -> prepare_task_switch() -> perf_event_task_sched_out() -> __perf_event_task_sched_out() -> perf_event_context_sched_out() -> ctx_sched_out() -> group_sched_out() -> event_sched_out() -> pmu->del() -> perf_trace_del():

void perf_trace_del(struct perf_event *p_event, int flags)
{
    struct trace_event_call *tp_event = p_event->tp_event;

    /* (1) Remove the perf_event from the tp_event->perf_events list */
    if (!hlist_unhashed(&p_event->hlist_entry))
        hlist_del_rcu(&p_event->hlist_entry);

    /* (2) A no-op for tracepoints */
    tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
}

4. event data collection

The main job of a perf_event is to provide count-type or sample-type trace data.

The tracepoint pmu delivers its trace data passively: when the tracepoint is hit, the perf callback is invoked; it accumulates the perf_event's count value and, if the perf_event has a ringbuffer allocated, records the sample data into it.

4.1. count data

When defining a trace_event, the __perf_count() macro specifies the value added to count on each tracepoint hit. For example:

DECLARE_EVENT_CLASS(sched_stat_runtime,

    TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime),

    /* (1) On every hit, runtime is added to count */
    TP_ARGS(tsk, __perf_count(runtime), vruntime),

    TP_STRUCT__entry(
        __array( char,  comm,   TASK_COMM_LEN   )
        __field( pid_t, pid         )
        __field( u64,   runtime         )
        __field( u64,   vruntime            )
    ),

    TP_fast_assign(
        memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
        __entry->pid        = tsk->pid;
        __entry->runtime    = runtime;
        __entry->vruntime   = vruntime;
    ),

    TP_printk("comm=%s pid=%d runtime=%Lu [ns] vruntime=%Lu [ns]",
            __entry->comm, __entry->pid,
            (unsigned long long)__entry->runtime,
            (unsigned long long)__entry->vruntime)
);

If __perf_count() is not used, count is incremented by 1 on each tracepoint hit by default.
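
For context, this event fires from update_curr() in kernel/sched/fair.c, roughly as follows (abridged sketch for this kernel era, surrounding accounting elided):

/* kernel/sched/fair.c, update_curr() -- abridged */
if (entity_is_task(curr)) {
    struct task_struct *curtask = task_of(curr);

    /* every hit adds delta_exec (the __perf_count(runtime) argument) to count */
    trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
}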

Now the actual code:

/* (1.1) The __perf_addr() macro: specifies the PERF_SAMPLE_ADDR value for samples */
#undef __perf_addr
#define __perf_addr(a)  (__addr = (a))

/* (1.2) The __perf_count() macro: specifies the increment added to count */
#undef __perf_count
#define __perf_count(c) (__count = (c))

/* (1.3) The __perf_task() macro: directs the trace data to the perf_events bound to a given task */
#undef __perf_task
#define __perf_task(t)  (__task = (t))

#undef DECLARE_EVENT_CLASS
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)  \
static notrace void                         \
perf_trace_##call(void *__data, proto)                  \
{                                   \
    struct trace_event_call *event_call = __data;           \
    struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
    struct trace_event_raw_##call *entry;               \
    struct pt_regs *__regs;                     \

    /* (1.4) Defaults when the macros above are not used:
        __addr = 0
        __count = 1
        __task = NULL
     */
    u64 __addr = 0, __count = 1;                    \
    struct task_struct *__task = NULL;              \
    struct hlist_head *head;                    \
    int __entry_size;                       \
    int __data_size;                        \
    int rctx;                           \
                                    \
    /* (2.1) Get the length of the dynamic fields */
    __data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
                                    \
    /* (3) Get this tracepoint's perf_event list for the current cpu; every perf_event that wants this cpu's tracepoint data is linked here */
    head = this_cpu_ptr(event_call->perf_events);           \
    if (__builtin_constant_p(!__task) && !__task &&         \
                hlist_empty(head))          \
        return;                         \
                                    \

    /* (2.2) Compute the total length of the raw data to store: dynamic length + fixed fields */
    __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
                 sizeof(u64));              \
    __entry_size -= sizeof(u32);                    \
                                    \
    /* (4) Get a tmp buffer of that length,
        and determine the current context: task, softirq, hardirq, nmi.
     */
    entry = perf_trace_buf_prepare(__entry_size,            \
            event_call->event.type, &__regs, &rctx);    \
    if (!entry)                         \
        return;                         \
                                    \

    /* (5) Capture the current registers */
    perf_fetch_caller_regs(__regs);                 \
                                    \

    /* (6) Fill in the placeholders of the dynamic fields: offset + size */
    tstruct                             \
                                    \

    /* (7) Record the trace data into the tmp buffer */
    { assign; }                         \
                                    \

    /* (8) Submit the raw data to every perf_event attached to this tracepoint, copying the tmp buffer into each perf_event's ringbuffer.
        Note: this is only the raw data; a full perf sample record contains additional fields.
     */
    perf_trace_buf_submit(entry, __entry_size, rctx, __addr,    \
        __count, __regs, head, __task);             \
}

perf_trace_buf_prepare() obtains a tmp buffer. To avoid any locking, the buffer is picked out of a per-cpu variable indexed by the current context (task, softirq, hardirq, nmi) and the current cpu:

void *perf_trace_buf_prepare(int size, unsigned short type,
                 struct pt_regs **regs, int *rctxp)
{
    struct trace_entry *entry;
    unsigned long flags;
    char *raw_data;
    int pc;

    BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));

    if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
            "perf buffer not large enough"))
        return NULL;

    pc = preempt_count();

    /* (4.1) Determine the current context and guard against recursive entry */
    *rctxp = perf_swevent_get_recursion_context();
    if (*rctxp < 0)
        return NULL;

    /* (4.2) Based on the context, get the tmp buffer that holds the regs */
    if (regs)
        *regs = this_cpu_ptr(&__perf_regs[*rctxp]);

    /* (4.3) Based on the context, get the tmp buffer that holds the trace data */
    raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);

    /* zero the dead bytes from align to not leak stack to user */
    memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));

    entry = (struct trace_entry *)raw_data;
    local_save_flags(flags);

    /* (4.4) Fill in the common fields (struct trace_entry) in the tmp trace data buffer */
    tracing_generic_entry_update(entry, flags, pc);
    entry->type = type;

    return raw_data;
}

↓

int perf_swevent_get_recursion_context(void)
{
    struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);

    return get_recursion_context(swhash->recursion);
}

↓

static inline int get_recursion_context(int *recursion)
{
    int rctx;

    if (in_nmi())
        rctx = 3;
    else if (in_irq())
        rctx = 2;
    else if (in_softirq())
        rctx = 1;
    else
        rctx = 0;

    if (recursion[rctx])
        return -1;

    recursion[rctx]++;
    barrier();

    return rctx;
}
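
The matching release runs at the end of perf_tp_event() via perf_swevent_put_recursion_context(), which simply clears the per-context flag again (the inline helper lives in kernel/events/internal.h; shown for completeness):

void perf_swevent_put_recursion_context(int rctx)
{
    struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);

    put_recursion_context(swhash->recursion, rctx);
}

static inline void put_recursion_context(int *recursion, int rctx)
{
    /* make sure all stores into the tmp buffer are done before releasing */
    barrier();
    recursion[rctx]--;
}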

perf_trace_buf_submit() submits the data to the perf_events waiting on the this_cpu_ptr(event_call->perf_events) list:

static inline void
perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
               u64 count, struct pt_regs *regs, void *head,
               struct task_struct *task)
{
    perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task);
}

↓

void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
           struct pt_regs *regs, struct hlist_head *head, int rctx,
           struct task_struct *task)
{
    struct perf_sample_data data;
    struct perf_event *event;

    /* (6.1) Package the raw data */
    struct perf_raw_record raw = {
        .size = entry_size,
        .data = record,
    };

    /* (6.2) Initialize the sample data */
    perf_sample_data_init(&data, addr, 0);
    data.raw = &raw;

    /* (6.3) Deliver the sample data one by one to the perf_events linked on the this_cpu_ptr(event_call->perf_events) list */
    hlist_for_each_entry_rcu(event, head, hlist_entry) {
        if (perf_tp_event_match(event, &data, regs))
            perf_swevent_event(event, count, &data, regs);
    }

    /*
     * If we got specified a target task, also iterate its context and
     * deliver this event there too.
     */
    /* (6.4) If __perf_task() specified a task, also deliver the sample data to the perf_events bound to that task */
    if (task && task != current) {
        struct perf_event_context *ctx;
        struct trace_entry *entry = record;

        rcu_read_lock();
        ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
        if (!ctx)
            goto unlock;

        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
            if (event->attr.type != PERF_TYPE_TRACEPOINT)
                continue;
            if (event->attr.config != entry->type)
                continue;
            if (perf_tp_event_match(event, &data, regs))
                perf_swevent_event(event, count, &data, regs);
        }
unlock:
        rcu_read_unlock();
    }

    perf_swevent_put_recursion_context(rctx);
}

↓

static void perf_swevent_event(struct perf_event *event, u64 nr,
                   struct perf_sample_data *data,
                   struct pt_regs *regs)
{
    struct hw_perf_event *hwc = &event->hw;

    /* (6.3.1) Key point:
        this is where the perf_event's count value is accumulated
     */
    local64_add(nr, &event->count);

    if (!regs)
        return;

    /* (6.3.2) A perf_event with no sampling configured needs no sample data; return */
    if (!is_sampling_event(event))
        return;

    if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
        data->period = nr;
        return perf_swevent_overflow(event, 1, data, regs);
    } else
        data->period = event->hw.last_period;

    if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
        return perf_swevent_overflow(event, 1, data, regs);

    if (local64_add_negative(nr, &hwc->period_left))
        return;

    perf_swevent_overflow(event, 0, data, regs);
}

4.2. sample data

Let's continue with how sample-type data is reported. There are two reporting modes: period and freq.

  • period mode: report once every period worth of events. Three related fields:

    event->hw.last_period // the period currently in effect
    event->hw.period_left // how much of the current period is left; a negative value in the range (-last_period, 0)
    event->hw.sample_period // the newest period value; it only takes effect (last_period = sample_period) once the previous period completes
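
perf_swevent_overflow() below relies on perf_swevent_set_period() for this bookkeeping: it refreshes last_period from sample_period, advances period_left by whole periods, and returns how many periods have elapsed, i.e. how many reports are due. For reference, from kernel/events/core.c of this kernel era:

static int perf_swevent_set_period(struct perf_event *event)
{
    struct hw_perf_event *hwc = &event->hw;
    u64 period = hwc->last_period;
    u64 nr, offset;
    s64 old, val;

    /* the new sample_period takes effect from the next period on */
    hwc->last_period = hwc->sample_period;

again:
    old = val = local64_read(&hwc->period_left);
    if (val < 0)
        return 0;

    /* how many whole periods does the accumulated count cover? */
    nr = div64_u64(period + val, period);
    offset = nr * period;
    val -= offset;
    if (local64_cmpxchg(&hwc->period_left, old, val) != old)
        goto again;

    return nr;
}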

static void perf_swevent_event(struct perf_event *event, u64 nr,
                   struct perf_sample_data *data,
                   struct pt_regs *regs)
{
    struct hw_perf_event *hwc = &event->hw;

    /* (1) Key point:
        this is where the perf_event's count value is accumulated
     */
    local64_add(nr, &event->count);

    if (!regs)
        return;

    /* (2) A perf_event with no sampling configured needs no sample data; return */
    if (!is_sampling_event(event))
        return;

    /* (3.1) PERF_SAMPLE_PERIOD requested and in period mode:
        report the sample data along with the current data->period, ignoring the period limit.
        The overflow argument of perf_swevent_overflow() is 1, meaning the period limit is bypassed
     */
    if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
        data->period = nr;
        return perf_swevent_overflow(event, 1, data, regs);
    } else
        data->period = event->hw.last_period;

    /* (3.2) hwc->sample_period == 1 and in period mode:
        every hit is reported and the period never needs resetting, so overflow = 1
     */
    if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
        return perf_swevent_overflow(event, 1, data, regs);

    /* (4.1) Check whether the period has expired; if not, drop the data and return. */
    if (local64_add_negative(nr, &hwc->period_left))
        return;

    /* (4.2) The period has expired: report the data. */
    perf_swevent_overflow(event, 0, data, regs);
}

↓

static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
                    struct perf_sample_data *data,
                    struct pt_regs *regs)
{
    struct hw_perf_event *hwc = &event->hw;
    int throttle = 0;

    /* (4.2.1) overflow == 0 means we got here because the period expired, so the period must be reset.
        perf_swevent_set_period() returns how many periods have elapsed since the last report
     */
    if (!overflow)
        overflow = perf_swevent_set_period(event);

    if (hwc->interrupts == MAX_INTERRUPTS)
        return;

    /* (4.2.2) overflow is the number of periods elapsed since the last report, or 1 when forced;
        report once per period
     */
    for (; overflow; overflow--) {
        if (__perf_event_overflow(event, throttle,
                        data, regs)) {
            /*
             * We inhibit the overflow from happening when
             * hwc->interrupts == MAX_INTERRUPTS.
             */
            break;
        }
        throttle = 1;
    }
}

↓

static int __perf_event_overflow(struct perf_event *event,
                   int throttle, struct perf_sample_data *data,
                   struct pt_regs *regs)
{
    int events = atomic_read(&event->event_limit);
    struct hw_perf_event *hwc = &event->hw;
    u64 seq;
    int ret = 0;

    /*
     * Non-sampling counters might still use the PMI to fold short
     * hardware counters, ignore those.
     */
    /* (4.2.2.1) Nothing to report for non-sampling events; return */
    if (unlikely(!is_sampling_event(event)))
        return 0;

    /* (4.2.2.2) Throttling: count the reports within the current throttle window; once they reach max_samples_per_tick, mark the event throttled (hwc->interrupts = MAX_INTERRUPTS) */
    seq = __this_cpu_read(perf_throttled_seq);
    if (seq != hwc->interrupts_seq) {
        hwc->interrupts_seq = seq;
        hwc->interrupts = 1;
    } else {
        hwc->interrupts++;
        if (unlikely(throttle
                 && hwc->interrupts >= max_samples_per_tick)) {
            __this_cpu_inc(perf_throttled_count);
            hwc->interrupts = MAX_INTERRUPTS;
            perf_log_throttle(event, 0);
            tick_nohz_full_kick();
            ret = 1;
        }
    }

    /* (4.2.2.4) In freq mode, recompute the period from the observed rate of count events */
    if (event->attr.freq) {
        u64 now = perf_clock();
        s64 delta = now - hwc->freq_time_stamp;

        hwc->freq_time_stamp = now;

        if (delta > 0 && delta < 2*TICK_NSEC)
            perf_adjust_period(event, delta, hwc->last_period, true);
    }

    /*
     * XXX event_limit might not quite work as expected on inherited
     * events
     */

    /* (4.2.2.5) event_limit handling: when the limit is exhausted, flag the event for disabling and queue irq_work to notify userspace (POLL_HUP) */
    event->pending_kill = POLL_IN;
    if (events && atomic_dec_and_test(&event->event_limit)) {
        ret = 1;
        event->pending_kill = POLL_HUP;
        event->pending_disable = 1;
        irq_work_queue(&event->pending);
    }

    /* (4.2.2.6) This is where the sample data is actually reported:
        call the event's own overflow_handler(), or the default perf_event_output()
     */
    if (event->overflow_handler)
        event->overflow_handler(event, data, regs);
    else
        perf_event_output(event, data, regs);

    if (*perf_event_fasync(event) && event->pending_kill) {
        event->pending_wakeup = 1;
        irq_work_queue(&event->pending);
    }

    return ret;
}

perf_event_output() is the default function for reporting sample data:

void perf_event_output(struct perf_event *event,
            struct perf_sample_data *data,
            struct pt_regs *regs)
{
    struct perf_output_handle handle;
    struct perf_event_header header;

    /* protect the callchain buffers */
    rcu_read_lock();

    /* (1) Compute the total sample size, and stash the extra data selected by attr.sample_type */
    perf_prepare_sample(&header, data, event, regs);

    /* (2) Reserve that much space in the event's ringbuffer */
    if (perf_output_begin(&handle, event, header.size))
        goto exit;

    /* (3) Write the data into the ringbuffer */
    perf_output_sample(&handle, &header, data, event);

    /* (4) Publish rb->head into rb->user_page->data_head */
    perf_output_end(&handle);

exit:
    rcu_read_unlock();
}

|→

int perf_output_begin(struct perf_output_handle *handle,
              struct perf_event *event, unsigned int size)
{
    struct ring_buffer *rb;
    unsigned long tail, offset, head;
    int have_lost, page_shift;
    struct {
        struct perf_event_header header;
        u64          id;
        u64          lost;
    } lost_event;

    rcu_read_lock();
    /*
     * For inherited events we send all the output towards the parent.
     */
    /* (2.1) Children created with inherit all share the parent's ringbuffer */
    if (event->parent)
        event = event->parent;

    rb = rcu_dereference(event->rb);
    if (unlikely(!rb))
        goto out;

    if (unlikely(!rb->nr_pages))
        goto out;

    handle->rb    = rb;
    handle->event = event;

    have_lost = local_read(&rb->lost);
    if (unlikely(have_lost)) {
        size += sizeof(lost_event);
        if (event->attr.sample_id_all)
            size += event->id_header_size;
    }

    perf_output_get_handle(handle);

    /* (2.2) Find a chunk of space in the ringbuffer */
    do {
        /* (2.2.1) The tail pointer is read from user_page, the head pointer from the rb struct.
            The head update is only published from the rb struct to user_page after the data write has completed
         */
        tail = READ_ONCE(rb->user_page->data_tail);
        offset = head = local_read(&rb->head);
        if (!rb->overwrite &&
            unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
            goto fail;

        /*
         * The above forms a control dependency barrier separating the
         * @tail load above from the data stores below. Since the @tail
         * load is required to compute the branch to fail below.
         *
         * A, matches D; the full memory barrier userspace SHOULD issue
         * after reading the data and before storing the new tail
         * position.
         *
         * See perf_output_put_handle().
         */

        head += size;
    } while (local_cmpxchg(&rb->head, offset, head) != offset);

    /*
     * We rely on the implied barrier() by local_cmpxchg() to ensure
     * none of the data stores below can be lifted up by the compiler.
     */

    if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
        local_add(rb->watermark, &rb->wakeup);

    page_shift = PAGE_SHIFT + page_order(rb);

    /* (2.3) Compute the first page, the offset within it, and the size of the reserved buffer */
    handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
    offset &= (1UL << page_shift) - 1;
    handle->addr = rb->data_pages[handle->page] + offset;
    handle->size = (1UL << page_shift) - offset;

    if (unlikely(have_lost)) {
        struct perf_sample_data sample_data;

        lost_event.header.size = sizeof(lost_event);
        lost_event.header.type = PERF_RECORD_LOST;
        lost_event.header.misc = 0;
        lost_event.id          = event->id;
        lost_event.lost        = local_xchg(&rb->lost, 0);

        perf_event_header__init_id(&lost_event.header,
                       &sample_data, event);
        perf_output_put(handle, lost_event);
        perf_event__output_id_sample(event, handle, &sample_data);
    }

    return 0;

fail:
    local_inc(&rb->lost);
    perf_output_put_handle(handle);
out:
    rcu_read_unlock();

    return -ENOSPC;
}

|→

void perf_output_end(struct perf_output_handle *handle)
{
    perf_output_put_handle(handle);
    rcu_read_unlock();
}

↓

static void perf_output_put_handle(struct perf_output_handle *handle)
{
    struct ring_buffer *rb = handle->rb;
    unsigned long head;

again:
    head = local_read(&rb->head);

    /*
     * IRQ/NMI can happen here, which means we can miss a head update.
     */

    if (!local_dec_and_test(&rb->nest))
        goto out;

    /*
     * Since the mmap() consumer (userspace) can run on a different CPU:
     *
     *   kernel                             user
     *
     *   if (LOAD ->data_tail) {            LOAD ->data_head
     *                  (A)                 smp_rmb()   (C)
     *      STORE $data                     LOAD $data
     *      smp_wmb()   (B)                 smp_mb()    (D)
     *      STORE ->data_head               STORE ->data_tail
     *   }
     *
     * Where A pairs with D, and B pairs with C.
     *
     * In our case (A) is a control dependency that separates the load of
     * the ->data_tail and the stores of $data. In case ->data_tail
     * indicates there is no room in the buffer to store $data we do not.
     *
     * D needs to be a full barrier since it separates the data READ
     * from the tail WRITE.
     *
     * For B a WMB is sufficient since it separates two WRITEs, and for C
     * an RMB is sufficient since it separates two READs.
     *
     * See perf_output_begin().
     */
    smp_wmb(); /* B, matches C */
    rb->user_page->data_head = head;

    /*
     * Now check if we missed an update -- rely on previous implied
     * compiler barriers to force a re-read.
     */
    if (unlikely(head != local_read(&rb->head))) {
        local_inc(&rb->nest);
        goto again;
    }

    if (handle->wakeup != local_read(&rb->wakeup))
        perf_output_wakeup(handle);

out:
    preempt_enable();
}
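
On the userspace side, the mirror image of this protocol is the consumer loop over the mmap'ed ring: load data_head, issue the read barrier (C), consume records, then publish data_tail behind a full barrier (D). A hedged sketch (struct perf_event_mmap_page is the real ABI; read_record() is a hypothetical parsing helper, record wrap-around is not handled, and meta->data_offset needs a 4.1+ ABI, older kernels start the data area one page after base):

#include <linux/perf_event.h>
#include <stdatomic.h>
#include <stdint.h>

void drain_ring(void *base, uint64_t data_size,
        void (*read_record)(struct perf_event_header *))
{
    struct perf_event_mmap_page *meta = base;
    char *data = (char *)base + meta->data_offset;
    uint64_t tail = meta->data_tail;
    uint64_t head = meta->data_head;

    atomic_thread_fence(memory_order_acquire);  /* (C), pairs with smp_wmb() (B) */

    while (tail < head) {
        struct perf_event_header *hdr =
            (void *)(data + (tail & (data_size - 1)));
        read_record(hdr);
        tail += hdr->size;
    }

    atomic_thread_fence(memory_order_seq_cst);  /* (D), full barrier before the tail store */
    meta->data_tail = tail;
}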

The perf_event_type enum documents in detail how the data of each record type is laid out:

enum perf_event_type {

    /*
     * If perf_event_attr.sample_id_all is set then all event types will
     * have the sample_type selected fields related to where/when
     * (identity) an event took place (TID, TIME, ID, STREAM_ID, CPU,
     * IDENTIFIER) described in PERF_RECORD_SAMPLE below, it will be stashed
     * just after the perf_event_header and the fields already present for
     * the existing fields, i.e. at the end of the payload. That way a newer
     * perf.data file will be supported by older perf tools, with these new
     * optional fields being ignored.
     *
     * struct sample_id {
     *  { u32           pid, tid; } && PERF_SAMPLE_TID
     *  { u64           time;     } && PERF_SAMPLE_TIME
     *  { u64           id;       } && PERF_SAMPLE_ID
     *  { u64           stream_id;} && PERF_SAMPLE_STREAM_ID
     *  { u32           cpu, res; } && PERF_SAMPLE_CPU
     *  { u64           id;   } && PERF_SAMPLE_IDENTIFIER
     * } && perf_event_attr::sample_id_all
     *
     * Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID.  The
     * advantage of PERF_SAMPLE_IDENTIFIER is that its position is fixed
     * relative to header.size.
     */

    /*
     * The MMAP events record the PROT_EXEC mappings so that we can
     * correlate userspace IPs to code. They have the following structure:
     *
     * struct {
     *  struct perf_event_header    header;
     *
     *  u32             pid, tid;
     *  u64             addr;
     *  u64             len;
     *  u64             pgoff;
     *  char                filename[];
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_MMAP            = 1,

    /*
     * struct {
     *  struct perf_event_header    header;
     *  u64             id;
     *  u64             lost;
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_LOST            = 2,

    /*
     * struct {
     *  struct perf_event_header    header;
     *
     *  u32             pid, tid;
     *  char                comm[];
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_COMM            = 3,

    /*
     * struct {
     *  struct perf_event_header    header;
     *  u32             pid, ppid;
     *  u32             tid, ptid;
     *  u64             time;
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_EXIT            = 4,

    /*
     * struct {
     *  struct perf_event_header    header;
     *  u64             time;
     *  u64             id;
     *  u64             stream_id;
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_THROTTLE            = 5,
    PERF_RECORD_UNTHROTTLE          = 6,

    /*
     * struct {
     *  struct perf_event_header    header;
     *  u32             pid, ppid;
     *  u32             tid, ptid;
     *  u64             time;
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_FORK            = 7,

    /*
     * struct {
     *  struct perf_event_header    header;
     *  u32             pid, tid;
     *
     *  struct read_format      values;
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_READ            = 8,

    /*
     * struct {
     *  struct perf_event_header    header;
     *
     *  #
     *  # Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID.
     *  # The advantage of PERF_SAMPLE_IDENTIFIER is that its position
     *  # is fixed relative to header.
     *  #
     *
     *  { u64           id;   } && PERF_SAMPLE_IDENTIFIER
     *  { u64           ip;   } && PERF_SAMPLE_IP
     *  { u32           pid, tid; } && PERF_SAMPLE_TID
     *  { u64           time;     } && PERF_SAMPLE_TIME
     *  { u64           addr;     } && PERF_SAMPLE_ADDR
     *  { u64           id;   } && PERF_SAMPLE_ID
     *  { u64           stream_id;} && PERF_SAMPLE_STREAM_ID
     *  { u32           cpu, res; } && PERF_SAMPLE_CPU
     *  { u64           period;   } && PERF_SAMPLE_PERIOD
     *
     *  { struct read_format    values;   } && PERF_SAMPLE_READ
     *
     *  { u64           nr,
     *    u64           ips[nr];  } && PERF_SAMPLE_CALLCHAIN
     *
     *  #
     *  # The RAW record below is opaque data wrt the ABI
     *  #
     *  # That is, the ABI doesn't make any promises wrt to
     *  # the stability of its content, it may vary depending
     *  # on event, hardware, kernel version and phase of
     *  # the moon.
     *  #
     *  # In other words, PERF_SAMPLE_RAW contents are not an ABI.
     *  #
     *
     *  { u32           size;
     *    char                  data[size];}&& PERF_SAMPLE_RAW
     *
     *  { u64                   nr;
     *        { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK
     *
     *  { u64           abi; # enum perf_sample_regs_abi
     *    u64           regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER
     *
     *  { u64           size;
     *    char          data[size];
     *    u64           dyn_size; } && PERF_SAMPLE_STACK_USER
     *
     *  { u64           weight;   } && PERF_SAMPLE_WEIGHT
     *  { u64           data_src; } && PERF_SAMPLE_DATA_SRC
     *  { u64           transaction; } && PERF_SAMPLE_TRANSACTION
     *  { u64           abi; # enum perf_sample_regs_abi
     *    u64           regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
     * };
     */
    PERF_RECORD_SAMPLE          = 9,

    /*
     * The MMAP2 records are an augmented version of MMAP, they add
     * maj, min, ino numbers to be used to uniquely identify each mapping
     *
     * struct {
     *  struct perf_event_header    header;
     *
     *  u32             pid, tid;
     *  u64             addr;
     *  u64             len;
     *  u64             pgoff;
     *  u32             maj;
     *  u32             min;
     *  u64             ino;
     *  u64             ino_generation;
     *  u32             prot, flags;
     *  char                filename[];
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_MMAP2           = 10,

    /*
     * Records that new data landed in the AUX buffer part.
     *
     * struct {
     *  struct perf_event_header    header;
     *
     *  u64             aux_offset;
     *  u64             aux_size;
     *  u64             flags;
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_AUX             = 11,

    /*
     * Indicates that instruction trace has started
     *
     * struct {
     *  struct perf_event_header    header;
     *  u32             pid;
     *  u32             tid;
     * };
     */
    PERF_RECORD_ITRACE_START        = 12,

    /*
     * Records the dropped/lost sample number.
     *
     * struct {
     *  struct perf_event_header    header;
     *
     *  u64             lost;
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_LOST_SAMPLES        = 13,

    /*
     * Records a context switch in or out (flagged by
     * PERF_RECORD_MISC_SWITCH_OUT). See also
     * PERF_RECORD_SWITCH_CPU_WIDE.
     *
     * struct {
     *  struct perf_event_header    header;
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_SWITCH          = 14,

    /*
     * CPU-wide version of PERF_RECORD_SWITCH with next_prev_pid and
     * next_prev_tid that are the next (switching out) or previous
     * (switching in) pid/tid.
     *
     * struct {
     *  struct perf_event_header    header;
     *  u32             next_prev_pid;
     *  u32             next_prev_tid;
     *  struct sample_id        sample_id;
     * };
     */
    PERF_RECORD_SWITCH_CPU_WIDE     = 15,

    PERF_RECORD_MAX,            /* non-ABI */
};
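
Tying this back to tracepoints: with sample_type = PERF_SAMPLE_TIME | PERF_SAMPLE_RAW, as in the perf_event_open() sketch earlier, a PERF_RECORD_SAMPLE body is just the time followed by the raw payload. A hedged decoding sketch for exactly that layout (any other sample_type bits change the field order per the comment above):

#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>

void decode_sample(struct perf_event_header *hdr)
{
    char *p = (char *)(hdr + 1);
    uint64_t time;
    uint32_t raw_size;

    if (hdr->type != PERF_RECORD_SAMPLE)
        return;

    time = *(uint64_t *)p;      /* PERF_SAMPLE_TIME */
    p += sizeof(uint64_t);

    raw_size = *(uint32_t *)p;  /* PERF_SAMPLE_RAW: u32 size */
    p += sizeof(uint32_t);
    /* p now points at the raw tracepoint record: a struct trace_entry plus
     * the event's fixed and dynamic fields. This layout is not ABI-stable;
     * parse it using the event's format file under
     * /sys/kernel/debug/tracing/events/<system>/<event>/format. */
    printf("time=%llu raw_size=%u\n", (unsigned long long)time, raw_size);
}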