perf reuses all of ftrace's trace_event instrumentation points (i.e. tracepoints); the trace data can either be delivered through the ftrace channel or be wrapped as a perf_event for the perf tooling to consume.
The tracepoint events available on a system can be listed with the perf list command; they match the trace_events under /sys/kernel/debug/tracing/events/:
# simpleperf list tracepoint | more
List of tracepoint events:
almk:almk_shrink
almk:almk_vmpressure
asoc:snd_soc_bias_level_done
asoc:snd_soc_bias_level_start
asoc:snd_soc_dapm_connected
asoc:snd_soc_dapm_done
asoc:snd_soc_dapm_path
asoc:snd_soc_dapm_start
asoc:snd_soc_dapm_walk_done
asoc:snd_soc_dapm_widget_event_done
asoc:snd_soc_dapm_widget_event_start
asoc:snd_soc_dapm_widget_power
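As we will see in perf_trace_init() below, a perf_event is bound to one of these tracepoints by passing the event's id (exported in tracefs next to its format file) as attr.config. A minimal user-space sketch, not taken from the kernel or perf sources, with paths and error handling simplified:
/* open a counting perf_event on sched:sched_switch (demo code) */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/sys/kernel/debug/tracing/events/sched/sched_switch/id", "r");
	if (!f || !fgets(buf, sizeof(buf), f))
		return 1;
	fclose(f);

	struct perf_event_attr attr;
	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_TRACEPOINT;          /* handled by the perf_tracepoint pmu */
	attr.size = sizeof(attr);
	attr.config = strtoull(buf, NULL, 10);     /* tracepoint id == trace_event_call->event.type */

	/* monitor the current task on any cpu; perf_event_open() has no glibc wrapper */
	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return 1;

	sleep(1);

	long long count = 0;
	read(fd, &count, sizeof(count));           /* accumulated in perf_swevent_event() */
	printf("sched_switch count: %lld\n", count);
	close(fd);
	return 0;
}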
1. How it works:
When a trace_event is defined, the tracepoint, the trace_event (ftrace) callback and the perf_event callback are all generated together. All the perf side has to do is use this ready-made framework to register the perf_event callback on the tracepoint.
include/trace/trace_event.h:
/* (1) definition of the trace_event_class */
#undef DECLARE_EVENT_CLASS
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
_TRACE_PERF_PROTO(call, PARAMS(proto)); \
static char print_fmt_##call[] = print; \
static struct trace_event_class __used __refdata event_class_##call = { \
.system = TRACE_SYSTEM_STRING, \
.define_fields = trace_event_define_fields_##call, \
.fields = LIST_HEAD_INIT(event_class_##call.fields),\
.raw_init = trace_event_raw_init, \
/* (1.1) trace_event (ftrace) callback function */
.probe = trace_event_raw_event_##call, \
/* (1.2) registration function that hooks the callbacks into the tracepoint */
.reg = trace_event_reg, \
/* (1.3) perf_event part of the definition */
_TRACE_PERF_INIT(call) \
};
/* (2) definition of the trace_event_call */
#undef DEFINE_EVENT
#define DEFINE_EVENT(template, call, proto, args) \
\
static struct trace_event_call __used event_##call = { \
/* (2.1) reference to the trace_event_class */
.class = &event_class_##template, \
/* (2.2) the tracepoint itself */
{ \
.tp = &__tracepoint_##call, \
}, \
.event.funcs = &trace_event_type_funcs_##template, \
.print_fmt = print_fmt_##template, \
.flags = TRACE_EVENT_FL_TRACEPOINT, \
}; \
static struct trace_event_call __used \
__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call
Expansion of the _TRACE_PERF_INIT() macro:
#ifdef CONFIG_PERF_EVENTS
#define _TRACE_PERF_PROTO(call, proto) \
static notrace void \
perf_trace_##call(void *__data, proto);
#define _TRACE_PERF_INIT(call) \
/* (1.3) perf_event callback function: perf_trace_##call() */
.perf_probe = perf_trace_##call,
#else
#define _TRACE_PERF_PROTO(call, proto)
#define _TRACE_PERF_INIT(call)
#endif /* CONFIG_PERF_EVENTS */
Following the perf_event callback further, include/trace/perf.h:
#undef DECLARE_EVENT_CLASS
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
static notrace void \
perf_trace_##call(void *__data, proto) \
{ \
struct trace_event_call *event_call = __data; \
struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
struct trace_event_raw_##call *entry; \
struct pt_regs *__regs; \
u64 __addr = 0, __count = 1; \
struct task_struct *__task = NULL; \
struct hlist_head *head; \
int __entry_size; \
int __data_size; \
int rctx; \
\
__data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
\
head = this_cpu_ptr(event_call->perf_events); \
if (__builtin_constant_p(!__task) && !__task && \
hlist_empty(head)) \
return; \
\
/* (1) compute the length of the trace data to store */
__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
sizeof(u64)); \
__entry_size -= sizeof(u32); \
\
/* (2) grab a tmp buffer to hold the data */
entry = perf_trace_buf_prepare(__entry_size, \
event_call->event.type, &__regs, &rctx); \
if (!entry) \
return; \
\
/* (3) capture the current registers */
perf_fetch_caller_regs(__regs); \
\
/* (4) fill in the placeholders for dynamic fields: offset + size */
tstruct \
\
/* (5) record the trace data into the tmp buffer */
{ assign; } \
\
/* (6) submit the trace data to every perf_event attached to this tracepoint, copying the tmp buffer into each perf_event's ringbuffer */
perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \
__count, __regs, head, __task); \
}
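Note the sizeof(u32) adjustment when computing __entry_size: the raw payload is later emitted into the sample as { u32 size; char data[size]; } (PERF_SAMPLE_RAW), and perf requires that pair as a whole to be u64-aligned. For example, if sizeof(*entry) is 16 and the dynamic part __data_size is 13, then 16 + 13 + 4 = 33 is rounded up to 40 and the u32 is subtracted again, giving __entry_size = 36; together with the 4-byte size header the raw record written to the ringbuffer is 40 bytes, a multiple of 8.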
Next we look at the detailed flow and the data formats.
2. event init
The tracepoint pmu is defined as follows:
static struct pmu perf_tracepoint = {
.task_ctx_nr = perf_sw_context,
.event_init = perf_tp_event_init,
.add = perf_trace_add,
.del = perf_trace_del,
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
.events_across_hotplug = 1,
};
When the perf_event_open() system call creates a new perf_event, pmu->event_init() is invoked.
perf_event_open() -> perf_event_alloc() -> perf_init_event() -> perf_try_init_event() -> pmu->event_init(event) -> perf_tp_event_init():
static int perf_tp_event_init(struct perf_event *event)
{
int err;
/* (1) wrong attr.type: bail out */
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -ENOENT;
/*
* no branch sampling for tracepoint events
*/
/* (2) branch sampling is not supported for tracepoint events */
if (has_branch_stack(event))
return -EOPNOTSUPP;
/* (3) continue the init */
err = perf_trace_init(event);
if (err)
return err;
event->destroy = tp_perf_event_destroy;
return 0;
}
↓
int perf_trace_init(struct perf_event *p_event)
{
struct trace_event_call *tp_event;
u64 event_id = p_event->attr.config;
int ret = -EINVAL;
mutex_lock(&event_mutex);
/* (3.1) walk the trace_event_call list and find the one whose type equals attr.config */
list_for_each_entry(tp_event, &ftrace_events, list) {
if (tp_event->event.type == event_id &&
tp_event->class && tp_event->class->reg &&
try_module_get(tp_event->mod)) {
/* (3.2) continue initialization */
ret = perf_trace_event_init(tp_event, p_event);
if (ret)
module_put(tp_event->mod);
break;
}
}
mutex_unlock(&event_mutex);
return ret;
}
↓
static int perf_trace_event_init(struct trace_event_call *tp_event,
struct perf_event *p_event)
{
int ret;
/* (3.2.1) permission check */
ret = perf_trace_event_perm(tp_event, p_event);
if (ret)
return ret;
/* (3.2.2) register operation */
ret = perf_trace_event_reg(tp_event, p_event);
if (ret)
return ret;
/* (3.2.3) open operation; a no-op for tracepoints */
ret = perf_trace_event_open(p_event);
if (ret) {
perf_trace_event_unreg(p_event);
return ret;
}
return 0;
}
↓
static int perf_trace_event_reg(struct trace_event_call *tp_event,
struct perf_event *p_event)
{
struct hlist_head __percpu *list;
int ret = -ENOMEM;
int cpu;
p_event->tp_event = tp_event;
if (tp_event->perf_refcount++ > 0)
return 0;
list = alloc_percpu(struct hlist_head);
if (!list)
goto fail;
/* (3.2.2.1) initialize the per-cpu list of perf_events hanging off the trace_event_call: tp_event->perf_events
a perf_event that wants this tracepoint's data must be added to the per-cpu list of the cpu it is bound to
*/
for_each_possible_cpu(cpu)
INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
tp_event->perf_events = list;
/* (3.2.2.2) allocate the tmp buffers; they are per-cpu, one set on every cpu
there are 4 contexts: task, softirq, hardirq, nmi.
*/
if (!total_ref_count) {
char __percpu *buf;
int i;
for (i = 0; i < PERF_NR_CONTEXTS; i++) {
buf = (char __percpu *)alloc_percpu(perf_trace_t);
if (!buf)
goto fail;
perf_trace_buf[i] = buf;
}
}
/* (3.2.2.3) register via class->reg(TRACE_REG_PERF_REGISTER) */
ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
if (ret)
goto fail;
total_ref_count++;
return 0;
fail:
if (!total_ref_count) {
int i;
for (i = 0; i < PERF_NR_CONTEXTS; i++) {
free_percpu(perf_trace_buf[i]);
perf_trace_buf[i] = NULL;
}
}
if (!--tp_event->perf_refcount) {
free_percpu(tp_event->perf_events);
tp_event->perf_events = NULL;
}
return ret;
}
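Note that these tmp buffers are shared by every tracepoint perf_event in the system: perf_trace_buf[] holds one per-cpu buffer per recursion context (PERF_NR_CONTEXTS = 4: task, softirq, hardirq, nmi) and is reference-counted through total_ref_count, so it is only allocated for the first tracepoint event. The underlying definitions in kernel/trace/trace_event_perf.c are roughly:
/* one scratch buffer per cpu and per recursion context, shared by all
   tracepoint events; PERF_MAX_TRACE_SIZE bounds a single raw record */
typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
	perf_trace_t;

static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];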
tp_event->class->reg() was fixed to trace_event_reg() when the trace_event_class was defined:
int trace_event_reg(struct trace_event_call *call,
enum trace_reg type, void *data)
{
struct trace_event_file *file = data;
WARN_ON(!(call->flags & TRACE_EVENT_FL_TRACEPOINT));
switch (type) {
/* (1) register the trace_event (ftrace) callback on the tracepoint */
case TRACE_REG_REGISTER:
return tracepoint_probe_register(call->tp,
call->class->probe,
file);
case TRACE_REG_UNREGISTER:
tracepoint_probe_unregister(call->tp,
call->class->probe,
file);
return 0;
#ifdef CONFIG_PERF_EVENTS
/* (2) register the perf_event callback on the tracepoint */
case TRACE_REG_PERF_REGISTER:
return tracepoint_probe_register(call->tp,
call->class->perf_probe,
call);
case TRACE_REG_PERF_UNREGISTER:
tracepoint_probe_unregister(call->tp,
call->class->perf_probe,
call);
return 0;
case TRACE_REG_PERF_OPEN:
case TRACE_REG_PERF_CLOSE:
case TRACE_REG_PERF_ADD:
case TRACE_REG_PERF_DEL:
return 0;
#endif
}
return 0;
}
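For comparison, the ftrace path ends up in exactly the same reg() callback: enabling the event through tracefs, e.g. echo 1 > /sys/kernel/debug/tracing/events/sched/sched_switch/enable, eventually calls call->class->reg(call, TRACE_REG_REGISTER, file), which registers the ftrace callback trace_event_raw_event_##call on the same tracepoint. The perf path differs only in which probe it registers (class->perf_probe) and in where the data is delivered.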
After this init-time registration, the perf_event callback has been hooked into the tracepoint. If the tracepoint fires now the callback is invoked, but the trace data has no receiver yet, because no perf_event has been added to the tp_event->perf_events lists.
3. event add/del
Carrying on from the registration above: the perf callback can now be called by the tracepoint, yet the perf_event still receives no data, because at this point it is in the enabled/inactive state; it must become active before data can reach it.
One tracepoint can feed several perf_events, and a perf_event only receives data once it has been added to the tp_event->perf_events per-cpu lists. Starting and stopping a tracepoint perf_event therefore amounts to manipulating those per-cpu lists.
As explained in the previous chapter, "perf_event kernel framework", a task-scoped perf_event is scheduled in and out together with its task, and those scheduler callbacks are ultimately what start and stop the perf_event.
context_switch() -> finish_task_switch() -> perf_event_task_sched_in() -> __perf_event_task_sched_in() -> perf_event_context_sched_in() -> perf_event_sched_in() -> ctx_sched_in() -> ctx_pinned_sched_in()/ctx_flexible_sched_in() -> group_sched_in() -> event_sched_in() -> pmu->add(event, PERF_EF_START) -> perf_trace_add():
int perf_trace_add(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
struct hlist_head __percpu *pcpu_list;
struct hlist_head *list;
pcpu_list = tp_event->perf_events;
if (WARN_ON_ONCE(!pcpu_list))
return -EINVAL;
if (!(flags & PERF_EF_START))
p_event->hw.state = PERF_HES_STOPPED;
/* (1) add the perf_event to the current cpu's list in tp_event->perf_events */
list = this_cpu_ptr(pcpu_list);
hlist_add_head_rcu(&p_event->hlist_entry, list);
/* (2) no-op for tracepoint events */
return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
}
context_switch() -> prepare_task_switch() -> perf_event_task_sched_out() -> __perf_event_task_sched_out() -> perf_event_context_sched_out() -> ctx_sched_out() -> group_sched_out() -> event_sched_out() -> pmu->del() -> perf_trace_del():
void perf_trace_del(struct perf_event *p_event, int flags)
{
struct trace_event_call *tp_event = p_event->tp_event;
/* (1) remove the perf_event from the tp_event->perf_events list */
if (!hlist_unhashed(&p_event->hlist_entry))
hlist_del_rcu(&p_event->hlist_entry);
/* (2) no-op for tracepoint events */
tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
}
4. event data collection
The main job of a perf_event is to provide trace data either as a count or as samples.
The tracepoint pmu delivers its data passively: when the tracepoint fires and the perf callback runs, it adds to the perf_event's count and, if the perf_event has a ringbuffer allocated, also records sample data.
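The difference between the two modes is visible purely in the perf_event_attr passed to perf_event_open(); in the kernel, is_sampling_event() simply checks attr.sample_period != 0. A small sketch (user-space, hypothetical helper, not kernel code):
#include <string.h>
#include <linux/perf_event.h>

/* configure the same tracepoint event either as a pure counter or as a
   sampling event; tracepoint_id comes from .../events/<system>/<name>/id */
static void tp_attr_init(struct perf_event_attr *attr, __u64 tracepoint_id, int sampling)
{
	memset(attr, 0, sizeof(*attr));
	attr->type   = PERF_TYPE_TRACEPOINT;
	attr->size   = sizeof(*attr);
	attr->config = tracepoint_id;

	if (sampling) {
		/* every hit (period 1) writes a PERF_RECORD_SAMPLE with the raw
		   trace data and a timestamp into the mmap'ed ring buffer */
		attr->sample_period = 1;
		attr->sample_type   = PERF_SAMPLE_TIME | PERF_SAMPLE_RAW;
	}
	/* otherwise: counting mode, read() on the fd returns the accumulated count */
}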
4.1 count data
When defining a trace_event, the __perf_count() macro specifies the value added to count each time the tracepoint fires. For example:
DECLARE_EVENT_CLASS(sched_stat_runtime,
TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime),
/* (1) on every hit, add runtime to count */
TP_ARGS(tsk, __perf_count(runtime), vruntime),
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
__field( u64, runtime )
__field( u64, vruntime )
),
TP_fast_assign(
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
__entry->pid = tsk->pid;
__entry->runtime = runtime;
__entry->vruntime = vruntime;
),
TP_printk("comm=%s pid=%d runtime=%Lu [ns] vruntime=%Lu [ns]",
__entry->comm, __entry->pid,
(unsigned long long)__entry->runtime,
(unsigned long long)__entry->vruntime)
);
If __perf_count() is not used, count is incremented by 1 on each tracepoint hit by default.
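This is easy to observe from user space: because sched_stat_runtime passes __perf_count(runtime), a command such as perf stat -e sched:sched_stat_runtime -a sleep 1 reports the accumulated runtime in nanoseconds rather than the number of times the tracepoint fired.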
Now the actual code:
/* (1.1) __perf_addr(): specifies the PERF_SAMPLE_ADDR value for samples */
#undef __perf_addr
#define __perf_addr(a) (__addr = (a))
/* (1.2) __perf_count(): specifies the value added to count */
#undef __perf_count
#define __perf_count(c) (__count = (c))
/* (1.3) __perf_task(): directs the trace data to the perf_events bound to a specific task */
#undef __perf_task
#define __perf_task(t) (__task = (t))
#undef DECLARE_EVENT_CLASS
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
static notrace void \
perf_trace_##call(void *__data, proto) \
{ \
struct trace_event_call *event_call = __data; \
struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
struct trace_event_raw_##call *entry; \
struct pt_regs *__regs; \
/* (1.4) defaults when the macros are not used:
__addr = 0
__count = 1
__task = NULL
*/
u64 __addr = 0, __count = 1; \
struct task_struct *__task = NULL; \
struct hlist_head *head; \
int __entry_size; \
int __data_size; \
int rctx; \
\
/* (2.1) get the length of the dynamic fields */
__data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
\
/* (3) get this tracepoint's per-cpu perf_event list; every perf_event wanting this cpu's data is linked here */
head = this_cpu_ptr(event_call->perf_events); \
if (__builtin_constant_p(!__task) && !__task && \
hlist_empty(head)) \
return; \
\
/* (2.2) compute the total length of the raw data to store: dynamic part + fixed fields */
__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
sizeof(u64)); \
__entry_size -= sizeof(u32); \
\
/* (4) get a tmp buffer of the required length
and determine the current context: task, softirq, hardirq, nmi.
*/
entry = perf_trace_buf_prepare(__entry_size, \
event_call->event.type, &__regs, &rctx); \
if (!entry) \
return; \
\
/* (5) capture the current registers */
perf_fetch_caller_regs(__regs); \
\
/* (6) fill in the placeholders for dynamic fields: offset + size */
tstruct \
\
/* (7) record the trace data into the tmp buffer */
{ assign; } \
\
/* (8) submit the raw data to every perf_event attached to this tracepoint, copying the tmp buffer into each perf_event's ringbuffer
note: this is only the raw data; the full perf sample contains additional fields as well
*/
perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \
__count, __regs, head, __task); \
}
perf_trace_buf_prepare() obtains a tmp buffer. To avoid locking, the buffer is picked from per-cpu storage indexed by the current context (task, softirq, hardirq, nmi) and the current cpu:
void *perf_trace_buf_prepare(int size, unsigned short type,
struct pt_regs **regs, int *rctxp)
{
struct trace_entry *entry;
unsigned long flags;
char *raw_data;
int pc;
BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
"perf buffer not large enough"))
return NULL;
pc = preempt_count();
/* (4.1) get the current context and guard against recursion */
*rctxp = perf_swevent_get_recursion_context();
if (*rctxp < 0)
return NULL;
/* (4.2) get the tmp buffer for saving registers, based on the context */
if (regs)
*regs = this_cpu_ptr(&__perf_regs[*rctxp]);
/* (4.3) get the tmp buffer for the trace data, based on the context */
raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
/* zero the dead bytes from align to not leak stack to user */
memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
entry = (struct trace_entry *)raw_data;
local_save_flags(flags);
/* (4.4) fill in the common fields (pid, flags, preempt_count) of the trace_entry header in the tmp buffer */
tracing_generic_entry_update(entry, flags, pc);
entry->type = type;
return raw_data;
}
↓
int perf_swevent_get_recursion_context(void)
{
struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
return get_recursion_context(swhash->recursion);
}
↓
static inline int get_recursion_context(int *recursion)
{
int rctx;
if (in_nmi())
rctx = 3;
else if (in_irq())
rctx = 2;
else if (in_softirq())
rctx = 1;
else
rctx = 0;
if (recursion[rctx])
return -1;
recursion[rctx]++;
barrier();
return rctx;
}
perf_trace_buf_submit() submits the data to the perf_events waiting on the this_cpu_ptr(event_call->perf_events) list:
static inline void
perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
u64 count, struct pt_regs *regs, void *head,
struct task_struct *task)
{
perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task);
}
↓
void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
struct task_struct *task)
{
struct perf_sample_data data;
struct perf_event *event;
/* (6.1) wrap the raw data */
struct perf_raw_record raw = {
.size = entry_size,
.data = record,
};
/* (6.2) initialize the sample data */
perf_sample_data_init(&data, addr, 0);
data.raw = &raw;
/* (6.3) hand the sample data to each perf_event linked on the this_cpu_ptr(event_call->perf_events) list */
hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
}
/*
* If we got specified a target task, also iterate its context and
* deliver this event there too.
*/
/* (6.4) __perf_task(): also deliver the sample data to perf_events bound to the specified task */
if (task && task != current) {
struct perf_event_context *ctx;
struct trace_entry *entry = record;
rcu_read_lock();
ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
if (!ctx)
goto unlock;
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
if (event->attr.type != PERF_TYPE_TRACEPOINT)
continue;
if (event->attr.config != entry->type)
continue;
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
}
unlock:
rcu_read_unlock();
}
perf_swevent_put_recursion_context(rctx);
}
↓
static void perf_swevent_event(struct perf_event *event, u64 nr,
struct perf_sample_data *data,
struct pt_regs *regs)
{
struct hw_perf_event *hwc = &event->hw;
/* (6.3.1) key point:
this is where the perf_event's count is accumulated
*/
local64_add(nr, &event->count);
if (!regs)
return;
/* (6.3.2) a perf_event not configured for sampling needs no sample data; return */
if (!is_sampling_event(event))
return;
if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
data->period = nr;
return perf_swevent_overflow(event, 1, data, regs);
} else
data->period = event->hw.last_period;
if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
return perf_swevent_overflow(event, 1, data, regs);
if (local64_add_negative(nr, &hwc->period_left))
return;
perf_swevent_overflow(event, 0, data, regs);
}
4.2 sample data
Let's continue with how sample data is reported. There are two reporting modes: period and freq.
In period mode a sample is reported once per period. Three fields are involved:
event->hw.last_period // the period currently in effect
event->hw.period_left // how much of the current period remains, kept as a negative value in the range (-last_period, 0)
event->hw.sample_period // the newest period value; it only takes effect once the previous period completes: last_period = sample_period
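A worked example of this bookkeeping, assuming sample_period = last_period = 4 and nr = 1 per hit: period_left starts at -4; the first three hits move it to -3, -2 and -1, local64_add_negative() returns true each time and the data is dropped. The fourth hit brings period_left to 0, local64_add_negative() returns false, and perf_swevent_overflow(event, 0, ...) runs: perf_swevent_set_period() rewinds period_left back to -4 and returns overflow = 1, so exactly one sample is reported for that period. If a single hit adds a large nr (e.g. runtime in nanoseconds via __perf_count()), several periods may elapse at once; perf_swevent_set_period() then returns that number and the loop in perf_swevent_overflow() reports one sample per elapsed period.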
static void perf_swevent_event(struct perf_event *event, u64 nr,
struct perf_sample_data *data,
struct pt_regs *regs)
{
struct hw_perf_event *hwc = &event->hw;
/* (1) key point:
this is where the perf_event's count is accumulated
*/
local64_add(nr, &event->count);
if (!regs)
return;
/* (2) a perf_event not configured for sampling needs no sample data; return */
if (!is_sampling_event(event))
return;
/* (3.1) PERF_SAMPLE_PERIOD is requested and this is period mode:
report the sample together with the current data->period, ignoring the period limit;
the overflow argument of perf_swevent_overflow() is 1, meaning the period limit is bypassed
*/
if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
data->period = nr;
return perf_swevent_overflow(event, 1, data, regs);
} else
data->period = event->hw.last_period;
/* (3.2) hwc->sample_period == 1 and this is period mode:
every hit is reported and the period never needs rewinding, so overflow = 1
*/
if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
return perf_swevent_overflow(event, 1, data, regs);
/* (4.1) check whether the period has elapsed; if not, drop the data and return */
if (local64_add_negative(nr, &hwc->period_left))
return;
/* (4.2) the period has elapsed: report the data */
perf_swevent_overflow(event, 0, data, regs);
}
↓
static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
struct perf_sample_data *data,
struct pt_regs *regs)
{
struct hw_perf_event *hwc = &event->hw;
int throttle = 0;
/* (4.2.1) overflow = 0 means we got here because the period elapsed, so the period must be rewound;
perf_swevent_set_period() returns how many periods have elapsed since the last report
*/
if (!overflow)
overflow = perf_swevent_set_period(event);
if (hwc->interrupts == MAX_INTERRUPTS)
return;
/* (4.2.2) overflow is the number of elapsed periods, or 1 if we were forced in;
one sample is reported per period
*/
for (; overflow; overflow--) {
if (__perf_event_overflow(event, throttle,
data, regs)) {
/*
* We inhibit the overflow from happening when
* hwc->interrupts == MAX_INTERRUPTS.
*/
break;
}
throttle = 1;
}
}
↓
static int __perf_event_overflow(struct perf_event *event,
int throttle, struct perf_sample_data *data,
struct pt_regs *regs)
{
int events = atomic_read(&event->event_limit);
struct hw_perf_event *hwc = &event->hw;
u64 seq;
int ret = 0;
/*
* Non-sampling counters might still use the PMI to fold short
* hardware counters, ignore those.
*/
/* (4.2.2.1) nothing to report for non-sampling events */
if (unlikely(!is_sampling_event(event)))
return 0;
/* (4.2.2.2) throttling check: count the interrupts in this throttle period and throttle the event once max_samples_per_tick is exceeded */
seq = __this_cpu_read(perf_throttled_seq);
if (seq != hwc->interrupts_seq) {
hwc->interrupts_seq = seq;
hwc->interrupts = 1;
} else {
hwc->interrupts++;
if (unlikely(throttle
&& hwc->interrupts >= max_samples_per_tick)) {
__this_cpu_inc(perf_throttled_count);
hwc->interrupts = MAX_INTERRUPTS;
perf_log_throttle(event, 0);
tick_nohz_full_kick();
ret = 1;
}
}
/* (4.2.2.4) in freq mode, recompute the period from the observed event rate */
if (event->attr.freq) {
u64 now = perf_clock();
s64 delta = now - hwc->freq_time_stamp;
hwc->freq_time_stamp = now;
if (delta > 0 && delta < 2*TICK_NSEC)
perf_adjust_period(event, delta, hwc->last_period, true);
}
/*
* XXX event_limit might not quite work as expected on inherited
* events
*/
/* (4.2.2.5) event_limit handling: once the limit is exhausted, the event is marked pending and disabled from irq_work context */
event->pending_kill = POLL_IN;
if (events && atomic_dec_and_test(&event->event_limit)) {
ret = 1;
event->pending_kill = POLL_HUP;
event->pending_disable = 1;
irq_work_queue(&event->pending);
}
/* (4.2.2.6) this is where the sample data is actually reported:
either the event's own overflow_handler() or the default perf_event_output()
*/
if (event->overflow_handler)
event->overflow_handler(event, data, regs);
else
perf_event_output(event, data, regs);
if (*perf_event_fasync(event) && event->pending_kill) {
event->pending_wakeup = 1;
irq_work_queue(&event->pending);
}
return ret;
}
perf_event_output() is the default function for reporting sample data:
void perf_event_output(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
struct perf_output_handle handle;
struct perf_event_header header;
/* protect the callchain buffers */
rcu_read_lock();
/* (1) compute the total sample size and gather the extra data requested by attr.sample_type */
perf_prepare_sample(&header, data, event, regs);
/* (2) reserve that much space in the event's ringbuffer */
if (perf_output_begin(&handle, event, header.size))
goto exit;
/* (3) write the data into the ringbuffer */
perf_output_sample(&handle, &header, data, event);
/* (4) publish rb->head to rb->user_page->data_head */
perf_output_end(&handle);
exit:
rcu_read_unlock();
}
|→
int perf_output_begin(struct perf_output_handle *handle,
struct perf_event *event, unsigned int size)
{
struct ring_buffer *rb;
unsigned long tail, offset, head;
int have_lost, page_shift;
struct {
struct perf_event_header header;
u64 id;
u64 lost;
} lost_event;
rcu_read_lock();
/*
* For inherited events we send all the output towards the parent.
*/
/* (2.1) inherited child events all write into the parent's ringbuffer */
if (event->parent)
event = event->parent;
rb = rcu_dereference(event->rb);
if (unlikely(!rb))
goto out;
if (unlikely(!rb->nr_pages))
goto out;
handle->rb = rb;
handle->event = event;
have_lost = local_read(&rb->lost);
if (unlikely(have_lost)) {
size += sizeof(lost_event);
if (event->attr.sample_id_all)
size += event->id_header_size;
}
perf_output_get_handle(handle);
/* (2.2) find a chunk of space in the ringbuffer */
do {
/* (2.2.1) the tail pointer is read from the user_page, the head pointer from the rb structure;
the updated head is only published from rb to the user_page after the data has actually been written
*/
tail = READ_ONCE(rb->user_page->data_tail);
offset = head = local_read(&rb->head);
if (!rb->overwrite &&
unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
goto fail;
/*
* The above forms a control dependency barrier separating the
* @tail load above from the data stores below. Since the @tail
* load is required to compute the branch to fail below.
*
* A, matches D; the full memory barrier userspace SHOULD issue
* after reading the data and before storing the new tail
* position.
*
* See perf_output_put_handle().
*/
head += size;
} while (local_cmpxchg(&rb->head, offset, head) != offset);
/*
* We rely on the implied barrier() by local_cmpxchg() to ensure
* none of the data stores below can be lifted up by the compiler.
*/
if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
local_add(rb->watermark, &rb->wakeup);
page_shift = PAGE_SHIFT + page_order(rb);
/* (2.3) compute the page, offset and size of the first chunk of the reserved space */
handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
offset &= (1UL << page_shift) - 1;
handle->addr = rb->data_pages[handle->page] + offset;
handle->size = (1UL << page_shift) - offset;
if (unlikely(have_lost)) {
struct perf_sample_data sample_data;
lost_event.header.size = sizeof(lost_event);
lost_event.header.type = PERF_RECORD_LOST;
lost_event.header.misc = 0;
lost_event.id = event->id;
lost_event.lost = local_xchg(&rb->lost, 0);
perf_event_header__init_id(&lost_event.header,
&sample_data, event);
perf_output_put(handle, lost_event);
perf_event__output_id_sample(event, handle, &sample_data);
}
return 0;
fail:
local_inc(&rb->lost);
perf_output_put_handle(handle);
out:
rcu_read_unlock();
return -ENOSPC;
}
|→
void perf_output_end(struct perf_output_handle *handle)
{
perf_output_put_handle(handle);
rcu_read_unlock();
}
↓
static void perf_output_put_handle(struct perf_output_handle *handle)
{
struct ring_buffer *rb = handle->rb;
unsigned long head;
again:
head = local_read(&rb->head);
/*
* IRQ/NMI can happen here, which means we can miss a head update.
*/
if (!local_dec_and_test(&rb->nest))
goto out;
/*
* Since the mmap() consumer (userspace) can run on a different CPU:
*
* kernel user
*
* if (LOAD ->data_tail) { LOAD ->data_head
* (A) smp_rmb() (C)
* STORE $data LOAD $data
* smp_wmb() (B) smp_mb() (D)
* STORE ->data_head STORE ->data_tail
* }
*
* Where A pairs with D, and B pairs with C.
*
* In our case (A) is a control dependency that separates the load of
* the ->data_tail and the stores of $data. In case ->data_tail
* indicates there is no room in the buffer to store $data we do not.
*
* D needs to be a full barrier since it separates the data READ
* from the tail WRITE.
*
* For B a WMB is sufficient since it separates two WRITEs, and for C
* an RMB is sufficient since it separates two READs.
*
* See perf_output_begin().
*/
smp_wmb(); /* B, matches C */
rb->user_page->data_head = head;
/*
* Now check if we missed an update -- rely on previous implied
* compiler barriers to force a re-read.
*/
if (unlikely(head != local_read(&rb->head))) {
local_inc(&rb->nest);
goto again;
}
if (handle->wakeup != local_read(&rb->wakeup))
perf_output_wakeup(handle);
out:
preempt_enable();
}
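The consumer side of this barrier protocol lives in user space. A sketch (not kernel or perf-tool code) of a reader draining the mmap'ed ring buffer, where ring_base is the start of the mapping and its first page is the struct perf_event_mmap_page:
#include <stdint.h>
#include <linux/perf_event.h>

static void drain_ring(void *ring_base, uint64_t data_size, uint64_t page_size,
		       void (*handle)(struct perf_event_header *))
{
	struct perf_event_mmap_page *meta = ring_base;
	char *data = (char *)ring_base + page_size;
	uint64_t tail = meta->data_tail;
	uint64_t head = meta->data_head;       /* LOAD ->data_head */

	__sync_synchronize();                  /* (C), pairs with the kernel's smp_wmb() (B) */

	while (tail != head) {
		struct perf_event_header *hdr =
			(void *)(data + (tail & (data_size - 1)));
		/* records may wrap past the end of the buffer; a real reader
		   copies them out piecewise, which is omitted here */
		handle(hdr);
		tail += hdr->size;
	}

	__sync_synchronize();                  /* (D), pairs with the kernel's data_tail load (A) */
	meta->data_tail = tail;                /* STORE ->data_tail */
}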
enum perf_event_type documents the record layout for each record type in detail:
enum perf_event_type {
/*
* If perf_event_attr.sample_id_all is set then all event types will
* have the sample_type selected fields related to where/when
* (identity) an event took place (TID, TIME, ID, STREAM_ID, CPU,
* IDENTIFIER) described in PERF_RECORD_SAMPLE below, it will be stashed
* just after the perf_event_header and the fields already present for
* the existing fields, i.e. at the end of the payload. That way a newer
* perf.data file will be supported by older perf tools, with these new
* optional fields being ignored.
*
* struct sample_id {
* { u32 pid, tid; } && PERF_SAMPLE_TID
* { u64 time; } && PERF_SAMPLE_TIME
* { u64 id; } && PERF_SAMPLE_ID
* { u64 stream_id;} && PERF_SAMPLE_STREAM_ID
* { u32 cpu, res; } && PERF_SAMPLE_CPU
* { u64 id; } && PERF_SAMPLE_IDENTIFIER
* } && perf_event_attr::sample_id_all
*
* Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID. The
* advantage of PERF_SAMPLE_IDENTIFIER is that its position is fixed
* relative to header.size.
*/
/*
* The MMAP events record the PROT_EXEC mappings so that we can
* correlate userspace IPs to code. They have the following structure:
*
* struct {
* struct perf_event_header header;
*
* u32 pid, tid;
* u64 addr;
* u64 len;
* u64 pgoff;
* char filename[];
* struct sample_id sample_id;
* };
*/
PERF_RECORD_MMAP = 1,
/*
* struct {
* struct perf_event_header header;
* u64 id;
* u64 lost;
* struct sample_id sample_id;
* };
*/
PERF_RECORD_LOST = 2,
/*
* struct {
* struct perf_event_header header;
*
* u32 pid, tid;
* char comm[];
* struct sample_id sample_id;
* };
*/
PERF_RECORD_COMM = 3,
/*
* struct {
* struct perf_event_header header;
* u32 pid, ppid;
* u32 tid, ptid;
* u64 time;
* struct sample_id sample_id;
* };
*/
PERF_RECORD_EXIT = 4,
/*
* struct {
* struct perf_event_header header;
* u64 time;
* u64 id;
* u64 stream_id;
* struct sample_id sample_id;
* };
*/
PERF_RECORD_THROTTLE = 5,
PERF_RECORD_UNTHROTTLE = 6,
/*
* struct {
* struct perf_event_header header;
* u32 pid, ppid;
* u32 tid, ptid;
* u64 time;
* struct sample_id sample_id;
* };
*/
PERF_RECORD_FORK = 7,
/*
* struct {
* struct perf_event_header header;
* u32 pid, tid;
*
* struct read_format values;
* struct sample_id sample_id;
* };
*/
PERF_RECORD_READ = 8,
/*
* struct {
* struct perf_event_header header;
*
* #
* # Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID.
* # The advantage of PERF_SAMPLE_IDENTIFIER is that its position
* # is fixed relative to header.
* #
*
* { u64 id; } && PERF_SAMPLE_IDENTIFIER
* { u64 ip; } && PERF_SAMPLE_IP
* { u32 pid, tid; } && PERF_SAMPLE_TID
* { u64 time; } && PERF_SAMPLE_TIME
* { u64 addr; } && PERF_SAMPLE_ADDR
* { u64 id; } && PERF_SAMPLE_ID
* { u64 stream_id;} && PERF_SAMPLE_STREAM_ID
* { u32 cpu, res; } && PERF_SAMPLE_CPU
* { u64 period; } && PERF_SAMPLE_PERIOD
*
* { struct read_format values; } && PERF_SAMPLE_READ
*
* { u64 nr,
* u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN
*
* #
* # The RAW record below is opaque data wrt the ABI
* #
* # That is, the ABI doesn't make any promises wrt to
* # the stability of its content, it may vary depending
* # on event, hardware, kernel version and phase of
* # the moon.
* #
* # In other words, PERF_SAMPLE_RAW contents are not an ABI.
* #
*
* { u32 size;
* char data[size];}&& PERF_SAMPLE_RAW
*
* { u64 nr;
* { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK
*
* { u64 abi; # enum perf_sample_regs_abi
* u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER
*
* { u64 size;
* char data[size];
* u64 dyn_size; } && PERF_SAMPLE_STACK_USER
*
* { u64 weight; } && PERF_SAMPLE_WEIGHT
* { u64 data_src; } && PERF_SAMPLE_DATA_SRC
* { u64 transaction; } && PERF_SAMPLE_TRANSACTION
* { u64 abi; # enum perf_sample_regs_abi
* u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
* };
*/
PERF_RECORD_SAMPLE = 9,
/*
* The MMAP2 records are an augmented version of MMAP, they add
* maj, min, ino numbers to be used to uniquely identify each mapping
*
* struct {
* struct perf_event_header header;
*
* u32 pid, tid;
* u64 addr;
* u64 len;
* u64 pgoff;
* u32 maj;
* u32 min;
* u64 ino;
* u64 ino_generation;
* u32 prot, flags;
* char filename[];
* struct sample_id sample_id;
* };
*/
PERF_RECORD_MMAP2 = 10,
/*
* Records that new data landed in the AUX buffer part.
*
* struct {
* struct perf_event_header header;
*
* u64 aux_offset;
* u64 aux_size;
* u64 flags;
* struct sample_id sample_id;
* };
*/
PERF_RECORD_AUX = 11,
/*
* Indicates that instruction trace has started
*
* struct {
* struct perf_event_header header;
* u32 pid;
* u32 tid;
* };
*/
PERF_RECORD_ITRACE_START = 12,
/*
* Records the dropped/lost sample number.
*
* struct {
* struct perf_event_header header;
*
* u64 lost;
* struct sample_id sample_id;
* };
*/
PERF_RECORD_LOST_SAMPLES = 13,
/*
* Records a context switch in or out (flagged by
* PERF_RECORD_MISC_SWITCH_OUT). See also
* PERF_RECORD_SWITCH_CPU_WIDE.
*
* struct {
* struct perf_event_header header;
* struct sample_id sample_id;
* };
*/
PERF_RECORD_SWITCH = 14,
/*
* CPU-wide version of PERF_RECORD_SWITCH with next_prev_pid and
* next_prev_tid that are the next (switching out) or previous
* (switching in) pid/tid.
*
* struct {
* struct perf_event_header header;
* u32 next_prev_pid;
* u32 next_prev_tid;
* struct sample_id sample_id;
* };
*/
PERF_RECORD_SWITCH_CPU_WIDE = 15,
PERF_RECORD_MAX, /* non-ABI */
};
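To tie this back to the tracepoint path: for an event opened with sample_type = PERF_SAMPLE_TIME | PERF_SAMPLE_RAW, each PERF_RECORD_SAMPLE consists of the header, the timestamp, and the { u32 size; char data[size]; } pair, where data is exactly the tmp buffer built by perf_trace_##call() (a struct trace_entry followed by the event's fields). A parsing sketch (user-space, hypothetical struct names, not kernel code); remember that the raw layout is not ABI and has to be decoded using the event's format file in tracefs:
#include <stdint.h>
#include <stdio.h>
#include <linux/perf_event.h>

/* layout of one record when attr.sample_type == PERF_SAMPLE_TIME | PERF_SAMPLE_RAW */
struct tp_sample {
	struct perf_event_header header;   /* header.type == PERF_RECORD_SAMPLE */
	uint64_t time;                     /* PERF_SAMPLE_TIME */
	uint32_t raw_size;                 /* PERF_SAMPLE_RAW: u32 size ... */
	char     raw_data[];               /* ... then the raw trace_event record */
};

static void print_tp_sample(struct perf_event_header *hdr)
{
	if (hdr->type != PERF_RECORD_SAMPLE)
		return;
	struct tp_sample *s = (struct tp_sample *)hdr;
	printf("time=%llu raw_size=%u\n",
	       (unsigned long long)s->time, s->raw_size);
}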