trace event是内核的一种静态插桩机制,内核在关键的位置,有数百个trace event插桩点,用以跟踪内核的关键运行信息,我们在内核代码中看到的以trace_开头的函数就是trace event桩函数。
1.定义trace event
定义一个trace event是用宏TRACE_EVENT()来实现的,该宏被称为内核中最长宏定义,如果感兴趣可以阅读参考文献中的文章,这里我们只简要分析下基本框架。TRACE_EVENT()宏首先定义了一个struct tracepoint类型的全局变量struct tracepoint __tracepoint_##name,结构体struct tracepoint是插桩点的抽象,其定义和成员说明如下:
/* Abstraction of a single instrumentation point; TRACE_EVENT() defines one global instance named __tracepoint_##name. */
struct tracepoint {
const char *name; // tracepoint name
struct static_key key; // whether the tracepoint is enabled; false by default
void (*regfunc)(void); // hook run when a callback is registered -- only a hook, not the registration routine itself
void (*unregfunc)(void); // hook run when a callback is unregistered -- only a hook, not the unregistration routine itself
struct tracepoint_func __rcu *funcs; // callback set; when enabled, the stub function calls each callback in turn
};
全局变量struct tracepoint __tracepoint_##name定义后,各成员的初始化情况是:const char *name字段是TRACE_EVENT()传递进来的名称;struct static_key key为false;regfunc和unregfunc分别取自宏参数reg和unreg(通过TRACE_EVENT()定义时二者均为NULL);funcs为NULL。
/*
 * Macro-body fragment that defines the tracepoint's name string and the
 * global struct tracepoint __tracepoint_##name. The initializer is
 * positional, in struct tracepoint member order:
 * name, key, regfunc, unregfunc, funcs.
 */
static const char __tpstrtab_##name[] \
__attribute__((section("__tracepoints_strings"))) = #name; \
struct tracepoint __tracepoint_##name \
__attribute__((section("__tracepoints"))) = \
{ __tpstrtab_##name, STATIC_KEY_INIT_FALSE, reg, unreg, NULL }; /* initializes every member */ \
TRACE_EVENT()宏还定义了另外两种类型的全局变量:struct trace_event_class event_class_##name和struct trace_event_call event_##name。结构体struct trace_event_class和struct trace_event_call的定义以及成员初始化情况如下:
// Definition of trace_event_class
struct trace_event_class {
const char *system; // subsystem name; event_create_dir() uses it to pick the per-subsystem directory
void *probe;
#ifdef CONFIG_PERF_EVENTS
void *perf_probe;
#endif
// invoked as reg(call, TRACE_REG_REGISTER, file) when an event is enabled
int (*reg)(struct trace_event_call *event,
enum trace_reg type, void *data);
int (*define_fields)(struct trace_event_call *);
struct list_head *(*get_fields)(struct trace_event_call *);
struct list_head fields;
int (*raw_init)(struct trace_event_call *);
};
// Declaration and initialization of the trace_event_class variable.
// Its system name comes from TRACE_SYSTEM_STRING and its reg() hook is trace_event_reg.
static struct trace_event_class __used __refdata event_class_##call = { \
.system = TRACE_SYSTEM_STRING, \
.define_fields = trace_event_define_fields_##call, \
.fields = LIST_HEAD_INIT(event_class_##call.fields),\
.raw_init = trace_event_raw_init, \
.probe = trace_event_raw_event_##call, \
.reg = trace_event_reg, \
_TRACE_PERF_INIT(call) \
};
// Definition of trace_event_call (remaining members are elided in this excerpt)
struct trace_event_call {
struct list_head list; // links the event onto the ftrace_events list
struct trace_event_class *class;
union {
char *name;
struct tracepoint *tp;
};
struct trace_event event;
char *print_fmt;
struct event_filter *filter;
void *mod;
void *data;
int flags; /* static flags of different events */
.........................................................
};
// Declaration and initialization of the trace_event_call variable.
// It points at event_class_##template through .class and at __tracepoint_##call through .tp.
static struct trace_event_call __used event_##call = { \
.class = &event_class_##template, \
{ \
.tp = &__tracepoint_##call, \
}, \
.event.funcs = &trace_event_type_funcs_##call, \
.print_fmt = print_fmt_##call, \
.flags = TRACE_EVENT_FL_TRACEPOINT, \
};
综上,TRACE_EVENT()定义后的结构体关系如下图所示:
2.trace event初始化
在系统启动过程中会把定义的所有trace event组织起来统一管理,并能够通过debugfs向用户空间提供操作接口,位于/sys/kernel/tracing或者/sys/kernel/debug/tracing。把trace event组织起来的任务是在start_kernel()-->trace_init()中完成的,函数一直调用到event_trace_enable()。
/*
 * Boot-time entry point for tracing setup, reached from start_kernel().
 * Allocates the tracer buffers and then initializes the trace events.
 */
void __init trace_init(void)
{
if (tracepoint_printk) {
tracepoint_print_iter =
kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
/* allocation failure is not fatal: warn and turn tracepoint_printk off */
if (WARN_ON(!tracepoint_print_iter))
tracepoint_printk = 0;
}
tracer_alloc_buffers();
trace_event_init();
}
/* Called from trace_init(); runs three setup steps in order, ending with event_trace_enable(). */
void __init trace_event_init(void)
{
event_trace_memsetup();
init_ftrace_syscalls();
event_trace_enable();
}
event_trace_enable()中top_trace_array()返回的是static struct trace_array global_trace的指针,然后遍历所有的trace_event_call,将它们加入内核链表ftrace_events,最后调用__trace_early_add_events()函数,参数就是global_trace的指针。
/*
 * Initialize every trace_event_call, link the ones that initialized
 * successfully onto the ftrace_events list, then create their early event
 * files on the top-level trace array.
 * Returns -ENODEV when there is no top-level trace array; the remainder of
 * the function is elided in this excerpt.
 */
static __init int event_trace_enable(void)
{
struct trace_array *tr = top_trace_array();//returns global_trace
struct trace_event_call **iter, *call;
int ret;
if (!tr)
return -ENODEV;
//walk every trace_event_call between __start_ftrace_events and __stop_ftrace_events
for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) {
call = *iter;
ret = event_init(call); //per-event initialization
if (!ret)
list_add(&call->list, &ftrace_events); //add to the ftrace_events list only when init returned 0
}
__trace_early_add_events(tr);
.....................................................................
}
__trace_early_add_events()遍历内核链表ftrace_events中的所有trace_event_call,为每一个trace_event_call创建trace_event_file,并把trace_event_file加入到global_trace的events成员中。
/*
 * Create a trace_event_file on @tr for every trace_event_call on the
 * ftrace_events list. A failure for one event is logged and the walk continues.
 */
static __init void __trace_early_add_events(struct trace_array *tr)
{
struct trace_event_call *call;
int ret;
//walk ftrace_events
list_for_each_entry(call, &ftrace_events, list) {
/* an event with ->mod set triggers a one-time warning and is skipped */
if (WARN_ON_ONCE(call->mod))
continue;
ret = __trace_early_add_new_event(call, tr);
if (ret < 0)
pr_warn("Could not create early event %s\n",
trace_event_name(call));
}
}
/*
 * Thin wrapper around trace_create_new_event() that turns its result into
 * an errno-style code: 0 on success, -ENOMEM when no file was created.
 */
static __init int __trace_early_add_new_event(struct trace_event_call *call, struct trace_array *tr)
{
struct trace_event_file *file;
file = trace_create_new_event(call, tr);
if (!file)
return -ENOMEM;
return 0;
}
/*
 * Allocate a trace_event_file that binds @call to @tr and link it onto
 * tr->events. Returns the new file, or NULL if the allocation fails.
 */
static struct trace_event_file *trace_create_new_event(struct trace_event_call *call,
struct trace_array *tr)
{
struct trace_event_file *file;
//allocate the trace_event_file from the file_cachep cache
file = kmem_cache_alloc(file_cachep, GFP_TRACE);
if (!file)
return NULL;
file->event_call = call;
file->tr = tr;
/* both reference counters start at zero */
atomic_set(&file->sm_ref, 0);
atomic_set(&file->tm_ref, 0);
INIT_LIST_HEAD(&file->triggers);
list_add(&file->list, &tr->events); //add to the events list of the trace array (global_trace on this path)
return file;
}
初始化后的结构体关系,可以简化如下图所示:
3.trace event与debugfs关系建立
debugfs向用户空间提供操作trace event的接口,trace event与debugfs的关系是在event_trace_init()函数中建立的。event_trace_init()中首先创建根目录,并保存到global_trace的成员struct dentry *dir中。
/*
 * Build the filesystem view of the trace events: obtain the tracing root
 * directory, create the available_events node in it, then create the event
 * directories through early_event_add_tracer().
 * Returns -ENODEV when there is no top-level trace array. Parts of the
 * function are elided in this excerpt.
 */
static __init int event_trace_init(void)
{
struct trace_array *tr;
struct dentry *d_tracer;
struct dentry *entry;
int ret;
tr = top_trace_array();
if (!tr)
return -ENODEV;
//create the root directory
d_tracer = tracing_init_dentry();
/* failing to get the root directory is not reported as an error: return 0 */
if (IS_ERR(d_tracer))
return 0;
//create the available_events node under the root directory
entry = tracefs_create_file("available_events", 0444, d_tracer,
tr, &ftrace_avail_fops);
....................................................................
ret = early_event_add_tracer(d_tracer, tr);
.....................................................................
}
/*
 * Create the tracing root directory for global_trace.
 * Most of the body, including the return statement, is elided in this excerpt.
 */
struct dentry *tracing_init_dentry(void)
{
struct trace_array *tr = &global_trace;
................................................................
//create the root directory named "tracing" and store it in global_trace's struct dentry *dir member
tr->dir = debugfs_create_automount("tracing", NULL,
trace_automount, NULL);
................................................................
}
early_event_add_tracer()-->create_event_toplevel_files()在tracing目录下创建events目录,并保存在global_trace的成员struct dentry *event_dir中。
/*
 * Create the top-level event files under @parent and then the per-event
 * directories for @tr. Returns 0 on success or the error from
 * create_event_toplevel_files().
 */
static __init int early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
{
int ret;
/* event_mutex is held for the whole operation */
mutex_lock(&event_mutex);
ret = create_event_toplevel_files(parent, tr);
if (ret)
goto out_unlock;
/* the per-event directories are created with trace_event_sem held for writing */
down_write(&trace_event_sem);
__trace_early_add_event_dirs(tr);
up_write(&trace_event_sem);
out_unlock:
mutex_unlock(&event_mutex);
return ret;
}
/*
 * Create the "events" directory under @parent and record it in tr->event_dir.
 * Returns 0 on success, -ENOMEM if the directory cannot be created.
 * Other steps of the function are elided in this excerpt.
 */
static int create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
{
struct dentry *d_events;
struct dentry *entry;
.............................................................
//create the events directory under the tracing directory
d_events = tracefs_create_dir("events", parent);
if (!d_events) {
pr_warn("Could not create tracefs 'events' directory\n");
return -ENOMEM;
}
.............................................................
//store it in the trace array's struct dentry *event_dir member (global_trace on this path)
tr->event_dir = d_events;
return 0;
}
early_event_add_tracer()-->__trace_early_add_event_dirs()遍历每一个trace_event_file,并为它们创建目录。一个trace_event_file对应一个trace_event_call,一个trace_event_call对应一个trace_event_class,首先要在events目录下创建trace_event_class同名目录(即trace_event_class->system),存放在global_trace的成员struct list_head systems内核链表中,再在trace_event_class同名目录下创建tracepoint同名目录。可以看出,trace_event_class同名目录是tracepoint同名目录的父目录,这是因为多个trace_event_call的trace_event_class名称是可以相同的。在声明trace event的头文件中,开头会重定义宏#define TRACE_SYSTEM xxxxx,那么xxxxx就是该头文件声明的所有trace_event_call->class->system的名称。
/*
 * Create a directory under tr->event_dir for every trace_event_file on
 * tr->events. A failure for one event is logged and the walk continues.
 */
static __init void __trace_early_add_event_dirs(struct trace_array *tr)
{
struct trace_event_file *file;
int ret;
//walk every trace_event_file and create its directory
list_for_each_entry(file, &tr->events, list) {
ret = event_create_dir(tr->event_dir, file);
if (ret < 0)
pr_warn("Could not create directory for event %s\n",
trace_event_name(file->event_call));
}
}
/*
 * Create the directory for one event: first resolve the subsystem directory
 * under @parent, then create a directory named after the tracepoint inside it.
 * Returns -ENOMEM if the subsystem directory is unavailable and -1 if the
 * event directory cannot be created; the rest of the function is elided in
 * this excerpt.
 */
static int event_create_dir(struct dentry *parent, struct trace_event_file *file)
{
struct trace_event_call *call = file->event_call;
struct trace_array *tr = file->tr;
struct list_head *head;
struct dentry *d_events;
const char *name;
int ret;
//create the directory named after the trace_event_class under the events directory
/* when the system name equals TRACE_SYSTEM the event goes directly under @parent instead */
if (strcmp(call->class->system, TRACE_SYSTEM) != 0) {
d_events = event_subsystem_dir(tr, call->class->system, file, parent);
if (!d_events)
return -ENOMEM;
} else
d_events = parent;
name = trace_event_name(call);//tracepoint name
//create the directory named after the tracepoint under the trace_event_class directory
file->dir = tracefs_create_dir(name, d_events);
if (!file->dir) {
pr_warn("Could not create tracefs '%s' directory\n", name);
return -1;
}
...............................................
}
/*
 * Return the directory entry for subsystem @name on @tr, reusing an
 * existing one when the name is already on tr->systems. The creation path
 * for a new subsystem is mostly elided in this excerpt.
 */
static struct dentry *event_subsystem_dir(struct trace_array *tr, const char *name,
struct trace_event_file *file, struct dentry *parent)
{
struct trace_subsystem_dir *dir;
struct event_subsystem *system;
struct dentry *entry;
//a new directory is created only when no entry with the same name is found on the tr->systems list
list_for_each_entry(dir, &tr->systems, list) {
system = dir->subsystem;
if (strcmp(system->name, name) == 0) {
/* reuse: count one more event, attach the file to this subsystem dir, return its entry */
dir->nr_events++;
file->system = dir;
return dir->entry;
}
}
........................................................
dir = kmalloc(sizeof(*dir), GFP_KERNEL);
.........................................................
}
trace event与debugfs的对应关系,可以简化如下图:
4.trace event使能
在内核代码中,经常看到trace_开头的函数,例如trace_binder_transaction,这就是桩函数,桩函数的定义也是在TRACE_EVENT()宏中完成的。在桩函数中首先判断插桩点是否使能,如果没有使能则直接退出,如果使能则依次调用回调函数集合中的回调函数。可以看出要正常运行桩函数需要两个条件:一是插桩点要使能,二是回调函数集合中要有回调函数,这两个条件是在trace event使能时完成的。
/*
 * Declares the tracepoint __tracepoint_##name and defines its inline stub
 * function trace_##name(). The registered callbacks run, via __DO_TRACE(),
 * only when the tracepoint's static key is enabled.
 */
#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
extern struct tracepoint __tracepoint_##name; \
static inline void trace_##name(proto) \
{ \
if (static_key_false(&__tracepoint_##name.key)) /* continue only when enabled */ \
__DO_TRACE(&__tracepoint_##name, \
TP_PROTO(data_proto), \
TP_ARGS(data_args), \
TP_CONDITION(cond),,); \
/* CONFIG_LOCKDEP builds only: dereference funcs under the RCU-sched read lock */ \
if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \
rcu_read_lock_sched_notrace(); \
rcu_dereference_sched(__tracepoint_##name.funcs);\
rcu_read_unlock_sched_notrace(); \
} \
}
/*
 * Runs every callback registered on tracepoint @tp.
 * Returns from the enclosing function immediately when @cond is false.
 * Otherwise it walks the tp->funcs array under the RCU-sched read lock,
 * calling each entry until it reaches one whose ->func is NULL.
 * @prercu and @postrcu are expanded just outside the locked region.
 */
#define __DO_TRACE(tp, proto, args, cond, prercu, postrcu) \
do { \
struct tracepoint_func *it_func_ptr; \
void *it_func; \
void *__data; \
\
if (!(cond)) \
return; \
prercu; \
rcu_read_lock_sched_notrace(); \
it_func_ptr = rcu_dereference_sched((tp)->funcs); \
if (it_func_ptr) { \
do { /* call each callback in the set in turn */ \
it_func = (it_func_ptr)->func; \
__data = (it_func_ptr)->data; \
((void(*)(proto))(it_func))(args); \
} while ((++it_func_ptr)->func); \
} \
rcu_read_unlock_sched_notrace(); \
postrcu; \
} while (0)
使能trace event的操作,就是向trace event目录下的enable节点写1,在内核中对应的接口为event_enable_write()。最终会调用到trace_event_class的reg()接口,对应的函数是trace_event_reg()。trace_event_reg()判断是注册回调(TRACE_REG_REGISTER)则调用函数tracepoint_probe_register()-->tracepoint_probe_register_prio(),tracepoint_probe_register_prio()创建回调结构体struct tracepoint_func,然后通过tracepoint_add_func()将其加入回调函数集合,并使能插桩点。
/* File operations of an event's "enable" node; a write is handled by event_enable_write(). */
static const struct file_operations ftrace_enable_fops = {
.open = tracing_open_generic,
.read = event_enable_read,
.write = event_enable_write,
.llseek = default_llseek,
};
event_enable_write()
----ftrace_event_enable_disable()
----__ftrace_event_enable_disable()
----call->class->reg(call, TRACE_REG_REGISTER, file)
----trace_event_reg()
----tracepoint_probe_register()
----tracepoint_probe_register_prio()
/*
 * Register @probe, with its @data and priority @prio, as a callback on @tp.
 * Returns the result of tracepoint_add_func().
 */
int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe,
void *data, int prio)
{
struct tracepoint_func tp_func;
int ret;
/* the callback set is only modified with tracepoints_mutex held */
mutex_lock(&tracepoints_mutex);
/* describe the callback in a stack-local tracepoint_func and pass it by address */
tp_func.func = probe;
tp_func.data = data;
tp_func.prio = prio;
ret = tracepoint_add_func(tp, &tp_func, prio);
mutex_unlock(&tracepoints_mutex);
return ret;
}
/*
 * Add @func to the callback set of @tp and enable the tracepoint if it is
 * not enabled yet. Expects tracepoints_mutex to be held (see the lockdep
 * expression below). Returns 0 on success or the error encoded by func_add().
 */
static int tracepoint_add_func(struct tracepoint *tp,
struct tracepoint_func *func, int prio)
{
struct tracepoint_func *old, *tp_funcs;
/* run the registration hook, if any, only while the tracepoint is still disabled */
if (tp->regfunc && !static_key_enabled(&tp->key))
tp->regfunc();
tp_funcs = rcu_dereference_protected(tp->funcs,
lockdep_is_held(&tracepoints_mutex));
//add the new callback to the callback set
old = func_add(&tp_funcs, func, prio);
if (IS_ERR(old)) {
/* any error other than -ENOMEM triggers a one-time warning */
WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
return PTR_ERR(old);
}
/* publish the updated set before the static key is switched on */
rcu_assign_pointer(tp->funcs, tp_funcs);
//enable the tracepoint if it is not enabled yet
if (!static_key_enabled(&tp->key))
static_key_slow_inc(&tp->key);
release_probes(old);
return 0;
}
5.参考资料