目录
0.前言
本文主要是根据阅码场《Linux内核tracers的实现原理与应用》视频课程在aarch64上的实践。本文同样以blk_update_request函数中的trace_block_rq_complete为例,说明trace event的原理。
kernel版本:5.10
平台:arm64
1. trace event领域模型
注:如上领域模型主要参考trace系列3 - kprobe学习笔记 领域模型,只摘录出trace event相关,实际通过分析trace_event_register函数也可以得出如上的关系图
-
trace_event_class:用于描述trace event的类
-
trace_event_call:是trace_event的封装,会连入全局ftrace_events链表
-
trace_event:主要关联了trace_event_functions结构体, trace_event_functions定义了trace_event的回调,trace_event会连入全局的ftrace_event_list
-
tracepoint:trace_event基础设施,其中funcs指针数组管理了此tracepoint的所有probe回调
-
trace_event_functions:定义了trace_event的回调
/*
 * Output callbacks of a trace_event. All four members have the same
 * trace_print_func type; the member names suggest one callback per output
 * format (default text, raw, hex, binary) -- NOTE(review): confirm against
 * kernel/trace/trace_output.c. The block_rq_complete example in this article
 * only sets .trace.
 */
struct trace_event_functions {
trace_print_func trace;
trace_print_func raw;
trace_print_func hex;
trace_print_func binary;
};
-
trace_array:用于描述trace的最顶层的结构体,目前ftrace_trace_arrays只有一个全局的trace_array即global_trace,可以看出每个trace_event_call对应一个trace_array,trace_array->event_dir指向/sys/kernel/debug/tracing/events目录
-
trace_event_file: 管理kprobe trace event下所有的文件,通过event_call指向trace_event_call,通过system指向trace_subsystem_dir,通过tr指向trace_array,可见trace_event_file, trace_event_call,trace_array是一一对应的,trace_event_file通过list连入trace_array的events链表
-
trace_subsystem_dir: 管理kprobe trace event的目录,通过entry指向管理的目录节点(/sys/kernel/debug/tracing/events/kprobes),通过tr指向trace_array,通过list连入trace_array的systems链表。从上述图示可以看出,trace_subsystem_dir在本例中就表示events/kprobes目录
trace_event_file,trace_array, trace_event_call,trace_subsystem_dir一一对应
1. TRACE_EVENT宏定义
TRACE_EVENT宏主要是通过多次#undef再重新#define的方式,让同一处TRACE_EVENT调用在不同阶段展开出不同的代码。此处只是象征性地列出了几次包含关系,而且有些遗漏,以展现此宏的复杂性,实际不必过分追究此宏的展开,只要在需要的时候通过blk-core.i这个预处理文件查询即可。
- 第一次定义(include/linux/tracepoint.h)
/*
 * Stage 1 (include/linux/tracepoint.h): TRACE_EVENT() only declares the
 * tracepoint. The struct, assign and print arguments are not used here.
 */
#define TRACE_EVENT(name, proto, args, struct, assign, print) \
DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
/*
 * Forwards to __DECLARE_TRACE() with the condition
 * cpu_online(raw_smp_processor_id()) and with a leading "void *__data"
 * prepended to both the prototype and the argument list.
 */
#define DECLARE_TRACE(name, proto, args) \
__DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \
cpu_online(raw_smp_processor_id()), \
PARAMS(void *__data, proto), \
PARAMS(__data, args))
/*
 * Declares everything a caller of the tracepoint needs:
 *  - extern declarations of __traceiter_<name>, the static call tp_func_<name>
 *    and struct tracepoint __tracepoint_<name>;
 *  - trace_<name>(): runs __DO_TRACE() only when the tracepoint's static key
 *    is enabled. Independently of the key, when CONFIG_LOCKDEP is enabled and
 *    cond holds, it dereferences funcs inside a sched-RCU read-side section
 *    (presumably so lockdep can check RCU usage even while the tracepoint is
 *    disabled -- NOTE(review): confirm);
 *  - register_trace_<name>(), register_trace_prio_<name>() and
 *    unregister_trace_<name>(): thin wrappers around the
 *    tracepoint_probe_*() functions for this tracepoint;
 *  - check_trace_callback_type_<name>(): empty body; its parameter type lets
 *    the compiler check a probe's signature;
 *  - trace_<name>_enabled(): returns the state of the static key.
 */
#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
extern int __traceiter_##name(data_proto); \
DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name); \
extern struct tracepoint __tracepoint_##name; \
static inline void trace_##name(proto) \
{ \
if (static_key_false(&__tracepoint_##name.key)) \
__DO_TRACE(name, \
TP_PROTO(data_proto), \
TP_ARGS(data_args), \
TP_CONDITION(cond), 0); \
if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \
rcu_read_lock_sched_notrace(); \
rcu_dereference_sched(__tracepoint_##name.funcs);\
rcu_read_unlock_sched_notrace(); \
} \
} \
__DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args), \
PARAMS(cond), PARAMS(data_proto), PARAMS(data_args)) \
static inline int \
register_trace_##name(void (*probe)(data_proto), void *data) \
{ \
return tracepoint_probe_register(&__tracepoint_##name, \
(void *)probe, data); \
} \
static inline int \
register_trace_prio_##name(void (*probe)(data_proto), void *data,\
int prio) \
{ \
return tracepoint_probe_register_prio(&__tracepoint_##name, \
(void *)probe, data, prio); \
} \
static inline int \
unregister_trace_##name(void (*probe)(data_proto), void *data) \
{ \
return tracepoint_probe_unregister(&__tracepoint_##name,\
(void *)probe, data); \
} \
static inline void \
check_trace_callback_type_##name(void (*cb)(data_proto)) \
{ \
} \
static inline bool \
trace_##name##_enabled(void) \
{ \
return static_key_false(&__tracepoint_##name.key); \
}
#include include/trace/define_trace.h
- 第二次定义(include/trace/define_trace.h)
/*
 * Stage 2 (include/trace/define_trace.h): TRACE_EVENT() is redefined so that
 * the same event description now defines the tracepoint through
 * DEFINE_TRACE(). The tstruct, assign and print arguments are still unused.
 */
#undef TRACE_EVENT
#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
DEFINE_TRACE(name, PARAMS(proto), PARAMS(args))
/* DEFINE_TRACE() is DEFINE_TRACE_FN() with NULL for both _reg and _unreg. */
#define DEFINE_TRACE(name, proto, args) \
DEFINE_TRACE_FN(name, NULL, NULL, PARAMS(proto), PARAMS(args));
/*
 * Emits everything that defines one tracepoint:
 *  - its name string, placed in section "__tracepoints_strings";
 *  - struct tracepoint __tracepoint_<name> in section "__tracepoints", with
 *    the key initialised to false, funcs set to NULL and iterator pointing
 *    at __traceiter_<name>;
 *  - __traceiter_<name>(), which walks the funcs array and calls every probe
 *    with that probe's own data pointer, stopping at the first entry whose
 *    func is NULL;
 *  - the static call tp_func_<name>, defined with __traceiter_<name> as its
 *    target.
 * __traceiter_<name>() does not itself check funcs for NULL.
 */
#define DEFINE_TRACE_FN(_name, _reg, _unreg, proto, args) \
static const char __tpstrtab_##_name[] \
__section("__tracepoints_strings") = #_name; \
extern struct static_call_key STATIC_CALL_KEY(tp_func_##_name); \
int __traceiter_##_name(void *__data, proto); \
struct tracepoint __tracepoint_##_name __used \
__section("__tracepoints") = { \
.name = __tpstrtab_##_name, \
.key = STATIC_KEY_INIT_FALSE, \
.static_call_key = &STATIC_CALL_KEY(tp_func_##_name), \
.static_call_tramp = STATIC_CALL_TRAMP_ADDR(tp_func_##_name), \
.iterator = &__traceiter_##_name, \
.regfunc = _reg, \
.unregfunc = _unreg, \
.funcs = NULL }; \
__TRACEPOINT_ENTRY(_name); \
int __traceiter_##_name(void *__data, proto) \
{ \
struct tracepoint_func *it_func_ptr; \
void *it_func; \
\
it_func_ptr = \
rcu_dereference_raw((&__tracepoint_##_name)->funcs); \
do { \
it_func = (it_func_ptr)->func; \
__data = (it_func_ptr)->data; \
((void(*)(void *, proto))(it_func))(__data, args); \
} while ((++it_func_ptr)->func); \
return 0; \
} \
DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name);
- 第三次定义(include/trace/define_trace.h)
/*
 * Stage 3 (include/trace/define_trace.h): DEFINE_EVENT() is redefined to
 * define the tracepoint through DEFINE_TRACE(). The template argument is not
 * used in this expansion.
 */
#undef DEFINE_EVENT
#define DEFINE_EVENT(template, name, proto, args) \
DEFINE_TRACE(name, PARAMS(proto), PARAMS(args))
//include/linux/tracepoint.h
/* DEFINE_TRACE() is DEFINE_TRACE_FN() with NULL for both _reg and _unreg. */
#define DEFINE_TRACE(name, proto, args) \
DEFINE_TRACE_FN(name, NULL, NULL, PARAMS(proto), PARAMS(args));
//include/linux/tracepoint.h
/*
 * Defines the tracepoint's name string, its struct tracepoint (key false,
 * funcs NULL, iterator = __traceiter_<name>), the iterator function that
 * calls every registered probe until it meets an entry whose func is NULL,
 * and the static call tp_func_<name> targeting that iterator.
 */
#define DEFINE_TRACE_FN(_name, _reg, _unreg, proto, args) \
static const char __tpstrtab_##_name[] \
__section("__tracepoints_strings") = #_name; \
extern struct static_call_key STATIC_CALL_KEY(tp_func_##_name); \
int __traceiter_##_name(void *__data, proto); \
struct tracepoint __tracepoint_##_name __used \
__section("__tracepoints") = { \
.name = __tpstrtab_##_name, \
.key = STATIC_KEY_INIT_FALSE, \
.static_call_key = &STATIC_CALL_KEY(tp_func_##_name), \
.static_call_tramp = STATIC_CALL_TRAMP_ADDR(tp_func_##_name), \
.iterator = &__traceiter_##_name, \
.regfunc = _reg, \
.unregfunc = _unreg, \
.funcs = NULL }; \
__TRACEPOINT_ENTRY(_name); \
int __traceiter_##_name(void *__data, proto) \
{ \
struct tracepoint_func *it_func_ptr; \
void *it_func; \
\
it_func_ptr = \
rcu_dereference_raw((&__tracepoint_##_name)->funcs); \
do { \
it_func = (it_func_ptr)->func; \
__data = (it_func_ptr)->data; \
((void(*)(void *, proto))(it_func))(__data, args); \
} while ((++it_func_ptr)->func); \
return 0; \
} \
DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name);
- 第四次定义(include/trace/define_trace.h)
/*
 * Stage 4 (include/trace/define_trace.h): DECLARE_TRACE() is redefined to
 * DEFINE_TRACE(), so what was a declaration in stage 1 now produces the
 * tracepoint definition.
 */
#undef DECLARE_TRACE
#define DECLARE_TRACE(name, proto, args) \
DEFINE_TRACE(name, PARAMS(proto), PARAMS(args))
//include/linux/tracepoint.h
/* DEFINE_TRACE() is DEFINE_TRACE_FN() with NULL for both _reg and _unreg. */
#define DEFINE_TRACE(name, proto, args) \
DEFINE_TRACE_FN(name, NULL, NULL, PARAMS(proto), PARAMS(args));
//include/linux/tracepoint.h
/*
 * Defines the tracepoint's name string, its struct tracepoint (key false,
 * funcs NULL, iterator = __traceiter_<name>), the iterator function that
 * calls every registered probe until it meets an entry whose func is NULL,
 * and the static call tp_func_<name> targeting that iterator.
 */
#define DEFINE_TRACE_FN(_name, _reg, _unreg, proto, args) \
static const char __tpstrtab_##_name[] \
__section("__tracepoints_strings") = #_name; \
extern struct static_call_key STATIC_CALL_KEY(tp_func_##_name); \
int __traceiter_##_name(void *__data, proto); \
struct tracepoint __tracepoint_##_name __used \
__section("__tracepoints") = { \
.name = __tpstrtab_##_name, \
.key = STATIC_KEY_INIT_FALSE, \
.static_call_key = &STATIC_CALL_KEY(tp_func_##_name), \
.static_call_tramp = STATIC_CALL_TRAMP_ADDR(tp_func_##_name), \
.iterator = &__traceiter_##_name, \
.regfunc = _reg, \
.unregfunc = _unreg, \
.funcs = NULL }; \
__TRACEPOINT_ENTRY(_name); \
int __traceiter_##_name(void *__data, proto) \
{ \
struct tracepoint_func *it_func_ptr; \
void *it_func; \
\
it_func_ptr = \
rcu_dereference_raw((&__tracepoint_##_name)->funcs); \
do { \
it_func = (it_func_ptr)->func; \
__data = (it_func_ptr)->data; \
((void(*)(void *, proto))(it_func))(__data, args); \
} while ((++it_func_ptr)->func); \
return 0; \
} \
DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name);
- TRACE_EVENT宏的其它几次包含与定义
不再展开
2. TRACE_EVENT宏分析
本节主要以block_rq_complete为例,进行说明
2.1 使用TRACE_EVENT宏
下面我们来看一个实际的trace event定义:
// include/trace/events/block.h
TRACE_EVENT(block_rq_complete,
// Event prototype: the parameters of trace_block_rq_complete() and, after a leading void *__data, of the probe trace_event_raw_event_block_rq_complete()
TP_PROTO(struct request *rq, int error, unsigned int nr_bytes),
// Names of those parameters, used when they are forwarded to the probe
TP_ARGS(rq, error, nr_bytes),
// Record layout: becomes struct trace_event_raw_block_rq_complete
TP_STRUCT__entry(
__field( dev_t, dev )
__field( sector_t, sector )
__field( unsigned int, nr_sector )
__field( int, error )
__array( char, rwbs, RWBS_LEN )
__dynamic_array( char, cmd, 1 )
),
// Fills the record fields from the event arguments
TP_fast_assign(
__entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
__entry->sector = blk_rq_pos(rq);
__entry->nr_sector = nr_bytes >> 9;
__entry->error = error;
blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes);
__get_str(cmd)[0] = '\0';
),
// Text format of the record: expands into the trace_seq_printf() call of trace_raw_output_block_rq_complete() and into print_fmt_block_rq_complete. The record itself is committed to the ring buffer by trace_event_buffer_commit() in the probe, not here.
TP_printk("%d,%d %s (%s) %llu + %u [%d]",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->rwbs, __get_str(cmd),
(unsigned long long)__entry->sector,
__entry->nr_sector, __entry->error)
);
2.2 TRACE_EVENT宏展开
如上宏定义展开后,将是一幅恐怖的画卷,在这里我们受kprobe event的启发来思考一下,kprobe event主要是通过注册trace_event_call来创建一个新的trace event,而kprobe被称为动态的trace event,此处是静态的trace event,同为trace event,那么应该是复用相同的机制,静态trace event同样也需要定义trace_event_call。因此我们尝试通过执行make ./block/blk-core.i 看到编译结果,此处只是节选了关键部分:
/* Definition of struct trace_event_raw_block_rq_complete: the ring-buffer record generated from TP_STRUCT__entry */
struct trace_event_raw_block_rq_complete
{
struct trace_entry ent;
dev_t dev;
sector_t sector;
unsigned int nr_sector;
int error;
/* from __array(char, rwbs, RWBS_LEN) */
char rwbs[8];
/* from __dynamic_array(char, cmd, 1): the low 16 bits are used as the byte offset of cmd from the start of the record */
u32 __data_loc_cmd;
/* presumably the start of the variable-length area that holds dynamic arrays -- NOTE(review): confirm */
char __data[0];
};
/*
 * Probe callback of the tracepoint. It is bound to the tracepoint when the
 * trace event is enabled. __data is the trace_event_file that was passed as
 * the data pointer at registration time; the remaining parameters are the
 * event arguments from TP_PROTO.
 */
static __attribute__((__no_instrument_function__))
void trace_event_raw_event_block_rq_complete(void *__data, struct request *rq, int error,
unsigned int nr_bytes)
{
struct trace_event_file *trace_file = __data;
struct trace_event_data_offsets_block_rq_complete __attribute__((__unused__)) __data_offsets;
struct trace_event_buffer fbuffer;
struct trace_event_raw_block_rq_complete *entry;
int __data_size;
/* Bail out early when trace_trigger_soft_disabled() reports true for this event file. */
if (trace_trigger_soft_disabled(trace_file))
return;
/* Size of the dynamic part (the cmd array); also fills in __data_offsets. */
__data_size = trace_event_get_offsets_block_rq_complete(&__data_offsets, rq, error, nr_bytes);
/* Reserve room for the fixed record plus the dynamic part. */
entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + __data_size);
if (!entry)
return;
entry->__data_loc_cmd = __data_offsets.cmd;
/* The braces below hold the expanded TP_fast_assign() body. */
{
entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
entry->sector = blk_rq_pos(rq);
/* bytes >> 9 == bytes / 512 */
entry->nr_sector = nr_bytes >> 9;
entry->error = error;
blk_fill_rwbs(entry->rwbs, rq->cmd_flags, nr_bytes);
/* __get_str(cmd)[0] = '\0': cmd is stored as an empty string */
((char *)((void *)entry + (entry->__data_loc_cmd & 0xffff)))[0] = '\0';;
}
trace_event_buffer_commit(&fbuffer);
};
/*
 * Passes the probe to check_trace_callback_type_block_rq_complete() so the
 * compiler checks the probe's signature against the tracepoint prototype.
 * The checker has an empty body, so this does no work at run time.
 */
static inline __attribute__((__gnu_inline__)) __attribute__((__unused__))
__attribute__((__no_instrument_function__))
void ftrace_test_probe_block_rq_complete(void)
{
check_trace_callback_type_block_rq_complete(trace_event_raw_event_block_rq_complete);
}
/*
 * Text output callback for block_rq_complete records (installed as .trace in
 * trace_event_type_funcs_block_rq_complete). Formats the record found at
 * iter->ent into iter->seq using the TP_printk format plus a trailing newline.
 * Returns the result of trace_raw_output_prep() when that is not
 * TRACE_TYPE_HANDLED, otherwise the result of trace_handle_return().
 */
static __attribute__((__no_instrument_function__))
enum print_line_t trace_raw_output_block_rq_complete(struct trace_iterator *iter, int flags,
struct trace_event *trace_event)
{
struct trace_seq *s = &iter->seq;
struct trace_seq __attribute__((__unused__)) *p = &iter->tmp_seq;
struct trace_event_raw_block_rq_complete *field;
int ret; field = (typeof(field))iter->ent;
ret = trace_raw_output_prep(iter, trace_event);
if (ret != TRACE_TYPE_HANDLED)
return ret;
/* MAJOR()/MINOR() appear expanded as ">> 20" and "& ((1U << 20) - 1)"; __get_str(cmd) as the record base plus the low 16 bits of __data_loc_cmd. */
trace_seq_printf(s, "%d,%d %s (%s) %llu + %u [%d]" "\n",
((unsigned int) ((field->dev) >> 20)),
((unsigned int) ((field->dev) & ((1U << 20) - 1))),
field->rwbs,
((char *)((void *)field + (field->__data_loc_cmd & 0xffff))),
(unsigned long long)field->sector, field->nr_sector,
field->error);
return trace_handle_return(s);
}
/* Output callbacks of this event: only the .trace callback is provided. */
static struct trace_event_functions trace_event_type_funcs_block_rq_complete =
{
.trace = trace_raw_output_block_rq_complete,
};
/* Forward declaration of the perf probe; it is referenced as .perf_probe in event_class_block_rq_complete. */
static __attribute__((__no_instrument_function__))
void perf_trace_block_rq_complete(void *__data, struct request *rq, int error, unsigned int nr_bytes);
/*
 * Print format of the event, referenced as .print_fmt in
 * event_block_rq_complete: the quoted TP_printk format string followed by
 * the stringified argument list, with the record referred to as REC.
 * The string is split into adjacent literals so that no literal spans a
 * line break.
 */
static char print_fmt_block_rq_complete[] = "\"" "%d,%d %s (%s) %llu + %u [%d]" "\", "
"((unsigned int) ((REC->dev) >> 20)), ((unsigned int) ((REC->dev) & ((1U << 20) - 1))), "
"REC->rwbs, __get_str(cmd), (unsigned long long)REC->sector, REC->nr_sector, REC->error";
/*
 * Define the tracepoint. It is placed in section "__tracepoints" and starts
 * out disabled: the static key is zero and funcs is NULL, i.e. no probe is
 * attached yet.
 */
struct tracepoint __tracepoint_block_rq_complete __attribute__((__used__)) __attribute__((__section__("__tracepoints"))) =
{
.name = __tpstrtab_block_rq_complete,
.key = { .enabled = { 0 },
{ .entries = (void *)0UL }
},
.static_call_key = &__SCK__tp_func_block_rq_complete,
/* NULL here -- presumably because this arm64 build has no CONFIG_HAVE_STATIC_CALL; NOTE(review): confirm */
.static_call_tramp = ((void *)0),
.iterator = &__traceiter_block_rq_complete,
.regfunc = ((void *)0),
.unregfunc = ((void *)0),
.funcs = ((void *)0)
};
/*
 * Define the trace_event_class. It ties together the field description
 * array, the init callback (trace_event_raw_init), the registration callback
 * (trace_event_reg) and the two probes: .probe for trace events and
 * .perf_probe for perf.
 */
static struct trace_event_class __attribute__((__used__)) __attribute__((__section__(".ref.data"))) event_class_block_rq_complete = {
.system = str__block__trace_system_name,
.fields_array = trace_event_fields_block_rq_complete,
/* both pointers refer back to .fields itself, i.e. an empty list head */
.fields = { &(event_class_block_rq_complete.fields), &(event_class_block_rq_complete.fields) },
.raw_init = trace_event_raw_init,
.probe = trace_event_raw_event_block_rq_complete,
.reg = trace_event_reg,
.perf_probe = perf_trace_block_rq_complete,
};
/*
 * Define the trace_event_call. It links the event class, the tracepoint, the
 * output callbacks and the print format, and is flagged as
 * tracepoint-backed with TRACE_EVENT_FL_TRACEPOINT.
 */
static struct trace_event_call __attribute__((__used__)) event_block_rq_complete =
{
.class = &event_class_block_rq_complete,
{
// set the tracepoint pointer
.tp = &__tracepoint_block_rq_complete,
},
.event.funcs = &trace_event_type_funcs_block_rq_complete,
.print_fmt = print_fmt_block_rq_complete,
.flags = TRACE_EVENT_FL_TRACEPOINT,
};
/*
 * trace_init->trace_event_init->event_trace_enable registers the
 * trace_event_calls in bulk (each one is essentially a tracepoint) and links
 * every trace_event_call into the global ftrace_events list.
 * The pointer below is placed in section "_ftrace_events" so that the
 * registration code can find this event.
 */
static struct trace_event_call __attribute__((__used__)) __attribute__((__section__("_ftrace_events")))
*__event_block_rq_complete = &event_block_rq_complete;
/* 触发tracepoint probe */
static inline __attribute__( (__gnu_inline__) ) __attribute__( (__unused__) ) __attribute__((__no_instrument_function__) )
void trace_block_rq_complete( struct request *rq, int error, unsigned int nr_bytes )
{
struct tracepoint_func *it_func_ptr;
__data = (it_func_ptr)->data;
__traceiter_block_rq_complete( __data, rq, error, nr_bytes );
}
/*
 * Iterator of the block_rq_complete tracepoint (simplified expansion of
 * DEFINE_TRACE_FN): calls every probe in the tracepoint's funcs array with
 * that probe's own data pointer, stopping at the first entry whose func is
 * NULL. Always returns 0.
 *
 * The caller must make sure funcs is non-NULL; the do/while loop relies on
 * the array holding at least one probe. The unsimplified code reads funcs
 * through rcu_dereference_raw().
 */
int __traceiter_block_rq_complete(void *__data, struct request *rq, int error, unsigned int nr_bytes)
{
	struct tracepoint_func *it_func_ptr;
	void *it_func;

	it_func_ptr = (&__tracepoint_block_rq_complete)->funcs;
	do {
		it_func = (it_func_ptr)->func;
		/* each probe receives the data pointer it was registered with */
		__data = (it_func_ptr)->data;
		((void(*)(void *, struct request *rq, int error, unsigned int nr_bytes))(it_func))
			(__data, rq, error, nr_bytes);
	} while ((++it_func_ptr)->func);
	return 0;
}
通过上面宏的分析,可以看到TRACE_EVENT宏实际上就是定义了trace_event_call 结构体变量,且每个trace event定义了一个,它们的指针会统一存放到_ftrace_events这个section中,可以猜测一下,初始化时就会获取这个section中的指针找到每个trace_event_call 执行注册。
2.3 trace_event_call注册
......
ftrace_stub_graph = ftrace_stub;
*(.init.rodata .init.rodata.*) . = ALIGN(8);
/* All input sections named _ftrace_events are collected between the two symbols below, so C code can walk the trace_event_call pointers as an array. */
__start_ftrace_events = .;
KEEP(*(_ftrace_events))
__stop_ftrace_events = .;
/* Same start/stop bracketing for the _ftrace_eval_map input sections. */
__start_ftrace_eval_maps = .;
KEEP(*(_ftrace_eval_map))
__stop_ftrace_eval_maps = .;
/* Same start/stop bracketing for the _kprobe_blacklist input sections. */
. = ALIGN(8); __start_kprobe_blacklist = .;
KEEP(*(_kprobe_blacklist))
__stop_kprobe_blacklist = .;
......
我们可以看到arch/arm64/kernel/vmlinux.lds链接脚本片段,_ftrace_events位于__start_ftrace_events和__stop_ftrace_events 之间。
//kernel/trace/trace_events.c
/* Bounds of the _ftrace_events section as set up by the linker script: an array of trace_event_call pointers. */
extern struct trace_event_call *__start_ftrace_events[];
extern struct trace_event_call *__stop_ftrace_events[];
/*
 * Boot-time registration of all statically defined trace events.
 * Returns -ENODEV when there is no top-level trace_array, 0 otherwise.
 */
static __init int event_trace_enable(void)
{
struct trace_array *tr = top_trace_array();
struct trace_event_call **iter, *call;
int ret;
if (!tr)
return -ENODEV;
/* Walk every trace_event_call pointer stored in the _ftrace_events section. */
for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) {
call = *iter;
/* Only events whose event_init() succeeds (returns 0) are linked into the global ftrace_events list. */
ret = event_init(call);
if (!ret)
list_add(&call->list, &ftrace_events);
}
/*
 * We need the top trace array to have a working set of trace
 * points at early init, before the debug files and directories
 * are created. Create the file entries now, and attach them
 * to the actual file dentries later.
 */
__trace_early_add_events(tr);
early_enable_events(tr, false);
trace_printk_start_comm();
register_event_cmds();
register_trigger_cmds();
return 0;
}
trace_init->trace_event_init->event_trace_enable就是批量化注册trace_event的函数,其中event_init中会调用call->class->raw_init(call),根据前面对TRACE_EVENT宏的展开分析可知,此处的raw_init就是trace_event_raw_init。之后会将trace_event_call挂到全局ftrace_events链表,这个跟kprobe event的注册是一致的,只不过kprobe的注册是动态的,此处是静态的。关于kprobe event的注册可参考 trace系列3 - kprobe学习笔记的领域模型和trace_add_event_call、trace_probe_register_event_call流程。
/*
 * raw_init callback of tracepoint-based event classes: registers the
 * trace_event embedded in the call.
 * Returns 0 on success, -ENODEV when register_trace_event() returns 0.
 */
int trace_event_raw_init(struct trace_event_call *call)
{
int id;
id = register_trace_event(&call->event);
/* an id of 0 is treated as a registration failure */
if (!id)
return -ENODEV;
return 0;
}
trace_event_raw_init就是完成了trace event的注册。
3. trace event使能
运行如下命令将开启block_rq_complete的trace event输出
# echo 1 > /sys/kernel/debug/tracing/events/block/block_rq_complete/enable
具体这条命令做了什么呢?通过gdb来跟踪
event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,loff_t *ppos)
|--struct trace_event_file *file
|--kstrtoul_from_user(ubuf, cnt, 10, &val)
|--tracing_update_buffers()
| //前面在trace_create_file的时候会将trace_event_file保存在inode->i_private
|--file = event_file_data(filp)
|--ftrace_event_enable_disable(file, val)
| //此处以enable为1举例
|--__ftrace_event_enable_disable(file, enable, 0)
|--trace_event_reg(call, type, data)
| //将tracepoint与probe绑定
|--tracepoint_probe_register(call->tp,call->class->probe,file);
|--tracepoint_probe_register_prio(tp, probe, data, TRACEPOINT_DEFAULT_PRIO)
|--struct tracepoint_func tp_func;
| tp_func.func = probe;
| tp_func.data = data;
| tp_func.prio = prio;
|--tracepoint_add_func(tp, &tp_func, prio);
|--struct tracepoint_func *old, *tp_funcs;
|--tp_funcs = rcu_dereference_protected(tp->funcs,
| lockdep_is_held(&tracepoints_mutex));
|--old = func_add(&tp_funcs, func, prio);
| |--struct tracepoint_func *old, *new;
| |--new = allocate_probes(nr_probes + 2)
| |--new[pos] = *tp_func
| |--new[nr_probes + 1].func = NULL;
| | //new保存到tp_funcs
| |--*funcs = new;
|--rcu_assign_pointer(tp->funcs, tp_funcs);
|--tracepoint_update_call(tp, tp_funcs, false);
| //使能key
|--static_key_enable(&tp->key);
通过上面的分析可以知道,执行上面的指令实际就是初始化tp_func.func,本例为trace_event_raw_event_block_rq_complete,并将tp_func添加到tracepoint.funcs中,这个过程中也会使能此tracepoint.key。
4. trace event输出
在TRACE_EVENT宏定义展开时,我们看到TRACE_EVENT包含如下的定义,这个就是trace event输出时要调用的函数,本例中为:trace_block_rq_complete
注:如下定义从blk-core.i 中也可以看到
/*触发tracepoint执行*/
static inline void trace_##name(proto) \
{ \
if (static_key_false(&__tracepoint_##name.key)) \
__DO_TRACE(name, \
TP_PROTO(data_proto), \
TP_ARGS(data_args), \
TP_CONDITION(cond), 0); \
if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \
rcu_read_lock_sched_notrace(); \
rcu_dereference_sched(__tracepoint_##name.funcs);\
rcu_read_unlock_sched_notrace(); \
} \
}
static_key_false将判断是否使能了key,__DO_TRACE为主要的输出宏,展开如下:
/*
 * it_func[0] is never NULL because there is at least one element in the array
 * when the array itself is non NULL.
 *
 * Note, the proto and args passed in includes "__data" as the first parameter.
 * The reason for this is to handle the "void" prototype. If a tracepoint
 * has a "void" prototype, then it is invalid to declare a function
 * as "(void *, void)".
 *
 * Summary: returns from the enclosing function when cond is false; otherwise,
 * with preemption disabled, reads the tracepoint's funcs array and, if it is
 * non-NULL, invokes __DO_TRACE_CALL(name) with the first probe's data
 * pointer. rcuidle callers additionally take the tracepoint SRCU read lock.
 */
#define __DO_TRACE(name, proto, args, cond, rcuidle) \
do { \
struct tracepoint_func *it_func_ptr; \
int __maybe_unused __idx = 0; \
void *__data; \
\
if (!(cond)) \
return; \
\
/* srcu can't be used from NMI */ \
WARN_ON_ONCE(rcuidle && in_nmi()); \
\
/* keep srcu and sched-rcu usage consistent */ \
preempt_disable_notrace(); \
\
/* \
* For rcuidle callers, use srcu since sched-rcu \
* doesn't work from the idle path. \
*/ \
if (rcuidle) { \
__idx = srcu_read_lock_notrace(&tracepoint_srcu);\
rcu_irq_enter_irqson(); \
} \
/* In this example __tracepoint_##name is __tracepoint_block_rq_complete. */ \
it_func_ptr = \
rcu_dereference_raw((&__tracepoint_##name)->funcs); \
if (it_func_ptr) { \
__data = (it_func_ptr)->data; \
__DO_TRACE_CALL(name)(args); \
} \
\
if (rcuidle) { \
rcu_irq_exit_irqson(); \
srcu_read_unlock_notrace(&tracepoint_srcu, __idx);\
} \
\
preempt_enable_notrace(); \
} while (0)
__DO_TRACE主要是找到对应的tracepoint,此处为__tracepoint_block_rq_complete,然后找到它的funcs数组,对数组执行__DO_TRACE_CALL,__DO_TRACE_CALL宏定义如下:
/*
 * Definition of __DO_TRACE_CALL: with CONFIG_HAVE_STATIC_CALL the probes are
 * reached through the static call tp_func_<name>; otherwise
 * __traceiter_<name>() is called directly.
 */
#ifdef CONFIG_HAVE_STATIC_CALL
#define __DO_TRACE_CALL(name) static_call(tp_func_##name)
#else
#define __DO_TRACE_CALL(name) __traceiter_##name
#endif /* CONFIG_HAVE_STATIC_CALL */
其中__DO_TRACE_CALL定义如上,实际__traceiter_##name会被替换为__traceiter_block_rq_complete
/*
 * __traceiter_block_rq_complete, simplified: calls every probe in the
 * tracepoint's funcs array with that probe's own data pointer, stopping at
 * the first entry whose func is NULL. Always returns 0.
 *
 * The caller must make sure funcs is non-NULL; the do/while loop relies on
 * the array holding at least one probe. The unsimplified code reads funcs
 * through rcu_dereference_raw().
 */
int __traceiter_block_rq_complete(void *__data, struct request *rq, int error, unsigned int nr_bytes)
{
	struct tracepoint_func *it_func_ptr;
	void *it_func;

	it_func_ptr = (&__tracepoint_block_rq_complete)->funcs;
	do {
		it_func = (it_func_ptr)->func;
		/* each probe receives the data pointer it was registered with */
		__data = (it_func_ptr)->data;
		((void(*)(void *, struct request *rq, int error, unsigned int nr_bytes))(it_func))
			(__data, rq, error, nr_bytes);
	} while ((++it_func_ptr)->func);
	return 0;
}
__traceiter_block_rq_complete会遍历it_func_ptr函数数组,然后执行其中的每一个函数
对如上分析总结一下, 从以上定义可以看出:trace_block_rq_complete通过tracepoint找到对应funcs数组,然后通过__traceiter_block_rq_complete遍历执行数组中注册的每一个probe。此处的probe就是trace_event_raw_event_block_rq_complete。
5. 小结
前面对trace event的原理做了一个简要的分析,比较分散,此处我们做一个小结。trace event本质首先是一个tracepoint,只不过它在tracepoint的基础上,又定义了一些特定的结构,下面我们将trace event与tracepoint的工作流程做一个简单的对比。前面在tracepoint简介一文中,我们知道tracepoint的流程包含三部分,我们看下trace event是如何定义的:
- 定义触发trace_event下的tracepoint的probe回调的函数
主要是在TRACE_EVENT宏定义中实现,本例中为trace_block_rq_complete函数,它用于触发trace_event下的tracepoint的probe回调
- 将trace_event下的tracepoint与probe进行绑定
在trace_event使能时,通过
event_enable_write->ftrace_event_enable_disable->__ftrace_event_enable_disable->
trace_event_reg->tracepoint_probe_register将tracepoint与probe进行绑定
- 定义trace_event下tracepoint的probe函数
在TRACE_EVENT宏中定义,本例为trace_event_raw_event_block_rq_complete,当执行触发函数trace_block_rq_complete时,此函数会被调用到
参考文档
Linux TraceEvent - 我见过的史上最长宏定义
附录
/* One registered probe of a tracepoint; a tracepoint's funcs array consists of these entries. */
struct tracepoint_func {
/* probe function; a NULL func marks the end of the array */
void *func;
/* private data handed to the probe as its first argument */
void *data;
/* priority given at registration time */
int prio;
};
/* Descriptor of one tracepoint. */
struct tracepoint {
const char *name; /* Tracepoint name */
/* static key tested by trace_<name>() before any probe is called */
struct static_key key;
struct static_call_key *static_call_key;
void *static_call_tramp;
/* points at __traceiter_<name>, the function that walks funcs */
void *iterator;
int (*regfunc)(void);
void (*unregfunc)(void);
/* RCU-protected array of registered probes; NULL while no probe is attached */
struct tracepoint_func __rcu *funcs;
};
例子
# echo 0 > /sys/kernel/debug/tracing/tracing_on
# echo 0 > /sys/kernel/debug/tracing/events/enable
# echo > /sys/kernel/debug/tracing/trace
# echo suspend_resume >> /sys/kernel/debug/tracing/set_event
# echo 1 > /sys/kernel/debug/tracing/tracing_on