浅析Linux追踪技术之ftrace：Tracepoint

Aspiresky

于 2024-02-14 19:00:00 发布

阅读量1.9k

点赞数 35

文章标签： linux 运维服务器

本文链接：https://blog.csdn.net/anyegongjuezjd/article/details/136106332

版权

文章目录

概述

Tracepoint（跟踪点）是添加到代码流程中的调用点，并且允许开发者注册自定义的回调函数执行。默认情况下，跟踪点是关闭的状态，不会对原代码逻辑造成影响；当跟踪点为开启状态时，每次运行到跟踪点，都会调用开发者注册的回调函数。

Tracepoint使用

开发者使用Tracepoint需要进行两个步骤：

定义Tracepoint；
在代码流程中添加对跟踪点的调用。

定义Tracepoint

Linux内核使用TRACE_EVENT宏来定义以及向系统中添加一个Tracepoint，使用方式如下：

/*
 * Definition of the block_rq_complete tracepoint.
 *
 * The probe receives the request, an integer error code and a byte count.
 * Each record stores the device number, start sector, sector count, error
 * code, an RWBS_LEN-byte "rwbs" string and a one-byte "cmd" string that is
 * always set to the empty string here.
 */
TRACE_EVENT(block_rq_complete,

	/* Prototype of the probe callback. */
	TP_PROTO(struct request *rq, int error, unsigned int nr_bytes),

	/* Argument names matching TP_PROTO. */
	TP_ARGS(rq, error, nr_bytes),

	/* Layout of one recorded event. */
	TP_STRUCT__entry(
		__field(  dev_t,	dev			)
		__field(  sector_t,	sector			)
		__field(  unsigned int,	nr_sector		)
		__field(  int,		error			)
		__array(  char,		rwbs,	RWBS_LEN	)
		__dynamic_array( char,	cmd,	1		)
	),

	/* How the probe arguments are copied into the record. */
	TP_fast_assign(
		/* dev is 0 when the request has no rq_disk attached. */
		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
		__entry->sector    = blk_rq_pos(rq);
		/* nr_bytes >> 9 == nr_bytes / 512 */
		__entry->nr_sector = nr_bytes >> 9;
		__entry->error     = error;

		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes);
		/* cmd is stored as an empty string. */
		__get_str(cmd)[0] = '\0';
	),

	/* Output format: "major,minor rwbs (cmd) sector + nr_sector [error]". */
	TP_printk("%d,%d %s (%s) %llu + %u [%d]",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->rwbs, __get_str(cmd),
		  (unsigned long long)__entry->sector,
		  __entry->nr_sector, __entry->error)
);

其中：

block_rq_complete为跟踪点名称；
TP_PROTO部分定义了跟踪点回调函数原型；
TP_ARGS部分定义了回调函数的参数；
TP_STRUCT__entry部分定义了跟踪程序可以使用的数据结构；
TP_fast_assign部分描述了传递数据的方式；
TP_printk部分定义了打印数据结构的方法。

添加Tracepoint调用

bool blk_update_request(struct request *req, blk_status_t error,
		unsigned int nr_bytes)
{
	int total_bytes;

	trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes);

	if (!req->bio)
		return false;
		
	...

Tracepoint数据结构

Tracepoint机制的核心数据为tracepoint和tracepoint_func结构，tracepoint结构对应于一个跟踪点的概念，包含了跟踪点的名称、开关配置以及自定义的回调函数；而tracepoint_func结构描述了开发者注册的回调函数信息。
在这里插入图片描述

TRACE_EVENT实现

TRACE_EVENT宏定义如下：

/*
 * TRACE_EVENT - declare a tracepoint called @name.
 *
 * In this definition only @name, @proto and @args are used: they are
 * forwarded to DECLARE_TRACE(). The @struct, @assign and @print arguments
 * are accepted but do not appear in the expansion.
 */
#define TRACE_EVENT(name, proto, args, struct, assign, print)	\
	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))

name：要创建的跟踪点的名称。
proto：跟踪点回调函数的原型。
args：与原型匹配的参数。
struct：跟踪程序可以（但不是必须）用来存储传入跟踪点的数据的结构。
assign：以类似 C 语言的方式将数据赋值给结构。
print：以可读的ASCII格式输出结构的方法。

DECLARE_TRACE

/*
 * DECLARE_TRACE - forward to __DECLARE_TRACE() with four extra arguments.
 *
 * In addition to @name, @proto and @args it passes:
 *   - cpu_online(raw_smp_processor_id()) as the condition expression,
 *   - "void *__data, proto" as the probe prototype,
 *   - "__data, args" as the probe arguments.
 */
#define DECLARE_TRACE(name, proto, args)				\
	__DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),		\
			cpu_online(raw_smp_processor_id()),		\
			PARAMS(void *__data, proto),			\
			PARAMS(__data, args))

__DECLARE_TRACE

__DECLARE_TRACE宏定义了一系列内联函数(其中xxx为定义的tracepoint名称)：

trace_xxx：代码中调用trace_xxx函数记录tracepoint运行信息；
register_trace_xxx：向tracepoint注册回调函数，在tracepoint使能时会调用；
register_trace_prio_xxx：与register_trace_xxx相似，支持调用优先级；
unregister_trace_xxx：从tracepoint注销已注册的回调函数；
trace_xxx_enabled：查询tracepoint当前是否处于使能状态。

/*
 * __DECLARE_TRACE - emit the inline API for the tracepoint called @name.
 *
 * @name:       tracepoint name, pasted into every generated identifier
 * @proto:      parameter list of the generated trace_<name>() function
 * @args:       only forwarded to __DECLARE_TRACE_RCU() in this macro
 * @cond:       condition handed to __DO_TRACE() as TP_CONDITION and reused
 *              for the CONFIG_LOCKDEP check in trace_<name>()
 * @data_proto: parameter list of a probe callback
 * @data_args:  arguments handed to __DO_TRACE() as TP_ARGS
 *
 * For name == block_rq_complete the macro generates:
 *   trace_block_rq_complete()
 *   register_trace_block_rq_complete()
 *   register_trace_prio_block_rq_complete()
 *   unregister_trace_block_rq_complete()
 *   check_trace_callback_type_block_rq_complete()
 *   trace_block_rq_complete_enabled()
 *
 * Annotations inside the body are block comments placed in front of the
 * line-continuation backslash, so every line of the macro still ends in
 * a backslash and the definition stays a single logical line.
 */
#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
	extern struct tracepoint __tracepoint_##name;			\
	/* trace_<name>(): the hook placed at the call site. */	\
	static inline void trace_##name(proto)				\
	{								\
		/* Probes run only while the tracepoint key is on. */	\
		if (static_key_false(&__tracepoint_##name.key))		\
			__DO_TRACE(&__tracepoint_##name,		\
				TP_PROTO(data_proto),			\
				TP_ARGS(data_args),			\
				TP_CONDITION(cond), 0);			\
		/* Checked whether or not the key is on. */		\
		if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) {		\
			WARN_ON_ONCE(!rcu_is_watching());		\
		}							\
	}								\
	__DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args),		\
		PARAMS(cond), PARAMS(data_proto), PARAMS(data_args))	\
	/* register_trace_<name>(): attach @probe with @data. */	\
	static inline int						\
	register_trace_##name(void (*probe)(data_proto), void *data)	\
	{								\
		return tracepoint_probe_register(&__tracepoint_##name,	\
						(void *)probe, data);	\
	}								\
	/* register_trace_prio_<name>(): same, with priority @prio. */	\
	static inline int						\
	register_trace_prio_##name(void (*probe)(data_proto), void *data,\
				   int prio)				\
	{								\
		return tracepoint_probe_register_prio(&__tracepoint_##name, \
					      (void *)probe, data, prio); \
	}								\
	/* unregister_trace_<name>(): detach @probe / @data. */	\
	static inline int						\
	unregister_trace_##name(void (*probe)(data_proto), void *data)	\
	{								\
		return tracepoint_probe_unregister(&__tracepoint_##name,\
						(void *)probe, data);	\
	}								\
	/*								\
	 * check_trace_callback_type_<name>(): empty body; a call	\
	 * evidently only makes the compiler check the type of @cb.	\
	 */								\
	static inline void						\
	check_trace_callback_type_##name(void (*cb)(data_proto))	\
	{								\
	}								\
	/* trace_<name>_enabled(): report the state of the key. */	\
	static inline bool						\
	trace_##name##_enabled(void)					\
	{								\
		return static_key_false(&__tracepoint_##name.key);	\
	}

trace_xxx函数

trace_xxx函数内部主要是调用__DO_TRACE宏来完成实际的跟踪点信息处理，__DO_TRACE的核心流程是遍历tracepoint结构体里面的一个函数数组，执行对应的回调函数，这些回调函数由register_trace_xxx接口进行注册。

/*
 * __DO_TRACE - call every probe stored in the funcs array of tracepoint @tp.
 *
 * @tp:      pointer to the tracepoint whose funcs array is walked
 * @proto:   parameter list used to cast each stored probe pointer
 * @args:    arguments passed to each probe; they may name the local
 *           __data, which is reloaded from the current entry before
 *           every call
 * @cond:    when false the macro executes a bare `return`, leaving the
 *           enclosing function, so it must be expanded inside a function
 *           returning void
 * @rcuidle: when non-zero the walk is additionally bracketed by
 *           srcu_read_lock_notrace(&tracepoint_srcu) and
 *           rcu_irq_enter_irqson(), undone in reverse order afterwards;
 *           a one-time warning is raised if this is combined with in_nmi()
 *
 * Preemption is disabled (notrace variant) for the whole walk. The funcs
 * pointer is read with rcu_dereference_raw(). If it is NULL nothing is
 * called. Otherwise the first entry is called unconditionally and the walk
 * continues until an entry whose func is NULL is reached.
 */
#define __DO_TRACE(tp, proto, args, cond, rcuidle)			\
	do {								\
		struct tracepoint_func *it_func_ptr;			\
		void *it_func;						\
		void *__data;						\
		int __maybe_unused __idx = 0;				\
									\
		if (!(cond))						\
			return;						\
									\
		/* srcu can't be used from NMI */			\
		WARN_ON_ONCE(rcuidle && in_nmi());			\
									\
		/* keep srcu and sched-rcu usage consistent */		\
		preempt_disable_notrace();				\
									\
		/*							\
		 * For rcuidle callers, use srcu since sched-rcu	\
		 * doesn't work from the idle path.			\
		 */							\
		if (rcuidle) {						\
			__idx = srcu_read_lock_notrace(&tracepoint_srcu);\
			rcu_irq_enter_irqson();				\
		}							\
									\
		it_func_ptr = rcu_dereference_raw((tp)->funcs);		\
									\
		if (it_func_ptr) {					\
			do {						\
				it_func = (it_func_ptr)->func;		\
				__data = (it_func_ptr)->data;		\
				((void(*)(proto))(it_func))(args);	\
			} while ((++it_func_ptr)->func);		\
		}							\
									\
		if (rcuidle) {						\
			rcu_irq_exit_irqson();				\
			srcu_read_unlock_notrace(&tracepoint_srcu, __idx);\
		}							\
									\
		preempt_enable_notrace();				\
	} while (0)