什么是perf
Perf的全名是Performance Events,通过perf,程序可以利用PMU、tracepoint获取内核的性能信息。
Perf可以分析的事件非常多,可以分析hardware event,如cpu-cycle、instructions、cache-misses、branch-misses等;可以分析software event,如page-faults、context-switches等,也可以通过software event来采集火焰图,另一种就是tracepoint event。
PMU
1、内核将硬件PMU抽象为pmu结构体,将PMU可以采集的事件抽象为perf_event。struct pmu
中有一组函数,用于操作PMU
/* Kernel abstraction of a PMU (simplified excerpt); the callbacks below
 * drive event setup and counter management for every event on this PMU. */
struct pmu {
struct list_head entry;
struct module *module;
struct device *dev;
const struct attribute_group **attr_groups;
const char *name;
int type; //PERF_TYPE_HARDWARE / PERF_TYPE_SOFTWARE / PERF_TYPE_TRACEPOINT, etc.
void (*pmu_enable) (struct pmu *pmu); /* optional, not needed for flame-graph sampling */
void (*pmu_disable) (struct pmu *pmu); /* optional */
int (*event_init) (struct perf_event *event);
int (*add) (struct perf_event *event, int flags); //attach a perf_event to this PMU
void (*del) (struct perf_event *event, int flags);
void (*start) (struct perf_event *event, int flags);
void (*stop) (struct perf_event *event, int flags);
void (*read) (struct perf_event *event); //refresh the perf_event counter
}
2、内核中的PMU分为两类,hardware和software,software的pmu是软件模拟出来的,比如用于采集火焰图的struct pmu perf_cpu_clock
;而硬件PMU则是由芯片的驱动提供的,比如arm架构的hardware采集在arch/arm64/kernel/perf_event.c中。
以采集火焰图的perf_cpu_clock
为例
/* Software PMU used for cpu-clock sampling (the flame-graph case). */
static struct pmu perf_cpu_clock = {
.task_ctx_nr = perf_sw_context, //context type: perf_sw_context (vs. perf_hw_context)
.event_init = cpu_clock_event_init, //initializes the hrtimer
.add = cpu_clock_event_add,
.del = cpu_clock_event_del, //tears down the hrtimer
.start = cpu_clock_event_start,//starts the hrtimer that samples call stacks
.stop = cpu_clock_event_stop,
.read = cpu_clock_event_read, //updates the timestamp; of little use for sampling
};
3、内核在启动的时候,每个硬件模块会调用perf_pmu_register()
函数将各个pmu注册到系统中,通过全局的LIST_HEAD(pmus)
和struct pmu
中的list_head entry
成员串成一个链表。
static LIST_HEAD(pmus);
/* Registers a PMU on the global pmus list (simplified excerpt). */
int perf_pmu_register(struct pmu *pmu, const char *name, int type)
{
list_add_rcu(&pmu->entry, &pmus);
}
4、用户态调用perf_event_open
后,内核会根据用户态传入的perf_event_attr中的信息,遍历系统pmus链表,自动匹配出要使用哪个PMU,匹配的过程由perf_init_event
函数完成。
/* Walks the global pmus list and lets each PMU try to claim the event
 * via its event_init callback (simplified excerpt). */
static struct pmu *perf_init_event(struct perf_event *event)
{
list_for_each_entry_rcu(pmu, &pmus, entry)
ret = perf_try_init_event(pmu, event);
}
PMU的event_init也需要有相应的拦截逻辑,以software和hardware为例:
//software event: the generic software-PMU event_init
static int perf_swevent_init(struct perf_event *event)
{
u64 event_id = event->attr.config;
if (event->attr.type != PERF_TYPE_SOFTWARE)
return -ENOENT;
switch (event_id) {
case PERF_COUNT_SW_CPU_CLOCK:
case PERF_COUNT_SW_TASK_CLOCK:
//cpu-clock/task-clock are served by dedicated sw PMUs, so reject them here
return -ENOENT;
}
}
//software event that samples call stacks (cpu-clock): accept only
//PERF_TYPE_SOFTWARE + PERF_COUNT_SW_CPU_CLOCK, reject everything else
static int cpu_clock_event_init(struct perf_event *event)
{
if (event->attr.type != PERF_TYPE_SOFTWARE)
return -ENOENT;
if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
return -ENOENT;
}
//hardware event, taking the arm architecture as an example:
//maps the generic attr.type/attr.config pair onto a hardware event number
int armpmu_map_event(struct perf_event *event,
const unsigned (*event_map)[PERF_COUNT_HW_MAX],
const unsigned (*cache_map)
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX],
u32 raw_event_mask)
{
u64 config = event->attr.config;
int type = event->attr.type;
switch (type) {
case PERF_TYPE_HARDWARE:
return armpmu_map_hw_event(event_map, config); //further validates config internally
case PERF_TYPE_HW_CACHE:
return armpmu_map_cache_event(cache_map, config);
case PERF_TYPE_RAW:
return armpmu_map_raw_event(raw_event_mask, config);
}
}
火焰图采集
本文以火焰图采集为例进行讲解
系统调用
火焰图采集的配置如下,注意需要对每个核心都调用一次perf_event_open,然后可以使用epoll对返回的fd监听
/*
 * Per-CPU setup for flame-graph sampling: opens a cpu-clock software
 * event bound to cs->cpu and maps its ring buffer. Call once per CPU;
 * the returned fd can then be watched with epoll.
 *
 * Fix vs. the original: mmap() signals failure with MAP_FAILED
 * ((void *)-1), NOT with NULL/0, so the old `cs->mmap == 0` check could
 * never fire and a failed mapping would be dereferenced later.
 */
void perf_sample_init_cpu(struct CpuSample *cs)
{
	//...
	attr->size = sizeof(struct perf_event_attr);
	attr->disabled = 1; /* start disabled; enabled later */
	attr->sample_period = cs->period;
	attr->wakeup_events = cs->wakeup_events;
	attr->type = PERF_TYPE_SOFTWARE;
	attr->config = PERF_COUNT_SW_CPU_CLOCK;
	attr->sample_type = PERF_SAMPLE_TIME | PERF_SAMPLE_CPU | PERF_SAMPLE_TID | PERF_SAMPLE_CALLCHAIN;
	/* pid = -1, cpu = cs->cpu: sample every task running on this CPU */
	cs->event_fd = perf_event_open(attr, -1, cs->cpu, -1, 0);
	if (cs->event_fd == -1) {
		log_err("perf_event_open failed: %s", strerror(errno));
		exit(EXIT_FAILURE);
	}
	/* 1 metadata page + pages_nr data pages (pages_nr must be a power of two) */
	cs->mmap = mmap(NULL, (cs->pages_nr+1)*PAGE_SIZE, PROT_READ, MAP_SHARED, cs->event_fd, 0);
	if (cs->mmap == MAP_FAILED) {
		log_err("mmap failed");
		exit(EXIT_FAILURE);
	}
	cs->offset = 0;
	cs->last_offset = 0;
}
内核参数解析
构造perf_event的过程
//Simplified excerpt of the perf_event_open syscall: allocates a perf_event,
//binds it to a context and hands back a file descriptor.
SYSCALL_DEFINE5(perf_event_open,
struct perf_event_attr __user *, attr_uptr,
pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
struct perf_event *event, *sibling;
struct perf_event_context *ctx;
struct file *event_file = NULL;
struct task_struct *task = NULL;
int event_fd;
//grab an unused fd
event_fd = get_unused_fd_flags(f_flags);
if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
//look up the task_struct by pid; with pid == -1, task stays NULL
task = find_lively_task_by_vpid(pid);
}
//Allocate and initialize an event structure; matches the owning PMU
event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
NULL, NULL, cgroup_fd);
pmu = event->pmu;
//handles groups whose leader and members mix hardware and software events
//flame-graph sampling skips this: each event is its own group_leader
if (group_leader) {
//...
}
//Get the target context (task or percpu); with pid == -1 this is the percpu context
ctx = find_get_context(pmu, task, event);
//obtain the struct file
event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
f_flags);
//Attach a performance event to a context.
//internally calls add_event_to_ctx(event, ctx) to link it into the list
perf_install_in_context(ctx, event, event->cpu);
//Install a file pointer in the fd array.
fd_install(event_fd, event_file);
return event_fd;
}
//why does pid == 0 end up monitoring only the current thread?
static struct task_struct *
find_lively_task_by_vpid(pid_t vpid)
{
struct task_struct *task;
//vpid == 0 means "monitor the calling thread"
if (!vpid)
task = current;
else
task = find_task_by_vpid(vpid);
return task;
}
period/freq的传递路径
/* If sample_freq is set, translate it into a sample_period. */
static void perf_swevent_init_hrtimer(struct perf_event *event)
{
/*
 * Since hrtimers have a fixed rate, we can do a static freq->period
 * mapping and avoid the whole period adjust feedback stuff.
 */
if (event->attr.freq) {
long freq = event->attr.sample_freq;
event->attr.sample_period = NSEC_PER_SEC / freq;
hwc->sample_period = event->attr.sample_period;
}
}
/* sample_period is handed to the hrtimer; note the 10000ns lower clamp */
static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
{
u64 period;
period = max_t(u64, 10000, event->hw.sample_period);
hrtimer_forward_now(hrtimer, ns_to_ktime(period));
}
wakeup_events的传递路径
void perf_output_sample(struct perf_output_handle *handle,
struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
{
...
if (wakeup_events) {
struct ring_buffer *rb = handle->rb;
int events = local_inc_return(&rb->events); //increment and return the new value
if (events >= wakeup_events) { //time to wake up readers
local_sub(wakeup_events, &rb->events);//reset: rb->events -= wakeup_events
local_inc(&rb->wakeup); //wakeup flag
}
}
//in ring_buffer.c
if (handle->wakeup != local_read(&rb->wakeup))
perf_output_wakeup(handle);
static void perf_output_wakeup(struct perf_output_handle *handle)
{
atomic_set(&handle->rb->poll, EPOLLIN);
handle->event->pending_wakeup = 1;
irq_work_queue(&handle->event->pending);
}
//queues an irq_work; the irq handler then calls wake_up_all(&event->waitq);
pid的传递路径
尚未整理
定时器
定时器初始化
//初始化定时器的过程
SYSCALL_DEFINE5(perf_event_open,……)
perf_event_alloc
perf_init_event
perf_try_init_event
pmu->event_init(event);
static struct pmu perf_cpu_clock = {
.event_init = cpu_clock_event_init, //init timer
};
cpu_clock_event_init调用perf_swevent_init_hrtimer初始化定时器
定时器回调
perf_swevent_init_hrtimer注册定时器回调函数为perf_swevent_hrtimer
perf_swevent_hrtimer
__perf_event_overflow
//overflow_handler中注册了溢出的处理函数
READ_ONCE(event->overflow_handler)(event, data, regs);
//溢出的处理函数是如何注册的?
perf_event_alloc
perf_event中注册了overflow_handler
//fragment from perf_event_alloc: picks the overflow handler for the event
if (overflow_handler) {
event->overflow_handler = overflow_handler;
event->overflow_handler_context = context;
// Write ring buffer from end to beginning
} else if (is_write_backward(event)){
event->overflow_handler = perf_event_output_backward;
event->overflow_handler_context = NULL;
//so the common case is forward
} else {
event->overflow_handler = perf_event_output_forward;
event->overflow_handler_context = NULL;
}
//溢出的处理函数干了什么?
perf_event_output_forward //overflow_handler回调函数
__perf_event_output(event, data, regs, perf_output_begin_forward);
//采样数据
perf_prepare_sample(&header, data, event, regs);
//perf_output_begin_forward函数指针,作用是初始化perf_output_handle
if (output_begin(&handle, event, header.size))
//采到的数据保存到环形缓冲区中
perf_output_sample(&handle, &header, data, event);
数据保存
dump栈
unwind是个啥,没搞懂,直接撸源码
(1)callchain是如何存储的?
struct perf_callchain_entry {
__u64 nr;
__u64 ip[0]; /* /proc/sys/kernel/perf_event_max_stack */ //variable-length array of instruction pointers
};
(2)get_perf_callchain函数
//regs -> interrupt-context registers, grabbed via regs = get_irq_regs() in perf_swevent_hrtimer
//init_nr = 0 -> initial callchain depth
//kernel/user -> whether to capture the kernel / user-space callchain
//max_stack -> maximum callchain depth, set through attr.sample_max_stack
//crosstask -> whether cross-task callchain capture is allowed
//add_mark = true -> whether to insert context markers into the chain;
// the markers are ffff ffff ffff ff80 and ffff ffff ffff fe00
struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
u32 max_stack, bool crosstask, bool add_mark)
{
struct perf_callchain_entry *entry;
struct perf_callchain_entry_ctx ctx;
int rctx;
entry = get_callchain_entry(&rctx); //grab a free struct perf_callchain_entry object
ctx.entry = entry;
ctx.max_stack = max_stack;
ctx.nr = entry->nr = init_nr;
ctx.contexts = 0;
ctx.contexts_maxed = false;
//kernel chain requested & we are currently executing in kernel mode
if (kernel && !user_mode(regs)) {
//store marker ffff ffff ffff ff80
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL);
//walk the kernel callchain
perf_callchain_kernel(&ctx, regs);
}
if (user) { //user-space chain requested
if (!user_mode(regs)) {
if (current->mm) //a user task that trapped into the kernel
regs = task_pt_regs(current); //!fetch the thread's user-mode register context; user and kernel stacks are not contiguous in memory
else //a pure kernel thread is running: record nothing
regs = NULL;
}
//either in user mode, or a user task interrupted by the timer after entering the kernel
if (regs) {
mm_segment_t fs;
//store marker ffff ffff ffff fe00
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
perf_callchain_user(&ctx, regs); //walk the user stack from regs; results go into ctx->entry
}
}
put_callchain_entry(rctx);
}
(3)perf_callchain_user函数
struct frame_tail { //frame records form a singly linked list up the stack
struct frame_tail __user *fp; //register x29: address of the next frame record
unsigned long lr; //register x30: function return address (code address)
} __attribute__((packed));
void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
struct pt_regs *regs)
{
perf_callchain_store(entry, regs->pc); //record the current pc first
if (!compat_user_mode(regs)) { //32-bit compat path; can be ignored here
/* AARCH64 mode */
struct frame_tail __user *tail;
tail = (struct frame_tail __user *)regs->regs[29]; //current frame pointer
while (entry->nr < entry->max_stack && tail && !((unsigned long)tail & 0xf))
tail = user_backtrace(tail, entry); //record one frame and step back to the previous one
}
}
static struct frame_tail __user *
user_backtrace(struct frame_tail __user *tail,
struct perf_callchain_entry_ctx *entry)
{
struct frame_tail buftail;
unsigned long lr;
err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail));
lr = ptrauth_strip_insn_pac(buftail.lr); //return address (code address), pointer-auth bits stripped
perf_callchain_store(entry, lr); //record the code address
if (tail >= buftail.fp) //frame records may only move toward higher addresses
return NULL;
return buftail.fp;
}
(4)perf_callchain_kernel函数
//in arch/arm64/kernel/perf_callchain.c and stacktrace.c
//regs is the interrupt-context register set
void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
struct pt_regs *regs)
{
struct stackframe frame;
if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
/* We don't support guest os callchain now */
return;
}
start_backtrace(&frame, regs->regs[29], regs->pc);
walk_stackframe(current, &frame, callchain_trace, entry); //walk & record frames; entry is passed through to callchain_trace
}
//generic stack walker; fn lets callers customize what is done per frame
void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
int (*fn)(struct stackframe *, void *), void *data)
{
while (1) {
if (fn(frame, data)) //the recording callback here is callchain_trace
break;
ret = unwind_frame(tsk, frame);//advances frame's fp pointer, i.e. one unwind step
if (ret < 0) //reached the end of the stack
break;
}
}
/* Per-frame callback: stores the frame's pc into the callchain entry. */
static int callchain_trace(struct stackframe *frame, void *data)
{
struct perf_callchain_entry_ctx *entry = data;
perf_callchain_store(entry, frame->pc);
return 0;
}
//unwind_frame performs a single unwind step
struct stack_info {
unsigned long low; //lowest valid address of this stack
unsigned long high; //highest valid address of this stack
enum stack_type type;
};
int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame)
{
unsigned long fp = frame->fp;
struct stack_info info;
if (fp & 0xf) //low 4 bits must be zero: frame records are 16-byte aligned
return -EINVAL;
if (!tsk)
tsk = current;
//classify which stack fp lives on; its valid bounds and type go into info
if (!on_accessible_stack(tsk, fp, &info))
return -EINVAL;
//this stack was already unwound through; guards against unwind loops
if (test_bit(info.type, frame->stacks_done))
return -EINVAL;
/*
 * As stacks grow downward, any valid record on the same stack must be
 * at a strictly higher address than the prior record.
 *
 * Stacks can nest in several valid orders, e.g.
 *
 * TASK -> IRQ -> OVERFLOW -> SDEI_NORMAL
 * TASK -> SDEI_NORMAL -> SDEI_CRITICAL -> OVERFLOW
 *
 * ... but the nesting itself is strict. Once we transition from one
 * stack to another, it's never valid to unwind back to that first
 * stack.
 */
if (info.type == frame->prev_type) {
if (fp <= frame->prev_fp) //within one stack type fp must strictly increase
return -EINVAL;
} else { //across stack types the ordering may not be monotonic
//mark the previous stack as fully unwound
set_bit(frame->prev_type, frame->stacks_done);
}
/*
 * Record this frame record's values and location. The prev_fp and
 * prev_type are only meaningful to the next unwind_frame() invocation.
 */
//advance to the next frame record: new fp and pc are read from the current record — the core step!
frame->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp));
frame->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp + 8)); //i.e. the saved lr register
frame->prev_fp = fp;
frame->prev_type = info.type;
}
ringbuffer保存数据
尚未整理
用户态/内核态通讯机制
mmap
//a quick primer on mmap first
int (*mmap) (struct file *, struct vm_area_struct *);
//the mmap handler calls remap_pfn_range()
// arg1: the VMA descriptor (declared in include/linux/mm_types.h, which mm.h already pulls in); normally handed down by the kernel
// arg2: starting virtual address
// arg3: page frame number (PFN) of the physical memory, i.e. phys_addr >> PAGE_SHIFT
// arg4: size of the mapping, in bytes
// arg5: protection flags for the new VMA; drivers usually pass vma->vm_page_prot
// returns 0 on success, a negative errno on failure
int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t);
//a worked example
#include <linux/slab.h>
#include <linux/mm.h>
static int simple_remap_mmap(struct file *filp, struct vm_area_struct *vma);
struct file_operations fo = {
...
.mmap = simple_remap_mmap,
...
};
static int simple_remap_mmap(struct file *filp, struct vm_area_struct *vma)
{
void *p;
p = kmalloc(200,GFP_KERNEL)
if(!p)
return -1;
if (remap_pfn_range(vma, vma->vm_start, virt_to_phys(p),
vma->vm_end - vma->vm_start,
vma->vm_page_prot))
return -EAGAIN;
return 0;
}
//back to the topic: what does perf_mmap() do?
if (!rb) {
/* allocate the ring buffer */
rb = rb_alloc(nr_pages,
event->attr.watermark ? event->attr.wakeup_watermark : 0,
event->cpu, flags);
ring_buffer_attach(event, rb);
perf_event_init_userpage(event);
perf_event_update_userpage(event);//update the user visible data
}
poll
perf_poll中干了什么?
perf_poll(struct file *file, poll_table *wait)
poll_wait(file, &event->waitq, wait) //把本进程挂载&event->waitq等待队列上
//谁唤醒了等待队列?
perf_event_alloc
init_irq_work(&event->pending, perf_pending_event);//irq_work 主要是提供一个在中断上下文执行回调函数的框架
perf_pending_event(struct irq_work *entry)
perf_event_wakeup(struct perf_event *event)
ring_buffer_wakeup(struct perf_event *event)
wake_up_all(&event->waitq);
//谁触发了event->pending
perf_swevent_hrtimer(定时器回调函数)
__perf_event_overflow
irq_work_queue(&event->pending);//在此之前调用了记录函数->这个路径应该不走吧...这里没搞懂
//另一个路径见上文'wakeup_events'部分
其他问题
两个perf record同时工作可以么?
可以,两次init会启两个定时器