linux perf arm,linux kernel perf event(counter)

最近接到一个客户bug,说是运行perf fuzzer的时候,手机会crash掉。当时我懵了:perf fuzzer是什么鬼?

经过坚持不懈的google之后,终于找到了一些资料。

perf counters added to the mainline

fuzzing perf events

或者看kernel/tools/perf/design.txt文档

Perf event这东西本来的名字perf counter更加贴切。Counter也就是计数器,无非是指一些软件事件发生的次数或者硬件事件发生的次数。

软件事件就是指software event 或者tracepoint,其实在我看来,这两者大同小异。不知道为啥分为两类。

硬件事件需要借助于硬件设备,不管是PMU还是CORE DEBUG(watchpoint/breakpoint),用于统计一段时间内硬件事件发生的次数。

每一种类型在perf event框架中抽象成为一个PMU设备。(既可以是硬件,也可以是软件)。

先从perf_sw_event这个函数入手吧。其实这个函数的目的就是根据event_id找到相应的事件,然后增加计数器。记住我说的是计数器,也就是刚才说到的perf counter。

perf_pmu_register(&perf_swevent, "software",

PERF_TYPE_SOFTWARE);

perf_pmu_register(&perf_cpu_clock, NULL, -1);

perf_pmu_register(&perf_task_clock, NULL,

-1);

perf_tp_register();

ret = init_hw_breakpoint();

这边一共注册了5种PMU设备,分别是software event、cpu clock、task clock、tracepoint、breakpoint。

Struct pmu定义在kernel/include/linux/perf_event.h文件中,其中包含很多的函数指针。是由perf_pmu_register的时候传入的。

之前一直没搞清楚这个perf event到底是怎么工作的,直到我看到了perf_sw_event函数。

简单的在内核中搜了一下这个函数,你们居然是这么玩的。

./arch/arm/mm/fault.c:335: perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);

./arch/arm/mm/fault.c:339: perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,

./arch/arm/mm/fault.c:343: perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,

./arch/arm/kernel/swp_emulate.c:190: perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS, 1, regs, regs->ARM_pc);

./arch/mips/mm/fault.c:156: perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

./arch/mips/mm/fault.c:168: perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,

./arch/mips/mm/fault.c:172: perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,

好吧,都是在事件发生的时候默默的给计数器加一。相应的还有DECLARE_EVENT_CLASS跟perf_bp_event。

另外除了上面的pmu之外,还有真实的硬件PMU。

kernel/arch/arm64/kernel/perf_event.c

init_hw_perf_events 硬件PMU在软件PMU的基础上进行了再一次的封装。

/*
 * ARM hardware PMU descriptor.
 *
 * Embeds the generic struct pmu and adds a set of hardware-facing
 * callbacks plus bookkeeping state on top of it.  The comments below
 * describe only what the declarations themselves show; the precise
 * semantics of each callback live in the per-CPU implementation that
 * fills this structure in.
 */
struct arm_pmu {

/* Generic perf PMU object that this ARM descriptor wraps. */
struct pmu pmu;

cpumask_t active_irqs;

const char *name;

/* Interrupt handler callback; receives the IRQ number and an opaque pointer. */
irqreturn_t (*handle_irq)(int irq_num, void *dev);

/* Enable / disable callbacks, each given an event and a counter index. */
void (*enable)(struct hw_perf_event *evt, int idx);

void (*disable)(struct hw_perf_event *evt, int idx);

/* Returns an int for the given event; presumably the counter index to use - confirm against the implementation. */
int (*get_event_idx)(struct pmu_hw_events *hw_events,

struct hw_perf_event *hwc);

int (*set_event_filter)(struct hw_perf_event *evt,

struct perf_event_attr *attr);

/* Counter accessors: values are exchanged as u32, addressed by index. */
u32 (*read_counter)(int idx);

void (*write_counter)(int idx, u32 val);

void (*start)(void);

void (*stop)(void);

void (*reset)(void *);

int (*request_irq)(struct arm_pmu *,

irq_handler_t handler);

void (*free_irq)(struct arm_pmu *);

/*
 * Maps a perf_event to an int.  According to the surrounding article this
 * is used to query whether a given hardware event is supported.
 */
int (*map_event)(struct perf_event *event);

int num_events;

int pmu_state;

atomic_t active_events;

struct mutex reserve_mutex;

/* Stored as u64 even though the counter accessors above use u32 values. */
u64 max_period;

struct platform_device *plat_device;

struct pmu_hw_events *(*get_hw_events)(void);

/* Save / restore callbacks taking an opaque pointer named hcpu. */
void (*save_pm_registers)(void *hcpu);

void (*restore_pm_registers)(void *hcpu);

};

添加了很多函数指针,比如map_event用于查询某一种硬件操作是否支持。

/*
 * Finish setting up the file-scope armv8pmu descriptor and return it.
 *
 * Assigns the four fields that the static initializer of armv8pmu does
 * not set: the PMU name, the map_event and set_event_filter callbacks,
 * and num_events (obtained from armv8pmu_read_num_pmnc_events()).
 *
 * Returns: pointer to the single static armv8pmu object.
 *
 * NOTE(review): in this excerpt armv8pmu is defined after this function;
 * a real translation unit needs the definition (or a declaration) to
 * appear before this point.
 */
static struct arm_pmu *__init armv8_pmuv3_pmu_init(void)

{

armv8pmu.name = "arm/armv8-pmuv3";

armv8pmu.map_event = armv8_pmuv3_map_event;

/* The event count is read at init time rather than hard-coded. */
armv8pmu.num_events = armv8pmu_read_num_pmnc_events();

armv8pmu.set_event_filter = armv8pmu_set_event_filter;

return &armv8pmu;

}

/*
 * The ARMv8 PMU descriptor instance.
 *
 * Wires the armv8pmu_* implementations into the struct arm_pmu callback
 * slots using designated initializers.  Members not listed here (name,
 * map_event, num_events, set_event_filter) are assigned separately in
 * armv8_pmuv3_pmu_init(); any other unlisted member is zero-initialized
 * as for every static object.
 */
static struct arm_pmu armv8pmu = {

.handle_irq = armv8pmu_handle_irq,

.enable = armv8pmu_enable_event,

.disable = armv8pmu_disable_event,

.read_counter = armv8pmu_read_counter,

.write_counter = armv8pmu_write_counter,

.get_event_idx = armv8pmu_get_event_idx,

.start = armv8pmu_start,

.stop = armv8pmu_stop,

.reset = armv8pmu_reset,

.request_irq = armv8pmu_request_irq,

.free_irq = armv8pmu_free_irq,

.save_pm_registers = armv8pmu_save_pm_registers,

.restore_pm_registers = armv8pmu_restore_pm_registers,

/* 0xFFFFFFFF: the largest value representable in 32 bits, matching the u32 counter accessors. */
.max_period = (1LLU << 32) - 1,

};

这么多新添加的函数指针,吓得我赶紧从arm官网上下载文档看了一遍。

还好还好,都是对于硬件操作的函数。

再次看一下perf_fuzzer是怎么工作的。strace看了一下

perf_event_open(0x5577398f28, 15581, 3, 0, PERF_FLAG_PID_CGROUP) = -1 EINVAL (Invalid argument)

perf_event_open(0x5577398f28, 0, 7, 0, PERF_FLAG_FD_NO_GROUP|0x80628200) = -1 EINVAL (Invalid argument)

perf_event_open(0x5577398f28, 0, 4, 0, 0x6060e610 /* PERF_FLAG_??? */) = -1 EINVAL (Invalid argument)

perf_event_open(0x5577398f28, 0, 3, -1, PERF_FLAG_FD_NO_GROUP|0x8) = -1 EINVAL (Invalid argument)

perf_event_open(0x5577398f28, 0, 7, -1, 0x8000 /* PERF_FLAG_??? */) = -1 EINVAL (Invalid argument)

perf_event_open(0x5577398f28, 0, 1, -1, 0) = 3

mmap(NULL, 10172, PROT_READ|PROT_WRITE, MAP_SHARED, 3, 0) = 0x7fb762a000

rt_sigaction(SIGRT_5, {0x5577168f5c, [], SA_SIGINFO}, NULL, 8) = 0

fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK|O_ASYNC) = 0

fcntl(3, F_SETSIG, 0x25)                = 0

fcntl(3, F_SETOWN, 3774)                = 0

ioctl(3, _IOC(_IOC_READ, 0x24, 0x07, 0x08), 0x7ffffe9578) = 0

close(3)                                = 0

munmap(0x7fb762a000, 10172)             = 0

ppoll([{fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd

ppoll([{fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd

clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7fb7997ff8) = 3780

prctl(PR_TASK_PERF_EVENTS_ENABLE)       = 0

ppoll([{fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd

kill(3780, SIGKILL)                     = 0

wait4(3780, [{WIFSIGNALED(s) && WTERMSIG(s) == SIGKILL}], 0, NULL) = 3780

--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_KILLED, si_pid=3780, si_uid=2000, si_status=SIGKILL, si_utime=1, si_stime=0} ---

ppoll([{fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd=0, events=POLLIN}, {fd

openat(AT_FDCWD, "/proc/sys/kernel/perf_cpu_time_max_percent", O_RDONLY) = 3

fstat(3, {st_mode=S_IFREG|0644, st_size=0, ...}) = 0

翻来覆去就是这几个系统调用,最重要的还是传进来的perf_event_attr结构体。

如果perf fuzzer的代码看起来有点复杂的话,可以直接看perf_event_open(2)手册页中的例子。

头疼的是这个perf_event_attr结构体在不同的kernel版本上定义的不同(

天哪。。。。所以不要告诉我这玩意在3.10上跑得好好的,在3.18上就不能跑),感觉这是将来的一个大坑。

其二,这玩意之前在intel的芯片上用的比较多,想想intel的台式机是什么性能,所以内核的默认参数perf_event_max_sample_rate设为10万次/每秒。

但是到了ARM平台上,如果用上ARM硬件PMU的话,每秒10万次的中断,而且PMU本来就是用来测量每个单独CPU的,这些中断不能在CPU间balance,感觉整个世界要炸了。。然后watchdog bark或者soft lockup的问题一大堆。

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值