1 需要让程序可以在运行时coredump的时候能够打印 当前的调用栈
原理是捕获coredump时对应的信号,在信号处理回调里打印调用栈
SIGSEGV 是 Unix 系统中的一个信号,表示“段错误”(Segmentation Fault)。当程序试图访问它没有权限访问的内存区域时,操作系统会向该进程发送 SIGSEGV 信号。这通常是由于以下几种情况引起的:
- 空指针解引用:访问一个未初始化的指针或已经被释放的指针。
- 越界访问:访问数组或缓冲区时超出了其分配的边界。
- 非法内存访问:试图读取或写入未映射的内存地址。
#include <stdio.h>
#include <execinfo.h>
#include <signal.h>
#include <stdlib.h>
#include <unistd.h>
// Install handler() as the SIGSEGV handler. This is a statement, so it must be
// placed inside main() (or another init function) that runs before the crash.
signal(SIGSEGV, handler);
/*
 * Signal handler that dumps the current call stack to stderr and then lets
 * the signal terminate the process with its default action.
 *
 * sig: number of the signal being handled (SIGSEGV when installed as shown
 *      in this note).
 *
 * Only write(), signal() and raise() are used for output and termination,
 * because stdio (fprintf) and exit() are not async-signal-safe: exit() runs
 * atexit handlers and flushes stdio buffers, which can deadlock or crash again
 * inside a handler. exit(1) would also replace the default action of the
 * signal, so no core file would be produced.
 */
void handler(int sig)
{
    static const char prefix[] = "Error: signal ";
    void *frames[10];
    char tail[16];               /* "<digits>:\n"; 10 digits is enough for an unsigned int */
    size_t pos = sizeof(tail);
    unsigned int value = (sig > 0) ? (unsigned int)sig : 0u; /* signal numbers are positive */
    ssize_t written;
    int depth;

    /* Build "<sig>:\n" from the end of the buffer, without calling printf. */
    tail[--pos] = '\n';
    tail[--pos] = ':';
    do {
        tail[--pos] = (char)('0' + value % 10u);
        value /= 10u;
    } while (value != 0u);

    /* Same text as "Error: signal %d:\n"; a failed write cannot be reported from here. */
    written = write(STDERR_FILENO, prefix, sizeof(prefix) - 1);
    (void)written;
    written = write(STDERR_FILENO, tail + pos, sizeof(tail) - pos);
    (void)written;

    /* Collect up to 10 return addresses; backtrace() returns the count as an int. */
    depth = backtrace(frames, 10);
    /* Print the frames straight to the stderr descriptor. */
    backtrace_symbols_fd(frames, depth, STDERR_FILENO);

    /*
     * Restore the default action and re-raise the signal. If the signal is
     * blocked while this handler runs, it is delivered as soon as the handler
     * returns, so the process still dies with the original signal and the
     * core dump is still written.
     */
    signal(sig, SIG_DFL);
    raise(sig);
}
2 制造一个coredump
[I] [164735] [05-22] [16:33:58:880141] [sync_strategy.h:50] [default] msg sync_cache [0]->size():1
[I] [164735] [05-22] [16:33:58:880173] [sync_strategy.h:65] [default] check msg timestamp , cam_id:0, timestamp:1696651381053, max_ts:1696651381053, ts_delta:100
[I] [164735] [05-22] [16:33:58:880184] [sync_strategy.h:98] [default] Sync success, cam_id:0, timestamp:1696651381053
[I] [164735] [05-22] [16:33:58:880897] [gpu_run_module.cpp:1375] [default] STEP INTO RunSuperpointModel
Error: signal 11:
config/perception_x86/config_superpoint//../../../lib/libe3_dataflow.so(_ZN17CarizonPerception7handlerEi+0x28)[0x7fab65dd5d68]
/lib/x86_64-linux-gnu/libc.so.6(+0x42520)[0x7fab6f042520]
config/perception_x86/config_superpoint//../../../lib/libe3_dataflow.so(_ZN17CarizonPerception8GpuModel15ParseSuperpointERSt6vectorIPvSaIS2_EEPN5hobot7message7MessageEi+0x1cb6)[0x7fab65d55f46]
config/perception_x86/config_superpoint//../../../lib/libe3_dataflow.so(_ZN17CarizonPerception8GpuModel17ParseBranchResultERSt6vectorIPvSaIS2_EERSt13unordered_mapIiPN5hobot7message7MessageESt4hashIiESt8equal_toIiESaISt4pairIKiSA_EEEi+0x1363)[0x7fab65d6a8e3]
config/perception_x86/config_superpoint//../../../lib/libe3_dataflow.so(_ZN17CarizonPerception8GpuModel11ParseResultERSt6vectorIPvSaIS2_EERSt13unordered_mapIiPN5hobot7message7MessageESt4hashIiESt8equal_toIiESaISt4pairIKiSA_EEE+0x206)[0x7fab65d6b136]
config/perception_x86/config_superpoint//../../../lib/libe3_dataflow.so(_ZN17CarizonPerception12GpuRunModule18RunSuperpointModelESt10shared_ptrIN5hobot8dataflow15MsgResourceProcEERKSt6vectorIS1_ISt5dequeIS1_INS2_7message7MessageEESaISA_EEESaISD_EE+0xad8)[0x7fab65eef978]
config/perception_x86/config_superpoint//../../../lib/libe3_dataflow.so(_ZNSt17_Function_handlerIFvSt10shared_ptrIN5hobot8dataflow15MsgResourceProcEERKSt6vectorIS0_ISt5dequeIS0_INS1_7message7MessageEESaIS9_EEESaISC_EEESt5_BindIFSt7_Mem_fnIMN17CarizonPerception12GpuRunModuleEFvS4_SG_EEPSL_St12_PlaceholderILi1EESQ_ILi2EEEEE9_M_invokeERKSt9_Any_dataOS4_SG_+0x59)[0x7fab65effbd9]
/home/yuxuan.zhang/optimization/e3_dataflow_package_x86/lib/libdataflow.so.1(+0x2399a7)[0x7fab700399a7]
/home/yuxuan.zhang/optimization/e3_dataflow_package_x86/lib/libdataflow.so.1(_ZN5hobot8dataflow19BaseConditionFilter19UserCallBackWrapperEv+0x895)[0x7fab6ff9e5b5]
/home/yuxuan.zhang/optimization/e3_dataflow_package_x86/lib/libschedulegroup.so.1(_ZN5hobot13schedulegroup15SimpleTaskQueue18InlineExecuteTasksEv+0x180)[0x7fab70399230]
Segmentation fault (core dumped)
3 查看dump的地方
c++filt
用 c++filt 可以把修饰(mangled)后的符号还原成可读的函数签名,从而看出代码coredump的位置。还原时先去掉末尾的偏移(+0x1cb6);这个偏移在后面计算代码地址时还会用到
_ZN17CarizonPerception8GpuModel15ParseSuperpointERSt6vectorIPvSaIS2_EEPN5hobot7message7MessageEi+0x1cb6
c++filt _ZN17CarizonPerception8GpuModel15ParseSuperpointERSt6vectorIPvSaIS2_EEPN5hobot7message7MessageEi
CarizonPerception::GpuModel::ParseSuperpoint(std::vector<void*, std::allocator<void*> >&, hobot::message::Message*, int)
objdump
通过 objdump -T 查看动态符号表,得到该符号的起始地址,是0000000000b54290
objdump -TC libe3_dataflow.so | grep 'CarizonPerception::GpuModel::ParseSuperpoint'
34133:0000000000b54290 g DF .text 0000000000002d3f Base CarizonPerception::GpuModel::ParseSuperpoint(std::vector<void*, std::allocator<void*> >&, hobot::message::Message*, int)
计算代码的地址
0xb54290 + 0x1cb6 = 0xB55F46
addr2line
通过nm/readelf 指令也可以看到,这个库中的符号信息已经被剥离了,所以后面的addr2line要用带符号信息的库
// 通过nm判断调试信息
nm -C libe3_dataflow.so
nm: libe3_dataflow.so: no symbols
// 通过 readelf 判断调试信息
readelf -S libe3_dataflow.so | grep debug
readelf -S libe3_dataflow.so.gdb | grep debug
72: [33] .debug_aranges PROGBITS 0000000000000000 02470d24
74: [34] .debug_info PROGBITS 0000000000000000 0251e524
76: [35] .debug_abbrev PROGBITS 0000000000000000 12b43d79
78: [36] .debug_line PROGBITS 0000000000000000 12ce1d59
80: [37] .debug_str PROGBITS 0000000000000000 136a2cfd
82: [38] .debug_loc PROGBITS 0000000000000000 191e7fe8
84: [39] .debug_ranges PROGBITS 0000000000000000 1dc792db
最后通过addr2line把这个地址转换成对应的源码位置,在 gpu_model.cpp 的第2980行
addr2line -e lib/libe3_dataflow.so.gdb 0xB55F46
/home/yuxuan.zhang/optimization/src/framework/x86/gpu/gpu_model.cpp:2980 (discriminator 2)
-e Specify the name of the executable for which addresses should be translated.
这里恰好就是我们人为制造的一个coredump
注意
关于strip
这里要使用带符号信息的lib库,才能找到对应的行号,关于符号信息的生成,这里的cmake主要是这样的
set(OUTPUT_NAME ${PROJECT_NAME})
add_library(${OUTPUT_NAME}_shared SHARED)
# The OUTPUT_NAME property renames the produced file, which is why the build
# yields lib${OUTPUT_NAME}.so instead of lib${OUTPUT_NAME}_shared.so.
set_target_properties(${OUTPUT_NAME}_shared PROPERTIES
  OUTPUT_NAME ${OUTPUT_NAME}
  VERSION ${CMAKE_PROJECT_VERSION}
  SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR}
)
# Right after linking, keep an unstripped copy next to the library as
# lib${OUTPUT_NAME}.so.gdb, for use with addr2line/gdb. Generator expressions
# resolve the real output location, so the command does not depend on the
# working directory or on a hard-coded library name.
# NOTE(review): assumes the library output directory is the ../lib used before - confirm.
add_custom_command(
  TARGET ${OUTPUT_NAME}_shared
  POST_BUILD
  COMMAND ${CMAKE_COMMAND} -E copy
          $<TARGET_FILE:${OUTPUT_NAME}_shared>
          $<TARGET_LINKER_FILE:${OUTPUT_NAME}_shared>.gdb
  VERBATIM
)
# Strip all symbols from the shipped library. DEPENDS orders this after the
# library target, i.e. after the unstripped copy above has been made.
add_custom_target(strip_binary ALL
  COMMAND ${CMAKE_STRIP} -s $<TARGET_FILE:${OUTPUT_NAME}_shared>
  DEPENDS ${OUTPUT_NAME}_shared
  VERBATIM
)
看看strip到底在做什么
readelf -S libe3_dataflow.so
readelf -S libe3_dataflow.so.gdb
diff一下两个指令的输出,可以看到带有符号信息的库要多出9个section,这9个section自然就是记录着符号信息的节了。这也解释了为什么addr2line可以用在带gdb符号的库里,因为符号信息的段是在最后才加的,所以两个库的代码段,数据段等信息是一致的。
[33] .debug_aranges PROGBITS 0000000000000000 02470d24
00000000000ad800 0000000000000000 0 0 1
[34] .debug_info PROGBITS 0000000000000000 0251e524
0000000010625855 0000000000000000 0 0 1
[35] .debug_abbrev PROGBITS 0000000000000000 12b43d79
000000000019dfe0 0000000000000000 0 0 1
[36] .debug_line PROGBITS 0000000000000000 12ce1d59
00000000009c0fa4 0000000000000000 0 0 1
[37] .debug_str PROGBITS 0000000000000000 136a2cfd
0000000005b452eb 0000000000000001 MS 0 0 1
[38] .debug_loc PROGBITS 0000000000000000 191e7fe8
0000000004a912f3 0000000000000000 0 0 1
[39] .debug_ranges PROGBITS 0000000000000000 1dc792db
0000000001401810 0000000000000000 0 0 1
[40] .symtab SYMTAB 0000000000000000 1f07aaf0
00000000001e8f58 0000000000000018 41 21996 8
[41] .strtab STRTAB 0000000000000000 1f263a48
0000000000769ffb 0000000000000000 0 0 1
[42] .shstrtab STRTAB 0000000000000000 1f9cda43
00000000000001a6 0000000000000000 0 0 1
如果没有带符号的库
同样有办法能大概定位,直接查看原库函数的反汇编
objdump -d libe3_dataflow.so > disassembly.txt
可以定位到 B55F46 附近的代码,根据反汇编结果看,周围的代码是在给vector赋值,从而定位到问题代码
b55dae: e8 ed 62 eb ff call a0c0a0 <_ZNSt6vectorIS_IfSaIfEESaIS1_EEC1EmRKS1_RKS2_@plt>
b55db3: 4c 63 bd 74 f1 ff ff movslq -0xe8c(%rbp),%r15
b55dba: 4c 8d b5 b0 f3 ff ff lea -0xc50(%rbp),%r14
b55dc1: 4c 89 e1 mov %r12,%rcx
b55dc4: 48 89 da mov %rbx,%rdx
b55dc7: 4c 89 f7 mov %r14,%rdi
b55dca: 4c 89 fe mov %r15,%rsi
b55dcd: e8 ee 7b ed ff call a2d9c0 <_ZNSt6vectorIS_IS_IfSaIfEESaIS1_EESaIS3_EEC1EmRKS3_RKS4_@plt>
b55dd2: 48 89 df mov %rbx,%rdi
b55dd5: e8 86 cc ec ff call a22a60 <_ZNSt6vectorIS_IfSaIfEESaIS1_EED1Ev@plt>
b55dda: 48 8b bd d0 f3 ff ff mov -0xc30(%rbp),%rdi
b55de1: 48 85 ff test %rdi,%rdi
b55de4: 74 05 je b55deb <_ZN17CarizonPerception8GpuModel15ParseSuperpointERSt6vectorIPvSaIS2_EEPN5hobot7message7MessageEi@@Base+0x1b5b>
b55de6: e8 05 cf eb ff call a12cf0 <_ZdlPv@plt>
b55deb: 44 8b 95 74 f1 ff ff mov -0xe8c(%rbp),%r10d
b55df2: 45 85 d2 test %r10d,%r10d
b55df5: 0f 8e 68 02 00 00 jle b56063 <_ZN17CarizonPerception8GpuModel15ParseSuperpointERSt6vectorIPvSaIS2_EEPN5hobot7message7MessageEi@@Base+0x1dd3>
b55dfb: 8b 85 74 f1 ff ff mov -0xe8c(%rbp),%eax
b55e01: 4c 8b a5 b0 f3 ff ff mov -0xc50(%rbp),%r12