Android 12 进程native crash流程分析

最新推荐文章于 2025-02-10 14:15:00 发布

pecuyu

最新推荐文章于 2025-02-10 14:15:00 发布

阅读量3.8k

点赞数 4

分类专栏： Android stability Android源码解析 Android 文章标签： android 稳定性 crash native crash_dump

本文链接：https://blog.csdn.net/qq_28261343/article/details/126682680

版权

Android 同时被 3 个专栏收录

56 篇文章

订阅专栏

Android源码解析

31 篇文章

订阅专栏

Android stability

12 篇文章

订阅专栏

文章托管在gitee上 Android Notes , 同步csdn
本文基于Android12 分析

概述

在Android中，crash大致可以做如下分类：

Java crash, 通常发生在Java虚拟机层面之上的，如 system_server/app java crash
Native crash，主要是C/C++ 层面发生的crash，system_server/app也可能发生native crash，因为它们都是由zygote fork而来，而zygote本身是通过运行 app_process 这个native 程序启动的。
kernel crash，通常会触发kernel panic 死机，通常是因为驱动或硬件导致。

本篇主要是看 Native crash 抓log流程。

实现机制介绍

实现机制主要是基于信号机制和ptrace机制，如下：

对于Android中的应用或native程序而言，它在启动时会首先加载linker模块做一些初始化，之后控制权才会回到进程自身的逻辑，因此可以在linker初始化的时候做一些工作，以实现抓取native crash的log，而在linker init过程，注册了一些 signal 的处理器（linux默认通常是直接kill进程）。
当进程异常时收到相关信号，signal 处理器会对信号流程做拦截处理，此处异常进程fork出新进程crash_dump，通过crash_dump去ptrace到异常进程，获取其调用栈、内存等信息，将输出内容写入到tombstoned提供的fd（通过socket连接tombstoned获取输出fd）。
当完成dump操作，会重新发送信号kill异常进程（在此操作之前会将signal 处理器重置为默认）。

流程概述图

流程大致如下图所示：
native crash流程图

流程分析

接下来，从 linker 的入口_start开始看起。如何分析入口可见参考。

begin.S

// bionic/linker/arch/arm64/begin.S
// Entry point of the linker on arm64: hands the initial stack pointer to
// __linker_init and then branches to the address that call returns.
ENTRY(_start)
  // Force unwinds to end in this function.
  .cfi_undefined x30

  // Pass the current stack pointer as the argument (raw_args) of __linker_init.
  mov x0, sp
  bl __linker_init  // call __linker_init

  /* linker init returns the _entry address in the main image */
  br x0
END(_start)

__linker_init

/// bionic/linker/linker_main.cpp
/*
 * This is the entry point for the linker, called from begin.S. This
 * method is responsible for fixing the linker's own relocations, and
 * then calling __linker_init_post_relocation().
 *
 * Because this method is called before the linker has fixed it's own
 * relocations, any attempt to reference an extern variable, extern
 * function, or other GOT reference will generate a segfault.
 */
// Linker entry called from the _start stub with the raw kernel argument block.
// Sets up early TLS, prelinks/links the linker's own image, then continues in
// __linker_init_post_relocation(), whose return value is returned to the stub.
extern "C" ElfW(Addr) __linker_init(void* raw_args) {
  // Initialize TLS early so system calls and errno work.
  KernelArgumentBlock args(raw_args);
  bionic_tcb temp_tcb __attribute__((uninitialized));
  linker_memclr(&temp_tcb, sizeof(temp_tcb));
  __libc_init_main_thread_early(args, &temp_tcb);

  ...
  // Prelink the linker so we can access linker globals.
  if (!tmp_linker_so.prelink_image()) __linker_cannot_link(args.argv[0]);
  if (!tmp_linker_so.link_image(SymbolLookupList(&tmp_linker_so), &tmp_linker_so, nullptr, nullptr)) __linker_cannot_link(args.argv[0]);

  return __linker_init_post_relocation(args, tmp_linker_so);  // next step of the flow followed in this article
}

__linker_init_post_relocation

linker的一些初始化，主要看linker_main函数

/// bionic/linker/linker_main.cpp
/*
 * This code is called after the linker has linked itself and fixed its own
 * GOT. It is safe to make references to externs and other non-local data at
 * this point. The compiler sometimes moves GOT references earlier in a
 * function, so avoid inlining this function (http://b/80503879).
 */
// Second stage of linker initialization. Finishes main-thread and libc-global
// setup, works out which executable to load when the linker is run directly,
// records argc/argv/envp, then calls linker_main() and returns the address the
// assembly stub should jump to.
static ElfW(Addr) __attribute__((noinline))
__linker_init_post_relocation(KernelArgumentBlock& args, soinfo& tmp_linker_so) {
  // Finish initializing the main thread.
  __libc_init_main_thread_late();

  // We didn't protect the linker's RELRO pages in link_image because we
  // couldn't make system calls on x86 at that point, but we can now...
  if (!tmp_linker_so.protect_relro()) __linker_cannot_link(args.argv[0]);

  // And we can set VMA name for the bss section now
  set_bss_vma_name(&tmp_linker_so);

  // Initialize the linker's static libc's globals
  __libc_init_globals();

  // Initialize the linker's own global variables
  tmp_linker_so.call_constructors();

  // Setting the linker soinfo's soname can allocate heap memory, so delay it until here.
  for (const ElfW(Dyn)* d = tmp_linker_so.dynamic; d->d_tag != DT_NULL; ++d) {
    if (d->d_tag == DT_SONAME) {
      tmp_linker_so.set_soname(tmp_linker_so.get_string(d->d_un.d_val));
    }
  }

  // When the linker is run directly rather than acting as PT_INTERP, parse
  // arguments and determine the executable to load. When it's instead acting
  // as PT_INTERP, AT_ENTRY will refer to the loaded executable rather than the
  // linker's _start.
  const char* exe_to_load = nullptr;
  if (getauxval(AT_ENTRY) == reinterpret_cast<uintptr_t>(&_start)) { // the linker was executed directly
    if (args.argc == 3 && !strcmp(args.argv[1], "--list")) {
      // We're being asked to behave like ldd(1).
      g_is_ldd = true;
      exe_to_load = args.argv[2];
    } else if (args.argc <= 1 || !strcmp(args.argv[1], "--help")) {
      async_safe_format_fd(STDOUT_FILENO,
         "Usage: %s [--list] PROGRAM [ARGS-FOR-PROGRAM...]\n"
         "       %s [--list] path.zip!/PROGRAM [ARGS-FOR-PROGRAM...]\n"
         "\n"
         "A helper program for linking dynamic executables. Typically, the kernel loads\n"
         "this program because it's the PT_INTERP of a dynamic executable.\n"
         "\n"
         "This program can also be run directly to load and run a dynamic executable. The\n"
         "executable can be inside a zip file if it's stored uncompressed and at a\n"
         "page-aligned offset.\n"
         "\n"
         "The --list option gives behavior equivalent to ldd(1) on other systems.\n",
         args.argv[0], args.argv[0]);
      _exit(EXIT_SUCCESS);
    } else {
      // Plain "linker PROGRAM ...": argv[1] is the program, so one leading
      // argument belongs to the linker itself.
      exe_to_load = args.argv[1];
      __libc_shared_globals()->initial_linker_arg_count = 1;
    }
  }

  // store argc/argv/envp to use them for calling constructors
  g_argc = args.argc - __libc_shared_globals()->initial_linker_arg_count;
  g_argv = args.argv + __libc_shared_globals()->initial_linker_arg_count;
  g_envp = args.envp;
  __libc_shared_globals()->init_progname = g_argv[0];

  // Initialize static variables. Note that in order to
  // get correct libdl_info we need to call constructors
  // before get_libdl_info().
  sonext = solist = solinker = get_libdl_info(tmp_linker_so);
  g_default_namespace.add_soinfo(solinker);
  // Enter linker_main
  ElfW(Addr) start_address = linker_main(args, exe_to_load);

  // In ldd mode (--list) there is nothing to run; exit once linker_main returns.
  if (g_is_ldd) _exit(EXIT_SUCCESS);

  INFO("[ Jumping to _start (%p)... ]", reinterpret_cast<void*>(start_address));

  // Return the address that the calling assembly stub should jump to.
  return start_address;
}

linker_main

/// bionic/linker/linker_main.cpp
// Excerpt of linker_main (the rest of the body is elided in this article):
// only the initialization calls leading up to the debuggerd handler
// registration are shown.
static ElfW(Addr) linker_main(KernelArgumentBlock& args, const char* exe_to_load) {
  ...
  // Sanitize the environment.
  __libc_init_AT_SECURE(args.envp);

  // Initialize system properties
  __system_properties_init(); // may use 'environ'

  // Initialize platform properties.
  platform_properties_init();

  // Register the debuggerd signal handler.
  linker_debuggerd_init(); // sets up the signal handler

  ...

linker_debuggerd_init

/// bionic/linker/linker_debuggerd_android.cpp
// Builds the debuggerd callback table for the linker and passes it to
// debuggerd_init().
void linker_debuggerd_init() {
  // There may be a version mismatch between the bootstrap linker and the crash_dump in the APEX,
  // so don't pass in any process info from the bootstrap linker.
  debuggerd_callbacks_t callbacks = {
#if defined(__ANDROID_APEX__)
      // Only supplied when built with __ANDROID_APEX__ defined (see the comment above).
      .get_process_info = get_process_info,
#endif
      .post_dump = notify_gdb_of_libraries,
  };
  debuggerd_init(&callbacks);  // continues in debuggerd_init (per the article, from libdebuggerd_handler_fallback)
}

debuggerd_init

/// system/core/debuggerd/handler/debuggerd_handler.cpp
// Installs the debuggerd crash handler: stores the optional callbacks,
// pre-allocates the stack for the pseudothread, and registers
// debuggerd_signal_handler through debuggerd_register_handlers().
//
// callbacks: optional; when non-null its contents are copied into g_callbacks.
void debuggerd_init(debuggerd_callbacks_t* callbacks) {
  if (callbacks) {
    g_callbacks = *callbacks;
  }
  // Pre-allocate the stack for the debuggerd pseudothread. The whole mapping
  // (thread_stack_pages + 2 pages) starts out PROT_NONE; only the middle
  // thread_stack_pages pages are made readable/writable below, which leaves one
  // inaccessible page on each side of the usable stack.
  size_t thread_stack_pages = 8;
  void* thread_stack_allocation = mmap(nullptr, PAGE_SIZE * (thread_stack_pages + 2), PROT_NONE,
                                       MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
  if (thread_stack_allocation == MAP_FAILED) {
    fatal_errno("failed to allocate debuggerd thread stack");
  }

  // Skip the first (inaccessible) page.
  char* stack = static_cast<char*>(thread_stack_allocation) + PAGE_SIZE;
  if (mprotect(stack, PAGE_SIZE * thread_stack_pages, PROT_READ | PROT_WRITE) != 0) {
    fatal_errno("failed to mprotect debuggerd thread stack");
  }

  // Stack grows negatively, set it to the last byte in the page...
  stack = (stack + thread_stack_pages * PAGE_SIZE - 1);
  // and align it.
  stack -= 15;
  pseudothread_stack = stack; // saved for later use as the child stack passed to clone()

  // Set up the sigaction: block all signals while the handler runs and
  // request the three-argument (siginfo) handler form.
  struct sigaction action;
  memset(&action, 0, sizeof(action));
  sigfillset(&action.sa_mask);
  action.sa_sigaction = debuggerd_signal_handler; // the signal handler
  action.sa_flags = SA_RESTART | SA_SIGINFO;

  // Use the alternate signal stack if available so we can catch stack overflows.
  action.sa_flags |= SA_ONSTACK; // run on a separate stack so stack-overflow crashes can still be handled

#define SA_EXPOSE_TAGBITS 0x00000800
  // Request that the kernel set tag bits in the fault address. This is necessary for diagnosing MTE
  // faults.
  action.sa_flags |= SA_EXPOSE_TAGBITS;

  debuggerd_register_handlers(&action);  // install the action for the relevant signals
}

debuggerd_register_handlers

/// @system/core/debuggerd/include/debuggerd/handler.h
// DEBUGGER_ACTION_DUMP_TOMBSTONE and DEBUGGER_ACTION_DUMP_BACKTRACE are both
// triggered via BIONIC_SIGNAL_DEBUGGER. The debugger_action_t is sent via si_value
// using sigqueue(2) or equivalent. If no si_value is specified (e.g. if the
// signal is sent by kill(2)), the default behavior is to print the backtrace
// to the log.
  //  debuggerd信号用于输出trace  ---   35 (__SIGRTMIN + 3)        debuggerd
#define DEBUGGER_SIGNAL BIONIC_SIGNAL_DEBUGGER 

// Installs `action` as the handler for the fatal signals listed below (unless
// disabled by system properties) and, unconditionally, for
// BIONIC_SIGNAL_DEBUGGER.
//
// action: the sigaction to install; previous handlers are not saved.
static void __attribute__((__unused__)) debuggerd_register_handlers(struct sigaction* action) {
  char value[PROP_VALUE_MAX] = "";
  // Enabled unless BOTH ro.debuggable == "1" AND debug.debuggerd.disable == "1".
  bool enabled =
      !(__system_property_get("ro.debuggable", value) > 0 && !strcmp(value, "1") &&
        __system_property_get("debug.debuggerd.disable", value) > 0 && !strcmp(value, "1"));
  if (enabled) { // kill switch: when debuggable and disabled, the fatal-signal handlers below are not registered
    sigaction(SIGABRT, action, nullptr);
    sigaction(SIGBUS, action, nullptr);
    sigaction(SIGFPE, action, nullptr);
    sigaction(SIGILL, action, nullptr);
    sigaction(SIGSEGV, action, nullptr);
    sigaction(SIGSTKFLT, action, nullptr);
    sigaction(SIGSYS, action, nullptr);
    sigaction(SIGTRAP, action, nullptr);
  }

  sigaction(BIONIC_SIGNAL_DEBUGGER, action, nullptr);  //  the debugger signal handler is always installed
}

下面是Android对一些特殊信号的定义：

/// @bionic/libc/platform/bionic/reserved_signals.h
// Realtime signals reserved for internal use:
//   32 (__SIGRTMIN + 0)        POSIX timers
//   33 (__SIGRTMIN + 1)        libbacktrace
//   34 (__SIGRTMIN + 2)        libcore
//   35 (__SIGRTMIN + 3)        debuggerd
//   36 (__SIGRTMIN + 4)        platform profilers (heapprofd, traced_perf)
//   37 (__SIGRTMIN + 5)        coverage (libprofile-extras)
//   38 (__SIGRTMIN + 6)        heapprofd ART managed heap dumps
//   39 (__SIGRTMIN + 7)        fdtrack
//   40 (__SIGRTMIN + 8)        android_run_on_all_threads (bionic/pthread_internal.cpp)

#define BIONIC_SIGNAL_POSIX_TIMERS (__SIGRTMIN + 0)
#define BIONIC_SIGNAL_BACKTRACE (__SIGRTMIN + 1)
#define BIONIC_SIGNAL_DEBUGGER (__SIGRTMIN + 3)
#define BIONIC_SIGNAL_PROFILER (__SIGRTMIN + 4)
#define BIONIC_SIGNAL_ART_PROFILER (__SIGRTMIN + 6)
#define BIONIC_SIGNAL_FDTRACK (__SIGRTMIN + 7)
#define BIONIC_SIGNAL_RUN_ON_ALL_THREADS (__SIGRTMIN + 8)

信号

在linux环境，执行如下命令，就可以看到各种信号的值及对应的含义：

# kill -l
 1    HUP Hangup                           23    URG Urgent I/O condition             45     45 Signal 45
 2    INT Interrupt                        24   XCPU CPU time limit exceeded          46     46 Signal 46
 3   QUIT Quit                             25   XFSZ File size limit exceeded         47     47 Signal 47
 4    ILL Illegal instruction              26 VTALRM Virtual timer expired            48     48 Signal 48
 5   TRAP Trap                             27   PROF Profiling timer expired          49     49 Signal 49
 6   ABRT Aborted                          28  WINCH Window size changed              50     50 Signal 50
 7    BUS Bus error                        29     IO I/O possible                     51     51 Signal 51
 8    FPE Floating point exception         30    PWR Power failure                    52     52 Signal 52
 9   KILL Killed                           31    SYS Bad system call                  53     53 Signal 53
10   USR1 User signal 1                    32     32 Signal 32                        54     54 Signal 54
11   SEGV Segmentation fault               33     33 Signal 33                        55     55 Signal 55
12   USR2 User signal 2                    34     34 Signal 34                        56     56 Signal 56
13   PIPE Broken pipe                      35     35 Signal 35                        57     57 Signal 57
14   ALRM Alarm clock                      36     36 Signal 36                        58     58 Signal 58
15   TERM Terminated                       37     37 Signal 37                        59     59 Signal 59
16 STKFLT Stack fault                      38     38 Signal 38                        60     60 Signal 60
17   CHLD Child exited                     39     39 Signal 39                        61     61 Signal 61
18   CONT Continue                         40     40 Signal 40                        62     62 Signal 62
19   STOP Stopped (signal)                 41     41 Signal 41                        63     63 Signal 63
20   TSTP Stopped                          42     42 Signal 42                        64     64 Signal 64
21   TTIN Stopped (tty input)              43     43 Signal 43
22   TTOU Stopped (tty output)             44     44 Signal 44

比较常见的错误信号如下：

11 SEGV Segmentation fault 段错误
- 解引用空指针或未初始化的或已经被释放的指针
- 访问字节对齐错误的内存
- 向只读内存区写操作
- 读写分配的内存区域之外的内存
- 其他内存损坏
6 ABRT Aborted 通常是程序主动调用abort ，在tombstone文件一般有abort信息
7 BUS Bus error 比如出现的内存对齐问题
4 ILL Illegal instruction 非法指令问题
8 FPE Floating point exception 非法算术运算问题，比如执行除0操作
13 PIPE Broken pipe 管道损坏问题，比如向一个已经关闭的socket写
3 QUIT Quit Android对应用进程做了拦截处理，可以进行dump trace ，执行 kill -3 $pid
35 debuggerd 信号，使用于Android，用于dump trace

当进程发生crash时，会收到相关信号，之前设置的信号处理器会进行处理

debuggerd_signal_handler

处理流程如下：

打印crash信号概述
clone创建子线程去执行抓dump
等待抓dump完成
重新发送信号kill自身

/// @system/core/debuggerd/handler/debuggerd_handler.cpp
// Handler that does crash dumping by forking and doing the processing in the child.
// Do this by ptracing the relevant thread, and then execing debuggerd to do the actual dump.
//
// Steps, in order: normalize `info`, collect process info, take the fallback
// path if required, serialize on crash_mutex, log the signal summary, make the
// process ptrace-able, clone the pseudothread and wait for it to finish,
// restore the prctl settings, then either unlock (debugger signal) or resend
// the signal (fatal signal).
//
// signal_number: the signal being handled.
// info: siginfo from the kernel; replaced by a synthesized one if unavailable.
// context: the ucontext passed by the kernel, forwarded to the pseudothread.
static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void* context) {
  // Make sure we don't change the value of errno, in case a signal comes in between the process
  // making a syscall and checking errno.
  ErrnoRestorer restorer;

  auto *ucontext = static_cast<ucontext_t*>(context);

  // It's possible somebody cleared the SA_SIGINFO flag, which would mean
  // our "info" arg holds an undefined value.
  if (!have_siginfo(signal_number)) {
    info = nullptr;
  }

  struct siginfo dummy_info = {};
  if (!info) {   // no usable siginfo: synthesize one (this feeds the summary line logged below)
    memset(&dummy_info, 0, sizeof(dummy_info));
    dummy_info.si_signo = signal_number;
    dummy_info.si_code = SI_USER;
    dummy_info.si_pid = __getpid();
    dummy_info.si_uid = getuid();
    info = &dummy_info;
  } else if (info->si_code >= 0 || info->si_code == SI_TKILL) {
    // rt_tgsigqueueinfo(2)'s documentation appears to be incorrect on kernels
    // that contain commit 66dd34a (3.9+). The manpage claims to only allow
    // negative si_code values that are not SI_TKILL, but 66dd34a changed the
    // check to allow all si_code values in calls coming from inside the house.
    // (This branch contains no statements.)
  }

  debugger_process_info process_info = {};
  uintptr_t si_val = reinterpret_cast<uintptr_t>(info->si_ptr);
  if (signal_number == BIONIC_SIGNAL_DEBUGGER) { // is this the debuggerd (dump request) signal?
    if (info->si_code == SI_QUEUE && info->si_pid == __getpid()) {
      // Allow for the abort message to be explicitly specified via the sigqueue value.
      // Keep the bottom bit intact for representing whether we want a backtrace or a tombstone.
      if (si_val != kDebuggerdFallbackSivalUintptrRequestDump) {
        process_info.abort_msg = reinterpret_cast<void*>(si_val & ~1);
        info->si_ptr = reinterpret_cast<void*>(si_val & 1);
      }
    }
  } else if (g_callbacks.get_process_info) {
    process_info = g_callbacks.get_process_info();
  }

  // If sival_int is ~0, it means that the fallback handler has been called
  // once before and this function is being called again to dump the stack
  // of a specific thread. It is possible that the prctl call might return 1,
  // then return 0 in subsequent calls, so check the sival_int to determine if
  // the fallback handler should be called first.
  if (si_val == kDebuggerdFallbackSivalUintptrRequestDump ||
      prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0) == 1) {
    // This check might be racy if another thread sets NO_NEW_PRIVS, but this should be unlikely,
    // you can only set NO_NEW_PRIVS to 1, and the effect should be at worst a single missing
    // ANR trace.
    debuggerd_fallback_handler(info, ucontext, process_info.abort_msg);
    resend_signal(info);
    return;
  }

  // Only allow one thread to handle a signal at a time.
  int ret = pthread_mutex_lock(&crash_mutex);  // only one signal is processed at any given time
  if (ret != 0) {
    async_safe_format_log(ANDROID_LOG_INFO, "libc", "pthread_mutex_lock failed: %s", strerror(ret));
    return;
  }

  // Log the one-line signal summary, which looks like:
  // Fatal signal 11 (SIGSEGV), code 0 (SI_USER from pid 3176, uid 0) in tid 457 (audioserver), pid 457 (audioserver)
  log_signal_summary(info);

  // Everything the pseudothread needs; pseudothread_tid starts at -1 and is
  // used below as the futex word for start/exit synchronization.
  debugger_thread_info thread_info = {
      .crashing_tid = __gettid(),
      .pseudothread_tid = -1,
      .siginfo = info,
      .ucontext = context,
      .process_info = process_info,
  };

  // Set PR_SET_DUMPABLE to 1, so that crash_dump can ptrace us.
  int orig_dumpable = prctl(PR_GET_DUMPABLE);
  if (prctl(PR_SET_DUMPABLE, 1) != 0) {
    fatal_errno("failed to set dumpable");
  }

  // On kernels with yama_ptrace enabled, also allow any process to attach.
  bool restore_orig_ptracer = true;
  if (prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY) != 0) {
    if (errno == EINVAL) {
      // This kernel does not support PR_SET_PTRACER_ANY, or Yama is not enabled.
      restore_orig_ptracer = false;
    } else {
      fatal_errno("failed to set traceable");
    }
  }

  // Essentially pthread_create without CLONE_FILES, so we still work during file descriptor
  // exhaustion.
  pid_t child_pid =  // like pthread_create, except that the file descriptor table is not shared
    clone(debuggerd_dispatch_pseudothread, pseudothread_stack,
          CLONE_THREAD | CLONE_SIGHAND | CLONE_VM | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID,
          &thread_info, nullptr, nullptr, &thread_info.pseudothread_tid);
  if (child_pid == -1) {
    fatal_errno("failed to spawn debuggerd dispatch thread");
  }

  // Wait for the child to start...
  futex_wait(&thread_info.pseudothread_tid, -1); // wait for the dispatch thread to start

  // and then wait for it to terminate.
  futex_wait(&thread_info.pseudothread_tid, child_pid); // wait for the dispatch thread to exit

  // Restore PR_SET_DUMPABLE to its original value.
  if (prctl(PR_SET_DUMPABLE, orig_dumpable) != 0) {
    fatal_errno("failed to restore dumpable");
  }

  // Restore PR_SET_PTRACER to its original value.
  if (restore_orig_ptracer && prctl(PR_SET_PTRACER, 0) != 0) {
    fatal_errno("failed to restore traceable");
  }

  if (info->si_signo == BIONIC_SIGNAL_DEBUGGER) {
    // If the signal is fatal, don't unlock the mutex to prevent other crashing threads from
    // starting to dump right before our death.
    pthread_mutex_unlock(&crash_mutex);
  } else {
    // Resend the signal, so that either the debugger or the parent's waitpid sees it.
    resend_signal(info); // resend the signal so that the process terminates
  }
}

先看下resend_signal，因为中间的流程过长，影响分析。这个方法主要是确保进程在执行dump后，kill掉自身，不管是否真正成功dump。当然，它会等待crash_dump执行完或者后者退出（比如执行发生异常），但是crash_dump不会去抓自身的异常，防止无限循环dump。

resend_signal

// Re-delivers a fatal signal to the current thread after resetting its
// disposition to the default. Does nothing for BIONIC_SIGNAL_DEBUGGER.
//
// info: the siginfo of the signal being handled; passed through unchanged
//       when the signal is re-queued.
static void resend_signal(siginfo_t* info) {
  // Signals can either be fatal or nonfatal.
  // For fatal signals, crash_dump will send us the signal we crashed with
  // before resuming us, so that processes using waitpid on us will see that we
  // exited with the correct exit status (e.g. so that sh will report
  // "Segmentation fault" instead of "Killed"). For this to work, we need
  // to deregister our signal handler for that signal before continuing.
  if (info->si_signo != BIONIC_SIGNAL_DEBUGGER) { // skip the debugger signal: it only requests a trace and must not kill the process
    signal(info->si_signo, SIG_DFL); // back to the default disposition, so the resent signal is not handled here again
    // Re-queue the same signal, with the same siginfo, to this exact thread
    // via the rt_tgsigqueueinfo system call.
    int rc = syscall(SYS_rt_tgsigqueueinfo, __getpid(), __gettid(), info->si_signo, info);
    if (rc != 0) {
      fatal_errno("failed to resend signal during crash");
    }
  }
}

debuggerd_dispatch_pseudothread

在pseudothread线程处理抓crash trace流程，工作流程如下：

pipe 创建 pseudothread 与 crash_dump 双向通信管道
写入 crash信息到管道，后续到 crash_dump 读取会用到
fork子进程执行 crash_dump, 真正去抓log
阻塞等待 crash_dump pipe回复信息，收到继续执行创建 vm process
waitpid 等待 crash_dump 结束
对于非debuggerd信号（即真正的crash），再次read管道，阻塞等待dump完成后才返回

/// system/core/debuggerd/handler/debuggerd_handler.cpp
// Body of the pseudothread created with clone() by the signal handler.
// It resets its file descriptors, creates the two pipes used to talk to
// crash_dump, writes the crash info into one of them, forks and execs
// crash_dump, creates the vm process once crash_dump signals readiness,
// reaps crash_dump and, for crashes, waits for the dump to finish.
//
// arg: pointer to the debugger_thread_info filled in by the signal handler.
// Returns 0 if crash_dump reported a successful start, 1 otherwise.
static int debuggerd_dispatch_pseudothread(void* arg) {
  debugger_thread_info* thread_info = static_cast<debugger_thread_info*>(arg);

  for (int i = 0; i < 1024; ++i) {// close file descriptors 0..1023
    // Don't use close to avoid bionic's file descriptor ownership checks.
    syscall(__NR_close, i);
  }

  // With every low fd closed, this open() is expected to return fd 0.
  int devnull = TEMP_FAILURE_RETRY(open("/dev/null", O_RDWR));
  if (devnull == -1) {
    fatal_errno("failed to open /dev/null");
  } else if (devnull != 0) {
    fatal_errno("expected /dev/null fd to be 0, actually %d", devnull);
  }

  // devnull will be 0.
  TEMP_FAILURE_RETRY(dup2(devnull, 1));  // point stdout (and stderr below) at /dev/null
  TEMP_FAILURE_RETRY(dup2(devnull, 2));

  unique_fd input_read, input_write;
  unique_fd output_read, output_write;
  // Create two pipes for communication between this thread and crash_dump:
  //   pseudothread writes output_write -->  crash_dump reads output_read
  //   pseudothread reads  input_read   <--  crash_dump writes input_write
  // NOTE(review): the "!= 0" after the first !Pipe(...) is redundant; the
  // condition is equivalent to "!Pipe(...) || !Pipe(...)".
  if (!Pipe(&input_read, &input_write) != 0 || !Pipe(&output_read, &output_write)) {
    fatal_errno("failed to create pipe");
  }

  uint32_t version;
  ssize_t expected;

  // ucontext_t is absurdly large on AArch64, so piece it together manually with writev.
  // Only the first three entries are initialized here; iovs[3] is filled in below.
  struct iovec iovs[4] = {
      {.iov_base = &version, .iov_len = sizeof(version)},
      {.iov_base = thread_info->siginfo, .iov_len = sizeof(siginfo_t)},
      {.iov_base = thread_info->ucontext, .iov_len = sizeof(ucontext_t)},
  };

  // Choose the payload format (and the byte count we expect to write).
  if (thread_info->process_info.fdsan_table) {
    // Dynamic executables always use version 4. There is no need to increment the version number if
    // the format changes, because the sender (linker) and receiver (crash_dump) are version locked.
    version = 4;
    expected = sizeof(CrashInfoHeader) + sizeof(CrashInfoDataDynamic);

    iovs[3] = {.iov_base = &thread_info->process_info,
               .iov_len = sizeof(thread_info->process_info)};
  } else {
    // Static executables always use version 1.
    version = 1;
    expected = sizeof(CrashInfoHeader) + sizeof(CrashInfoDataStatic);

    iovs[3] = {.iov_base = &thread_info->process_info.abort_msg, .iov_len = sizeof(uintptr_t)};
  }
  errno = 0;
  // Request a pipe buffer of at least `expected` bytes, presumably so the
  // writev() below can complete before anyone is reading the other end.
  if (fcntl(output_write.get(), F_SETPIPE_SZ, expected) < static_cast<int>(expected)) {
    fatal_errno("failed to set pipe buffer size");
  }
  // Write the crash info into the pipe; crash_dump reads it later.
  ssize_t rc = TEMP_FAILURE_RETRY(writev(output_write.get(), iovs, arraysize(iovs)));
  if (rc == -1) {
    fatal_errno("failed to write crash info");
  } else if (rc != expected) {
    fatal("failed to write crash info, wrote %zd bytes, expected %zd", rc, expected);
  }

  // Don't use fork(2) to avoid calling pthread_atfork handlers.
  pid_t crash_dump_pid = __fork();
  if (crash_dump_pid == -1) {
    // NOTE(review): a failed fork is only logged; execution continues below
    // with crash_dump_pid == -1, so the later waitpid() is called with pid -1.
    // Confirm this is intended.
    async_safe_format_log(ANDROID_LOG_FATAL, "libc",
                          "failed to fork in debuggerd signal handler: %s", strerror(errno));
  } else if (crash_dump_pid == 0) { // child: this process becomes crash_dump
    // Make STDOUT the write end of the "input" pipe, so crash_dump can reach
    // the pseudothread simply by writing to standard output. crash_dump later
    // recovers this fd with dup(STDOUT_FILENO).
    TEMP_FAILURE_RETRY(dup2(input_write.get(), STDOUT_FILENO));
    // Make STDIN the read end of the "output" pipe, so crash_dump can read the
    // pseudothread's data from standard input.
    TEMP_FAILURE_RETRY(dup2(output_read.get(), STDIN_FILENO));
    input_read.reset();
    input_write.reset();
    output_read.reset();
    output_write.reset(); // the original pipe fds are no longer needed after the dup2 calls

    raise_caps();

    char main_tid[10];
    char pseudothread_tid[10];
    char debuggerd_dump_type[10];
    async_safe_format_buffer(main_tid, sizeof(main_tid), "%d", thread_info->crashing_tid);
    async_safe_format_buffer(pseudothread_tid, sizeof(pseudothread_tid), "%d",
                             thread_info->pseudothread_tid);
    async_safe_format_buffer(debuggerd_dump_type, sizeof(debuggerd_dump_type), "%d",
                             get_dump_type(thread_info));
    // Exec crash_dump; per the article this is
    // /apex/com.android.runtime/bin/crash_dump32 or crash_dump64, depending on bitness.
    execle(CRASH_DUMP_PATH, CRASH_DUMP_NAME, main_tid, pseudothread_tid, debuggerd_dump_type,
           nullptr, nullptr);
    // Only reached if execle() failed.
    async_safe_format_log(ANDROID_LOG_FATAL, "libc", "failed to exec crash_dump helper: %s",
                          strerror(errno));
    return 1;
  }
  // Parent side: close the pipe ends that belong to crash_dump.
  input_write.reset();
  output_read.reset();

  // crash_dump will ptrace and pause all of our threads, and then write to the pipe to tell
  // us to fork off a process to read memory from.
  char buf[4];
  rc = TEMP_FAILURE_RETRY(read(input_read.get(), &buf, sizeof(buf)));// block until crash_dump writes, i.e. asks for the vm process

  bool success = false;
  if (rc == 1 && buf[0] == '\1') {
    // crash_dump successfully started, and is ptracing us.
    // Fork off a copy of our address space for it to use.
    create_vm_process();
    success = true;
  } else {
    // Something went wrong, log it.
    if (rc == -1) {
      async_safe_format_log(ANDROID_LOG_FATAL, "libc", "read of IPC pipe failed: %s",
                            strerror(errno));
    } else if (rc == 0) {
      async_safe_format_log(ANDROID_LOG_FATAL, "libc",
                            "crash_dump helper failed to exec, or was killed");
    } else if (rc != 1) {
      async_safe_format_log(ANDROID_LOG_FATAL, "libc",
                            "read of IPC pipe returned unexpected value: %zd", rc);
    } else if (buf[0] != '\1') {
      async_safe_format_log(ANDROID_LOG_FATAL, "libc", "crash_dump helper reported failure");
    }
  }

  // Don't leave a zombie child.
  int status;
  if (TEMP_FAILURE_RETRY(waitpid(crash_dump_pid, &status, 0)) == -1) {// reap the crash_dump process
    async_safe_format_log(ANDROID_LOG_FATAL, "libc", "failed to wait for crash_dump helper: %s",
                          strerror(errno));
  } else if (WIFSTOPPED(status) || WIFSIGNALED(status)) {
    async_safe_format_log(ANDROID_LOG_FATAL, "libc", "crash_dump helper crashed or stopped");
  }

  if (success) {
    if (thread_info->siginfo->si_signo != BIONIC_SIGNAL_DEBUGGER) {
      // For crashes, we don't need to minimize pause latency.
      // Wait for the dump to complete before having the process exit, to avoid being murdered by
      // ActivityManager or init.
      TEMP_FAILURE_RETRY(read(input_read, &buf, sizeof(buf)));   // block until the dump has completed
    }
  }

  return success ? 0 : 1;
}

crash_dump#main

主要工作如下：

重置信号处理器，防止自身异常时dump自身
设置pipe信号处理器
fork子进程进行dump，其工作如下
- 设置30s的alarm，防止dump过长或长时间卡住
- 收集打开的文件描述符
- 获取所有线程信息
- PTRACE_O_TRACECLONE 监听pseudothread clone，并通知其继续，后者会创建vm process，获取一份内存拷贝
- 连接tombstoned，获取trace输出的tombstone临时文件fd
- engrave_tombstone 输出trace内容到 tombstone fd
- 通知ams发生native crash 事件
- 通知 tombstoned 已完成dump，后者完成tombstone临时文件重命名

/// system/core/debuggerd/crash_dump.cpp
// Entry point of crash_dump.
//
// Flow as written below:
//   1. Reset signal handling, open /proc/<parent pid>, and fork. The original process only
//      waits for the child to release it and then exits; the child does all of the work.
//   2. The child seizes and interrupts every thread of the parent process (except the
//      pseudothread) and records per-thread information and registers.
//   3. It seizes the pseudothread with PTRACE_O_TRACECLONE, tells it to continue over the
//      output pipe, waits for the vm process, and detaches from all threads.
//   4. It drops capabilities, obtains output fds from tombstoned, writes either a backtrace
//      or a tombstone, optionally notifies ActivityManager, and finally notifies tombstoned.
//
// Returns 0 when the end of the function is reached; most failures abort through
// LOG(FATAL)/PLOG(FATAL).
int main(int argc, char** argv) {
  DefuseSignalHandlers(); // reset signal handlers so that crash_dump does not try to dump itself
  InstallSigPipeHandler(); // install the SIGPIPE handler

  // There appears to be a bug in the kernel where our death causes SIGHUP to
  // be sent to our process group if we exit while it has stopped jobs (e.g.
  // because of wait_for_debugger). Use setsid to create a new process group to
  // avoid hitting this.
  setsid();

  atrace_begin(ATRACE_TAG, "before reparent");
  pid_t target_process = getppid();

  // Open /proc/`getppid()` before we daemonize.
  std::string target_proc_path = "/proc/" + std::to_string(target_process);
  int target_proc_fd = open(target_proc_path.c_str(), O_DIRECTORY | O_RDONLY);
  if (target_proc_fd == -1) {
    PLOG(FATAL) << "failed to open " << target_proc_path;
  }

  // Make sure getppid() hasn't changed.
  if (getppid() != target_process) {
    LOG(FATAL) << "parent died";
  }
  atrace_end(ATRACE_TAG);

  // Reparent ourselves to init, so that the signal handler can waitpid on the
  // original process to avoid leaving a zombie for non-fatal dumps.
  // Move the input/output pipes off of stdout/stderr, out of paranoia.
  unique_fd output_pipe(dup(STDOUT_FILENO));// keep the pipe currently on stdout under a new fd
  unique_fd input_pipe(dup(STDIN_FILENO));

  unique_fd fork_exit_read, fork_exit_write;
  if (!Pipe(&fork_exit_read, &fork_exit_write)) { // pipe between this process and the child forked below
    PLOG(FATAL) << "failed to create pipe";
  }

  // Fork the child that actually ptraces the target and produces the dump.
  pid_t forkpid = fork();
  if (forkpid == -1) {
    PLOG(FATAL) << "fork failed";
  } else if (forkpid == 0) {
    fork_exit_read.reset(); // child: close the read end
  } else {
    // We need the pseudothread to live until we get around to verifying the vm pid against it.
    // The last thing it does is block on a waitpid on us, so wait until our child tells us to die.
    fork_exit_write.reset(); // parent: close the write end
    char buf;
    TEMP_FAILURE_RETRY(read(fork_exit_read.get(), &buf, sizeof(buf)));  // block here so the pseudothread stays blocked in waitpid
    _exit(0);
  }

  // Only the forked child runs from here on (the parent has called _exit above).
  ATRACE_NAME("after reparent");
  pid_t pseudothread_tid;
  DebuggerdDumpType dump_type;
  ProcessInfo process_info;

  Initialize(argv);
  ParseArgs(argc, argv, &pseudothread_tid, &dump_type);// parse the command-line arguments

  // Die if we take too long.
  // Note: processes with many threads and minidebug-info can take a bit to
  //       unwind, do not make this too small. b/62828735
  alarm(30 * android::base::HwTimeoutMultiplier()); // 30s base timeout, scaled by the hardware multiplier

  // Collect the list of open files.
  OpenFilesList open_files;
  {
    ATRACE_NAME("open files");
    populate_open_files_list(&open_files, g_target_thread);// fill in open_files
  }

  // In order to reduce the duration that we pause the process for, we ptrace
  // the threads, fetch their registers and associated information, and then
  // fork a separate process as a snapshot of the process's address space.
  std::set<pid_t> threads;
  if (!android::procinfo::GetProcessTids(g_target_thread, &threads)) {
    PLOG(FATAL) << "failed to get process threads";
  }

  std::map<pid_t, ThreadInfo> thread_info;
  siginfo_t siginfo;
  std::string error;

  { // gather per-thread information
    ATRACE_NAME("ptrace");
    for (pid_t thread : threads) {
      // Trace the pseudothread separately, so we can use different options.
      if (thread == pseudothread_tid) {
        continue;
      }

      if (!ptrace_seize_thread(target_proc_fd, thread, &error)) {
        bool fatal = thread == g_target_thread;
        LOG(fatal ? FATAL : WARNING) << error;
      }

      ThreadInfo info;
      info.pid = target_process;
      info.tid = thread;
      info.uid = getuid();
      info.thread_name = get_thread_name(thread);

      unique_fd attr_fd(openat(target_proc_fd, "attr/current", O_RDONLY | O_CLOEXEC));
      if (!android::base::ReadFdToString(attr_fd, &info.selinux_label)) {
        PLOG(WARNING) << "failed to read selinux label";
      }

      if (!ptrace_interrupt(thread, &info.signo)) {
        PLOG(WARNING) << "failed to ptrace interrupt thread " << thread;
        ptrace(PTRACE_DETACH, thread, 0, 0);
        continue;
      }

      struct iovec iov = {
          &info.tagged_addr_ctrl,
          sizeof(info.tagged_addr_ctrl),
      };
      if (ptrace(PTRACE_GETREGSET, thread, NT_ARM_TAGGED_ADDR_CTRL,
                 reinterpret_cast<void*>(&iov)) == -1) {
        info.tagged_addr_ctrl = -1;
      }

      if (thread == g_target_thread) { // the target thread: its crash information comes from the input pipe
        // Read the thread's registers along with the rest of the crash info out of the pipe.
        ReadCrashInfo(input_pipe, &siginfo, &info.registers, &process_info); // read the crash info
        info.siginfo = &siginfo;
        info.signo = info.siginfo->si_signo;

        info.command_line = get_command_line(g_target_thread);
      } else {
        info.registers.reset(unwindstack::Regs::RemoteGet(thread));
        if (!info.registers) {
          PLOG(WARNING) << "failed to fetch registers for thread " << thread;
          ptrace(PTRACE_DETACH, thread, 0, 0);
          continue;
        }
      }

      thread_info[thread] = std::move(info);
    }
  }

  // Trace the pseudothread with PTRACE_O_TRACECLONE and tell it to fork.
  // With this option the tracee is stopped at its next clone() and the newly created
  // process is traced automatically; the new process starts with a SIGSTOP, or with
  // PTRACE_EVENT_STOP if PTRACE_SEIZE was used.
  // The pid of the new process can be retrieved with PTRACE_GETEVENTMSG.
  if (!ptrace_seize_thread(target_proc_fd, pseudothread_tid, &error, PTRACE_O_TRACECLONE)) {
    LOG(FATAL) << "failed to seize pseudothread: " << error;
  }

  // Tell the pseudothread to continue; it is expected to call create_vm_process next.
  if (TEMP_FAILURE_RETRY(write(output_pipe.get(), "\1", 1)) != 1) {
    PLOG(FATAL) << "failed to write to pseudothread";
  }

  pid_t vm_pid = wait_for_vm_process(pseudothread_tid); // wait for the vm process, used to access the address space
  if (ptrace(PTRACE_DETACH, pseudothread_tid, 0, 0) != 0) {
    PLOG(FATAL) << "failed to detach from pseudothread";
  }

  // The pseudothread can die now.
  fork_exit_write.reset(); // closing the write end unblocks the parent's read above, after which it exits

  // Defer the message until later, for readability.
  bool wait_for_debugger = android::base::GetBoolProperty(
      "debug.debuggerd.wait_for_debugger",
      android::base::GetBoolProperty("debug.debuggerd.wait_for_gdb", false));  // whether to wait for a debugger; note the two properties consulted
  if (siginfo.si_signo == BIONIC_SIGNAL_DEBUGGER) {
    wait_for_debugger = false;
  }

  // Detach from all of our attached threads before resuming.
  for (const auto& [tid, thread] : thread_info) {
    int resume_signal = thread.signo == BIONIC_SIGNAL_DEBUGGER ? 0 : thread.signo;
    if (wait_for_debugger) {
      resume_signal = 0;
      if (tgkill(target_process, tid, SIGSTOP) != 0) {
        PLOG(WARNING) << "failed to send SIGSTOP to " << tid;
      }
    }

    LOG(DEBUG) << "detaching from thread " << tid;
    if (ptrace(PTRACE_DETACH, tid, 0, resume_signal) != 0) {
      PLOG(ERROR) << "failed to detach from thread " << tid;
    }
  }

  // Drop our capabilities now that we've fetched all of the information we need.
  drop_capabilities();

  {// connect to tombstoned to obtain the output fd(s)
    ATRACE_NAME("tombstoned_connect");
    LOG(INFO) << "obtaining output fd from tombstoned, type: " << dump_type;
    g_tombstoned_connected = tombstoned_connect(g_target_thread, &g_tombstoned_socket, &g_output_fd,
                                                &g_proto_fd, dump_type);
  }

  if (g_tombstoned_connected) {// make STDOUT_FILENO refer to g_output_fd
    if (TEMP_FAILURE_RETRY(dup2(g_output_fd.get(), STDOUT_FILENO)) == -1) {
      PLOG(ERROR) << "failed to dup2 output fd (" << g_output_fd.get() << ") to STDOUT_FILENO";
    }
  } else {// connection failed: send output to /dev/null
    unique_fd devnull(TEMP_FAILURE_RETRY(open("/dev/null", O_RDWR)));
    TEMP_FAILURE_RETRY(dup2(devnull.get(), STDOUT_FILENO));
    g_output_fd = std::move(devnull);
  }

  LOG(INFO) << "performing dump of process " << target_process
            << " (target tid = " << g_target_thread << ")";

  int signo = siginfo.si_signo;
  bool fatal_signal = signo != BIONIC_SIGNAL_DEBUGGER;// any signal other than the debugger signal is fatal
  bool backtrace = false;

  // si_value is special when used with BIONIC_SIGNAL_DEBUGGER.
  //   0: dump tombstone
  //   1: dump backtrace
  if (!fatal_signal) { // debugger signal: si_value selects the kind of dump
    int si_val = siginfo.si_value.sival_int;
    if (si_val == 0) {
      backtrace = false;
    } else if (si_val == 1) {
      backtrace = true;
    } else {
      LOG(WARNING) << "unknown si_value value " << si_val;
    }
  }

  // TODO: Use seccomp to lock ourselves down.
  unwindstack::UnwinderFromPid unwinder(256, vm_pid, unwindstack::Regs::CurrentArch());
  if (!unwinder.Init()) {
    LOG(FATAL) << "Failed to init unwinder object.";
  }

  std::string amfd_data;
  if (backtrace) { // write a backtrace
    ATRACE_NAME("dump_backtrace");
    dump_backtrace(std::move(g_output_fd), &unwinder, thread_info, g_target_thread);
  } else {
    {
      ATRACE_NAME("fdsan table dump");
      populate_fdsan_table(&open_files, unwinder.GetProcessMemory(),
                           process_info.fdsan_table_address);
    }

    { // engrave the tombstone into the output fd(s)
      ATRACE_NAME("engrave_tombstone");
      engrave_tombstone(std::move(g_output_fd), std::move(g_proto_fd), &unwinder, thread_info,
                        g_target_thread, process_info, &open_files, &amfd_data);
    }
  }

  if (fatal_signal) { // report the native crash to ActivityManager
    // Don't try to notify ActivityManager if it just crashed, or we might hang until timeout.
    if (thread_info[target_process].thread_name != "system_server") {
      activity_manager_notify(target_process, signo, amfd_data);
    }
  }

  if (wait_for_debugger) {  // print instructions for attaching a debugger
    // Use ALOGI to line up with output from engrave_tombstone.
    ALOGI(
        "***********************************************************\n"
        "* Process %d has been suspended while crashing.\n"
        "* To attach the debugger, run this on the host:\n"
        "*\n"
        "*     gdbclient.py -p %d\n"
        "*\n"
        "***********************************************************",
        target_process, target_process);
  }

  // Close stdout before we notify tombstoned of completion.
  close(STDOUT_FILENO);
  // Finally, tell tombstoned that the output is complete.
  if (g_tombstoned_connected && !tombstoned_notify_completion(g_tombstoned_socket.get())) {
    LOG(ERROR) << "failed to notify tombstoned of completion";
  }

  return 0;
}

大致总结下上面流程：

当进程发生crash时，在linker的linker_debuggerd_init过程设置的信号处理器函数，即debuggerd_signal_handler将被调用
在信号处理过程，首先clone创建了子线程pseudothread去执行抓dump，它会进一步fork子进程执行crash_dump，也就是dump操作实际上是在 crash_dump 中完成的。
crash_dump fork子进程进行dump，后者通过 ptrace 来获取crash进程相关信息，之后连接tombstoned，获取trace输出到tombstone的文件fd，通过 engrave_tombstone 输出trace内容到 tombstone 文件，后面还会通知ams发生native crash 事件
pseudothread 等待回收 crash_dump 进程并等待抓dump完成
debuggerd_signal_handler等待pseudothread结束，之后重新发送信号kill自身

下面继续看 crash_dump 的 main函数执行流程细节。

DefuseSignalHandlers

重置signal处理器和设置sigset, 防止去dump自身crash，以防反复去dump

// Puts signal handling back to defaults for this process: registers a SIG_DFL action through
// debuggerd_register_handlers and installs an empty signal mask. Aborts via PLOG(FATAL) if the
// mask cannot be set.
static void DefuseSignalHandlers() {
  // Don't try to dump ourselves.
  struct sigaction default_action = {};
  default_action.sa_handler = SIG_DFL;
  debuggerd_register_handlers(&default_action);

  // Replace the current signal mask with an empty set.
  sigset_t empty_mask;
  sigemptyset(&empty_mask);
  if (sigprocmask(SIG_SETMASK, &empty_mask, nullptr) == 0) {
    return;
  }
  PLOG(FATAL) << "failed to set signal mask";
}

InstallSigPipeHandler

// Sets the SIGPIPE disposition of this process to SIG_IGN, with SA_RESTART in the flags.
// The result of sigaction() is not checked.
static void InstallSigPipeHandler() {
  struct sigaction ignore_sigpipe = {};
  ignore_sigpipe.sa_flags = SA_RESTART;
  ignore_sigpipe.sa_handler = SIG_IGN;  // ignore SIGPIPE
  sigaction(SIGPIPE, &ignore_sigpipe, nullptr);
}

连接tombstoned

// system/core/debuggerd/crash_dump.cpp
int main(int argc, char** argv) {
...
	{ // connect to tombstoned over a socket and obtain g_output_fd, the fd the tombstone is written to
	  ATRACE_NAME("tombstoned_connect");
	  LOG(INFO) << "obtaining output fd from tombstoned, type: " << dump_type;
	  g_tombstoned_connected = tombstoned_connect(g_target_thread, &g_tombstoned_socket, &g_output_fd,
	                                              &g_proto_fd, dump_type);
	}
...

tombstoned_connect

通过socket连接tombstoned ，写入dump请求，并接收返回的fd，用来输出trace

/// @system/core/debuggerd/tombstoned/tombstoned_client.cpp
bool tombstoned_connect(pid_t pid, unique_fd* tombstoned_socket, unique_fd* text_output_fd,
                        unique_fd* proto_output_fd, DebuggerdDumpType dump_type) {
  unique_fd sockfd( // crash连接的socket是tombstoned_crash，输出Java trace连接的是tombstoned_java_trace
      socket_local_client((dump_type != kDebuggerdJavaBacktrace ? kTombstonedCrashSocketName
                                                                : kTombstonedJavaTraceSocketName),
                          ANDROID_SOCKET_NAMESPACE_RESERVED, SOCK_SEQPACKET));
  if (sockfd == -1) {
    async_safe_format_log(ANDROID_LOG_ERROR, "libc", "failed to connect to tombstoned: %s",
                          strerror(errno));
    return false;
  }
  // 发起dump请求
  TombstonedCrashPacket packet = {};
  packet.packet_type = CrashPacketType::kDumpRequest;
  packet.packet.dump_request.pid = pid;
  packet.packet.dump_request.dump_type = dump_type;
  if (TEMP_FAILURE_RETRY(write(sockfd, &packet, sizeof(packet))) != sizeof(packet)) {
    async_safe_format_log(ANDROID_LOG_ERROR, "libc", "failed to write DumpRequest packet: %s",
                          strerror(errno));
    return false;
  }

  unique_fd tmp_output_fd, tmp_proto_fd;
  ssize_t rc = -1;
  // 读取tombstoned返回的相关文件fd，用来输出相关log/trace
  if (dump_type == kDebuggerdTombstoneProto) {
    rc = ReceiveFileDescriptors(sockfd, &packet, sizeof(packet), &tmp_output_fd, &tmp_proto_fd);
  } else {
    rc = ReceiveFileDescriptors(sockfd, &packet, sizeof(packet), &tmp_output_fd);
  }

  if (rc == -1) {
    async_safe_format_log(ANDROID_LOG_ERROR, "libc",
                          "failed to read response to DumpRequest packet: %s", strerror(errno));
    return false;
  } else if (rc != sizeof(packet)) {
    async_safe_format_log(
        ANDROID_LOG_ERROR, "libc",
        "received DumpRequest response packet of incorrect length (expected %zu, got %zd)",
        sizeof(packet), rc);
    return false;
  }

  // Make the fd O_APPEND so that our output is guaranteed to be at the end of a file.
  // (This also makes selinux rules consistent, because selinux distinguishes between writing to
  // a regular fd, and writing to an fd with O_APPEND).
  int flags = fcntl(tmp_output_fd.get(), F_GETFL);
  if (fcntl(tmp_output_fd.get(), F_SETFL, flags | O_APPEND) != 0) {
    async_safe_format_log(ANDROID_LOG_WARN, "libc", "failed to set output fd flags: %s",
                          strerror(errno));
  }

  *tombstoned_socket = std::move(sockfd);
  *text_output_fd = std::move(tmp_output_fd);
  if (proto_output_fd) {
    *proto_output_fd = std::move(tmp_proto_fd);
  }
  return true;
}

下面简单插入下 tombstoned 的启动流程，在tombstoned.rc中有如下配置，可以发现它被当做了一个可以启动的“服务”，同时在init启动它的时候会给它创建三个socket。

/// @system/core/debuggerd/tombstoned/tombstoned.rc
# Declares the tombstoned service: runs /system/bin/tombstoned as user tombstoned, group system.
service tombstoned /system/bin/tombstoned
    user tombstoned
    group system

    # Three seqpacket sockets for the service, each with mode 0666 and owner/group system.
    socket tombstoned_crash seqpacket 0666 system system
    socket tombstoned_intercept seqpacket 0666 system system
    socket tombstoned_java_trace seqpacket 0666 system system
    # Write the service pid into the system-background cpuset tasks file.
    writepid /dev/cpuset/system-background/tasks

那它是在哪启动的呢？在Android12中是直接写在init.rc中，在post-fs-data流程进行启动。而在Android11是配置在tombstoned.rc，时机也是post-fs-data，通过它的注释可以知道修改是为了早点启动来抓tombstone。

/// @system/core/rootdir/init.rc
on post-fs-data
	...
    # Start tombstoned early to be able to store tombstones.
    mkdir /data/anr 0775 system system encryption=Require
    mkdir /data/tombstones 0771 system system encryption=Require
    mkdir /data/vendor/tombstones 0771 root root
    mkdir /data/vendor/tombstones/wifi 0771 wifi wifi
    start tombstoned    # start the tombstoned service

接下来看 tombstoned 的处理。

tombstoned#main

在main方法里面获取socket，并设置事件监听。

/// @system/core/debuggerd/tombstoned/tombstoned.cpp
int main(int, char* []) {
  ...
    int crash_socket = android_get_control_socket(kTombstonedCrashSocketName);  // tombstoned_crash
	...
  // Listen on the crash socket; crash_accept_cb is the callback for accepted connections.
  evconnlistener* tombstone_listener =
    evconnlistener_new(base, crash_accept_cb, CrashQueue::for_tombstones(), LEV_OPT_CLOSE_ON_FREE,
                       -1 /* backlog */, crash_socket);
  ...
}

执行 accept 后回调 crash_accept_cb

crash_accept_cb

// Listener callback for a newly accepted crash connection.
//
// Allocates a Crash record that takes over `sockfd`, and arms a read event with a timeout
// (1 second scaled by HwTimeoutMultiplier) whose callback is crash_request_cb. The Crash
// pointer is handed to that callback as its argument.
static void crash_accept_cb(evconnlistener* listener, evutil_socket_t sockfd, sockaddr*, int,
                            void*) {
  event_base* loop = evconnlistener_get_base(listener);

  // TODO: Make sure that only java crashes come in on the java socket
  // and only native crashes on the native socket.
  Crash* pending = new Crash();
  event* request_event = event_new(loop, sockfd, EV_TIMEOUT | EV_READ, crash_request_cb, pending);
  pending->crash_socket_fd.reset(sockfd);
  pending->crash_event = request_event;

  // Arm the event; it fires on readability or when the timeout expires.
  struct timeval request_timeout = {1 * android::base::HwTimeoutMultiplier(), 0};
  event_add(request_event, &request_timeout);
}

收到client写入的dump请求后，会触发read事件，执行 crash_request_cb

crash_request_cb

// Event callback for a crash connection: validates the request packet read from `sockfd`
// and then either queues the request or performs it right away.
//
// sockfd: socket the event fired on; the request packet is read from it.
// ev:     event flags; EV_TIMEOUT or a missing EV_READ causes the request to be dropped.
// arg:    the Crash* registered with the event; this function takes ownership of it.
static void crash_request_cb(evutil_socket_t sockfd, short ev, void* arg) {
  // Owning pointer: every early return below destroys the Crash.
  std::unique_ptr<Crash> crash(static_cast<Crash*>(arg));
  TombstonedCrashPacket request = {};

  if ((ev & EV_TIMEOUT) != 0) {
    LOG(WARNING) << "crash request timed out";
    return;
  } else if ((ev & EV_READ) == 0) {// not a read event
    LOG(WARNING) << "tombstoned received unexpected event from crash socket";
    return;
  }
  // Read the request written by the client.
  ssize_t rc = TEMP_FAILURE_RETRY(read(sockfd, &request, sizeof(request)));
  if (rc == -1) {
    PLOG(WARNING) << "failed to read from crash socket";
    return;
  } else if (rc != sizeof(request)) {
    LOG(WARNING) << "crash socket received short read of length " << rc << " (expected "
                 << sizeof(request) << ")";
    return;
  }

  if (request.packet_type != CrashPacketType::kDumpRequest) { // only dump requests are accepted here
    LOG(WARNING) << "unexpected crash packet type, expected kDumpRequest, received  "
                 << StringPrintf("%#2hhX", request.packet_type);
    return;
  }

  crash->crash_type = request.packet.dump_request.dump_type;
  if (crash->crash_type < 0 || crash->crash_type > kDebuggerdTombstoneProto) { // kDebuggerdTombstoneProto is the upper bound of accepted types
    LOG(WARNING) << "unexpected crash dump type: " << crash->crash_type;
    return;
  }

  if (crash->crash_type != kDebuggerdJavaBacktrace) { // not a java trace request: use the pid from the packet
    crash->crash_pid = request.packet.dump_request.pid;
  } else {
    // Requests for java traces are sent from untrusted processes, so we
    // must not trust the PID sent down with the request. Instead, we ask the
    // kernel.
    ucred cr = {};
    socklen_t len = sizeof(cr);
    int ret = getsockopt(sockfd, SOL_SOCKET, SO_PEERCRED, &cr, &len);
    if (ret != 0) {
      PLOG(ERROR) << "Failed to getsockopt(..SO_PEERCRED)";
      return;
    }

    crash->crash_pid = cr.pid;
  }

  // Copy the pid first: `crash` is moved from in both branches below.
  pid_t crash_pid = crash->crash_pid;
  LOG(INFO) << "received crash request for pid " << crash_pid;
  // A true return from maybe_enqueue_crash means the request was queued instead of started
  // (presumably because the queue is at its concurrency limit -- confirm in CrashQueue).
  if (CrashQueue::for_crash(crash)->maybe_enqueue_crash(std::move(crash))) {
    LOG(INFO) << "enqueueing crash request for pid " << crash_pid;
  } else {
    perform_request(std::move(crash)); // not queued: perform the dump request right away
  }
}

perform_request

// Answers a dump request: picks the output fd(s), sends them to the client in a kPerformDump
// packet, and re-arms the crash event to wait for completion.
//
// crash: the request to serve. It is destroyed on every early-return path; on success its
//        ownership is released to the event loop (see the end of the function).
static void perform_request(std::unique_ptr<Crash> crash) {
  unique_fd output_fd;
  bool intercepted =
      intercept_manager->GetIntercept(crash->crash_pid, crash->crash_type, &output_fd);
  if (intercepted) {
    if (crash->crash_type == kDebuggerdTombstoneProto) {
      crash->output.proto = CrashArtifact::devnull();
    }
  } else {
    // Not intercepted: get the output file(s) from the crash queue.
    if (auto o = CrashQueue::for_crash(crash.get())->get_output(crash->crash_type); o) {
      crash->output = std::move(*o);
      output_fd.reset(dup(crash->output.text.fd));
    } else {
      LOG(ERROR) << "failed to get crash output for type " << crash->crash_type;
      return;
    }
  }
  // Send the response to the client.
  TombstonedCrashPacket response = {.packet_type = CrashPacketType::kPerformDump};

  ssize_t rc = -1;
  if (crash->output.proto) {
    rc = SendFileDescriptors(crash->crash_socket_fd, &response, sizeof(response), output_fd.get(),
                             crash->output.proto->fd.get());
  } else { // no proto output: send only the text output fd
    rc = SendFileDescriptors(crash->crash_socket_fd, &response, sizeof(response), output_fd.get());
  }

  // The local copy of the fd is no longer needed once it has been sent.
  output_fd.reset();

  if (rc == -1) {
    PLOG(WARNING) << "failed to send response to CrashRequest";
    return;
  } else if (rc != sizeof(response)) {
    PLOG(WARNING) << "crash socket write returned short";
    return;
  }

  // TODO: Make this configurable by the interceptor?
  struct timeval timeout = {10 * android::base::HwTimeoutMultiplier(), 0};

  event_base* base = event_get_base(crash->crash_event);
  // Wait for the next read event or the timeout on the crash socket; crash_completed_cb
  // is the callback.
  event_assign(crash->crash_event, base, crash->crash_socket_fd, EV_TIMEOUT | EV_READ,
               crash_completed_cb, crash.get());
  event_add(crash->crash_event, &timeout);
  CrashQueue::for_crash(crash)->on_crash_started();

  // The crash is now owned by the event loop.
  crash.release();
}

CrashQueue::get_output

获取输出trace的文件fd

/// @system/core/debuggerd/tombstoned/tombstoned.cpp
// Creates the output file(s) for a dump of the given type.
//
// Returns an empty optional when no output is produced: for kDebuggerdNativeBacktrace, for
// kDebuggerdTombstoneProto on a queue without proto support, and for unknown dump types (the
// last two also log an error). Otherwise returns a CrashOutput whose `text` file -- and, for
// kDebuggerdTombstoneProto, `proto` file -- was created with create_temporary_file().
std::optional<CrashOutput> get_output(DebuggerdDumpType dump_type) {
  // Don't generate tombstones for native backtrace requests.
  if (dump_type == kDebuggerdNativeBacktrace) {
    return std::nullopt;
  }

  CrashOutput output;

  if (dump_type == kDebuggerdTombstoneProto) {
    if (!supports_proto_) {
      LOG(ERROR) << "received kDebuggerdTombstoneProto on a queue that doesn't support proto";
      return std::nullopt;
    }
    // Two temporary files: the proto one is created first, then the text one.
    output.proto = create_temporary_file();
    output.text = create_temporary_file();
    return output;
  }

  if (dump_type == kDebuggerdJavaBacktrace || dump_type == kDebuggerdTombstone) {
    output.text = create_temporary_file();
    return output;
  }

  LOG(ERROR) << "unexpected dump type: " << dump_type;
  return std::nullopt;
}

看一下 create_temporary_file 函数的实现：

  // Creates the file that a dump is written into, relative to dir_fd_.
  //
  // First tries to open an unnamed file with O_TMPFILE. If that fails, falls back to creating
  // (or truncating) a file named ".temporary<N>", where N comes from a counter that is
  // incremented on every fallback, and records that name in `temporary_path`.
  //
  // Returns the artifact holding the open fd. Aborts via PLOG(FATAL) if the fallback open
  // fails as well.
  CrashArtifact create_temporary_file() const {
    CrashArtifact result;

    result.fd.reset(openat(dir_fd_, ".", O_WRONLY | O_APPEND | O_TMPFILE | O_CLOEXEC, 0660));
    if (result.fd == -1) {
      // We might not have O_TMPFILE. Try creating with an arbitrary filename instead.
      static size_t counter = 0;
      std::string tmp_filename = StringPrintf(".temporary%zu", counter++);
      result.fd.reset(openat(dir_fd_, tmp_filename.c_str(),
                             O_WRONLY | O_APPEND | O_CREAT | O_TRUNC | O_CLOEXEC, 0660));
      if (result.fd == -1) {
        PLOG(FATAL) << "failed to create temporary tombstone in " << dir_path_;
      }

      result.temporary_path = std::move(tmp_filename);
    }

    // Return the local by name: this allows copy elision and otherwise moves implicitly.
    // Wrapping it in std::move() would prevent the elision.
    return result;
  }

engrave_tombstone

输出crash信息到tombstone文件：

基本信息，如时间
crash线程的信息
更多log，如 system、main log
其他线程的信息
打开的文件信息

/// @system/core/debuggerd/libdebuggerd/tombstone.cpp
// Writes the tombstone for a crash.
//
// output_fd:     fd the text tombstone is written to.
// proto_fd:      fd the serialized proto tombstone is written to; skipped when it is -1.
// unwinder:      unwinder used to produce the thread dumps.
// threads:       information for every thread to dump, keyed by tid.
// target_thread: tid of the thread the dump is about; it must be present in `threads` on the
//                non-translated path, otherwise async_safe_fatal is called.
// process_info:  process-wide crash information forwarded to the dump helpers.
// open_files:    if non-null, the open-file list to print (non-translated path only).
// amfd_data:     stored in the log_t, where _VLOG appends the lines it also sends to logcat.
//
// The proto tombstone is always built first. The text output is then produced either by
// translating the proto (property debug.debuggerd.translate_proto_to_text, default true) or
// by the direct dump code in the else branch.
void engrave_tombstone(unique_fd output_fd, unique_fd proto_fd, unwindstack::Unwinder* unwinder,
                       const std::map<pid_t, ThreadInfo>& threads, pid_t target_thread,
                       const ProcessInfo& process_info, OpenFilesList* open_files,
                       std::string* amfd_data) {
  // Don't copy log messages to tombstone unless this is a development device.
  Tombstone tombstone;
  engrave_tombstone_proto(&tombstone, unwinder, threads, target_thread, process_info, open_files);

  if (proto_fd != -1) {
    if (!tombstone.SerializeToFileDescriptor(proto_fd.get())) {
      async_safe_format_log(ANDROID_LOG_ERROR, LOG_TAG, "failed to write proto tombstone: %s",
                            strerror(errno));
    }
  }

  log_t log;
  log.current_tid = target_thread;
  log.crashed_tid = target_thread;
  log.tfd = output_fd.get();
  log.amfd_data = amfd_data;

  bool translate_proto = GetBoolProperty("debug.debuggerd.translate_proto_to_text", true);
  if (translate_proto) {
    tombstone_proto_to_text(tombstone, [&log](const std::string& line, bool should_log) {
      _LOG(&log, should_log ? logtype::HEADER : logtype::LOGS, "%s\n", line.c_str());
    });
  } else {
    bool want_logs = GetBoolProperty("ro.debuggable", false);// debuggable builds also get log excerpts

    _LOG(&log, logtype::HEADER,
         "*** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***\n");
    dump_header_info(&log);
    _LOG(&log, logtype::HEADER, "Timestamp: %s\n", get_timestamp().c_str());

    auto it = threads.find(target_thread);
    if (it == threads.end()) {
      async_safe_fatal("failed to find target thread");
    }
    // Dump the target (crashing) thread first, as the primary thread.
    dump_thread(&log, unwinder, it->second, process_info, true);

    if (want_logs) { // extra logs for the process, limited by the tail argument (50)
      dump_logs(&log, it->second.pid, 50);
    }
    // Dump every other thread.
    for (auto& [tid, thread_info] : threads) {
      if (tid == target_thread) {
        continue;
      }

      dump_thread(&log, unwinder, thread_info, process_info, false);
    }

    if (open_files) { // print the list of open files
      _LOG(&log, logtype::OPEN_FILES, "\nopen files:\n");
      dump_open_files_list(&log, *open_files, "    ");
    }

    if (want_logs) {
      dump_logs(&log, it->second.pid, 0);
    }
  }
}

_LOG

输出log到指定fd和logcat

/// @system/core/debuggerd/libdebuggerd/utility.cpp
// printf-style front end for _VLOG: packs the variadic arguments into a va_list and forwards
// them unchanged together with `log`, `ltype` and `fmt`.
__attribute__((__weak__, visibility("default")))
void _LOG(log_t* log, enum logtype ltype, const char* fmt, ...) {
  va_list args;
  va_start(args, fmt);
  _VLOG(log, ltype, fmt, args);
  va_end(args);
}

// Formats a message and writes it to up to three sinks: the tombstone fd, logcat, and the
// kernel log.
//
// log:   destination state; tfd is the tombstone fd (-1 disables it), crashed_tid/current_tid
//        gate the logcat output, amfd_data (if non-null) accumulates the logcat lines.
// ltype: log type, checked with is_allowed_in_logcat().
// fmt/ap: printf-style format and arguments.
//
// Empty messages are dropped. The kernel log is only written when the message is also
// written to logcat.
__attribute__((__weak__, visibility("default")))
void _VLOG(log_t* log, enum logtype ltype, const char* fmt, va_list ap) {
  bool write_to_tombstone = (log->tfd != -1);  // write to the tombstone fd?
  bool write_to_logcat = is_allowed_in_logcat(ltype)  // write to logcat? only for the crashed thread
                      && log->crashed_tid != -1
                      && log->current_tid != -1
                      && (log->crashed_tid == log->current_tid);
  static bool write_to_kmsg = should_write_to_kmsg();  // write to the kernel log? evaluated once

  std::string msg;
  android::base::StringAppendV(&msg, fmt, ap);

  if (msg.empty()) return;

  if (write_to_tombstone) { // write to the tombstone
    TEMP_FAILURE_RETRY(write(log->tfd, msg.c_str(), msg.size()));
  }

  if (write_to_logcat) { // write to the LOG_ID_CRASH log buffer at ANDROID_LOG_FATAL priority
    __android_log_buf_write(LOG_ID_CRASH, ANDROID_LOG_FATAL, LOG_TAG, msg.c_str());
    if (log->amfd_data != nullptr) {// also collect the line in the caller-provided amfd_data buffer
      *log->amfd_data += msg;
    }

    if (write_to_kmsg) {  // write to the kernel log
      unique_fd kmsg_fd(open("/dev/kmsg_debug", O_WRONLY | O_APPEND | O_CLOEXEC));
      if (kmsg_fd.get() >= 0) {
        // Our output might contain newlines which would otherwise be handled by the android logger.
        // Split the lines up ourselves before sending to the kernel logger.
        if (msg.back() == '\n') {
          msg.back() = '\0';
        }

        std::vector<std::string> fragments = android::base::Split(msg, "\n");
        for (const std::string& fragment : fragments) {
          // Each fragment is written as prefix + fragment + newline in a single writev.
          static constexpr char prefix[] = "<3>DEBUG: ";
          struct iovec iov[3];
          iov[0].iov_base = const_cast<char*>(prefix);
          iov[0].iov_len = strlen(prefix);
          iov[1].iov_base = const_cast<char*>(fragment.c_str());
          iov[1].iov_len = fragment.length();
          iov[2].iov_base = const_cast<char*>("\n");
          iov[2].iov_len = 1;
          TEMP_FAILURE_RETRY(writev(kmsg_fd.get(), iov, 3));
        }
      }
    }
  }
}

dump_thread

dump线程信息：

打印pid、name 等信息
打印信号相关信息
尝试打印 abort 信息
打印寄存器信息
打印 backtrace
打印内存信息

/// @system/core/debuggerd/libdebuggerd/tombstone.cpp
// Dumps one thread into `log`.
//
// log:            output state; current_tid is set to this thread for the duration of the call
//                 and reset to crashed_tid before returning.
// unwinder:       unwinder used for the backtrace; its registers are set from a clone of the
//                 thread's registers.
// thread_info:    the thread to dump.
// process_info:   process-wide crash information (abort message address, fault address, ...).
// primary_thread: true for the thread the dump is about. Only then are the GWP-ASan/Scudo
//                 crash data, the abort message, memory near registers and the memory maps
//                 dumped.
//
// Always returns true.
static bool dump_thread(log_t* log, unwindstack::Unwinder* unwinder, const ThreadInfo& thread_info,
                        const ProcessInfo& process_info, bool primary_thread) {
  log->current_tid = thread_info.tid;
  if (!primary_thread) {
    _LOG(log, logtype::THREAD, "--- --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---\n");
  }
  dump_thread_info(log, thread_info); // print pid, tid, names and related info

  if (thread_info.siginfo) { // print the signal information
    dump_signal_info(log, thread_info, process_info, unwinder->GetProcessMemory().get());
  }

  std::unique_ptr<GwpAsanCrashData> gwp_asan_crash_data;
  std::unique_ptr<ScudoCrashData> scudo_crash_data;
  if (primary_thread) { // build the allocator crash data for the primary thread
    gwp_asan_crash_data = std::make_unique<GwpAsanCrashData>(unwinder->GetProcessMemory().get(),
                                                             process_info, thread_info);
    scudo_crash_data =
        std::make_unique<ScudoCrashData>(unwinder->GetProcessMemory().get(), process_info);
  }

  if (primary_thread && gwp_asan_crash_data->CrashIsMine()) {
    gwp_asan_crash_data->DumpCause(log);
  } else if (thread_info.siginfo && !(primary_thread && scudo_crash_data->CrashIsMine())) {
    dump_probable_cause(log, thread_info.siginfo, unwinder->GetMaps(), thread_info.registers.get());
  }

  if (primary_thread) { // the primary thread also gets the abort message, if any
    dump_abort_message(log, unwinder->GetProcessMemory().get(), process_info.abort_msg_address);
  }

  dump_registers(log, thread_info.registers.get()); // print the registers

  // Unwind will mutate the registers, so make a copy first.
  std::unique_ptr<unwindstack::Regs> regs_copy(thread_info.registers->Clone());
  unwinder->SetRegs(regs_copy.get());
  unwinder->Unwind();
  // Print the backtrace produced by the unwinder, or the unwind error if there are no frames.
  if (unwinder->NumFrames() == 0) {
    _LOG(log, logtype::THREAD, "Failed to unwind\n");
    if (unwinder->LastErrorCode() != unwindstack::ERROR_NONE) {
      _LOG(log, logtype::THREAD, "  Error code: %s\n", unwinder->LastErrorCodeString());
      _LOG(log, logtype::THREAD, "  Error address: 0x%" PRIx64 "\n", unwinder->LastErrorAddress());
    }
  } else {
    _LOG(log, logtype::BACKTRACE, "\nbacktrace:\n");
    log_backtrace(log, unwinder, "    ");  // print the backtrace
  }

  if (primary_thread) {
    if (gwp_asan_crash_data->HasDeallocationTrace()) { // print the deallocation trace
      gwp_asan_crash_data->DumpDeallocationTrace(log, unwinder);
    }

    if (gwp_asan_crash_data->HasAllocationTrace()) { // print the allocation trace
      gwp_asan_crash_data->DumpAllocationTrace(log, unwinder);
    }

    scudo_crash_data->DumpCause(log, unwinder); // dump causes

    unwindstack::Maps* maps = unwinder->GetMaps();
    dump_memory_and_code(log, maps, unwinder->GetProcessMemory().get(),
                         thread_info.registers.get()); // print memory contents
    if (maps != nullptr) {  
      // addr stays 0 when the process has no fault address.
      uint64_t addr = 0;
      if (process_info.has_fault_address) {
        addr = process_info.untagged_fault_address;
      }
      dump_all_maps(log, unwinder, addr); // print the memory map
    }
  }

  log->current_tid = log->crashed_tid;
  return true;
}

dump_thread_info

线程相关信息,在tombstone可以看到

/// @system/core/debuggerd/libdebuggerd/tombstone.cpp
// Prints the header lines identifying a thread: pid, tid, thread name, process name, uid and,
// when available, the tagged address control value.
//
// Side effect: clears log->should_retrieve_logcat when the thread's uid is AID_LOGD.
static void dump_thread_info(log_t* log, const ThreadInfo& thread_info) {
  // Don't try to collect logs from the threads that implement the logging system itself.
  if (thread_info.uid == AID_LOGD) {
    log->should_retrieve_logcat = false;
  }

  // The process name is the first command-line element, when there is one.
  const char* process_name =
      thread_info.command_line.empty() ? "<unknown>" : thread_info.command_line[0].c_str();

  // Identify which thread this section is about.
  _LOG(log, logtype::HEADER, "pid: %d, tid: %d, name: %s  >>> %s <<<\n", thread_info.pid,
       thread_info.tid, thread_info.thread_name.c_str(), process_name);
  _LOG(log, logtype::HEADER, "uid: %d\n", thread_info.uid);

  // -1 means no tagged address control value is available for this thread.
  if (thread_info.tagged_addr_ctrl == -1) {
    return;
  }
  _LOG(log, logtype::HEADER, "tagged_addr_ctrl: %016lx\n", thread_info.tagged_addr_ctrl);
}

dump_signal_info

打印信号信息

/// @system/core/debuggerd/libdebuggerd/tombstone.cpp
// Logs the "signal N (NAME), code N (CODE), fault addr ..." header line for a thread.
//   log            - destination passed to _LOG.
//   thread_info    - supplies the siginfo and pid of the thread being reported.
//   process_info   - supplies has_fault_address and maybe_tagged_fault_address.
//   process_memory - used to read the word at the fault address when the signal is SIGILL.
static void dump_signal_info(log_t* log, const ThreadInfo& thread_info,
                             const ProcessInfo& process_info, unwindstack::Memory* process_memory) {
  char addr_desc[64];  // ", fault addr 0x1234"
  if (process_info.has_fault_address) {  // A fault address is available: format it.
    // SIGILL faults will never have tagged addresses, so okay to
    // indiscriminately use the tagged address here.
    size_t addr = process_info.maybe_tagged_fault_address;
    if (thread_info.siginfo->si_signo == SIGILL) {
      // For SIGILL, also print the 32-bit word found at the fault address. The result of
      // Read() is not checked; `instruction` is zero-initialized beforehand.
      uint32_t instruction = {};
      process_memory->Read(addr, &instruction, sizeof(instruction));
      snprintf(addr_desc, sizeof(addr_desc), "0x%zx (*pc=%#08x)", addr, instruction);
    } else {
      snprintf(addr_desc, sizeof(addr_desc), "0x%zx", addr);
    }
  } else { // No fault address: print a dashed placeholder instead.
    snprintf(addr_desc, sizeof(addr_desc), "--------");
  }

  // Stays an empty string unless signal_has_sender() reports a sender for this signal.
  char sender_desc[32] = {};  // " from pid 1234, uid 666"
  if (signal_has_sender(thread_info.siginfo, thread_info.pid)) {
    get_signal_sender(sender_desc, sizeof(sender_desc), thread_info.siginfo);
  }
  // Produces a line such as: signal 6 (SIGABRT), code -1 (SI_QUEUE), fault addr --------
  _LOG(log, logtype::HEADER, "signal %d (%s), code %d (%s%s), fault addr %s\n",
       thread_info.siginfo->si_signo, get_signame(thread_info.siginfo),
       thread_info.siginfo->si_code, get_sigcode(thread_info.siginfo), sender_desc, addr_desc);
}

get_signame 获取信号的描述

/// @system/core/debuggerd/libdebuggerd/utility.cpp
// Returns the display name for si->si_signo, or "?" when the signal is not one of the
// signals listed below.
const char* get_signame(const siginfo_t* si) {
  // Signal number -> display name for every signal this function can label.
  static const struct {
    int signo;
    const char* name;
  } kSignalNames[] = {
      {SIGABRT, "SIGABRT"},
      {SIGBUS, "SIGBUS"},
      {SIGFPE, "SIGFPE"},
      {SIGILL, "SIGILL"},
      {SIGSEGV, "SIGSEGV"},
      {SIGSTKFLT, "SIGSTKFLT"},
      {SIGSTOP, "SIGSTOP"},
      {SIGSYS, "SIGSYS"},
      {SIGTRAP, "SIGTRAP"},
      // Android-defined signal, typically used to request a trace dump.
      {BIONIC_SIGNAL_DEBUGGER, "<debuggerd signal>"},
  };

  for (size_t i = 0; i < sizeof(kSignalNames) / sizeof(kSignalNames[0]); i++) {
    if (kSignalNames[i].signo == si->si_signo) {
      return kSignalNames[i].name;
    }
  }
  // Not a signal we have a name for.
  return "?";
}

get_sigcode 获取信号具体错误描述

/// @system/core/debuggerd/libdebuggerd/utility.cpp
// Returns a symbolic name for si->si_code.
// Signal-specific codes (selected by si->si_signo) are tried first, then the generic
// SI_* codes; "?" is returned when nothing matches.
// After each inner switch, a static_assert compares the header's NSIG* constant with the
// last code handled here, so the build breaks with a "missing ..." message if they diverge.
const char* get_sigcode(const siginfo_t* si) {
  // Try the signal-specific codes...
  switch (si->si_signo) {
    case SIGILL:
      switch (si->si_code) {
        case ILL_ILLOPC: return "ILL_ILLOPC";
        case ILL_ILLOPN: return "ILL_ILLOPN";
        case ILL_ILLADR: return "ILL_ILLADR";
        case ILL_ILLTRP: return "ILL_ILLTRP";
        case ILL_PRVOPC: return "ILL_PRVOPC";
        case ILL_PRVREG: return "ILL_PRVREG";
        case ILL_COPROC: return "ILL_COPROC";
        case ILL_BADSTK: return "ILL_BADSTK";
        case ILL_BADIADDR:
          return "ILL_BADIADDR";
        case __ILL_BREAK:
          return "ILL_BREAK";
        case __ILL_BNDMOD:
          return "ILL_BNDMOD";
      }
      static_assert(NSIGILL == __ILL_BNDMOD, "missing ILL_* si_code");
      break;
    case SIGBUS:
      switch (si->si_code) {
        case BUS_ADRALN: return "BUS_ADRALN";
        case BUS_ADRERR: return "BUS_ADRERR";
        case BUS_OBJERR: return "BUS_OBJERR";
        case BUS_MCEERR_AR: return "BUS_MCEERR_AR";
        case BUS_MCEERR_AO: return "BUS_MCEERR_AO";
      }
      static_assert(NSIGBUS == BUS_MCEERR_AO, "missing BUS_* si_code");
      break;
    case SIGFPE:
      switch (si->si_code) {
        case FPE_INTDIV: return "FPE_INTDIV";
        case FPE_INTOVF: return "FPE_INTOVF";
        case FPE_FLTDIV: return "FPE_FLTDIV";
        case FPE_FLTOVF: return "FPE_FLTOVF";
        case FPE_FLTUND: return "FPE_FLTUND";
        case FPE_FLTRES: return "FPE_FLTRES";
        case FPE_FLTINV: return "FPE_FLTINV";
        case FPE_FLTSUB: return "FPE_FLTSUB";
        case __FPE_DECOVF:
          return "FPE_DECOVF";
        case __FPE_DECDIV:
          return "FPE_DECDIV";
        case __FPE_DECERR:
          return "FPE_DECERR";
        case __FPE_INVASC:
          return "FPE_INVASC";
        case __FPE_INVDEC:
          return "FPE_INVDEC";
        case FPE_FLTUNK:
          return "FPE_FLTUNK";
        case FPE_CONDTRAP:
          return "FPE_CONDTRAP";
      }
      static_assert(NSIGFPE == FPE_CONDTRAP, "missing FPE_* si_code");
      break;
    case SIGSEGV:
      switch (si->si_code) {
        case SEGV_MAPERR: return "SEGV_MAPERR";
        case SEGV_ACCERR: return "SEGV_ACCERR";
        case SEGV_BNDERR: return "SEGV_BNDERR";
        case SEGV_PKUERR: return "SEGV_PKUERR";
        case SEGV_ACCADI:
          return "SEGV_ACCADI";
        case SEGV_ADIDERR:
          return "SEGV_ADIDERR";
        case SEGV_ADIPERR:
          return "SEGV_ADIPERR";
        case SEGV_MTEAERR:
          return "SEGV_MTEAERR";
        case SEGV_MTESERR:
          return "SEGV_MTESERR";
      }
      static_assert(NSIGSEGV == SEGV_MTESERR, "missing SEGV_* si_code");
      break;
    case SIGSYS:
      switch (si->si_code) {
        case SYS_SECCOMP: return "SYS_SECCOMP";
        case SYS_USER_DISPATCH:
          return "SYS_USER_DISPATCH";
      }
      static_assert(NSIGSYS == SYS_USER_DISPATCH, "missing SYS_* si_code");
      break;
    case SIGTRAP:
      switch (si->si_code) {
        case TRAP_BRKPT: return "TRAP_BRKPT";
        case TRAP_TRACE: return "TRAP_TRACE";
        case TRAP_BRANCH: return "TRAP_BRANCH";
        case TRAP_HWBKPT: return "TRAP_HWBKPT";
        case TRAP_UNK:
          return "TRAP_UNDIAGNOSED";
      }
      // Not a plain TRAP_* code: if the low byte of si_code is SIGTRAP, the next byte is
      // matched against the PTRACE_EVENT_* values.
      if ((si->si_code & 0xff) == SIGTRAP) {
        switch ((si->si_code >> 8) & 0xff) {
          case PTRACE_EVENT_FORK:
            return "PTRACE_EVENT_FORK";
          case PTRACE_EVENT_VFORK:
            return "PTRACE_EVENT_VFORK";
          case PTRACE_EVENT_CLONE:
            return "PTRACE_EVENT_CLONE";
          case PTRACE_EVENT_EXEC:
            return "PTRACE_EVENT_EXEC";
          case PTRACE_EVENT_VFORK_DONE:
            return "PTRACE_EVENT_VFORK_DONE";
          case PTRACE_EVENT_EXIT:
            return "PTRACE_EVENT_EXIT";
          case PTRACE_EVENT_SECCOMP:
            return "PTRACE_EVENT_SECCOMP";
          case PTRACE_EVENT_STOP:
            return "PTRACE_EVENT_STOP";
        }
      }
      static_assert(NSIGTRAP == TRAP_UNK, "missing TRAP_* si_code");
      break;
  }
  // Then the other codes...
  // These are checked for every signal, including ones that had no signal-specific match.
  switch (si->si_code) {
    case SI_USER: return "SI_USER";
    case SI_KERNEL: return "SI_KERNEL";
    case SI_QUEUE: return "SI_QUEUE";
    case SI_TIMER: return "SI_TIMER";
    case SI_MESGQ: return "SI_MESGQ";
    case SI_ASYNCIO: return "SI_ASYNCIO";
    case SI_SIGIO: return "SI_SIGIO";
    case SI_TKILL: return "SI_TKILL";
    case SI_DETHREAD: return "SI_DETHREAD";
  }
  // Then give up...
  return "?";
}

log_backtrace

/// @system/core/debuggerd/libdebuggerd/utility.cpp
// Writes every frame held by `unwinder` to `log`, one line per frame.
//   log      - destination passed to _LOG.
//   unwinder - source of the frames; build-id display is switched on before formatting.
//   prefix   - string placed at the start of every emitted line.
void log_backtrace(log_t* log, unwindstack::Unwinder* unwinder, const char* prefix) {
  // When the unwinder reports that an ELF was read from memory rather than from its file,
  // warn the reader that some frames may lack function names and build ids.
  if (unwinder->elf_from_memory_not_file()) {
    _LOG(log, logtype::BACKTRACE,
         "%sNOTE: Function names and BuildId information is missing for some frames due\n", prefix);
    _LOG(log, logtype::BACKTRACE,
         "%sNOTE: to unreadable libraries. For unwinds of apps, only shared libraries\n", prefix);
    _LOG(log, logtype::BACKTRACE, "%sNOTE: found under the lib/ directory are readable.\n", prefix);
#if defined(ROOT_POSSIBLE)
    _LOG(log, logtype::BACKTRACE,
         "%sNOTE: On this device, run setenforce 0 to make the libraries readable.\n", prefix);
#endif
  }

  unwinder->SetDisplayBuildID(true);
  for (size_t i = 0; i < unwinder->NumFrames(); i++) { // Emit every frame.
    _LOG(log, logtype::BACKTRACE, "%s%s\n", prefix, unwinder->FormatFrame(i).c_str());
  }
}

Unwinder::FormatFrame

格式化输出一个 frame

/// @system/unwinding/libunwindstack/Unwinder.cpp
std::string Unwinder::FormatFrame(const FrameData& frame) const {
  std::string data;
  if (ArchIs32Bit(arch_)) { // pc 编号地址
    data += android::base::StringPrintf("  #%02zu pc %08" PRIx64, frame.num, frame.rel_pc);
  } else {
    data += android::base::StringPrintf("  #%02zu pc %016" PRIx64, frame.num, frame.rel_pc);
  }

  if (frame.map_start == frame.map_end) { // map_name
    // No valid map associated with this frame.
    data += "  <unknown>";
  } else if (!frame.map_name.empty()) {
    data += "  ";
    data += frame.map_name;
  } else {
    data += android::base::StringPrintf("  <anonymous:%" PRIx64 ">", frame.map_start);
  }

  if (frame.map_elf_start_offset != 0) { // 距 elf 起始的偏移 offset
    data += android::base::StringPrintf(" (offset 0x%" PRIx64 ")", frame.map_elf_start_offset);
  }

  if (!frame.function_name.empty()) { // 函数名
    char* demangled_name = __cxa_demangle(frame.function_name.c_str(), nullptr, nullptr, nullptr);
    if (demangled_name == nullptr) {
      data += " (";
      data += frame.function_name;
    } else {
      data += " (";
      data += demangled_name;
      free(demangled_name);
    }
    if (frame.function_offset != 0) {// 函数偏移 offset
      data += android::base::StringPrintf("+%" PRId64, frame.function_offset);
    }
    data += ')';
  }

  MapInfo* map_info = maps_->Find(frame.map_start);
  if (map_info != nullptr && display_build_id_) {
    std::string build_id = map_info->GetPrintableBuildID();
    if (!build_id.empty()) {
      data += " (BuildId: " + build_id + ')';  // 最后的 BuildId
    }
  }
  return data;
}

dump_memory_and_code

打印内存信息

/// @system/core/debuggerd/libdebuggerd/tombstone.cpp
// For every register reported by `regs`, dumps the memory around the register's value.
// Each dump is labelled "memory near <reg>", followed by the name of the containing map
// when `maps` is provided and the map has a name.
void dump_memory_and_code(log_t* log, unwindstack::Maps* maps, unwindstack::Memory* memory,
                          unwindstack::Regs* regs) {
  auto dump_near_register = [log, maps, memory](const char* reg_name, uint64_t reg_value) {
    std::string label("memory near ");
    label += reg_name;

    if (maps != nullptr) {
      // The map lookup uses the untagged address; the dump below receives the raw value.
      unwindstack::MapInfo* containing_map = maps->Find(untag_address(reg_value));
      const bool has_named_map = containing_map != nullptr && !containing_map->name().empty();
      if (has_named_map) {
        label += " (" + containing_map->name() + ")";
      }
    }

    dump_memory(log, memory, reg_value, label);
  };

  regs->IterateRegisters(dump_near_register);
}

dump_memory 打印内存信息

// Logs a hex + ASCII dump of the memory around `addr`, headed by "<label>:".
//   log    - destination passed to _LOG.
//   memory - reader for the target's memory, forwarded to the buffer-filling overload.
//   addr   - address of interest; passed by pointer to the overload below, so it may be
//            adjusted there (presumably to the aligned start of the dump - TODO confirm).
//   label  - heading printed before the dump.
// Nothing is printed when the buffer-filling overload returns -1.
void dump_memory(log_t* log, unwindstack::Memory* memory, uint64_t addr, const std::string& label) {
  // Dump 256 bytes
  uintptr_t data[MEMORY_BYTES_TO_DUMP / sizeof(uintptr_t)];
  uint8_t tags[MEMORY_BYTES_TO_DUMP / kTagGranuleSize];

  ssize_t bytes = dump_memory(data, sizeof(data), tags, sizeof(tags), &addr, memory);
  if (bytes == -1) {
    return;
  }

  _LOG(log, logtype::MEMORY, "\n%s:\n", label.c_str());

  // Dump the code around memory as:
  //  addr             contents                           ascii
  //  0000000000008d34 ef000000e8bd0090 e1b00000512fff1e  ............../Q
  //  0000000000008d44 ea00b1f9e92d0090 e3a070fcef000000  ......-..p......
  // On 32-bit machines, there are still 16 bytes per line but addresses and
  // words are of course presented differently.
  uintptr_t* data_ptr = data;
  uint8_t* tags_ptr = tags;
  for (size_t line = 0; line < static_cast<size_t>(bytes) / MEMORY_BYTES_PER_LINE; line++) {
    // One entry of `tags` is consumed per output line and placed in bits 56 and up of
    // the printed address.
    uint64_t tagged_addr = addr | static_cast<uint64_t>(*tags_ptr++) << 56;
    std::string logline;
    android::base::StringAppendF(&logline, "    %" PRIPTR, tagged_addr);

    addr += MEMORY_BYTES_PER_LINE;
    std::string ascii;
    for (size_t i = 0; i < MEMORY_BYTES_PER_LINE / sizeof(uintptr_t); i++) {
      android::base::StringAppendF(&logline, " %" PRIPTR, static_cast<uint64_t>(*data_ptr));

      // Fill out the ascii string from the data.
      // Bytes in [0x20, 0x7f) are shown as-is; everything else becomes '.'.
      uint8_t* ptr = reinterpret_cast<uint8_t*>(data_ptr);
      for (size_t val = 0; val < sizeof(uintptr_t); val++, ptr++) {
        if (*ptr >= 0x20 && *ptr < 0x7f) {
          ascii += *ptr;
        } else {
          ascii += '.';
        }
      }
      data_ptr++;
    }
    _LOG(log, logtype::MEMORY, "%s  %s\n", logline.c_str(), ascii.c_str());
  }
}

dump 完成通知

/// @system/core/debuggerd/crash_dump.cpp
// Abridged excerpt of crash_dump's entry point: only the final "dump finished" step is shown.
int main(int argc, char** argv) {
	...
  // Close stdout before we notify tombstoned of completion.
  close(STDOUT_FILENO);
  // Call tombstoned_notify_completion to tell tombstoned that the dump has finished.
  if (g_tombstoned_connected && !tombstoned_notify_completion(g_tombstoned_socket.get())) {
    LOG(ERROR) << "failed to notify tombstoned of completion";
  }
}

tombstoned_notify_completion

bool tombstoned_notify_completion(int tombstoned_socket) {
  TombstonedCrashPacket packet = {};
  packet.packet_type = CrashPacketType::kCompletedDump; 
  // 写 kCompletedDump 到 tombstoned crash socket
  if (TEMP_FAILURE_RETRY(write(tombstoned_socket, &packet, sizeof(packet))) != sizeof(packet)) {
    return false;
  }
  return true;
}

从之前的 perform_request 流程可知，tombstoned 为该 crash socket 注册了一个完成事件的监听；当收到 crash_dump 发来的完成通知时，会回调 crash_completed_cb。

crash_completed_cb

/// @system/core/debuggerd/tombstoned/tombstoned.cpp
// Event callback run for a crash's socket once the dump is finished.
//   sockfd - the crash socket the event fired on.
//   ev     - event flags; the completion packet is only processed when EV_READ is set.
//   arg    - the Crash object for this request. Ownership is taken here (wrapped in a
//            unique_ptr), so it is destroyed when this callback or crash_completed() ends.
static void crash_completed_cb(evutil_socket_t sockfd, short ev, void* arg) {
  std::unique_ptr<Crash> crash(static_cast<Crash*>(arg));
  // The queue is looked up before `crash` is moved away below.
  CrashQueue* queue = CrashQueue::for_crash(crash);

  // Tell the queue that one dump has finished (presumably decrements its count of
  // in-progress dumps - confirm in CrashQueue).
  queue->on_crash_completed();

  if ((ev & EV_READ) == EV_READ) { // A read event: a packet is waiting on the socket.
    crash_completed(sockfd, std::move(crash)); // The main completion handling happens here.
  }

  // If there's something queued up, let them proceed.
  queue->maybe_dequeue_crashes(perform_request);  // Try to run perform_request for the next queued request.
}

crash_completed

tombstoned 处理完成的逻辑主要在这个函数

// Handles the completion packet of a finished dump: validates it, links the text (and
// optional proto) tombstone under its final name, and removes temporary files.
//   sockfd - crash socket to read the completion packet from.
//   crash  - the request being completed; owned by this function for its duration.
// Every validation failure is logged as a warning and ends the function early.
static void crash_completed(borrowed_fd sockfd, std::unique_ptr<Crash> crash) {
  TombstonedCrashPacket request = {};
  CrashQueue* queue = CrashQueue::for_crash(crash);
  // Read the packet sent by the dumper.
  ssize_t rc = TEMP_FAILURE_RETRY(read(sockfd.get(), &request, sizeof(request)));
  if (rc == -1) {
    PLOG(WARNING) << "failed to read from crash socket";
    return;
  } else if (rc != sizeof(request)) {
    LOG(WARNING) << "crash socket received short read of length " << rc << " (expected "
                 << sizeof(request) << ")";
    return;
  }
  // The packet type must be kCompletedDump.
  if (request.packet_type != CrashPacketType::kCompletedDump) {
    LOG(WARNING) << "unexpected crash packet type, expected kCompletedDump, received "
                 << uint32_t(request.packet_type);
    return;
  }

  if (crash->output.text.fd == -1) { // An output fd must exist.
    LOG(WARNING) << "missing output fd";
    return;
  }

  CrashArtifactPaths paths = queue->get_next_artifact_paths(); // Pick the final file name(s).
  // Give the text tombstone its final name.
  if (rename_tombstone_fd(crash->output.text.fd, queue->dir_fd(), paths.text)) {
    if (crash->crash_type == kDebuggerdJavaBacktrace) {
      LOG(ERROR) << "Traces for pid " << crash->crash_pid << " written to: " << paths.text;
    } else {
      // NOTE: Several tools parse this log message to figure out where the
      // tombstone associated with a given native crash was written. Any changes
      // to this message must be carefully considered.
      LOG(ERROR) << "Tombstone written to: " << paths.text;
    }
  }

  // Same for the proto tombstone, when one was produced. The result of this call is not checked.
  if (crash->output.proto && crash->output.proto->fd != -1) {
    if (!paths.proto) {
      LOG(ERROR) << "missing path for proto tombstone";
    } else {
      rename_tombstone_fd(crash->output.proto->fd, queue->dir_fd(), *paths.proto);
    }
  }

  // If we don't have O_TMPFILE, we need to clean up after ourselves.
  if (crash->output.text.temporary_path) { // Remove the temporary text file with unlinkat.
    rc = unlinkat(queue->dir_fd().get(), crash->output.text.temporary_path->c_str(), 0);
    if (rc != 0) {
      // NOTE(review): the message prints paths.text (the final name), not the temporary
      // path that failed to unlink.
      PLOG(ERROR) << "failed to unlink temporary tombstone at " << paths.text;
    }
  }
  if (crash->output.proto && crash->output.proto->temporary_path) {
    rc = unlinkat(queue->dir_fd().get(), crash->output.proto->temporary_path->c_str(), 0);
    if (rc != 0) {
      PLOG(ERROR) << "failed to unlink temporary proto tombstone";
    }
  }
}

CrashQueue::get_next_artifact_paths

返回的名称为 tombstone_xx

// Returns the file name(s) for the next artifact and advances the slot counter.
// The text name is "<file_name_prefix_><NN>"; when protos are supported, the proto name
// is the same with a ".pb" suffix.
CrashArtifactPaths get_next_artifact_paths() {
  // Remember the slot used for this pair of names, then advance the counter. It wraps
  // around at max_artifacts_, so slot numbers are reused cyclically.
  const auto slot = next_artifact_;
  next_artifact_ = (next_artifact_ + 1) % max_artifacts_;

  CrashArtifactPaths paths;
  paths.text = StringPrintf("%s%02d", file_name_prefix_.c_str(), slot);
  if (supports_proto_) {
    paths.proto = StringPrintf("%s%02d.pb", file_name_prefix_.c_str(), slot);
  }
  return paths;
}

用于 tombstone 的 CrashQueue 定义如下

// The artifact limit comes from the tombstoned.max_tombstone_count property and defaults
// to 32. Per the article author, once that many tombstone files exist, the slot numbers
// wrap around and the oldest file is overwritten.
    static CrashQueue queue("/data/tombstones", "tombstone_" /* file_name_prefix */,
                            GetIntProperty("tombstoned.max_tombstone_count", 32),
                            1 /* max_concurrent_dumps */, true /* supports_proto */);

简单看下 rename_tombstone_fd 实现

// Gives the file behind `fd` the name `path` inside the directory `dirfd`.
// Despite the name, this is not rename(2): any existing entry at `path` is unlinked and a
// new link to the fd's file is created with linkat().
// Returns true on success; logs the error and returns false if the unlink fails for a
// reason other than ENOENT, or if the link fails.
static bool rename_tombstone_fd(borrowed_fd fd, borrowed_fd dirfd, const std::string& path) {
  // Always try to unlink the tombstone file.
  // linkat doesn't let us replace a file, so we need to unlink before linking
  // our results onto disk, and if we fail for some reason, we should delete
  // stale tombstones to avoid confusing inconsistency.
  int rc = unlinkat(dirfd.get(), path.c_str(), 0); // Remove any file already present at the target name.
  if (rc != 0 && errno != ENOENT) {
    PLOG(ERROR) << "failed to unlink tombstone at " << path;
    return false;
  }

  std::string fd_path = StringPrintf("/proc/self/fd/%d", fd.get());
  // Link the open file under its final name. The /proc/self/fd entry is a symlink to the
  // file, hence AT_SYMLINK_FOLLOW.
  rc = linkat(AT_FDCWD, fd_path.c_str(), dirfd.get(), path.c_str(), AT_SYMLINK_FOLLOW);
  if (rc != 0) {
    PLOG(ERROR) << "failed to link tombstone at " << path;
    return false;
  }
  return true;
}

下面看看AMS对crash_dump上报的crash处理逻辑。

系统监听 Native crash

SystemServer#startOtherServices

/// frameworks/base/services/java/com/android/server/SystemServer.java
/**
* Starts a miscellaneous grab bag of stuff that has yet to be refactored and organized.
*/
private void startOtherServices(@NonNull TimingsTraceAndSlog t) {
  ...
  // The callback passed to AMS.systemReady() calls AMS.startObservingNativeCrashes(),
  // which begins listening for native crashes.
  mActivityManagerService.systemReady(() -> {
      Slog.i(TAG, "Making services ready");
      t.traceBegin("StartActivityManagerReadyPhase");
      mSystemServiceManager.startBootPhase(t, SystemService.PHASE_ACTIVITY_MANAGER_READY);
      t.traceEnd();
      t.traceBegin("StartObservingNativeCrashes");
      try {
          mActivityManagerService.startObservingNativeCrashes();
      } catch (Throwable e) {

}

AMS#startObservingNativeCrashes

/**
 * Starts listening for native crashes.
 *
 * <p>NativeCrashListener extends Thread, so starting it runs the listener on a new thread.
 */
public void startObservingNativeCrashes() {
    new NativeCrashListener(this).start();
}

NativeCrashListener#run

run 方法是其实现的地方，建立了一个 socket server 端，以供 crash_dump 来连接并反馈

/// frameworks/base/services/core/java/com/android/server/am/NativeCrashListener.java
/**
 * Listener loop: creates a UNIX-domain server socket at DEBUGGERD_SOCKET_PATH, then
 * accepts connections forever. Each connection is handed to consumeNativeCrashData(),
 * after which one ack byte is written back and the peer socket is closed.
 * If the server socket cannot be set up, the error is logged and the method returns.
 */
@Override
public void run() {
    // Single-byte buffer sent back as the acknowledgement; its content is never set.
    final byte[] ackSignal = new byte[1];

    if (DEBUG) Slog.i(TAG, "Starting up");

    // The file system entity for this socket is created with 0777 perms, owned
    // by system:system. selinux restricts things so that only crash_dump can
    // access it.
    {
        // Remove a stale socket file so that bind() below can create it again.
        File socketFile = new File(DEBUGGERD_SOCKET_PATH);
        if (socketFile.exists()) {
            socketFile.delete();
        }
    }

    try {// Create the server socket.
        FileDescriptor serverFd = Os.socket(AF_UNIX, SOCK_STREAM, 0);
        final UnixSocketAddress sockAddr = UnixSocketAddress.createFileSystem(
                DEBUGGERD_SOCKET_PATH);
        Os.bind(serverFd, sockAddr);
        Os.listen(serverFd, 1);
        Os.chmod(DEBUGGERD_SOCKET_PATH, 0777);

        while (true) { // Handle crash reports from crash_dump, one connection at a time.
            FileDescriptor peerFd = null;
            try {
                if (MORE_DEBUG) Slog.v(TAG, "Waiting for debuggerd connection");
                peerFd = Os.accept(serverFd, null /* peerAddress */); // Wait for a crash_dump client to connect.
                if (MORE_DEBUG) Slog.v(TAG, "Got debuggerd socket " + peerFd);
                if (peerFd != null) {
                    // the reporting thread may take responsibility for
                    // acking the debugger; make sure we play along.
                    consumeNativeCrashData(peerFd); // Parse the reported crash data.
                }
            } catch (Exception e) {
                Slog.w(TAG, "Error handling connection", e);
            } finally {
                // Always ack crash_dump's connection to us.  The actual
                // byte written is irrelevant.
                if (peerFd != null) {
                    try { // Write the reply that lets crash_dump continue.
                        Os.write(peerFd, ackSignal, 0, 1);
                    } catch (Exception e) {
                        /* we don't care about failures here */
                        if (MORE_DEBUG) {
                            Slog.d(TAG, "Exception writing ack: " + e.getMessage());
                        }
                    }
                    try {
                        Os.close(peerFd);
                    } catch (ErrnoException e) {
                        if (MORE_DEBUG) {
                            Slog.d(TAG, "Exception closing socket: " + e.getMessage());
                        }
                    }
                }
            }
        }
    } catch (Exception e) {
        Slog.e(TAG, "Unable to init native debug socket!", e);
    }
}

NativeCrashListener#consumeNativeCrashData

接收、解析crash数据，并向AMS上报crash

// Read a crash report from the connection
// Wire format as read below: an 8-byte header (pid, then signal number, 4 bytes each),
// followed by the dump text, which ends either with a 0 byte or when the peer stops sending.
// When the pid maps to a non-persistent ProcessRecord, the record is marked as crashing
// and the report is handed to a NativeCrashReporter thread. All exceptions are caught
// and logged here.
void consumeNativeCrashData(FileDescriptor fd) {
    if (MORE_DEBUG) Slog.i(TAG, "debuggerd connected");
    final byte[] buf = new byte[4096];
    final ByteArrayOutputStream os = new ByteArrayOutputStream(4096);

    try {
        // Apply the same timeout to both receiving and sending on this socket.
        StructTimeval timeout = StructTimeval.fromMillis(SOCKET_TIMEOUT_MILLIS);
        Os.setsockoptTimeval(fd, SOL_SOCKET, SO_RCVTIMEO, timeout);
        Os.setsockoptTimeval(fd, SOL_SOCKET, SO_SNDTIMEO, timeout);

        // The socket is guarded by an selinux neverallow rule that only
        // permits crash_dump to connect to it. This allows us to trust the
        // received values.

        // first, the pid and signal number
        int headerBytes = readExactly(fd, buf, 0, 8);
        if (headerBytes != 8) {
            // protocol failure; give up
            Slog.e(TAG, "Unable to read from debuggerd");
            return;
        }

        int pid = unpackInt(buf, 0);
        int signal = unpackInt(buf, 4);
        if (DEBUG) {
            Slog.v(TAG, "Read pid=" + pid + " signal=" + signal);
        }

        // now the text of the dump
        if (pid > 0) { // The pid looks valid.
            final ProcessRecord pr;
            // Look up the ProcessRecord for this pid. Per the article author, app and system
            // processes normally have one, while a pure native process does not.
            synchronized (mAm.mPidsSelfLocked) {
                pr = mAm.mPidsSelfLocked.get(pid);
            }
            if (pr != null) {
                // Don't attempt crash reporting for persistent apps
                if (pr.isPersistent()) {
                    if (DEBUG) {
                        Slog.v(TAG, "Skipping report for persistent app " + pr);
                    }
                    return;
                }

                int bytes;
                do {
                    // get some data
                    bytes = Os.read(fd, buf, 0, buf.length);
                    if (bytes > 0) {
                        if (MORE_DEBUG) {
                            String s = new String(buf, 0, bytes, "UTF-8");
                            Slog.v(TAG, "READ=" + bytes + "> " + s);
                        }
                        // did we just get the EOD null byte?
                        if (buf[bytes-1] == 0) {
                            os.write(buf, 0, bytes-1);  // exclude the EOD token
                            break;
                        }
                        // no EOD, so collect it and read more
                        os.write(buf, 0, bytes);
                    }
                } while (bytes > 0);

                // Okay, we've got the report.
                if (DEBUG) Slog.v(TAG, "processing");

                // Mark the process record as being a native crash so that the
                // cleanup mechanism knows we're still submitting the report
                // even though the process will vanish as soon as we let
                // debuggerd proceed.
                synchronized (mAm) {
                    synchronized (mAm.mProcLock) {
                        pr.mErrorState.setCrashing(true);
                        pr.mErrorState.setForceCrashReport(true);
                    }
                }

                // Crash reporting is synchronous but we want to let debuggerd
                // go about it business right away, so we spin off the actual
                // reporting logic on a thread and let it take it's time.
                final String reportString = new String(os.toByteArray(), "UTF-8");
                (new NativeCrashReporter(pr, signal, reportString)).start();// Report on a separate thread.
            } else {
                Slog.w(TAG, "Couldn't find ProcessRecord for pid " + pid);
            }
        } else {
            Slog.e(TAG, "Bogus pid!");
        }
    } catch (Exception e) {
        Slog.e(TAG, "Exception dealing with report", e);
        // ugh, fail.
    }
}

NativeCrashReporter#run

NativeCrashReporter 继承自 Thread 类，用于异步上报 crash 事件

/**
 * Builds a CrashInfo describing the native crash and hands it to
 * ActivityManagerService. Any exception raised while reporting is logged, not rethrown.
 */
@Override
public void run() {
    try {
        final CrashInfo crashInfo = new CrashInfo();
        // A native crash has no Java throw site, so the location fields are placeholders.
        crashInfo.throwFileName = "unknown";
        crashInfo.throwClassName = "unknown";
        crashInfo.throwMethodName = "unknown";
        // The signal description serves as the message and the received dump text as the stack trace.
        crashInfo.exceptionClassName = "Native crash";
        crashInfo.exceptionMessage = Os.strsignal(mSignal);
        crashInfo.stackTrace = mCrashReport;

        if (DEBUG) Slog.v(TAG, "Calling handleApplicationCrash()");
        // From here on the handling is similar to the Java crash flow.
        mAm.handleApplicationCrashInner("native_crash", mApp, mApp.processName, crashInfo);
        if (DEBUG) Slog.v(TAG, "<-- handleApplicationCrash() returned");
    } catch (Exception e) {
        Slog.e(TAG, "Unable to report native crash", e);
    }
}