文章托管在gitee上 Android Notes , 同步csdn
Trace生成流程
从第2篇可知,Watchdog的Trace生成过程如下:
- 当等待时间 >Max/2 , 即评估状态为 WAITED_HALF,则会输出第一次Trace
- 当等待时间 >Max, 即评估状态为 OVERDUE,则会输出第二次Trace,以及一些其他的信息(kernel log,binder相关信息,dropbox等)
- 最终,会将两次生成的Trace合二为一,生成一个最终的Trace。
下面看判断条件
} else if (waitState == WAITED_HALF) {
if (!waitedHalf) {
Slog.i(TAG, "WAITED_HALF");
waitedHalf = true; // Set the flag so this branch is not entered repeatedly
// We've waited half, but we'd need to do the stack trace dump w/o the lock.
pids = new ArrayList<>(mInterestingJavaPids); // Snapshot of the Java pids to dump
doWaitedHalfDump = true; // Request the first trace dump
} else {
continue;
}
} else { // OVERDUE: this path leads to the second dump
// something is overdue!
blockedCheckers = getBlockedCheckersLocked();
subject = describeCheckersLocked(blockedCheckers);
allowRestart = mAllowRestart;
pids = new ArrayList<>(mInterestingJavaPids);
}
输出第一次Trace
if (doWaitedHalfDump) {
// We've waited half the deadlock-detection interval. Pull a stack
// trace and wait another half.
// Only the Java pids, the native pids and the subject are supplied here; every other
// argument is passed as null.
ActivityManagerService.dumpStackTraces(pids, null, null,
getInterestingNativePids(), null, subject);
continue;
}
添加 dump Java进程
pids 表示需要 dump 的 Java 进程列表,取自 mInterestingJavaPids。mInterestingJavaPids 默认包含系统进程的 pid,因此会输出 system_server 进程的 trace。此外,当某些特殊进程启动或退出时,会相应地向 mInterestingJavaPids 添加或从中移除其 pid
/**
 * Called when a Java process with {@code pid} has started. If the process is one the
 * watchdog is interested in, its pid is recorded under {@code mLock}; such a process
 * may have its stack trace dumped during an ANR.
 *
 * @param processName name of the process that started
 * @param pid pid of the process that started
 */
public void processStarted(String processName, int pid) {
    if (!isInterestingJavaProcess(processName)) {
        return;
    }
    Slog.i(TAG, "Interesting Java process " + processName + " started. Pid " + pid);
    synchronized (mLock) {
        mInterestingJavaPids.add(pid);
    }
}
/**
 * Returns whether {@code processName} names a Java process the watchdog tracks: either
 * the process named by StorageManagerService.sMediaStoreAuthorityProcessName or
 * "com.android.phone".
 */
private static boolean isInterestingJavaProcess(String processName) {
    if (processName.equals(StorageManagerService.sMediaStoreAuthorityProcessName)) {
        return true;
    }
    return processName.equals("com.android.phone");
}
/**
 * Called when a Java process with {@code pid} has died. If the process is one the
 * watchdog is interested in, its pid is removed from the tracked set under {@code mLock}.
 *
 * @param processName name of the process that died
 * @param pid pid of the process that died
 */
public void processDied(String processName, int pid) {
    if (!isInterestingJavaProcess(processName)) {
        return;
    }
    Slog.i(TAG, "Interesting Java process " + processName + " died. Pid " + pid);
    synchronized (mLock) {
        // Box explicitly so the element (not an index) is removed.
        mInterestingJavaPids.remove(Integer.valueOf(pid));
    }
}
添加 dump native进程
getInterestingNativePids() 用于获取需要 dump 的 native 进程 pid 列表,其返回值作为参数传给 dumpStackTraces
/**
 * Builds the list of native pids to dump: the pids contributed by
 * addInterestingAidlPids and addInterestingHidlPids, plus the pids returned by
 * Process.getPidsForCommands for NATIVE_STACKS_OF_INTEREST. Pids are collected in a
 * set, so each pid appears at most once in the result.
 */
static ArrayList<Integer> getInterestingNativePids() {
    final HashSet<Integer> collected = new HashSet<>();
    addInterestingAidlPids(collected);
    addInterestingHidlPids(collected);

    final int[] commandPids = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST);
    if (commandPids != null) {
        for (int idx = 0; idx < commandPids.length; idx++) {
            collected.add(commandPids[idx]);
        }
    }
    return new ArrayList<Integer>(collected);
}
addInterestingAidlPids
/**
 * Adds to {@code pids} the debug pid of every service whose name starts with one of
 * the prefixes in AIDL_INTERFACE_PREFIXES_OF_INTEREST. Does nothing when the service
 * manager returns no debug info.
 */
private static void addInterestingAidlPids(HashSet<Integer> pids) {
    final ServiceDebugInfo[] debugInfos = ServiceManager.getServiceDebugInfo();
    if (debugInfos == null) {
        return;
    }
    for (ServiceDebugInfo debugInfo : debugInfos) {
        for (String interestingPrefix : AIDL_INTERFACE_PREFIXES_OF_INTEREST) {
            if (!debugInfo.name.startsWith(interestingPrefix)) {
                continue;
            }
            pids.add(debugInfo.debugPid);
        }
    }
}
addInterestingHidlPids
/**
 * Adds to {@code pids} the pid of every instance reported by the HIDL service manager's
 * debugDump() that has a pid and whose interface name is in HAL_INTERFACES_OF_INTEREST.
 * A RemoteException is logged and otherwise ignored.
 */
private static void addInterestingHidlPids(HashSet<Integer> pids) {
    try {
        final IServiceManager hidlServiceManager = IServiceManager.getService();
        final ArrayList<IServiceManager.InstanceDebugInfo> instances =
                hidlServiceManager.debugDump();
        for (IServiceManager.InstanceDebugInfo instance : instances) {
            final boolean hasPid = instance.pid != IServiceManager.PidConstant.NO_PID;
            if (hasPid && HAL_INTERFACES_OF_INTEREST.contains(instance.interfaceName)) {
                pids.add(instance.pid);
            }
        }
    } catch (RemoteException e) {
        Log.w(TAG, e);
    }
}
输出第二次Trace
// Time of the dump, read from the uptime clock.
long anrTime = SystemClock.uptimeMillis();
// The report begins with the value returned by MemoryPressureUtil.currentPsiState().
StringBuilder report = new StringBuilder();
report.append(MemoryPressureUtil.currentPsiState());
ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(false);
// Passed to dumpStackTraces as the writer for errors raised while creating the trace file.
StringWriter tracesFileException = new StringWriter();
// This call also supplies a CPU tracker, a lastPids array and the exception writer.
final File stack = ActivityManagerService.dumpStackTraces(
pids, processCpuTracker, new SparseArray<>(), getInterestingNativePids(),
tracesFileException, subject);
subject 是卡顿原因的主题,通过 Watchdog.HandlerChecker#describeBlockedStateLocked 获取
/**
 * Describes where this checker is blocked: in the handler itself when no monitor is
 * currently set, otherwise in the class of the current monitor. The checker name and
 * its thread name are appended in both cases.
 */
String describeBlockedStateLocked() {
    final StringBuilder description = new StringBuilder();
    if (mCurrentMonitor != null) {
        description.append("Blocked in monitor ")
                .append(mCurrentMonitor.getClass().getName())
                .append(" on ");
    } else {
        description.append("Blocked in handler on ");
    }
    description.append(mName).append(" (").append(getThread().getName()).append(")");
    return description.toString();
}
dump kernel log
// Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the kernel log
doSysRq('w'); // SysRq 'w': dump tasks in uninterruptible (blocked) state
doSysRq('l'); // SysRq 'l': show a stack backtrace for all active CPUs
/**
 * Sends a single SysRq command character to the kernel by writing it to
 * /proc/sysrq-trigger. Failures are logged and otherwise ignored.
 *
 * @param c the SysRq command character to write
 */
private void doSysRq(char c) {
    // try-with-resources closes the writer even when write() throws, so the file
    // descriptor is not leaked on the error path.
    try (FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger")) {
        sysrq_trigger.write(c);
    } catch (IOException e) {
        Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
    }
}
输出Watchdog到dropbox
// Try to add the error to the dropbox, but assuming that the ActivityManager
// itself may be deadlocked. (which has happened, causing this statement to
// deadlock and the watchdog as a whole to be ineffective)
Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
public void run() {
// If a watched thread hangs before init() is called, we don't have a
// valid mActivity. So we can't log the error to dropbox.
if (mActivity != null) {
mActivity.addErrorToDropBox( // NOTE(review): said to reach DropBoxManagerService via DropBoxManager; confirm
"watchdog", null, "system_server", null, null, null,
null, report.toString(), stack, null, null, null,
errorId);
}
}
};
dropboxThread.start();
try {
dropboxThread.join(2000); // wait up to 2 seconds for it to return.
} catch (InterruptedException ignored) {}
合并Trace
这部分将会合并两次的Trace。通过流将两个trace文件写入到新文件,生成一个新文件。这部分本地还没有最新代码,等更新了S最新代码,再上代码。
ActivityManagerService.dumpStackTraces 实现
从上面的分析来看,trace生成的关键实现是AMS#dumpStackTraces。这部分调用比较长,实际上是system_server通过socket连接tombstoned进程,让后者dump相关进程的信息返回相关数据,然后system_server将trace信息写入指定的文件。
/**
 * Dumps process stack traces to a trace file. This overload does not report the file
 * offsets of the first pid's trace; it forwards to the overload that accepts
 * {@code firstPidOffsets}, passing null for it.
 *
 * @param firstPids of dalvik VM processes to dump stack traces for first
 * @param processCpuTracker forwarded unchanged to the delegate overload
 * @param lastPids of dalvik VM processes to dump stack traces for last
 * @param nativePids optional list of native pids to dump stack crawls
 * @param logExceptionCreatingFile optional writer to which we log errors creating the file
 * @param subject optional line related to the error
 * @return whatever the delegate overload returns
 */
public static File dumpStackTraces(ArrayList<Integer> firstPids,
        ProcessCpuTracker processCpuTracker, SparseArray<Boolean> lastPids,
        ArrayList<Integer> nativePids, StringWriter logExceptionCreatingFile,
        String subject) {
    // Offsets are not requested by this overload.
    final long[] noFirstPidOffsets = null;
    final File tracesFile = dumpStackTraces(firstPids, processCpuTracker, lastPids,
            nativePids, logExceptionCreatingFile, noFirstPidOffsets, subject);
    return tracesFile;
}
调用重载方法
/**
 * Creates a new trace file, writes the optional subject line at its start, then dumps
 * stack traces into it. Part of the method body is elided ("...") in this excerpt.
 *
 * @param firstPidOffsets Optional, when it's set, it receives the start/end offset
 * of the very first pid to be dumped. Both entries are set to -1 when the dump returns
 * no offsets.
 * @return the trace file, or null when the file could not be created.
 */
/* package */ static File dumpStackTraces(ArrayList<Integer> firstPids,
ProcessCpuTracker processCpuTracker, SparseArray<Boolean> lastPids,
ArrayList<Integer> nativePids, StringWriter logExceptionCreatingFile,
long[] firstPidOffsets, String subject) {
ArrayList<Integer> extraPids = null;
Slog.i(TAG, "dumpStackTraces pids=" + lastPids + " nativepids=" + nativePids);
// Measure CPU usage as soon as we're called in order to get a realistic sampling
// of the top users at the time of the request.
...
final File tracesDir = new File(ANR_TRACE_DIR); // ANR_TRACE_DIR = "/data/anr";
// Each set of ANR traces is written to a separate file and dumpstate will process
// all such files and add them to a captured bug report if they're recent enough.
maybePruneOldTraces(tracesDir); // Prunes old trace files (going by the method name)
// NOTE: We should consider creating the file in native code atomically once we've
// gotten rid of the old scheme of dumping and lot of the code that deals with paths
// can be removed.
File tracesFile;
try { // Create the trace file; the article gives the name pattern as anr_yyyy-MM-dd-HH-mm-ss-SSS
tracesFile = createAnrDumpFile(tracesDir);
} catch (IOException e) {
Slog.w(TAG, "Exception creating ANR dump file:", e);
if (logExceptionCreatingFile != null) {
logExceptionCreatingFile.append("----- Exception creating ANR dump file -----\n");
e.printStackTrace(new PrintWriter(logExceptionCreatingFile));
}
return null;
}
// Header: the first line of the trace file, e.g. "Subject: Blocked in monitor *** on xxx thread"
if (subject != null) {
try (FileOutputStream fos = new FileOutputStream(tracesFile, true)) {
String header = "Subject: " + subject + "\n";
fos.write(header.getBytes(StandardCharsets.UTF_8));
} catch (IOException e) {
Slog.w(TAG, "Exception writing subject to ANR dump file:", e);
}
}
// Run the actual dump
Pair<Long, Long> offsets = dumpStackTraces(
tracesFile.getAbsolutePath(), firstPids, nativePids, extraPids);
if (firstPidOffsets != null) {
if (offsets == null) {
firstPidOffsets[0] = firstPidOffsets[1] = -1;
} else {
firstPidOffsets[0] = offsets.first; // Start offset to the ANR trace file
firstPidOffsets[1] = offsets.second; // End offset to the ANR trace file
}
}
return tracesFile;
}
dumpStackTraces
- 执行 dump Java 进程trace, 调用 dumpJavaTracesTombstoned
- 执行 dump native 进程trace, 调用Debug.dumpNativeBacktraceToFileTimeout
- 执行 extra 进程trace, 调用dumpJavaTracesTombstoned
/**
 * Dumps stack traces into {@code tracesFile} in three passes: the Java processes in
 * {@code firstPids}, then the native processes in {@code nativePids}, then the Java
 * processes in {@code extraPids}. The passes share a single 20 second budget; as soon as
 * it is used up the dump is aborted and the method returns early.
 *
 * @return The start/end offset of the trace of the very first PID, or null when no start
 * offset was recorded.
 */
public static Pair<Long, Long> dumpStackTraces(String tracesFile, ArrayList<Integer> firstPids,
ArrayList<Integer> nativePids, ArrayList<Integer> extraPids) {
Slog.i(TAG, "Dumping to " + tracesFile);
// We don't need any sort of inotify based monitoring when we're dumping traces via
// tombstoned. Data is piped to an "intercept" FD installed in tombstoned so we're in full
// control of all writes to the file in question.
// We must complete all stack dumps within 20 seconds.
long remainingTime = 20 * 1000; // Total dump budget: 20s
// As applications are usually interested with the ANR stack traces, but we can't share with
// them the stack traces other than their own stacks. So after the very first PID is
// dumped, remember the current file size.
long firstPidStart = -1;
long firstPidEnd = -1;
// First collect all of the stacks of the most important pids.
if (firstPids != null) {
int num = firstPids.size();
for (int i = 0; i < num; i++) {
final int pid = firstPids.get(i);
// We don't copy ANR traces from the system_server intentionally.
final boolean firstPid = i == 0 && MY_PID != pid;
File tf = null;
if (firstPid) {
tf = new File(tracesFile);
firstPidStart = tf.exists() ? tf.length() : 0;
}
Slog.i(TAG, "Collecting stacks for pid " + pid);
// Dump the Java process
final long timeTaken = dumpJavaTracesTombstoned(pid, tracesFile,
remainingTime);
remainingTime -= timeTaken;
if (remainingTime <= 0) {
Slog.e(TAG, "Aborting stack trace dump (current firstPid=" + pid
+ "); deadline exceeded.");
return firstPidStart >= 0 ? new Pair<>(firstPidStart, firstPidEnd) : null;
}
if (firstPid) {
firstPidEnd = tf.length();
}
if (DEBUG_ANR) {
Slog.d(TAG, "Done with pid " + firstPids.get(i) + " in " + timeTaken + "ms");
}
}
}
// Next collect the stacks of the native pids
if (nativePids != null) {
for (int pid : nativePids) {
Slog.i(TAG, "Collecting stacks for native pid " + pid);
// Each native dump gets at most NATIVE_DUMP_TIMEOUT_MS, capped by what is left of the budget.
final long nativeDumpTimeoutMs = Math.min(NATIVE_DUMP_TIMEOUT_MS, remainingTime);
final long start = SystemClock.elapsedRealtime();
// Dump the native process
Debug.dumpNativeBacktraceToFileTimeout(
pid, tracesFile, (int) (nativeDumpTimeoutMs / 1000));
final long timeTaken = SystemClock.elapsedRealtime() - start;
remainingTime -= timeTaken;
if (remainingTime <= 0) {
Slog.e(TAG, "Aborting stack trace dump (current native pid=" + pid +
"); deadline exceeded.");
return firstPidStart >= 0 ? new Pair<>(firstPidStart, firstPidEnd) : null;
}
if (DEBUG_ANR) {
Slog.d(TAG, "Done with native pid " + pid + " in " + timeTaken + "ms");
}
}
}
// Lastly, dump stacks for all extra PIDs from the CPU tracker.
if (extraPids != null) {
for (int pid : extraPids) {
Slog.i(TAG, "Collecting stacks for extra pid " + pid);
final long timeTaken = dumpJavaTracesTombstoned(pid, tracesFile, remainingTime);
remainingTime -= timeTaken;
if (remainingTime <= 0) {
Slog.e(TAG, "Aborting stack trace dump (current extra pid=" + pid +
"); deadline exceeded.");
return firstPidStart >= 0 ? new Pair<>(firstPidStart, firstPidEnd) : null;
}
if (DEBUG_ANR) {
Slog.d(TAG, "Done with extra pid " + pid + " in " + timeTaken + "ms");
}
}
}
Slog.i(TAG, "Done dumping");
return firstPidStart >= 0 ? new Pair<>(firstPidStart, firstPidEnd) : null;
}
下面以 dumpJavaTracesTombstoned 为例。
dumpJavaTracesTombstoned
/**
 * Dump java traces for process {@code pid} to the specified file. If java trace dumping
 * fails, a native backtrace is attempted. Note that the timeout {@code timeoutMs} only applies
 * to the java section of the trace, a further {@code NATIVE_DUMP_TIMEOUT_MS} might be spent
 * attempting to obtain native traces in the case of a failure. Returns the total time spent
 * capturing traces.
 *
 * <p>The timeout is converted to whole seconds and clamped to at least one second, because
 * {@code Debug.dumpJavaBacktraceToFileTimeout} interprets 0 as "wait forever".
 *
 * @param pid pid of the process to dump
 * @param fileName path of the trace file that the dump is appended to
 * @param timeoutMs time budget, in milliseconds, for the java part of the dump
 * @return elapsed time in milliseconds spent in this method
 */
private static long dumpJavaTracesTombstoned(int pid, String fileName, long timeoutMs) {
    final long timeStart = SystemClock.elapsedRealtime();
    // A remaining budget below one second would truncate to 0 seconds, which the callee
    // treats as "no timeout". Round such values up to one second so the dump stays bounded.
    final int timeoutSecs = (int) Math.max(1, timeoutMs / 1000);
    boolean javaSuccess = Debug.dumpJavaBacktraceToFileTimeout(pid, fileName, timeoutSecs);
    if (javaSuccess) {
        // Check that something is in the file, actually. Try-catch should not be necessary,
        // but better safe than sorry.
        try {
            long size = new File(fileName).length();
            if (size < JAVA_DUMP_MINIMUM_SIZE) { // Treated as an empty file
                Slog.w(TAG, "Successfully created Java ANR file is empty!");
                javaSuccess = false;
            }
        } catch (Exception e) {
            Slog.w(TAG, "Unable to get ANR file size", e);
            javaSuccess = false;
        }
    }
    if (!javaSuccess) {
        // Fall back to a native stack dump of the same process.
        Slog.w(TAG, "Dumping Java threads failed, initiating native stack dump.");
        if (!Debug.dumpNativeBacktraceToFileTimeout(pid, fileName,
                (NATIVE_DUMP_TIMEOUT_MS / 1000))) {
            Slog.w(TAG, "Native stack dump failed!");
        }
    }
    return SystemClock.elapsedRealtime() - timeStart;
}
接下来通过 Debug实现连接 debuggerd 来完成dump
Debug#dumpJavaBacktraceToFileTimeout
Debug的dumpJavaBacktraceToFileTimeout方法是一个native方法
/// @frameworks/base/core/java/android/os/Debug.java
/**
* Append the Java stack traces of a given native process to a specified file.
*
* @param pid pid to dump.
* @param file path of file to append dump to.
* @param timeoutSecs time to wait in seconds, or 0 to wait forever.
* @hide
*/
public static native boolean dumpJavaBacktraceToFileTimeout(int pid, String file,
该native的实现在 android_os_Debug.cpp, 对应的jni函数是 android_os_Debug_dumpJavaBacktraceToFileTimeout
/// @frameworks/base/core/jni/android_os_Debug.cpp
// JNI registration entry: the descriptor "(ILjava/lang/String;I)Z" reads as
// (int, String, int) -> boolean.
{ "dumpJavaBacktraceToFileTimeout", "(ILjava/lang/String;I)Z",
(void*)android_os_Debug_dumpJavaBacktraceToFileTimeout },
android_os_Debug_dumpJavaBacktraceToFileTimeout
// JNI implementation of Debug.dumpJavaBacktraceToFileTimeout: forwards to dumpTraces()
// with the dump type fixed to kDebuggerdJavaBacktrace and maps the result to a jboolean.
static jboolean android_os_Debug_dumpJavaBacktraceToFileTimeout(JNIEnv* env, jobject clazz,
        jint pid, jstring fileName, jint timeoutSecs) {
    if (dumpTraces(env, pid, fileName, timeoutSecs, kDebuggerdJavaBacktrace)) {
        return JNI_TRUE;
    }
    return JNI_FALSE;
}
此jni实现直接调用了 dumpTraces
// Opens (creating if needed) the trace file in append mode, asks
// dump_backtrace_to_file_timeout() to write the requested dump into it, then syncs the
// file data. Returns true only when the dump call reports 0; a failed sync is logged
// but does not change the result.
static bool dumpTraces(JNIEnv* env, jint pid, jstring fileName, jint timeoutSecs,
                       DebuggerdDumpType dumpType) {
    const ScopedUtfChars path(env, fileName);
    const char* const pathChars = path.c_str();
    if (pathChars == nullptr) {
        return false;
    }

    const int openFlags = O_CREAT | O_WRONLY | O_NOFOLLOW | O_CLOEXEC | O_APPEND;
    android::base::unique_fd traceFd(open(pathChars, openFlags, 0666));
    if (traceFd.get() < 0) {
        PLOG(ERROR) << "Can't open " << pathChars;
        return false;
    }

    // Declared in debuggerd/client.h (libdebuggerd_client).
    const int dumpResult =
            dump_backtrace_to_file_timeout(pid, dumpType, timeoutSecs, traceFd.get());
    if (fdatasync(traceFd.get()) != 0) {
        PLOG(ERROR) << "Failed flushing trace.";
    }
    return dumpResult == 0;
}
dump_backtrace_to_file_timeout
/// @system/core/debuggerd/client/debuggerd_client.cpp
int dump_backtrace_to_file_timeout(pid_t tid, DebuggerdDumpType dump_type, int timeout_secs,
int fd) {
android::base::unique_fd copy(dup(fd));
if (copy == -1) {
return -1;
}
// debuggerd_trigger_dump results in every thread in the process being interrupted
// by a signal, so we need to fetch the wchan data before calling that.
std::string wchan_data = get_wchan_data(tid);
int timeout_ms = timeout_secs > 0 ? timeout_secs * 1000 : 0;
// 开始连接 tombstoned 进程
int ret = debuggerd_trigger_dump(tid, dump_type, timeout_ms, std::move(copy)) ? 0 : -1;
// Dump wchan data, since only privileged processes (CAP_SYS_ADMIN) can read
// kernel stack traces (/proc/*/stack).
dump_wchan_data(wchan_data, fd, tid);
return ret;
}
debuggerd_trigger_dump
- 通过 socket 连接 tombstoned, 与其进行交互
- 创建pipe,将write端传给服务端,本地端通过poll监听read fd
- 发送Signal 3信号触发 SignalCatcher 去执行dump
- 接收来自对端的trace数据,并将之写入trace文件
/// @system/core/debuggerd/client/debuggerd_client.cpp
// Requests a dump of `tid` and copies the resulting output into `output_fd`.
//
// Steps, in order:
//   1. For Java dumps, resolve `tid` to its process id.
//   2. Connect to tombstoned's intercept socket and send an InterceptRequest together
//      with the write end of a freshly created pipe.
//   3. Wait for the kRegistered response, then signal the target via send_signal().
//   4. Wait for the kStarted response, then forward everything read from the pipe to
//      `output_fd` until end-of-file.
//
// `timeout_ms` bounds the whole operation; a value of 0 disables all timeouts.
// Returns true on success, false on any failure or when the timeout expires.
bool debuggerd_trigger_dump(pid_t tid, DebuggerdDumpType dump_type, unsigned int timeout_ms,
                            unique_fd output_fd) {
  pid_t pid = tid;
  if (dump_type == kDebuggerdJavaBacktrace) {
    // Java dumps always get sent to the tgid, so we need to resolve our tid to a tgid.
    android::procinfo::ProcessInfo procinfo;
    std::string error;
    if (!android::procinfo::GetProcessInfo(tid, &procinfo, &error)) {
      LOG(ERROR) << "libdebugged_client: failed to get process info: " << error;
      return false;
    }
    pid = procinfo.pid;
  }

  LOG(INFO) << "libdebuggerd_client: started dumping process " << pid;
  unique_fd sockfd;
  const auto end = std::chrono::steady_clock::now() + std::chrono::milliseconds(timeout_ms);
  auto time_left = [&end]() { return end - std::chrono::steady_clock::now(); };

  // Applies the remaining time as both receive and send timeout on the socket.
  // Returns the fd on success (or when timeouts are disabled) and -1 on failure.
  auto set_timeout = [timeout_ms, &time_left](int sockfd) {
    if (timeout_ms <= 0) {
      return sockfd;
    }

    auto remaining = time_left();
    if (remaining < decltype(remaining)::zero()) {
      LOG(ERROR) << "libdebuggerd_client: timeout expired";
      return -1;
    }

    struct timeval timeout;
    populate_timeval(&timeout, remaining);

    // Receive timeout.
    if (setsockopt(sockfd, SOL_SOCKET, SO_RCVTIMEO, &timeout, sizeof(timeout)) != 0) {
      PLOG(ERROR) << "libdebuggerd_client: failed to set receive timeout";
      return -1;
    }
    // Send timeout.
    if (setsockopt(sockfd, SOL_SOCKET, SO_SNDTIMEO, &timeout, sizeof(timeout)) != 0) {
      PLOG(ERROR) << "libdebuggerd_client: failed to set send timeout";
      return -1;
    }

    return sockfd;
  };

  sockfd.reset(socket(AF_LOCAL, SOCK_SEQPACKET, 0));  // Create the socket.
  if (sockfd == -1) {
    PLOG(ERROR) << "libdebugger_client: failed to create socket";
    return false;
  }

  // Connect to the tombstoned intercept socket.
  if (socket_local_client_connect(set_timeout(sockfd.get()), kTombstonedInterceptSocketName,
                                  ANDROID_SOCKET_NAMESPACE_RESERVED, SOCK_SEQPACKET) == -1) {
    PLOG(ERROR) << "libdebuggerd_client: failed to connect to tombstoned";
    return false;
  }

  InterceptRequest req = {
      .dump_type = dump_type,
      .pid = pid,
  };

  // set_timeout() signals failure with -1, which is truthy, so a plain negation would
  // never detect it. Compare against -1 explicitly.
  if (set_timeout(sockfd.get()) == -1) {
    PLOG(ERROR) << "libdebugger_client: failed to set timeout";
    return false;
  }

  // Create an intermediate pipe to pass to the other end.
  unique_fd pipe_read, pipe_write;
  if (!Pipe(&pipe_read, &pipe_write)) {
    PLOG(ERROR) << "libdebuggerd_client: failed to create pipe";
    return false;
  }

  std::string pipe_size_str;
  int pipe_buffer_size = 1024 * 1024;
  // Prefer the system-wide maximum pipe size when it can be read.
  if (android::base::ReadFileToString("/proc/sys/fs/pipe-max-size", &pipe_size_str)) {
    pipe_size_str = android::base::Trim(pipe_size_str);

    if (!android::base::ParseInt(pipe_size_str.c_str(), &pipe_buffer_size, 0)) {
      LOG(FATAL) << "failed to parse pipe max size '" << pipe_size_str << "'";
    }
  }

  if (fcntl(pipe_read.get(), F_SETPIPE_SZ, pipe_buffer_size) != pipe_buffer_size) {
    PLOG(ERROR) << "failed to set pipe buffer size";
  }

  // Send the request along with the pipe's write end to the peer; the local copy of the
  // write end is closed right afterwards.
  ssize_t rc = SendFileDescriptors(set_timeout(sockfd), &req, sizeof(req), pipe_write.get());
  pipe_write.reset();
  if (rc != sizeof(req)) {
    PLOG(ERROR) << "libdebuggerd_client: failed to send output fd to tombstoned";
    return false;
  }

  // Check to make sure we've successfully registered.
  InterceptResponse response;  // Initial response carrying the registration status.
  rc = TEMP_FAILURE_RETRY(recv(set_timeout(sockfd.get()), &response, sizeof(response), MSG_TRUNC));
  if (rc == 0) {
    LOG(ERROR) << "libdebuggerd_client: failed to read initial response from tombstoned: "
               << "timeout reached?";
    return false;
  } else if (rc == -1) {
    PLOG(ERROR) << "libdebuggerd_client: failed to read initial response from tombstoned";
    return false;
  } else if (rc != sizeof(response)) {
    LOG(ERROR) << "libdebuggerd_client: received packet of unexpected length from tombstoned while "
                  "reading initial response: expected "
               << sizeof(response) << ", received " << rc;
    return false;
  }

  if (response.status != InterceptStatus::kRegistered) {
    LOG(ERROR) << "libdebuggerd_client: unexpected registration response: "
               << static_cast<int>(response.status);
    return false;
  }

  // Key step: the signal is what makes the target produce the dump.
  if (!send_signal(tid, dump_type)) {
    return false;
  }

  // Read the peer's status response.
  rc = TEMP_FAILURE_RETRY(recv(set_timeout(sockfd.get()), &response, sizeof(response), MSG_TRUNC));
  if (rc == 0) {
    LOG(ERROR) << "libdebuggerd_client: failed to read status response from tombstoned: "
                  "timeout reached?";
    return false;
  } else if (rc == -1) {
    PLOG(ERROR) << "libdebuggerd_client: failed to read status response from tombstoned";
    return false;
  } else if (rc != sizeof(response)) {
    LOG(ERROR) << "libdebuggerd_client: received packet of unexpected length from tombstoned while "
                  "reading confirmation response: expected "
               << sizeof(response) << ", received " << rc;
    return false;
  }

  if (response.status != InterceptStatus::kStarted) {
    response.error_message[sizeof(response.error_message) - 1] = '\0';
    LOG(ERROR) << "libdebuggerd_client: tombstoned reported failure: " << response.error_message;
    return false;
  }

  // Forward output from the pipe to the output fd.
  while (true) {
    auto remaining_ms = std::chrono::duration_cast<std::chrono::milliseconds>(time_left()).count();
    if (timeout_ms <= 0) {
      remaining_ms = -1;  // No timeout: poll() blocks until data arrives.
    } else if (remaining_ms < 0) {
      LOG(ERROR) << "libdebuggerd_client: timeout expired";
      return false;
    }

    struct pollfd pfd = {
        .fd = pipe_read.get(), .events = POLLIN, .revents = 0,
    };

    rc = poll(&pfd, 1, remaining_ms);  // Wait for data on the pipe's read end.
    if (rc == -1) {
      if (errno == EINTR) {
        continue;
      } else {
        PLOG(ERROR) << "libdebuggerd_client: error while polling";
        return false;
      }
    } else if (rc == 0) {
      LOG(ERROR) << "libdebuggerd_client: timeout expired";
      return false;
    }

    char buf[1024];
    rc = TEMP_FAILURE_RETRY(read(pipe_read.get(), buf, sizeof(buf)));
    if (rc == 0) {
      // Done.
      break;
    } else if (rc == -1) {
      PLOG(ERROR) << "libdebuggerd_client: error while reading";
      return false;
    }

    // Append the chunk to the output file.
    if (!android::base::WriteFully(output_fd.get(), buf, rc)) {
      PLOG(ERROR) << "libdebuggerd_client: error while writing";
      return false;
    }
  }

  LOG(INFO) << "libdebuggerd_client: done dumping process " << pid;
  return true;
}
send_signal
// Queues the dump-request signal for `pid`: SIGQUIT for a Java backtrace, otherwise
// BIONIC_SIGNAL_DEBUGGER. The signal value is 1 for a native backtrace and 0 for every
// other dump type. Returns false (after logging) when sigqueue() fails.
static bool send_signal(pid_t pid, const DebuggerdDumpType dump_type) {
  int signo = BIONIC_SIGNAL_DEBUGGER;
  if (dump_type == kDebuggerdJavaBacktrace) {
    signo = SIGQUIT;
  }

  sigval payload;
  if (dump_type == kDebuggerdNativeBacktrace) {
    payload.sival_int = 1;
  } else {
    payload.sival_int = 0;
  }

  if (sigqueue(pid, signo, payload) == 0) {
    return true;
  }
  PLOG(ERROR) << "libdebuggerd_client: failed to send signal to pid " << pid;
  return false;
}
此处是 dump java trace, 发送的是 SIGQUIT,信号会被art的SignalCatcher线程处理,然后去dump 虚拟机相关状态。
之后它会通过socket连接到tombstoned,去获取一个输出的fd, 而实际获取的fd则是上面传给tombstoned的 pipe_write fd。
接下来看tombstoned 方的处理,首先从其启动开始写。
tombstoned
下面简单描述下 tombstoned 的启动流程,在tombstoned.rc中有如下配置,可以发现它被当做了一个可以启动的“服务”,同时在init启动它的时候会给它创建三个socket,而在此处会使用到 tombstoned_intercept socket。
/// @system/core/debuggerd/tombstoned/tombstoned.rc
# Declares the tombstoned service, backed by /system/bin/tombstoned, running as user
# tombstoned in group system.
service tombstoned /system/bin/tombstoned
user tombstoned
group system
# Three seqpacket sockets (mode 0666, owner system, group system) are declared for the service.
socket tombstoned_crash seqpacket 0666 system system
socket tombstoned_intercept seqpacket 0666 system system
socket tombstoned_java_trace seqpacket 0666 system system
writepid /dev/cpuset/system-background/tasks
那它是在哪启动的呢? 在Android12中是直接写在init.rc中,在post-fs-data流程进行启动。而在Android11是配置在tombstoned.rc,时机也是post-fs-data,通过它的注释可以知道修改是为了早点启动来抓tombstone。
/// @system/core/rootdir/init.rc
on post-fs-data
...
# Start tombstoned early to be able to store tombstones.
mkdir /data/anr 0775 system system encryption=Require
mkdir /data/tombstones 0771 system system encryption=Require
mkdir /data/vendor/tombstones 0771 root root
mkdir /data/vendor/tombstones/wifi 0771 wifi wifi
start tombstoned # Start tombstoned
tombstoned#main
在main方法里面获取socket,并设置事件监听。
/// @system/core/debuggerd/tombstoned/tombstoned.cpp
// Entry point of tombstoned (excerpt; "..." marks code omitted from this listing).
// Obtains the intercept and crash control sockets, makes them non-blocking, creates the
// event base and hands the intercept socket to an InterceptManager.
int main(int, char* []) {
...
int intercept_socket = android_get_control_socket(kTombstonedInterceptSocketName);
int crash_socket = android_get_control_socket(kTombstonedCrashSocketName);
// Both sockets are required; abort if either is missing.
if (intercept_socket == -1 || crash_socket == -1) {
PLOG(FATAL) << "failed to get socket from init";
}
evutil_make_socket_nonblocking(intercept_socket);
evutil_make_socket_nonblocking(crash_socket);
event_base* base = event_base_new();
if (!base) {
LOG(FATAL) << "failed to create event_base";
}
// InterceptManager wraps the handling of the tombstoned_intercept socket
intercept_manager = new InterceptManager(base, intercept_socket);
...
}
InterceptManager构造
/// @system/core/debuggerd/tombstoned/intercept_manager.cpp
// Stores the event base and starts listening on the intercept socket;
// intercept_accept_cb is registered as the accept callback, with this manager as its
// argument.
InterceptManager::InterceptManager(event_base* base, int intercept_socket) : base(base) {
  const int kBacklog = -1;
  this->listener = evconnlistener_new(base, intercept_accept_cb, this, LEV_OPT_CLOSE_ON_FREE,
                                      kBacklog, intercept_socket);
}
当收到client连接请求,accept成功后调用intercept_accept_cb
intercept_accept_cb
/// @system/core/debuggerd/tombstoned/intercept_manager.cpp
// Accept callback for the intercept socket. Allocates an Intercept for the new client
// connection and registers a read event with a timeout of
// 1 * HwTimeoutMultiplier() seconds; intercept_request_cb is the event's callback.
static void intercept_accept_cb(evconnlistener* listener, evutil_socket_t sockfd, sockaddr*, int,
                                void* arg) {
  auto* pending = new Intercept();
  pending->intercept_manager = static_cast<InterceptManager*>(arg);
  pending->sockfd.reset(sockfd);  // The Intercept keeps the client fd.

  pending->intercept_event = event_new(evconnlistener_get_base(listener), sockfd,
                                       EV_TIMEOUT | EV_READ, intercept_request_cb, pending);

  struct timeval request_deadline = {1 * android::base::HwTimeoutMultiplier(), 0};
  event_add(pending->intercept_event, &request_deadline);
}
intercept_request_cb
// Event callback for a client's intercept socket. Reads the InterceptRequest and the
// file descriptor that comes with it, validates the request, rejects it when the pid
// already has an intercept, and otherwise replies with kRegistered and stores the
// intercept in the manager. On every failure path the Intercept is deleted.
static void intercept_request_cb(evutil_socket_t sockfd, short ev, void* arg) {
  auto intercept = reinterpret_cast<Intercept*>(arg);
  InterceptManager* intercept_manager = intercept->intercept_manager;

  CHECK_EQ(sockfd, intercept->sockfd.get());  // The event must belong to this intercept's socket.

  if ((ev & EV_TIMEOUT) != 0) {  // Timed out
    LOG(WARNING) << "tombstoned didn't receive InterceptRequest before timeout";
    goto fail;
  } else if ((ev & EV_READ) == 0) {  // Not a read event
    LOG(WARNING) << "tombstoned received unexpected event on intercept socket";
    goto fail;
  }

  {
    unique_fd rcv_fd;
    InterceptRequest intercept_request;
    // Receive the request together with the fd sent by the client.
    ssize_t result =
        ReceiveFileDescriptors(sockfd, &intercept_request, sizeof(intercept_request), &rcv_fd);

    if (result == -1) {
      PLOG(WARNING) << "failed to read from intercept socket";
      goto fail;
    } else if (result != sizeof(intercept_request)) {
      LOG(WARNING) << "intercept socket received short read of length " << result << " (expected "
                   << sizeof(intercept_request) << ")";
      goto fail;
    }

    // Move the received FD to the upper half, in order to more easily notice FD leaks.
    int moved_fd = fcntl(rcv_fd.get(), F_DUPFD, 512);
    if (moved_fd == -1) {
      LOG(WARNING) << "failed to move received fd (" << rcv_fd.get() << ")";
      goto fail;
    }
    rcv_fd.reset(moved_fd);

    // We trust the other side, so only do minimal validity checking.
    if (!is_intercept_request_valid(intercept_request)) {
      InterceptResponse response = {};
      response.status = InterceptStatus::kFailed;
      snprintf(response.error_message, sizeof(response.error_message), "invalid intercept request");
      TEMP_FAILURE_RETRY(write(sockfd, &response, sizeof(response)));
      goto fail;
    }

    intercept->intercept_pid = intercept_request.pid;
    intercept->dump_type = intercept_request.dump_type;

    // Check if it's already registered.
    if (intercept_manager->intercepts.count(intercept_request.pid) > 0) {
      InterceptResponse response = {};
      response.status = InterceptStatus::kFailedAlreadyRegistered;
      snprintf(response.error_message, sizeof(response.error_message),
               "pid %" PRId32 " already intercepted, type %d", intercept_request.pid,
               intercept_request.dump_type);
      TEMP_FAILURE_RETRY(write(sockfd, &response, sizeof(response)));
      LOG(WARNING) << response.error_message;
      goto fail;
    }

    // Let the other side know that the intercept has been registered, now that we know we can't
    // fail. tombstoned is single threaded, so this isn't racy.
    InterceptResponse response = {};
    response.status = InterceptStatus::kRegistered;
    if (TEMP_FAILURE_RETRY(write(sockfd, &response, sizeof(response))) == -1) {
      PLOG(WARNING) << "failed to notify interceptor of registration";
      goto fail;
    }

    intercept->output_fd = std::move(rcv_fd);  // Keep the received fd as the intercept's output fd.
    // Record the intercept in the manager's map, keyed by pid; the map takes ownership.
    intercept_manager->intercepts[intercept_request.pid] = std::unique_ptr<Intercept>(intercept);
    intercept->registered = true;

    LOG(INFO) << "registered intercept for pid " << intercept_request.pid << " and type "
              << intercept_request.dump_type;

    // Register a different read event on the socket so that we can remove intercepts if the socket
    // closes (e.g. if a user CTRL-C's the process that requested the intercept).
    event_assign(intercept->intercept_event, intercept_manager->base, sockfd, EV_READ | EV_TIMEOUT,
                 intercept_close_cb, arg);

    struct timeval timeout = {.tv_sec = 10 * android::base::HwTimeoutMultiplier(), .tv_usec = 0};
    event_add(intercept->intercept_event, &timeout);
  }

  return;

fail:
  delete intercept;
}
到此,跟着流程走到这里,好像还缺点什么:并没有真正去执行 dump 并把内容传回去。实际上,前面提到过,client 会发送信号 3(SIGQUIT)给目标进程,相关的 dump 操作由 SignalCatcher 完成,最后将内容写回给 client。关于 SignalCatcher 的相关内容会新开一篇来写。
流程图
如下,是AMS#dumpStackTraces流程中dump Java trace 的大致流程图