Android系统中,有硬件WatchDog用于定时检测关键硬件是否正常工作,类似地,在framework层有一个软件WatchDog用于定期检测关键系统服务是否发生死锁事件、Handler线程消息处理是否超时。
一、WatchDog初始化
SystemServer.java,system_server进程启动阶段,启动Watchdog线程。
/**
 * Excerpt of SystemServer#startBootstrapServices showing where the Watchdog is obtained and
 * started during system_server bootstrap. Lines shown as "......" are elided code.
 *
 * @param t trace helper used to mark the "StartWatchdog" section
 */
private void startBootstrapServices(@NonNull TimingsTraceAndSlog t) {
......
    // Start the watchdog as early as possible so we can crash the system server
    // if we deadlock during early boot
    t.traceBegin("StartWatchdog");
    // NOTE(review): the matching t.traceEnd() is presumably in the elided code below — confirm.
    final Watchdog watchdog = Watchdog.getInstance(); // obtain the Watchdog instance
    watchdog.start(); // start the watchdog thread
    mDumper.addDumpable(watchdog);
......
}
Watchdog.java,创建对象,并启动内部线程,执行run方法。
/**
 * Creates the watchdog: prepares (but does not start) the "watchdog" thread, starts the
 * dedicated "watchdog.monitor" thread, and registers a HandlerChecker with the default
 * timeout for each common system thread that should be checked.
 */
private Watchdog() {
    mThread = new Thread(this::run, "watchdog");

    // Initialize handler checkers for each common thread we want to check. Note that the
    // background thread is not checked: it may hold longer running operations and gives no
    // guarantees about the timeliness of operations there.
    //
    // Monitors are checked on a custom thread (named "watchdog.monitor") to avoid lock
    // contention from impacted other threads. It must be started before its looper is used.
    final ServiceThread monitorThread = new ServiceThread("watchdog.monitor",
            android.os.Process.THREAD_PRIORITY_DEFAULT, true /*allowIo*/);
    monitorThread.start();

    // Wrap the monitor thread's handler in a HandlerChecker; it is the first checker in the list.
    final Handler monitorHandler = new Handler(monitorThread.getLooper());
    mMonitorChecker = new HandlerChecker(monitorHandler, "monitor thread");
    mHandlerCheckers.add(withDefaultTimeout(mMonitorChecker));

    // Foreground thread.
    final HandlerChecker foregroundChecker =
            new HandlerChecker(FgThread.getHandler(), "foreground thread");
    mHandlerCheckers.add(withDefaultTimeout(foregroundChecker));

    // Main thread. We only do a quick check since there can be UI running on the thread.
    final Handler mainHandler = new Handler(Looper.getMainLooper());
    final HandlerChecker mainChecker = new HandlerChecker(mainHandler, "main thread");
    mHandlerCheckers.add(withDefaultTimeout(mainChecker));

    // Shared UI thread.
    final HandlerChecker uiChecker = new HandlerChecker(UiThread.getHandler(), "ui thread");
    mHandlerCheckers.add(withDefaultTimeout(uiChecker));

    // IO thread.
    final HandlerChecker ioChecker = new HandlerChecker(IoThread.getHandler(), "i/o thread");
    mHandlerCheckers.add(withDefaultTimeout(ioChecker));

    // Display thread.
    final HandlerChecker displayChecker =
            new HandlerChecker(DisplayThread.getHandler(), "display thread");
    mHandlerCheckers.add(withDefaultTimeout(displayChecker));

    // Animation thread.
    final HandlerChecker animationChecker =
            new HandlerChecker(AnimationThread.getHandler(), "animation thread");
    mHandlerCheckers.add(withDefaultTimeout(animationChecker));

    // Surface animation thread.
    final HandlerChecker surfaceAnimationChecker =
            new HandlerChecker(SurfaceAnimationThread.getHandler(), "surface animation thread");
    mHandlerCheckers.add(withDefaultTimeout(surfaceAnimationChecker));

    // Initialize monitor for Binder threads.
    addMonitor(new BinderThreadMonitor());

    // This process is always one of the Java processes of interest.
    mInterestingJavaPids.add(Process.myPid());

    // See the notes on DEFAULT_TIMEOUT.
    assert DB || DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS;

    mTraceErrorLogger = new TraceErrorLogger();
}
/** Starts the watchdog thread held in mThread. */
public void start() {
mThread.start();
}
二、加入Watchdog监控方式
有两种方式加入Watchdog监控:
addThread():用于监测Handler线程,默认超时时长为60s.这种超时往往是所对应的handler线程消息处理得慢;
addMonitor(): 用于监控实现了Watchdog.Monitor接口的服务。这种超时可能是“watchdog.monitor”线程(即上文构造函数中创建的monitor thread)消息处理得慢,也可能是monitor迟迟拿不到锁;
// PowerManagerService.java
// Registers this service with the watchdog when the service starts.
@Override
public void onStart() {
// Register with the watchdog
Watchdog.getInstance().addMonitor(this); // this service implements the Watchdog.Monitor interface
Watchdog.getInstance().addThread(mHandler);
}
// Called by the watchdog to detect whether the PowerManagerService lock is deadlocked:
// the call only returns once mLock can be acquired.
@Override // Watchdog.Monitor implementation
public void monitor() {
// Grab and release lock for watchdog monitor to detect deadlocks.
synchronized (mLock) {
}
}
// Registers a monitor with the monitor checker. The monitor is first queued in mMonitorQueue;
// the queue is merged into mMonitors by scheduleCheckLocked() when no check is in flight.
public void addMonitor(Monitor monitor) {
synchronized (mLock) {
mMonitorChecker.addMonitorLocked(monitor);
}
}
// Queues a monitor for later inclusion in mMonitors. Watchdog.addMonitor() calls this while
// holding mLock.
void addMonitorLocked(Monitor monitor) {
// We don't want to update mMonitors when the Handler is in the middle of checking
// all monitors. We will update mMonitors on the next schedule if it is safe
mMonitorQueue.add(monitor);
}
/**
 * Adds a handler's thread to the set of threads checked by the watchdog.
 *
 * <p>The handler is wrapped in a HandlerChecker named after the thread that owns the
 * handler's looper, and appended to mHandlerCheckers with the default timeout.
 *
 * @param thread handler whose looper thread should be checked
 */
public void addThread(Handler thread) {
    synchronized (mLock) {
        final Thread looperThread = thread.getLooper().getThread();
        final HandlerChecker checker = new HandlerChecker(thread, looperThread.getName());
        mHandlerCheckers.add(withDefaultTimeout(checker));
    }
}
// Excerpt of HandlerChecker: a Runnable that is posted to a target Handler to check it.
// Only the scheduling logic is shown; lines shown as "......" are elided members.
public final class HandlerChecker implements Runnable {
......
// Schedules one check of the target handler, using the given value as the maximum wait time.
// Watchdog.run() calls this for every checker while holding mLock.
public void scheduleCheckLocked(long handlerCheckerTimeoutMillis) {
mWaitMaxMillis = handlerCheckerTimeoutMillis;
if (mCompleted) {
// Safe to update monitors in queue, Handler is not in the middle of work
mMonitors.addAll(mMonitorQueue);
mMonitorQueue.clear();
}
if ((mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling())
|| (mPauseCount > 0)) {
// Don't schedule until after resume OR
// If the target looper has recently been polling, then
// there is no reason to enqueue our checker on it since that
// is as good as it not being deadlocked. This avoid having
// to do a context switch to check the thread. Note that we
// only do this if we have no monitors since those would need to
// be executed at this point.
mCompleted = true;
return;
}
if (!mCompleted) {
// we already have a check in flight, so no need
return;
}
// Start a new check: mark it in flight, record when it started, and post this checker
// at the front of the target handler's message queue.
mCompleted = false;
mCurrentMonitor = null;
mStartTimeMillis = SystemClock.uptimeMillis();
mHandler.postAtFrontOfQueue(this);
}
......
}
三、Watchdog检测机制
当调用Watchdog.getInstance().start()时,则进入线程“watchdog”的run()方法,该方法分成两部分:
前半部用于监测是否触发超时(见本节,即第三节);
后半部当触发超时则输出各种信息(见第四节)。
// Excerpt (detection half) of the watchdog thread's main loop. Each iteration schedules a
// check on every HandlerChecker, waits for half of the watchdog timeout, and then evaluates
// the checkers' state. Lines shown as "......" are elided code.
private void run() {
boolean waitedHalf = false;
while (true) {
List<HandlerChecker> blockedCheckers = Collections.emptyList();
String subject = "";
boolean allowRestart = true;
int debuggerWasConnected = 0;
boolean doWaitedHalfDump = false;
// The value of mWatchdogTimeoutMillis might change while we are executing the loop.
// We store the current value to use a consistent value for all handlers.
final long watchdogTimeoutMillis = mWatchdogTimeoutMillis;
final long checkIntervalMillis = watchdogTimeoutMillis / 2;
final ArrayList<Integer> pids;
synchronized (mLock) {
// Schedule a check on every checker; a checker records its start time when a check is posted
long timeout = checkIntervalMillis; // half of the watchdog timeout (30s when the timeout is 60s)
// Make sure we (re)spin the checkers that have become idle within
// this wait-and-check interval
for (int i=0; i<mHandlerCheckers.size(); i++) {
HandlerCheckerAndTimeout hc = mHandlerCheckers.get(i);
// We pick the watchdog to apply every time we reschedule the checkers. The
// default timeout might have changed since the last run.
hc.checker().scheduleCheckLocked(hc.customTimeoutMillis()
.orElse(watchdogTimeoutMillis * Build.HW_TIMEOUT_MULTIPLIER));
}
if (debuggerWasConnected > 0) {
debuggerWasConnected--;
}
// NOTE: We use uptimeMillis() here because we do not want to increment the time we
// wait while asleep. If the device is asleep then the thing that we are waiting
// to timeout on is asleep as well and won't have a chance to run, causing a false
// positive on when to kill things.
long start = SystemClock.uptimeMillis();
// Keep waiting until the whole check interval has elapsed; the remaining time is
// recomputed after every wait() because wait() may return early
while (timeout > 0) {
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
try {
// Wait for up to the remaining timeout before evaluating the checkers
mLock.wait(timeout);
// Note: mHandlerCheckers and mMonitorChecker may have changed after waiting
} catch (InterruptedException e) {
Log.wtf(TAG, e);
}
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
timeout = checkIntervalMillis - (SystemClock.uptimeMillis() - start);
}
final int waitState = evaluateCheckerCompletionLocked();
if (waitState == COMPLETED) {
// The monitors have returned; reset
waitedHalf = false;
continue;
} else if (waitState == WAITING) {
// still waiting but within their configured intervals; back off and recheck
continue;
} else if (waitState == WAITED_HALF) {
if (!waitedHalf) {
Slog.i(TAG, "WAITED_HALF");
waitedHalf = true;
// We've waited half, but we'd need to do the stack trace dump w/o the lock.
blockedCheckers = getCheckersWithStateLocked(WAITED_HALF);
subject = describeCheckersLocked(blockedCheckers);
pids = new ArrayList<>(mInterestingJavaPids);
doWaitedHalfDump = true;
} else {
// The half-timeout state was already reported once; keep waiting
continue;
}
} else {
// something is overdue!
// Watchdog handling flow (shown in the next section of the article)
......
}
} // END synchronized (mLock)
......
}
}
// Excerpt of HandlerChecker showing how a check is scheduled on the target handler and what
// runs on that handler's thread. The dotted line near the end marks elided members.
public final class HandlerChecker implements Runnable {
// Schedules one check of the target handler, using the given value as the maximum wait time.
// Watchdog.run() calls this for every checker while holding mLock.
public void scheduleCheckLocked(long handlerCheckerTimeoutMillis) {
mWaitMaxMillis = handlerCheckerTimeoutMillis;
if (mCompleted) {
// Safe to update monitors in queue, Handler is not in the middle of work
mMonitors.addAll(mMonitorQueue);
mMonitorQueue.clear();
}
if ((mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling())
|| (mPauseCount > 0)) {
// Don't schedule until after resume OR
// If the target looper has recently been polling, then
// there is no reason to enqueue our checker on it since that
// is as good as it not being deadlocked. This avoid having
// to do a context switch to check the thread. Note that we
// only do this if we have no monitors since those would need to
// be executed at this point.
mCompleted = true;
return;
}
if (!mCompleted) {
// we already have a check in flight, so no need
return;
}
mCompleted = false;
mCurrentMonitor = null;
// Record the current uptime as the start of this check
mStartTimeMillis = SystemClock.uptimeMillis();
// Post this checker at the front of the handler's message queue; see run() below
mHandler.postAtFrontOfQueue(this);
}
// Runs on the target handler's thread once the posted check is dequeued: calls every
// registered monitor in turn and then marks the check as completed.
@Override
public void run() {
// Once we get here, we ensure that mMonitors does not change even if we call
// #addMonitorLocked because we first add the new monitors to mMonitorQueue and
// move them to mMonitors on the next schedule when mCompleted is true, at which
// point we have completed execution of this method.
final int size = mMonitors.size();
// Invoke monitor() on each registered monitor
for (int i = 0 ; i < size ; i++) {
synchronized (mLock) {
mCurrentMonitor = mMonitors.get(i);
}
// monitor() is called without holding mLock
mCurrentMonitor.monitor();
}
synchronized (mLock) {
mCompleted = true;
mCurrentMonitor = null;
}
}
..............
}
四、Watchdog处理流程
// Excerpt (handling half) of the watchdog thread's main loop: what happens once at least one
// checker is overdue. The dotted line marks the elided detection half of the loop body.
private void run() {
boolean waitedHalf = false;
while (true) {
..............
// something is overdue!
// Watchdog handling flow
blockedCheckers = getCheckersWithStateLocked(OVERDUE);
subject = describeCheckersLocked(blockedCheckers);
allowRestart = mAllowRestart;
pids = new ArrayList<>(mInterestingJavaPids);
// If we got here, that means that the system is most likely hung.
//
// First collect stack traces from all threads of the system process.
//
// Then, if we reached the full timeout, kill this process so that the system will
// restart. If we reached half of the timeout, just log some information and continue.
// Collect log information
logWatchog(doWaitedHalfDump, subject, pids);
if (doWaitedHalfDump) {
// We have waited for only half of the timeout, we continue to wait for the duration
// of the full timeout before killing the process.
continue;
}
IActivityController controller;
synchronized (mLock) {
controller = mController;
}
if (controller != null) {
Slog.i(TAG, "Reporting stuck state to activity controller");
try {
Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
// 1 = keep waiting, -1 = kill system
int res = controller.systemNotResponding(subject);
if (res >= 0) {
Slog.i(TAG, "Activity controller requested to coninue to wait");
waitedHalf = false;
continue;
}
} catch (RemoteException e) {
// Ignored: if the controller cannot be reached, fall through to the kill decision below
}
}
// Only kill the process if the debugger is not attached.
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
if (debuggerWasConnected >= 2) {
Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
} else if (debuggerWasConnected > 0) {
Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
} else if (!allowRestart) {
Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
} else {
Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
WatchdogDiagnostics.diagnoseCheckers(blockedCheckers);
Slog.w(TAG, "*** GOODBYE!");
if (!Build.IS_USER && isCrashLoopFound()
&& !WatchdogProperties.should_ignore_fatal_count().orElse(false)) {
breakCrashLoop();
}
// Kill this process (system_server) so that the system restarts
Process.killProcess(Process.myPid());
System.exit(10);
}
waitedHalf = false;
}
}
收集日志信息:将PSI状态信息、CPU使用情况以及system_server和几个native进程的栈信息保存到dropbox;在完整超时(非half watchdog)的情况下,还会触发kernel把所有被阻塞线程的信息和所有CPU的backtrace打印到kernel log。
// Collects and records diagnostics for a watchdog (or half-watchdog) event: critical event
// log, trace markers, stats/event logs, PSI state, stack traces of the given pids and of the
// interesting native processes, CPU usage, and finally a dropbox entry.
//
// halfWatchdog: true when only half of the timeout has elapsed ("pre_watchdog" dropbox tag)
// subject: description of the blocked checkers
// pids: Java process ids whose stacks should be dumped
private void logWatchog(boolean halfWatchdog, String subject, ArrayList<Integer> pids) {
// Get critical event log before logging the half watchdog so that it doesn't
// occur in the log.
String criticalEvents =
CriticalEventLog.getInstance().logLinesForSystemServerTraceFile();
final UUID errorId = mTraceErrorLogger.generateErrorId();
if (mTraceErrorLogger.isAddErrorIdEnabled()) {
mTraceErrorLogger.addProcessInfoAndErrorIdToTrace("system_server", Process.myPid(),
errorId);
mTraceErrorLogger.addSubjectToTrace(subject, errorId);
}
final String dropboxTag;
if (halfWatchdog) {
dropboxTag = "pre_watchdog";
CriticalEventLog.getInstance().logHalfWatchdog(subject);
FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_PRE_WATCHDOG_OCCURRED);
} else {
dropboxTag = "watchdog";
CriticalEventLog.getInstance().logWatchdog(subject, errorId);
EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
// Log the atom as early as possible since it is used as a mechanism to trigger
// Perfetto. Ideally, the Perfetto trace capture should happen as close to the
// point in time when the Watchdog happens as possible.
FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_WATCHDOG_OCCURRED, subject);
}
long anrTime = SystemClock.uptimeMillis();
StringBuilder report = new StringBuilder();
report.append(ResourcePressureUtil.currentPsiState());
ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(false);
StringWriter tracesFileException = new StringWriter();
// Dump stack traces of the given Java pids and of the interesting native processes
final File stack = StackTracesDumpHelper.dumpStackTraces(
pids, processCpuTracker, new SparseBooleanArray(),
CompletableFuture.completedFuture(getInterestingNativePids()), tracesFileException,
subject, criticalEvents, Runnable::run, /* latencyTracker= */null);
// Give some extra time to make sure the stack traces get written.
// The system's been hanging for a while, another second or two won't hurt much.
// NOTE(review): the actual sleep below is 5000 ms, not "a second or two".
SystemClock.sleep(5000);
processCpuTracker.update();
report.append(processCpuTracker.printCurrentState(anrTime));
report.append(tracesFileException.getBuffer());
// Full watchdog only: ask the kernel to log blocked threads and per-CPU backtraces
if (!halfWatchdog) {
// Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the
// kernel log
doSysRq('w');
doSysRq('l');
}
// Try to add the error to the dropbox, but assuming that the ActivityManager
// itself may be deadlocked. (which has happened, causing this statement to
// deadlock and the watchdog as a whole to be ineffective)
// Write the report to dropbox from a separate thread so this method can stop waiting for it
Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
public void run() {
// If a watched thread hangs before init() is called, we don't have a
// valid mActivity. So we can't log the error to dropbox.
if (mActivity != null) {
mActivity.addErrorToDropBox(
dropboxTag, null, "system_server", null, null, null,
null, report.toString(), stack, null, null, null,
errorId);
}
}
};
dropboxThread.start();
try {
dropboxThread.join(2000); // wait up to 2 seconds for it to return.
} catch (InterruptedException ignored) { }
}
/**
 * Returns the pids of the native processes whose stacks should be dumped.
 *
 * <p>The result combines the pids added by addInterestingAidlPids and addInterestingHidlPids
 * with the pids of the commands listed in NATIVE_STACKS_OF_INTEREST, without duplicates.
 *
 * @return a new list holding each collected pid once
 */
static ArrayList<Integer> getInterestingNativePids() {
    final HashSet<Integer> uniquePids = new HashSet<>();
    addInterestingAidlPids(uniquePids);
    addInterestingHidlPids(uniquePids);

    final int[] commandPids = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST);
    if (commandPids == null) {
        // No pids were returned for the listed commands
        return new ArrayList<Integer>(uniquePids);
    }
    for (final int pid : commandPids) {
        uniquePids.add(pid);
    }
    return new ArrayList<Integer>(uniquePids);
}
/**
 * Which native processes to dump into dropbox's stack traces. Entries are passed as command
 * names to Process.getPidsForCommands by getInterestingNativePids().
 */
public static final String[] NATIVE_STACKS_OF_INTEREST = {
    // Daemons identified by executable path
    "/system/bin/audioserver",
    "/system/bin/cameraserver",
    "/system/bin/drmserver",
    "/system/bin/keystore2",
    "/system/bin/mediadrmserver",
    "/system/bin/mediaserver",
    "/system/bin/netd",
    "/system/bin/sdcard",
    "/system/bin/surfaceflinger",
    "/system/bin/vold",
    // Processes identified by name
    "media.extractor", // system/bin/mediaextractor
    "media.metrics", // system/bin/mediametrics
    "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service
    "media.swcodec", // /apex/com.android.media.swcodec/bin/mediaswcodec
    "media.transcoding", // Media transcoding service
    "com.android.bluetooth", // Bluetooth service
    "/apex/com.android.os.statsd/bin/statsd", // Stats daemon
};