Android WatchDog机制

Youth cowboy

已于 2024-01-18 14:30:57 修改

阅读量1k

点赞数 5

分类专栏： stability 文章标签： android

于 2024-01-10 15:50:26 首次发布

本文链接：https://blog.csdn.net/youthcowboy/article/details/135505482

版权

stability 专栏收录该内容

12 篇文章

订阅专栏

本文详细介绍了Android系统中WatchDog的初始化过程、如何加入监控、以及其检测机制，包括对关键服务的死锁检测和Handler线程消息处理超时的管理，确保系统稳定性。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

Android系统中，有硬件WatchDog用于定时检测关键硬件是否正常工作，类似地，在framework层有一个软件WatchDog用于定期检测关键系统服务是否发生死锁事件、Handler线程消息处理是否超时。

一、WatchDog初始化

SystemServer.java：在system_server进程启动阶段创建Watchdog对象，并启动Watchdog线程。

/**
 * Excerpt of the bootstrap phase: obtains the Watchdog and starts it before the
 * remaining bootstrap work, then registers it with the dumper.
 *
 * NOTE(review): the lines marked "......" are elided in this excerpt; the matching
 * t.traceEnd() for the "StartWatchdog" section is presumably among them — confirm
 * against the full file.
 *
 * @param t trace/log helper used to mark the "StartWatchdog" section
 */
private void startBootstrapServices(@NonNull TimingsTraceAndSlog t) {
        ......
        // Start the watchdog as early as possible so we can crash the system server
        // if we deadlock during early boot
        t.traceBegin("StartWatchdog");
        final Watchdog watchdog = Watchdog.getInstance();    // obtain the Watchdog instance
        watchdog.start();    // start the watchdog
        mDumper.addDumpable(watchdog);
        ......
}

Watchdog.java，创建对象，并启动内部线程，执行run方法。

/**
 * Creates the watchdog: prepares the "watchdog" thread (it is not started here) and
 * registers one HandlerChecker per handler thread that should be watched.
 */
private Watchdog() {
        mThread = new Thread(this::run, "watchdog");

        // One handler checker is set up per common thread we want to watch. The
        // background thread is intentionally left out: it may hold long-running
        // operations and gives no guarantee about how timely they are.
        //
        // Monitors run on their own dedicated thread so that checking them does not
        // add lock contention to the other watched threads.
        final ServiceThread monitorThread = new ServiceThread("watchdog.monitor",
                        android.os.Process.THREAD_PRIORITY_DEFAULT, true /*allowIo*/);
        monitorThread.start();

        // Wrap the monitor thread's handler in a HandlerChecker and register it first.
        final Handler monitorHandler = new Handler(monitorThread.getLooper());
        mMonitorChecker = new HandlerChecker(monitorHandler, "monitor thread");
        mHandlerCheckers.add(withDefaultTimeout(mMonitorChecker));

        // Foreground thread.
        final HandlerChecker fgChecker =
                        new HandlerChecker(FgThread.getHandler(), "foreground thread");
        mHandlerCheckers.add(withDefaultTimeout(fgChecker));

        // Main thread. Only a quick check is done because UI may be running on it.
        final Handler mainHandler = new Handler(Looper.getMainLooper());
        final HandlerChecker mainChecker = new HandlerChecker(mainHandler, "main thread");
        mHandlerCheckers.add(withDefaultTimeout(mainChecker));

        // Shared UI thread.
        final HandlerChecker uiChecker =
                        new HandlerChecker(UiThread.getHandler(), "ui thread");
        mHandlerCheckers.add(withDefaultTimeout(uiChecker));

        // IO thread.
        final HandlerChecker ioChecker =
                        new HandlerChecker(IoThread.getHandler(), "i/o thread");
        mHandlerCheckers.add(withDefaultTimeout(ioChecker));

        // Display thread.
        final HandlerChecker displayChecker =
                        new HandlerChecker(DisplayThread.getHandler(), "display thread");
        mHandlerCheckers.add(withDefaultTimeout(displayChecker));

        // Animation thread.
        final HandlerChecker animationChecker =
                        new HandlerChecker(AnimationThread.getHandler(), "animation thread");
        mHandlerCheckers.add(withDefaultTimeout(animationChecker));

        // Surface animation thread.
        final HandlerChecker surfaceAnimationChecker = new HandlerChecker(
                        SurfaceAnimationThread.getHandler(), "surface animation thread");
        mHandlerCheckers.add(withDefaultTimeout(surfaceAnimationChecker));

        // Binder threads are watched through a monitor rather than a handler checker.
        addMonitor(new BinderThreadMonitor());

        // This process is always among the Java pids whose stacks get dumped.
        mInterestingJavaPids.add(Process.myPid());

        // See the notes on DEFAULT_TIMEOUT.
        assert DB ||
                        DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS;

        mTraceErrorLogger = new TraceErrorLogger();
}

/**
 * Starts the watchdog's internal thread (mThread).
 */
public void start() {
    mThread.start();
}

二、加入Watchdog监控方式

有两种方式加入Watchdog监控：

addThread()：用于监测Handler线程，默认超时时长为60s。这种超时往往是因为对应的Handler线程消息处理得慢；

addMonitor()：用于监控实现了Watchdog.Monitor接口的服务。各服务的monitor()方法在“watchdog.monitor”线程（即构造函数中创建的monitor thread）上依次执行，因此这种超时可能是该线程消息处理得慢，也可能是monitor()迟迟拿不到锁。

// PowerManagerService.java

/**
 * Registers this service with the watchdog: once as a monitor (this service
 * implements Watchdog.Monitor) and once through its handler.
 */
@Override
public void onStart() {
    // NOTE(review): assumes Watchdog.getInstance() returns the same instance on every
    // call, so fetching it once is equivalent to fetching it per call — confirm.
    final Watchdog watchdog = Watchdog.getInstance();
    watchdog.addMonitor(this);
    watchdog.addThread(mHandler);
}

// Called by the watchdog to detect whether PowerManagerService is deadlocked on mLock.
@Override // Watchdog.Monitor implementation
public void monitor() {
    // Grab and release lock for watchdog monitor to detect deadlocks.
    // The empty body is intentional: the only thing that matters is whether mLock
    // can be acquired. If it cannot, this call blocks.
    synchronized (mLock) {
    }
}

// Registers a monitor: under mLock, hands it to mMonitorChecker, which places it in
// its pending queue (mMonitorQueue). The queue is merged into mMonitors later, by
// HandlerChecker#scheduleCheckLocked, once no check is in flight.
public void addMonitor(Monitor monitor) {
    synchronized (mLock) {
            mMonitorChecker.addMonitorLocked(monitor);
    }
}

// Appends the monitor to the pending queue instead of to mMonitors directly.
// NOTE(review): the "Locked" suffix suggests the caller must hold the watchdog lock;
// the addMonitor() caller in this file does.
void addMonitorLocked(Monitor monitor) {
        // We don't want to update mMonitors when the Handler is in the middle of checking
        // all monitors. We will update mMonitors on the next schedule if it is safe
        mMonitorQueue.add(monitor);
}
/**
 * Starts watching a handler thread: wraps the handler in a HandlerChecker named after
 * the handler's looper thread and appends it, with the default timeout, to
 * mHandlerCheckers.
 *
 * @param thread handler whose looper thread should be watched
 */
public void addThread(Handler thread) {
    synchronized (mLock) {
        final HandlerChecker checker =
                new HandlerChecker(thread, thread.getLooper().getThread().getName());
        mHandlerCheckers.add(withDefaultTimeout(checker));
    }
}

/**
 * Excerpt of HandlerChecker: a Runnable that is posted to a watched handler so the
 * watchdog can tell whether that handler's thread is still processing messages.
 * Lines marked "......" are elided in this excerpt.
 */
public final class HandlerChecker implements Runnable {
......
    /**
     * Schedules one check on the watched handler unless a check is already in
     * flight or the check can be skipped.
     *
     * The caller in Watchdog's run() invokes this while holding mLock.
     *
     * @param handlerCheckerTimeoutMillis timeout to store in mWaitMaxMillis for this
     *        checker
     */
    public void scheduleCheckLocked(long handlerCheckerTimeoutMillis) {
            mWaitMaxMillis = handlerCheckerTimeoutMillis;
            if (mCompleted) {
                    // Safe to update monitors in queue, Handler is not in the middle of work
                    mMonitors.addAll(mMonitorQueue);
                    mMonitorQueue.clear();
            }
            if ((mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling())
                            || (mPauseCount > 0)) {
                    // Don't schedule until after resume OR
                    // If the target looper has recently been polling, then
                    // there is no reason to enqueue our checker on it since that
                    // is as good as it not being deadlocked.  This avoid having
                    // to do a context switch to check the thread. Note that we
                    // only do this if we have no monitors since those would need to
                    // be executed at this point.
                    mCompleted = true;
                    return;
            }
            if (!mCompleted) {
                    // we already have a check in flight, so no need
                    return;
            }
    
            // Start a new check: mark it in flight, record when it started, and post
            // this checker at the front of the watched handler's queue.
            mCompleted = false;
            mCurrentMonitor = null;
            mStartTimeMillis = SystemClock.uptimeMillis();
            mHandler.postAtFrontOfQueue(this);
    }
    ......
}

三、Watchdog检测机制

当调用Watchdog.getInstance().start()时，线程“watchdog”开始执行run()方法。该方法分成两部分：

前半部用于监测是否触发超时（见本节“三、Watchdog检测机制”）；

后半部在触发超时后输出各种信息（见下文“四、Watchdog处理流程”）。


/**
 * Excerpt of the watchdog thread's main loop (detection half). Each iteration
 * schedules a check on every HandlerChecker, waits for the check interval, then
 * evaluates the combined completion state. Lines marked "......" are elided in this
 * excerpt.
 */
private void run() {
        boolean waitedHalf = false;

        while (true) {
                List<HandlerChecker> blockedCheckers = Collections.emptyList();
                String subject = "";
                boolean allowRestart = true;
                int debuggerWasConnected = 0;
                boolean doWaitedHalfDump = false;
                // The value of mWatchdogTimeoutMillis might change while we are executing the loop.
                // We store the current value to use a consistent value for all handlers.
                final long watchdogTimeoutMillis = mWatchdogTimeoutMillis;
                final long checkIntervalMillis = watchdogTimeoutMillis / 2;
                final ArrayList<Integer> pids;
                synchronized (mLock) {
                        // Schedule a check on every HandlerChecker; a checker that actually
                        // posts a check records its start time (mStartTimeMillis).
                        long timeout = checkIntervalMillis;    // half of the watchdog timeout (presumably 30 s by default; the default is not shown in this excerpt)
                        // Make sure we (re)spin the checkers that have become idle within
                        // this wait-and-check interval
                        for (int i=0; i<mHandlerCheckers.size(); i++) {
                                HandlerCheckerAndTimeout hc = mHandlerCheckers.get(i);
                                // We pick the watchdog to apply every time we reschedule the checkers. The
                                // default timeout might have changed since the last run.
                                hc.checker().scheduleCheckLocked(hc.customTimeoutMillis()
                                                .orElse(watchdogTimeoutMillis * Build.HW_TIMEOUT_MULTIPLIER));
                        }

                        if (debuggerWasConnected > 0) {
                                debuggerWasConnected--;
                        }

                        // NOTE: We use uptimeMillis() here because we do not want to increment the time we
                        // wait while asleep. If the device is asleep then the thing that we are waiting
                        // to timeout on is asleep as well and won't have a chance to run, causing a false
                        // positive on when to kill things.
                        long start = SystemClock.uptimeMillis();
                        // Loop so that the full check interval elapses before continuing,
                        // even if wait() returns early.
                        while (timeout > 0) {
                                if (Debug.isDebuggerConnected()) {
                                        debuggerWasConnected = 2;
                                }
                                try {
                                        // Wait for the remaining time; the check is evaluated afterwards.
                                        mLock.wait(timeout);
                                        // Note: mHandlerCheckers and mMonitorChecker may have changed after waiting
                                } catch (InterruptedException e) {
                                        Log.wtf(TAG, e);
                                }
                                if (Debug.isDebuggerConnected()) {
                                        debuggerWasConnected = 2;
                                }
                                // Recompute what is left of the interval from uptime.
                                timeout = checkIntervalMillis - (SystemClock.uptimeMillis() - start);
                        }

                        final int waitState = evaluateCheckerCompletionLocked();
                        if (waitState == COMPLETED) {
                                // The monitors have returned; reset
                                waitedHalf = false;
                                continue;
                        } else if (waitState == WAITING) {
                                // still waiting but within their configured intervals; back off and recheck
                                continue;
                        } else if (waitState == WAITED_HALF) {
                                if (!waitedHalf) {
                                        Slog.i(TAG, "WAITED_HALF");
                                        waitedHalf = true;
                                        // We've waited half, but we'd need to do the stack trace dump w/o the lock.
                                        blockedCheckers = getCheckersWithStateLocked(WAITED_HALF);
                                        subject = describeCheckersLocked(blockedCheckers);
                                        pids = new ArrayList<>(mInterestingJavaPids);
                                        doWaitedHalfDump = true;
                                } else {
                                        // The half-timeout dump was already requested for this stall.
                                        continue;
                                }
                        } else {
                                // something is overdue!
                                // Watchdog handling flow (elided in this excerpt).
                                ......
                        }
                } // END synchronized (mLock)

                ......
        }
}

/**
 * Excerpt of HandlerChecker showing both halves of a check: scheduleCheckLocked()
 * posts this Runnable to the watched handler, and run() executes on that handler to
 * call the registered monitors. Lines marked ".............." are elided in this
 * excerpt.
 */
public final class HandlerChecker implements Runnable { 
    /**
     * Schedules one check on the watched handler unless a check is already in
     * flight or the check can be skipped.
     *
     * @param handlerCheckerTimeoutMillis timeout to store in mWaitMaxMillis for this
     *        checker
     */
    public void scheduleCheckLocked(long handlerCheckerTimeoutMillis) {
        mWaitMaxMillis = handlerCheckerTimeoutMillis;
        if (mCompleted) {
                // Safe to update monitors in queue, Handler is not in the middle of work
                mMonitors.addAll(mMonitorQueue);
                mMonitorQueue.clear();
        }
        if ((mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling())
                        || (mPauseCount > 0)) {
                // Don't schedule until after resume OR
                // If the target looper has recently been polling, then
                // there is no reason to enqueue our checker on it since that
                // is as good as it not being deadlocked.  This avoid having
                // to do a context switch to check the thread. Note that we
                // only do this if we have no monitors since those would need to
                // be executed at this point.
                mCompleted = true;
                return;
        }
        if (!mCompleted) {
                // we already have a check in flight, so no need
                return;
        }
    
        mCompleted = false;
        mCurrentMonitor = null;
        // Record the current uptime as the start of this check.
        mStartTimeMillis = SystemClock.uptimeMillis();
        // Post this checker at the very front of the handler's message queue; the
        // handler then executes run() below.
        mHandler.postAtFrontOfQueue(this);
    }
    
    /**
     * Body of the check, executed via the handler this checker was posted to: calls
     * every monitor in mMonitors in order, then marks the check as completed.
     */
    @Override
    public void run() {
            // Once we get here, we ensure that mMonitors does not change even if we call
            // #addMonitorLocked because we first add the new monitors to mMonitorQueue and
            // move them to mMonitors on the next schedule when mCompleted is true, at which
            // point we have completed execution of this method.
            final int size = mMonitors.size();
            // Invoke monitor() on each registered monitor. mCurrentMonitor is updated
            // under mLock, but monitor() itself is called without holding it.
            for (int i = 0 ; i < size ; i++) {
                    synchronized (mLock) {
                            mCurrentMonitor = mMonitors.get(i);
                    }
                    mCurrentMonitor.monitor();
            }
    
            // All monitors returned: mark the check as completed.
            synchronized (mLock) {
                    mCompleted = true;
                    mCurrentMonitor = null;
            }
    }
    ..............

}

四、Watchdog处理流程

/**
 * Excerpt of the watchdog thread's main loop (handling half): what happens once
 * checkers are blocked. It logs diagnostics, consults the activity controller if one
 * is set, and kills this process unless a debugger is or was attached or restart is
 * disallowed. Lines marked ".............." are elided in this excerpt.
 */
private void run() {
        boolean waitedHalf = false;

        while (true) {
            ..............
                // something is overdue!
                // Watchdog handling flow: collect the overdue checkers and the pids to dump.
                blockedCheckers = getCheckersWithStateLocked(OVERDUE);
                subject = describeCheckersLocked(blockedCheckers);
                allowRestart = mAllowRestart;
                pids = new ArrayList<>(mInterestingJavaPids);

                // If we got here, that means that the system is most likely hung.
                //
                // First collect stack traces from all threads of the system process.
                //
                // Then, if we reached the full timeout, kill this process so that the system will
                // restart. If we reached half of the timeout, just log some information and continue.
                // Collect the log information.
                logWatchog(doWaitedHalfDump, subject, pids);

                if (doWaitedHalfDump) {
                        // We have waited for only half of the timeout, we continue to wait for the duration
                        // of the full timeout before killing the process.
                        continue;
                }

                // Read the controller reference under the lock, then call it without the lock.
                IActivityController controller;
                synchronized (mLock) {
                        controller = mController;
                }
                if (controller != null) {
                        Slog.i(TAG, "Reporting stuck state to activity controller");
                        try {
                                Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
                                // 1 = keep waiting, -1 = kill system
                                int res = controller.systemNotResponding(subject);
                                if (res >= 0) {
                                        Slog.i(TAG, "Activity controller requested to coninue to wait");
                                        waitedHalf = false;
                                        continue;
                                }
                        } catch (RemoteException e) {
                                // Deliberately ignored: execution falls through to the kill decision below.
                        }
                }

                // Only kill the process if the debugger is not attached.
                if (Debug.isDebuggerConnected()) {
                        debuggerWasConnected = 2;
                }
                if (debuggerWasConnected >= 2) {
                        Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
                } else if (debuggerWasConnected > 0) {
                        Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
                } else if (!allowRestart) {
                        Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
                } else {
                        Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
                        WatchdogDiagnostics.diagnoseCheckers(blockedCheckers);
                        Slog.w(TAG, "*** GOODBYE!");
                        if (!Build.IS_USER && isCrashLoopFound()
                                        && !WatchdogProperties.should_ignore_fatal_count().orElse(false)) {
                                breakCrashLoop();
                        }
                        // Kill this process (system_server) so that the system restarts.
                        Process.killProcess(Process.myPid());
                        System.exit(10);
                }

                waitedHalf = false;
        }
}

logWatchog()负责收集日志信息：将PSI状态信息、CPU使用情况以及system_server和若干native进程的栈信息保存到dropbox；在完整超时（非half watchdog）时，还会触发kernel把所有阻塞线程的信息和所有CPU的backtrace打印到kernel log。

/**
 * Collects diagnostics for a watchdog (or half-watchdog) event and tries to store
 * them in dropbox.
 *
 * @param halfWatchdog true when only half of the timeout has elapsed; selects the
 *        "pre_watchdog" dropbox tag and skips the kernel SysRq dumps
 * @param subject description of the blocked checkers
 * @param pids Java pids whose stack traces are dumped
 */
private void logWatchog(boolean halfWatchdog, String subject, ArrayList<Integer> pids) {
        // Get critical event log before logging the half watchdog so that it doesn't
        // occur in the log.
        String criticalEvents =
                        CriticalEventLog.getInstance().logLinesForSystemServerTraceFile();
        final UUID errorId = mTraceErrorLogger.generateErrorId();
        if (mTraceErrorLogger.isAddErrorIdEnabled()) {
                mTraceErrorLogger.addProcessInfoAndErrorIdToTrace("system_server", Process.myPid(),
                                errorId);
                mTraceErrorLogger.addSubjectToTrace(subject, errorId);
        }

        final String dropboxTag;
        if (halfWatchdog) {
                dropboxTag = "pre_watchdog";
                CriticalEventLog.getInstance().logHalfWatchdog(subject);
                FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_PRE_WATCHDOG_OCCURRED);
        } else {
                dropboxTag = "watchdog";
                CriticalEventLog.getInstance().logWatchdog(subject, errorId);
                EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
                // Log the atom as early as possible since it is used as a mechanism to trigger
                // Perfetto. Ideally, the Perfetto trace capture should happen as close to the
                // point in time when the Watchdog happens as possible.
                FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_WATCHDOG_OCCURRED, subject);
        }

        long anrTime = SystemClock.uptimeMillis();
        StringBuilder report = new StringBuilder();
        report.append(ResourcePressureUtil.currentPsiState());
        ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(false);
        StringWriter tracesFileException = new StringWriter();
        // Dump stack traces of the given Java pids and of the interesting native processes.
        final File stack = StackTracesDumpHelper.dumpStackTraces(
                        pids, processCpuTracker, new SparseBooleanArray(),
                        CompletableFuture.completedFuture(getInterestingNativePids()), tracesFileException,
                        subject, criticalEvents, Runnable::run, /* latencyTracker= */null);
        // Give some extra time to make sure the stack traces get written.
        // The system's been hanging for a while, another second or two won't hurt much.
        SystemClock.sleep(5000);
        processCpuTracker.update();
        report.append(processCpuTracker.printCurrentState(anrTime));
        report.append(tracesFileException.getBuffer());
        // On a full watchdog only, have the kernel dump blocked-thread information
        // and per-CPU backtraces to the kernel log.
        if (!halfWatchdog) {
                // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the
                // kernel log
                doSysRq('w');
                doSysRq('l');
        }

        // Try to add the error to the dropbox, but assuming that the ActivityManager
        // itself may be deadlocked.  (which has happened, causing this statement to
        // deadlock and the watchdog as a whole to be ineffective)
        // Write the report to dropbox from a separate thread so that this method can
        // stop waiting for it after a bounded time.
        Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
                        public void run() {
                                // If a watched thread hangs before init() is called, we don't have a
                                // valid mActivity. So we can't log the error to dropbox.
                                if (mActivity != null) {
                                        mActivity.addErrorToDropBox(
                                                        dropboxTag, null, "system_server", null, null, null,
                                                        null, report.toString(), stack, null, null, null,
                                                        errorId);
                                }
                        }
                };
        dropboxThread.start();
        try {
                dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
        } catch (InterruptedException ignored) { }
}

/**
 * Returns the pids of the native processes whose stacks should be dumped: those added
 * by the AIDL and HIDL helpers plus the processes matching NATIVE_STACKS_OF_INTEREST.
 * Duplicates are removed because the pids are gathered in a set first.
 */
static ArrayList<Integer> getInterestingNativePids() {
        final HashSet<Integer> collected = new HashSet<>();
        addInterestingAidlPids(collected);
        addInterestingHidlPids(collected);

        final int[] byCommand = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST);
        if (byCommand == null) {
                // No command lookup result; return what the helpers gathered.
                return new ArrayList<>(collected);
        }

        for (int idx = 0; idx < byCommand.length; idx++) {
                collected.add(byCommand[idx]);
        }
        return new ArrayList<>(collected);
}

/**
 * Command names of the native processes whose stack traces are included in the
 * dropbox stack dump.
 */
public static final String[] NATIVE_STACKS_OF_INTEREST = {
        // Daemons identified by executable path.
        "/system/bin/audioserver",
        "/system/bin/cameraserver",
        "/system/bin/drmserver",
        "/system/bin/keystore2",
        "/system/bin/mediadrmserver",
        "/system/bin/mediaserver",
        "/system/bin/netd",
        "/system/bin/sdcard",
        "/system/bin/surfaceflinger",
        "/system/bin/vold",
        // Media services identified by process name.
        "media.extractor",    // system/bin/mediaextractor
        "media.metrics",      // system/bin/mediametrics
        "media.codec",        // vendor/bin/hw/android.hardware.media.omx@1.0-service
        "media.swcodec",      // /apex/com.android.media.swcodec/bin/mediaswcodec
        "media.transcoding",  // Media transcoding service
        // Others.
        "com.android.bluetooth",                   // Bluetooth service
        "/apex/com.android.os.statsd/bin/statsd",  // Stats daemon
};