一、简介
Watchdog 是一种软件狗(软件看门狗),作用与硬件狗类似。硬件狗是被动等“喂”:系统主要线程需要主动调用硬件接口,告诉系统本线程是正常的。而 Android framework 层中的软件狗本身就是一个线程,会主动询问系统关键线程和服务是否正常。Android 抛弃硬件狗,大概是因为采用硬件狗的方式会使每个系统线程变得臃肿、复杂,而且不能询问“死”的系统资源;采用软件狗的方式不仅能监视活着的线程,还能监视“死”的系统资源,并且将监视功能集中到一个线程上,使系统模块划分更清晰,可扩展性更高。
二、原理
1、watchdog线程会每隔一段时间循环询问系统线程和服务,如果某次询问发现关键线程或者关键服务在一段时间内没有应答,则会杀死SystemServer,进而诱发Zygote自杀。Zygote死亡的信号传递给init进程后,init进程会杀死所有Zygote的子进程并重启Zygote。这样手机就经历了一次软重启。
2、watchdog通过给线程发送消息来检查线程:如果消息被处理,则认为线程没有异常。
3、watchdog通过获取服务的锁来检查服务:如果能正确获得资源锁,则认为服务没有异常。
三、代码分析
package com.android.server;
//继承自线程,重写了run方法
public class Watchdog extends Thread {
// Tag used for this class's log messages.
static final String TAG = "Watchdog";
static final boolean localLOGV = false || false;
// Set this to true to use debug default values.
static final boolean DB = false;
// Set this to true to have the watchdog record kernel thread stacks when it fires
// (i.e. whether the kernel stacks are dumped).
static final boolean RECORD_KERNEL_THREADS = true;
// Default time, in milliseconds, a watched thread is given to respond:
// 10 seconds when DB is set, 60 seconds otherwise.
static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000;
// Interval, in milliseconds, between two watchdog polls: half of the default timeout.
static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;
// These are temporally ordered: larger values as lateness increases
// Possible results of a check:
static final int COMPLETED = 0;// the check has finished; the system is running fine
static final int WAITING = 1;// a check is in flight and has waited less than half its timeout
static final int WAITED_HALF = 2;// a check has been waiting for at least half its timeout
static final int OVERDUE = 3;// a check exceeded its timeout: report the error and restart
// Which native processes to dump into dropbox's stack traces
// (their stacks are dumped together with the system process's when the watchdog fires).
public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
"/system/bin/mediaserver",
"/system/bin/sdcard",
"/system/bin/surfaceflinger"
};
// Singleton instance of this class, created lazily by getInstance().
static Watchdog sWatchdog;

// The central data structure: one HandlerChecker per watched handler thread.
// HandlerChecker implements Runnable; its run() executes on the watched thread and
// invokes the monitors registered on it, so it checks threads and services alike.
// The element type is spelled out so that get(i) yields a HandlerChecker rather than Object.
final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<HandlerChecker>();

// Checker that carries the service monitors registered through addMonitor().
final HandlerChecker mMonitorChecker;

// Assigned in init(); not read anywhere in the visible code.
ContentResolver mResolver;
ActivityManagerService mActivity;

// Pid of the "com.android.phone" process, recorded by processStarted(). The phone
// process is treated specially: its stacks are dumped along with the system process's.
int mPhonePid;

// Controller that is told (systemNotResponding) when the system appears hung and that
// can ask the watchdog to keep waiting instead of killing the system process.
IActivityController mController;

// Whether the watchdog is allowed to kill the system process when it fires.
boolean mAllowRestart = true;

// Timestamp format appended to the renamed stack-trace file when the watchdog fires.
SimpleDateFormat mTraceDateFormat = new SimpleDateFormat("dd_MMM_HH_mm_ss.SSS");
/**
 * Used for checking status of handle threads and scheduling monitor callbacks.
 *
 * <p>Wraps one watched thread. A check consists of posting this object to the front of the
 * thread's message queue; when the thread gets around to running it, {@link #run()} invokes
 * every registered {@link Monitor} and then marks the check as completed.
 *
 * <p>Methods with the {@code Locked} suffix read or write state that {@link #run()} updates
 * under the enclosing {@code Watchdog} instance's lock; callers are presumably expected to
 * hold that lock.
 */
public final class HandlerChecker implements Runnable {
    // Handler bound to the looper of the watched thread.
    private final Handler mHandler;
    // Name of the watched thread, used when describing a blocked state.
    private final String mName;
    // Maximum time, in milliseconds, a check may stay outstanding before it is overdue.
    private final long mWaitMax;
    // Monitors registered on this checker; each one is invoked on the watched thread.
    // The element type is explicit so that get(i) can be assigned to a Monitor.
    private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
    // True when no check is in flight (initially, or after the last check finished).
    private boolean mCompleted;
    // Monitor currently being invoked by run(), or null when none is.
    private Monitor mCurrentMonitor;
    // SystemClock.uptimeMillis() value taken when the in-flight check was scheduled.
    private long mStartTime;

    /**
     * @param handler handler of the thread to watch
     * @param name name used for the thread in reports
     * @param waitMaxMillis time in milliseconds after which an unanswered check is overdue
     */
    HandlerChecker(Handler handler, String name, long waitMaxMillis) {
        mHandler = handler;
        mName = name;
        mWaitMax = waitMaxMillis;
        mCompleted = true;
    }

    /**
     * Registers a monitor to be invoked on the watched thread during every check.
     *
     * @param monitor the monitor to add
     */
    public void addMonitor(Monitor monitor) {
        mMonitors.add(monitor);
    }

    /**
     * Starts a new check of the watched thread unless one is already in flight.
     */
    public void scheduleCheckLocked() {
        if (mMonitors.size() == 0 && mHandler.getLooper().isIdling()) {
            // If the target looper is or just recently was idling, then there is no
            // reason to enqueue our checker on it since that is as good as it not being
            // deadlocked. This avoids a context switch just to check the thread. The
            // shortcut is only taken when no monitors are registered, since monitors
            // have to be executed on the thread.
            mCompleted = true;
            return;
        }
        if (!mCompleted) {
            // A check is already in flight; do not restart its clock.
            return;
        }
        mCompleted = false;
        mCurrentMonitor = null;
        mStartTime = SystemClock.uptimeMillis();
        // Send the check to the watched thread through its handler, ahead of queued messages.
        mHandler.postAtFrontOfQueue(this);
    }

    /**
     * @return true if a check is in flight and has been outstanding for at least the
     *         configured timeout
     */
    public boolean isOverdueLocked() {
        // ">=" keeps this in agreement with getCompletionStateLocked(), which reports
        // OVERDUE as soon as the latency reaches mWaitMax.
        return (!mCompleted) && (SystemClock.uptimeMillis() >= mStartTime + mWaitMax);
    }

    /**
     * Classifies the state of the current check.
     *
     * @return {@code COMPLETED} if no check is in flight, {@code WAITING} if the check has
     *         been outstanding for less than half the timeout, {@code WAITED_HALF} if for
     *         less than the full timeout, and {@code OVERDUE} otherwise
     */
    public int getCompletionStateLocked() {
        if (mCompleted) {
            return COMPLETED;
        }
        final long latency = SystemClock.uptimeMillis() - mStartTime;
        if (latency < mWaitMax / 2) {
            return WAITING;
        }
        if (latency < mWaitMax) {
            return WAITED_HALF;
        }
        return OVERDUE;
    }

    /**
     * @return the thread that the watched handler's looper belongs to
     */
    public Thread getThread() {
        return mHandler.getLooper().getThread();
    }

    /**
     * @return the name given to this checker at construction
     */
    public String getName() {
        return mName;
    }

    /**
     * @return a description of where the watched thread is stuck: in the monitor that is
     *         currently being invoked, or in the handler itself when no monitor is running
     */
    public String describeBlockedStateLocked() {
        if (mCurrentMonitor == null) {
            return "Blocked in handler on " + mName + " (" + getThread().getName() + ")";
        } else {
            return "Blocked in monitor " + mCurrentMonitor.getClass().getName()
                    + " on " + mName + " (" + getThread().getName() + ")";
        }
    }

    /**
     * Runs when the watched thread processes the posted check: invokes every registered
     * monitor in order and then marks the check as completed.
     */
    @Override
    public void run() {
        final int size = mMonitors.size();
        for (int i = 0; i < size; i++) {
            final Monitor monitor = mMonitors.get(i);
            // Publish the monitor under the lock so a report can name it if it blocks.
            synchronized (Watchdog.this) {
                mCurrentMonitor = monitor;
            }
            // Invoked outside the lock: this is the call that may block.
            monitor.monitor();
        }
        synchronized (Watchdog.this) {
            mCompleted = true;
            mCurrentMonitor = null;
        }
    }
}
/**
 * Receives reboot broadcasts. Only requests carrying a non-zero "nowait" extra are
 * honoured; anything else is logged as unsupported.
 */
final class RebootRequestReceiver extends BroadcastReceiver {
    @Override
    public void onReceive(Context c, Intent intent) {
        final boolean rebootImmediately = intent.getIntExtra("nowait", 0) != 0;
        if (!rebootImmediately) {
            Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
            return;
        }
        rebootSystem("Received ACTION_REBOOT broadcast");
    }
}
// A service that wants to be watched must implement this interface.
public interface Monitor {
// Invoked by a HandlerChecker on its watched thread during every check. According to the
// original author's notes, implementations usually just acquire the service's lock.
void monitor();
}
/**
 * Returns the singleton Watchdog, creating it on the first call.
 * No synchronization is performed here.
 */
public static Watchdog getInstance() {
    if (sWatchdog != null) {
        return sWatchdog;
    }
    sWatchdog = new Watchdog();
    return sWatchdog;
}
private Watchdog() {
super("watchdog");
// Initialize handler checkers for each common thread we want to check. Note
// that we are not currently checking the background thread, since it can
// potentially hold longer running operations with no guarantees about the timeliness
// of operations there.
// The shared foreground thread is the main checker. It is where we
// will also dispatch monitor checks and do other work.
//此线程主要用于监视服务
mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
"foreground thread", DEFAULT_TIMEOUT);
mHandlerCheckers.add(mMonitorChecker);
// Add checker for main thread. We only do a quick check since there
// can be UI running on the thread.
//默认注册的线程
mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
"main thread", DEFAULT_TIMEOUT));
// Add checker for shared UI thread.
mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
"ui thread", DEFAULT_TIMEOUT));
// And also check IO thread.
mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
"i/o thread", DEFAULT_TIMEOUT));
// And the display thread.
mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
"display thread", DEFAULT_TIMEOUT));
}
/**
 * Stores the collaborators the watchdog needs and registers the receiver for reboot
 * broadcasts, restricted to senders holding the REBOOT permission.
 *
 * @param context context used to obtain the content resolver and register the receiver
 * @param activity the activity manager service
 */
public void init(Context context, ActivityManagerService activity) {
    mResolver = context.getContentResolver();
    mActivity = activity;

    final RebootRequestReceiver rebootReceiver = new RebootRequestReceiver();
    final IntentFilter rebootFilter = new IntentFilter(Intent.ACTION_REBOOT);
    context.registerReceiver(rebootReceiver, rebootFilter,
            android.Manifest.permission.REBOOT, null);
}
/**
 * Notes the start of a process. Only "com.android.phone" is of interest: its pid is
 * remembered; any other process name leaves the state untouched.
 *
 * @param name name of the process that started
 * @param pid pid of that process
 */
public void processStarted(String name, int pid) {
    final boolean isPhoneProcess = "com.android.phone".equals(name);
    synchronized (this) {
        if (isPhoneProcess) {
            mPhonePid = pid;
        }
    }
}
/**
 * Sets the activity controller. When the watchdog fires, run() reports the stuck state
 * to this controller (systemNotResponding) before deciding whether to kill the process.
 * The field is written under this object's lock.
 *
 * @param controller the controller to notify; run() skips the notification when it is null
 */
public void setActivityController(IActivityController controller) {
synchronized (this) {
mController = controller;
}
}
/**
 * Sets whether the watchdog may kill the system process when it fires. When false,
 * run() only logs that the restart is not allowed. Written under this object's lock.
 *
 * @param allowRestart true to allow the watchdog to kill the system process
 */
public void setAllowRestart(boolean allowRestart) {
synchronized (this) {
mAllowRestart = allowRestart;
}
}
/**
 * Registers a service monitor on the monitor checker. Registration is only possible
 * before the watchdog thread has been started.
 *
 * @param monitor the monitor to register
 * @throws RuntimeException if the watchdog thread is already running
 */
public void addMonitor(Monitor monitor) {
    synchronized (this) {
        if (!isAlive()) {
            mMonitorChecker.addMonitor(monitor);
            return;
        }
        throw new RuntimeException("Monitors can't be added once the Watchdog is running");
    }
}
/**
 * Registers a thread to be watched, using DEFAULT_TIMEOUT as its timeout.
 *
 * @param thread handler of the thread to watch
 */
public void addThread(Handler thread) {
addThread(thread, DEFAULT_TIMEOUT);
}
/**
 * Registers a thread to be watched with the given timeout. The checker is named after
 * the thread that owns the handler's looper. Registration is only possible before the
 * watchdog thread has been started.
 *
 * @param thread handler of the thread to watch
 * @param timeoutMillis time in milliseconds after which an unanswered check is overdue
 * @throws RuntimeException if the watchdog thread is already running
 */
public void addThread(Handler thread, long timeoutMillis) {
    synchronized (this) {
        if (isAlive()) {
            throw new RuntimeException("Threads can't be added once the Watchdog is running");
        }
        final Thread watched = thread.getLooper().getThread();
        final HandlerChecker checker =
                new HandlerChecker(thread, watched.getName(), timeoutMillis);
        mHandlerCheckers.add(checker);
    }
}
/**
 * Perform a full reboot of the system.
 *
 * <p>Logs the reason and asks the power manager service to reboot. A failure of that
 * call is logged rather than silently dropped, so a reboot request that did not go
 * through leaves a trace.
 *
 * @param reason why the reboot was requested; logged and passed on to the power manager
 */
void rebootSystem(String reason) {
    Slog.i(TAG, "Rebooting system because: " + reason);
    IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE);
    try {
        // NOTE(review): the boolean arguments are presumably (confirm, wait) -- verify
        // against IPowerManager.
        pms.reboot(false, reason, false);
    } catch (RemoteException ex) {
        Slog.w(TAG, "Failed to request reboot: " + reason, ex);
    }
}
/**
 * Combines the completion states of all checkers. The state constants are ordered by
 * increasing lateness, so the overall state is the largest individual one.
 *
 * @return the worst completion state among all registered checkers
 */
private int evaluateCheckerCompletionLocked() {
    int state = COMPLETED;
    for (int i=0; i<mHandlerCheckers.size(); i++) {
        HandlerChecker hc = mHandlerCheckers.get(i);
        state = Math.max(state, hc.getCompletionStateLocked());
    }
    return state;
}

/**
 * @return the checkers whose current check is overdue
 */
private ArrayList<HandlerChecker> getBlockedCheckersLocked() {
    ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
    for (int i=0; i<mHandlerCheckers.size(); i++) {
        HandlerChecker hc = mHandlerCheckers.get(i);
        if (hc.isOverdueLocked()) {
            checkers.add(hc);
        }
    }
    return checkers;
}

/**
 * Builds the report subject for a set of blocked checkers.
 *
 * @param checkers the blocked checkers to describe
 * @return their blocked-state descriptions joined with ", "
 */
private String describeCheckersLocked(ArrayList<HandlerChecker> checkers) {
    StringBuilder builder = new StringBuilder(128);
    for (int i=0; i<checkers.size(); i++) {
        if (builder.length() > 0) {
            builder.append(", ");
        }
        builder.append(checkers.get(i).describeBlockedStateLocked());
    }
    return builder.toString();
}

// run() method of the watchdog thread
@Override public void run() { boolean waitedHalf = false; while (true) { final ArrayList
blockedCheckers; final String subject; final boolean allowRestart; int debuggerWasConnected = 0; synchronized (this) { long timeout = CHECK_INTERVAL; // Make sure we (re)spin the checkers that have become idle within // this wait-and-check interval //给所有线程发送消息 for (int i=0; i
0) { debuggerWasConnected--; } // NOTE: We use uptimeMillis() here because we do not want to increment the time we // wait while asleep. If the device is asleep then the thing that we are waiting // to timeout on is asleep as well and won't have a chance to run, causing a false // positive on when to kill things. //睡眠一段时间 long start = SystemClock.uptimeMillis(); while (timeout > 0) { if (Debug.isDebuggerConnected()) { debuggerWasConnected = 2; } try { wait(timeout); } catch (InterruptedException e) { Log.wtf(TAG, e); } if (Debug.isDebuggerConnected()) { debuggerWasConnected = 2; } timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start); } //获取线程响应消息的结果 final int waitState = evaluateCheckerCompletionLocked(); if (waitState == COMPLETED) {//正常 // The monitors have returned; reset waitedHalf = false; continue; } else if (waitState == WAITING) {//正在响应 // still waiting but within their configured intervals; back off and recheck continue; } else if (waitState == WAITED_HALF) {//响应略有延时,打印系统服务的堆栈,然后继续监测一下 if (!waitedHalf) { // We've waited half the deadlock-detection interval. Pull a stack // trace and wait another half. ArrayList
pids = new ArrayList
(); pids.add(Process.myPid()); ActivityManagerService.dumpStackTraces(true, pids, null, null, NATIVE_STACKS_OF_INTEREST); waitedHalf = true; } continue; } // something is overdue! //有线程有问题 blockedCheckers = getBlockedCheckersLocked(); subject = describeCheckersLocked(blockedCheckers); allowRestart = mAllowRestart; } // If we got here, that means that the system is most likely hung. // First collect stack traces from all threads of the system process. // Then kill this process so that the system will restart. EventLog.writeEvent(EventLogTags.WATCHDOG, subject); ArrayList
pids = new ArrayList
(); pids.add(Process.myPid()); if (mPhonePid > 0) pids.add(mPhonePid); // Pass !waitedHalf so that just in case we somehow wind up here without having // dumped the halfway stacks, we properly re-initialize the trace file. // 打印系统服务的堆栈 final File stack = ActivityManagerService.dumpStackTraces( !waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST); // Give some extra time to make sure the stack traces get written. // The system's been hanging for a minute, another second or two won't hurt much. SystemClock.sleep(2000); // Pull our own kernel thread stacks as well if we're configured for that // 打印kernel的堆栈 if (RECORD_KERNEL_THREADS) { dumpKernelStackTraces(); } // Trigger the kernel to dump all blocked threads to the kernel log try { FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger"); sysrq_trigger.write("w"); sysrq_trigger.close(); } catch (IOException e) { Slog.e(TAG, "Failed to write to /proc/sysrq-trigger"); Slog.e(TAG, e.getMessage()); } String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null); String traceFileNameAmendment = "_SystemServer_WDT" + mTraceDateFormat.format(new Date()); if (tracesPath != null && tracesPath.length() != 0) { File traceRenameFile = new File(tracesPath); String newTracesPath; int lpos = tracesPath.lastIndexOf ("."); if (-1 != lpos) newTracesPath = tracesPath.substring (0, lpos) + traceFileNameAmendment + tracesPath.substring (lpos); else newTracesPath = tracesPath + traceFileNameAmendment; traceRenameFile.renameTo(new File(newTracesPath)); tracesPath = newTracesPath; } final File newFd = new File(tracesPath); // Try to add the error to the dropbox, but assuming that the ActivityManager // itself may be deadlocked. 
(which has happened, causing this statement to // deadlock and the watchdog as a whole to be ineffective) Thread dropboxThread = new Thread("watchdogWriteToDropbox") { public void run() { mActivity.addErrorToDropBox( "watchdog", null, "system_server", null, null, subject, null, newFd, null); } }; dropboxThread.start(); try { dropboxThread.join(2000); // wait up to 2 seconds for it to return. } catch (InterruptedException ignored) {} // At times, when user space watchdog traces don't give an indication on // which component held a lock, because of which other threads are blocked, // (thereby causing Watchdog), crash the device to analyze RAM dumps boolean crashOnWatchdog = SystemProperties .getBoolean("persist.sys.crashOnWatchdog", false); if (crashOnWatchdog) { // wait until the above blocked threads be dumped into kernel log SystemClock.sleep(3000); // now try to crash the target try { FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger"); sysrq_trigger.write("c"); sysrq_trigger.close(); } catch (IOException e) { Slog.e(TAG, "Failed to write 'c' to /proc/sysrq-trigger"); Slog.e(TAG, e.getMessage()); } } //弹出对话框,让用户确认是否等待 IActivityController controller; synchronized (this) { controller = mController; } if (controller != null) { Slog.i(TAG, "Reporting stuck state to activity controller"); try { Binder.setDumpDisabled("Service dumps disabled due to hung system process."); // 1 = keep waiting, -1 = kill system int res = controller.systemNotResponding(subject); if (res >= 0) { Slog.i(TAG, "Activity controller requested to coninue to wait"); waitedHalf = false; continue; } } catch (RemoteException e) { } } // Only kill the process if the debugger is not attached. 
if (Debug.isDebuggerConnected()) { debuggerWasConnected = 2; } if (debuggerWasConnected >= 2) { Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process"); } else if (debuggerWasConnected > 0) { Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process"); } else if (!allowRestart) { Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process"); } else { Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject); for (int i=0; i