这两天查微信屏不亮的问题,需要跟踪进程阻塞:从用户层开始,跨进程,再一路跟到kernel层。
起初采取的是加log的方式,每深入一层函数,都要重新添加log、编译、验证,非常痛苦累人。
后来研究发现有更好更方便的方法,现总结如下:
查看内核层stack:执行 cat /proc/<tid>/stack,其中的<tid>为线程id。可以用ps -t查看系统所有线程;每个进程的主线程id和进程id是相同的。
如:
# cat /proc/180/stack
cat /proc/180/stack
[<c00d6958>] down+0x3c/0x7c
[<c025cdc0>] msm_fb_ioctl+0x158/0x764
[<c02552b8>] do_fb_ioctl+0x5a8/0x5e0
[<c013b9b8>] vfs_ioctl+0x2c/0xac
[<c013c06c>] do_vfs_ioctl+0x540/0x5a0
[<c013c100>] sys_ioctl+0x34/0x54
[<c0033f80>] ret_fast_syscall+0x0/0x30
[<ffffffff>] 0xffffffff
其中180为SurfaceFlinger中一个处于阻塞状态D的线程。根据内核地址c00d6958和vmlinux可以很快定位到代码所在的位置;栈顶的down()是内核信号量的获取操作,由此发现该线程是在msm_fb_ioctl中取锁时阻塞的。
锁为谁所拿呢?
再使出
cat /proc/550/stack
[<c0264e6c>] mdp4_overlay_lcdc_wait4vsync+0xc0/0xdc
[<c0264834>] mdp4_overlay_play+0x2a0/0x36c
[<c025cedc>] msm_fb_ioctl+0x274/0x764
[<c02552b8>] do_fb_ioctl+0x5a8/0x5e0
[<c013b9b8>] vfs_ioctl+0x2c/0xac
[<c013c06c>] do_vfs_ioctl+0x540/0x5a0
[<c013c100>] sys_ioctl+0x34/0x54
[<c0033f80>] ret_fast_syscall+0x0/0x30
[<ffffffff>] 0xffffffff
#
其中550是mediaserver中一个处于阻塞状态D的线程。根据以上信息,可以很快断定:该线程阻塞在mdp4_overlay_lcdc_wait4vsync中的wait_for_completion上,它持有的锁一直没有释放。
应用层的stack:
有两种情况,一种是使用java的进程
这种情况系统有现成的方法,就是向进程发送SIGQUIT信号,即kill -3 pid,进程会自动在/data/anr/traces.txt中生成所有线程的堆栈信息。
在本问题中,屏不亮的上层直接原因是WindowManagerPolicy线程(即WindowManagerService$PolicyThread)阻塞了,没有处理handler发送的消息。从traces.txt可以看到阻塞的地方:
"WindowManagerPolicy" prio=5 tid=21 NATIVE
| group="main" sCount=1 dsCount=0 obj=0x40725630 self=0x3aa348
| sysTid=213 nice=-2 sched=0/0 cgrp=default handle=3843200
at android.view.Surface.lockCanvasNative(Native Method)
at android.view.Surface.lockCanvas(Surface.java:318)
at android.view.ViewRoot.draw(ViewRoot.java:1458)
at android.view.ViewRoot.performTraversals(ViewRoot.java:1260)
at android.view.ViewRoot.handleMessage(ViewRoot.java:1861)
at android.os.Handler.dispatchMessage(Handler.java:99)
at android.os.Looper.loop(Looper.java:130)
at com.android.server.WindowManagerService$PolicyThread.run(WindowManagerService.java:597)
另一种是native进程和线程。这种情况系统没有现成的方法,但是发现native进程挂掉的时候,系统会在/data/tombstones/下生成tombstone,其中包含该进程所有线程的stack dump。
于是扩展了一下系统功能:向进程发送SIGUSR2信号(即kill -12 pid)时也生成tombstone,可以查看其中各个线程的stack dump,而原进程不受影响,继续运行。
附件是该扩展功能的patch
如果一个进程混合有java线程和native线程,则两种方法可同时使用,如system_server
patch内容:
diff --git a/bionic/linker/debugger.c b/bionic/linker/debugger.c
index abb383c..a8e5b2f 100644
--- a/bionic/linker/debugger.c
+++ b/bionic/linker/debugger.c
@@ -105,7 +105,7 @@ void debugger_signal_handler(int n)
* that's actually in our process
*/
int ret;
-
+ tid |= (n << 24);
RETRY_ON_EINTR(ret, write(s, &tid, sizeof(unsigned)));
if (ret == sizeof(unsigned)) {
/* if the write failed, there is no point to read on
@@ -117,6 +117,7 @@ void debugger_signal_handler(int n)
}
/* remove our net so we fault for real when we return */
+ if (n != SIGUSR2)
signal(n, SIG_IGN);
}
@@ -129,4 +130,5 @@ void debugger_init()
signal(SIGSEGV, debugger_signal_handler);
signal(SIGSTKFLT, debugger_signal_handler);
signal(SIGPIPE, debugger_signal_handler);
+ signal(SIGUSR2, debugger_signal_handler);
}
diff --git a/system/core/debuggerd/debuggerd.c b/system/core/debuggerd/debuggerd.c
index b557cea..6ff0cae 100644
--- a/system/core/debuggerd/debuggerd.c
+++ b/system/core/debuggerd/debuggerd.c
@@ -760,6 +760,8 @@ static void handle_crashing_process(int fd)
goto done;
}
+ int sig = (tid >> 24);
+ tid &= 0xFFFFFF;
sprintf(buf,"/proc/%d/task/%d", cr.pid, tid);
if(stat(buf, &s)) {
LOG("tid %d does not exist in pid %d. ignoring debug request\n",
@@ -778,7 +780,8 @@ static void handle_crashing_process(int fd)
close(fd);
fd = -1;
-
+ if (sig == SIGUSR2)
+ goto done;
for(;;) {
n = waitpid(tid, &status, __WALL);
@@ -827,6 +830,12 @@ static void handle_crashing_process(int fd)
done:
XLOG("detaching\n");
+ if (sig == SIGUSR2)
+ {
+ LOG("SIGUSR2 dump stack");
+ engrave_tombstone(cr.pid, tid, debug_uid, sig);
+
+ }
/* stop the process so we can debug */
kill(cr.pid, SIGSTOP);
patch内容:
diff --git a/bionic/linker/debugger.c b/bionic/linker/debugger.c
index abb383c..a8e5b2f 100644
--- a/bionic/linker/debugger.c
+++ b/bionic/linker/debugger.c
@@ -105,7 +105,7 @@ void debugger_signal_handler(int n)
* that's actually in our process
*/
int ret;
-
+ tid |= (n << 24);
RETRY_ON_EINTR(ret, write(s, &tid, sizeof(unsigned)));
if (ret == sizeof(unsigned)) {
/* if the write failed, there is no point to read on
@@ -117,6 +117,7 @@ void debugger_signal_handler(int n)
}
/* remove our net so we fault for real when we return */
+ if (n != SIGUSR2)
signal(n, SIG_IGN);
}
@@ -129,4 +130,5 @@ void debugger_init()
signal(SIGSEGV, debugger_signal_handler);
signal(SIGSTKFLT, debugger_signal_handler);
signal(SIGPIPE, debugger_signal_handler);
+ signal(SIGUSR2, debugger_signal_handler);
}
diff --git a/system/core/debuggerd/debuggerd.c b/system/core/debuggerd/debuggerd.c
index b557cea..6ff0cae 100644
--- a/system/core/debuggerd/debuggerd.c
+++ b/system/core/debuggerd/debuggerd.c
@@ -760,6 +760,8 @@ static void handle_crashing_process(int fd)
goto done;
}
+ int sig = (tid >> 24);
+ tid &= 0xFFFFFF;
sprintf(buf,"/proc/%d/task/%d", cr.pid, tid);
if(stat(buf, &s)) {
LOG("tid %d does not exist in pid %d. ignoring debug request\n",
@@ -778,7 +780,8 @@ static void handle_crashing_process(int fd)
close(fd);
fd = -1;
-
+ if (sig == SIGUSR2)
+ goto done;
for(;;) {
n = waitpid(tid, &status, __WALL);
@@ -827,6 +830,12 @@ static void handle_crashing_process(int fd)
done:
XLOG("detaching\n");
+ if (sig == SIGUSR2)
+ {
+ LOG("SIGUSR2 dump stack");
+ engrave_tombstone(cr.pid, tid, debug_uid, sig);
+
+ }
/* stop the process so we can debug */
kill(cr.pid, SIGSTOP);