Thread Pool Overflow Scenario Analysis
The scenario in which the client discovers that the server's binder threads are exhausted:
static void binder_transaction(struct binder_proc *proc,
                               struct binder_thread *thread,
                               struct binder_transaction_data *tr, int reply,
                               binder_size_t extra_buffers_size)
{
    //......
    if (reply) {
        //......
    } else if (!(t->flags & TF_ONE_WAY)) {
        BUG_ON(t->buffer->async_transaction != 0);
        binder_inner_proc_lock(proc);
        // Insert tcomplete into the todo queue of the binder thread that
        // initiated the transaction; the enqueue is deferred, so this
        // binder_work is processed together with the server's reply
        binder_enqueue_deferred_thread_work_ilocked(thread, tcomplete);
        // Record some bookkeeping information
        t->need_reply = 1;
        t->from_parent = thread->transaction_stack;
        thread->transaction_stack = t;
        binder_inner_proc_unlock(proc);
        // Insert t->work into the target's todo queue and wake up the target process
        if (!binder_proc_transaction(t, target_proc, target_thread)) {
            binder_inner_proc_lock(proc);
            binder_pop_transaction_ilocked(thread, t);
            binder_inner_proc_unlock(proc);
            goto err_dead_proc_or_thread;
        }
    } else {
        //......
    }
    if (target_thread)
        binder_thread_dec_tmpref(target_thread);
    binder_proc_dec_tmpref(target_proc);
    if (target_node)
        binder_dec_node_tmpref(target_node);
    /*
     * write barrier to synchronize with initialization
     * of log entry
     */
    smp_wmb();
    WRITE_ONCE(e->debug_id_done, t_debug_id);
    return;
    //......
}
The implementation of binder_proc_transaction is as follows:
static bool binder_proc_transaction(struct binder_transaction *t,
                                    struct binder_proc *proc,
                                    struct binder_thread *thread)
{
    struct binder_node *node = t->buffer->target_node;
    struct binder_priority node_prio;
    bool oneway = !!(t->flags & TF_ONE_WAY);
    bool pending_async = false;

    BUG_ON(!node);
    binder_node_lock(node);
    node_prio.prio = node->min_priority;
    node_prio.sched_policy = node->sched_policy;

    if (oneway) {
        BUG_ON(thread);
        if (node->has_async_transaction) {
            pending_async = true;
        } else {
            node->has_async_transaction = true;
        }
    }

    binder_inner_proc_lock(proc);
    // If the target process is dead, or the target thread is non-NULL and dead
    if (proc->is_dead || (thread && thread->is_dead)) {
        binder_inner_proc_unlock(proc);
        binder_node_unlock(node);
        return false;
    }

    // In our scenario thread is NULL and pending_async is false,
    // so this branch is taken
    if (!thread && !pending_async)
        // Pick the first entry of target_proc's waiting_threads list as the
        // target thread; if there is no idle thread, this returns NULL
        thread = binder_select_thread_ilocked(proc);

    if (thread) {
        binder_transaction_priority(thread->task, t, node_prio,
                                    node->inherit_rt);
        // Insert the binder_transaction into the target thread's todo list
        binder_enqueue_thread_work_ilocked(thread, &t->work);
    } else if (!pending_async) { // Synchronous call with no idle thread: taken here
        // Insert the work into proc->todo
        binder_enqueue_work_ilocked(&t->work, &proc->todo);
    } else {
        binder_enqueue_work_ilocked(&t->work, &node->async_todo);
    }

    if (!pending_async) // Taken in our scenario
        // Wake up a thread in the remote process
        binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */);

    binder_inner_proc_unlock(proc);
    binder_node_unlock(node);
    return true;
}
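Condensed, the queue selection above has exactly three destinations: an idle pool thread's own todo list, the process-wide proc->todo (our exhausted-pool case for synchronous calls), or node->async_todo for a oneway call while another async transaction is still in flight. The following sketch just restates that decision for clarity; pick_queue, have_idle_thread, and the enum names are made up here and are not kernel code:
/* Illustrative condensation of the queue selection above; not kernel code. */
enum dispatch_queue { THREAD_TODO, PROC_TODO, NODE_ASYNC_TODO };

static enum dispatch_queue pick_queue(int have_idle_thread, int pending_async)
{
    if (!pending_async && have_idle_thread)
        return THREAD_TODO;     /* hand the work to an idle binder thread */
    if (!pending_async)
        return PROC_TODO;       /* pool exhausted: queue on the whole process */
    return NODE_ASYNC_TODO;     /* oneway with another async already in flight */
}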
// Pick the first entry of target_proc's waiting_threads list as the target thread
static struct binder_thread *
binder_select_thread_ilocked(struct binder_proc *proc)
{
    struct binder_thread *thread;

    assert_spin_locked(&proc->inner_lock);
    thread = list_first_entry_or_null(&proc->waiting_threads,
                                      struct binder_thread,
                                      waiting_thread_node);
    if (thread)
        list_del_init(&thread->waiting_thread_node);
    return thread;
}
// Insert the binder_transaction into the target thread's todo list
static void
binder_enqueue_thread_work_ilocked(struct binder_thread *thread,
                                   struct binder_work *work)
{
    binder_enqueue_work_ilocked(work, &thread->todo);
    thread->process_todo = true;
}

// Append a binder_work to the tail of the given work list (FIFO order)
static void
binder_enqueue_work_ilocked(struct binder_work *work,
                            struct list_head *target_list)
{
    BUG_ON(target_list == NULL);
    BUG_ON(work->entry.next && !list_empty(&work->entry));
    list_add_tail(&work->entry, target_list);
}
// Wake up the receiving side
static void binder_wakeup_thread_ilocked(struct binder_proc *proc,
                                         struct binder_thread *thread,
                                         bool sync)
{
    assert_spin_locked(&proc->inner_lock);
    if (thread) { // thread is NULL in our scenario
        if (sync)
            wake_up_interruptible_sync(&thread->wait);
        else
            wake_up_interruptible(&thread->wait);
        return;
    }
    // Taken in our scenario
    binder_wakeup_poll_threads_ilocked(proc, sync);
}
// Walk binder_proc's threads rb-tree and wake up eligible poll threads one by one
static void binder_wakeup_poll_threads_ilocked(struct binder_proc *proc,
                                               bool sync)
{
    struct rb_node *n;
    struct binder_thread *thread;

    for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) {
        thread = rb_entry(n, struct binder_thread, rb_node);
        if (thread->looper & BINDER_LOOPER_STATE_POLL &&
            binder_available_for_proc_work_ilocked(thread)) {
            // The thread may still be busy at this point and will not
            // necessarily react to the wake_up immediately
            if (sync)
                // The _sync variant does not block; it merely hints to the
                // scheduler that the waker is about to stop running, so the
                // woken thread can be placed on the same CPU without
                // preempting us
                wake_up_interruptible_sync(&thread->wait);
            else
                wake_up_interruptible(&thread->wait);
        }
    }
}
The server thread's response:
// When a server thread finishes its current work, it enters the next loop
// iteration and reads from the driver
static int binder_thread_read(struct binder_proc *proc,
                              struct binder_thread *thread,
                              binder_uintptr_t binder_buffer, size_t size,
                              binder_size_t *consumed, int non_block)
{
    //......
retry:
    binder_inner_proc_lock(proc);
    // Returns true in our scenario
    wait_for_proc_work = binder_available_for_proc_work_ilocked(thread);
    binder_inner_proc_unlock(proc);

    thread->looper |= BINDER_LOOPER_STATE_WAITING;
    //......
    if (non_block) {
        if (!binder_has_work(thread, wait_for_proc_work))
            ret = -EAGAIN;
    } else {
        // Work is already pending, so this does not sleep;
        // execution continues below
        ret = binder_wait_for_work(thread, wait_for_proc_work);
    }

    thread->looper &= ~BINDER_LOOPER_STATE_WAITING;

    if (ret)
        return ret;
    //......
    while (1) {
        uint32_t cmd;
        struct binder_transaction_data_secctx tr;
        struct binder_transaction_data *trd = &tr.transaction_data;
        struct binder_work *w = NULL;
        struct list_head *list = NULL;
        struct binder_transaction *t = NULL;
        struct binder_thread *t_from;
        size_t trsize = sizeof(*trd);

        binder_inner_proc_lock(proc);
        if (!binder_worklist_empty_ilocked(&thread->todo))
            list = &thread->todo;
        else if (!binder_worklist_empty_ilocked(&proc->todo) &&
                 wait_for_proc_work) // This branch is taken
            list = &proc->todo; // Take the todo list of binder_proc
        else {
            binder_inner_proc_unlock(proc);
            /* no data added */
            if (ptr - buffer == 4 && !thread->looper_need_return)
                goto retry;
            break;
        }

        if (end - ptr < sizeof(tr) + 4) {
            binder_inner_proc_unlock(proc);
            break;
        }
        w = binder_dequeue_work_head_ilocked(list); // Process binder_work items from the head, one by one
        if (binder_worklist_empty_ilocked(&thread->todo))
            thread->process_todo = false;
        //......
    }
    //......
}
// A thread may take process-wide work only if it has no transaction in
// flight, its own todo list is empty, and it has entered the looper
static bool binder_available_for_proc_work_ilocked(struct binder_thread *thread)
{
    return !thread->transaction_stack &&
        binder_worklist_empty_ilocked(&thread->todo) &&
        (thread->looper & (BINDER_LOOPER_STATE_ENTERED |
                           BINDER_LOOPER_STATE_REGISTERED));
}
static int binder_wait_for_work(struct binder_thread *thread,
                                bool do_proc_work)
{
    DEFINE_WAIT(wait);
    struct binder_proc *proc = thread->proc;
    int ret = 0;

    freezer_do_not_count();
    binder_inner_proc_lock(proc);
    for (;;) {
        prepare_to_wait(&thread->wait, &wait, TASK_INTERRUPTIBLE);
        // Returns true in our scenario, so we break immediately
        if (binder_has_work_ilocked(thread, do_proc_work))
            break;
        if (do_proc_work)
            // Register this thread as idle so binder_select_thread_ilocked
            // can hand it new work
            list_add(&thread->waiting_thread_node,
                     &proc->waiting_threads);
        binder_inner_proc_unlock(proc);
        schedule();
        binder_inner_proc_lock(proc);
        list_del_init(&thread->waiting_thread_node);
        if (signal_pending(current)) {
            ret = -ERESTARTSYS;
            break;
        }
    }
    // Remove the wait entry and continue below
    finish_wait(&thread->wait, &wait);
    binder_inner_proc_unlock(proc);
    freezer_count();
    return ret;
}
Thread Pool Overflow Debugging Tips
Binder threads hit the upper limit typically when an app sends requests to a service at too high a rate. If the service guards all of its business logic with a single lock, every binder thread that receives and handles transactions can end up stuck waiting on that lock. Once the thread pool (16 threads by default) is exhausted, no further requests can be processed. If the app's main thread then calls a method of that service, an ANR is very likely.
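As a minimal user-space sketch of that failure mode (on_transact and handle_business are hypothetical placeholders standing in for the service's real dispatch path, not the libbinder API):
#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t g_service_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for the service's real business logic. */
static void handle_business(int code)
{
    (void)code;
    sleep(1); /* pretend the work under the lock is slow */
}

/* Hypothetical per-transaction handler running on a binder pool thread.
 * Because every call serializes on one global lock, a single slow call
 * parks all other pool threads here; once the pool is exhausted, new
 * transactions can only queue on proc->todo, and a caller on the app's
 * main thread soon turns into an ANR. */
void on_transact(int code)
{
    pthread_mutex_lock(&g_service_lock);
    handle_business(code);
    pthread_mutex_unlock(&g_service_lock);
}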
We can inspect the binder state of every process through the files under /sys/kernel/debug/binder to determine whether thread overflow is occurring. The transactions file is usually the one to check, e.g. adb shell cat /sys/kernel/debug/binder/transactions (reading debugfs generally requires root):
binder transactions:
proc 2890
context binder
buffer 45136: 0000000000000000 size 4:0:0 delivered
proc 2197
context hwbinder
buffer 29820: 0000000000000000 size 4:0:0 delivered
The first line, proc 2890, is the process pid; the second line is the binder context; next come the buffer lines, which represent binder_buffer objects in the kernel (the three numbers after size are the data size, offsets size, and extra buffers size). If these buffers keep accumulating while the program runs, a memory leak may exist, but this is rare: applications use the binder driver through the libbinder library, and after each IPC call finishes the library itself cleans up the binder_buffer. System-wide shared libraries like this are generally very stable, so the chance of a bug there is small.
proc 1523
context hwbinder
thread 1542: l 02 need_return 0 tr 0
incoming transaction 126150: 0000000000000000 from 1744:1838 to 1523:1542 code 4 flags 10 pri 1:89 r1 node 234 size 44:0 data 0000000000000000
buffer 126150: 0000000000000000 size 44:0:0 active
The other case is that thread entries appear under a proc; if there are unusually many, thread-pool overflow may be happening. To confirm whether the pool is actually full, dump the process to obtain its tombstone file (for example with debuggerd), which basically shows what every thread is doing.
To solve thread-pool exhaustion, the usual approach is to throttle the request rate on the app side; if the app's source code is not available, we can also modify the framework source to rate-limit specific remote IPC calls.
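A minimal sketch of client-side throttling with a counting semaphore (throttled_call, throttle_init, and MAX_INFLIGHT are illustrative names, and the limit of 4 is an assumption, not a recommended value):
#include <semaphore.h>

/* Hypothetical client-side throttle: allow at most MAX_INFLIGHT
 * concurrent remote calls so the server's pool is never fully pinned
 * by this client. */
#define MAX_INFLIGHT 4

static sem_t g_inflight;

void throttle_init(void)
{
    sem_init(&g_inflight, 0, MAX_INFLIGHT);
}

int throttled_call(int (*remote_call)(void))
{
    int ret;

    sem_wait(&g_inflight); /* block when too many calls are in flight */
    ret = remote_call();
    sem_post(&g_inflight);
    return ret;
}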
References
- android 系统核心机制binder(14)binder调试总结
- 一个关于binder的debug技巧