问题稳定复现手法:主屏有画面刷新时,拔掉DP线,设备冻屏;
现象:ADB可用,但显示冻屏,怀疑死锁导致某CPU卡死;
分析过程:
adb bugreport生成bugreport;使用addr2line将symbol导入后看调用栈,找到有用的一些:
可以看到PresentDisplay的时候,没拿到Lock而__futex_wait_ex:
Stack Trace:
RELADDR FUNCTION
000000000004e1b0 syscall+32
v--------------> __futex(void volatile*, int, int, timespec const*, int)
v--------------> FutexWithTimeout(void volatile*, int, int, bool, timespec const*, int)
0000000000052a7c __futex_wait_ex(void volatile*, bool, int, bool, timespec const*)+148
v--------------> NonPI::NormalMutexLock(pthread_mutex_internal_t*, unsigned short, bool, timespec const*)
00000000000b7fb8 NonPI::MutexLockWithTimeout(pthread_mutex_internal_t*, bool, timespec const*)+224
v--------------> sdm::Locker::Lock()
v--------------> sdm::Locker::ScopeLock::ScopeLock(sdm::Locker&) -->sdm/include/utils/locker.h:46
v--------------> sdm::HWCSession::PresentDisplay(unsigned long, std::__1::shared_ptr<sdm::Fence>*)
接着看到DestroyPluggableDisplay时也会直接进Wait():
Stack Trace:
RELADDR FUNCTION
000000000004e1b0 syscall+32
v--------------> __futex(void volatile*, int, int, timespec const*, int)
v--------------> FutexWithTimeout(void volatile*, int, int, bool, timespec const*, int)
0000000000052a7c __futex_wait_ex(void volatile*, bool, int, bool, timespec const*)+148
v--------------> __pthread_cond_timedwait(pthread_cond_internal_t*, pthread_mutex_t*, bool, timespec const*)
00000000000b6054 pthread_cond_wait+84
v--------------> sdm::Locker::Wait()
v--------------> sdm::Locker::SequenceWaitScopeLock::SequenceWaitScopeLock(sdm::Locker&)
v--------------> sdm::HWCSession::DestroyPluggableDisplay(sdm::HWCSession::DisplayMapInfo*)
000000000004ad14 sdm::HWCSession::DestroyDisplay(sdm::HWCSession::DisplayMapInfo*)+232
首先看一下PresentDisplay源码:
int32_t HWCSession::PresentDisplay(hwc2_display_t display, shared_ptr<Fence> *out_retire_fence) {
auto status = HWC2::Error::BadDisplay;
DTRACE_SCOPED();
//首先拿system_locker_
SCOPE_LOCK(system_locker_);
if (display >= HWCCallbacks::kNumDisplays) {
DLOGW("Invalid Display : display = %" PRIu64, display);
return HWC2_ERROR_BAD_DISPLAY;
}
...
if (status != HWC2::Error::None && status != HWC2::Error::NotValidated) {
SEQUENCE_CANCEL_SCOPE_LOCK(locker_[target_display]);
}
通过trace中的行号,能定位到PresentDisplay卡住的地方是ScopeLock
41 class Locker {
42 public:
43 class ScopeLock {
44 public:
45 explicit ScopeLock(Locker& locker) : locker_(locker) {
46 locker_.Lock();
47 }
48
49 ~ScopeLock() {
50 locker_.Unlock();
51 }
说明有地方占用了system_locker_:
再看一下DestroyPluggableDisplay源码:
2951 void HWCSession::DestroyPluggableDisplay(DisplayMapInfo *map_info) {
2952 hwc2_display_t client_id = map_info->client_id;
2953
2954 DLOGI("Notify hotplug display disconnected: client id = %d", UINT32(client_id));
2955 callbacks_.Hotplug(client_id, HWC2::Connection::Disconnected);
2956
2957 SCOPE_LOCK(system_locker_);
2958 {
2959 SEQUENCE_WAIT_SCOPE_LOCK(locker_[client_id]);
2960 auto &hwc_display = hwc_display_[client_id];
2961 if (!hwc_display) {
2962 return;
2963 }
2964 DLOGI("Destroy display %d-%d, client id = %d", map_info->sdm_id, map_info->disp_type,
2965 UINT32(client_id));
2966 {
从trace中可以看到DestroyPluggableDisplay走到了SEQUENCE_WAIT_SCOPE_LOCK中卡住了;
研究下display中的lock.h:
//hardware/qcom/display/sdm/include/utils/locker.h
//宏定义:
33 #define SCOPE_LOCK(locker) Locker::ScopeLock lock(locker)
34 #define SEQUENCE_ENTRY_SCOPE_LOCK(locker) Locker::SequenceEntryScopeLock lock(locker)
35 #define SEQUENCE_EXIT_SCOPE_LOCK(locker) Locker::SequenceExitScopeLock lock(locker)
36 #define SEQUENCE_WAIT_SCOPE_LOCK(locker) Locker::SequenceWaitScopeLock lock(locker)
37 #define SEQUENCE_CANCEL_SCOPE_LOCK(locker) Locker::SequenceCancelScopeLock lock(locker)
41 class Locker {
42 public:
43 class ScopeLock {
44 public:
45 explicit ScopeLock(Locker& locker) : locker_(locker) {
46 locker_.Lock();
47 }
48
49 ~ScopeLock() {
50 locker_.Unlock();
51 }
...
57 class SequenceEntryScopeLock {
58 public:
59 explicit SequenceEntryScopeLock(Locker& locker) : locker_(locker) {
60 locker_.Lock();
61 locker_.sequence_wait_ = 1;
62 }
63
64 ~SequenceEntryScopeLock() {
65 locker_.Unlock();
66 }
67
...
72 class SequenceExitScopeLock {
73 public:
74 explicit SequenceExitScopeLock(Locker& locker) : locker_(locker) {
75 locker_.Lock();
76 locker_.sequence_wait_ = 0;
77 }
78
79 ~SequenceExitScopeLock() {
80 locker_.Broadcast();
81 locker_.Unlock();
82 }
...
88 class SequenceWaitScopeLock {
89 public:
90 explicit SequenceWaitScopeLock(Locker& locker) : locker_(locker), error_(false) {
91 locker_.Lock();
92
93 while (locker_.sequence_wait_ == 1) {
94 locker_.Wait();
95 error_ = (locker_.sequence_wait_ == -1);
96 }
97 }
98
99 ~SequenceWaitScopeLock() {
100 locker_.Unlock();
101 }
...
112 class SequenceCancelScopeLock {
113 public:
114 explicit SequenceCancelScopeLock(Locker& locker) : locker_(locker) {
115 locker_.Lock();
116 locker_.sequence_wait_ = -1;
117 }
118
119 ~SequenceCancelScopeLock() {
120 locker_.Broadcast();
121 locker_.Unlock();
122 }
//实际的系统调用:
128 Locker() : sequence_wait_(0) {
148 void Lock() { pthread_mutex_lock(&mutex_); } //阻塞调用获取线程互斥锁,获取不到则进入到这个锁的排队队列中,并会进入阻塞状态, 直到拿到锁之后才会返回。
149 int32_t TryLock() { return pthread_mutex_trylock(&mutex_); } //非阻塞调用:当请求的锁正在被占用的时候, 不会进入阻塞状态,而是立刻返回,并返回一个错误代码 EBUSY,意思是说, 有其它线程正在使用这个锁。
150 void Unlock() { pthread_mutex_unlock(&mutex_); }
151 void Signal() { pthread_cond_signal(&condition_); }
152 void Broadcast() { pthread_cond_broadcast(&condition_); }
153 void Wait() { pthread_cond_wait(&condition_, &mutex_); } //pthread_cond_wait() 用于阻塞当前线程,等待别的线程使用pthread_cond_signal()或pthread_cond_broadcast来唤醒它
...
165 private:
//这个flag很重要,上面各种锁会设置不同状态,SequenceWaitScopeLock会判断flag进行等待;
169 int sequence_wait_; // This flag is set to 1 on sequence entry, 0 on exit, and -1 on cancel.
170 // Some routines will wait for sequence of function calls to finish
171 // so that capturing a transitionary snapshot of context is prevented.
172 // If flag is set to -1, these routines will exit without doing any
173 // further processing.
}
从上可以看出SequenceWaitScopeLock判断了sequence_wait_的值为1时,会一直等待,这就是DestroyPluggableDisplay卡在SEQUENCE_WAIT_SCOPE_LOCK的原因,而什么会让 sequence_wait _为1呢?可以看到SequenceEntryScopeLock时会让其变为1,所以就需要SequenceExitScopeLock或SequenceCancelScopeLock来给解除wait状态;由此推断出某些地方Entry了但没有Exit或Cancel;
跟踪Entry,发现只在ValidateDisplay中会有调用Entry:
1218 int32_t HWCSession::ValidateDisplay(hwc2_display_t display, uint32_t *out_num_types,
1219 uint32_t *out_num_requests) {
...
1240 SEQUENCE_ENTRY_SCOPE_LOCK(locker_[target_display]);
1241 if (pending_power_mode_[display]) {
1242 status = HWC2::Error::None;
1243 } else if (hwc_display_[target_display]) {
1244 hwc_display_[target_display]->ProcessActiveConfigChange();
1245 hwc_display_[target_display]->SetFastPathComposition(false);
1246 status = ValidateDisplayInternal(target_display, out_num_types, out_num_requests);
1247 }
1248 }
Exit和Cancel的位置:
750 int32_t HWCSession::PresentDisplay(hwc2_display_t display, shared_ptr<Fence> *out_retire_fence) {
...
{
774 SEQUENCE_EXIT_SCOPE_LOCK(locker_[target_display]);
775 if (!hwc_display_[target_display]) {
776 DLOGW("Removed Display : display = %" PRIu64, target_display);
777 return HWC2_ERROR_BAD_DISPLAY;
778 }
}
...
804 if (status != HWC2::Error::None && status != HWC2::Error::NotValidated) {
805 SEQUENCE_CANCEL_SCOPE_LOCK(locker_[target_display]);
806 }
}
...
1218 int32_t HWCSession::ValidateDisplay(hwc2_display_t display, uint32_t *out_num_types,
1219 uint32_t *out_num_requests) {
1220 // out_num_types and out_num_requests will be non-NULL
1221
...
1239 {
1240 SEQUENCE_ENTRY_SCOPE_LOCK(locker_[target_display]);
1241 if (pending_power_mode_[display]) {
1242 status = HWC2::Error::None;
1243 } else if (hwc_display_[target_display]) {
1244 hwc_display_[target_display]->ProcessActiveConfigChange();
1245 hwc_display_[target_display]->SetFastPathComposition(false);
1246 status = ValidateDisplayInternal(target_display, out_num_types, out_num_requests);
1247 }
1248 }
...
1250 // Sequence locking currently begins on Validate, so cancel the sequence lock on failures
1251 if (status != HWC2::Error::None && status != HWC2::Error::HasChanges) {
1252 SEQUENCE_CANCEL_SCOPE_LOCK(locker_[target_display]);
1253 }
...
1255 if (display != target_display) {
1256 // Validate done on a dummy display. Assume present is complete.
1257 SEQUENCE_EXIT_SCOPE_LOCK(locker_[target_display]);
1258 }
1259
1260 return INT32(status);
1261 }
给ValidateDisplay加log,发现在单屏送显的时候就会调用ValidateDisplay;插入DP和拔出也会调用;Trance可以看文件中;
正常滑动刷屏时只会调用SEQUENCE_ENTRY_SCOPE_LOCK而CANCEL和EXIT都不会调用;
插入DP和拔出也只会调用ENTRY;所以EXIT LOCK都是由PresentDisplay完成的;
正常拔出时的log:
07-01 12:00:25.529 1188 1222 I SDM : HWCSession::DestroyPluggableDisplay: Notify hotplug display disconnected: client id = 1
07-01 12:00:25.514 0 0 I [drm:dp_display_send_hpd_event][msm-dp-info][940 ][name=DP-1]: [status=disconnected] [bpp=0] [pattern=0]
07-01 12:00:25.570 1267 1267 W HWComposer: Attempted to disconnect unknown display 1
07-01 12:00:25.570 1267 1267 D DisplayDevice: getVsyncPeriodFromHWC: value is cached. return 8333333
07-01 12:00:25.570 1188 1222 W SDM : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:callbacks_Hotplug out weisi
07-01 12:00:25.571 1188 1222 W SDM : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:got scope lock
07-01 12:00:25.571 1188 1222 W SDM : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:got wait scope lock
异常拔出时的log:
07-01 13:02:01.235 1187 1240 I SDM : HWCSession::DestroyPluggableDisplay: Notify hotplug display disconnected: client id = 1
↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓ 在DestroyPluggableDisplay中插入了validateDisplay函数
07-01 13:02:01.239 1187 1187 E SDM : HWCSession::ValidateDisplay: ValidateDisplay IN --weisi
07-01 13:02:01.239 1187 1187 E SDM : HWCSession::ValidateDisplay: ValidateDisplay 1111 --weisi
07-01 13:02:01.239 1187 1187 E SDM : HWCSession::ValidateDisplay: ValidateDisplay out --weisi
↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑ 在DestroyPluggableDisplay中插入了validateDisplay函数
07-01 13:02:01.249 1273 1273 W HWComposer: Attempted to disconnect unknown display 1
07-01 13:02:01.250 1273 1273 D DisplayDevice: getVsyncPeriodFromHWC: value is cached. return 8333333
07-01 13:02:01.272 1187 1240 W SDM : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:callbacks_Hotplug out weisi
07-01 13:02:01.273 1187 1240 W SDM : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:got scope lock
对比正常的拔出log,异常时在DestroyPluggableDisplay中插入了validateDisplay函数,使sequence_wait_的值变成了1,导致DestroyPluggableDisplay调用SEQUENCE_WAIT_SCOPE_LOCK时陷入while死循环中,而未能调到PresentDisplay中的CANCEL或EXIT函数,所以造成死锁!!为验证猜想,在PresentDisplay中加入更多的log再次复现:
正常拔出:
07-01 20:48:46.967 1181 1215 W SDM : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:callbacks_Hotplug out weisi
07-01 20:48:46.967 1181 1215 W SDM : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:got scope lock for id:1
07-01 20:48:46.967 1181 1215 W SDM : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:got wait scope lock for 1
07-01 20:48:46.967 1181 1215 I SDM : HWCSession::DestroyPluggableDisplay: Destroy display 57-1, client id = 1
异常拔出:
07-01 20:50:23.690 1181 1215 I SDM : HWCSession::DestroyPluggableDisplay: Notify hotplug display disconnected: client id = 1
07-01 20:50:23.691 1181 1181 E SDM : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:1
↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓
07-01 20:50:23.691 1181 1181 E SDM : HWCSession::ValidateDisplay: ValidateDisplay IN display ID: 1 --weisi
07-01 20:50:23.691 1181 1181 E SDM : HWCSession::ValidateDisplay: ValidateDisplay out --weisi
↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑ DestroyPluggableDisplay中插入了ValidateDisplay,操作的是display 1
07-01 20:50:23.696 1265 1265 W HWComposer: Attempted to disconnect unknown display 1
07-01 20:50:23.696 1265 1265 D DisplayDevice: getVsyncPeriodFromHWC: value is cached. return 8333333
↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓
07-01 20:50:23.715 1181 1181 E SDM : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:0
07-01 20:50:23.725 1181 1181 E SDM : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:0
07-01 20:50:23.730 1181 1181 E SDM : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:0
↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑ 这里只EXIT了display 0的锁;
07-01 20:50:23.734 1181 1215 W SDM : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:callbacks_Hotplug out weisi
07-01 20:50:23.734 1181 1215 W SDM : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:got scope lock for id:1
加上display ID后,可以看到,异常时调用DestroyPluggableDisplay操作的是display 1,也就是给Display 1的sequence_wait_变为1,而后面的PresentDisplay只EXIT了display 0的锁;所以接下来DestroyPluggableDisplay调用SEQUENCE_WAIT_SCOPE_LOCK会进入wait等待;
对比R的log,发现ValidateDisplay很少调用:
02-14 11:29:50.200 1071 1202 E SDM : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:1
02-14 11:29:50.213 1071 1202 E SDM : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:0
02-14 11:29:50.216 1071 1202 E SDM : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:1
02-14 11:29:50.231 1071 1202 E SDM : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:0
02-14 11:29:50.232 1071 1126 I SDM : HWCSession::HandlePluggableDisplays: Handling hotplug...
02-14 11:29:50.235 1071 1202 E SDM : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:1
02-14 11:29:50.240 1071 1202 E SDM : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:0
02-14 11:29:50.241 1071 1126 I SDM : HWCSession::DestroyPluggableDisplay: Notify hotplug display disconnected: client id = 1
02-14 11:29:50.242 1071 1202 E SDM : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:1
02-14 11:29:50.252 1071 1205 E SDM : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:0
02-14 11:29:50.264 1071 1205 E SDM : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:0
02-14 11:29:50.274 1071 1205 E SDM : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:0
02-14 11:29:50.276 1071 1126 W SDM : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:callbacks_Hotplug out weisi
02-14 11:29:50.276 1071 1126 W SDM : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:got scope lock for id:1
02-14 11:29:50.276 1071 1126 W SDM : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:got wait scope lock for 1
02-14 11:29:50.277 1071 1126 I SDM : HWCSession::DestroyPluggableDisplay: Destroy display 57-1, client id = 1
02-14 11:29:50.325 1071 1126 I SDM : HWCSession::HandlePluggableDisplays: Handling hotplug... Done.
02-14 11:29:50.325 1071 1126 I SDM : HWCSession::UEventHandler: Connected = 0
高通接受死锁,将destroy内的wait lock改为CANCEL LOCK;问题解决