高通android HWC死锁导致拔投屏线冻屏问题解决过程

问题稳定复现手法:主屏有画面刷新时,拔掉DP线,设备冻屏;
现象:ADB可用,但显示冻屏,怀疑死锁导致某CPU卡死;
分析过程:
adb bugreport生成bugreport;使用addr2line将symbol导入后看调用栈,找到有用的一些:

可以看到PresentDisplay的时候,没拿到Lock而__futex_wait_ex:

Stack Trace:
  RELADDR           FUNCTION                                                                                 
  000000000004e1b0  syscall+32                                                                               
  v-------------->  __futex(void volatile*, int, int, timespec const*, int)                                 
  v-------------->  FutexWithTimeout(void volatile*, int, int, bool, timespec const*, int)                   
  0000000000052a7c  __futex_wait_ex(void volatile*, bool, int, bool, timespec const*)+148                   
  v-------------->  NonPI::NormalMutexLock(pthread_mutex_internal_t*, unsigned short, bool, timespec const*) 
  00000000000b7fb8  NonPI::MutexLockWithTimeout(pthread_mutex_internal_t*, bool, timespec const*)+224   
  v-------------->  sdm::Locker::Lock()                                                                     
  v-------------->  sdm::Locker::ScopeLock::ScopeLock(sdm::Locker&) -->sdm/include/utils/locker.h:46         
  v-------------->  sdm::HWCSession::PresentDisplay(unsigned long, std::__1::shared_ptr<sdm::Fence>*)       

接着看到DestroyPluggableDisplay时也会直接进Wait():

Stack Trace:
  RELADDR           FUNCTION                               
  000000000004e1b0  syscall+32                                                                     
  v-------------->  __futex(void volatile*, int, int, timespec const*, int)                     
  v-------------->  FutexWithTimeout(void volatile*, int, int, bool, timespec const*, int)           
  0000000000052a7c  __futex_wait_ex(void volatile*, bool, int, bool, timespec const*)+148            
  v-------------->  __pthread_cond_timedwait(pthread_cond_internal_t*, pthread_mutex_t*, bool, timespec const*)    
  00000000000b6054  pthread_cond_wait+84                           
  v-------------->  sdm::Locker::Wait()                                        
  v-------------->  sdm::Locker::SequenceWaitScopeLock::SequenceWaitScopeLock(sdm::Locker&)           
  v-------------->  sdm::HWCSession::DestroyPluggableDisplay(sdm::HWCSession::DisplayMapInfo*)        
  000000000004ad14  sdm::HWCSession::DestroyDisplay(sdm::HWCSession::DisplayMapInfo*)+232                   

首先看一下PresentDisplay源码:

int32_t HWCSession::PresentDisplay(hwc2_display_t display, shared_ptr<Fence> *out_retire_fence) {
    auto status = HWC2::Error::BadDisplay;
    DTRACE_SCOPED();
  //首先拿system_locker_ 
    SCOPE_LOCK(system_locker_);   									
    if (display >= HWCCallbacks::kNumDisplays) {
      DLOGW("Invalid Display : display = %" PRIu64, display);
      return HWC2_ERROR_BAD_DISPLAY;
    }
...
   if (status != HWC2::Error::None && status != HWC2::Error::NotValidated) {
      SEQUENCE_CANCEL_SCOPE_LOCK(locker_[target_display]);     						
    }

通过trace中的行号,能定位到PresentDisplay卡住的地方是ScopeLock

41  class Locker {
42   public:
43    class ScopeLock {
44     public:
45      explicit ScopeLock(Locker& locker) : locker_(locker) {
46        locker_.Lock();
47      }
48  
49      ~ScopeLock() {
50        locker_.Unlock();
51      }

说明有地方占用了system_locker_:

再看一下DestroyPluggableDisplay源码:

2951  void HWCSession::DestroyPluggableDisplay(DisplayMapInfo *map_info) {
2952    hwc2_display_t client_id = map_info->client_id;
2953  
2954    DLOGI("Notify hotplug display disconnected: client id = %d", UINT32(client_id));
2955    callbacks_.Hotplug(client_id, HWC2::Connection::Disconnected);
2956  
2957    SCOPE_LOCK(system_locker_);
2958    {
2959      SEQUENCE_WAIT_SCOPE_LOCK(locker_[client_id]);
2960      auto &hwc_display = hwc_display_[client_id];
2961      if (!hwc_display) {
2962        return;
2963      }
2964      DLOGI("Destroy display %d-%d, client id = %d", map_info->sdm_id, map_info->disp_type,
2965           UINT32(client_id));
2966      {

从trace中可以看到DestroyPluggableDisplay走到了SEQUENCE_WAIT_SCOPE_LOCK中卡住了;

研究下display中的lock.h:

//hardware/qcom/display/sdm/include/utils/locker.h
//宏定义:
33  #define SCOPE_LOCK(locker) Locker::ScopeLock lock(locker)
34  #define SEQUENCE_ENTRY_SCOPE_LOCK(locker) Locker::SequenceEntryScopeLock lock(locker)
35  #define SEQUENCE_EXIT_SCOPE_LOCK(locker) Locker::SequenceExitScopeLock lock(locker)
36  #define SEQUENCE_WAIT_SCOPE_LOCK(locker) Locker::SequenceWaitScopeLock lock(locker)
37  #define SEQUENCE_CANCEL_SCOPE_LOCK(locker) Locker::SequenceCancelScopeLock lock(locker)

41  class Locker {
42   public:
43    class ScopeLock {
44     public:
45      explicit ScopeLock(Locker& locker) : locker_(locker) {
46        locker_.Lock();
47      }
48  
49      ~ScopeLock() {
50        locker_.Unlock();
51      }
...
57    class SequenceEntryScopeLock {
58     public:
59      explicit SequenceEntryScopeLock(Locker& locker) : locker_(locker) {
60        locker_.Lock();
61        locker_.sequence_wait_ = 1;
62      }
63  
64      ~SequenceEntryScopeLock() {
65        locker_.Unlock();
66      }
67  
...   
72    class SequenceExitScopeLock {
73     public:
74      explicit SequenceExitScopeLock(Locker& locker) : locker_(locker) {
75        locker_.Lock();
76        locker_.sequence_wait_ = 0;
77      }
78  
79      ~SequenceExitScopeLock() {
80        locker_.Broadcast();
81        locker_.Unlock();
82      }
...
88    class SequenceWaitScopeLock {
89     public:
90      explicit SequenceWaitScopeLock(Locker& locker) : locker_(locker), error_(false) {
91        locker_.Lock();
92  
93        while (locker_.sequence_wait_ == 1) {
94          locker_.Wait();
95          error_ = (locker_.sequence_wait_ == -1);
96        }
97      }
98  
99      ~SequenceWaitScopeLock() {
100        locker_.Unlock();
101      }
...
112    class SequenceCancelScopeLock {
113     public:
114      explicit SequenceCancelScopeLock(Locker& locker) : locker_(locker) {
115        locker_.Lock();
116        locker_.sequence_wait_ = -1;
117      }
118  
119      ~SequenceCancelScopeLock() {
120        locker_.Broadcast();
121        locker_.Unlock();
122      }
	//实际的系统调用:
128    Locker() : sequence_wait_(0) {
    
148    void Lock() { pthread_mutex_lock(&mutex_); } //阻塞调用获取线程互斥锁,获取不到则进入到这个锁的排队队列中,并会进入阻塞状态, 直到拿到锁之后才会返回。
149    int32_t TryLock() { return pthread_mutex_trylock(&mutex_); }  //非阻塞调用:当请求的锁正在被占用的时候, 不会进入阻塞状态,而是立刻返回,并返回一个错误代码 EBUSY,意思是说, 有其它线程正在使用这个锁。
150    void Unlock() { pthread_mutex_unlock(&mutex_); }
151    void Signal() { pthread_cond_signal(&condition_); }
152    void Broadcast() { pthread_cond_broadcast(&condition_); }
153    void Wait() { pthread_cond_wait(&condition_, &mutex_); }  //pthread_cond_wait() 用于阻塞当前线程,等待别的线程使用pthread_cond_signal()或pthread_cond_broadcast来唤醒它
...
165   private:
    //这个flag很重要,上面各种锁会设置不同状态,SequenceWaitScopeLock会判断flag进行等待;
169    int sequence_wait_;   // This flag is set to 1 on sequence entry, 0 on exit, and -1 on cancel.
170                          // Some routines will wait for sequence of function calls to finish
171                          // so that capturing a transitionary snapshot of context is prevented.
172                          // If flag is set to -1, these routines will exit without doing any
173                          // further processing.
		}

从上可以看出SequenceWaitScopeLock判断了sequence_wait_的值为1时,会一直等待,这就是DestroyPluggableDisplay卡在SEQUENCE_WAIT_SCOPE_LOCK的原因,而什么会让 sequence_wait _为1呢?可以看到SequenceEntryScopeLock时会让其变为1,所以就需要SequenceExitScopeLock或SequenceCancelScopeLock来给解除wait状态;由此推断出某些地方Entry了但没有Exit或Cancel;

跟踪Entry,发现只在ValidateDisplay中会有调用Entry:

1218  int32_t HWCSession::ValidateDisplay(hwc2_display_t display, uint32_t *out_num_types,
1219                                      uint32_t *out_num_requests) {
...
1240      SEQUENCE_ENTRY_SCOPE_LOCK(locker_[target_display]);
1241      if (pending_power_mode_[display]) {
1242        status = HWC2::Error::None;
1243      } else if (hwc_display_[target_display]) {
1244        hwc_display_[target_display]->ProcessActiveConfigChange();
1245        hwc_display_[target_display]->SetFastPathComposition(false);
1246        status = ValidateDisplayInternal(target_display, out_num_types, out_num_requests);
1247      }
1248    }

Exit和Cancel的位置:

750  int32_t HWCSession::PresentDisplay(hwc2_display_t display, shared_ptr<Fence> *out_retire_fence) {
...
       {
774      SEQUENCE_EXIT_SCOPE_LOCK(locker_[target_display]);
775      if (!hwc_display_[target_display]) {
776        DLOGW("Removed Display : display = %" PRIu64, target_display);
777        return HWC2_ERROR_BAD_DISPLAY;
778      }
		}
...
804    	if (status != HWC2::Error::None && status != HWC2::Error::NotValidated) {
805      SEQUENCE_CANCEL_SCOPE_LOCK(locker_[target_display]);
806    	 }
	    }
...
1218  int32_t HWCSession::ValidateDisplay(hwc2_display_t display, uint32_t *out_num_types,
1219                                      uint32_t *out_num_requests) {
1220    //  out_num_types and out_num_requests will be non-NULL
1221  
...
1239    {
1240      SEQUENCE_ENTRY_SCOPE_LOCK(locker_[target_display]);
1241      if (pending_power_mode_[display]) {
1242        status = HWC2::Error::None;
1243      } else if (hwc_display_[target_display]) {
1244        hwc_display_[target_display]->ProcessActiveConfigChange();
1245        hwc_display_[target_display]->SetFastPathComposition(false);
1246        status = ValidateDisplayInternal(target_display, out_num_types, out_num_requests);
1247      }
1248    }
...
1250    // Sequence locking currently begins on Validate, so cancel the sequence lock on failures
1251    if (status != HWC2::Error::None && status != HWC2::Error::HasChanges) {
1252      SEQUENCE_CANCEL_SCOPE_LOCK(locker_[target_display]);
1253    }
...
1255    if (display != target_display) {
1256      // Validate done on a dummy display. Assume present is complete.
1257      SEQUENCE_EXIT_SCOPE_LOCK(locker_[target_display]);
1258    }
1259  
1260    return INT32(status);
1261  }

给ValidateDisplay加log,发现在单屏送显的时候就会调用ValidateDisplay;插入DP和拔出也会调用;Trance可以看文件中;
正常滑动刷屏时只会调用SEQUENCE_ENTRY_SCOPE_LOCK而CANCEL和EXIT都不会调用;
插入DP和拔出也只会调用ENTRY;所以EXIT LOCK都是由PresentDisplay完成的;

正常拔出时的log:

07-01 12:00:25.529  1188  1222 I SDM     : HWCSession::DestroyPluggableDisplay: Notify hotplug display disconnected: client id = 1
07-01 12:00:25.514     0     0 I [drm:dp_display_send_hpd_event][msm-dp-info][940 ][name=DP-1]: [status=disconnected] [bpp=0] [pattern=0]
07-01 12:00:25.570  1267  1267 W HWComposer: Attempted to disconnect unknown display 1
07-01 12:00:25.570  1267  1267 D DisplayDevice: getVsyncPeriodFromHWC: value is cached. return 8333333
07-01 12:00:25.570  1188  1222 W SDM     : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:callbacks_Hotplug out weisi
07-01 12:00:25.571  1188  1222 W SDM     : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:got scope lock
07-01 12:00:25.571  1188  1222 W SDM     : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:got wait scope lock

异常拔出时的log:

07-01 13:02:01.235  1187  1240 I SDM     : HWCSession::DestroyPluggableDisplay: Notify hotplug display disconnected: client id = 1   

↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓    在DestroyPluggableDisplay中插入了validateDisplay函数
07-01 13:02:01.239  1187  1187 E SDM     : HWCSession::ValidateDisplay: ValidateDisplay IN --weisi
07-01 13:02:01.239  1187  1187 E SDM     : HWCSession::ValidateDisplay: ValidateDisplay 1111 --weisi
07-01 13:02:01.239  1187  1187 E SDM     : HWCSession::ValidateDisplay: ValidateDisplay out --weisi
↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑    在DestroyPluggableDisplay中插入了validateDisplay函数

07-01 13:02:01.249  1273  1273 W HWComposer: Attempted to disconnect unknown display 1
07-01 13:02:01.250  1273  1273 D DisplayDevice: getVsyncPeriodFromHWC: value is cached. return 8333333
07-01 13:02:01.272  1187  1240 W SDM     : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:callbacks_Hotplug out weisi
07-01 13:02:01.273  1187  1240 W SDM     : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:got scope lock

对比正常的拔出log,异常时在DestroyPluggableDisplay中插入了validateDisplay函数,使sequence_wait_的值变成了1,导致DestroyPluggableDisplay调用SEQUENCE_WAIT_SCOPE_LOCK时陷入while死循环中,而未能调到PresentDisplay中的CANCEL或EXIT函数,所以造成死锁!!为验证猜想,在PresentDisplay中加入更多的log再次复现:

正常拔出:

07-01 20:48:46.967 1181 1215 W SDM : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:callbacks_Hotplug out weisi
07-01 20:48:46.967 1181 1215 W SDM : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:got scope lock for id:1
07-01 20:48:46.967 1181 1215 W SDM : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:got wait scope lock for 1
07-01 20:48:46.967 1181 1215 I SDM : HWCSession::DestroyPluggableDisplay: Destroy display 57-1, client id = 1

异常拔出:

07-01 20:50:23.690 1181 1215 I SDM : HWCSession::DestroyPluggableDisplay: Notify hotplug display disconnected: client id = 1
07-01 20:50:23.691 1181 1181 E SDM : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:1

↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓
07-01 20:50:23.691 1181 1181 E SDM : HWCSession::ValidateDisplay: ValidateDisplay IN display ID: 1 --weisi
07-01 20:50:23.691 1181 1181 E SDM : HWCSession::ValidateDisplay: ValidateDisplay out --weisi
↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑ DestroyPluggableDisplay中插入了ValidateDisplay,操作的是display 1

07-01 20:50:23.696 1265 1265 W HWComposer: Attempted to disconnect unknown display 1
07-01 20:50:23.696 1265 1265 D DisplayDevice: getVsyncPeriodFromHWC: value is cached. return 8333333

↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓
07-01 20:50:23.715 1181 1181 E SDM : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:0
07-01 20:50:23.725 1181 1181 E SDM : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:0
07-01 20:50:23.730 1181 1181 E SDM : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:0
↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑ 这里只EXIT了display 0的锁;

07-01 20:50:23.734 1181 1215 W SDM : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:callbacks_Hotplug out weisi

07-01 20:50:23.734 1181 1215 W SDM : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:got scope lock for id:1

加上display ID后,可以看到,异常时调用DestroyPluggableDisplay操作的是display 1,也就是给Display 1的sequence_wait_变为1,而后面的PresentDisplay只EXIT了display 0的锁;所以接下来DestroyPluggableDisplay调用SEQUENCE_WAIT_SCOPE_LOCK会进入wait等待;

对比R的log,发现ValidateDisplay很少调用:

02-14 11:29:50.200  1071  1202 E SDM     : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:1
02-14 11:29:50.213  1071  1202 E SDM     : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:0
02-14 11:29:50.216  1071  1202 E SDM     : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:1
02-14 11:29:50.231  1071  1202 E SDM     : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:0
02-14 11:29:50.232  1071  1126 I SDM     : HWCSession::HandlePluggableDisplays: Handling hotplug...
02-14 11:29:50.235  1071  1202 E SDM     : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:1
02-14 11:29:50.240  1071  1202 E SDM     : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:0
02-14 11:29:50.241  1071  1126 I SDM     : HWCSession::DestroyPluggableDisplay: Notify hotplug display disconnected: client id = 1
02-14 11:29:50.242  1071  1202 E SDM     : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:1
02-14 11:29:50.252  1071  1205 E SDM     : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:0
02-14 11:29:50.264  1071  1205 E SDM     : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:0
02-14 11:29:50.274  1071  1205 E SDM     : HWCSession::PresentDisplay: PresentDisplay EXIT LOCK DONE for weisi display ID:0
02-14 11:29:50.276  1071  1126 W SDM     : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:callbacks_Hotplug out weisi
02-14 11:29:50.276  1071  1126 W SDM     : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:got scope lock for id:1
02-14 11:29:50.276  1071  1126 W SDM     : HWCSession::DestroyPluggableDisplay: DestroyPluggableDisplay:got wait scope lock for 1
02-14 11:29:50.277  1071  1126 I SDM     : HWCSession::DestroyPluggableDisplay: Destroy display 57-1, client id = 1
02-14 11:29:50.325  1071  1126 I SDM     : HWCSession::HandlePluggableDisplays: Handling hotplug... Done.
02-14 11:29:50.325  1071  1126 I SDM     : HWCSession::UEventHandler: Connected = 0

高通接受死锁,将destroy内的wait lock改为CANCEL LOCK;问题解决

  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值