一次挂起现象的分析

这次挂起现象发生在每日测试集执行load数据时:load挂起了,查看日志发现数据库也不再checkpoint了。自己分析了一下,都是浮云啊。。。。经过大神经理的一番教导,恍然大悟,赶紧把此次历程记录下来

有用的几个jstack信息如下,我先过滤出来:

第一段jstack,也是最重要的一部分

"CloudWave Tablet Server Compact Task Thread:0" prio=10 tid=0x00007f013c015800 nid=0x382a waiting on condition [0x00007f00d79f7000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000007733aac00> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:186)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2043)
        at java.util.concurrent.LinkedBlockingQueue.put(LinkedBlockingQueue.java:349)
        at com.cloudwave.server.master.MasterServer$BlockPolicy.rejectedExecution(MasterServer.java:11401)
        at java.util.concurrent.ThreadPoolExecutor.reject(ThreadPoolExecutor.java:821)
        at java.util.concurrent.ThreadPoolExecutor.execute(ThreadPoolExecutor.java:1372)
        at com.cloudwave.server.tablet.TabletServer.publishCompactTabletTask(TabletServer.java:4146)
        at com.cloudwave.server.tablet.MemoryTableManager.register(MemoryTableManager.java:88)
        at com.cloudwave.server.tablet.BaseTablet.setupMemoryTable(BaseTablet.java:220)
        at com.cloudwave.server.tablet.BaseTablet.mergeCompact(BaseTablet.java:1267)
        at com.cloudwave.server.tablet.BaseTablet.compact(BaseTablet.java:763)
        at com.cloudwave.server.tablet.TabletServer.compactTablet(TabletServer.java:13740)
        at com.cloudwave.server.tablet.CompactTabletTask.run(CompactTabletTask.java:25)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)
"CloudWave Master Server Instant Thread:8" prio=10 tid=0x00007f016c84d800 nid=0x37d1 waiting on condition [0x00007f01509e8000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000007733aac00> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:186)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2043)
        at java.util.concurrent.LinkedBlockingQueue.put(LinkedBlockingQueue.java:349)
        at com.cloudwave.server.master.MasterServer$BlockPolicy.rejectedExecution(MasterServer.java:11401)
        at java.util.concurrent.ThreadPoolExecutor.reject(ThreadPoolExecutor.java:821)
        at java.util.concurrent.ThreadPoolExecutor.execute(ThreadPoolExecutor.java:1372)
        at com.cloudwave.server.tablet.TabletServer.publishCompactTabletTask(TabletServer.java:4146)
        at com.cloudwave.server.tablet.MemoryTableManager.register(MemoryTableManager.java:88)
        at com.cloudwave.server.tablet.BaseTablet.insert(BaseTablet.java:347)
        at com.cloudwave.server.tablet.TabletServer.insertBatch(TabletServer.java:1281)
        at com.cloudwave.server.tablet.client.TabletLocalConnection.insertBatch(TabletLocalConnection.java:2445)
        at com.cloudwave.server.master.MasterServer.insertBatch(MasterServer.java:7172)
        - locked <0x00000007a8938a58> (a com.cloudwave.server.master.TabletInfo)
        at com.cloudwave.server.master.MasterServer.insertBatch(MasterServer.java:5784)
        at com.cloudwave.server.master.MasterServer.insertBatch(MasterServer.java:5342)
        at com.cloudwave.server.sql.engine.UpdateEngine.handleInsertBatch(UpdateEngine.java:1221)
        at com.cloudwave.server.kernel.DataLoadRequest.run(DataLoadRequest.java:187)
        - locked <0x00000007bf917390> (a com.cloudwave.server.kernel.ServerSession)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)




"CloudWave Master Server Instant Thread:3" prio=10 tid=0x00007f00f4324800 nid=0x37c4 waiting on condition [0x00007f01516f4000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000007733aac00> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:186)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2043)
        at java.util.concurrent.LinkedBlockingQueue.put(LinkedBlockingQueue.java:349)
        at com.cloudwave.server.master.MasterServer$BlockPolicy.rejectedExecution(MasterServer.java:11401)
        at java.util.concurrent.ThreadPoolExecutor.reject(ThreadPoolExecutor.java:821)
        at java.util.concurrent.ThreadPoolExecutor.execute(ThreadPoolExecutor.java:1372)
        at com.cloudwave.server.tablet.TabletServer.publishCompactTabletTask(TabletServer.java:4146)
        at com.cloudwave.server.tablet.MemoryTableManager.register(MemoryTableManager.java:88)
        at com.cloudwave.server.tablet.BaseTablet.insert(BaseTablet.java:347)
        at com.cloudwave.server.tablet.TabletServer.insertBatch(TabletServer.java:1281)
        at com.cloudwave.server.tablet.client.TabletLocalConnection.insertBatch(TabletLocalConnection.java:2445)
        at com.cloudwave.server.master.MasterServer.insertBatch(MasterServer.java:7172)
        - locked <0x00000007a2868ad8> (a com.cloudwave.server.master.TabletInfo)
        at com.cloudwave.server.master.MasterServer.insertBatch(MasterServer.java:5784)
        at com.cloudwave.server.master.MasterServer.insertBatch(MasterServer.java:5325)
        at com.cloudwave.server.sql.engine.UpdateEngine.handleInsertBatch(UpdateEngine.java:1221)
        at com.cloudwave.server.kernel.DataLoadRequest.run(DataLoadRequest.java:187)
        - locked <0x00000007a388a0f0> (a com.cloudwave.server.kernel.ServerSession)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)



"CloudWave Master Server Check Point Thread" prio=10 tid=0x00007f016c149000 nid=0x3789 waiting on condition [0x00007f0158989000]
   java.lang.Thread.State: TIMED_WAITING (sleeping)
        at java.lang.Thread.sleep(Native Method)
        at com.cloudwave.server.master.CheckPointTask.run(CheckPointTask.java:90)
        at com.cloudwave.server.master.CheckPointThread.run(CheckPointThread.java:28)       


"CloudWave Tablet Server Checkpoint Task Thread:0" prio=10 tid=0x00007f0104001800 nid=0x3785 waiting on condition [0x00007f0158d8d000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000007a2856330> (a java.util.concurrent.locks.ReentrantReadWriteLock$NonfairSync)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:186)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:834)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(AbstractQueuedSynchronizer.java:867)
        at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(AbstractQueuedSynchronizer.java:1197)
        at java.util.concurrent.locks.ReentrantReadWriteLock$WriteLock.lock(ReentrantReadWriteLock.java:945)
        at com.cloudwave.server.tablet.BaseTablet.compact(BaseTablet.java:746)
        at com.cloudwave.server.tablet.TabletServer.compactAllTablets(TabletServer.java:13673)
        at com.cloudwave.server.tablet.TabletServerCompactTask.run(TabletServerCompactTask.java:23)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)        
  1. 从第一个jstack代码块的开头开始分析,看看是谁调用了CompactTabletTask线程,发现只有2个地方调用:这里写图片描述
    这里写图片描述
  2. 两个调用地方的区别是:TabletServer.publishCompactTabletTask是放到线程池中执行,TabletServer.executeCompactTabletTask是直接调用run()方法。根据jstack,这里是通过线程池执行的,所以调用的方法是publishCompactTabletTask(TabletId tabletId)。查看一下,线程池是compactExecutor,ctrl+F继续找,看看它是怎么定义的:这里写图片描述
    可以看出,这个线程池只有1个工作线程负责执行,线程池所使用的缓冲队列workQueue容量是8个;如果有更多的请求进来,提交请求的线程会被阻塞在外面等待(对应jstack中的BlockPolicy.rejectedExecution -> LinkedBlockingQueue.put)
    原理差不多是这样的:
    这里写图片描述
  3. 根据jstack继续往下看
    查看BaseTablet第220行。memoryTable是物理表在内存中映射出来的一张表,现在需要去注册(MemoryTableManager.register)。一直跟到这里:
    这里写图片描述
    这里是重点
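    这里需要把memory中的数据removed,也就是写到硬盘。当Compact线程自己执行到 tabletServer.publishCompactTabletTask(removedTabletId); 时,如果workQueue已经满了,BlockPolicy会让它阻塞在LinkedBlockingQueue.put上;而它正是线程池中唯一负责消费这个队列的线程,于是队列永远腾不出空位,Compact线程和所有插入线程都在等待同一个条件(jstack中的0x00000007733aac00),整个系统就挂起了。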
    这里写图片描述

现在有两种处理方案
1. 抛弃上面那种异步处理,让Compact线程直接写数据,但是我们后期计算后发现,这种方案会影响性能
2. 直接将线程池的线程数调大,并且增大workQueue的容量,可以缓解这个问题。这其实是在追求请求和处理之间的平衡,但是担心多线程compact会引发其他不可控的现象。

这次暴露这个问题,是因为把memoryTableCache分成了两部分,间接地加快了Compact请求

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值