这次挂起现象是每日测试集在执行load数据时挂起了,查看日志发现数据库也不checkpoint了,自己分析了下,都是浮云啊。。。。经过大神经理的一番教导,恍然大悟,赶紧把这次的排查过程记录下来
有用的几个jstack信息如下,我先过滤出来:
第一段jstack,也是最重要的一部分
"CloudWave Tablet Server Compact Task Thread:0" prio=10 tid=0x00007f013c015800 nid=0x382a waiting on condition [0x00007f00d79f7000]
java.lang.Thread.State: WAITING (parking)
at sun.misc.Unsafe.park(Native Method)
- parking to wait for <0x00000007733aac00> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
at java.util.concurrent.locks.LockSupport.park(LockSupport.java:186)
at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2043)
at java.util.concurrent.LinkedBlockingQueue.put(LinkedBlockingQueue.java:349)
at com.cloudwave.server.master.MasterServer$BlockPolicy.rejectedExecution(MasterServer.java:11401)
at java.util.concurrent.ThreadPoolExecutor.reject(ThreadPoolExecutor.java:821)
at java.util.concurrent.ThreadPoolExecutor.execute(ThreadPoolExecutor.java:1372)
at com.cloudwave.server.tablet.TabletServer.publishCompactTabletTask(TabletServer.java:4146)
at com.cloudwave.server.tablet.MemoryTableManager.register(MemoryTableManager.java:88)
at com.cloudwave.server.tablet.BaseTablet.setupMemoryTable(BaseTablet.java:220)
at com.cloudwave.server.tablet.BaseTablet.mergeCompact(BaseTablet.java:1267)
at com.cloudwave.server.tablet.BaseTablet.compact(BaseTablet.java:763)
at com.cloudwave.server.tablet.TabletServer.compactTablet(TabletServer.java:13740)
at com.cloudwave.server.tablet.CompactTabletTask.run(CompactTabletTask.java:25)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
"CloudWave Master Server Instant Thread:8" prio=10 tid=0x00007f016c84d800 nid=0x37d1 waiting on condition [0x00007f01509e8000]
java.lang.Thread.State: WAITING (parking)
at sun.misc.Unsafe.park(Native Method)
- parking to wait for <0x00000007733aac00> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
at java.util.concurrent.locks.LockSupport.park(LockSupport.java:186)
at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2043)
at java.util.concurrent.LinkedBlockingQueue.put(LinkedBlockingQueue.java:349)
at com.cloudwave.server.master.MasterServer$BlockPolicy.rejectedExecution(MasterServer.java:11401)
at java.util.concurrent.ThreadPoolExecutor.reject(ThreadPoolExecutor.java:821)
at java.util.concurrent.ThreadPoolExecutor.execute(ThreadPoolExecutor.java:1372)
at com.cloudwave.server.tablet.TabletServer.publishCompactTabletTask(TabletServer.java:4146)
at com.cloudwave.server.tablet.MemoryTableManager.register(MemoryTableManager.java:88)
at com.cloudwave.server.tablet.BaseTablet.insert(BaseTablet.java:347)
at com.cloudwave.server.tablet.TabletServer.insertBatch(TabletServer.java:1281)
at com.cloudwave.server.tablet.client.TabletLocalConnection.insertBatch(TabletLocalConnection.java:2445)
at com.cloudwave.server.master.MasterServer.insertBatch(MasterServer.java:7172)
- locked <0x00000007a8938a58> (a com.cloudwave.server.master.TabletInfo)
at com.cloudwave.server.master.MasterServer.insertBatch(MasterServer.java:5784)
at com.cloudwave.server.master.MasterServer.insertBatch(MasterServer.java:5342)
at com.cloudwave.server.sql.engine.UpdateEngine.handleInsertBatch(UpdateEngine.java:1221)
at com.cloudwave.server.kernel.DataLoadRequest.run(DataLoadRequest.java:187)
- locked <0x00000007bf917390> (a com.cloudwave.server.kernel.ServerSession)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
"CloudWave Master Server Instant Thread:3" prio=10 tid=0x00007f00f4324800 nid=0x37c4 waiting on condition [0x00007f01516f4000]
java.lang.Thread.State: WAITING (parking)
at sun.misc.Unsafe.park(Native Method)
- parking to wait for <0x00000007733aac00> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
at java.util.concurrent.locks.LockSupport.park(LockSupport.java:186)
at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2043)
at java.util.concurrent.LinkedBlockingQueue.put(LinkedBlockingQueue.java:349)
at com.cloudwave.server.master.MasterServer$BlockPolicy.rejectedExecution(MasterServer.java:11401)
at java.util.concurrent.ThreadPoolExecutor.reject(ThreadPoolExecutor.java:821)
at java.util.concurrent.ThreadPoolExecutor.execute(ThreadPoolExecutor.java:1372)
at com.cloudwave.server.tablet.TabletServer.publishCompactTabletTask(TabletServer.java:4146)
at com.cloudwave.server.tablet.MemoryTableManager.register(MemoryTableManager.java:88)
at com.cloudwave.server.tablet.BaseTablet.insert(BaseTablet.java:347)
at com.cloudwave.server.tablet.TabletServer.insertBatch(TabletServer.java:1281)
at com.cloudwave.server.tablet.client.TabletLocalConnection.insertBatch(TabletLocalConnection.java:2445)
at com.cloudwave.server.master.MasterServer.insertBatch(MasterServer.java:7172)
- locked <0x00000007a2868ad8> (a com.cloudwave.server.master.TabletInfo)
at com.cloudwave.server.master.MasterServer.insertBatch(MasterServer.java:5784)
at com.cloudwave.server.master.MasterServer.insertBatch(MasterServer.java:5325)
at com.cloudwave.server.sql.engine.UpdateEngine.handleInsertBatch(UpdateEngine.java:1221)
at com.cloudwave.server.kernel.DataLoadRequest.run(DataLoadRequest.java:187)
- locked <0x00000007a388a0f0> (a com.cloudwave.server.kernel.ServerSession)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
"CloudWave Master Server Check Point Thread" prio=10 tid=0x00007f016c149000 nid=0x3789 waiting on condition [0x00007f0158989000]
java.lang.Thread.State: TIMED_WAITING (sleeping)
at java.lang.Thread.sleep(Native Method)
at com.cloudwave.server.master.CheckPointTask.run(CheckPointTask.java:90)
at com.cloudwave.server.master.CheckPointThread.run(CheckPointThread.java:28)
"CloudWave Tablet Server Checkpoint Task Thread:0" prio=10 tid=0x00007f0104001800 nid=0x3785 waiting on condition [0x00007f0158d8d000]
java.lang.Thread.State: WAITING (parking)
at sun.misc.Unsafe.park(Native Method)
- parking to wait for <0x00000007a2856330> (a java.util.concurrent.locks.ReentrantReadWriteLock$NonfairSync)
at java.util.concurrent.locks.LockSupport.park(LockSupport.java:186)
at java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:834)
at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(AbstractQueuedSynchronizer.java:867)
at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(AbstractQueuedSynchronizer.java:1197)
at java.util.concurrent.locks.ReentrantReadWriteLock$WriteLock.lock(ReentrantReadWriteLock.java:945)
at com.cloudwave.server.tablet.BaseTablet.compact(BaseTablet.java:746)
at com.cloudwave.server.tablet.TabletServer.compactAllTablets(TabletServer.java:13673)
at com.cloudwave.server.tablet.TabletServerCompactTask.run(TabletServerCompactTask.java:23)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
- 从第一个jstack代码块开始分析,看看是谁调用了CompactTabletTask线程,发现只有2个地方调用
- 两个调用地方的区别是:TabletServer.publishCompactTabletTask是放到线程池中执行,TabletServer.executeCompactTabletTask是直接调用run()方法。根据jstack,这里是通过线程池执行的,所以调用的方法是publishCompactTabletTask(TabletId tabletId)。再看这个线程池是compactExecutor,ctrl+F继续找,看看它是怎么定义的
可以看出这个线程池只有1个线程在执行任务,线程池使用的缓冲队列workQueue容量是8,如果有更多的请求进来,提交请求的线程就会被阻塞在外面等待
原理差不多是这样的:
- 根据jstack继续往下看
查看BaseTablet的第220行。memoryTable是为物理表在内存中构造的一张映射表,现在需要去注册。一直跟到这里
这里是重点
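这里需要把memory中的数据removed掉,也就是写到硬盘。当执行到 tabletServer.publishCompactTabletTask(removedTabletId); 时,Compact线程自己又向compactExecutor提交了一个新的Compact任务;而此时workQueue已满,BlockPolicy会调用LinkedBlockingQueue.put一直阻塞,可线程池里唯一能消费队列的线程正是它自己,于是Compact线程永远等不到队列腾出空位,后面的insert线程也都阻塞在同一个队列上,load就挂起了。Checkpoint线程则卡在BaseTablet.compact里等写锁(应该是被上面阻塞住的线程持有,待确认),所以数据库也不再checkpoint了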
现在有两种处理方案
1. 抛弃上面那种异步处理,让Compact线程直接写数据,但是经过我们后来的评估,这种方案会影响性能
2. 直接将线程池的线程放大,并且将workQueue增加数量,可以缓解这个问题,这个就是追求一个请求和处理的平衡,但是担心多线程compact会引发其他不可控的现象。
这次暴露这个问题是因为把memoryTableCache分成了两部分,间接地加快了Compact请求的产生