一、状况描述
XXXX项目组反馈系统在线人数达到160或以上后,会出现较严重的系统效率问题:
单据节点的打开达到分钟级
二、现场分析
2.1检查系统日志文件发现:
Exception = java.net.SocketTimeoutException
Source = com.ibm.ws.webcontainer.channel.WCCByteBufferInputStream
probeid = 102
Stack Dump = java.net.SocketTimeoutException: Async operation timed out
at com.ibm.ws.tcp.channel.impl.AioTCPReadRequestContextImpl.processSyncReadRequest(AioTCPReadRequestContextImpl.java:157)
at com.ibm.ws.tcp.channel.impl.TCPReadRequestContextImpl.read(TCPReadRequestContextImpl.java:109)
at com.ibm.ws.http.channel.impl.HttpServiceContextImpl.fillABuffer(HttpServiceContextImpl.java:4139)
at com.ibm.ws.http.channel.impl.HttpServiceContextImpl.readSingleBlock(HttpServiceContextImpl.java:3383)
at com.ibm.ws.http.channel.impl.HttpServiceContextImpl.readBodyBuffer(HttpServiceContextImpl.java:3488)
at com.ibm.ws.http.channel.inbound.impl.HttpInboundServiceContextImpl.getRequestBodyBuffer(HttpInboundServiceContextImpl.java:1606)
at com.ibm.ws.webcontainer.channel.WCCByteBufferInputStream.bufferIsGood(WCCByteBufferInputStream.java:109)
at com.ibm.ws.webcontainer.channel.WCCByteBufferInputStream.read(WCCByteBufferInputStream.java:79)
at com.ibm.ws.webcontainer.srt.http.HttpInputStream.read(HttpInputStream.java:294)
at java.io.BufferedInputStream.fill(BufferedInputStream.java:218)
at java.io.BufferedInputStream.read(BufferedInputStream.java:235)
at nc.bs.framework.comn.NetObjectInputStream.readInt(NetObjectInputStream.java:275)
at nc.bs.framework.comn.NetObjectInputStream.readObject(NetObjectInputStream.java:247)
at nc.bs.framework.comn.serv.ServiceDispatcher.execCall(ServiceDispatcher.java:91)
at nc.bs.framework.comn.serv.CommonServletDispatcher.doGet(CommonServletDispatcher.java:76)
at nc.bs.framework.comn.serv.CommonServletDispatcher.doPost(CommonServletDispatcher.java:95)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:763)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:856)
at com.ibm.ws.webcontainer.servlet.ServletWrapper.service(ServletWrapper.java:989)
at com.ibm.ws.webcontainer.servlet.ServletWrapper.service(ServletWrapper.java:930)
at com.ibm.ws.webcontainer.filter.WebAppFilterChain.doFilter(WebAppFilterChain.java:145)
at nc.bs.framework.server.LoggerServletFilter.doFilter(LoggerServletFilter.java:32)
at com.ibm.ws.webcontainer.filter.FilterInstanceWrapper.doFilter(FilterInstanceWrapper.java:190)
at com.ibm.ws.webcontainer.filter.WebAppFilterChain.doFilter(WebAppFilterChain.java:130)
at com.ibm.ws.webcontainer.filter.WebAppFilterChain._doFilter(WebAppFilterChain.java:87)
at com.ibm.ws.webcontainer.filter.WebAppFilterManager.doFilter(WebAppFilterManager.java:761)
at com.ibm.ws.webcontainer.filter.WebAppFilterManager.doFilter(WebAppFilterManager.java:673)
at com.ibm.ws.webcontainer.servlet.ServletWrapper.handleRequest(ServletWrapper.java:498)
at com.ibm.ws.wswebcontainer.servlet.ServletWrapper.handleRequest(ServletWrapper.java:464)
at com.ibm.ws.webcontainer.servlet.CacheServletWrapper.handleRequest(CacheServletWrapper.java:90)
at com.ibm.ws.webcontainer.WebContainer.handleRequest(WebContainer.java:744)
at com.ibm.ws.wswebcontainer.WebContainer.handleRequest(WebContainer.java:1455)
at com.ibm.ws.webcontainer.channel.WCChannelLink.ready(WCChannelLink.java:113)
at com.ibm.ws.http.channel.inbound.impl.HttpInboundLink.handleDiscrimination(HttpInboundLink.java:454)
at com.ibm.ws.http.channel.inbound.impl.HttpInboundLink.handleNewInformation(HttpInboundLink.java:383)
at com.ibm.ws.http.channel.inbound.impl.HttpICLReadCallback.complete(HttpICLReadCallback.java:102)
at com.ibm.ws.tcp.channel.impl.AioReadCompletionListener.futureCompleted(AioReadCompletionListener.java:165)
at com.ibm.io.async.AbstractAsyncFuture.invokeCallback(AbstractAsyncFuture.java:217)
at com.ibm.io.async.AsyncChannelFuture.fireCompletionActions(AsyncChannelFuture.java:161)
at com.ibm.io.async.AsyncFuture.completed(AsyncFuture.java:136)
at com.ibm.io.async.ResultHandler.complete(ResultHandler.java:195)
at com.ibm.io.async.ResultHandler.runEventProcessingLoop(ResultHandler.java:743)
at com.ibm.io.async.ResultHandler$2.run(ResultHandler.java:873)
at com.ibm.ws.util.ThreadPool$Worker.run(ThreadPool.java:1469)
Caused by: com.ibm.io.async.AsyncTimeoutException(Async operation timed out, [Timeout, rc=0])
at com.ibm.io.async.AbstractAsyncFuture.waitForCompletion(AbstractAsyncFuture.java:359)
at com.ibm.io.async.AsyncFuture.getByteCount(AsyncFuture.java:216)
at com.ibm.ws.tcp.channel.impl.AioSocketIOChannel.readAIOSync(AioSocketIOChannel.java:214)
at com.ibm.ws.tcp.channel.impl.AioTCPReadRequestContextImpl.processSyncReadRequest(AioTCPReadRequestContextImpl.java:150)
... 43 more
系统抛出 java.net.SocketTimeoutException,说明系统无法及时分配新的连接,请求因此进入等待状态(waitTime)。
检查WAS设置发现:WebContainer线程池配置为30~70(最小值~最大值)。开启WAS PMI后对系统线程池进行监控,发现当出现问题时,线程池被耗尽。持续跟踪后发现WAS FFDC中类似错误增多。
跟踪系统线程池堆栈信息发现:
master(端口:9080)服务器应用线程较多,线程池超过70
ncMem01(端口:9081)预警调度、短信网关、应用,线程池超过70
ncMem02 (端口:9082)应用线程,线程池未超
解决方案:
修改系统调度分配:单点登录不再单独使用9080端口,改为采用Apache分配模式;同时将WebContainer线程池参数扩大为50~150(最小值~最大值)。
2.2 观察Java GC日志,发现每次回收间隔都较长
解决方案:
查看WAS配置,JVM参数为 -Xms512m -Xmx2048m。分析发现内存中的碎片太多,导致GC频繁,使服务的响应速度变慢。修改JVM参数为 -Xms768m -Xmx1536m。
2.3 更改后,系统在业务高峰期的效率稍有提高。使用JVM dump分析工具查看到:
com.ibm.ejs.j2c.PoolManager 调用 com.ibm.db2.jcc占用比例持续增长
查看数据库状况,发现数据库索引的schema为DB2ADMIN,而索引所属表的schema为DB2INST1,两者不一致。导出DDL举例:
CREATE INDEX DB2ADMIN.I1_FLOW_WORKNOTE_1
ON DB2INST1.PUB_WORKFLOWNOTE
("BILLID" ASC,
"APPROVESTATUS" ASC
)
PCTFREE 10
ALLOW REVERSE SCANS;
另外:数据库大表所在的表空间只存放于一个数据文件中,各大表的数据量如下:
346791 rows from table "DB2INST1"."BD_CHINAMOBLE_ACCSUBJ"
484349 rows from table "DB2INST1"."CMER_ASSET_DISINFO"
373461 rows from table "DB2INST1"."CMER_ASSET_INFO"
452050 rows from table "DB2INST1"."INTERFACE_PO_DISTR_ORDER"
158921 rows from table "DB2INST1"."INTERFACE_PO_LINE_ORDER"
50758 rows from table "DB2INST1"."INTERFACE_PO_ORDER"
176378 rows from table "DB2INST1"."INTERFACE_PO_SHIPMENTS_ORDER"
562092 rows from table "DB2INST1"."INTERFACE_PROJECT_B"
352812 rows from table "DB2INST1"."INTERFACE_QUEUELOGGER"
407429 rows from table "DB2INST1"."PUB_WF_ACTINSTANCE"
361424 rows from table "DB2INST1"."PUB_WF_ACTINSTANCESRC"
367895 rows from table "DB2INST1"."PUB_WF_TASK"
407811 rows from table "DB2INST1"."PUB_WORKFLOWNOTE"
数据库对以上表的使用较频繁,当业务高峰时,导致单文件读写时的I/O瓶颈。
解决方案:
重建DB2索引,并修改大表分区,将使用频繁的大表平均分布至表空间的三个数据文件中,并更新DB2统计信息。