一.问题 描述
使用xxl-job 日志显示调度成功,但执行器就是没有执行,且没有结果回调,日志如下图。
二.排查原因
查看xxl-job客户端(执行器)中的源代码JobThread,发现调度和任务执行依靠队列 triggerQueue,这个是参数队列,调度直接往这个队列里面put参数对象。执行依靠是JobThread run方法,从队列中获取参数对象,能获取就执行,任务和线程是一一绑定的关系,调度成功,但没有结果回调。可以发现任务卡在执行任务当中handler.execute();
package com.xxl.job.core.thread;
import com.xxl.job.core.biz.model.HandleCallbackParam;
import com.xxl.job.core.biz.model.ReturnT;
import com.xxl.job.core.biz.model.TriggerParam;
import com.xxl.job.core.context.XxlJobContext;
import com.xxl.job.core.context.XxlJobHelper;
import com.xxl.job.core.executor.XxlJobExecutor;
import com.xxl.job.core.handler.IJobHandler;
import com.xxl.job.core.log.XxlJobFileAppender;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.*;
/**
* handler thread
* @author xuxueli 2016-1-16 19:52:47
*/
public class JobThread extends Thread{
private static Logger logger = LoggerFactory.getLogger(JobThread.class);
private int jobId;
private IJobHandler handler;
private LinkedBlockingQueue<TriggerParam> triggerQueue;
private Set<Long> triggerLogIdSet; // avoid repeat trigger for the same TRIGGER_LOG_ID
private volatile boolean toStop = false;
private String stopReason;
private boolean running = false; // if running job
private int idleTimes = 0; // idel times
public JobThread(int jobId, IJobHandler handler) {
this.jobId = jobId;
this.handler = handler;
this.triggerQueue = new LinkedBlockingQueue<TriggerParam>();
this.triggerLogIdSet = Collections.synchronizedSet(new HashSet<Long>());
}
public IJobHandler getHandler() {
return handler;
}
/**
* new trigger to queue
*
* @param triggerParam
* @return
*/
public ReturnT<String> pushTriggerQueue(TriggerParam triggerParam) {
// avoid repeat
if (triggerLogIdSet.contains(triggerParam.getLogId())) {
logger.info(">>>>>>>>>>> repeate trigger job, logId:{}", triggerParam.getLogId());
return new ReturnT<String>(ReturnT.FAIL_CODE, "repeate trigger job, logId:" + triggerParam.getLogId());
}
triggerLogIdSet.add(triggerParam.getLogId());
triggerQueue.add(triggerParam);
return ReturnT.SUCCESS;
}
/**
* kill job thread
*
* @param stopReason
*/
public void toStop(String stopReason) {
/**
* Thread.interrupt只支持终止线程的阻塞状态(wait、join、sleep),
* 在阻塞出抛出InterruptedException异常,但是并不会终止运行的线程本身;
* 所以需要注意,此处彻底销毁本线程,需要通过共享变量方式;
*/
this.toStop = true;
this.stopReason = stopReason;
}
/**
* is running job
* @return
*/
public boolean isRunningOrHasQueue() {
return running || triggerQueue.size()>0;
}
@Override
public void run() {
// init
try {
handler.init();
} catch (Throwable e) {
logger.error(e.getMessage(), e);
}
// execute
while(!toStop){
running = false;
idleTimes++;
TriggerParam triggerParam = null;
try {
// to check toStop signal, we need cycle, so wo cannot use queue.take(), instand of poll(timeout)
//任务队列,triggerParam是任务参数,调度是直接往这个队列put参数对象
triggerParam = triggerQueue.poll(3L, TimeUnit.SECONDS);
if (triggerParam!=null) {
running = true;
idleTimes = 0;
triggerLogIdSet.remove(triggerParam.getLogId());
// log filename, like "logPath/yyyy-MM-dd/9999.log"
String logFileName = XxlJobFileAppender.makeLogFileName(new Date(triggerParam.getLogDateTime()), triggerParam.getLogId());
XxlJobContext xxlJobContext = new XxlJobContext(
triggerParam.getJobId(),
triggerParam.getExecutorParams(),
logFileName,
triggerParam.getBroadcastIndex(),
triggerParam.getBroadcastTotal());
// init job context
XxlJobContext.setXxlJobContext(xxlJobContext);
// execute
XxlJobHelper.log("<br>----------- xxl-job job execute start -----------<br>----------- Param:" + xxlJobContext.getJobParam());
if (triggerParam.getExecutorTimeout() > 0) {
// limit timeout
Thread futureThread = null;
try {
FutureTask<Boolean> futureTask = new FutureTask<Boolean>(new Callable<Boolean>() {
@Override
public Boolean call() throws Exception {
// init job context
XxlJobContext.setXxlJobContext(xxlJobContext);
handler.execute();
return true;
}
});
futureThread = new Thread(futureTask);
futureThread.start();
Boolean tempResult = futureTask.get(triggerParam.getExecutorTimeout(), TimeUnit.SECONDS);
} catch (TimeoutException e) {
XxlJobHelper.log("<br>----------- xxl-job job execute timeout");
XxlJobHelper.log(e);
// handle result
XxlJobHelper.handleTimeout("job execute timeout ");
} finally {
futureThread.interrupt();
}
} else {
// 执行任务 handler是要执行的方法,也就是@XxlJob注解中的执行方法
//可以发现一个任务在一个执行器当中是对应着唯一一个线程去执行
handler.execute();
}
// valid execute handle data
if (XxlJobContext.getXxlJobContext().getHandleCode() <= 0) {
XxlJobHelper.handleFail("job handle result lost.");
} else {
String tempHandleMsg = XxlJobContext.getXxlJobContext().getHandleMsg();
tempHandleMsg = (tempHandleMsg!=null&&tempHandleMsg.length()>50000)
?tempHandleMsg.substring(0, 50000).concat("...")
:tempHandleMsg;
XxlJobContext.getXxlJobContext().setHandleMsg(tempHandleMsg);
}
XxlJobHelper.log("<br>----------- xxl-job job execute end(finish) -----------<br>----------- Result: handleCode="
+ XxlJobContext.getXxlJobContext().getHandleCode()
+ ", handleMsg = "
+ XxlJobContext.getXxlJobContext().getHandleMsg()
);
} else {
if (idleTimes > 30) {
if(triggerQueue.size() == 0) { // avoid concurrent trigger causes jobId-lost
XxlJobExecutor.removeJobThread(jobId, "excutor idel times over limit.");
}
}
}
} catch (Throwable e) {
if (toStop) {
XxlJobHelper.log("<br>----------- JobThread toStop, stopReason:" + stopReason);
}
// handle result
StringWriter stringWriter = new StringWriter();
e.printStackTrace(new PrintWriter(stringWriter));
String errorMsg = stringWriter.toString();
XxlJobHelper.handleFail(errorMsg);
XxlJobHelper.log("<br>----------- JobThread Exception:" + errorMsg + "<br>----------- xxl-job job execute end(error) -----------");
} finally {
if(triggerParam != null) {
// callback handler info
if (!toStop) {
// 结果回调
TriggerCallbackThread.pushCallBack(new HandleCallbackParam(
triggerParam.getLogId(),
triggerParam.getLogDateTime(),
XxlJobContext.getXxlJobContext().getHandleCode(),
XxlJobContext.getXxlJobContext().getHandleMsg() )
);
} else {
// 结果回调
TriggerCallbackThread.pushCallBack(new HandleCallbackParam(
triggerParam.getLogId(),
triggerParam.getLogDateTime(),
XxlJobContext.HANDLE_COCE_FAIL,
stopReason + " [job running, killed]" )
);
}
}
}
}
// callback trigger request in queue
while(triggerQueue !=null && triggerQueue.size()>0){
TriggerParam triggerParam = triggerQueue.poll();
if (triggerParam!=null) {
// is killed
TriggerCallbackThread.pushCallBack(new HandleCallbackParam(
triggerParam.getLogId(),
triggerParam.getLogDateTime(),
XxlJobContext.HANDLE_COCE_FAIL,
stopReason + " [job not executed, in the job queue, killed.]")
);
}
}
// destroy
try {
handler.destroy();
} catch (Throwable e) {
logger.error(e.getMessage(), e);
}
logger.info(">>>>>>>>>>> xxl-job JobThread stoped, hashCode:{}", Thread.currentThread());
}
}
看看执行器中的代码,没有发现问题,问题也没法线下复现。我猜测是这个方法线程阻塞了,或者线程挂掉了。
@XxlJob("liveStatusTask")
public void liveStatusTask() {
List<LiveStatusVO> ids = tchLiveService.getNotStartAndLivingLiveIds();
if (CollectionUtil.isEmpty(ids)) {
return;
}
Map<Integer, Integer> map = ids.stream().collect(Collectors.toMap(LiveStatusVO::getLiveId, LiveStatusVO::getStatus));
//这里是远程调用其他项目api使用的httpClient工具
List<LiveVO> vos = ctydLiveService.listLiveVO(ids.stream().map(LiveStatusVO::getLiveId).collect(Collectors.toList()));
Map<Integer, Integer> idMap = ids.stream().collect(Collectors.toMap(LiveStatusVO::getLiveId, LiveStatusVO::getId));
vos.forEach(vo -> {
if (Objects.equals(map.get(vo.getLiveId()), vo.getStatus())) {
return;
}
log.info("status:{} liveId:{}", vo.getStatus(), vo.getLiveId());
LambdaUpdateWrapper<TchLive> updateWrapper = Wrappers.lambdaUpdate(TchLive.class)
.in(TchLive::getLiveId, vo.getLiveId())
.eq(TchLive::getDelFlag, DelFlagEnum.NORMAL)
.set(TchLive::getStatus, vo.getStatus())
.set(!Objects.isNull(vo.getStartTime()), TchLive::getStartTime, vo.getStartTime())
.set(!Objects.isNull(vo.getEndTime()), TchLive::getEndTime, vo.getEndTime());
if (ObjectUtil.isAllNotEmpty(vo.getStartTime(), vo.getEndTime())) {
long timeLength = (vo.getEndTime() - vo.getStartTime()) / 1000 / 60;
updateWrapper.set(TchLive::getTimeLength, (int) timeLength);
}
tchLiveService.update(updateWrapper);
if (Objects.equals(vo.getStatus(), LiveStatusEnum.CLOSED.getValue())) {
tchLiveStudentService.updateOnlineTime(vo.getLiveId(), idMap.get(vo.getLiveId()));
}
});
}
}
使用 jstack 1147643(进程id),查看jvm中的线程情况,找到了其执行线程,发现就是阻塞到ctydLiveService.listLiveVO中使用httpClient中没有设置超时时间导致线程阻塞了,网上搜索at java.net.SocketInputStream.socketRead0(Native Method)还是有很多类似问题,致此找到了原因。
"Thread-34" #2044 prio=10 os_prio=0 tid=0x00007f4474010800 nid=0x124ace runnable [0x00007f44649e1000]
java.lang.Thread.State: RUNNABLE
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.socketRead(SocketInputStream.java:116)
at java.net.SocketInputStream.read(SocketInputStream.java:171)
at java.net.SocketInputStream.read(SocketInputStream.java:141)
at org.apache.http.impl.io.SessionInputBufferImpl.streamRead(SessionInputBufferImpl.java:137)
at org.apache.http.impl.io.SessionInputBufferImpl.fillBuffer(SessionInputBufferImpl.java:153)
at org.apache.http.impl.io.SessionInputBufferImpl.readLine(SessionInputBufferImpl.java:280)
at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:138)
at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:56)
at org.apache.http.impl.io.AbstractMessageParser.parse(AbstractMessageParser.java:259)
at org.apache.http.impl.DefaultBHttpClientConnection.receiveResponseHeader(DefaultBHttpClientConnection.java:163)
at org.apache.http.impl.conn.CPoolProxy.receiveResponseHeader(CPoolProxy.java:157)
at org.apache.http.protocol.HttpRequestExecutor.doReceiveResponse(HttpRequestExecutor.java:273)
at org.apache.http.protocol.HttpRequestExecutor.execute(HttpRequestExecutor.java:125)
at org.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:272)
at org.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:186)
at org.apache.http.impl.execchain.RetryExec.execute(RetryExec.java:89)
at org.apache.http.impl.execchain.RedirectExec.execute(RedirectExec.java:110)
at org.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:185)
at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:72)
at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:221)
at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:165)
at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:140)
at com.yunduan.ykxt.common.core.util.HttpClientUtil.doGet(HttpClientUtil.java:416)
at com.yunduan.ykxt.manage.external.ctydcourseware.CtydLiveClient.listLiveVO(CtydLiveClient.java:86)
at com.yunduan.ykxt.manage.external.ctydcourseware.service.impl.CtydLiveServiceImpl.listLiveVO(CtydLiveServiceImpl.java:68)
at com.yunduan.ykxt.manage.jobs.JobsService.liveStatusTask(JobsService.java:97)
at sun.reflect.GeneratedMethodAccessor3154.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at com.xxl.job.core.handler.impl.MethodJobHandler.execute(MethodJobHandler.java:31)
at com.xxl.job.core.thread.JobThread.run(JobThread.java:163)
前面可以知道是执行线程阻塞了,一直阻塞的话,时间够长是可以导致OOM错误的,应为调度一直往队列里面丢任务。却没有线程去消费掉。用命令:jmap -histo 1147643 | grep “com.xxl.job.core.biz.model”
查看一下有多少个任务实例,有积压了50906任务。
解决以及优化方案
一.设置超时时间
二.创建线程池去执行xxl-job的任务,万一有一个线程被阻塞住了不影响整体的任务执行。
注意:使用线程池的时候,使用XxlJobHelper.log记录日志的时候会出现问题,要对记录日志方法进行修改。