xxl-job 调度成功却不执行的问题记录

一.问题 描述

使用xxl-job 日志显示调度成功,但执行器就是没有执行,且没有结果回调,日志如下图。
在这里插入图片描述

二.排查原因

查看xxl-job客户端(执行器)中的源代码JobThread,发现调度和任务执行依靠队列 triggerQueue,这个是参数队列,调度直接往这个队列里面put参数对象。执行依靠是JobThread run方法,从队列中获取参数对象,能获取就执行,任务和线程是一一绑定的关系,调度成功,但没有结果回调。可以发现任务卡在执行任务当中handler.execute();

package com.xxl.job.core.thread;

import com.xxl.job.core.biz.model.HandleCallbackParam;
import com.xxl.job.core.biz.model.ReturnT;
import com.xxl.job.core.biz.model.TriggerParam;
import com.xxl.job.core.context.XxlJobContext;
import com.xxl.job.core.context.XxlJobHelper;
import com.xxl.job.core.executor.XxlJobExecutor;
import com.xxl.job.core.handler.IJobHandler;
import com.xxl.job.core.log.XxlJobFileAppender;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.*;


/**
 * handler thread
 * @author xuxueli 2016-1-16 19:52:47
 */
public class JobThread extends Thread{
	private static Logger logger = LoggerFactory.getLogger(JobThread.class);

	private int jobId;
	private IJobHandler handler;
	private LinkedBlockingQueue<TriggerParam> triggerQueue;
	private Set<Long> triggerLogIdSet;		// avoid repeat trigger for the same TRIGGER_LOG_ID

	private volatile boolean toStop = false;
	private String stopReason;

    private boolean running = false;    // if running job
	private int idleTimes = 0;			// idel times


	public JobThread(int jobId, IJobHandler handler) {
		this.jobId = jobId;
		this.handler = handler;
		this.triggerQueue = new LinkedBlockingQueue<TriggerParam>();
		this.triggerLogIdSet = Collections.synchronizedSet(new HashSet<Long>());
	}
	public IJobHandler getHandler() {
		return handler;
	}

    /**
     * new trigger to queue
     *
     * @param triggerParam
     * @return
     */
	public ReturnT<String> pushTriggerQueue(TriggerParam triggerParam) {
		// avoid repeat
		if (triggerLogIdSet.contains(triggerParam.getLogId())) {
			logger.info(">>>>>>>>>>> repeate trigger job, logId:{}", triggerParam.getLogId());
			return new ReturnT<String>(ReturnT.FAIL_CODE, "repeate trigger job, logId:" + triggerParam.getLogId());
		}

		triggerLogIdSet.add(triggerParam.getLogId());
		triggerQueue.add(triggerParam);
        return ReturnT.SUCCESS;
	}

    /**
     * kill job thread
     *
     * @param stopReason
     */
	public void toStop(String stopReason) {
		/**
		 * Thread.interrupt只支持终止线程的阻塞状态(wait、join、sleep),
		 * 在阻塞出抛出InterruptedException异常,但是并不会终止运行的线程本身;
		 * 所以需要注意,此处彻底销毁本线程,需要通过共享变量方式;
		 */
		this.toStop = true;
		this.stopReason = stopReason;
	}

    /**
     * is running job
     * @return
     */
    public boolean isRunningOrHasQueue() {
        return running || triggerQueue.size()>0;
    }

    @Override
	public void run() {

    	// init
    	try {
			handler.init();
		} catch (Throwable e) {
    		logger.error(e.getMessage(), e);
		}

		// execute
		while(!toStop){
			running = false;
			idleTimes++;

            TriggerParam triggerParam = null;
            try {
				// to check toStop signal, we need cycle, so wo cannot use queue.take(), instand of poll(timeout)
				//任务队列,triggerParam是任务参数,调度是直接往这个队列put参数对象 
				triggerParam = triggerQueue.poll(3L, TimeUnit.SECONDS);
				if (triggerParam!=null) {
					running = true;
					idleTimes = 0;
					triggerLogIdSet.remove(triggerParam.getLogId());

					// log filename, like "logPath/yyyy-MM-dd/9999.log"
					String logFileName = XxlJobFileAppender.makeLogFileName(new Date(triggerParam.getLogDateTime()), triggerParam.getLogId());
					XxlJobContext xxlJobContext = new XxlJobContext(
							triggerParam.getJobId(),
							triggerParam.getExecutorParams(),
							logFileName,
							triggerParam.getBroadcastIndex(),
							triggerParam.getBroadcastTotal());

					// init job context
					XxlJobContext.setXxlJobContext(xxlJobContext);

					// execute
					XxlJobHelper.log("<br>----------- xxl-job job execute start -----------<br>----------- Param:" + xxlJobContext.getJobParam());

					if (triggerParam.getExecutorTimeout() > 0) {
						// limit timeout
						Thread futureThread = null;
						try {
							FutureTask<Boolean> futureTask = new FutureTask<Boolean>(new Callable<Boolean>() {
								@Override
								public Boolean call() throws Exception {

									// init job context
									XxlJobContext.setXxlJobContext(xxlJobContext);

									handler.execute();
									return true;
								}
							});
							futureThread = new Thread(futureTask);
							futureThread.start();

							Boolean tempResult = futureTask.get(triggerParam.getExecutorTimeout(), TimeUnit.SECONDS);
						} catch (TimeoutException e) {

							XxlJobHelper.log("<br>----------- xxl-job job execute timeout");
							XxlJobHelper.log(e);

							// handle result
							XxlJobHelper.handleTimeout("job execute timeout ");
						} finally {
							futureThread.interrupt();
						}
					} else {
						// 执行任务 handler是要执行的方法,也就是@XxlJob注解中的执行方法
						//可以发现一个任务在一个执行器当中是对应着唯一一个线程去执行
						handler.execute();
					}

					// valid execute handle data
					if (XxlJobContext.getXxlJobContext().getHandleCode() <= 0) {
						XxlJobHelper.handleFail("job handle result lost.");
					} else {
						String tempHandleMsg = XxlJobContext.getXxlJobContext().getHandleMsg();
						tempHandleMsg = (tempHandleMsg!=null&&tempHandleMsg.length()>50000)
								?tempHandleMsg.substring(0, 50000).concat("...")
								:tempHandleMsg;
						XxlJobContext.getXxlJobContext().setHandleMsg(tempHandleMsg);
					}
					XxlJobHelper.log("<br>----------- xxl-job job execute end(finish) -----------<br>----------- Result: handleCode="
							+ XxlJobContext.getXxlJobContext().getHandleCode()
							+ ", handleMsg = "
							+ XxlJobContext.getXxlJobContext().getHandleMsg()
					);

				} else {
					if (idleTimes > 30) {
						if(triggerQueue.size() == 0) {	// avoid concurrent trigger causes jobId-lost
							XxlJobExecutor.removeJobThread(jobId, "excutor idel times over limit.");
						}
					}
				}
			} catch (Throwable e) {
				if (toStop) {
					XxlJobHelper.log("<br>----------- JobThread toStop, stopReason:" + stopReason);
				}

				// handle result
				StringWriter stringWriter = new StringWriter();
				e.printStackTrace(new PrintWriter(stringWriter));
				String errorMsg = stringWriter.toString();

				XxlJobHelper.handleFail(errorMsg);

				XxlJobHelper.log("<br>----------- JobThread Exception:" + errorMsg + "<br>----------- xxl-job job execute end(error) -----------");
			} finally {
                if(triggerParam != null) {
                    // callback handler info
                    if (!toStop) {
                        // 结果回调
                        TriggerCallbackThread.pushCallBack(new HandleCallbackParam(
                        		triggerParam.getLogId(),
								triggerParam.getLogDateTime(),
								XxlJobContext.getXxlJobContext().getHandleCode(),
								XxlJobContext.getXxlJobContext().getHandleMsg() )
						);
                    } else {
                         // 结果回调
                        TriggerCallbackThread.pushCallBack(new HandleCallbackParam(
                        		triggerParam.getLogId(),
								triggerParam.getLogDateTime(),
								XxlJobContext.HANDLE_COCE_FAIL,
								stopReason + " [job running, killed]" )
						);
                    }
                }
            }
        }

		// callback trigger request in queue
		while(triggerQueue !=null && triggerQueue.size()>0){
			TriggerParam triggerParam = triggerQueue.poll();
			if (triggerParam!=null) {
				// is killed
				TriggerCallbackThread.pushCallBack(new HandleCallbackParam(
						triggerParam.getLogId(),
						triggerParam.getLogDateTime(),
						XxlJobContext.HANDLE_COCE_FAIL,
						stopReason + " [job not executed, in the job queue, killed.]")
				);
			}
		}

		// destroy
		try {
			handler.destroy();
		} catch (Throwable e) {
			logger.error(e.getMessage(), e);
		}

		logger.info(">>>>>>>>>>> xxl-job JobThread stoped, hashCode:{}", Thread.currentThread());
	}
}

看看执行器中的代码,没有发现问题,问题也没法线下复现。我猜测是这个方法线程阻塞了,或者线程挂掉了。

	@XxlJob("liveStatusTask")
	public void liveStatusTask() {
		List<LiveStatusVO> ids = tchLiveService.getNotStartAndLivingLiveIds();
		if (CollectionUtil.isEmpty(ids)) {
			return;
		}
		Map<Integer, Integer> map = ids.stream().collect(Collectors.toMap(LiveStatusVO::getLiveId, LiveStatusVO::getStatus));
		//这里是远程调用其他项目api使用的httpClient工具
		List<LiveVO> vos = ctydLiveService.listLiveVO(ids.stream().map(LiveStatusVO::getLiveId).collect(Collectors.toList()));
		Map<Integer, Integer> idMap = ids.stream().collect(Collectors.toMap(LiveStatusVO::getLiveId, LiveStatusVO::getId));
		vos.forEach(vo -> {
			if (Objects.equals(map.get(vo.getLiveId()), vo.getStatus())) {
				return;
			}
			log.info("status:{} liveId:{}", vo.getStatus(), vo.getLiveId());
			LambdaUpdateWrapper<TchLive> updateWrapper = Wrappers.lambdaUpdate(TchLive.class)
					.in(TchLive::getLiveId, vo.getLiveId())
					.eq(TchLive::getDelFlag, DelFlagEnum.NORMAL)
					.set(TchLive::getStatus, vo.getStatus())
					.set(!Objects.isNull(vo.getStartTime()), TchLive::getStartTime, vo.getStartTime())
					.set(!Objects.isNull(vo.getEndTime()), TchLive::getEndTime, vo.getEndTime());
			if (ObjectUtil.isAllNotEmpty(vo.getStartTime(), vo.getEndTime())) {
				long timeLength = (vo.getEndTime() - vo.getStartTime()) / 1000 / 60;
				updateWrapper.set(TchLive::getTimeLength, (int) timeLength);
			}
			tchLiveService.update(updateWrapper);
			if (Objects.equals(vo.getStatus(), LiveStatusEnum.CLOSED.getValue())) {
				tchLiveStudentService.updateOnlineTime(vo.getLiveId(), idMap.get(vo.getLiveId()));
			}
		});
		}
	}

使用 jstack 1147643(进程id),查看jvm中的线程情况,找到了其执行线程,发现就是阻塞到ctydLiveService.listLiveVO中使用httpClient中没有设置超时时间导致线程阻塞了,网上搜索at java.net.SocketInputStream.socketRead0(Native Method)还是有很多类似问题,致此找到了原因。

"Thread-34" #2044 prio=10 os_prio=0 tid=0x00007f4474010800 nid=0x124ace runnable [0x00007f44649e1000]
   java.lang.Thread.State: RUNNABLE
        at java.net.SocketInputStream.socketRead0(Native Method)
        at java.net.SocketInputStream.socketRead(SocketInputStream.java:116)
        at java.net.SocketInputStream.read(SocketInputStream.java:171)
        at java.net.SocketInputStream.read(SocketInputStream.java:141)
        at org.apache.http.impl.io.SessionInputBufferImpl.streamRead(SessionInputBufferImpl.java:137)
        at org.apache.http.impl.io.SessionInputBufferImpl.fillBuffer(SessionInputBufferImpl.java:153)
        at org.apache.http.impl.io.SessionInputBufferImpl.readLine(SessionInputBufferImpl.java:280)
        at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:138)
        at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:56)
        at org.apache.http.impl.io.AbstractMessageParser.parse(AbstractMessageParser.java:259)
        at org.apache.http.impl.DefaultBHttpClientConnection.receiveResponseHeader(DefaultBHttpClientConnection.java:163)
        at org.apache.http.impl.conn.CPoolProxy.receiveResponseHeader(CPoolProxy.java:157)
        at org.apache.http.protocol.HttpRequestExecutor.doReceiveResponse(HttpRequestExecutor.java:273)
        at org.apache.http.protocol.HttpRequestExecutor.execute(HttpRequestExecutor.java:125)
        at org.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:272)
        at org.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:186)
        at org.apache.http.impl.execchain.RetryExec.execute(RetryExec.java:89)
        at org.apache.http.impl.execchain.RedirectExec.execute(RedirectExec.java:110)
        at org.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:185)
        at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:72)
        at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:221)
        at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:165)
        at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:140)
        at com.yunduan.ykxt.common.core.util.HttpClientUtil.doGet(HttpClientUtil.java:416)
        at com.yunduan.ykxt.manage.external.ctydcourseware.CtydLiveClient.listLiveVO(CtydLiveClient.java:86)
        at com.yunduan.ykxt.manage.external.ctydcourseware.service.impl.CtydLiveServiceImpl.listLiveVO(CtydLiveServiceImpl.java:68)
        at com.yunduan.ykxt.manage.jobs.JobsService.liveStatusTask(JobsService.java:97)
        at sun.reflect.GeneratedMethodAccessor3154.invoke(Unknown Source)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)
        at com.xxl.job.core.handler.impl.MethodJobHandler.execute(MethodJobHandler.java:31)
        at com.xxl.job.core.thread.JobThread.run(JobThread.java:163)

前面可以知道是执行线程阻塞了,一直阻塞的话,时间够长是可以导致OOM错误的,应为调度一直往队列里面丢任务。却没有线程去消费掉。用命令:jmap -histo 1147643 | grep “com.xxl.job.core.biz.model”
查看一下有多少个任务实例,有积压了50906任务。
在这里插入图片描述

解决以及优化方案

一.设置超时时间
二.创建线程池去执行xxl-job的任务,万一有一个线程被阻塞住了不影响整体的任务执行。
注意:使用线程池的时候,使用XxlJobHelper.log记录日志的时候会出现问题,要对记录日志方法进行修改。

评论 10
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值