场景:延迟指定时间后,定期执行检查任务。每台机器上部署的项目都会调用 queryCheckTasks 查询待检查的任务并进行处理;在真正 check 某个任务时,如果已有其他机器正在检查该任务,本机就不会再检查该任务。
- 线程池命名工厂类
/**
 * {@link ThreadFactory} that gives every thread it creates a readable, numbered name
 * of the form {@code <prefix>-thread-<n>}, where {@code n} starts at 1 and grows by one
 * per created thread.
 *
 * <p>All threads are placed in the thread group captured when the factory was built:
 * the security manager's group when one is installed, otherwise the group of the
 * constructing thread.
 */
public class NamedThreadFactory implements ThreadFactory {

    /** Number appended to the name of the next thread; the first thread gets 1. */
    private final AtomicInteger sequence = new AtomicInteger(1);

    /** Name prefix, already including the {@code "-thread-"} separator. */
    private final String namePrefix;

    /** Whether created threads are marked as daemon threads. */
    private final boolean daemon;

    /** Thread group every created thread is assigned to. */
    private final ThreadGroup group;

    /**
     * Creates a factory producing non-daemon threads.
     *
     * @param prefix leading part of every thread name
     */
    public NamedThreadFactory(String prefix) {
        this(prefix, false);
    }

    /**
     * Creates a factory with an explicit daemon setting.
     *
     * @param prefix leading part of every thread name
     * @param daemo  {@code true} to mark created threads as daemon threads
     */
    public NamedThreadFactory(String prefix, boolean daemo) {
        this.namePrefix = prefix + "-thread-";
        this.daemon = daemo;
        SecurityManager securityManager = System.getSecurityManager();
        if (securityManager != null) {
            this.group = securityManager.getThreadGroup();
        } else {
            this.group = Thread.currentThread().getThreadGroup();
        }
    }

    /**
     * Builds a new, not yet started thread for the given work.
     *
     * @param runnable work the thread will execute
     * @return the new thread, named with the next sequence number
     */
    @Override
    public Thread newThread(Runnable runnable) {
        // Stack size 0 leaves the choice of stack size to the JVM.
        Thread thread = new Thread(group, runnable, namePrefix + sequence.getAndIncrement(), 0);
        thread.setDaemon(daemon);
        return thread;
    }
}
- 基于 Redis 的分布式任务定时检查执行器:所有机器按照各自线程池的并发数进行检查。如果不需要与“线程池分布式多任务并行处理”模板代码配合使用,可以不依赖 Redis;此处 Redis 主要用作分布式锁,也可以改用 ZooKeeper(zk)的分布式锁来实现。
/**
 * Scheduled executor that periodically loads a list of task ids and schedules a delayed
 * check for each of them, using a Redis key per task as a lock so that a task claimed by
 * one node is skipped by the others until the key expires.
 *
 * <p>Usage: call {@link #init()} once to obtain the Redis command objects, then
 * {@link #start(long, TimeUnit)} to begin the periodic loading. Subclasses supply the
 * Redis coordinates, the task query and the actual check.
 *
 * @param <T> type of the task id
 */
public abstract class AbstractDistributedTaskCheckExecutor<T> extends ScheduledThreadPoolExecutor {
    private final Logger logger = LoggerFactory.getLogger(getClass());
    private StringCommands stringCommands;
    private HashCommands hashCommands;
    protected ValueCommands valueCommands;
    /** Period of the loader, delay before each check, and TTL of the per-task lock, in seconds. */
    private long fixedDelaySeconds = 60;

    /**
     * Creates the executor with a check pool size of 50.
     */
    public AbstractDistributedTaskCheckExecutor() {
        this(50);
    }

    /**
     * Builds the thread pool of the current node.
     *
     * @param poolSize number of threads used for task checks; one extra thread is added
     *                 on top of it (the periodic loader also runs on this pool)
     */
    public AbstractDistributedTaskCheckExecutor(int poolSize) {
        super(poolSize + 1, new NamedThreadFactory("checktask"), new AbortPolicy());
        setMaximumPoolSize(poolSize + 1);
        // Cancelled tasks are removed from the work queue immediately instead of lingering until their delay elapses.
        setRemoveOnCancelPolicy(true);
    }

    /**
     * Obtains the Redis command objects for the group returned by {@link #getRedisGroupName()}.
     * Must be called before {@link #start(long, TimeUnit)}; the command fields are null until then.
     */
    public final void init() {
        stringCommands = RedisFactory.getClusterStringCommands(getRedisGroupName());
        hashCommands = RedisFactory.getClusterHashCommands(getRedisGroupName());
        valueCommands = RedisFactory.getClusterValueCommands(getRedisGroupName());
    }

    /**
     * Starts the periodic loading of tasks to check. The first load runs after 30 seconds,
     * later loads run with the given fixed delay between them. The same delay (converted to
     * whole seconds) is used as the lock TTL and as the delay before each scheduled check.
     *
     * @param fixedDelay fixed delay between two loads
     * @param fixedUnit  unit of {@code fixedDelay}
     * @throws IllegalArgumentException if the delay is shorter than one second
     */
    public final void start(long fixedDelay, TimeUnit fixedUnit) {
        long delaySeconds = fixedUnit.toSeconds(fixedDelay);
        if (delaySeconds <= 0) {
            // A sub-second delay truncates to 0, which scheduleWithFixedDelay rejects anyway;
            // fail here with a message that names the actual cause.
            throw new IllegalArgumentException(
                    "fixedDelay must be at least one second, got " + fixedDelay + " " + fixedUnit);
        }
        this.fixedDelaySeconds = delaySeconds;
        scheduleWithFixedDelay(this::loadCheckTask, 30, fixedDelaySeconds, TimeUnit.SECONDS);
    }

    /**
     * Loads the tasks that need checking and tries to schedule each one on this node.
     * The round is skipped while the POSEIDONX_JOB_AUTO_RECOVERY_SUSPENDED option is true.
     * Any exception is logged and swallowed so that the periodic schedule keeps running.
     */
    private void loadCheckTask() {
        try {
            if (GaeaConfiguration.getBoolean(ConfOptions.POSEIDONX_JOB_AUTO_RECOVERY_SUSPENDED)) {
                logger.info("开启了暂停任务恢复检测和告警(用于集群维护窗口)");
                return;
            }
            logger.info("loadCheckTask...");
            List<T> tasks = queryCheckTasks();
            if (tasks == null) {
                return;
            }
            logger.info("taskIds={}", tasks);
            for (T id : tasks) {
                ScheduledFuture scheduledFuture = scheduleTask(id);
                if (scheduledFuture == null) {
                    continue;
                }
                try {
                    // ThreadLocalRandom avoids the contention a shared Random instance would cause.
                    // After each task claimed by this node, pause for a random time below one second.
                    TimeUnit.MILLISECONDS.sleep(ThreadLocalRandom.current().nextInt(1000));
                } catch (InterruptedException e) {
                    // Restore the flag and stop this round: with the flag set every further
                    // sleep would fail immediately, and an interrupt means we are asked to stop.
                    Thread.currentThread().interrupt();
                    return;
                }
            }
        } catch (Exception e) {
            logger.error("loadCheckTask exception", e);
        }
    }

    /**
     * Hook invoked before a task is queued; hands the scheduled future to a
     * {@code CheckPointTask} so the check can later receive it.
     *
     * @param runnable the submitted runnable
     * @param task     the future created for it
     * @param <V>      result type of the future
     * @return the task as decorated by the superclass
     */
    @Override
    protected <V> RunnableScheduledFuture<V> decorateTask(Runnable runnable, RunnableScheduledFuture<V> task) {
        if (runnable instanceof AbstractDistributedTaskCheckExecutor.CheckPointTask) {
            CheckPointTask checkPointTask = (AbstractDistributedTaskCheckExecutor.CheckPointTask) runnable;
            checkPointTask.setScheduledFuture(task);
        }
        return super.decorateTask(runnable, task);
    }

    /**
     * Runnable submitted to the pool for one task id; runs the subclass check unless the
     * task is currently being started or stopped.
     */
    private class CheckPointTask implements Runnable {
        private final T id;
        private RunnableScheduledFuture scheduledFuture;

        public CheckPointTask(T id) {
            this.id = id;
        }

        public void setScheduledFuture(RunnableScheduledFuture scheduledFuture) {
            this.scheduledFuture = scheduledFuture;
        }

        @Override
        public void run() {
            // The check must not run while the task is being started or stopped: a user may have
            // just stopped the job while the DB still reports it as running, and the checker
            // would then recover it. Checking is therefore mutually exclusive with start/stop.
            try {
                // NOTE(review): the cast means this lookup only works when T is Integer;
                // for any other id type it throws ClassCastException, which is logged below.
                Integer taskId = hashCommands.get(getRedisNamespace(),
                        AbstractDistributedTaskHandler.getTaskRedisKey((Integer) id), "taskId", true);
                logger.info("taskCheckExecutor该任务taskId={},正在启动或停止:{}", id, taskId != null);
                if (taskId == null) {
                    check(scheduledFuture, id);
                }
            } catch (Exception e) {
                // Covers the Redis lookup as well: an exception escaping a one-shot scheduled
                // task is only stored in its future and would otherwise never be reported.
                logger.error("check exception id=" + id, e);
            }
        }
    }

    /**
     * Builds the Redis key used as the per-task lock.
     *
     * @param id task id
     * @return Redis key name
     */
    protected final String getTaskRedisKey(T id) {
        return String.format("poseidonx:checkpointtasks:%s:%s", getRedisCheckPointKey(), id);
    }

    /**
     * Tries to claim the task for this node and schedule its check. Nothing is scheduled
     * when the pool's active thread count has reached the core pool size, or when the Redis
     * lock for the task could not be acquired.
     *
     * @param id task id
     * @return the future of the scheduled check, or {@code null} when nothing was scheduled
     */
    public final ScheduledFuture scheduleTask(T id) {
        logger.info("scheduleTask id={}", id);
        logger.info("getActiveCount()={}", getActiveCount());
        if (getActiveCount() >= getCorePoolSize()) {
            logger.warn("getActiveCount() >= getCorePoolSize(), id={}", id);
            return null;
        }
        String redisKey = getTaskRedisKey(id);
        Boolean success = stringCommands.setIfAbsent(getRedisNamespace(), redisKey, NetUtil.getLocalHostAddress(), true);
        // Null-safe: a null reply is treated the same as "lock not acquired".
        if (!Boolean.TRUE.equals(success)) {
            return null;
        }
        // NOTE(review): set-if-absent and expire are two separate calls; if the second one
        // fails the key may stay without a TTL — confirm whether the client offers an atomic variant.
        stringCommands.expire(getRedisNamespace(), redisKey, fixedDelaySeconds, TimeUnit.SECONDS, true);
        logger.info("schedule check task: {}", id);
        // A freshly submitted task must not be checked right away: with many tasks in Flink the
        // check could misfire and kill the job, so the check is delayed by the configured period.
        return schedule(new CheckPointTask(id), fixedDelaySeconds, TimeUnit.SECONDS);
    }

    /**
     * Reads the value stored under the task's lock key.
     * (The misspelled method name is kept for compatibility with existing callers.)
     *
     * @param id task id
     * @return the stored value as returned by the Redis client
     */
    public final String getTaskSatus(T id) {
        String taskKey = getTaskRedisKey(id);
        return stringCommands.get(getRedisNamespace(), taskKey, true);
    }

    /**
     * Returns the Redis group name used to obtain the command objects.
     *
     * @return Redis group name
     */
    public abstract String getRedisGroupName();

    /**
     * Returns the Redis namespace passed to every Redis call.
     *
     * @return Redis namespace
     */
    public abstract int getRedisNamespace();

    /**
     * Returns the identifier embedded in the per-task lock key.
     *
     * @return key segment identifying this kind of check
     */
    public abstract String getRedisCheckPointKey();

    /**
     * Queries the tasks that need to be checked.
     *
     * @return task ids; {@code null} is treated as "nothing to check"
     */
    public abstract List<T> queryCheckTasks();

    /**
     * Performs the check of one task.
     *
     * @param scheduledFuture future of the scheduled check, usable to cancel it
     * @param id              task id
     */
    public abstract void check(RunnableScheduledFuture scheduledFuture, T id);
}
- 检查任务执行器
/**
 * Check executor for Flink tasks (task ids are {@code Integer}); started automatically
 * by Spring through {@link CommandLineRunner}.
 *
 * <p>NOTE(review): {@code poseidonxConfig}, {@code poseidonxFlinkTaskMapper} and
 * {@code alertRuleService} are used below but not declared in this excerpt —
 * presumably injected fields; confirm against the full class.
 */
@Slf4j
@Component
public class FlinkCheckTaskExecutor extends AbstractDistributedTaskCheckExecutor<Integer> implements CommandLineRunner {

    /**
     * Spring calls this after application startup. Initializes the Redis commands and,
     * outside the development environment, starts the periodic check with a one-minute delay.
     */
    @Override
    public void run(String... strings) throws Exception {
        init();
        // Not started in the development environment.
        if (!EnvironmentConstant.DEVELOPMENT.equals(poseidonxConfig.getEnvironment())) {
            start(1, TimeUnit.MINUTES);
        }
    }

    @Override
    public String getRedisGroupName() {
        return poseidonxConfig.getRedisGroupName();
    }

    @Override
    public int getRedisNamespace() {
        return poseidonxConfig.getRedisNamespace();
    }

    @Override
    public String getRedisCheckPointKey() {
        return "flinktasks";
    }

    /**
     * TODO: returns all tasks that need checking; adapt to the business logic.
     *
     * @return ids as returned by {@code listIdByRunning()}
     */
    @Override
    public List<Integer> queryCheckTasks() {
        return poseidonxFlinkTaskMapper.listIdByRunning();
    }

    /**
     * Checks one task: skips it when the task row no longer exists or when a recovery key
     * for it is still present in Redis, otherwise marks the scheduled future as cancelled
     * and runs the alert-rule check.
     *
     * @param scheduledFuture future of this scheduled check; may be {@code null} when the
     *                        method is called directly
     * @param id              task id
     */
    @Override
    public void check(RunnableScheduledFuture scheduledFuture, Integer id) {
        log.info("$$$$$$$$$$$$$ begin check task: {} {}", id, new Date());
        PoseidonxFlinkTask poseidonxFlinkTask = poseidonxFlinkTaskMapper.selectByPrimaryKey(id);
        if (poseidonxFlinkTask == null) {
            // The id came from an earlier query; the row may have been removed since then.
            log.warn("task not found, skip check: id={}", id);
            return;
        }
        // First check whether some node is currently recovering this task.
        if (valueCommands.get(getRedisNamespace(), this.getRecoveryTaskRedisKey(poseidonxFlinkTask.getTaskName()),
                true) != null) {
            log.error("recoveryTaskRedisKey未过期taskName={}", poseidonxFlinkTask.getTaskName());
            return;
        }
        // TODO: for tasks not in running state, release the checking thread; handle per business logic.
        // cancel(false) rather than cancel(true): this method runs on the very thread executing the
        // future, so cancel(true) would interrupt the current thread right before the call below.
        if (scheduledFuture != null) {
            scheduledFuture.cancel(false);
        }
        alertRuleService.startChrckByRule(poseidonxFlinkTask);
        log.info("^^^^^^^^^^^^^ end check task: {} {}", id, new Date());
    }
}