Apache DolphinScheduler3.0源码剖析
1.工作流操作
1.1概述
用户通过编辑器编辑项目,工作流等信息,保存并运行的工作流的执行过程
1.2 创建工作流
1.2.1 第一步 执行ProcessDefinitionController的createProcessDefinition函数
@PostMapping()
@ResponseStatus(HttpStatus.CREATED)
@ApiException(CREATE_PROCESS_DEFINITION_ERROR)
@AccessLogAnnotation(ignoreRequestArgs = "loginUser")
public Result createProcessDefinition(@ApiIgnore @RequestAttribute(value = Constants.SESSION_USER) User loginUser,
@ApiParam(name = "projectCode", value = "PROJECT_CODE", required = true) @PathVariable long projectCode,
@RequestParam(value = "name", required = true) String name,
@RequestParam(value = "description", required = false) String description,
@RequestParam(value = "globalParams", required = false, defaultValue = "[]") String globalParams,
@RequestParam(value = "locations", required = false) String locations,
@RequestParam(value = "timeout", required = false, defaultValue = "0") int timeout,
@RequestParam(value = "tenantCode", required = true) String tenantCode,
@RequestParam(value = "taskRelationJson", required = true) String taskRelationJson,
@RequestParam(value = "taskDefinitionJson", required = true) String taskDefinitionJson,
@RequestParam(value = "otherParamsJson", required = false) String otherParamsJson,
@RequestParam(value = "executionType", defaultValue = "PARALLEL") ProcessExecutionTypeEnum executionType) {
Map<String, Object> result = processDefinitionService.createProcessDefinition(loginUser, projectCode, name,
description, globalParams,
locations, timeout, tenantCode, taskRelationJson, taskDefinitionJson, otherParamsJson, executionType);
return returnDataList(result);
}
1.2.2 第二步 执行ProcessDefinitionServiceImpl的createProcessDefinition函数
/**
* create process definition
*
* @param loginUser login user
* @param projectCode project code
* @param name process definition name
* @param description description
* @param globalParams global params
* @param locations locations for nodes
* @param timeout timeout
* @param tenantCode tenantCode
* @param taskRelationJson relation json for nodes
* @param taskDefinitionJson taskDefinitionJson
* @return create result code
*/
@Override
@Transactional
public Map<String, Object> createProcessDefinition(User loginUser,
long projectCode,
String name,
String description,
String globalParams,
String locations,
int timeout,
String tenantCode,
String taskRelationJson,
String taskDefinitionJson,
String otherParamsJson,
ProcessExecutionTypeEnum executionType) {
Project project = projectMapper.queryByCode(projectCode);
// 判断用户是否有操作此项目权限
Map<String, Object> result =
projectService.checkProjectAndAuth(loginUser, project, projectCode, WORKFLOW_CREATE);
if (result.get(Constants.STATUS) != Status.SUCCESS) {
return result;
}
if(checkDescriptionLength(description)){
putMsg(result, Status.DESCRIPTION_TOO_LONG_ERROR);
return result;
}
// 判断新的项目名称是否已存在
ProcessDefinition definition = processDefinitionMapper.verifyByDefineName(project.getCode(), name);
if (definition != null) {
putMsg(result, Status.PROCESS_DEFINITION_NAME_EXIST, name);
return result;
}
List<TaskDefinitionLog> taskDefinitionLogs = JSONUtils.toList(taskDefinitionJson, TaskDefinitionLog.class);
Map<String, Object> checkTaskDefinitions = checkTaskDefinitionList(taskDefinitionLogs, taskDefinitionJson);
if (checkTaskDefinitions.get(Constants.STATUS) != Status.SUCCESS) {
return checkTaskDefinitions;
}
List<ProcessTaskRelationLog> taskRelationList =
JSONUtils.toList(taskRelationJson, ProcessTaskRelationLog.class);
Map<String, Object> checkRelationJson =
checkTaskRelationList(taskRelationList, taskRelationJson, taskDefinitionLogs);
if (checkRelationJson.get(Constants.STATUS) != Status.SUCCESS) {
return checkRelationJson;
}
// 判断用户是否绑定了租户,未绑定则无法执行
int tenantId = -1;
if (!Constants.DEFAULT.equals(tenantCode)) {
Tenant tenant = tenantMapper.queryByTenantCode(tenantCode);
if (tenant == null) {
putMsg(result, Status.TENANT_NOT_EXIST);
return result;
}
tenantId = tenant.getId();
}
// 未项目定义生成一个编码
long processDefinitionCode;
try {
processDefinitionCode = CodeGenerateUtils.getInstance().genCode();
} catch (CodeGenerateException e) {
putMsg(result, Status.INTERNAL_SERVER_ERROR_ARGS);
return result;
}
ProcessDefinition processDefinition =
new ProcessDefinition(projectCode, name, processDefinitionCode, description,
globalParams, locations, timeout, loginUser.getId(), tenantId);
processDefinition.setExecutionType(executionType);
// 创建项目的有向无环图
return createDagDefine(loginUser, taskRelationList, processDefinition, taskDefinitionLogs, otherParamsJson);
}
1.2.3 第三步 执行ProcessDefinitionServiceImpl的createDagDefine函数
protected Map<String, Object> createDagDefine(User loginUser,
List<ProcessTaskRelationLog> taskRelationList,
ProcessDefinition processDefinition,
List<TaskDefinitionLog> taskDefinitionLogs, String otherParamsJson) {
Map<String, Object> result = new HashMap<>();
int saveTaskResult = processService.saveTaskDefine(loginUser, processDefinition.getProjectCode(),
taskDefinitionLogs, Boolean.TRUE);
if (saveTaskResult == Constants.EXIT_CODE_SUCCESS) {
logger.info("The task has not changed, so skip");
}
if (saveTaskResult == Constants.DEFINITION_FAILURE) {
putMsg(result, Status.CREATE_TASK_DEFINITION_ERROR);
throw new ServiceException(Status.CREATE_TASK_DEFINITION_ERROR);
}
// 保存项目定义到数据库中
int insertVersion = processService.saveProcessDefine(loginUser, processDefinition, Boolean.TRUE, Boolean.TRUE);
if (insertVersion == 0) {
putMsg(result, Status.CREATE_PROCESS_DEFINITION_ERROR);
throw new ServiceException(Status.CREATE_PROCESS_DEFINITION_ERROR);
}
int insertResult = processService.saveTaskRelation(loginUser, processDefinition.getProjectCode(),
processDefinition.getCode(),
insertVersion, taskRelationList, taskDefinitionLogs, Boolean.TRUE);
if (insertResult == Constants.EXIT_CODE_SUCCESS) {
putMsg(result, Status.SUCCESS);
result.put(Constants.DATA_LIST, processDefinition);
} else {
putMsg(result, Status.CREATE_PROCESS_TASK_RELATION_ERROR);
throw new ServiceException(Status.CREATE_PROCESS_TASK_RELATION_ERROR);
}
saveOtherRelation(loginUser, processDefinition, result, otherParamsJson);
return result;
}
1.2.4整体执行流程时序图
1.3运行工作流
2.Master相关流程
2.1概述
Master运行线程,检索出需要执行的命令,经过一系列处理后通过网络发送给Worker执行
2.2 Master类的run函数
/**
* run master server
*/
@PostConstruct
public void run() throws SchedulerException {
// 初始化masterRPCServer线程,启动一个netty server进行监听
this.masterRPCServer.start();
// 通过SPI机制加载功能模块插件
this.taskPluginManager.loadPlugin();
// self tolerant
this.masterRegistryClient.start();
this.masterRegistryClient.setRegistryStoppable(this);
// 初始化并运行masterSchedulerBootstrap线程,从数据库中读取要执行的Command并转换成ProcessInstance进行触发
this.masterSchedulerBootstrap.init();
this.masterSchedulerBootstrap.start();
this.eventExecuteService.start();
this.failoverExecuteThread.start();
// Quartz定时器启动,实现类QuartzScheduler
this.schedulerApi.start();
Runtime.getRuntime().addShutdownHook(new Thread(() -> {
if (!ServerLifeCycleManager.isStopped()) {
close("MasterServer shutdownHook");
}
}));
}
2.3 MasterRPCServer类的start函数
public void start() {
logger.info("Starting Master RPC Server...");
// init remoting server
NettyServerConfig serverConfig = new NettyServerConfig();
serverConfig.setListenPort(masterConfig.getListenPort());
this.nettyRemotingServer = new NettyRemotingServer(serverConfig);
//
this.nettyRemotingServer.registerProcessor(CommandType.TASK_EXECUTE_RUNNING, taskExecuteRunningProcessor);
this.nettyRemotingServer.registerProcessor(CommandType.TASK_EXECUTE_RESULT, taskExecuteResponseProcessor);
this.nettyRemotingServer.registerProcessor(CommandType.TASK_KILL_RESPONSE, taskKillResponseProcessor);
this.nettyRemotingServer.registerProcessor(CommandType.STATE_EVENT_REQUEST, stateEventProcessor);
this.nettyRemotingServer.registerProcessor(CommandType.TASK_FORCE_STATE_EVENT_REQUEST, taskEventProcessor);
this.nettyRemotingServer.registerProcessor(CommandType.TASK_WAKEUP_EVENT_REQUEST, taskEventProcessor);
this.nettyRemotingServer.registerProcessor(CommandType.CACHE_EXPIRE, cacheProcessor);
this.nettyRemotingServer.registerProcessor(CommandType.TASK_REJECT, taskRecallProcessor);
this.nettyRemotingServer.registerProcessor(CommandType.WORKFLOW_EXECUTING_DATA_REQUEST, workflowExecutingDataRequestProcessor);
this.nettyRemotingServer.registerProcessor(CommandType.TASK_EXECUTE_START, taskExecuteStartProcessor);
// logger server
this.nettyRemotingServer.registerProcessor(CommandType.GET_LOG_BYTES_REQUEST, loggerRequestProcessor);
this.nettyRemotingServer.registerProcessor(CommandType.ROLL_VIEW_LOG_REQUEST, loggerRequestProcessor);
this.nettyRemotingServer.registerProcessor(CommandType.VIEW_WHOLE_LOG_REQUEST, loggerRequestProcessor);
this.nettyRemotingServer.registerProcessor(CommandType.REMOVE_TAK_LOG_REQUEST, loggerRequestProcessor);
this.nettyRemotingServer.start();
logger.info("Started Master RPC Server...");
}
2.4 TaskPluginManager类的loadPlugin函数
public void loadPlugin() {
if (!loadedFlag.compareAndSet(false, true)) {
logger.warn("The task plugin has already been loaded");
return;
}
PrioritySPIFactory<TaskChannelFactory> prioritySPIFactory = new PrioritySPIFactory<>(TaskChannelFactory.class);
for (Map.Entry<String, TaskChannelFactory> entry : prioritySPIFactory.getSPIMap().entrySet()) {
String factoryName = entry.getKey();
TaskChannelFactory factory = entry.getValue();
logger.info("Registering task plugin: {} - {}", factoryName, factory.getClass());
taskChannelFactoryMap.put(factoryName, factory);
taskChannelMap.put(factoryName, factory.create());
logger.info("Registered task plugin: {} - {}", factoryName, factory.getClass());
}
}
2.5 MasterSchedulerBootstrap执行过程分析
/**
* run of MasterSchedulerService
*/
@Override
public void run() {
while (!ServerLifeCycleManager.isStopped()) {
try {
if (!ServerLifeCycleManager.isRunning()) {
// the current server is not at running status, cannot consume command.
Thread.sleep(Constants.SLEEP_TIME_MILLIS);
}
// todo: if the workflow event queue is much, we need to handle the back pressure
boolean isOverload =
OSUtils.isOverload(masterConfig.getMaxCpuLoadAvg(), masterConfig.getReservedMemory());
if (isOverload) {
MasterServerMetrics.incMasterOverload();
Thread.sleep(Constants.SLEEP_TIME_MILLIS);
continue;
}
// 从数据库中查询当前Master所要运行的Command
List<Command> commands = findCommands();
if (CollectionUtils.isEmpty(commands)) {
// indicate that no command ,sleep for 1s
Thread.sleep(Constants.SLEEP_TIME_MILLIS);
continue;
}
// 将Command转换成进程实例
List<ProcessInstance> processInstances = command2ProcessInstance(commands);
if (CollectionUtils.isEmpty(processInstances)) {
// indicate that the command transform to processInstance error, sleep for 1s
Thread.sleep(Constants.SLEEP_TIME_MILLIS);
continue;
}
MasterServerMetrics.incMasterConsumeCommand(commands.size());
// 遍历处理所有的进程实例
processInstances.forEach(processInstance -> {
try {
LoggerUtils.setWorkflowInstanceIdMDC(processInstance.getId());
if (processInstanceExecCacheManager.contains(processInstance.getId())) {
logger.error(
"The workflow instance is already been cached, this case shouldn't be happened");
}
WorkflowExecuteRunnable workflowRunnable = new WorkflowExecuteRunnable(processInstance,
processService,
processInstanceDao,
nettyExecutorManager,
processAlertManager,
masterConfig,
stateWheelExecuteThread,
curingGlobalParamsService);
// 根据processInstance 创建workflowRunnable对象,并缓存到processInstanceExecCacheManager中
processInstanceExecCacheManager.cache(processInstance.getId(), workflowRunnable);
workflowEventQueue.addEvent(new WorkflowEvent(WorkflowEventType.START_WORKFLOW,
processInstance.getId()));
} finally {
LoggerUtils.removeWorkflowInstanceIdMDC();
}
});
} catch (InterruptedException interruptedException) {
logger.warn("Master schedule bootstrap interrupted, close the loop", interruptedException);
Thread.currentThread().interrupt();
break;
} catch (Exception e) {
logger.error("Master schedule workflow error", e);
// sleep for 1s here to avoid the database down cause the exception boom
ThreadUtils.sleep(Constants.SLEEP_TIME_MILLIS);
}
}
}
2.6 TaskPriorityQueueConsumer执行
TaskPriorityQueueConsumer线程处理上面流程中存储到processInstanceExecCacheManager中的workflowExecuteRunnable ,通过netty发送到worker执行
@Override
public void run() {
int fetchTaskNum = masterConfig.getDispatchTaskNumber();
while (!ServerLifeCycleManager.isStopped()) {
try {
List<TaskPriority> failedDispatchTasks = this.batchDispatch(fetchTaskNum);
if (CollectionUtils.isNotEmpty(failedDispatchTasks)) {
TaskMetrics.incTaskDispatchFailed(failedDispatchTasks.size());
for (TaskPriority dispatchFailedTask : failedDispatchTasks) {
taskPriorityQueue.put(dispatchFailedTask);
}
// If the all task dispatch failed, will sleep for 1s to avoid the master cpu higher.
if (fetchTaskNum == failedDispatchTasks.size()) {
TimeUnit.MILLISECONDS.sleep(Constants.SLEEP_TIME_MILLIS);
}
}
} catch (Exception e) {
TaskMetrics.incTaskDispatchError();
logger.error("dispatcher task error", e);
}
}
}
/**
* batch dispatch with thread pool
*/
public List<TaskPriority> batchDispatch(int fetchTaskNum) throws TaskPriorityQueueException, InterruptedException {
List<TaskPriority> failedDispatchTasks = Collections.synchronizedList(new ArrayList<>());
CountDownLatch latch = new CountDownLatch(fetchTaskNum);
for (int i = 0; i < fetchTaskNum; i++) {
TaskPriority taskPriority = taskPriorityQueue.poll(Constants.SLEEP_TIME_MILLIS, TimeUnit.MILLISECONDS);
if (Objects.isNull(taskPriority)) {
latch.countDown();
continue;
}
consumerThreadPoolExecutor.submit(() -> {
try {
boolean dispatchResult = this.dispatchTask(taskPriority);
if (!dispatchResult) {
failedDispatchTasks.add(taskPriority);
}
} finally {
// make sure the latch countDown
latch.countDown();
}
});
}
latch.await();
return failedDispatchTasks;
}
/**
* Dispatch task to worker.
*
* @param taskPriority taskPriority
* @return dispatch result, return true if dispatch success, return false if dispatch failed.
*/
protected boolean dispatchTask(TaskPriority taskPriority) {
TaskMetrics.incTaskDispatch();
boolean result = false;
try {
WorkflowExecuteRunnable workflowExecuteRunnable =
processInstanceExecCacheManager.getByProcessInstanceId(taskPriority.getProcessInstanceId());
if (workflowExecuteRunnable == null) {
logger.error("Cannot find the related processInstance of the task, taskPriority: {}", taskPriority);
return true;
}
Optional<TaskInstance> taskInstanceOptional =
workflowExecuteRunnable.getTaskInstance(taskPriority.getTaskId());
if (!taskInstanceOptional.isPresent()) {
logger.error("Cannot find the task instance from related processInstance, taskPriority: {}",
taskPriority);
// we return true, so that we will drop this task.
return true;
}
TaskInstance taskInstance = taskInstanceOptional.get();
TaskExecutionContext context = taskPriority.getTaskExecutionContext();
ExecutionContext executionContext =
new ExecutionContext(toCommand(context), ExecutorType.WORKER, context.getWorkerGroup(),
taskInstance);
if (isTaskNeedToCheck(taskPriority)) {
if (taskInstanceIsFinalState(taskPriority.getTaskId())) {
// when task finish, ignore this task, there is no need to dispatch anymore
return true;
}
}
result = dispatcher.dispatch(executionContext);
if (result) {
logger.info("Master success dispatch task to worker, taskInstanceId: {}, worker: {}",
taskPriority.getTaskId(),
executionContext.getHost());
addDispatchEvent(context, executionContext);
} else {
logger.info("Master failed to dispatch task to worker, taskInstanceId: {}, worker: {}",
taskPriority.getTaskId(),
executionContext.getHost());
}
} catch (RuntimeException | ExecuteException e) {
logger.error("Master dispatch task to worker error, taskPriority: {}", taskPriority, e);
}
return result;
}
/**
* task dispatch
*
* @param context context
* @return result
* @throws ExecuteException if error throws ExecuteException
*/
public Boolean dispatch(final ExecutionContext context) throws ExecuteException {
// get executor manager
ExecutorManager<Boolean> executorManager = this.executorManagers.get(context.getExecutorType());
if (executorManager == null) {
throw new ExecuteException("no ExecutorManager for type : " + context.getExecutorType());
}
// host select
Host host = hostManager.select(context);
if (StringUtils.isEmpty(host.getAddress())) {
logger.warn("fail to execute : {} due to no suitable worker, current task needs worker group {} to execute",
context.getCommand(), context.getWorkerGroup());
return false;
}
context.setHost(host);
executorManager.beforeExecute(context);
try {
// task execute
return executorManager.execute(context);
} finally {
executorManager.afterExecute(context);
}
}
2.7 NettyExecutorManager运行时序图
/**
* execute logic
*
* @param context context
* @return result
* @throws ExecuteException if error throws ExecuteException
*/
@Override
public Boolean execute(ExecutionContext context) throws ExecuteException {
// all nodes
Set<String> allNodes = getAllNodes(context);
// fail nodes
Set<String> failNodeSet = new HashSet<>();
// build command accord executeContext
Command command = context.getCommand();
// execute task host
Host host = context.getHost();
boolean success = false;
while (!success) {
try {
doExecute(host, command);
success = true;
context.setHost(host);
// We set the host to taskInstance to avoid when the worker down, this taskInstance may not be
// failovered, due to the taskInstance's host
// is not belongs to the down worker ISSUE-10842.
context.getTaskInstance().setHost(host.getAddress());
} catch (ExecuteException ex) {
logger.error(String.format("execute command : %s error", command), ex);
try {
failNodeSet.add(host.getAddress());
Set<String> tmpAllIps = new HashSet<>(allNodes);
Collection<String> remained = CollectionUtils.subtract(tmpAllIps, failNodeSet);
if (remained != null && remained.size() > 0) {
host = Host.of(remained.iterator().next());
logger.error("retry execute command : {} host : {}", command, host);
} else {
throw new ExecuteException("fail after try all nodes");
}
} catch (Throwable t) {
throw new ExecuteException("fail after try all nodes");
}
}
}
return success;
}
Netty执行发送任务时序图: