dolphinscheduler 3.0.1 监控中心(上):服务管理
🐬概览
功能比较简单,就是对系统中各个服务的健康状况和基本信息进行监控和显示。比如工作流不执行了,可能就是master服务挂掉了;或者工作流一直处于运行中,而任务实例一直是等待执行,一般就是worker挂了。1.0的时候好像会经常宕机,当时的做法是修改zk参数,增大连接时间,官网应该有这个问题的说明。目前基本上没遇到过服务挂掉的情况,只遇到过一回:机器磁盘满了,导致zk、master、worker全宕机了。
- Master
- Worker
- DB
- Statistics:这个功能从来没用到过,就是对表的查询,2.0的时候有四个统计项
待执行命令数:统计 t_ds_command 表的数据
执行失败的命令数:统计 t_ds_error_command 表的数据
待运行任务数:统计 Zookeeper 中 task_queue 的数据
待杀死任务数:统计 Zookeeper 中 task_kill 的数据
🐬源码
🐠Master、Worker
直接查询zk,获取信息,使用到了CuratorFramework
🐟CuratorFramework
官网,Master/Worker服务信息的展示使用的就是getChildren()
方法(开始操作以获取ZNode的子ZNode列表。调用其他方法(watch、background或get-stat)并通过调用forPath()完成操作)
Curator框架是一个高级API,大大简化了ZooKeeper的使用。它添加了许多基于ZooKeeper的功能,并处理了管理ZooKeeper集群连接和重试操作的复杂性。其中一些功能包括:
- 自动连接管理:
存在潜在的错误情况,需要ZooKeeper客户端重新创建连接和/或重试操作。Curator会自动且(大部分情况下)透明地处理这些情况。
监视NodeDataChanged事件并根据需要调用updateServerList()。
Watch由Curator的Recipe自动移除 - 简洁的API:
简化了原始ZooKeeper方法、事件等。
提供现代、流畅(fluent)的接口 - Recipe实现(参见Recipes):
领导人选举
共享锁
路径缓存和监听器(watcher)
分布式队列
分布式优先级队列
注意:CuratorFramework的Java 8异步版本可用:Curator Async。 - pom.xml
<!-- Version properties. curator.test is referenced by the curator-test dependency below; -->
<!-- curator.version is not referenced in this excerpt (presumably applied to the other Curator artifacts elsewhere in the pom - confirm). -->
<curator.version>4.3.0</curator.version>
<curator.test>2.12.0</curator.test>
<!-- Curator framework client; no explicit version is declared in this excerpt. -->
<dependency>
<groupId>org.apache.curator</groupId>
<artifactId>curator-framework</artifactId>
</dependency>
<!-- Curator recipes, with the transitive org.apache.zookeeper:zookeeper artifact excluded. -->
<dependency>
<groupId>org.apache.curator</groupId>
<artifactId>curator-recipes</artifactId>
<exclusions>
<exclusion>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- Curator test support: test scope only, pinned to ${curator.test}, with javassist excluded. -->
<dependency>
<groupId>org.apache.curator</groupId>
<artifactId>curator-test</artifactId>
<version>${curator.test}</version>
<exclusions>
<exclusion>
<groupId>org.javassist</groupId>
<artifactId>javassist</artifactId>
</exclusion>
</exclusions>
<scope>test</scope>
</dependency>
- ZookeeperRegistry
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.dolphinscheduler.plugin.registry.zookeeper;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import org.apache.dolphinscheduler.registry.api.ConnectionListener;
import org.apache.dolphinscheduler.registry.api.Event;
import org.apache.dolphinscheduler.registry.api.Registry;
import org.apache.dolphinscheduler.registry.api.RegistryException;
import org.apache.dolphinscheduler.registry.api.RegistryProperties;
import org.apache.dolphinscheduler.registry.api.RegistryProperties.ZookeeperProperties;
import org.apache.dolphinscheduler.registry.api.SubscribeListener;
import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.framework.api.ACLProvider;
import org.apache.curator.framework.recipes.cache.ChildData;
import org.apache.curator.framework.recipes.cache.TreeCache;
import org.apache.curator.framework.recipes.cache.TreeCacheEvent;
import org.apache.curator.framework.recipes.locks.InterProcessMutex;
import org.apache.curator.retry.ExponentialBackoffRetry;
import org.apache.curator.utils.CloseableUtils;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooDefs;
import org.apache.zookeeper.data.ACL;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import javax.annotation.PostConstruct;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.stereotype.Component;
import com.google.common.base.Strings;
@Component
@ConditionalOnProperty(prefix = "registry", name = "type", havingValue = "zookeeper")
public final class ZookeeperRegistry implements Registry {
private final ZookeeperProperties properties;
private final CuratorFramework client;
private final Map<String, TreeCache> treeCacheMap = new ConcurrentHashMap<>();
private static final ThreadLocal<Map<String, InterProcessMutex>> threadLocalLockMap = new ThreadLocal<>();
public ZookeeperRegistry(RegistryProperties registryProperties) {
properties = registryProperties.getZookeeper();
final ExponentialBackoffRetry retryPolicy = new ExponentialBackoffRetry(
(int) properties.getRetryPolicy().getBaseSleepTime().toMillis(),
properties.getRetryPolicy().getMaxRetries(),
(int) properties.getRetryPolicy().getMaxSleep().toMillis());
CuratorFrameworkFactory.Builder builder =
CuratorFrameworkFactory.builder()
.connectString(properties.getConnectString())
.retryPolicy(retryPolicy)
.namespace(properties.getNamespace())
.sessionTimeoutMs((int) properties.getSessionTimeout().toMillis())
.connectionTimeoutMs((int) properties.getConnectionTimeout().toMillis());
final String digest = properties.getDigest();
if (!Strings.isNullOrEmpty(digest)) {
buildDigest(builder, digest);
}
client = builder.build();
}
private void buildDigest(CuratorFrameworkFactory.Builder builder, String digest) {
builder.authorization("digest", digest.getBytes(StandardCharsets.UTF_8))
.aclProvider(new ACLProvider() {
@Override
public List<ACL> getDefaultAcl() {
return ZooDefs.Ids.CREATOR_ALL_ACL;
}
@Override
public List<ACL> getAclForPath(final String path) {
return ZooDefs.Ids.CREATOR_ALL_ACL;
}
});
}
@PostConstruct
public void start() {
client.start();
try {
if (!client.blockUntilConnected((int) properties.getBlockUntilConnected().toMillis(), MILLISECONDS)) {
client.close();
throw new RegistryException("zookeeper connect timeout: " + properties.getConnectString());
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}
@Override
public void addConnectionStateListener(ConnectionListener listener) {
client.getConnectionStateListenable().addListener(new ZookeeperConnectionStateListener(listener));
}
@Override
public boolean subscribe(String path, SubscribeListener listener) {
final TreeCache treeCache = treeCacheMap.computeIfAbsent(path, $ -> new TreeCache(client, path));
treeCache.getListenable().addListener(($, event) -> listener.notify(new EventAdaptor(event, path)));
try {
treeCache.start();
} catch (Exception e) {
treeCacheMap.remove(path);
throw new RegistryException("Failed to subscribe listener for key: " + path, e);
}
return true;
}
@Override
public void unsubscribe(String path) {
CloseableUtils.closeQuietly(treeCacheMap.get(path));
}
@Override
public String get(String key) {
try {
return new String(client.getData().forPath(key), StandardCharsets.UTF_8);
} catch (Exception e) {
throw new RegistryException("zookeeper get data error", e);
}
}
@Override
public boolean exists(String key) {
try {
return null != client.checkExists().forPath(key);
} catch (Exception e) {
throw new RegistryException("zookeeper check key is existed error", e);
}
}
@Override
public void put(String key, String value, boolean deleteOnDisconnect) {
final CreateMode mode = deleteOnDisconnect ? CreateMode.EPHEMERAL : CreateMode.PERSISTENT;
try {
client.create()
.orSetData()
.creatingParentsIfNeeded()
.withMode(mode)
.forPath(key, value.getBytes(StandardCharsets.UTF_8));
} catch (Exception e) {
throw new RegistryException("Failed to put registry key: " + key, e);
}
}
@Override
public List<String> children(String key) {
try {
List<String> result = client.getChildren().forPath(key);
result.sort(Comparator.reverseOrder());
return result;
} catch (Exception e) {
throw new RegistryException("zookeeper get children error", e);
}
}
@Override
public void delete(String nodePath) {
try {
client.delete()
.deletingChildrenIfNeeded()
.forPath(nodePath);
} catch (KeeperException.NoNodeException ignored) {
// Is already deleted or does not exist
} catch (Exception e) {
throw new RegistryException("Failed to delete registry key: " + nodePath, e);
}
}
@Override
public boolean acquireLock(String key) {
InterProcessMutex interProcessMutex = new InterProcessMutex(client, key);
try {
interProcessMutex.acquire();
if (null == threadLocalLockMap.get()) {
threadLocalLockMap.set(new HashMap<>(3));
}
threadLocalLockMap.get().put(key, interProcessMutex);
return true;
} catch (Exception e) {
try {
interProcessMutex.release();
throw new RegistryException("zookeeper get lock error", e);
} catch (Exception exception) {
throw new RegistryException("zookeeper release lock error", e);
}
}
}
@Override
public boolean releaseLock(String key) {
if (null == threadLocalLockMap.get().get(key)) {
return false;
}
try {
threadLocalLockMap.get().get(key).release();
threadLocalLockMap.get().remove(key);
if (threadLocalLockMap.get().isEmpty()) {
threadLocalLockMap.remove();
}
} catch (Exception e) {
throw new RegistryException("zookeeper release lock error", e);
}
return true;
}
@Override
public Duration getSessionTimeout() {
return properties.getSessionTimeout();
}
@Override
public void close() {
treeCacheMap.values().forEach(CloseableUtils::closeQuietly);
CloseableUtils.closeQuietly(client);
}
static final class EventAdaptor extends Event {
public EventAdaptor(TreeCacheEvent event, String key) {
key(key);
switch (event.getType()) {
case NODE_ADDED:
type(Type.ADD);
break;
case NODE_UPDATED:
type(Type.UPDATE);
break;
case NODE_REMOVED:
type(Type.REMOVE);
break;
default:
break;
}
final ChildData data = event.getData();
if (data != null) {
path(data.getPath());
data(new String(data.getData()));
}
}
}
}
🐠DB
-
获取数据库连接及相关信息,查询数据库,获取最大连接数等信息,用到了
javax.sql.DataSource
接口,2.0用的是com.zaxxer.hikari.HikariDataSource
,之所以注意到这个地方,是因为之前给2.0引入审计日志的时候,不知道哪里冲突了,一直加载不到HikariDataSource
,后来替换为DataSource
才解决,现在3.0也给替换掉了
-
MonitorDBDao
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.dolphinscheduler.dao;
import org.apache.dolphinscheduler.dao.entity.MonitorRecord;
import org.apache.dolphinscheduler.dao.utils.MySQLPerformance;
import org.apache.dolphinscheduler.dao.utils.PostgreSQLPerformance;
import org.apache.dolphinscheduler.spi.enums.DbType;
import java.sql.Connection;
import java.sql.DriverManager;
import java.util.ArrayList;
import java.util.List;
import javax.sql.DataSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
/**
 * DAO that reports the current state of the database behind the injected {@link DataSource}.
 */
@Component
public class MonitorDBDao {

    private static final Logger logger = LoggerFactory.getLogger(MonitorDBDao.class);

    public static final String VARIABLE_NAME = "variable_name";

    @Autowired
    private DataSource dataSource;

    /**
     * Picks the performance collector from the JDBC driver class name and runs it on a fresh connection.
     *
     * @return the collected record, or {@code null} when the driver is neither MySQL nor PostgreSQL
     *         or when any exception occurs (the exception is logged)
     */
    private MonitorRecord getCurrentDbPerformance() {
        try (final Connection connection = dataSource.getConnection()) {
            final String jdbcUrl = connection.getMetaData().getURL();
            final String driverClass = DriverManager.getDriver(jdbcUrl).getClass().getName();

            final boolean isMySql = driverClass.contains(DbType.MYSQL.toString().toLowerCase());
            if (isMySql) {
                return new MySQLPerformance().getMonitorRecord(connection);
            }

            final boolean isPostgreSql = driverClass.contains(DbType.POSTGRESQL.toString().toLowerCase());
            if (isPostgreSql) {
                return new PostgreSQLPerformance().getMonitorRecord(connection);
            }
        } catch (Exception e) {
            logger.error("SQLException: {}", e.getMessage(), e);
        }
        return null;
    }

    /**
     * Queries the database state.
     *
     * @return a list holding the current {@link MonitorRecord}, or an empty list when none could be collected
     */
    public List<MonitorRecord> queryDatabaseState() {
        final List<MonitorRecord> records = new ArrayList<>(1);
        final MonitorRecord current = getCurrentDbPerformance();
        if (current == null) {
            return records;
        }
        records.add(current);
        return records;
    }
}
- MySQLPerformance
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.dolphinscheduler.dao.utils;
import static org.apache.dolphinscheduler.dao.MonitorDBDao.VARIABLE_NAME;
import org.apache.dolphinscheduler.common.enums.Flag;
import org.apache.dolphinscheduler.dao.entity.MonitorRecord;
import org.apache.dolphinscheduler.spi.enums.DbType;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Date;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Collects connection statistics from a MySQL server.
 */
public class MySQLPerformance extends BaseDBPerformance {

    private static final Logger logger = LoggerFactory.getLogger(MySQLPerformance.class);

    /**
     * Builds a monitor record from {@code show global variables} and {@code show global status}.
     *
     * <p>The record starts with state {@code Flag.YES}. Any exception, including one raised while
     * closing the statement, is logged and switches the state to {@code Flag.NO}; values read before
     * the failure are kept.
     *
     * @param conn open connection to query; it is not closed here
     * @return the populated monitor record, never {@code null}
     */
    @Override
    public MonitorRecord getMonitorRecord(Connection conn) {
        MonitorRecord monitorRecord = new MonitorRecord();
        monitorRecord.setDate(new Date());
        monitorRecord.setDbType(DbType.MYSQL);
        monitorRecord.setState(Flag.YES);

        try (Statement statement = conn.createStatement()) {
            try (ResultSet variables = statement.executeQuery("show global variables")) {
                while (variables.next()) {
                    if ("MAX_CONNECTIONS".equalsIgnoreCase(variables.getString(VARIABLE_NAME))) {
                        monitorRecord.setMaxConnections(Long.parseLong(variables.getString("value")));
                    }
                }
            }

            try (ResultSet status = statement.executeQuery("show global status")) {
                while (status.next()) {
                    // Read the name once per row instead of once per comparison.
                    final String name = status.getString(VARIABLE_NAME);
                    if ("MAX_USED_CONNECTIONS".equalsIgnoreCase(name)) {
                        monitorRecord.setMaxUsedConnections(Long.parseLong(status.getString("value")));
                    } else if ("THREADS_CONNECTED".equalsIgnoreCase(name)) {
                        monitorRecord.setThreadsConnections(Long.parseLong(status.getString("value")));
                    } else if ("THREADS_RUNNING".equalsIgnoreCase(name)) {
                        monitorRecord.setThreadsRunningConnections(Long.parseLong(status.getString("value")));
                    }
                }
            }
        } catch (Exception e) {
            monitorRecord.setState(Flag.NO);
            logger.error("SQLException ", e);
        }
        return monitorRecord;
    }
}
- PostgreSQLPerformance
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.dolphinscheduler.dao.utils;
import org.apache.dolphinscheduler.common.enums.Flag;
import org.apache.dolphinscheduler.dao.entity.MonitorRecord;
import org.apache.dolphinscheduler.spi.enums.DbType;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Date;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Collects connection statistics from a PostgreSQL server.
 */
public class PostgreSQLPerformance extends BaseDBPerformance {

    private static final Logger logger = LoggerFactory.getLogger(PostgreSQLPerformance.class);

    /**
     * Builds a monitor record from {@code pg_stat_activity} and {@code show max_connections}.
     *
     * <p>The record starts with state {@code Flag.YES}. Any exception, including one raised while
     * closing the statement, is logged and switches the state to {@code Flag.NO}; values read before
     * the failure are kept.
     *
     * @param conn open connection to query; it is not closed here
     * @return the populated monitor record, never {@code null}
     */
    @Override
    public MonitorRecord getMonitorRecord(Connection conn) {
        MonitorRecord monitorRecord = new MonitorRecord();
        monitorRecord.setDate(new Date());
        monitorRecord.setState(Flag.YES);
        monitorRecord.setDbType(DbType.POSTGRESQL);

        try (Statement statement = conn.createStatement()) {
            // Total number of rows in pg_stat_activity.
            try (ResultSet allSessions = statement.executeQuery("select count(*) from pg_stat_activity;")) {
                if (allSessions.next()) {
                    monitorRecord.setThreadsConnections(allSessions.getInt("count"));
                }
            }

            try (ResultSet maxConnections = statement.executeQuery("show max_connections")) {
                if (maxConnections.next()) {
                    monitorRecord.setMaxConnections(maxConnections.getInt("max_connections"));
                }
            }

            // Rows of pg_stat_activity whose state is 'active'.
            try (ResultSet activeSessions = statement.executeQuery("select count(*) from pg_stat_activity pg where pg.state = 'active';")) {
                if (activeSessions.next()) {
                    monitorRecord.setThreadsRunningConnections(activeSessions.getInt("count"));
                }
            }
        } catch (Exception e) {
            monitorRecord.setState(Flag.NO);
            logger.error("SQLException ", e);
        }
        return monitorRecord;
    }
}
🐟DataSource 和HikariDataSource
HikariDataSource
是个实现类,实现了DataSource
接口
当时2.0引入审计日志(涉及AOP),报找不到HikariDataSource
这个类,估计是因为代理的原因,改成注入DataSource
,就正常了。能够确认的是,直接注入HikariDataSource这个实现类不规范,一般都是注入接口
🐠Statistics
查询表(略)