除了自己写代码,当然可以利用Hortonworks Data Platform (HDP) 的 Ambari 等组件完成。
以下python代码提供另一种实现思路:
需要监控的每台节点的JVM指标包括:JVM内存使用情况、线程数、垃圾回收的时间、频率和对象数量、类加载情况、CPU使用率和系统负载、文件描述符使用情况、网络IO、磁盘IO、JVM运行状态。
1.代码1(python: jpype + subprocess + smtplib )
#该Python脚本在检测到指标异常时会自动尝试恢复(重启JVM),并把异常情况和恢复结果通过邮件通知多位相关人员。
import jpype
import schedule
import time
import smtplib
import subprocess
# 001: SMTP settings used when sending notification mail
from_addr, password = "your_email_address", "your_email_口令"
smtp_server, smtp_port = "your_smtp_server_address", 587

# 002: addresses of everyone who should be notified
to = ["recipient1@example.com", "recipient2@example.com"]

# 003: JMX beans to query and the attributes to read from each of them.
# The OperatingSystem bean is listed several times, once per attribute group.
_OS_BEAN = "java.lang:type=OperatingSystem"
_METRIC_TABLE = (
    ("java.lang:type=Memory", ("HeapMemoryUsage", "NonHeapMemoryUsage")),
    ("java.lang:type=Threading", ("ThreadCount",)),
    ("java.lang:type=GarbageCollector,*", ("CollectionCount", "CollectionTime")),
    ("java.lang:type=ClassLoading", ("LoadedClassCount", "UnloadedClassCount")),
    (_OS_BEAN, ("ProcessCpuLoad", "SystemLoadAverage")),
    (_OS_BEAN, ("OpenFileDescriptorCount", "MaxFileDescriptorCount")),
    (_OS_BEAN, ("TotalPhysicalMemorySize", "FreePhysicalMemorySize")),
    (_OS_BEAN, ("TotalSwapSpaceSize", "FreeSwapSpaceSize")),
    (_OS_BEAN, ("SystemCpuLoad", "ProcessCpuTime")),
)
jvm_metrics = [
    {"name": bean, "attributes": list(attributes)}
    for bean, attributes in _METRIC_TABLE
]
# 004: start a JVM inside this Python process through JPype, passing "-ea"
# as a JVM argument. The Java classes below can only be loaded after this.
jpype.startJVM(jpype.getDefaultJVMPath(), "-ea")
# 005: JMX connection. Look up the Java classes needed for a remote JMX
# session, then connect once at import time; `connection` is the
# MBeanServerConnection that the functions below read attributes from.
JMXServiceURL = jpype.JClass("javax.management.remote.JMXServiceURL")
JMXConnectorFactory = jpype.JClass("javax.management.remote.JMXConnectorFactory")
Hashtable = jpype.JClass("java.util.Hashtable")
# Target: the JMX RMI endpoint on localhost, port 7199.
url = JMXServiceURL("service:jmx:rmi:///jndi/rmi://localhost:7199/jmxrmi")
# An empty Hashtable is passed as the connection environment.
jmxConnector = JMXConnectorFactory.connect(url, Hashtable())
connection = jmxConnector.getMBeanServerConnection()
def get_jvm_metrics():
    """Read every attribute listed in ``jvm_metrics`` over the JMX connection.

    Returns:
        dict: maps ``"<bean name>:<attribute>"`` to the value returned by
        ``connection.getAttribute``. For a wildcard bean name (for example
        the GarbageCollector entry) the key uses the canonical name of each
        matching bean instead of the pattern. Attributes that cannot be read
        are left out of the result.

    Side effects:
        When at least one read fails, a single summary e-mail listing all
        failures is sent through ``send_email``.
    """
    # getAttribute expects a javax.management.ObjectName built from the bean
    # name alone; `jpype.JObjectName` is not part of JPype's documented API
    # and the attribute name must not be appended to the bean name.
    ObjectName = jpype.JClass("javax.management.ObjectName")
    metrics_data = {}
    failures = []
    for metric in jvm_metrics:
        bean_name = metric["name"]
        try:
            object_name = ObjectName(bean_name)
            if object_name.isPattern():
                # getAttribute does not accept patterns: expand to real beans.
                targets = [
                    (str(found.getCanonicalName()), found)
                    for found in connection.queryNames(object_name, None)
                ]
            else:
                targets = [(bean_name, object_name)]
        except Exception as exc:
            failures.append("Error resolving bean {}: {}".format(bean_name, exc))
            continue
        for key_prefix, target in targets:
            for attribute in metric["attributes"]:
                metric_name = key_prefix + ":" + attribute
                try:
                    metrics_data[metric_name] = connection.getAttribute(target, attribute)
                except Exception as exc:
                    failures.append(
                        "Error getting metric {}. ({})".format(metric_name, exc)
                    )
    if failures:
        # One message per pass instead of one per failed attribute.
        send_email("JVM Monitoring Error", "\n".join(failures), to)
    return metrics_data
def monitor_jvm():
    """Check the collected JVM metrics and trigger a recovery when one is too high.

    Reads the metrics with ``get_jvm_metrics``, sends one e-mail per threshold
    violation through ``send_email`` and then calls ``recover_jvm`` at most
    once per pass. Any unexpected error is reported by e-mail instead of
    being raised, so the scheduler keeps running.
    """
    try:
        metrics_data = get_jvm_metrics()
        problems = []

        heap = metrics_data.get("java.lang:type=Memory:HeapMemoryUsage")
        if heap is not None:
            used, maximum = heap.get("used"), heap.get("max")
            # MemoryUsage reports max as -1 when it is undefined; skip the
            # ratio check in that case instead of comparing against a
            # negative bound.
            if maximum > 0 and used > 0.8 * maximum:
                problems.append("Heap memory usage is too high.")

        cpu_load = metrics_data.get("java.lang:type=OperatingSystem:ProcessCpuLoad")
        if cpu_load is not None and cpu_load > 0.8:
            problems.append("CPU usage is too high.")

        # NOTE(review): a load average is not a 0..1 ratio; the 0.8 limit is
        # kept from the original and probably needs scaling by CPU count.
        system_load = metrics_data.get("java.lang:type=OperatingSystem:SystemLoadAverage")
        if system_load is not None and system_load > 0.8:
            problems.append("System load is too high.")

        for error_message in problems:
            send_email("JVM Monitoring Error", error_message, to)
        if problems:
            # Restart once, however many thresholds were exceeded: every
            # recovery blocks for a full minute.
            recover_jvm()
    except Exception as exc:
        error_message = "Error monitoring JVM metrics.\n{!r}".format(exc)
        send_email("JVM Monitoring Error", error_message, to)
def recover_jvm():
    """Restart the monitored JVM and report by e-mail whether it came back.

    Runs the startup script, waits 60 seconds, reconnects to JMX and reads
    the metrics again. A recovery e-mail is sent only if the script exited
    with code 0 and at least one metric could be read afterwards; otherwise
    an error e-mail is sent.
    """
    global jmxConnector, connection
    try:
        # Restart the JVM process.
        exit_code = subprocess.call(["/path/to/jvm/startup/script"])
        if exit_code != 0:
            send_email(
                "JVM Recovery Error",
                "Error recovering JVM. Startup script exited with code {}.".format(exit_code),
                to,
            )
            return
        time.sleep(60)  # give the JVM time to start
        # Assumes the script restarts the JVM that serves `url`: a connection
        # opened to the previous process cannot be reused, so reconnect.
        jmxConnector = JMXConnectorFactory.connect(url, Hashtable())
        connection = jmxConnector.getMBeanServerConnection()
        # get_jvm_metrics reports read errors itself and returns what it
        # could read, so an empty result means the JVM is still unreachable.
        if get_jvm_metrics():
            recovery_message = "JVM has been recovered."
            send_email("JVM Recovery", recovery_message, to)
        else:
            send_email("JVM Recovery Error", "Error recovering JVM.", to)
    except Exception as exc:
        error_message = "Error recovering JVM.\n{!r}".format(exc)
        send_email("JVM Recovery Error", error_message, to)
def send_email(subject, message, to):
    """Send a plain-text notification through the configured SMTP server.

    Args:
        subject: subject line of the message.
        message: body text; non-ASCII text is supported.
        to: one recipient address, or an iterable of addresses.

    Raises:
        smtplib.SMTPException, OSError: if connecting, authenticating or
        sending fails. The connection is closed in every case.
    """
    from email.message import EmailMessage

    recipients = [to] if isinstance(to, str) else list(to)

    # A structured message carries proper From/To headers and is encoded
    # correctly, unlike a hand-built "Subject: ..." string, which sendmail()
    # can only transmit when it is pure ASCII.
    email_message = EmailMessage()
    email_message["Subject"] = subject
    email_message["From"] = from_addr
    email_message["To"] = ", ".join(recipients)
    email_message.set_content(message)

    # The context manager closes the connection even when starttls(),
    # login() or the send itself fails; the timeout keeps a dead mail server
    # from blocking the monitoring loop indefinitely.
    with smtplib.SMTP(smtp_server, smtp_port, timeout=30) as server:
        server.starttls()
        server.login(from_addr, password)
        server.send_message(email_message, from_addr=from_addr, to_addrs=recipients)
# 006: run monitor_jvm every five minutes until the script is interrupted
schedule.every(5).minutes.do(monitor_jvm)
try:
    while True:
        schedule.run_pending()
        time.sleep(1)
except KeyboardInterrupt:
    # Ctrl+C is the normal way to stop the monitor.
    pass
finally:
    # 007: release the JMX connection and shut the embedded JVM down. Placed
    # in `finally` because code after an endless loop would never run.
    try:
        jmxConnector.close()
    except Exception:
        # Best effort: the remote JVM may already be gone.
        pass
    jpype.shutdownJVM()
2. 代码2(python: jpype + smtplib + subprocess + requests)
#支持同时对多台Hadoop集群节点的JVM进程监控和恢复:
import jpype
import schedule
import time
import smtplib
import subprocess
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.application import MIMEApplication
from typing import List, Dict, Any
import requests
# SMTP settings used when sending notification mail
from_addr, password = "your_email_address", "your_email口令"
smtp_server, smtp_port = "your_smtp_server_address", 587

# Addresses of everyone who should be notified
to = ["recipient1@example.com", "recipient2@example.com"]

# Hadoop cluster nodes: a display name plus the node's HTTP address
hadoop_nodes = [
    {"name": node_id, "address": "http://{}:50070".format(node_id)}
    for node_id in ("node1", "node2", "node3")
]

# JMX beans to query and the attributes to read from each of them.
# The OperatingSystem bean is listed several times, once per attribute group.
_OS_BEAN = "java.lang:type=OperatingSystem"
_METRIC_TABLE = (
    ("java.lang:type=Memory", ("HeapMemoryUsage", "NonHeapMemoryUsage")),
    ("java.lang:type=Threading", ("ThreadCount",)),
    ("java.lang:type=GarbageCollector,*", ("CollectionCount", "CollectionTime")),
    ("java.lang:type=ClassLoading", ("LoadedClassCount", "UnloadedClassCount")),
    (_OS_BEAN, ("ProcessCpuLoad", "SystemLoadAverage")),
    (_OS_BEAN, ("OpenFileDescriptorCount", "MaxFileDescriptorCount")),
    (_OS_BEAN, ("TotalPhysicalMemorySize", "FreePhysicalMemorySize")),
    (_OS_BEAN, ("TotalSwapSpaceSize", "FreeSwapSpaceSize")),
    (_OS_BEAN, ("SystemCpuLoad", "ProcessCpuTime")),
)
jvm_metrics = [
    {"name": bean, "attributes": list(attributes)}
    for bean, attributes in _METRIC_TABLE
]
from urllib.parse import urlparse

# Placeholder JMX/RMI port; it must match the value each node passes as
# -Dcom.sun.management.jmxremote.port. Override per node with a "jmx_port"
# key in its hadoop_nodes entry.
DEFAULT_JMX_PORT = 7199

# Start a JVM inside this Python process through JPype
jpype.startJVM(jpype.getDefaultJVMPath(), "-ea")
# Java classes needed for remote JMX sessions
JMXServiceURL = jpype.JClass("javax.management.remote.JMXServiceURL")
JMXConnectorFactory = jpype.JClass("javax.management.remote.JMXConnectorFactory")
Hashtable = jpype.JClass("java.util.Hashtable")
# One connector per node, kept in the same order as hadoop_nodes because
# get_jvm_metrics pairs the two lists by index.
jmx_connectors = []
for node in hadoop_nodes:
    # node["address"] is the node's HTTP address; a JMXServiceURL must start
    # with "service:jmx:", so only the host name is reused from it.
    jmx_host = urlparse(node["address"]).hostname
    jmx_port = node.get("jmx_port", DEFAULT_JMX_PORT)
    url = JMXServiceURL(
        "service:jmx:rmi:///jndi/rmi://{}:{}/jmxrmi".format(jmx_host, jmx_port)
    )
    jmx_connector = JMXConnectorFactory.connect(url, Hashtable())
    jmx_connectors.append(jmx_connector)
def get_jvm_metrics() -> List[Dict[str, Any]]:
    """Read every attribute listed in ``jvm_metrics`` from every connected node.

    Returns:
        A list of ``{"node", "metric_name", "value"}`` dicts, one per
        attribute that could be read. ``metric_name`` has the form
        ``"<bean name>:<attribute>"``; for a wildcard bean name it uses the
        canonical name of each matching bean instead of the pattern.

    Side effects:
        When at least one read fails, a single summary e-mail listing all
        failures is sent through ``send_email``.
    """
    # getAttribute expects a javax.management.ObjectName built from the bean
    # name alone; `jpype.JObjectName` is not part of JPype's documented API
    # and the attribute name must not be appended to the bean name.
    ObjectName = jpype.JClass("javax.management.ObjectName")
    metrics_data = []
    failures = []
    for index, jmx_connector in enumerate(jmx_connectors):
        node_name = hadoop_nodes[index]["name"]
        try:
            connection = jmx_connector.getMBeanServerConnection()
        except Exception as exc:
            failures.append("Error connecting to node {}: {}".format(node_name, exc))
            continue
        for metric in jvm_metrics:
            bean_name = metric["name"]
            try:
                object_name = ObjectName(bean_name)
                if object_name.isPattern():
                    # getAttribute does not accept patterns: expand to real beans.
                    targets = [
                        (str(found.getCanonicalName()), found)
                        for found in connection.queryNames(object_name, None)
                    ]
                else:
                    targets = [(bean_name, object_name)]
            except Exception as exc:
                failures.append(
                    "Error resolving bean {} on node {}: {}".format(bean_name, node_name, exc)
                )
                continue
            for key_prefix, target in targets:
                for attribute in metric["attributes"]:
                    metric_name = key_prefix + ":" + attribute
                    try:
                        value = connection.getAttribute(target, attribute)
                    except Exception as exc:
                        failures.append(
                            "Error getting metric {} from node {}. ({})".format(
                                metric_name, node_name, exc
                            )
                        )
                        continue
                    metrics_data.append(
                        {"node": node_name, "metric_name": metric_name, "value": value}
                    )
    if failures:
        # One message per pass instead of one per failed attribute.
        send_email("JVM Monitoring Error", "\n".join(failures), to)
    return metrics_data
def monitor_jvm():
    """Check the metrics of every node and recover the nodes that exceed a limit.

    For each node with at least one threshold violation, one e-mail listing
    the violations is sent through ``send_email`` and ``recover_jvm`` is
    called once for that node. Any unexpected error is reported by e-mail
    instead of being raised, so the scheduler keeps running.
    """
    try:
        metrics_data = get_jvm_metrics()
        # node name -> list of problem descriptions (dicts keep insertion order)
        problems_by_node = {}
        for data in metrics_data:
            node_name = data["node"]
            metric_name = data["metric_name"]
            value = data["value"]
            problem = None
            if metric_name == "java.lang:type=Memory:HeapMemoryUsage":
                used, maximum = value.get("used"), value.get("max")
                # MemoryUsage reports max as -1 when it is undefined.
                if maximum > 0 and used > 0.8 * maximum:
                    problem = "Heap memory usage is too high on node {}.".format(node_name)
            elif metric_name == "java.lang:type=OperatingSystem:ProcessCpuLoad":
                if value > 0.8:
                    problem = "CPU usage is too high on node {}.".format(node_name)
            elif metric_name == "java.lang:type=OperatingSystem:SystemLoadAverage":
                # NOTE(review): a load average is not a 0..1 ratio; the 0.8
                # limit is kept from the original and probably needs scaling.
                if value > 0.8:
                    problem = "System load is too high on node {}.".format(node_name)
            if problem is not None:
                problems_by_node.setdefault(node_name, []).append(problem)

        for node_name, problems in problems_by_node.items():
            # The message used to be built but never sent; notify first, then
            # restart the node once no matter how many limits it exceeded.
            send_email("JVM Monitoring Error", "\n".join(problems), to)
            recover_jvm(node_name)
    except Exception as exc:
        error_message = "Error monitoring JVM metrics.\n{!r}".format(exc)
        send_email("JVM Monitoring Error", error_message, to)
def recover_jvm(node_name):
    """Restart the JVM on *node_name* and report by e-mail whether it came back.

    Runs the startup script with the node name as its argument, waits 60
    seconds and then requests the node's ``/jmx`` HTTP endpoint. A recovery
    e-mail is sent on HTTP 200; every other outcome produces an error e-mail.

    Args:
        node_name: the ``name`` of an entry in ``hadoop_nodes``.
    """
    try:
        # Restart the JVM process.
        exit_code = subprocess.call(["/path/to/jvm/startup/script", node_name])
        if exit_code != 0:
            error_message = "JVM recovery failed on node {}.".format(node_name)
            send_email(
                "JVM Recovery Error",
                "{} Startup script exited with code {}.".format(error_message, exit_code),
                to,
            )
            return
        time.sleep(60)  # give the JVM time to start
        # Probe the node over HTTP; the timeout keeps an unresponsive node
        # from blocking the monitoring loop forever.
        response = requests.get(
            "{}/jmx?qry=java.lang:type=Memory".format(get_node_address(node_name)),
            timeout=10,
        )
        if response.status_code == 200:
            recovery_message = "JVM has been recovered on node {}.".format(node_name)
            send_email("JVM Recovery", recovery_message, to)
        else:
            error_message = "JVM recovery failed on node {}.".format(node_name)
            send_email("JVM Recovery Error", error_message, to)
    except Exception as exc:
        error_message = "Error recovering JVM on node {}.\n{!r}".format(node_name, exc)
        send_email("JVM Recovery Error", error_message, to)
def send_email(subject, message, to):
    """Send a plain-text notification through the configured SMTP server.

    Args:
        subject: subject line of the message.
        message: body text.
        to: list of recipient addresses.

    Raises:
        smtplib.SMTPException, OSError: if connecting, authenticating or
        sending fails. The connection is closed in every case.
    """
    msg = MIMEMultipart()
    msg["From"] = from_addr
    msg["To"] = ", ".join(to)
    msg["Subject"] = subject
    msg.attach(MIMEText(message))
    # The context manager closes the connection even when starttls(),
    # login() or sendmail() fails; the timeout keeps a dead mail server from
    # blocking the monitoring loop indefinitely.
    with smtplib.SMTP(smtp_server, smtp_port, timeout=30) as server:
        server.starttls()
        server.login(from_addr, password)
        server.sendmail(from_addr, to, msg.as_string())
def get_node_address(node_name) -> str:
    """Return the ``address`` of the first ``hadoop_nodes`` entry named *node_name*.

    An empty string is returned when no entry has that name.
    """
    matching_addresses = (
        entry["address"] for entry in hadoop_nodes if entry["name"] == node_name
    )
    return next(matching_addresses, "")
# Run monitor_jvm every five minutes until the script is interrupted
schedule.every(5).minutes.do(monitor_jvm)
try:
    while True:
        schedule.run_pending()
        time.sleep(1)
except KeyboardInterrupt:
    # Ctrl+C is the normal way to stop the monitor.
    pass
finally:
    # Release the JMX connections and shut the embedded JVM down. Placed in
    # `finally` because code after an endless loop would never run.
    for jmx_connector in jmx_connectors:
        try:
            jmx_connector.close()
        except Exception:
            # Best effort: the remote JVM may already be gone.
            pass
    jpype.shutdownJVM()
'''
在上述代码中,我们添加了多个Hadoop节点的列表,并通过循环连接每个节点的JMX服务。
在获取JVM指标时,我们将每个指标的值与其所属节点一起记录,并在发现异常情况时仅恢复对应节点上的JVM进程。
我们还添加了一个get_node_address函数,用于根据节点名称获取其地址。
最后,我们使用Python的requests库来测试JVM是否已经恢复。
'''
3.代码3(JMXtrans + JMX)
- 借助JMX(Java Management Extensions)来实现,JMX是一种Java的管理和监控技术,可以通过JMX获取JVM的运行状态信息。需要在每台节点安装jmxtrans、jmxterm等相关组件。
- 使用Python脚本连接每个节点的JMX端口,获取JVM的实时监控指标并进行监控;如果发现异常,则发送邮件通知多位相关人员。每台节点需要监控的JVM指标包括:JVM内存使用情况、线程数、垃圾回收的时间、频率和对象数量、类加载情况、CPU使用率和系统负载、文件描述符使用情况、网络IO、磁盘IO、JVM运行状态等。
'''
以下是一个大致的实现思路:
1. 使用Python的paramiko模块连接到Hadoop集群的每个节点,并运行JVM监控命令,如jstat、jmap等,将结果保存到本地。
2. 使用Python的pandas模块读取和处理监控结果,提取出需要监控的指标,如JVM内存使用情况、线程数、垃圾回收时间频率和对象数量、类加载情况,CPU使用率和负载均衡、文件描述符使用情况、网络IO、磁盘IO等。
3. 对于每个节点,将提取的指标保存到一个本地文件中,并将它们汇总到一个单独的文件中,以便在需要时检查所有节点的状态。
4. 使用Python的smtplib模块发送电子邮件通知,如果任何节点的状态出现异常,则触发电子邮件通知。可以通过设置阈值来定义什么是异常状态。
5. 使用Python的schedule模块设置定期运行的任务,以便监视集群中的所有节点,并在需要时发送电子邮件通知。
6. 可以使用Python的Flask框架构建一个Web应用程序,以便实时监视集群中的所有节点,并在需要时发送电子邮件通知。
'''
import jmxterm
import smtplib
from email.mime.text import MIMEText
from email.header import Header
# Hosts whose JVMs are monitored
nodes = ['192.168.0.1', '192.168.0.2']

# JMX connection parameters (user name and password are both empty)
jmx_port = 9999
jmx_username = jmx_password = ''

# Alert thresholds, one per monitored resource, all set to the same value
# (presumably percentages; confirm against the code that compares them)
mem_threshold = cpu_threshold = fd_threshold = net_threshold = disk_threshold = 80
# Send an alert e-mail
def send_email(content):
    """Send *content* as a plain-text alert to the hard-coded receivers.

    Failures are printed, not raised, so a mail problem never interrupts
    the monitoring run.

    Args:
        content: body text of the alert.
    """
    mail_host = "smtp.163.com"
    mail_user = "sender@163.com"
    mail_pass = "password"
    sender = 'sender@163.com'
    receivers = ['receiver1@163.com', 'receiver2@163.com']

    message = MIMEText(content, 'plain', 'utf-8')
    message['From'] = Header("JVM Monitor", 'utf-8')
    message['To'] = Header("Admin", 'utf-8')
    subject = 'JVM Monitor Alert'
    message['Subject'] = Header(subject, 'utf-8')

    try:
        # The context manager closes the connection on every path, which the
        # original never did.
        with smtplib.SMTP(mail_host, 25, timeout=30) as smtpObj:
            smtpObj.login(mail_user, mail_pass)
            smtpObj.sendmail(sender, receivers, message.as_string())
        print("邮件发送成功")
    except (smtplib.SMTPException, OSError) as e:
        # OSError covers DNS, refused-connection and timeout failures, which
        # are not SMTPException instances and used to escape this handler.
        print("Error: 邮件发送失败", e)
# Monitor the JVM metrics of one node
def _jmx_number(dump, attribute):
    """Return the number printed as ``<attribute> = <number>`` in *dump*."""
    import re

    found = re.search(
        r"\b" + re.escape(attribute) + r"\s*=\s*(-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)",
        dump,
    )
    if found is None:
        raise ValueError("attribute {} not found in JMX output".format(attribute))
    return float(found.group(1))


def _heap_usage_percent(memory_dump):
    """Return heap ``used`` as a percentage of ``max`` (0.0 if max is not positive)."""
    import re

    section = re.search(r"\bHeapMemoryUsage\s*=\s*\{([^}]*)\}", memory_dump)
    if section is None:
        raise ValueError("HeapMemoryUsage not found in JMX output")
    used = _jmx_number(section.group(1), "used")
    maximum = _jmx_number(section.group(1), "max")
    return 100.0 * used / maximum if maximum > 0 else 0.0


def monitor_jvm(node):
    """Read memory, CPU and file-descriptor usage of *node* and alert on excess.

    Every value is converted to a percentage before it is compared with the
    module-level thresholds. Any failure is reported through ``send_email``
    and the JMX session is closed on every path.

    Args:
        node: host name or IP address of the node to inspect.
    """
    conn = None
    try:
        # Open the JMX session.
        # NOTE(review): the `jmxterm` Python API is used exactly as in the
        # original; its behaviour and output format are assumed, not verified.
        conn = jmxterm.JMXConnection('localhost', jmx_port)
        conn.handle_command('open {0}:{1}'.format(node, jmx_port))

        # Fetch the two beans the checks below rely on.
        mem_info = conn.handle_command('bean java.lang:type=Memory')
        os_info = conn.handle_command('bean java.lang:type=OperatingSystem')

        # Heap usage as a share of the maximum heap, not a raw byte count.
        mem_usage = _heap_usage_percent(mem_info)
        # ProcessCpuLoad is a fraction, so int() on its text raised
        # ValueError; scale it to a percentage.
        cpu_usage = 100.0 * _jmx_number(os_info, 'ProcessCpuLoad')
        # Open descriptors relative to the limit, not the limit itself.
        max_fds = _jmx_number(os_info, 'MaxFileDescriptorCount')
        open_fds = _jmx_number(os_info, 'OpenFileDescriptorCount')
        fd_usage = 100.0 * open_fds / max_fds if max_fds > 0 else 0.0
        # The former network/disk checks indexed a second "ProcessCpuLoad = "
        # occurrence, which raised IndexError on every run. The
        # OperatingSystem bean defines no network or disk counters, so those
        # checks are dropped; net_threshold/disk_threshold stay available
        # for a real data source.

        # Compare against the thresholds.
        if mem_usage > mem_threshold:
            send_email(node + ' JVM memory usage is too high!')
        if cpu_usage > cpu_threshold:
            send_email(node + ' CPU usage is too high!')
        if fd_usage > fd_threshold:
            send_email(node + ' File descriptor usage is too high!')
    except Exception as e:
        send_email(node + ' JVM monitor error: ' + str(e))
    finally:
        # Close the JMX session even when a step above failed.
        if conn is not None:
            try:
                conn.handle_command('close')
            except Exception:
                # Best effort: the session may already be unusable.
                pass
# Check every configured node once, in list order
for node in nodes:
    monitor_jvm(node)
4.代码4(JMX + pyjmx)
1.安装并配置JMX
JMX是Java平台的管理和监控机制,可以通过JMX获取Java虚拟机(JVM)的各种运行指标。要使用JMX监控Hadoop集群所有节点的JVM,需要在每个节点上安装和配置JMX。以下是安装和配置JMX的步骤:
确认每个节点上已安装JDK:JMX是JDK内置的功能,无需单独下载或解压安装包。
设置JMX的环境变量。
可以在.bashrc
或.profile
中添加以下代码
export JAVA_HOME=<Java的安装路径>
export JMX_PORT=<JMX的监听端口>
export JMX_OPTS="-Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.local.only=false -Djava.rmi.server.hostname=<节点的IP地址>"
#其中,JAVA_HOME是Java的安装路径,JMX_PORT是JMX的监听端口,-Djava.rmi.server.hostname是节点的IP地址。
- 启动JMX。
可以在启动Hadoop服务的启动脚本中加入以下代码:
export HADOOP_OPTS="$HADOOP_OPTS -Dcom.sun.management.jmxremote.port=<jmx-port> -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.local.only=false -Djava.rmi.server.hostname=<ip-address>"
- 连接到每个节点的JMX并收集指标
Python可以通过JMX连接到每个节点的JVM,并收集JVM的各种指标。
使用JMX连接需要使用Java的jmxremote.jar。jmxremote.jar包含了JMX的Java API和JMX的一些类库。
以下是连接到每个节点的JMX并收集JVM指标的步骤:
安装pyjmx库。pyjmx是Python的一个JMX客户端库,可以使用pip安装。
pip install pyjmx
4.编写Python脚本,连接到每个节点的JMX,并收集JVM的各种指标。
import pyjmx
# JMX connection settings
jmx_host = "localhost"
jmx_port = 9999
# Connect to JMX
# NOTE(review): the pyjmx API is used exactly as in the original; its
# behaviour is assumed, not verified.
jmx = pyjmx.JMX(jmx_host, jmx_port)


def _optional_attribute(bean, attribute):
    """Read an attribute that not every JVM exposes; return None if it cannot be read."""
    try:
        return jmx.get_attribute(bean, attribute)
    except Exception as exc:  # the exception type raised by pyjmx is not visible here
        print("JMX attribute {} of {} is unavailable: {}".format(attribute, bean, exc))
        return None


try:
    # JVM memory usage
    heap_memory_used = jmx.get_attribute("java.lang:type=Memory", "HeapMemoryUsage.used")
    non_heap_memory_used = jmx.get_attribute("java.lang:type=Memory", "NonHeapMemoryUsage.used")
    # Thread count
    thread_count = jmx.get_attribute("java.lang:type=Threading", "ThreadCount")
    # Garbage-collection time and count. "PS MarkSweep" exists only when the
    # parallel collector is in use, so a missing bean is tolerated.
    gc_time = _optional_attribute("java.lang:type=GarbageCollector,name=PS MarkSweep", "CollectionTime")
    gc_obj_count = _optional_attribute("java.lang:type=GarbageCollector,name=PS MarkSweep", "CollectionCount")
    # Class loading
    class_loaded_count = jmx.get_attribute("java.lang:type=ClassLoading", "LoadedClassCount")
    class_unloaded_count = jmx.get_attribute("java.lang:type=ClassLoading", "UnloadedClassCount")
    # System load and CPU usage
    cpu_load = jmx.get_attribute("java.lang:type=OperatingSystem", "SystemLoadAverage")
    cpu_usage = jmx.get_attribute("java.lang:type=OperatingSystem", "ProcessCpuLoad")
    # File-descriptor usage
    fd_open_count = jmx.get_attribute("java.lang:type=OperatingSystem", "OpenFileDescriptorCount")
    fd_max_count = jmx.get_attribute("java.lang:type=OperatingSystem", "MaxFileDescriptorCount")
    # Network and disk I/O. The standard OperatingSystem MXBean defines none
    # of these attributes, so they are read on a best-effort basis and are
    # None unless the target JVM provides them.
    net_rx_bytes = _optional_attribute("java.lang:type=OperatingSystem", "BytesReceived")
    net_tx_bytes = _optional_attribute("java.lang:type=OperatingSystem", "BytesSent")
    disk_read_bytes = _optional_attribute("java.lang:type=OperatingSystem", "DiskReadBytes")
    disk_write_bytes = _optional_attribute("java.lang:type=OperatingSystem", "DiskWriteBytes")
    # JVM run state. Uptime is a standard Runtime attribute; "State" is not.
    jvm_uptime = jmx.get_attribute("java.lang:type=Runtime", "Uptime")
    jvm_state = _optional_attribute("java.lang:type=Runtime", "State")
finally:
    # Close the JMX connection even when one of the reads above fails.
    jmx.close()