背景
在使用开源版本 hadoop 集群时,如有配置参数修改,需要人工修改并重启节点让配置生效,因此这里写了个脚本实现自动滚动重启。
脚本:
注意:不同版本的 hadoop,其启动脚本路径(本脚本使用 /opt/app/hadoop/sbin/hadoop-daemon.sh)与 datanode IPC 端口(本脚本使用 8010)可能不同,使用前请按实际环境调整。
#!/usr/bin/env python
# -*-coding:utf-8 -*-
import logging
import subprocess
import sys
import time
"""
滚动重启DataNode服务
通过如下命令关闭现有服务,然后通过远程命令启动当前机器服务
hdfs dfsadmin -shutdownDatanode <DATANODE_HOST:IPC_PORT> upgrade
hdfs dfsadmin -getDatanodeInfo <DATANODE_HOST:IPC_PORT>
"""
"""
初始化日志
"""
logger = logging.getLogger("Rolling Restart Hadoop Datanode")
logger.setLevel(logging.INFO)
stdout = logging.StreamHandler()
stdout.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
stdout.setFormatter(formatter)
logger.addHandler(stdout)
class RollingStartException(Exception):
    """Raised when a shell command or a rolling-restart step fails."""
class CheckDataNodeDeadStateException(Exception):
    """Raised when a datanode that is expected to be dead is still running."""
def runShellCommand(cmd):
    """
    Run a shell command and wait for it to finish.

    :param cmd: command string, executed through the shell (shell=True)
    :return: (returncode, stdout) when the command exits with 0
    :raises RollingStartException: when the command exits non-zero; the
        message includes the command's stderr (chekDatanodeState matches
        on this text to detect an unreachable datanode)
    """
    process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() both waits for the process and drains stdout/stderr.
    # The previous poll()+sleep loop read the pipes only after exit, which
    # can deadlock when the child fills a pipe buffer and blocks on write.
    out, err = process.communicate()
    code = process.returncode
    if code != 0:
        raise RollingStartException("run shell command failed,msg:{msg}".format(msg=err))
    return code, out
def shutDownDatanode(host):
    """
    Ask the datanode on *host* to shut itself down.

    Sends `hdfs dfsadmin -shutdownDatanode <host>:8010` as the hdfs user.

    :param host: datanode hostname
    :return: None
    :raises RollingStartException: if the shutdown command fails
    """
    logger.info("start shutdown datanode,current info: {host}".format(host=host))
    cmd = "su hdfs -c 'hdfs dfsadmin -shutdownDatanode {host}:8010'".format(host=host)
    _, out = runShellCommand(cmd)
    logger.info("send shutDownDatanode msg successful,msg:{msg}".format(msg=out))
def chekDatanodeState(host, targetState="active"):
    """
    Check the datanode's state via `hdfs dfsadmin -getDatanodeInfo`.

    :param host: datanode hostname
    :param targetState: "active" (default) — expect the datanode to be up;
        "dead" — we are waiting for it to stop
    :return: (code, out) from the command when the expected state is met;
        (0, "datanode is dead") when targetState=="dead" and the node is
        unreachable
    :raises CheckDataNodeDeadStateException: targetState=="dead" but the
        datanode still answers (i.e. it has not stopped yet)
    :raises RollingStartException: the command failed for any other reason
    """
    try:
        execCommand = "su hdfs -c 'hdfs dfsadmin -getDatanodeInfo {host}:8010'".format(host=host)
        result = runShellCommand(execCommand)
        # Bug fix: the original returned here unconditionally, making the
        # "dead" check below unreachable — checkDataNodeIsDead never waited.
        if targetState == "dead":
            # command succeeded => the datanode is still reachable/running
            logger.warning("datanode already running ,wait shutdown...")
            raise CheckDataNodeDeadStateException("datanode already running...")
        return result
    except RollingStartException as e:
        # An unreachable datanode is the *success* condition when waiting
        # for it to die.
        if targetState == "dead" and "getDatanodeInfo: Datanode unreachable" in str(e):
            logger.info("current datanode state became dead,dninfo: {host}".format(host=host))
            return 0, "datanode is dead"
        else:
            raise RollingStartException(e.args)
# TODO: probing the port directly might detect shutdown faster — investigate.
def checkDataNodeIsDead(host):
    """
    Wait until the datanode on *host* is dead, polling up to 5 times
    (1 second apart).

    :param host: datanode hostname
    :return: None once the datanode is confirmed dead
    :raises RollingStartException: still running after all retries
    """
    logger.info("wait datanode state to dead...")
    retrycount = 5
    lasterror = None
    while retrycount > 0:
        try:
            chekDatanodeState(host, targetState="dead")
            # Bug fix: the original set retrycount = 0 here, which fell
            # through to the failure-raise below even on success (and could
            # reference an unbound `e`). Return instead.
            return
        except CheckDataNodeDeadStateException as e:
            logger.warning("check datanode dead state with retry 5 times,current count {count}".format(count=retrycount))
            retrycount = retrycount - 1
            lasterror = e.args
            time.sleep(1)
    # retried 5 times without ever observing the dead state
    raise RollingStartException(lasterror)
def startDatanode(host):
    """
    Start the datanode on *host* over ssh, then wait until it reports active.

    :param host: datanode hostname
    :return: None
    :raises RollingStartException: start command or state check fails
    """
    logger.info("start datanode.......")
    # TODO: when upgrading, this must point at the new version's install path.
    daemonScript = "/opt/app/hadoop/sbin/hadoop-daemon.sh"
    remoteCmd = "ssh {host} -C 'su hdfs -c \"{command} start datanode\"'".format(host=host, command=daemonScript)
    logger.info(remoteCmd)
    runShellCommand(remoteCmd)
    # Poll until the freshly started datanode answers getDatanodeInfo.
    logger.info("wait datanode state to active...")
    _, out = chekDatanodeState(host)
    logger.info("restart datanode {host} successful,msg:{msg}".format(host=host, msg=out))
def rollingRestart():
    """
    Roll through the datanode host list in ./dnhost: for each host, shut
    the datanode down, wait for it to die, start it again, and wait for
    block reporting before moving on.

    Exits the process with status 255 on the first failure so a partially
    restarted cluster is not silently left behind.
    """
    # `with` guarantees the host-list file is closed, even on sys.exit()
    # (the original leaked the file handle).
    with open("./dnhost", "r") as dnlist:
        for line in dnlist:
            host = line.replace("\n", "")
            if not host:
                # tolerate blank lines in the host list
                continue
            try:
                shutDownDatanode(host)
                checkDataNodeIsDead(host)
                # wait for the dn to fully stop and its pid file to be cleaned
                # up before starting again
                time.sleep(5)
                startDatanode(host)
                # give the dn time to finish block reports to the namenode
                time.sleep(30)
            except RollingStartException as e:
                logger.error(e.args)
                # abort the whole rolling restart on the first error
                sys.exit(255)
if __name__ == '__main__':
    # Script entry point: roll-restart every datanode listed in ./dnhost.
    rollingRestart()