1. ambari-agent 的初始化脚本位于 /etc/init.d/ambari-agent,来自于源码 ambari/ambari-agent/etc/init.d/ambari-agent;其中 start 和 stop 分支都把全部参数原样转交给 /usr/sbin/ambari-agent:
# Dispatch on the first argument passed to the init script.
case "$1" in
start)
# Forward every original argument to the agent launcher through
# $command_prefx (defined outside this excerpt).
$command_prefx "/usr/sbin/ambari-agent $@"
;;
stop)
# Same forwarding as start; the launcher receives "stop" as part of $@.
$command_prefx "/usr/sbin/ambari-agent $@"
;;
2. 接着执行 /usr/sbin/ambari-agent 脚本,其代码来自于 ambari/ambari-agent/src/main/python/AmbariAgent.py:
# Locate the agent entry script: use PYTHON_BIN when the environment provides
# it, otherwise fall back to the stock Python 2.6 site-packages location.
if "PYTHON_BIN" in os.environ:
    AGENT_SCRIPT = os.path.join(os.environ["PYTHON_BIN"], "site-packages/ambari_agent/main.py")
else:
    AGENT_SCRIPT = "/usr/lib/python2.6/site-packages/ambari_agent/main.py"
# Launcher entry point (excerpt): runs the agent script as a child process and
# starts it again for as long as it exits with AGENT_AUTO_RESTART_EXIT_CODE.
# NOTE(review): indentation was lost in this excerpt, and PYTHON, args and the
# initial value of status are defined outside the lines shown.
def main():
# Child command line: interpreter, agent script, then args.
mergedArgs = [PYTHON, AGENT_SCRIPT] + args
while status == AGENT_AUTO_RESTART_EXIT_CODE:
mainProcess = subprocess.Popen(mergedArgs)
# communicate() waits for the child to exit, so returncode is set afterwards.
mainProcess.communicate()
status = mainProcess.returncode
# Remove the PID file when it exists and the child asked for a restart.
if os.path.isfile(AGENT_PID_FILE) and status == AGENT_AUTO_RESTART_EXIT_CODE:
os.remove(AGENT_PID_FILE)
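3. AmbariAgent.py 的 main() 方法通过 subprocess.Popen 启动一个子进程来执行 main.py 脚本;当子进程以 AGENT_AUTO_RESTART_EXIT_CODE 退出时,会删除 PID 文件并重新启动子进程。main.py 来自于 ambari/ambari-agent/src/main/python/main.py: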
# Root logger plus a separate logger named 'ambari_alerts'.
logger = logging.getLogger()
alerts_logger = logging.getLogger('ambari_alerts')
# Log line layout: level, timestamp, source file:line, then the message.
formatstr = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d - %(message)s"
# PID of this process; passed to bind_signal_handlers() below.
agentPid = os.getpid()
config = AmbariConfig.AmbariConfig()
configFile = config.getConfigFile()
# NOTE(review): indentation was lost in this excerpt; the lines after the
# guard below belong to its body.
if __name__ == "__main__":
is_logger_setup = False
try:
# bind_signal_handlers() (defined elsewhere) returns the callback that is
# handed to main().
heartbeat_stop_callback = bind_signal_handlers(agentPid)
main(heartbeat_stop_callback)
except SystemExit:
# Let an explicit exit propagate unchanged.
raise
4. AmbariConfig 类的初始化方法如下,它创建 RawConfigParser 并从全局字符串 content 中读取配置:
class AmbariConfig:
    """Agent configuration backed by a ``RawConfigParser``."""

    def __init__(self):
        """Create the parser and the NetUtil helper, then load ``content``.

        ``content`` is a module-level string defined outside this excerpt.
        """
        global content
        self.config = ConfigParser.RawConfigParser()
        # NetUtil is constructed before the configuration text is read;
        # the original statement order is kept.
        self.net = NetUtil(self)
        self.config.readfp(StringIO.StringIO(content))
5. main.py 中 main() 方法的源码如下:
# Agent entry point (excerpt): try to connect to the server and, on success,
# run a Controller until it is no longer alive.
# NOTE(review): indentation was lost, and config, logger, server_url,
# server_hostname and MAX_RETRIES come from code not shown here.
def main(heartbeat_stop_callback=None):
netutil = NetUtil(config, heartbeat_stop_callback)
# The result is unpacked into (retries, connected, stopped); only `connected`
# is consulted in the lines shown.
(retries, connected, stopped) = netutil.try_to_connect(server_url,
MAX_RETRIES, logger)
# if connected, launch controller
if connected:
logger.info('Connected to Ambari server %s', server_hostname)
# Set the active server
active_server = server_hostname
# Launch Controller communication
controller = Controller(config, server_hostname, heartbeat_stop_callback)
controller.start()
# Poll every 0.1 s until the controller is no longer alive.
while controller.is_alive():
time.sleep(0.1)
# NOTE(review): active_server is only assigned in the connected branch above;
# confirm it is initialised earlier in the full source.
return active_server
main() 方法调用 NetUtil 类的 try_to_connect() 方法去连接 ambari-server;连接成功后创建 Controller 并调用其 start() 方法,随后 main() 每隔 0.1 秒检查一次 Controller 是否仍在运行。Controller 类的 run() 方法源码如下:
def run(self):
    """Register and heartbeat until no re-registration is requested.

    Each pass clears ``self.repeatRegistration`` and calls
    ``registerAndHeartbeat()``; the loop repeats only when that call left
    the flag set again. A completion message is logged once the loop ends.
    """
    # NOTE(review): nesting reconstructed from an unindented excerpt.
    while True:
        self.repeatRegistration = False
        self.registerAndHeartbeat()
        if not self.repeatRegistration:
            break
    logger.info("Controller thread has successfully finished")
run() 方法循环调用 registerAndHeartbeat() 方法向服务器注册并发送心跳;只有当 repeatRegistration 被重新置为 True 时才会再次注册,否则退出循环。registerAndHeartbeat() 的代码如下:
def registerAndHeartbeat(self):
    """Register with the server and, once registered, start heartbeating.

    Calls ``registerWithServer()`` and logs the ``"response"`` entry of its
    result. When ``self.isRegistered`` is set, the action queue is reset,
    every registration listener is invoked, the method sleeps for the
    NetUtil idle interval and then hands over to ``heartbeatWithServer()``.
    A result without a ``"response"`` key is only logged.
    """
    # NOTE(review): nesting reconstructed from an unindented excerpt.
    registerResponse = self.registerWithServer()
    if "response" not in registerResponse:
        # Pass the hostname as a logging argument: the previous
        # "...%s...".format(host) call left the literal "%s" in the message.
        logger.info("Registration response from %s didn't contain 'response' as a key",
                    self.serverHostname)
        return

    message = registerResponse["response"]
    logger.info("Registration response from %s was %s", self.serverHostname, message)

    if self.isRegistered:
        # Clearing command queue to stop executing "stale" commands
        # after registration
        logger.info('Resetting ActionQueue...')
        self.actionQueue.reset()

        # Process callbacks
        for callback in self.registration_listeners:
            callback()

        time.sleep(self.netutil.HEARTBEAT_IDLE_INTERVAL_DEFAULT_MAX_SEC)
        self.heartbeatWithServer()
代码先调用 registerWithServer() 方法向服务器注册,该方法会请求 server 端的注册接口,并带上本机的硬件参数信息。注册成功后,调用 heartbeatWithServer() 方法,其心跳循环的主要代码如下:
# Heartbeat loop (excerpt): repeats until DEBUG_STOP_HEARTBEATING is set.
# NOTE(review): indentation was lost; retry, state_interval, both timestamp
# variables and the except clause of the try are outside the lines shown.
while not self.DEBUG_STOP_HEARTBEATING:
heartbeat_interval = self.netutil.HEARTBEAT_IDLE_INTERVAL_DEFAULT_MAX_SEC
try:
crt_time = time.time()
# Log the "running" line only when more than state_interval seconds have
# passed since it was last logged.
if crt_time - heartbeat_running_msg_timestamp > int(state_interval):
logger.info("Heartbeat with server is running...")
heartbeat_running_msg_timestamp = crt_time
send_state = False
if not retry:
# Ask for state to be included once more than state_interval seconds have
# passed since last_state_timestamp.
if crt_time - last_state_timestamp > int(state_interval):
send_state = True
# Build the heartbeat payload and serialise it to JSON.
data = json.dumps(
self.heartbeat.build(self.responseId, send_state, self.hasMappedComponents))
else:
# Presumably pairs with `if not retry`: only the retry counter is bumped.
self.DEBUG_HEARTBEAT_RETRIES += 1
logger.debug("Sending Heartbeat (id = %s): %s", self.responseId, data)
heartbeatUrl=self.getHeartBeatUrl()
response = self.sendRequest(heartbeatUrl, data)
server 端返回给 agent 端的心跳响应中会带上要执行的命令,agent 按响应中的字段逐一处理:
# Dispatch on the keys present in the heartbeat response (excerpt).
# NOTE(review): indentation was lost, so the nesting of the branches below
# cannot be confirmed from these lines alone.
response_keys = response.keys()
if 'cancelCommands' in response_keys:
self.cancelCommandInQueue(response['cancelCommands'])
if 'executionCommands' in response_keys:
execution_commands = response['executionCommands']
# The recovery manager sees the commands before they are added to the queue.
self.recovery_manager.process_execution_commands(execution_commands)
self.addToQueue(execution_commands)
if 'statusCommands' in response_keys:
# try storing execution command details and desired state
self.addToStatusQueue(response['statusCommands'])
# Recovery commands are requested only when the action queue reports nothing
# in progress or pending.
if not self.actionQueue.tasks_in_progress_or_pending():
recovery_commands = self.recovery_manager.get_recovery_commands()
for recovery_command in recovery_commands:
logger.info("Adding recovery command %s for component %s",
recovery_command['roleCommand'], recovery_command['role'])
# Each recovery command is queued on its own, wrapped in a one-element list.
self.addToQueue([recovery_command])
if 'alertDefinitionCommands' in response_keys:
# The whole response, not just the alert definitions entry, is passed here.
self.alert_scheduler_handler.update_definitions(response)
if 'alertExecutionCommands' in response_keys:
self.alert_scheduler_handler.execute_alert(response['alertExecutionCommands'])
# NOTE(review): unlike the keys above, 'restartAgent' is indexed without a
# membership check - presumably the server always sends it; confirm.
if "true" == response['restartAgent']:
logger.error("Received the restartAgent command")
self.restartAgent()
else:
logger.debug("No commands sent from %s", self.serverHostname)