一. 文件 conductor/manager.py
执行命令 ironic node-create -d pxe_ipmitool -u ${ironic_node_uuid}
类ConductorManager中create_node传入的参数可以得出:只有uuid, driver, provision_state有值
boot_interface=<?>,chassis_id=<?>,clean_step=<?>,conductor_affinity=<?>,
console_enabled=<?>,console_interface=<?>,created_at=<?>,deploy_interface=<?>,driver='pxe_ipmitool',
driver_info=<?>,driver_internal_info=<?>,extra=<?>,id=<?>,inspect_interface=<?>,inspection_finished_at=<?>,
inspection_started_at=<?>,instance_info=<?>,instance_uuid=<?>,last_error=<?>,maintenance=<?>,
maintenance_reason=<?>,management_interface=<?>,name=<?>,network_interface=<?>,power_interface=<?>,
power_state=<?>,properties=<?>,provision_state='available',provision_updated_at=<?>,raid_config=<?>,
raid_interface=<?>,reservation=<?>,resource_class=<?>,storage_interface=<?>,target_power_state=<?>,
target_provision_state=<?>,target_raid_config=<?>,updated_at=<?>,
uuid=00000000-0000-0000-0000-000000000102,vendor_interface=<?>
@ check_and_update_node_interfaces:
@ 保存至数据库
node-update 差不多操作
二. _sync_power_states
定期任务同步电源状态
条件满足的:
@ 节点map到conductor
@ 节点 maintenance: false
@ 节点 provision state非DEPLOYWAIT/CLEANWAIT
@
过滤掉一些出错的问题:
主要函数:
# Run one sync pass for this node, handing in the count currently stored
# for it.
attempts = do_sync_power_state(
    task, self.power_state_sync_count[node_uuid])
if not attempts:
    # A zero result means nothing needs tracking: drop the entry so the
    # dict is not bloated with non-failing nodes.
    del self.power_state_sync_count[node_uuid]
else:
    self.power_state_sync_count[node_uuid] = attempts
2.1 函数 do_sync_power_state,
@METRICS.timer('do_sync_power_state')
def do_sync_power_state(task, count):
    """Reconcile a node's recorded power state with the state that was read.

    Only the second half of the function is reproduced in these notes. The
    omitted first half is where ``power_state``, ``max_retries`` and
    ``old_power_state`` are assigned.

    :param task: task holding the node; its lock is upgraded before the
        node is modified.
    :param count: attempt number for this node, compared against
        ``max_retries`` and reported in the failure log message.
    :returns: 0 when the recorded state already matches, when the node is
        in an excluded provision state or in maintenance, or when a first
        state was recorded; otherwise ``count``.
    """
    # [omitted 1] ... the first part of the function is not reproduced here ...

    # We will modify a node, so upgrade our lock and use reloaded node.
    # This call may raise NodeLocked that will be caught on upper level.
    task.upgrade_lock()
    node = task.node

    # Repeat all checks with exclusive lock to avoid races
    if node.power_state and node.power_state == power_state:
        # Node power state was updated to the correct value
        return 0
    elif node.provision_state in SYNC_EXCLUDED_STATES or node.maintenance:
        # Something was done to a node while a shared lock was held
        return 0
    elif node.power_state is None:
        # If node has no prior state AND we successfully got a state,
        # simply record that and send a notification.
        LOG.info("During sync_power_state, node %(node)s has no "
                 "previous known state. Recording current state '%(state)s'.",
                 {'node': node.uuid, 'state': power_state})
        node.power_state = power_state
        node.save()
        notify_utils.emit_power_state_corrected_notification(
            task, None)
        return 0

    # From here on the recorded state and the observed state disagree.
    if count > max_retries:
        handle_sync_power_state_max_retries_exceeded(task, power_state)
        return count

    if CONF.conductor.force_power_state_during_sync:
        # Forced mode: drive the hardware back to the recorded state.
        LOG.warning("During sync_power_state, node %(node)s state "
                    "'%(actual)s' does not match expected state. "
                    "Changing hardware state to '%(state)s'.",
                    {'node': node.uuid, 'actual': power_state,
                     'state': node.power_state})
        try:
            # node_power_action will update the node record
            # so don't do that again here.
            utils.node_power_action(task, node.power_state)
        # NOTE(review): ``e`` is bound but not included in the log message
        # below, and the exception is not re-raised.
        except Exception as e:
            LOG.error(
                "Failed to change power state of node %(node)s "
                "to '%(state)s', attempt %(attempt)s of %(retries)s.",
                {'node': node.uuid,
                 'state': node.power_state,
                 'attempt': count,
                 'retries': max_retries})
    else:
        # Non-forced mode: accept the observed state as the recorded state.
        LOG.warning("During sync_power_state, node %(node)s state "
                    "does not match expected state '%(state)s'. "
                    "Updating recorded state to '%(actual)s'.",
                    {'node': node.uuid, 'actual': power_state,
                     'state': node.power_state})
        node.power_state = power_state
        node.save()
        notify_utils.emit_power_state_corrected_notification(
            task, old_power_state)

    return count
代码中省略1: 超过最大失败次数,节点变为maintenance mode并报告错误, 如果节点没有先前状态和driver info 不合法,异常InvalidParameterValue
主要函数为:power_state = task.driver.power.get_power_state(task)
def get_power_state(self, task):
    """Report the current power state of the node attached to ``task``.

    :param task: a TaskManager instance containing the node to act on.
    :returns: one of ironic.common.states POWER_OFF, POWER_ON or ERROR.
    :raises: InvalidParameterValue if required ipmi parameters are missing.
    :raises: MissingParameterValue if a required parameter is missing.
    :raises: IPMIFailure on an error from ipmitool (from _power_status
        call).
    """
    # Parse the node's driver_info first, then query the power status
    # with the parsed parameters.
    ipmi_params = _parse_driver_info(task.node)
    current_state = _power_status(ipmi_params)
    return current_state
主要根据配置文件ironic.conf中driver为ipmitool, 需要的driver_info主要为ipmi_address, 得到一堆driver_info参数
def _power_status(driver_info):
    """Query a node's power status by running ``power status`` via ipmitool.

    :param driver_info: the ipmitool access parameters for a node.
    :returns: states.POWER_ON or states.POWER_OFF when the first element of
        the ipmitool result is exactly the matching "Chassis Power is ..."
        line; states.ERROR for any other output.
    :raises: IPMIFailure on an error from ipmitool.
    """
    cmd = "power status"
    try:
        result = _exec_ipmitool(driver_info, cmd)
    except (exception.PasswordFileFailedToCreate,
            processutils.ProcessExecutionError) as exc:
        LOG.warning("IPMI power status failed for node %(node_id)s with "
                    "error: %(error)s.",
                    {'node_id': driver_info['uuid'], 'error': exc})
        raise exception.IPMIFailure(cmd=cmd)

    # Only the first element of the result is inspected; the comparison is
    # an exact match, including the trailing newline.
    stdout = result[0]
    if stdout == "Chassis Power is on\n":
        return states.POWER_ON
    if stdout == "Chassis Power is off\n":
        return states.POWER_OFF
    return states.ERROR
三. change_node_power_state
获得锁执行,设置target_power_state(POWER_ON / POWER_OFF)
最关键的函数创建一个协程进行处理
task.spawn_after(self._spawn_worker, utils.node_power_action, task, new_state, timeout=power_timeout)
关键执行函数node_power_action: 过滤掉一些当前与目标状态一样的,主要代码如下:
# take power action
try:
    if new_state != states.REBOOT:
        # Pass ``timeout`` only when the driver's set_power_state()
        # signature declares that parameter.
        if ('timeout' in reflection.get_signature(
                task.driver.power.set_power_state).parameters):
            task.driver.power.set_power_state(task, new_state,
                                              timeout=timeout)
        else:
            # FIXME(naohirot):
            # After driver composition, we should print power interface
            # name here instead of driver.
            LOG.warning(
                "The set_power_state method of %(driver_name)s "
                "doesn't support 'timeout' parameter.",
                {'driver_name': node.driver})
            task.driver.power.set_power_state(task, new_state)
    else:
        # Reboot goes through the driver's reboot() method, with the same
        # signature check for ``timeout``.
        if ('timeout' in reflection.get_signature(
                task.driver.power.reboot).parameters):
            task.driver.power.reboot(task, timeout=timeout)
        else:
            LOG.warning("The reboot method of %(driver_name)s "
                        "doesn't support 'timeout' parameter.",
                        {'driver_name': node.driver})
            task.driver.power.reboot(task)
# NOTE: the handler(s) belonging to this ``try`` are not part of the excerpt.
最终也是调用ipmitool power on / off reboot等
四. do_node_deploy
def run(self):
    """Run the Ironic Python Agent.

    Unless the agent is standalone, this optionally runs inspection,
    looks the node up through the API client and applies the configuration
    returned by the lookup. It then serves the agent API over WSGI until
    the server stops, with the heartbeater running alongside whenever an
    API URL is configured.
    """
    # ... (part of the method is omitted in these notes) ...
    if not self.standalone:
        # Inspection should be started before call to lookup, otherwise
        # lookup will fail due to unknown MAC.
        uuid = None
        if cfg.CONF.inspection_callback_url:
            uuid = inspector.inspect()

        if self.api_url:
            self._wait_for_interface()
            content = self.api_client.lookup_node(
                hardware_info=hardware.dispatch_to_managers(
                    'list_hardware_info'),
                timeout=self.lookup_timeout,
                starting_interval=self.lookup_interval,
                node_uuid=uuid)

            LOG.debug('Received lookup results: %s', content)
            self.node = content['node']
            LOG.info('Lookup succeeded, node UUID is %s',
                     self.node['uuid'])
            hardware.cache_node(self.node)
            self.heartbeat_timeout = content['config']['heartbeat_timeout']

            # Update config with values from Ironic.
            # Each option group is filled from its own sub-dict; iterating
            # the whole ``config`` dict would set unrelated top-level keys
            # (e.g. 'heartbeat_timeout') as options on the group.
            config = content.get('config', {})
            metrics_opts = config.get('metrics')
            if metrics_opts:
                for opt, val in metrics_opts.items():
                    setattr(cfg.CONF.metrics, opt, val)
            statsd_opts = config.get('metrics_statsd')
            if statsd_opts:
                for opt, val in statsd_opts.items():
                    setattr(cfg.CONF.metrics_statsd, opt, val)
        elif cfg.CONF.inspection_callback_url:
            LOG.info('No ipa-api-url configured, Heartbeat and lookup '
                     'skipped for inspector.')
        else:
            # The adjacent literals are concatenated, so the first one
            # must end with a space.
            LOG.error('Neither ipa-api-url nor inspection_callback_url '
                      'found, please check your pxe append parameters.')

    if netutils.is_ipv6_enabled():
        # Listens to both IP versions, assuming IPV6_V6ONLY isn't enabled,
        # (the default behaviour in linux)
        simple_server.WSGIServer.address_family = socket.AF_INET6
    wsgi = simple_server.make_server(
        self.listen_address.hostname,
        self.listen_address.port,
        self.api,
        server_class=simple_server.WSGIServer)

    if not self.standalone and self.api_url:
        # Don't start heartbeating until the server is listening
        self.heartbeater.start()

    try:
        wsgi.serve_forever()
    except BaseException:
        # BaseException is caught so that any exit from the serve loop,
        # including an interrupt, is logged before the shutdown steps below.
        LOG.exception('shutting down')

    if not self.standalone and self.api_url:
        self.heartbeater.stop()
5.0 heartbeat
def heartbeat(self, task, callback_url):
    """Process a heartbeat.

    Records the agent callback URL on the node, then acts on the node's
    provision state: continues or finishes a deploy for DEPLOYWAIT, and
    starts or continues cleaning for CLEANWAIT. Nodes in maintenance are
    left untouched after the URL is saved.

    :param task: task to work with.
    :param callback_url: agent HTTP API URL.
    """
    # TODO(dtantsur): upgrade lock only if we actually take action other
    # than updating the last timestamp.
    task.upgrade_lock()

    node = task.node
    LOG.debug('Heartbeat from node %s', node.uuid)

    driver_internal_info = node.driver_internal_info
    driver_internal_info['agent_url'] = callback_url
    # TODO(rloo): 'agent_last_heartbeat' was deprecated since it wasn't
    # being used so remove that entry if it exists.
    # Hopefully all nodes will have been updated by Pike, so
    # we can delete this code then.
    driver_internal_info.pop('agent_last_heartbeat', None)
    node.driver_internal_info = driver_internal_info
    node.save()

    # Async call backs don't set error state on their own
    # TODO(jimrollenhagen) improve error messages here
    # ``msg`` is re-assigned before each action below; presumably the
    # handler (not shown in this excerpt) reports it on failure.
    msg = _('Failed checking if deploy is done.')
    try:
        if node.maintenance:
            # this shouldn't happen often, but skip the rest if it does.
            LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                      'not taking any action.', {'node': node.uuid})
            return
        elif (node.provision_state == states.DEPLOYWAIT and
              not self.deploy_has_started(task)):
            msg = _('Node failed to deploy.')
            self.continue_deploy(task)
        elif (node.provision_state == states.DEPLOYWAIT and
              self.deploy_is_done(task)):
            msg = _('Node failed to move to active state.')
            self.reboot_to_instance(task)
        elif (node.provision_state == states.DEPLOYWAIT and
              self.deploy_has_started(task)):
            # Deploy started but not done yet: only touch provisioning.
            node.touch_provisioning()
        elif node.provision_state == states.CLEANWAIT:
            node.touch_provisioning()
            if not node.clean_step:
                LOG.debug('Node %s just booted to start cleaning.',
                          node.uuid)
                msg = _('Node failed to start the first cleaning step.')
                # First, cache the clean steps
                self.refresh_clean_steps(task)
                # Then set/verify node clean steps and start cleaning
                manager_utils.set_node_cleaning_steps(task)
                # The exceptions from RPC are not possible as we using cast
                # here
                _notify_conductor_resume_clean(task)
            else:
                msg = _('Node failed to check cleaning progress.')
                self.continue_cleaning(task)
    # NOTE: the handler(s) belonging to this ``try`` are not part of the
    # excerpt.
def continue_deploy(self, task):
    """Resume an iSCSI-based deployment.

    Invoked during a heartbeat from an agent when the node is in
    wait-call-back state. The image is deployed on the node, the node is
    configured to boot according to the desired boot option (netboot or
    localboot), and the deployment is finished with a reboot.

    :param task: a TaskManager object containing the node.
    :raises: InstanceDeployFailure, if it encounters some error during
        the deploy.
    """
    task.process_event('resume')
    LOG.debug('Continuing the deployment on node %s', task.node.uuid)

    # The deploy step returns a dict of partition UUIDs; both lookups use
    # .get(), so a missing key is passed on as None.
    deploy_uuids = do_agent_iscsi_deploy(task, self._client)
    self.prepare_instance_to_boot(
        task,
        deploy_uuids.get('root uuid'),
        deploy_uuids.get('efi system partition uuid'))
    self.reboot_and_finish_deploy(task)
Ironic的服务
- ironic-api: 接收REST请求,送给ironic-conductor
- ironic-conductor: 接收来自ironic-API的请求,进行创建、更新、删除nodes, 通过IPMI、ssh开关电源,或者部署机器
- ironic-python-agent: 当一台bare metal服务启动时,如果从PXE启动,机器可以从远端拉取一个最小版的Linux内核,也可以拉取一个ramdisk,
如果是ramdisk的话,可以在上面安装各种服务(也可以配置RAID),其中可以包括ironic-python-agent。在ramdisk中的ironic-python-agent提供和ironic-conductor一样的服务。
- ironicclient: Ironic CLI