【ironic 源码分析】manager 源码分析


一. 文件 conductor/manager.py

      执行命令 ironic node-create -d pxe_ipmitool -u ${ironic_node_uuid}

      类ConductorManager中create_node传入的参数可以得出:只有uuid, driver, provision_state有值

boot_interface=<?>,chassis_id=<?>,clean_step=<?>,conductor_affinity=<?>,
console_enabled=<?>,console_interface=<?>,created_at=<?>,deploy_interface=<?>,driver='pxe_ipmitool',
driver_info=<?>,driver_internal_info=<?>,extra=<?>,id=<?>,inspect_interface=<?>,inspection_finished_at=<?>,
inspection_started_at=<?>,instance_info=<?>,instance_uuid=<?>,last_error=<?>,maintenance=<?>,
maintenance_reason=<?>,management_interface=<?>,name=<?>,network_interface=<?>,power_interface=<?>,
power_state=<?>,properties=<?>,provision_state='available',provision_updated_at=<?>,raid_config=<?>,
raid_interface=<?>,reservation=<?>,resource_class=<?>,storage_interface=<?>,target_power_state=<?>,
target_provision_state=<?>,target_raid_config=<?>,updated_at=<?>,
uuid=00000000-0000-0000-0000-000000000102,vendor_interface=<?>

        @ check_and_update_node_interfaces: 

        @ 保存至数据库

        node-update 差不多操作


二. _sync_power_states

     定期任务同步电源状态

     条件满足的:

        @ 节点map到conductor

        @ 节点 maintenance: false

        @ 节点 provision state非DEPLOYWAIT/CLEANWAIT

        @ 

     过滤掉一些出错的问题:

     主要函数:

        count = do_sync_power_state(
        task, self.power_state_sync_count[node_uuid])
        if count:
              self.power_state_sync_count[node_uuid] = count
        else:
              # don't bloat the dict with non-failing nodes
              del self.power_state_sync_count[node_uuid]

2.1 函数 do_sync_power_state, 

@METRICS.timer('do_sync_power_state')
def do_sync_power_state(task, count):
    """Reconcile a node's recorded power state with the observed one.

    NOTE: the first part of this function is elided in this excerpt (the
    placeholder line below). The names ``power_state``, ``max_retries`` and
    ``old_power_state`` used further down are not assigned in the visible
    code, so they are presumably bound in the elided part.

    :param task: task whose ``node`` is being synced; its lock is upgraded
        before the node is modified.
    :param count: attempt counter; compared against ``max_retries`` and
        reported as the attempt number when a forced power action fails.
    :returns: 0 when the visible code finds nothing left to retry (states
        already match, node excluded from sync or in maintenance, or no
        previous state was recorded); otherwise ``count``.
    """
    省略
    
    # We will modify a node, so upgrade our lock and use reloaded node.
    # This call may raise NodeLocked that will be caught on upper level.
    task.upgrade_lock()
    node = task.node

    # Repeat all checks with exclusive lock to avoid races
    if node.power_state and node.power_state == power_state:
        # Node power state was updated to the correct value
        return 0
    elif node.provision_state in SYNC_EXCLUDED_STATES or node.maintenance:
        # Something was done to a node while a shared lock was held
        return 0
    elif node.power_state is None:
        # If node has no prior state AND we successfully got a state,
        # simply record that and send a notification.
        LOG.info("During sync_power_state, node %(node)s has no "
                 "previous known state. Recording current state '%(state)s'.",
                 {'node': node.uuid, 'state': power_state})
        node.power_state = power_state
        node.save()
        notify_utils.emit_power_state_corrected_notification(
            task, None)
        return 0

    # From here on the recorded and observed states differ.
    if count > max_retries:
        # Too many attempts: delegate to the dedicated handler and stop.
        handle_sync_power_state_max_retries_exceeded(task, power_state)
        return count

    if CONF.conductor.force_power_state_during_sync:
        # Configured to trust the database: push the recorded state to the
        # hardware.
        LOG.warning("During sync_power_state, node %(node)s state "
                    "'%(actual)s' does not match expected state. "
                    "Changing hardware state to '%(state)s'.",
                    {'node': node.uuid, 'actual': power_state,
                     'state': node.power_state})
        try:
            # node_power_action will update the node record
            # so don't do that again here.
            utils.node_power_action(task, node.power_state)
        except Exception as e:
            # NOTE(review): the failure is only logged, and the caught
            # exception ``e`` is not included in the message.
            LOG.error(
                "Failed to change power state of node %(node)s "
                "to '%(state)s', attempt %(attempt)s of %(retries)s.",
                {'node': node.uuid,
                 'state': node.power_state,
                 'attempt': count,
                 'retries': max_retries})
    else:
        # Configured to trust the hardware: overwrite the recorded state
        # with the observed one and emit a correction notification.
        LOG.warning("During sync_power_state, node %(node)s state "
                    "does not match expected state '%(state)s'. "
                    "Updating recorded state to '%(actual)s'.",
                    {'node': node.uuid, 'actual': power_state,
                     'state': node.power_state})
        node.power_state = power_state
        node.save()
        notify_utils.emit_power_state_corrected_notification(
            task, old_power_state)

    # The visible code never reassigns count; it is handed back as-is.
    return count

       代码中省略的部分: 超过最大失败次数时,节点变为 maintenance 模式并报告错误; 如果节点没有先前状态且 driver info 不合法,则抛出异常 InvalidParameterValue

      主要函数为:power_state = task.driver.power.get_power_state(task)

     

    def get_power_state(self, task):
        """Get the current power state of the task's node.

        :param task: a TaskManager instance containing the node to act on.
        :returns: one of ironic.common.states POWER_OFF, POWER_ON or ERROR.
        :raises: InvalidParameterValue if required ipmi parameters are missing.
        :raises: MissingParameterValue if a required parameter is missing.
        :raises: IPMIFailure on an error from ipmitool (from _power_status
            call).

        """
        # Build the ipmitool access parameters from the node; presumably
        # this is where the parameter-related exceptions listed above
        # originate -- confirm against _parse_driver_info.
        driver_info = _parse_driver_info(task.node)
        # The actual query is delegated to _power_status.
        return _power_status(driver_info)

 

     主要根据配置文件ironic.conf中driver为ipmitool, 需要的driver_info主要为ipmi_address, 得到一堆driver_info参数

def _power_status(driver_info):
    """Get the power status for a node.

    Runs the ipmitool "power status" command and maps the first element of
    its result to a power state constant.

    :param driver_info: the ipmitool access parameters for a node.
    :returns: one of ironic.common.states POWER_OFF, POWER_ON or ERROR.
    :raises: IPMIFailure on an error from ipmitool.

    """
    cmd = "power status"
    try:
        out_err = _exec_ipmitool(driver_info, cmd)
    except (exception.PasswordFileFailedToCreate,
            processutils.ProcessExecutionError) as e:
        # Log the underlying error, then surface a single IPMIFailure that
        # carries only the command that failed.
        LOG.warning("IPMI power status failed for node %(node_id)s with "
                    "error: %(error)s.",
                    {'node_id': driver_info['uuid'], 'error': e})
        raise exception.IPMIFailure(cmd=cmd)

    # The comparison is an exact string match, including the trailing
    # newline; any other output is reported as ERROR.
    if out_err[0] == "Chassis Power is on\n":
        return states.POWER_ON
    elif out_err[0] == "Chassis Power is off\n":
        return states.POWER_OFF
    else:
        return states.ERROR


主要执行ipmitool命令得到power status, 返回值主要有三种状态: POWER_ON、POWER_OFF、ERROR


三. change_node_power_state

     获得锁执行,设置target_power_state(POWER_ON / POWER_OFF)

     最关键的函数创建一个协程进行处理

task.spawn_after(self._spawn_worker, utils.node_power_action,
                 task, new_state, timeout=power_timeout)

关键执行函数node_power_action: 过滤掉一些当前与目标状态一样的,主要代码如下:

# take power action
    try:
        if new_state != states.REBOOT:
            if ('timeout' in reflection.get_signature(
                    task.driver.power.set_power_state).parameters):
                task.driver.power.set_power_state(task, new_state,
                                                  timeout=timeout)
            else:
                # FIXME(naohirot):
                # After driver composition, we should print power interface
                # name here instead of driver.
                LOG.warning(
                    "The set_power_state method of %(driver_name)s "
                    "doesn't support 'timeout' parameter.",
                    {'driver_name': node.driver})
                task.driver.power.set_power_state(task, new_state)
        else:
            if ('timeout' in reflection.get_signature(
                    task.driver.power.reboot).parameters):
                task.driver.power.reboot(task, timeout=timeout)
            else:
                LOG.warning("The reboot method of %(driver_name)s "
                            "doesn't support 'timeout' parameter.",
                            {'driver_name': node.driver})
                task.driver.power.reboot(task)

最终也是调用ipmitool power on / off reboot等




四. do_node_deploy

def do_node_deploy (task , conductor_id , configdrive= None ):
1.1 #调用驱动的部署模块的prepare方法,不同驱动的动作不一样 pxe_* 驱动使用的是iscsi_deploy.ISCSIDeploy.prepare,
1.2 task.driver.deploy.deploy
开始部署节点,获取实例镜像从glance,写入到本地,重启物理机,
返回状态DEPLOYWAIT
基本执行完等待物理机上电,执行启动脚本


1.1 def prepare ( self , task):
位置: iscsi_deploy.py 为PXE部署和用户镜像产生TFTP配置,从Glance获取TFTP镜像,加到本地缓存
1.1.1 node_power_action
1.1.2 unconfigure_tenant_networks unbind port 从neutron
1.1.3 add_provisioning_network 为节点添加网络
1.1.4 build_agent_options
返回 agent_config_opts = {
'ipa-api-url' : get_ironic_api_url() ,
# NOTE: The below entry is a temporary workaround for bug/1433812
'coreos.configdrive' : 0 ,
}
1.1.5 prepare_ramdisk
1.1.1 def node_power_action (task , new_state , timeout= None ):
改变电源状态或者重置节点
@ 状态POWER_ON,REBOOT,SOFT_REBOOT -> POWER_ON
@ 状态POWER_OFF,SOFT_POWER_OFF -> POWER_OFF
1.1.5 def prepare_ramdisk ( self , task , ramdisk_params):
从driver_info和instance_info读取相关信息,准备kernel/ramdisk
@ 得到dhcp_opts更新DHCP, 其中dhcp_opts大致如下:
dhcp_opts: [{'opt_value': 'pxelinux.0', 'ip_version': 4, 'opt_name': 'bootfile-name'}, {'opt_value': '/tftpboot/', 'ip_version': 4, 'opt_name': '210'}, {'opt_value': '10.248.128.234', 'ip_version': 4, 'opt_name': 'server-ip-address'}, {'opt_value': '10.248.128.234', 'ip_version': 4, 'opt_name': 'tftp-server'}]
@ _get_deploy_image_info 得到deploy_kernel deploy_ramdisk 绝对路径
{'deploy_ramdisk': ('edfa4240-9cd9-41e0-baa4-82cb581c617e', '/tftpboot/00000000-0000-0000-0000-000000000102/deploy_ramdisk'), 'deploy_kernel': ('e9905ded-c7e7-4d5f-94ee-97fa2d7d15b1', '/tftpboot/00000000-0000-0000-0000-000000000102/deploy_kernel')}
@ _build_pxe_config_options PXE配置
pxe_opts: {'ipa-api-url': 'http://10.248.128.234:6385', 'ari_path': 'no_ramdisk', 'deployment_ari_path': '00000000-0000-0000-0000-000000000102/deploy_ramdisk', 'pxe_append_params': 'nofb nomodeset vga=normal console=tty0 console=ttyS0,115200n8 ipa-debug=1', 'aki_path': 'no_kernel', 'deployment_aki_path': '00000000-0000-0000-0000-000000000102/deploy_kernel', 'tftp_server': '10.248.128.234', 'ipxe_timeout': 0, 'coreos.configdrive': 0}
@ try_set_boot_device 设置boot
@ _cache_ramdisk_kernel
cache image


五. continue_deploy
物理机上电后,执行ramdisk,然后启动IPA, 大致如下:
@ 向ironic发送lookup请求,获取节点UUID
@ 发送心跳包
@ 部署后停止心跳包
没有细读ironic-python-agent代码:
def run(self):
        """Run the Ironic Python Agent."""
        。。。。。。。。。。。。。
        if not self.standalone:
            # Inspection should be started before call to lookup, otherwise
            # lookup will fail due to unknown MAC.
            uuid = None
            if cfg.CONF.inspection_callback_url:
                uuid = inspector.inspect()

            if self.api_url:
                self._wait_for_interface()
                content = self.api_client.lookup_node(
                    hardware_info=hardware.dispatch_to_managers(
                        'list_hardware_info'),
                    timeout=self.lookup_timeout,
                    starting_interval=self.lookup_interval,
                    node_uuid=uuid)

                LOG.debug('Received lookup results: %s', content)
                self.node = content['node']
                LOG.info('Lookup succeeded, node UUID is %s',
                         self.node['uuid'])
                hardware.cache_node(self.node)
                self.heartbeat_timeout = content['config']['heartbeat_timeout']

                # Update config with values from Ironic
                config = content.get('config', {})
                if config.get('metrics'):
                    for opt, val in config.items():
                        setattr(cfg.CONF.metrics, opt, val)
                if config.get('metrics_statsd'):
                    for opt, val in config.items():
                        setattr(cfg.CONF.metrics_statsd, opt, val)
            elif cfg.CONF.inspection_callback_url:
                LOG.info('No ipa-api-url configured, Heartbeat and lookup '
                         'skipped for inspector.')
            else:
                LOG.error('Neither ipa-api-url nor inspection_callback_url'
                          'found, please check your pxe append parameters.')

        if netutils.is_ipv6_enabled():
            # Listens to both IP versions, assuming IPV6_V6ONLY isn't enabled,
            # (the default behaviour in linux)
            simple_server.WSGIServer.address_family = socket.AF_INET6
        wsgi = simple_server.make_server(
            self.listen_address.hostname,
            self.listen_address.port,
            self.api,
            server_class=simple_server.WSGIServer)

        if not self.standalone and self.api_url:
            # Don't start heartbeating until the server is listening
            self.heartbeater.start()

        try:
            wsgi.serve_forever()
        except BaseException:
            LOG.exception('shutting down')

        if not self.standalone and self.api_url:
            self.heartbeater.stop()

5.0 heartbeat
位置:drivers/modules/agent_base_vendor.py
如果node的provision_state 为 DEPLOYWAIT 并且部署尚未开始(deploy_has_started 返回 False), 则调用5.1 continue_deploy
DEPLOYWAIT 并且镜像已经下载完,部署完成则5.2 reboot_to_instance
driver_internal_info: {'agent_url': u'http://$ip:9999', u'is_whole_disk_image': True}
默认端口为9999
def heartbeat(self, task, callback_url):
    """Process a heartbeat.

    :param task: task to work with.
    :param callback_url: agent HTTP API URL.
    """
    # TODO(dtantsur): upgrade lock only if we actually take action other
    # than updating the last timestamp.
    task.upgrade_lock()

    node = task.node
    LOG.debug('Heartbeat from node %s', node.uuid)

    driver_internal_info = node.driver_internal_info
    driver_internal_info['agent_url'] = callback_url

    # TODO(rloo): 'agent_last_heartbeat' was deprecated since it wasn't
    # being used so remove that entry if it exists.
    # Hopefully all nodes will have been updated by Pike, so
    # we can delete this code then.
    driver_internal_info.pop('agent_last_heartbeat', None)

    node.driver_internal_info = driver_internal_info
    node.save()

    # Async call backs don't set error state on their own
    # TODO(jimrollenhagen) improve error messages here
    msg = _('Failed checking if deploy is done.')
    try:
        if node.maintenance:
            # this shouldn't happen often, but skip the rest if it does.
            LOG.debug('Heartbeat from node %(node)s in maintenance mode; '
                      'not taking any action.', {'node': node.uuid})
            return
        elif (node.provision_state == states.DEPLOYWAIT and
              not self.deploy_has_started(task)):
            msg = _('Node failed to deploy.')
            self.continue_deploy(task)
        elif (node.provision_state == states.DEPLOYWAIT and
              self.deploy_is_done(task)):
            msg = _('Node failed to move to active state.')
            self.reboot_to_instance(task)
        elif (node.provision_state == states.DEPLOYWAIT and
              self.deploy_has_started(task)):
            node.touch_provisioning()
        elif node.provision_state == states.CLEANWAIT:
            node.touch_provisioning()
            if not node.clean_step:
                LOG.debug('Node %s just booted to start cleaning.',
                          node.uuid)
                msg = _('Node failed to start the first cleaning step.')
                # First, cache the clean steps
                self.refresh_clean_steps(task)
                # Then set/verify node clean steps and start cleaning
                manager_utils.set_node_cleaning_steps(task)
                # The exceptions from RPC are not possible as we using cast
                # here
                _notify_conductor_resume_clean(task)
            else:
                msg = _('Node failed to check cleaning progress.')
                self.continue_cleaning(task)

5.1 continue_deploy函数
def continue_deploy(self, task):
    """Method invoked when deployed using iSCSI.

    This method is invoked during a heartbeat from an agent when
    the node is in wait-call-back state. This deploys the image on
    the node and then configures the node to boot according to the
    desired boot option (netboot or localboot).

    :param task: a TaskManager object containing the node.
    :raises: InstanceDeployFailure, if it encounters some error during
        the deploy.
    """
    # Fire the 'resume' event on the task before doing any work.
    task.process_event('resume')
    node = task.node
    LOG.debug('Continuing the deployment on node %s', node.uuid)

    # The iSCSI deploy helper returns a dict; the two UUIDs read from it
    # may be None because they are fetched with .get().
    uuid_dict_returned = do_agent_iscsi_deploy(task, self._client)
    root_uuid = uuid_dict_returned.get('root uuid')
    efi_sys_uuid = uuid_dict_returned.get('efi system partition uuid')
    # Set up booting of the deployed instance, then reboot and finish.
    self.prepare_instance_to_boot(task, root_uuid, efi_sys_uuid)
    self.reboot_and_finish_deploy(task)

最终调用ipmitool命令如下格式
ipmitool -I lanplus -H $ip -L ADMINISTRATOR -p 623  -U $user -R 120 -N 5 -f /tmp/tmp83ANKp power status
ipmitool -I lanplus -H $ip -L ADMINISTRATOR -p 623  -U $user -R 120 -N 5 -f /tmp/tmp83ANKp power on


Ironic的服务

  1. ironic-api: 接收REST请求,送给ironic-conductor
  2. ironic-conductor: 接收来自ironic-API的请求,进行创建、更新、删除nodes, 通过IPMI、ssh开关电源,或者部署机器
  3. ironic-python-agent: 当一台bare metal服务启动时,如果从PXE启动,机器可以从远端拉取一个最小版的Linux内核,也可以拉取一个ramdisk,
    如果是ramdisk的话,可以在上面安装各种服务(也可以配置RAID),其中可以包括ironic-python-agent。在ramdisk中的ironic-python-agent提供和ironic-conductor一样的服务。
  4. ironicclient: Ironic CLI















  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值