[Nova] nova-compute Code Study 1: Preparation Work at Startup

This post looks at the preparation work nova-compute performs at every startup, before it begins serving requests.

nova-compute supports several virtualization drivers, including LibvirtDriver, XenAPIDriver, FakeDriver, BareMetalDriver, VMwareESXDriver, VMwareVCDriver and HyperVDriver. The driver names indicate the corresponding virtualization technologies. On Linux servers the usual choices are QEMU and KVM (strictly speaking, KVM only adds CPU hardware acceleration on top of QEMU), so the driver to use is LibvirtDriver, and that is the one discussed below. Libvirt is a virtualization API library that abstracts different virtualization backends behind a unified programming interface; it currently supports KVM/QEMU/Xen/Virtuozzo/VMware ESX/LXC/bhyve and others, and nova-compute uses its Python bindings.
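
As a quick taste of those Python bindings, here is a minimal, hedged example (not Nova code; it assumes a local QEMU/KVM host reachable at qemu:///system):

import libvirt

# Open a read-only connection to the local QEMU/KVM hypervisor
conn = libvirt.openReadOnly('qemu:///system')
print(conn.getLibVersion())               # library version, e.g. 1002002 for 1.2.2
for dom_id in conn.listDomainsID():       # running domains
    print(conn.lookupByID(dom_id).UUIDString())
for name in conn.listDefinedDomains():    # defined but not running domains
    print(conn.lookupByName(name).UUIDString())
conn.close()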

Analyzing the startup flow, it can roughly be divided into the following steps:

1. Initialize the libvirt driver: set up the libvirt connection, create the event queue and the event dispatch loop, and run libvirt's own event loop in a native thread. Whenever an event occurs on an instance, libvirt's event loop calls back into the method we registered, which puts the event into the event queue and wakes up the dispatch loop to process it (see the minimal sketch after this list). For a management platform, keeping track of instance state is essential: shut a guest down from inside and you can watch OpenStack, driven by libvirt's event loop, quickly update the instance state to shutoff.

2. Clean up instances on this host that were evacuated, deleted, or left behind, and initialize the instances that remain, for example restoring their previous state after the server reboots. While initializing an instance, nova-compute also makes sure the virtual network's bridge, VLAN device and so on exist, which confirms that a multi-node setup without multi_host is workable: nova-compute shares responsibility for creating the bridge.

3. Depending on the "defer iptables apply" option, applying iptables rules can be postponed until the very last step, in which case the iptables operations in the steps above do not take effect immediately. The default is False, i.e. do not defer.
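
Before diving into the code, here is a minimal, self-contained sketch (not Nova code; all names are made up) of the producer/consumer pattern described in step 1: a native thread stands in for libvirt's event loop, pushing events into a native queue and writing a byte to a pipe, while an eventlet greenthread wakes up on the pipe and drains the queue.

import os
import threading

import eventlet
from eventlet import greenio

try:
    import Queue as native_queue       # Python 2, as in this era of Nova
except ImportError:
    import queue as native_queue       # Python 3

event_queue = native_queue.Queue()
rpipe, wpipe = os.pipe()
notify_send = greenio.GreenPipe(wpipe, 'wb', 0)
notify_recv = greenio.GreenPipe(rpipe, 'rb', 0)

def producer():
    # Stands in for the callback invoked by libvirt's event loop thread
    for i in range(3):
        event_queue.put('event-%d' % i)
        notify_send.write(' '.encode())  # wake the dispatcher greenthread
        notify_send.flush()

def dispatcher():
    handled = 0
    while handled < 3:
        notify_recv.read(1)              # blocks only this greenthread
        while not event_queue.empty():
            print(event_queue.get(block=False))
            handled += 1

native_thread = threading.Thread(target=producer)
native_thread.setDaemon(True)
native_thread.start()
eventlet.spawn(dispatcher).wait()

With that pattern in mind, the driver and manager code that nova-compute actually runs follows below.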

# nova-compute uses eventlet's greenpool for network concurrency, but
# eventlet's monkey_patch cannot patch calls into some C libraries, so calling
# them from a greenthread would block the whole process. eventlet's tpool
# module solves this by running such calls in a thread pool.
from eventlet import tpool

DISABLE_PREFIX = 'AUTO: '
DISABLE_REASON_UNDEFINED = 'None'

# tpool.Proxy's built-in __str__ and __repr__ methods are problematic, so they are patched here
def patch_tpool_proxy():
    def str_method(self):
        return str(self._obj)

    def repr_method(self):
        return repr(self._obj)

    tpool.Proxy.__str__ = str_method
    tpool.Proxy.__repr__ = repr_method

patch_tpool_proxy()
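
# For example, after this patch str(tpool.Proxy(conn)) delegates to str(conn),
# the wrapped object, instead of describing the Proxy wrapper itself, which
# keeps log messages about the connection readable.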

def libvirt_error_handler(context, err):
    pass
    
class ComputeDriver(object):

    # Register the compute service's event callback
    def register_event_listener(self, callback):
        self._compute_event_callback = callback
    
    # Emit (dispatch) an event
    def emit_event(self, event):
        if not self._compute_event_callback:
            LOG.debug(_("Discarding event %s") % str(event))
            return

        if not isinstance(event, virtevent.Event):
            raise ValueError(
                _("Event must be an instance of nova.virt.event.Event"))

        try:
            LOG.debug(_("Emitting event %s") % str(event))
            # Handle the event with the registered callback
            self._compute_event_callback(event)
        except Exception as ex:
            LOG.error(_("Exception dispatching event %(event)s: %(ex)s"),
                      {'event': event, 'ex': ex})
    
class LibvirtDriver(driver.ComputeDriver):

    def __init__(self, virtapi, read_only=False):
        ...
        self._wrapped_conn = None
        self._wrapped_conn_lock = threading.Lock()
        self.read_only = read_only
        self._event_queue = None
        
        # libvirt VIF (virtual NIC) driver, default libvirt.vif.LibvirtGenericVIFDriver
        vif_class = importutils.import_class(CONF.libvirt.vif_driver)
        self.vif_driver = vif_class(self._get_connection)
        
        # Firewall driver; here the libvirt iptables firewall driver is used
        self.firewall_driver = firewall.load_driver(
            DEFAULT_FIREWALL_DRIVER,
            self.virtapi,
            get_connection=self._get_connection)

    # Test whether the libvirt connection is still usable
    @staticmethod
    def _test_connection(conn):
        try:
            conn.getLibVersion()
            return True
        except libvirt.libvirtError as e:
            if (e.get_error_code() in (libvirt.VIR_ERR_SYSTEM_ERROR,
                                       libvirt.VIR_ERR_INTERNAL_ERROR) and
                e.get_error_domain() in (libvirt.VIR_FROM_REMOTE,
                                         libvirt.VIR_FROM_RPC)):
                LOG.debug(_('Connection to libvirt broke'))
                return False
            raise
    
    # Put the event into the event queue and notify the dispatch loop through the pipe
    def _queue_event(self, event):
        if self._event_queue is None:
            return

        self._event_queue.put(event)

        c = ' '.encode()
        self._event_notify_send.write(c)
        self._event_notify_send.flush()
    
    # Enable/disable the compute service on this host
    def _set_host_enabled(self, enabled,
                          disable_reason=DISABLE_REASON_UNDEFINED):

        status_name = {True: 'disabled',
                       False: 'enabled'}

        disable_service = not enabled

        ctx = nova_context.get_admin_context()
        try:
            service = service_obj.Service.get_by_compute_host(ctx, CONF.host)

            # Only act if the service's current state differs from the target state
            if service.disabled != disable_service:
                # If the service is currently enabled, mark it disabled in the
                # database and record the reason; or, if it is currently disabled
                # and the disable reason starts with DISABLE_PREFIX, mark it
                # enabled and clear the reason.
                # nova-compute never re-enables a service that was disabled manually.
                if not service.disabled or (
                        service.disabled_reason and
                        service.disabled_reason.startswith(DISABLE_PREFIX)):
                    service.disabled = disable_service
                    service.disabled_reason = (
                       DISABLE_PREFIX + disable_reason
                       if disable_service else DISABLE_REASON_UNDEFINED)
                    service.save()
                    LOG.debug(_('Updating compute service status to %s'),
                                 status_name[disable_service])
                else:
                    LOG.debug(_('Not overriding manual compute service '
                                'status with: %s'),
                                 status_name[disable_service])
        except exception.ComputeHostNotFound:
            LOG.warn(_('Cannot update service status on host: %s,'
                        'since it is not registered.') % CONF.host)
        except Exception:
            LOG.warn(_('Cannot update service status on host: %s,'
                        'due to an unexpected exception.') % CONF.host,
                     exc_info=True)
    
    # Called when the libvirt connection is closed
    def _close_callback(self, conn, reason, opaque):
        # Put the connection and the close reason into the event queue
        close_info = {'conn': conn, 'reason': reason}
        self._queue_event(close_info)
    
    # Called whenever a lifecycle event happens on a domain (instance)
    @staticmethod
    def _event_lifecycle_callback(conn, dom, event, detail, opaque):
        self = opaque

        # Get the domain's UUID, which is also the instance's UUID in OpenStack
        uuid = dom.UUIDString()
        # Translate the libvirt domain event into a nova-compute virtevent;
        # only the stopped, started, suspended and resumed events are of interest
        transition = None
        if event == libvirt.VIR_DOMAIN_EVENT_STOPPED:
            transition = virtevent.EVENT_LIFECYCLE_STOPPED
        elif event == libvirt.VIR_DOMAIN_EVENT_STARTED:
            transition = virtevent.EVENT_LIFECYCLE_STARTED
        elif event == libvirt.VIR_DOMAIN_EVENT_SUSPENDED:
            transition = virtevent.EVENT_LIFECYCLE_PAUSED
        elif event == libvirt.VIR_DOMAIN_EVENT_RESUMED:
            transition = virtevent.EVENT_LIFECYCLE_RESUMED

        if transition is not None:
            # If this is an event we care about, put it into the event queue
            self._queue_event(virtevent.LifecycleEvent(uuid, transition))
    
    # Get the libvirt URI for the configured virtualization type
    @staticmethod
    def uri():
        if CONF.libvirt.virt_type == 'uml':
            uri = CONF.libvirt.connection_uri or 'uml:///system'
        elif CONF.libvirt.virt_type == 'xen':
            uri = CONF.libvirt.connection_uri or 'xen:///'
        elif CONF.libvirt.virt_type == 'lxc':
            uri = CONF.libvirt.connection_uri or 'lxc:///'
        else:
            uri = CONF.libvirt.connection_uri or 'qemu:///system'
        return uri
        
    # Open a libvirt connection and return it
    @staticmethod
    def _connect(uri, read_only):
        def _connect_auth_cb(creds, opaque):
            if len(creds) == 0:
                return 0
            LOG.warning(
                _("Can not handle authentication request for %d credentials")
                % len(creds))
            raise exception.NovaException(
                _("Can not handle authentication request for %d credentials")
                % len(creds))

        auth = [[libvirt.VIR_CRED_AUTHNAME,
                 libvirt.VIR_CRED_ECHOPROMPT,
                 libvirt.VIR_CRED_REALM,
                 libvirt.VIR_CRED_PASSPHRASE,
                 libvirt.VIR_CRED_NOECHOPROMPT,
                 libvirt.VIR_CRED_EXTERNAL],
                _connect_auth_cb,
                None]

        try:
            flags = 0
            # If the connection should be read-only, adjust the flags
            if read_only:
                flags = libvirt.VIR_CONNECT_RO
            # Use tpool here so that connecting to libvirt does not block.
            # The plain call would be
            #     conn = libvirt.openAuth(uri, auth, flags)
            # where conn is an instance of libvirt.virConnect.
            # The idea: hand the connect call off to the thread pool, block only
            # this greenthread, and return control to the hub; without this the
            # whole process would block here.
            # The return value is a Proxy wrapping conn: later calls to conn's
            # methods go through the Proxy, so they are handled the same way and
            # do not block the whole process, unlike calling conn directly.
            return tpool.proxy_call(
                (libvirt.virDomain, libvirt.virConnect),
                libvirt.openAuth, uri, auth, flags)
        except libvirt.libvirtError as ex:
            LOG.exception(_("Connection to libvirt failed: %s"), ex)
            payload = dict(ip=LibvirtDriver.get_host_ip_addr(),
                           method='_connect',
                           reason=ex)
            rpc.get_notifier('compute').error(nova_context.get_admin_context(),
                                              'compute.libvirt.error',
                                              payload)
            raise exception.HypervisorUnavailable(host=CONF.host)
            
    # Get a new libvirt connection and register the callbacks
    def _get_new_connection(self):
        LOG.debug(_('Connecting to libvirt: %s'), self.uri())
        wrapped_conn = None

        try:
            # Connect to libvirt; returns a wrapped (proxied) connection
            wrapped_conn = self._connect(self.uri(), self.read_only)
        finally:
            # If wrapped_conn is empty the connection failed, so disable this
            # host's service; if it is set the connection succeeded, so enable it
            disable_reason = DISABLE_REASON_UNDEFINED
            if not wrapped_conn:
                disable_reason = 'Failed to connect to libvirt'
            self._set_host_enabled(bool(wrapped_conn), disable_reason)

        self._wrapped_conn = wrapped_conn

        try:
            LOG.debug(_("Registering for lifecycle events %s"), self)
            # Shouldn't wrapped_conn be checked for None before this call? Puzzling.
            # Register the callback for the domain's (instance's) lifecycle events
            wrapped_conn.domainEventRegisterAny(
                None,
                libvirt.VIR_DOMAIN_EVENT_ID_LIFECYCLE,
                self._event_lifecycle_callback,
                self)
        except Exception as e:
            LOG.warn(_("URI %(uri)s does not support events: %(error)s"),
                     {'uri': self.uri(), 'error': e})

        try:
            LOG.debug(_("Registering for connection events: %s") %
                      str(self))
            # Register the callback for the libvirt connection close event
            wrapped_conn.registerCloseCallback(self._close_callback, None)
        except (TypeError, AttributeError) as e:
            LOG.debug(_("The version of python-libvirt does not support "
                        "registerCloseCallback or is too old: %s"), e)
        except libvirt.libvirtError as e:
            LOG.warn(_("URI %(uri)s does not support connection"
                       " events: %(error)s"),
                     {'uri': self.uri(), 'error': e})
        # Return the wrapped connection, or None
        return wrapped_conn

    # Return the existing libvirt connection, initializing it only when necessary
    def _get_connection(self):
        # A single libvirt connection instance is shared across threads;
        # synchronize with a lock and create a new one only when the current
        # connection is missing or broken
        with self._wrapped_conn_lock:
            wrapped_conn = self._wrapped_conn
            if not wrapped_conn or not self._test_connection(wrapped_conn):
                wrapped_conn = self._get_new_connection()

        return wrapped_conn

    # Create the _conn property backed by _get_connection.
    # At first I took self._conn for a plain attribute or a method and spent a
    # long time grepping for "self._conn" and "def _conn"... orz
    _conn = property(_get_connection)
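    # For example, self._conn.getCapabilities() below is effectively
    # self._get_connection().getCapabilities(): every access re-checks the
    # cached connection and reconnects transparently if it has gone away.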
   
    # Get this host's virtualization capabilities
    def get_host_capabilities(self):
        if not self._caps:
            # Returns this host's virtualization capabilities as XML, including
            # host capabilities such as CPU architecture and features, and guest
            # capabilities for the given hypervisor
            xmlstr = self._conn.getCapabilities()
            # The result is XML, so parse it
            self._caps = vconfig.LibvirtConfigCaps()
            self._caps.parse_str(xmlstr)
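            # For reference, the XML parsed above looks roughly like the
            # following (abbreviated; exact content varies per host):
            #   <capabilities>
            #     <host>
            #       <cpu><arch>x86_64</arch><model>...</model>
            #            <feature name='vmx'/> ... </cpu>
            #     </host>
            #     <guest> ... </guest>
            #   </capabilities>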
            if hasattr(libvirt, 'VIR_CONNECT_BASELINE_CPU_EXPAND_FEATURES'):
                try:
                    # Compute the richest CPU definition that is compatible with
                    # all of the given CPUs
                    # PS: essentially the intersection of their features?
                    features = self._conn.baselineCPU(
                        [self._caps.host.cpu.to_xml()],      # the CPU element of the host capabilities, as a string
                        libvirt.VIR_CONNECT_BASELINE_CPU_EXPAND_FEATURES)  # flag: expand to list all features
                    if features and features != -1:
                        # If features were obtained, re-parse the cpu element under host
                        self._caps.host.cpu.parse_str(features)
                except libvirt.libvirtError as ex:
                    error_code = ex.get_error_code()
                    if error_code == libvirt.VIR_ERR_NO_SUPPORT:
                        LOG.warn(_LW("URI %(uri)s does not support full set"
                                     " of host capabilities: " "%(error)s"),
                                     {'uri': self.uri(), 'error': ex})
                    else:
                        raise
        return self._caps

    # Emit the appropriate quality warnings
    def _do_quality_warnings(self):
        caps = self.get_host_capabilities()
        arch = caps.host.cpu.arch
        # Warn if the configured virt type is not qemu or kvm, or the host CPU
        # architecture is not i686 or x86_64: that combination is untested
        if (CONF.libvirt.virt_type not in ('qemu', 'kvm') or
                arch not in ('i686', 'x86_64')):
            LOG.warning(_('The libvirt driver is not tested on '
                          '%(type)s/%(arch)s by the OpenStack project and '
                          'thus its quality can not be ensured. For more '
                          'information, see: https://wiki.openstack.org/wiki/'
                          'HypervisorSupportMatrix'),
                        {'type': CONF.libvirt.virt_type, 'arch': arch})

    # Initialize the event pipe
    def _init_events_pipe(self):
        # Create a native (non-green) queue
        self._event_queue = native_Queue.Queue()
        try:
            # Create the event notification pipe
            rpipe, wpipe = os.pipe()
            self._event_notify_send = greenio.GreenPipe(wpipe, 'wb', 0)
            self._event_notify_recv = greenio.GreenPipe(rpipe, 'rb', 0)
        except (ImportError, NotImplementedError):
            # Fallback for Windows: use sockets instead of a pipe, since real
            # pipes do not exist there
            sock = eventlet_util.__original_socket__(socket.AF_INET,
                                                     socket.SOCK_STREAM)
            sock.bind(('localhost', 0))
            sock.listen(50)
            csock = eventlet_util.__original_socket__(socket.AF_INET,
                                                      socket.SOCK_STREAM)
            csock.connect(('localhost', sock.getsockname()[1]))
            nsock, addr = sock.accept()
            self._event_notify_send = nsock.makefile('wb', 0)
            gsock = greenio.GreenSocket(csock)
            self._event_notify_recv = gsock.makefile('rb', 0)
    
    # libvirt event loop
    def _native_thread(self):
        while True:
            libvirt.virEventRunDefaultImpl()
    
    def _dispatch_events(self):
        try:
            # Wait on the pipe for a dispatch notification
            _c = self._event_notify_recv.read(1)
            assert _c
        except ValueError:
            return

        last_close_event = None
        # While the event queue is not empty
        while not self._event_queue.empty():
            try:
                # Get an event from the queue; we are in a greenthread, so do it non-blocking
                event = self._event_queue.get(block=False)
                if isinstance(event, virtevent.LifecycleEvent):
                    # Dispatch the event
                    self.emit_event(event)
                elif 'conn' in event and 'reason' in event:
                    # If this is a libvirt connection-close event, remember it
                    last_close_event = event
            except native_Queue.Empty:
                pass
        # If the queue is drained and no connection-close event occurred, return
        if last_close_event is None:
            return
        # The connection was closed: log it and update the service status
        conn = last_close_event['conn']
        with self._wrapped_conn_lock:
            if conn == self._wrapped_conn:
                reason = last_close_event['reason']
                _error = _("Connection to libvirt lost: %s") % reason
                LOG.warn(_error)
                self._wrapped_conn = None
                self._set_host_enabled(False, disable_reason=_error)
    
    # Event dispatch loop
    def _dispatch_thread(self):
        while True:
            self._dispatch_events()
    
    def _init_events(self):
        # Initialize the event pipe
        self._init_events_pipe()

        LOG.debug(_("Starting native event thread"))
        # Start a native thread to run libvirt's event loop
        event_thread = native_threading.Thread(target=self._native_thread)
        event_thread.setDaemon(True)
        event_thread.start()

        LOG.debug(_("Starting green dispatch thread"))
        # Start a greenthread to run the event dispatch loop
        eventlet.spawn(self._dispatch_thread)
    
    # Get the IDs of the domains currently running on the hypervisor
    def list_instance_ids(self):
        if self._conn.numOfDomains() == 0:
            return []
        return self._conn.listDomainsID()
    
    # Look up a domain by its ID
    def _lookup_by_id(self, instance_id):
        try:
            return self._conn.lookupByID(instance_id)
        except libvirt.libvirtError as ex:
            error_code = ex.get_error_code()
            if error_code == libvirt.VIR_ERR_NO_DOMAIN:
                raise exception.InstanceNotFound(instance_id=instance_id)

            msg = (_("Error from libvirt while looking up %(instance_id)s: "
                     "[Error Code %(error_code)s] %(ex)s")
                   % {'instance_id': instance_id,
                      'error_code': error_code,
                      'ex': ex})
            raise exception.NovaException(msg)
    
    # Look up a domain by its name
    def _lookup_by_name(self, instance_name):
        try:
            return self._conn.lookupByName(instance_name)
        except libvirt.libvirtError as ex:
            error_code = ex.get_error_code()
            if error_code == libvirt.VIR_ERR_NO_DOMAIN:
                raise exception.InstanceNotFound(instance_id=instance_name)

            msg = (_('Error from libvirt while looking up %(instance_name)s: '
                     '[Error Code %(error_code)s] %(ex)s') %
                   {'instance_name': instance_name,
                    'error_code': error_code,
                    'ex': ex})
            raise exception.NovaException(msg)
    
    # Get the UUIDs of all domains on the hypervisor; these are also the UUIDs of the corresponding OpenStack instances
    def list_instance_uuids(self):
        uuids = set()
        # Iterate over all running domains on the hypervisor
        for domain_id in self.list_instance_ids():
            try:
                # domain ID 0 is the hypervisor itself
                if domain_id != 0:
                    # Look up the domain by ID
                    domain = self._lookup_by_id(domain_id)
                    # Record the domain's UUID
                    uuids.add(domain.UUIDString())
            except exception.InstanceNotFound:
                continue
        
        # Iterate over all defined but not running domains on the hypervisor
        for domain_name in self._conn.listDefinedDomains():
            try:
                # Look up the domain by name and record its UUID
                uuids.add(self._lookup_by_name(domain_name).UUIDString())
            except exception.InstanceNotFound:
                continue

        return list(uuids)
    
    @property
    def need_legacy_block_device_info(self):
        return False
    
    # Power off the domain; note: this does not delete it
    def _destroy(self, instance):
        try:
            # Look up the domain by the instance's name.
            # Note: the instances table in OpenStack has no name column, only
            # display_name. display_name is the instance name shown in OpenStack,
            # while name is the instance name on the hypervisor; the hypervisor
            # name format is configurable, and name is generated from the
            # instance id and that format.
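            # For example, with the default instance_name_template of
            # 'instance-%08x', the instance whose database id is 42 is named
            # 'instance-0000002a' on the hypervisor.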
            virt_dom = self._lookup_by_name(instance['name'])
        except exception.InstanceNotFound:
            virt_dom = None

        old_domid = -1
        if virt_dom is not None:
            try:
                old_domid = virt_dom.ID()
                # Call the domain's destroy method to power it off
                virt_dom.destroy()

                # With LXC container virtualization, also tear down the container to avoid leaking resources
                if CONF.libvirt.virt_type == 'lxc':
                    self._teardown_container(instance)

            except libvirt.libvirtError as e:
                is_okay = False
                errcode = e.get_error_code()
                if errcode == libvirt.VIR_ERR_OPERATION_INVALID:
                    # If the domain is already shut off, destroy raises an
                    # error, but that is harmless
                    (state, _max_mem, _mem, _cpus, _t) = virt_dom.info()
                    state = LIBVIRT_POWER_STATE[state]
                    if state == power_state.SHUTDOWN:
                        is_okay = True
                elif errcode == libvirt.VIR_ERR_OPERATION_TIMEOUT:
                    # If the destroy timed out, log it and raise InstancePowerOffFailure
                    LOG.warn(_("Cannot destroy instance, operation time out"),
                            instance=instance)
                    reason = _("operation time out")
                    raise exception.InstancePowerOffFailure(reason=reason)

                if not is_okay:
                    # If the destroy failed with an unexpected error, log it and re-raise
                    with excutils.save_and_reraise_exception():
                        LOG.error(_('Error from libvirt during destroy. '
                                    'Code=%(errcode)s Error=%(e)s'),
                                  {'errcode': errcode, 'e': e},
                                  instance=instance)
    
    # Undefine the domain, i.e. remove its definition from libvirt.
    # libvirt actually keeps domain definitions as XML files in a directory.
    def _undefine_domain(self, instance):
        try:
            # Look up the domain by the instance's name
            virt_dom = self._lookup_by_name(instance['name'])
        except exception.InstanceNotFound:
            virt_dom = None
        if virt_dom:
            try:
                try:
                    # Call the flagged undefine API.
                    # VIR_DOMAIN_UNDEFINE_MANAGED_SAVE: also remove any managed save file
                    virt_dom.undefineFlags(
                        libvirt.VIR_DOMAIN_UNDEFINE_MANAGED_SAVE)
                except libvirt.libvirtError:
                    LOG.debug(_("Error from libvirt during undefineFlags."
                        " Retrying with undefine"), instance=instance)
                    # On libvirtError, retry with the plain undefine API
                    virt_dom.undefine()
                except AttributeError:
                    # Older versions may not support VIR_DOMAIN_UNDEFINE_MANAGED_SAVE
                    try:
                        # Check whether the domain has a managed save image and remove it if so
                        if virt_dom.hasManagedSaveImage(0):
                            virt_dom.managedSaveRemove(0)
                    except AttributeError:
                        pass
                    # Then undefine the domain
                    virt_dom.undefine()
            except libvirt.libvirtError as e:
                with excutils.save_and_reraise_exception():
                    errcode = e.get_error_code()
                    LOG.error(_('Error from libvirt during undefine. '
                                'Code=%(errcode)s Error=%(e)s') %
                              {'errcode': errcode, 'e': e}, instance=instance)
    
    # Unplug the instance's virtual NICs from the virtual network
    def unplug_vifs(self, instance, network_info, ignore_errors=False):
        for vif in network_info:
            try:
                # This is effectively a no-op here: with a bridge, libvirt creates
                # the virtual NIC for us and removes it again once the domain is
                # destroyed and deleted
                self.vif_driver.unplug(instance, vif)
            except exception.NovaException:
                if not ignore_errors:
                    raise
    
    # Delete the instance's files and return whether the deletion succeeded
    def delete_instance_files(self, instance):
        # Directory where the instance's files are stored on this host, i.e. $instances_path/$instance['uuid']
        target = libvirt_utils.get_instance_path(instance)
        # If the directory exists
        if os.path.exists(target):
            LOG.info(_('Deleting instance files %s'), target,
                     instance=instance)
            try:
                # Remove the directory recursively, like rm -r
                shutil.rmtree(target)
            except OSError as e:
                LOG.error(_('Failed to cleanup directory %(target)s: '
                            '%(e)s'), {'target': target, 'e': e},
                            instance=instance)

        # The deletion may have failed
        if os.path.exists(target):
            LOG.info(_('Deletion of %s failed'), target, instance=instance)
            return False

        LOG.info(_('Deletion of %s complete'), target, instance=instance)
        return True
    
    def _delete_instance_files(self, instance):
        context = nova_context.get_admin_context(read_deleted='yes')
        inst_obj = instance_obj.Instance.get_by_uuid(context, instance['uuid'])
        # Instance metadata is split into system and user metadata; read clean_attempts
        # (the number of cleanup attempts so far) from the system metadata
        attempts = int(inst_obj.system_metadata.get('clean_attempts', '0'))
        # Delete the stored instance files
        success = self.delete_instance_files(inst_obj)
        # Increment the cleanup attempt counter
        inst_obj.system_metadata['clean_attempts'] = str(attempts + 1)
        if success:
            # If the files were deleted successfully, set the instance's cleaned field to True
            inst_obj.cleaned = True
        # Save to the database
        inst_obj.save(context)
    
    # Get the instance's logical volumes
    def _lvm_disks(self, instance):
        if CONF.libvirt.images_volume_group:
            vg = os.path.join('/dev', CONF.libvirt.images_volume_group)
            if not os.path.exists(vg):
                return []
            pattern = '%s_' % instance['uuid']

            def belongs_to_instance_legacy(disk):
                pattern = '%s_' % instance['name']
                if disk.startswith(pattern):
                    if CONF.instance_name_template == 'instance-%08x':
                        return True
                    else:
                        LOG.warning(_('Volume %(disk)s possibly unsafe to '
                                      'remove, please clean up manually'),
                                    {'disk': disk})
                return False

            def belongs_to_instance(disk):
                return disk.startswith(pattern)

            def fullpath(name):
                return os.path.join(vg, name)

            logical_volumes = libvirt_utils.list_logical_volumes(vg)

            disk_names = filter(belongs_to_instance, logical_volumes)
            disk_names.extend(
                filter(belongs_to_instance_legacy, logical_volumes)
            )
            disks = map(fullpath, disk_names)
            return disks
        return []
    
    # Clean up the instance's logical volumes
    def _cleanup_lvm(self, instance):
        # Get the instance's logical volumes
        disks = self._lvm_disks(instance)
        if disks:
            # Remove the instance's logical volumes
            libvirt_utils.remove_logical_volumes(*disks)
    
    # Clean up the instance's RBD volumes
    def _cleanup_rbd(self, instance):
        pool = CONF.libvirt.images_rbd_pool
        volumes = libvirt_utils.list_rbd_volumes(pool)
        pattern = instance['uuid']

        def belongs_to_instance(disk):
            return disk.startswith(pattern)

        volumes = filter(belongs_to_instance, volumes)

        if volumes:
            libvirt_utils.remove_rbd_volumes(pool, *volumes)
    
    def cleanup(self, context, instance, network_info, block_device_info=None,
                destroy_disks=True):
        # Undefine the domain corresponding to the instance
        self._undefine_domain(instance)
        # Remove the instance's virtual NICs from the virtual network
        self.unplug_vifs(instance, network_info, ignore_errors=True)
        retry = True
        while retry:
            try:
                # Remove the iptables rules related to the instance
                self.firewall_driver.unfilter_instance(instance,
                                                    network_info=network_info)
            except libvirt.libvirtError as e:
                try:
                    # Get the instance's state on the hypervisor
                    state = self.get_info(instance)['state']
                except exception.InstanceNotFound:
                    state = power_state.SHUTDOWN

                if state != power_state.SHUTDOWN:
                    LOG.warn(_("Instance may be still running, destroy "
                               "it again."), instance=instance)
                    self._destroy(instance)
                else:
                    retry = False
                    errcode = e.get_error_code()
                    LOG.exception(_('Error from libvirt during unfilter. '
                                'Code=%(errcode)s Error=%(e)s') %
                              {'errcode': errcode, 'e': e},
                              instance=instance)
                    reason = "Error unfiltering instance."
                    raise exception.InstanceTerminationFailure(reason=reason)
            except Exception:
                # On any unknown error other than libvirtError, stop retrying and re-raise
                retry = False
                raise
            else:
                retry = False

        # The following handles the instance's volume block devices: first
        # detach them from the instance, then disconnect them
        block_device_mapping = driver.block_device_info_get_mapping(
            block_device_info)
        for vol in block_device_mapping:
            connection_info = vol['connection_info']
            disk_dev = vol['mount_device'].rpartition("/")[2]

            if ('data' in connection_info and
                    'volume_id' in connection_info['data']):
                volume_id = connection_info['data']['volume_id']
                encryption = encryptors.get_encryption_metadata(
                    context, self._volume_api, volume_id, connection_info)

                if encryption:
                    encryptor = self._get_volume_encryptor(connection_info,
                                                           encryption)
                    encryptor.detach_volume(**encryption)

            try:
                self.volume_driver_method('disconnect_volume',
                                          connection_info,
                                          disk_dev)
            except Exception as exc:
                with excutils.save_and_reraise_exception() as ctxt:
                    if destroy_disks:
                        ctxt.reraise = False
                        LOG.warn(_("Ignoring Volume Error on vol %(vol_id)s "
                                   "during delete %(exc)s"),
                                 {'vol_id': vol.get('volume_id'), 'exc': exc},
                                 instance=instance)

        if destroy_disks:
            # If the instance's disk files should be destroyed, delete them here
            self._delete_instance_files(instance)

            # Hypervisors such as KVM can back instance disks with logical volumes, so clean those up as well
            self._cleanup_lvm(instance)
            # If Ceph RBD volumes back the instance's disks, clean those up too
            if CONF.libvirt.images_type == 'rbd':
                self._cleanup_rbd(instance)
    
    # Destroy the domain completely
    def destroy(self, context, instance, network_info, block_device_info=None,
                destroy_disks=True):
        # First power off the domain
        self._destroy(instance)
        # Then remove the domain and clean up its disk files, virtual NICs, firewall rules, etc.
        self.cleanup(context, instance, network_info, block_device_info,
                     destroy_disks)
    
    # Plug the instance's virtual NICs into the given virtual networks
    def plug_vifs(self, instance, network_info):
        for vif in network_info:
            # This essentially just makes sure the virtual network's bridge exists.
            # It also settles an earlier doubt of mine: with FlatDHCP or VLAN
            # networking and multi_host=False (a single nova-network node), can
            # nova-compute still work when the bridge has not been created yet?
            # It turns out nova-compute can create the bridge itself.
            self.vif_driver.plug(instance, vif)
    
    # Revert a migration
    def finish_revert_migration(self, context, instance, network_info,
                                block_device_info=None, power_on=True):
        LOG.debug(_("Starting finish_revert_migration"),
                   instance=instance)

        inst_base = libvirt_utils.get_instance_path(instance)
        # A resize also migrates the instance;
        # the original instance files are backed up before the migration
        inst_base_resize = inst_base + "_resize"

        if os.path.exists(inst_base_resize):
            # If the backup directory exists, clean up the current instance
            # files and restore from the backup
            self._cleanup_failed_migration(inst_base)
            utils.execute('mv', inst_base_resize, inst_base)

        disk_info = blockinfo.get_disk_info(CONF.libvirt.virt_type,
                                            instance,
                                            block_device_info)
        # Convert the instance's network, disk and block device information into
        # the libvirt XML string used to create the domain; covered in detail later
        xml = self.to_xml(context, instance, network_info, disk_info,
                          block_device_info=block_device_info)
        # Create the domain from the libvirt XML string
        self._create_domain_and_network(context, xml, instance, network_info,
                                        block_device_info, power_on)

        if power_on:
            # Wait until the domain's power state is RUNNING
            timer = loopingcall.FixedIntervalLoopingCall(
                                                    self._wait_for_running,
                                                    instance)
            timer.start(interval=0.5).wait()
    
    def init_host(self, host):
        # First register an error handler with libvirt
        libvirt.registerErrorHandler(libvirt_error_handler, None)
        # Register the default event loop implementation (based on poll)
        libvirt.virEventRegisterDefaultImpl()
        # Emit the appropriate quality warnings
        self._do_quality_warnings()

        # Check that the libvirt library meets the MIN_LIBVIRT_VERSION requirement
        if not self.has_min_version(MIN_LIBVIRT_VERSION):
            # If not, log an error
            major = MIN_LIBVIRT_VERSION[0]
            minor = MIN_LIBVIRT_VERSION[1]
            micro = MIN_LIBVIRT_VERSION[2]
            LOG.error(_('Nova requires libvirt version '
                        '%(major)i.%(minor)i.%(micro)i or greater.'),
                      {'major': major, 'minor': minor, 'micro': micro})
        # Initialize the event subsystem
        self._init_events()

class ComputeManager(manager.Manager):

    def __init__(self, compute_driver=None, *args, **kwargs):
        ...
        # Load the virtualization driver; with compute_driver=libvirt.LibvirtDriver
        # configured, LibvirtDriver is loaded here
        self.driver = driver.load_compute_driver(self.virtapi, compute_driver)
        self.use_legacy_block_device_info = self.driver.need_legacy_block_device_info
    
    # Synchronize the instance's power state
    def _sync_instance_power_state(self, context, db_instance, vm_power_state,
                                   use_slave=False):
        # Refresh the instance from the database
        db_instance.refresh(use_slave=use_slave)
        db_power_state = db_instance.power_state
        vm_state = db_instance.vm_state

        # Only sync power state for instances on this host
        if self.host != db_instance.host:
            # If the instance's host is no longer this host at sync time, it was
            # probably migrated elsewhere, so just return
            LOG.info(_("During the sync_power process the "
                       "instance has moved from "
                       "host %(src)s to host %(dst)s") %
                       {'src': self.host,
                        'dst': db_instance.host},
                     instance=db_instance)
            return
        elif db_instance.task_state is not None:
            # If the instance has a pending task at sync time, just return
            LOG.info(_("During sync_power_state the instance has a "
                       "pending task (%(task)s). Skip."),
                     {'task': db_instance.task_state},
                     instance=db_instance)
            return

        # If the recorded power state differs from the current one, update the database
        if vm_power_state != db_power_state:
            db_instance.power_state = vm_power_state
            db_instance.save()
            db_power_state = vm_power_state

        # Do nothing for instances in the following states
        if vm_state in (vm_states.BUILDING,
                        vm_states.RESCUED,
                        vm_states.RESIZED,
                        vm_states.SUSPENDED,
                        vm_states.ERROR):
            pass
        elif vm_state == vm_states.ACTIVE:
            if vm_power_state in (power_state.SHUTDOWN,
                                  power_state.CRASHED):
                # The instance state is ACTIVE but its power state is SHUTDOWN or CRASHED
                LOG.warn(_("Instance shutdown by itself. Calling "
                           "the stop API."), instance=db_instance)
                try:
                    # Stop the instance
                    self.compute_api.stop(context, db_instance)
                except Exception:
                    LOG.exception(_("error during stop() in "
                                    "sync_power_state."),
                                  instance=db_instance)
            elif vm_power_state == power_state.SUSPENDED:
                # The instance state is ACTIVE but its power state is SUSPENDED
                LOG.warn(_("Instance is suspended unexpectedly. Calling "
                           "the stop API."), instance=db_instance)
                try:
                    self.compute_api.stop(context, db_instance)
                except Exception:
                    LOG.exception(_("error during stop() in "
                                    "sync_power_state."),
                                  instance=db_instance)
            elif vm_power_state == power_state.PAUSED:
                # The instance state is ACTIVE but its power state is PAUSED
                LOG.warn(_("Instance is paused unexpectedly. Ignore."),
                         instance=db_instance)
            elif vm_power_state == power_state.NOSTATE:
                # The instance state is ACTIVE but its power state is NOSTATE
                LOG.warn(_("Instance is unexpectedly not found. Ignore."),
                         instance=db_instance)
        elif vm_state == vm_states.STOPPED:
            if vm_power_state not in (power_state.NOSTATE,
                                      power_state.SHUTDOWN,
                                      power_state.CRASHED):
                # The instance state is STOPPED but its power state is none of NOSTATE, SHUTDOWN or CRASHED
                LOG.warn(_("Instance is not stopped. Calling "
                           "the stop API."), instance=db_instance)
                try:
                    # Force-stop the instance; a normal stop would not try to stop an instance already in STOPPED
                    self.compute_api.force_stop(context, db_instance)
                except Exception:
                    LOG.exception(_("error during stop() in "
                                    "sync_power_state."),
                                  instance=db_instance)
        elif vm_state == vm_states.PAUSED:
            if vm_power_state in (power_state.SHUTDOWN,
                                  power_state.CRASHED):
                # The instance state is PAUSED but its power state is SHUTDOWN or CRASHED
                LOG.warn(_("Paused instance shutdown by itself. Calling "
                           "the stop API."), instance=db_instance)
                try:
                    # Force-stop the instance
                    self.compute_api.force_stop(context, db_instance)
                except Exception:
                    LOG.exception(_("error during stop() in "
                                    "sync_power_state."),
                                  instance=db_instance)
        elif vm_state in (vm_states.SOFT_DELETED,
                          vm_states.DELETED):
            if vm_power_state not in (power_state.NOSTATE,
                                      power_state.SHUTDOWN):
                # The instance state is SOFT_DELETED or DELETED but its power state is neither NOSTATE nor SHUTDOWN
                LOG.warn(_("Instance is not (soft-)deleted."),
                         instance=db_instance)
    
    # Handle instance lifecycle events
    def handle_lifecycle_event(self, event):
        LOG.info(_("VM %(state)s (Lifecycle Event)") %
                  {'state': event.get_name()},
                 instance_uuid=event.get_instance_uuid())
        # The instance may already be deleted, so read_deleted is needed
        context = nova.context.get_admin_context(read_deleted='yes')
        # Get the instance the event refers to
        instance = instance_obj.Instance.get_by_uuid(
            context, event.get_instance_uuid())
        # Derive the instance's power state from the event type
        vm_power_state = None
        if event.get_transition() == virtevent.EVENT_LIFECYCLE_STOPPED:
            vm_power_state = power_state.SHUTDOWN
        elif event.get_transition() == virtevent.EVENT_LIFECYCLE_STARTED:
            vm_power_state = power_state.RUNNING
        elif event.get_transition() == virtevent.EVENT_LIFECYCLE_PAUSED:
            vm_power_state = power_state.PAUSED
        elif event.get_transition() == virtevent.EVENT_LIFECYCLE_RESUMED:
            vm_power_state = power_state.RUNNING
        else:
            LOG.warning(_("Unexpected power state %d") %
                        event.get_transition())

        if vm_power_state is not None:
            # If this is a power state we care about, sync it
            self._sync_instance_power_state(context,
                                            instance,
                                            vm_power_state)
    
    def handle_events(self, event):
        # Check whether the event is a LifecycleEvent
        if isinstance(event, virtevent.LifecycleEvent):
            try:
                # Handle the event with handle_lifecycle_event
                self.handle_lifecycle_event(event)
            except exception.InstanceNotFound:
                LOG.debug(_("Event %s arrived for non-existent instance. The "
                            "instance was probably deleted.") % event)
        else:
            LOG.debug(_("Ignoring event %s") % event)
    
    def init_virt_events(self):
        # Register handle_events as the event callback
        self.driver.register_event_listener(self.handle_events)
    
    # Get the database records of the instances present on the hypervisor, applying filters
    def _get_instances_on_driver(self, context, filters=None):
        if not filters:
            filters = {}
        try:
            # Get the UUIDs of all instances on the hypervisor via the virt driver
            driver_uuids = self.driver.list_instance_uuids()
            filters['uuid'] = driver_uuids
            # Query the database for the local instances matching the filters and return them
            local_instances = instance_obj.InstanceList.get_by_filters(
                context, filters, use_slave=True)
            return local_instances
        except NotImplementedError:
            pass

        driver_instances = self.driver.list_instances()
        instances = instance_obj.InstanceList.get_by_filters(context, filters,
                                                             use_slave=True)
        name_map = dict((instance.name, instance) for instance in instances)
        local_instances = []
        for driver_instance in driver_instances:
            instance = name_map.get(driver_instance)
            if not instance:
                continue
            local_instances.append(instance)
        return local_instances
    
    # Convert the instance's block device mappings into the driver block device format
    def _get_instance_block_device_info(self, context, instance,
                                        refresh_conn_info=False,
                                        bdms=None):
        if not bdms:
            # Load the instance's block device mappings from the database
            bdms = (block_device_obj.BlockDeviceMappingList.
                    get_by_instance_uuid(context, instance['uuid']))
        # Convert the mappings into swap driver block device objects
        swap = driver_block_device.convert_swap(bdms)
        # Convert the mappings into ephemeral driver block device objects
        ephemerals = driver_block_device.convert_ephemerals(bdms)
        # Volume driver block device objects (snapshot and image driver block devices fall into the same category)
        block_device_mapping = (
            driver_block_device.convert_volumes(bdms) +
            driver_block_device.convert_snapshots(bdms) +
            driver_block_device.convert_images(bdms))

        # If the volume connection info does not need refreshing, filter out
        # entries whose connection_info is empty; otherwise refresh the
        # connection_info of every volume driver block device.
        # Volume block devices involve the OpenStack Cinder block storage
        # service; this will be revisited when studying Cinder.
        if not refresh_conn_info:
            block_device_mapping = [
                bdm for bdm in block_device_mapping
                if bdm.get('connection_info')]
        else:
            block_device_mapping = driver_block_device.refresh_conn_infos(
                block_device_mapping, context, instance, self.volume_api,
                self.driver)

        # Whether to use the legacy block device format; False here
        if self.use_legacy_block_device_info:
            swap = driver_block_device.legacy_block_devices(swap)
            ephemerals = driver_block_device.legacy_block_devices(ephemerals)
            block_device_mapping = driver_block_device.legacy_block_devices(
                block_device_mapping)

        # swap is a list at this point; reduce it to a single device
        swap = driver_block_device.get_swap(swap)

        # Return the block device info in the virt driver's format
        return {'swap': swap,
                'ephemerals': ephemerals,
                'block_device_mapping': block_device_mapping}
    
    # Check whether the instance is stored on shared storage.
    # How it works: first try to decide from the image format in use (with Qcow2,
    # as used here, shared storage is not required, so this alone is inconclusive);
    # then create a temporary file under this host's instance directory and ask,
    # via RPC, whether the nova-compute on the other host can see that file.
    # If it can, the storage is shared; otherwise it is not.
    def _is_instance_storage_shared(self, context, instance):
        shared_storage = True
        data = None
        try:
            data = self.driver.check_instance_shared_storage_local(context,
                                                       instance)
            if data:
                shared_storage = (self.compute_rpcapi.
                                  check_instance_shared_storage(context,
                                  obj_base.obj_to_primitive(instance),
                                  data))
        except NotImplementedError:
            LOG.warning(_('Hypervisor driver does not support '
                          'instance shared storage check, '
                          'assuming it\'s not on shared storage'),
                        instance=instance)
            shared_storage = False
        except Exception:
            LOG.exception(_('Failed to check if instance shared'),
                      instance=instance)
        finally:
            if data:
                self.driver.check_instance_shared_storage_cleanup(context,
                                                                  data)
        return shared_storage
    
    # Destroy evacuated instances
    def _destroy_evacuated_instances(self, context):
        our_host = self.host
        filters = {'deleted': False}
        # Get the database records of the instances present on the hypervisor
        local_instances = self._get_instances_on_driver(context, filters)
        for instance in local_instances:
            if instance.host != our_host:
                if instance.task_state in [task_states.MIGRATING]:
                    # The instance's host differs from this host, but it is
                    # migrating, which is normal; do nothing
                    LOG.debug('Will not delete instance as its host ('
                              '%(instance_host)s) is not equal to our '
                              'host (%(our_host)s) but its state is '
                              '(%(task_state)s)',
                              {'instance_host': instance.host,
                               'our_host': our_host,
                               'task_state': instance.task_state},
                              instance=instance)
                    continue
                LOG.info(_('Deleting instance as its host ('
                           '%(instance_host)s) is not equal to our '
                           'host (%(our_host)s).'),
                         {'instance_host': instance.host,
                          'our_host': our_host}, instance=instance)
                # In this case the instance is considered evacuated (or the record
                # is stale), so remove it from this host; whether its local disks
                # are deleted is decided below
                destroy_disks = False
                try:
                    # Get the instance's network info from nova-network via RPC
                    network_info = self._get_instance_nw_info(context,
                                                              instance)
                    # Get the instance's block device info
                    bdi = self._get_instance_block_device_info(context,
                                                               instance)
                    # If the instance files live on shared storage, do not delete the disk files; otherwise delete them
                    destroy_disks = not (self._is_instance_storage_shared(
                                                            context, instance))
                except exception.InstanceNotFound:
                    network_info = network_model.NetworkInfo()
                    bdi = {}
                    LOG.info(_('Instance has been marked deleted already, '
                               'removing it from the hypervisor.'),
                             instance=instance)
                    # If the instance cannot be found it has been deleted, so its disk files should be removed
                    destroy_disks = True
                # Destroy the instance completely via the libvirt driver
                self.driver.destroy(context, instance,
                                    network_info,
                                    bdi, destroy_disks)
    
    # Initialize an instance during service initialization
    def _init_instance(self, context, instance):
        # If the instance state is SOFT_DELETED, or it is ERROR with a task state
        # other than RESIZE_MIGRATING or DELETING, do nothing
        if (instance.vm_state == vm_states.SOFT_DELETED or
            (instance.vm_state == vm_states.ERROR and
            instance.task_state not in
            (task_states.RESIZE_MIGRATING, task_states.DELETING))):
            LOG.debug(_("Instance is in %s state."),
                      instance.vm_state, instance=instance)
            return

        if instance.vm_state == vm_states.DELETED:
            try:
                # If the instance state is DELETED, finish the partially completed
                # deletion, mainly (soft-)deleting the instance's database records
                # and updating the tenant's quotas
                self._complete_partial_deletion(context, instance)
            except Exception:
                msg = _('Failed to complete a deletion')
                LOG.exception(msg, instance=instance)
            finally:
                return

        if (instance.vm_state == vm_states.BUILDING or
            instance.task_state in [task_states.SCHEDULING,
                                    task_states.BLOCK_DEVICE_MAPPING,
                                    task_states.NETWORKING,
                                    task_states.SPAWNING]):
            # If the instance state is BUILDING, or its task state is
            # SCHEDULING/BLOCK_DEVICE_MAPPING/NETWORKING/SPAWNING, then
            # nova-compute on this host was restarted while the instance was being
            # created; the safe thing to do is set the instance state to ERROR
            # and clear its task state
            LOG.debug(_("Instance failed to spawn correctly, "
                        "setting to ERROR state"), instance=instance)
            instance.task_state = None
            instance.vm_state = vm_states.ERROR
            instance.save()
            return

        if (instance.vm_state != vm_states.ERROR and
            instance.task_state in [task_states.IMAGE_SNAPSHOT_PENDING,
                                    task_states.IMAGE_PENDING_UPLOAD,
                                    task_states.IMAGE_UPLOADING,
                                    task_states.IMAGE_SNAPSHOT]):
            # If the instance state is not ERROR and its task state is
            # IMAGE_SNAPSHOT_PENDING/IMAGE_PENDING_UPLOAD/IMAGE_UPLOADING/IMAGE_SNAPSHOT,
            # clear the instance's task state
            LOG.debug(_("Instance in transitional state %s at start-up "
                        "clearing task state"),
                        instance['task_state'], instance=instance)
            instance.task_state = None
            instance.save()

        if instance.task_state == task_states.DELETING:
            try:
                # If the instance's task state is DELETING, redo the deletion
                LOG.info(_('Service started deleting the instance during '
                           'the previous run, but did not finish. Restarting '
                           'the deletion now.'), instance=instance)
                instance.obj_load_attr('metadata')
                instance.obj_load_attr('system_metadata')
                bdms = (block_device_obj.BlockDeviceMappingList.
                        get_by_instance_uuid(context, instance.uuid))
                quotas = quotas_obj.Quotas.from_reservations(
                        context, None, instance=instance)
                self._delete_instance(context, instance, bdms, quotas)
            except Exception:
                msg = _('Failed to complete a deletion')
                LOG.exception(msg, instance=instance)
                self._set_instance_error_state(context, instance['uuid'])
            finally:
                return

        # Decide the reboot type (hard or soft) from the instance's current power
        # state, and whether to retry the reboot from its vm_state and task_state
        try_reboot, reboot_type = self._retry_reboot(context, instance)
        current_power_state = self._get_power_state(context, instance)

        if try_reboot:
            LOG.debug(_("Instance in transitional state (%(task_state)s) at "
                        "start-up and power state is (%(power_state)s), "
                        "triggering reboot"),
                       {'task_state': instance['task_state'],
                        'power_state': current_power_state},
                       instance=instance)
            # If a reboot should be retried, reboot the instance by calling
            # nova-compute's reboot_instance over RPC.
            # The instance is on this very host, so why RPC? Presumably so service
            # initialization does not block on it and it is handled once the
            # service is fully up?
            self.compute_rpcapi.reboot_instance(context, instance,
                                                block_device_info=None,
                                                reboot_type=reboot_type)
            return

        elif (current_power_state == power_state.RUNNING and
           instance.task_state in [task_states.REBOOT_STARTED,
                                   task_states.REBOOT_STARTED_HARD]):
            LOG.warning(_("Instance in transitional state "
                          "(%(task_state)s) at start-up and power state "
                          "is (%(power_state)s), clearing task state"),
                        {'task_state': instance['task_state'],
                         'power_state': current_power_state},
                        instance=instance)
            # If the instance's current power state is RUNNING and its task state
            # is REBOOT_STARTED/REBOOT_STARTED_HARD, set the instance state to
            # ACTIVE and clear the task state
            instance.task_state = None
            instance.vm_state = vm_states.ACTIVE
            instance.save()

        if instance.task_state == task_states.POWERING_OFF:
            try:
                LOG.debug(_("Instance in transitional state %s at start-up "
                            "retrying stop request"),
                            instance['task_state'], instance=instance)
                # Why is the stop not done over RPC here? orz
                self.stop_instance(context, instance)
            except Exception:
                msg = _('Failed to stop instance')
                LOG.exception(msg, instance=instance)
            finally:
                return

        if instance.task_state == task_states.POWERING_ON:
            try:
                LOG.debug(_("Instance in transitional state %s at start-up "
                            "retrying start request"),
                            instance['task_state'], instance=instance)
                # Start the instance
                self.start_instance(context, instance)
            except Exception:
                msg = _('Failed to start instance')
                LOG.exception(msg, instance=instance)
            finally:
                return

        net_info = compute_utils.get_nw_info_for_instance(instance)
        try:
            # Plug the instance's virtual NICs into the virtual network
            self.driver.plug_vifs(instance, net_info)
        except NotImplementedError as e:
            LOG.debug(e, instance=instance)
        except exception.VirtualInterfacePlugException:
            LOG.exception(_("Vifs plug failed"), instance=instance)
            self._set_instance_error_state(context, instance.uuid)
            return

        if instance.task_state == task_states.RESIZE_MIGRATING:
            try:
                power_on = (instance.system_metadata.get('old_vm_state') !=
                            vm_states.STOPPED)

                block_dev_info = self._get_instance_block_device_info(context,
                                                                      instance)
                # If the instance's task state is RESIZE_MIGRATING, a resize or
                # migration was interrupted, so revert it
                self.driver.finish_revert_migration(context,
                    instance, net_info, block_dev_info, power_on)

            except Exception as e:
                LOG.exception(_('Failed to revert crashed migration'),
                              instance=instance)
            finally:
                LOG.info(_('Instance found in migrating state during '
                           'startup. Resetting task_state'),
                         instance=instance)
                instance.task_state = None
                instance.save()

        db_state = instance.power_state
        drv_state = self._get_power_state(context, instance)
        expect_running = (db_state == power_state.RUNNING and
                          drv_state != db_state)

        LOG.debug(_('Current state is %(drv_state)s, state in DB is '
                    '%(db_state)s.'),
                  {'drv_state': drv_state, 'db_state': db_state},
                  instance=instance)

        if expect_running and CONF.resume_guests_state_on_host_boot:
            LOG.info(_('Rebooting instance after nova-compute restart.'),
                     instance=instance)

            block_device_info = \
                self._get_instance_block_device_info(context, instance)

            try:
                # If the instance's power state in the database is RUNNING but
                # differs from the state on the hypervisor, and the host is
                # configured to resume guests on boot, resume the instance's state
                self.driver.resume_state_on_host_boot(
                    context, instance, net_info, block_device_info)
            except NotImplementedError:
                LOG.warning(_('Hypervisor driver does not support '
                              'resume guests'), instance=instance)
            except Exception:
                LOG.warning(_('Failed to resume instance'), instance=instance)
                self._set_instance_error_state(context, instance.uuid)

        elif drv_state == power_state.RUNNING:
            try:
                # Make sure the instance's firewall rules exist
                self.driver.ensure_filtering_rules_for_instance(
                                       instance, net_info)
            except NotImplementedError:
                LOG.warning(_('Hypervisor driver does not support '
                              'firewall rules'), instance=instance)
    
    def init_host(self):
        # Initialize this host with the virt driver; here this calls LibvirtDriver's init_host
        self.driver.init_host(host=self.host)
        context = nova.context.get_admin_context()
        # Get all instances belonging to this host
        instances = instance_obj.InstanceList.get_by_host(
            context, self.host, expected_attrs=['info_cache'])

        # Whether to defer applying iptables rules; default is False
        if CONF.defer_iptables_apply:
            self.driver.filter_defer_apply_on()
        
        # Initialize the virt event subsystem
        self.init_virt_events()

        try:
            # Clean up evacuated or deleted instances on this host
            self._destroy_evacuated_instances(context)
            # Initialize each instance belonging to this host
            for instance in instances:
                self._init_instance(context, instance)
        finally:
            if CONF.defer_iptables_apply:
                # If iptables rule application was deferred, apply the rules here
                self.driver.filter_defer_apply_off() 
