虚拟机冷迁移和resize代码分析（二）

最新推荐文章于 2021-07-08 17:16:41 发布

weixin_34291004

最新推荐文章于 2021-07-08 17:16:41 发布

阅读量227

点赞数

文章标签：运维 python 数据库

原文链接：https://my.oschina.net/u/1179767/blog/837649

版权

2019独角兽企业重金招聘Python工程师标准>>>

　　上一节我们分析了冷迁移和resize上层各自的逻辑，本节主要讲解两者底层相同的代码逻辑。其中对compute_api.resize()的调用会进入nova/compute/api.py中的resize()方法，部分代码和注释如下所示：

@wrap_check_policy
@check_instance_lock
@check_instance_cell
@check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED])
def resize(self, context, instance, flavor_id=None, clean_shutdown=True,
           **extra_instance_updates):
    """Resize (ie, migrate) an instance.

    If flavor_id is None, the process is considered a migration, keeping
    the original flavor_id. If flavor_id is not None, the instance should
    be migrated to a new host and resized to the new flavor_id.

    The ``check_instance_state`` decorator is given the ACTIVE and STOPPED
    vm_states.

    :param context: request context; its ``project_id`` is used in the
        over-quota warning.
    :param instance: the instance to resize/migrate. Its ``task_state`` is
        set to RESIZE_PREP, ``progress`` to 0, and it is saved with
        ``expected_task_state=[None]``.
    :param flavor_id: target flavor id; a falsy value means cold migration
        with the current flavor.
    :param clean_shutdown: passed through unchanged to
        ``compute_task_api.resize_instance``.
    :param extra_instance_updates: extra fields checked by
        ``_check_auto_disk_config`` and applied via ``instance.update``.
    :raises exception.CannotResizeDisk: the new flavor has ``root_gb == 0``
        while the current one does not and the instance is not
        volume-backed.
    :raises exception.FlavorNotFound: no flavor was resolved, or the new
        (different) flavor is flagged ``disabled``.
    :raises exception.CannotResizeToSameFlavor: a flavor_id was given that
        equals the current flavor and ``cell_type`` is not 'compute'.
    :raises exception.TooManyInstances: reserving the quota delta raised
        ``OverQuota``.
    """
    self._check_auto_disk_config(instance, **extra_instance_updates)

    current_instance_type = instance.get_flavor()

    # With no flavor_id the flavor stays unchanged (cold migration);
    # otherwise look up the flavor for the flavor_id passed by the caller.
    if not flavor_id:
        LOG.debug("flavor_id is None. Assuming migration.",
                  instance=instance)
        new_instance_type = current_instance_type
    else:
        new_instance_type = flavors.get_flavor_by_flavor_id(
                flavor_id, read_deleted="no")
        if (new_instance_type.get('root_gb') == 0 and
            current_instance_type.get('root_gb') != 0 and
            not self.is_volume_backed_instance(context, instance)):
            reason = _('Resize to zero disk flavor is not allowed.')
            raise exception.CannotResizeDisk(reason=reason)

    if not new_instance_type:
        raise exception.FlavorNotFound(flavor_id=flavor_id)

    current_instance_type_name = current_instance_type['name']
    new_instance_type_name = new_instance_type['name']
    LOG.debug("Old instance type %(current_instance_type_name)s, "
              "new instance type %(new_instance_type_name)s",
              {'current_instance_type_name': current_instance_type_name,
               'new_instance_type_name': new_instance_type_name},
              instance=instance)

    same_instance_type = (current_instance_type['id'] ==
                          new_instance_type['id'])

    # NOTE(sirp): We don't want to force a customer to change their flavor
    # when Ops is migrating off of a failed host.
    if not same_instance_type and new_instance_type.get('disabled'):
        raise exception.FlavorNotFound(flavor_id=flavor_id)

    if same_instance_type and flavor_id and self.cell_type != 'compute':
        raise exception.CannotResizeToSameFlavor()

    # ensure there is sufficient headroom for upsizes
    if flavor_id:
        # Compute the quota delta the resize needs (per the original
        # annotation this mainly counts vCPUs and memory).
        deltas = compute_utils.upsize_quota_delta(context,
                                                  new_instance_type,
                                                  current_instance_type)
        try:
            # Reserve the delta; an OverQuota failure is converted into
            # TooManyInstances below.
            quotas = compute_utils.reserve_quota_delta(context, deltas,
                                                       instance)
        except exception.OverQuota as exc:
            quotas = exc.kwargs['quotas']
            overs = exc.kwargs['overs']
            usages = exc.kwargs['usages']
            headroom = self._get_headroom(quotas, usages, deltas)
            (overs, reqs, total_alloweds,
             useds) = self._get_over_quota_detail(headroom, overs, quotas,
                                                  deltas)
            LOG.warning(_LW("%(overs)s quota exceeded for %(pid)s,"
                            " tried to resize instance."),
                        {'overs': overs, 'pid': context.project_id})
            raise exception.TooManyInstances(overs=overs,
                                             req=reqs,
                                             used=useds,
                                             allowed=total_alloweds)
    else:
        quotas = objects.Quotas(context=context)
    # Mark the instance as preparing for a resize.
    instance.task_state = task_states.RESIZE_PREP
    instance.progress = 0
    # Apply the caller-supplied updates and persist the instance; the save
    # expects the previous task_state to have been None.
    instance.update(extra_instance_updates)
    instance.save(expected_task_state=[None])

    filter_properties = {'ignore_hosts': []}
    # Decide whether the instance may land on its current host: when
    # allow_resize_to_same_host is false in the configuration, the current
    # host is added to ignore_hosts so later scheduling will not pick it.
    if not CONF.allow_resize_to_same_host:
        filter_properties['ignore_hosts'].append(instance.host)

    if self.cell_type == 'api':
        # Commit reservations early and create migration record.
        self._resize_cells_support(context, quotas, instance,
                                   current_instance_type,
                                   new_instance_type)

    if not flavor_id:
        # Record the action start: MIGRATE when no flavor_id was given
        # (cold migration), RESIZE otherwise.
        self._record_action_start(context, instance,
                                  instance_actions.MIGRATE)
    else:
        self._record_action_start(context, instance,
                                  instance_actions.RESIZE)

    scheduler_hint = {'filter_properties': filter_properties}
    # Hand off to the conductor task API (per the original annotation the
    # request is forwarded over RPC to the conductor manager).
    self.compute_task_api.resize_instance(context, instance,
            extra_instance_updates, scheduler_hint=scheduler_hint,
            flavor=new_instance_type,
            reservations=quotas.reservations or [],
            clean_shutdown=clean_shutdown)

　　接着调用nova/conductor/api.py.ComputeTaskAPI.resize_instance()方法，其代码如下：

def resize_instance(self, context, instance, extra_instance_updates,
                    scheduler_hint, flavor, reservations,
                    clean_shutdown=True):
    """Forward a cold-migrate/resize request to the conductor RPC API.

    ``extra_instance_updates`` is accepted only so that this signature
    stays compatible with the cells_rpcapi version of the method; it is
    not forwarded. Nothing is returned.
    """
    # A resize is neither a live migration nor a rebuild, and the two
    # live-migration-only options are sent as None.
    migrate_options = {
        'live': False,
        'rebuild': False,
        'flavor': flavor,
        'block_migration': None,
        'disk_over_commit': None,
        'reservations': reservations,
        'clean_shutdown': clean_shutdown,
    }
    self.conductor_compute_rpcapi.migrate_server(
        context, instance, scheduler_hint, **migrate_options)

　　接着继续调用nova/conductor/rpcapi.py.ComputeTaskAPI.migrate_server()方法，其代码如下：

def migrate_server(self, context, instance, scheduler_hint, live, rebuild,
                   flavor, block_migration, disk_over_commit,
                   reservations=None, clean_shutdown=True):
    """Send a synchronous 'migrate_server' RPC call through ``self.client``.

    The payload starts at RPC version 1.11 and is downgraded step by step
    (1.10, then 1.6, then 1.4) while the client reports it cannot send the
    current version. For cold migration/resize the caller passes
    ``live=False``.

    :returns: whatever the prepared client's ``call`` returns.
    """
    client = self.client
    payload = dict(
        instance=instance,
        scheduler_hint=scheduler_hint,
        live=live,
        rebuild=rebuild,
        flavor=flavor,
        block_migration=block_migration,
        disk_over_commit=disk_over_commit,
        reservations=reservations,
        clean_shutdown=clean_shutdown,
    )

    rpc_version = '1.11'
    if not client.can_send_version(rpc_version):
        # Falling back to 1.10: clean_shutdown is dropped from the payload.
        payload.pop('clean_shutdown')
        rpc_version = '1.10'
    if not client.can_send_version(rpc_version):
        # Falling back to 1.6: the flavor is sent as a primitive.
        payload['flavor'] = objects_base.obj_to_primitive(flavor)
        rpc_version = '1.6'
    if not client.can_send_version(rpc_version):
        # Falling back to 1.4: the instance is sent as a primitive too.
        instance_primitive = objects_base.obj_to_primitive(instance)
        payload['instance'] = jsonutils.to_primitive(instance_primitive)
        rpc_version = '1.4'

    call_context = client.prepare(version=rpc_version)
    return call_context.call(context, 'migrate_server', **payload)

　　nova-conductor会收到该请求，根据路由映射，该请求会递交给nova/conductor/manager.py.ComputeTaskManager.migrate_server()去处理。其代码和注释如下：

@messaging.expected_exceptions(exception.NoValidHost,
                               exception.ComputeServiceUnavailable,
                               exception.InvalidHypervisorType,
                               exception.InvalidCPUInfo,
                               exception.UnableToMigrateToSelf,
                               exception.DestinationHypervisorTooOld,
                               exception.InvalidLocalStorage,
                               exception.InvalidSharedStorage,
                               exception.HypervisorUnavailable,
                               exception.InstanceInvalidState,
                               exception.MigrationPreCheckError,
                               exception.LiveMigrationWithOldNovaNotSafe,
                               exception.UnsupportedPolicyException)
def migrate_server(self, context, instance, scheduler_hint, live, rebuild,
        flavor, block_migration, disk_over_commit, reservations=None,
        clean_shutdown=True):
    """Dispatch a migrate request to live migration or cold migrate/resize.

    * ``live`` set, no ``rebuild``, no ``flavor``: ``_live_migrate``.
    * no ``live``, no ``rebuild``, ``flavor`` set: ``_cold_migrate``
      (shared by cold migration and resize), wrapped in a 'cold_migrate'
      event reporter.
    * any other combination: ``NotImplementedError``.

    Legacy (non-object) ``instance`` and ``flavor`` arguments are converted
    to objects first.
    """
    # NOTE(danms): Until v2 of the RPC API, we need to tolerate
    # old-world instance objects here
    if instance and not isinstance(instance, nova_object.NovaObject):
        instance = objects.Instance._from_db_object(
            context, objects.Instance(), instance,
            expected_attrs=['metadata', 'system_metadata', 'info_cache',
                            'security_groups'])

    # NOTE: Remove this when we drop support for v1 of the RPC API
    if flavor and not isinstance(flavor, objects.Flavor):
        # Code downstream may expect extra_specs to be populated since it
        # is receiving an object, so lookup the flavor to ensure this.
        flavor = objects.Flavor.get_by_id(context, flavor['id'])

    # Neither supported path handles a rebuild.
    if rebuild:
        raise NotImplementedError()

    if live and not flavor:
        self._live_migrate(context, instance, scheduler_hint,
                           block_migration, disk_over_commit)
        return

    if flavor and not live:
        # Not live, not a rebuild, and a flavor is present: this is a
        # resize or a cold migration; both run the same code from here on.
        reporter = compute_utils.EventReporter(context, 'cold_migrate',
                                               instance.uuid)
        with reporter:
            self._cold_migrate(context, instance, flavor,
                               scheduler_hint['filter_properties'],
                               reservations, clean_shutdown)
        return

    raise NotImplementedError()

　　接着调用_cold_migrate()方法，其代码和注释如下：

def _cold_migrate(self, context, instance, flavor, filter_properties,
                  reservations, clean_shutdown):
    """Build and run the cold-migrate/resize task for ``instance``.

    On any failure of the task the instance's ``task_state`` is reset to
    None via ``_set_vm_state_and_notify`` before the error propagates.
    ``NoValidHost`` is re-raised with a reason that says whether the
    request was a cold migrate or a resize.
    """
    sys_meta = instance.system_metadata
    image = utils.get_image_from_system_metadata(sys_meta)
    request_spec = scheduler_utils.build_request_spec(
        context, image, [instance], instance_type=flavor)
    migration_task = self._build_cold_migrate_task(
        context, instance, flavor, filter_properties, request_spec,
        reservations, clean_shutdown)

    try:
        migration_task.execute()
    except exception.NoValidHost as no_host_error:
        # Fall back to ACTIVE when the instance carries no vm_state.
        rollback = {'vm_state': instance.vm_state or vm_states.ACTIVE,
                    'task_state': None}
        self._set_vm_state_and_notify(context, instance.uuid,
                                      'migrate_server', rollback,
                                      no_host_error, request_spec)

        # if the flavor IDs match, it's migrate; otherwise resize
        is_migrate = flavor.id == instance.instance_type_id
        msg = (_("No valid host found for cold migrate") if is_migrate
               else _("No valid host found for resize"))
        raise exception.NoValidHost(reason=msg)
    except exception.UnsupportedPolicyException as policy_error:
        with excutils.save_and_reraise_exception():
            rollback = {'vm_state': instance.vm_state or vm_states.ACTIVE,
                        'task_state': None}
            self._set_vm_state_and_notify(context, instance.uuid,
                                          'migrate_server', rollback,
                                          policy_error, request_spec)
    except Exception as unexpected_error:
        with excutils.save_and_reraise_exception():
            # No ACTIVE fallback here: the vm_state is reported as-is.
            rollback = {'vm_state': instance.vm_state,
                        'task_state': None}
            self._set_vm_state_and_notify(context, instance.uuid,
                                          'migrate_server', rollback,
                                          unexpected_error, request_spec)

　　接着调用_build_cold_migrate_task()方法，其代码和注释如下：

 def _build_cold_migrate_task(self, context, instance, flavor,
                              filter_properties, request_spec, reservations,
                              clean_shutdown):
     """Create the ``migrate.MigrationTask`` for a cold migrate/resize.

     The task receives the request arguments unchanged plus this object's
     ``compute_rpcapi`` and ``scheduler_client``.
     """
     task_args = (context, instance, flavor, filter_properties,
                  request_spec, reservations, clean_shutdown,
                  self.compute_rpcapi, self.scheduler_client)
     return migrate.MigrationTask(*task_args)

　　后面会进一步调用nova/conductor/tasks/migrate.py.MigrationTask._execute()方法。其代码和注释如下：

def _execute(self):
    """Select a destination host and ask it to prepare for the resize.

    Rebuilds ``self.quotas`` from the reservations, sets up the instance
    group and retry information in the filter properties, asks the
    scheduler client for destinations, and sends ``prep_resize`` for the
    first returned host through ``self.compute_rpcapi``.

    Returns nothing. Indexing ``hosts[0]`` assumes the scheduler returned
    at least one host.
    """
    image = self.request_spec.get('image')
    self.quotas = objects.Quotas.from_reservations(self.context,
                                                   self.reservations,
                                                   instance=self.instance)
    scheduler_utils.setup_instance_group(self.context, self.request_spec,
                                         self.filter_properties)
    scheduler_utils.populate_retry(self.filter_properties,
                                   self.instance.uuid)

    # Ask nova-scheduler for the hosts the instance can be moved to.
    # Per the original annotation this is a synchronous message and the
    # call chain is:
    #   SchedulerClient -> SchedulerQueryClient -> SchedulerAPI
    hosts = self.scheduler_client.select_destinations(
        self.context, self.request_spec, self.filter_properties)
    # Use the first returned host for the cold migration (per the original
    # annotation, the first one carries the highest weight).
    host_state = hosts[0]
    scheduler_utils.populate_filter_properties(self.filter_properties,
                                               host_state)
    # context is not serializable
    self.filter_properties.pop('context', None)

    (host, node) = (host_state['host'], host_state['nodename'])
    self.compute_rpcapi.prep_resize(
        self.context, image, self.instance, self.flavor, host,
        self.reservations, request_spec=self.request_spec,
        filter_properties=self.filter_properties, node=node,
        clean_shutdown=self.clean_shutdown)

　　接下来继续调用nova/scheduler/rpcapi.py.select_destinations()方法，其代码和注释如下：

def select_destinations(self, ctxt, request_spec, filter_properties):
    """Synchronously call 'select_destinations' at RPC version 4.0.

    :returns: whatever the prepared client's ``call`` returns.
    """
    call_context = self.client.prepare(version='4.0')
    payload = {
        'request_spec': request_spec,
        'filter_properties': filter_properties,
    }
    return call_context.call(ctxt, 'select_destinations', **payload)

　　接下来进一步调用nova/scheduler/manager.py.SchedulerManager.select_destinations()方法。其代码和注释如下：

def select_destinations(self, context, request_spec, filter_properties):
    """Ask the scheduler driver for destinations and return primitives.

    The driver's result is passed through ``jsonutils.to_primitive``
    before being returned. It should be a list of dicts with 'host',
    'nodename' and 'limits' as keys.
    """
    chosen = self.driver.select_destinations(context, request_spec,
                                             filter_properties)
    primitive_dests = jsonutils.to_primitive(chosen)
    return primitive_dests

　　此时要注意，scheduler_driver的类型，该参数是在nova.conf配置的，默认采用nova.scheduler.filter_scheduler.FilterScheduler。故应该调用nova/scheduler/filter_scheduler.py.select_destinations()方法，其代码和注释如下：

def select_destinations(self, context, request_spec, filter_properties):
    """Select one host/node per requested instance.

    Emits 'scheduler.select_destinations.start' and '...end' notifications
    around the scheduling run.

    :returns: a list of dicts with 'host', 'nodename' and 'limits' keys,
        one per selected host.
    :raises exception.NoValidHost: fewer hosts were selected than
        instances requested.
    """
    # TODO(sbauza): Change the select_destinations method to accept a
    # RequestSpec object directly (and add a new RPC API method for passing
    # a RequestSpec object over the wire)
    spec_obj = objects.RequestSpec.from_primitives(
        context, request_spec, filter_properties)
    self.notifier.info(
        context, 'scheduler.select_destinations.start',
        {'request_spec': spec_obj.to_legacy_request_spec_dict()})

    wanted = spec_obj.num_instances
    chosen = self._schedule(context, spec_obj)

    # Couldn't fulfill the request_spec
    if len(chosen) < wanted:
        # NOTE(Rui Chen): If multiple creates failed, set the updated time
        # of selected HostState to None so that these HostStates are
        # refreshed according to database in next schedule, and release
        # the resource consumed by instance in the process of selecting
        # host.
        for weighed in chosen:
            weighed.obj.updated = None

        # Log the details but don't put those into the reason since
        # we don't want to give away too much information about our
        # actual environment.
        LOG.debug('There are %(hosts)d hosts available but '
                  '%(num_instances)d instances requested to build.',
                  {'hosts': len(chosen), 'num_instances': wanted})

        raise exception.NoValidHost(
            reason=_('There are not enough hosts available.'))

    dests = []
    for weighed in chosen:
        state = weighed.obj
        dests.append({'host': state.host,
                      'nodename': state.nodename,
                      'limits': state.limits})

    self.notifier.info(
        context, 'scheduler.select_destinations.end',
        {'request_spec': spec_obj.to_legacy_request_spec_dict()})
    return dests

其中_schedule()方法如下：

def _schedule(self, context, spec_obj):
    """Returns a list of hosts that meet the required specs,
    ordered by their fitness.

    One host is chosen per requested instance (``spec_obj.num_instances``).
    The loop stops early when filtering leaves no candidates, so the
    returned list may be shorter than the number of instances requested.
    Each chosen host has the request's resources consumed on it before the
    next round.
    """
    elevated = context.elevated()
    # Load the scheduler's configuration options. Per the original
    # annotation, nova.conf's scheduler_json_config_location can point at
    # a JSON file holding filter parameters.
    config_options = self._get_configuration_options()

    # Find our local list of acceptable hosts by repeatedly
    # filtering and weighing our options. Each time we choose a
    # host, we virtually consume resources on it so subsequent
    # selections can adjust accordingly.

    # Note: remember, we are using an iterator here. So only
    # traverse this list once. This can bite you if the hosts
    # are being scanned in a filter or weighing function.

    # Fetch the state of all candidate hosts (the original annotation
    # calls these the active hosts).
    hosts = self._get_all_host_states(elevated)

    selected_hosts = []
    num_instances = spec_obj.num_instances
    # TODO(sbauza): Modify the interfaces for HostManager and filters to
    # accept the RequestSpec object directly (in a later patch hopefully)
    filter_properties = spec_obj.to_legacy_filter_properties_dict()
    # NOTE(sbauza): Adding temporarly some keys since filters are
    # directly using it - until we provide directly RequestSpec
    filter_properties.update(
        {'request_spec': spec_obj.to_legacy_request_spec_dict(),
         'instance_type': spec_obj.flavor})
    # TODO(sbauza): Adding two keys not used in-tree but which will be
    # provided as non-fields for the RequestSpec once we provide it to the
    # filters
    filter_properties.update({'context': context,
                              'config_options': config_options})
    for num in range(num_instances):
        # Filter local hosts based on requirements ...
        # Per the original annotation, the filters in use come from the
        # scheduler_default_filters option in nova.conf and live under
        # nova/scheduler/filters.
        hosts = self.host_manager.get_filtered_hosts(hosts,
                filter_properties, index=num)
        if not hosts:
            # Can't get any more locally.
            break

        LOG.debug("Filtered %(hosts)s", {'hosts': hosts})
        # Weigh the surviving hosts. Per the original annotation the
        # result is sorted by descending weight, and the weighers come
        # from the scheduler_weight_classes option (code under
        # nova/scheduler/weights).
        weighed_hosts = self.host_manager.get_weighed_hosts(hosts,
                filter_properties)

        LOG.debug("Weighed %(hosts)s", {'hosts': weighed_hosts})

        # Clamp the configured subset size to [1, len(weighed_hosts)].
        subset_size = max(1, min(CONF.scheduler_host_subset_size,
                                 len(weighed_hosts)))
        # Pick at random among the first subset_size hosts; with a subset
        # size of 1 this is always the first host.
        chosen_host = random.choice(weighed_hosts[0:subset_size])
        LOG.debug("Selected host: %(host)s", {'host': chosen_host})
        selected_hosts.append(chosen_host)

        # Now consume the resources so the filter/weights
        # will change for the next instance.
        chosen_host.obj.consume_from_request(spec_obj)
        if filter_properties.get('group_updated') is True:
            filter_properties['group_hosts'].add(chosen_host.obj.host)
    return selected_hosts

　　接着回到_execute()方法，查看compute_rpcapi.prep_resize()方法，调用nova/compute/rpcapi.py.ComputeAPI.prep_resize()方法，其代码如下：

def prep_resize(self, ctxt, image, instance, instance_type, host,
                reservations=None, request_spec=None,
                filter_properties=None, node=None,
                clean_shutdown=True):
    """Cast 'prep_resize' to ``host`` through ``self.client``.

    Sent at RPC version 4.1 when the client can send it; otherwise at 4.0
    with ``instance_type`` converted to a primitive. This is a cast, so
    nothing is returned.
    """
    image_primitive = jsonutils.to_primitive(image)
    payload = dict(
        instance=instance,
        instance_type=instance_type,
        image=image_primitive,
        reservations=reservations,
        request_spec=request_spec,
        filter_properties=filter_properties,
        node=node,
        clean_shutdown=clean_shutdown,
    )

    rpc_version = '4.1'
    if not self.client.can_send_version(rpc_version):
        rpc_version = '4.0'
        payload['instance_type'] = objects_base.obj_to_primitive(
            instance_type)

    call_context = self.client.prepare(server=host, version=rpc_version)
    call_context.cast(ctxt, 'prep_resize', **payload)

　　接着，将消息传给nova/compute/manager.py.ComputeManager.prep_resize()方法，该节点为目的计算节点，其代码和注释如下：

@wrap_exception()
@reverts_task_state
@wrap_instance_event
@wrap_instance_fault
def prep_resize(self, context, image, instance, instance_type,
                reservations, request_spec, filter_properties, node,
                clean_shutdown):
    """Initiates the process of moving a running instance to another host.

    Possibly changes the RAM and disk size in the process.

    When ``node`` is None the first node reported by the driver is used.
    A non-object ``instance_type`` is replaced by the Flavor looked up by
    its id. The actual work is done by ``_prep_resize``:

    * ``MigrationPreCheckError`` is re-raised unchanged;
    * any other exception is handed to ``_reschedule_resize_or_reraise``;
    * a "resize.prep.end" usage notification carrying the new flavor's
      name and id is always emitted afterwards.

    The whole sequence runs inside ``_error_out_instance_on_exception``
    with the quotas rebuilt from ``reservations``.
    """
    if node is None:
        # The scheduling step above did not pass a node, so pick one here:
        # the first entry of the driver's available nodes.
        node = self.driver.get_available_nodes(refresh=True)[0]
        LOG.debug("No node specified, defaulting to %s", node,
                  instance=instance)

    # NOTE(melwitt): Remove this in version 5.0 of the RPC API
    # Code downstream may expect extra_specs to be populated since it
    # is receiving an object, so lookup the flavor to ensure this.
    if not isinstance(instance_type, objects.Flavor):
        instance_type = objects.Flavor.get_by_id(context,
                                                 instance_type['id'])

    quotas = objects.Quotas.from_reservations(context,
                                              reservations,
                                              instance=instance)
    with self._error_out_instance_on_exception(context, instance,
                                               quotas=quotas):
        compute_utils.notify_usage_exists(self.notifier, context, instance,
                                          current_period=True)
        self._notify_about_instance_usage(
                context, instance, "resize.prep.start")
        try:
            self._prep_resize(context, image, instance,
                              instance_type, quotas,
                              request_spec, filter_properties,
                              node, clean_shutdown)
        # NOTE(dgenin): This is thrown in LibvirtDriver when the
        #               instance to be migrated is backed by LVM.
        #               Remove when LVM migration is implemented.
        except exception.MigrationPreCheckError:
            raise
        except Exception:
            # try to re-schedule the resize elsewhere:
            exc_info = sys.exc_info()
            self._reschedule_resize_or_reraise(context, image, instance,
                    exc_info, instance_type, quotas, request_spec,
                    filter_properties)
        finally:
            # Runs on success and on failure alike, so the end
            # notification is always sent.
            extra_usage_info = dict(
                    new_instance_type=instance_type.name,
                    new_instance_type_id=instance_type.id)

            self._notify_about_instance_usage(
                context, instance, "resize.prep.end",
                extra_usage_info=extra_usage_info)

　　后面的内容下次继续讲解。

转载于:https://my.oschina.net/u/1179767/blog/837649