先捋一下主要的代码结构(分析主机过滤器选取主机节点方式)
- /nova/scheduler/driver.py(文件中最重要的就是类Scheduler,是所有调度器实现都要继承的基类)
- /nova/scheduler/manager.py(主要实现了一个类SchedulerManager,定义了若干需要通过调度器实现的主机管理操作)
- /nova/scheduler/host_manager.py(有两个类的实现,都是描述了跟调度器操作相关的主机的实现,其中,类HostState描述了从主机获取相关数据和状态的一些实现,类HostManager描述了跟调度器操作相关的一些主机管理实现)
- /nova/scheduler/chance.py(只有一个类ChanceScheduler,继承自类Scheduler,实现了基于随机选取主机节点的方式的调度器)
- /nova/scheduler/client (客户端调用程序的入口)
- /nova/scheduler/filter_scheduler.py(也只有一个类FilterScheduler,继承自类Scheduler,实现了基于主机过滤器选取主机节点方式的调度器)
- /nova/scheduler/filters 和 /nova/scheduler/weights (两个目录内容分别对应主机过滤器选取主机节点方式的两个步骤)
从conductor中的build_instances方法调用了select_destinations方法,这个方法的返回值就是可用于创建实例的目标主机列表。先分析一下调用路径。
现在直接看一下filter_scheduler.py中对select_destinations的实现。
def select_destinations(self, context, request_spec, filter_properties):
    """Selects a filtered set of hosts and nodes."""
    self.notifier.info(context, 'scheduler.select_destinations.start',
                       dict(request_spec=request_spec))
    num_instances = request_spec['num_instances']

    # Core step: ask the scheduling loop for one weighed host per instance.
    selected_hosts = self._schedule(context, request_spec, filter_properties)

    # All-or-nothing: when fewer hosts were found than instances were
    # requested, none of the instances is built and NoValidHost is raised.
    if len(selected_hosts) < num_instances:
        # NOTE(Rui Chen): If multiple creates failed, set the updated time
        # of selected HostState to None so that these HostStates are
        # refreshed according to database in next schedule, and release
        # the resource consumed by instance in the process of selecting
        # host.
        for picked in selected_hosts:
            picked.obj.updated = None

        # Log the details but don't put those into the reason since
        # we don't want to give away too much information about our
        # actual environment.
        LOG.debug('There are %(hosts)d hosts available but '
                  '%(num_instances)d instances requested to build.',
                  {'hosts': len(selected_hosts),
                   'num_instances': num_instances})
        raise exception.NoValidHost(
            reason=_('There are not enough hosts available.'))

    # Flatten each weighed-host wrapper into the plain dict the
    # conductor expects: host name, node name and resource limits.
    dests = [{'host': picked.obj.host,
              'nodename': picked.obj.nodename,
              'limits': picked.obj.limits} for picked in selected_hosts]

    self.notifier.info(context, 'scheduler.select_destinations.end',
                       dict(request_spec=request_spec))
    return dests
接下来分析关键的步骤,_schedule方法
def _schedule(self, context, request_spec, filter_properties):
    """Pick one weighed host per requested instance (filter + weigh loop).

    NOTE(review): these notes elide the setup code at the top of the
    upstream method (building the filter-properties dict and updating
    state); the names ``elevated``, ``instance_properties`` and
    ``update_group_hosts`` used below come from that omitted part.
    """
    # First fetch the state of every known compute host.
    hosts = self._get_all_host_states(elevated)
    selected_hosts = []
    # Number of instances to build (defaults to 1).
    num_instances = request_spec.get('num_instances', 1)
    # Iterate once per instance, choosing a suitable host for each.
    for num in range(num_instances):
        # Filter local hosts based on requirements ...
        hosts = self.host_manager.get_filtered_hosts(hosts,
            filter_properties, index=num)
        if not hosts:
            # Can't get any more locally.
            break
        LOG.debug("Filtered %(hosts)s", {'hosts': hosts})
        weighed_hosts = self.host_manager.get_weighed_hosts(hosts,
            filter_properties)
        LOG.debug("Weighed %(hosts)s", {'hosts': weighed_hosts})
        scheduler_host_subset_size = CONF.scheduler_host_subset_size
        # The next two ifs clamp the subset size into
        # [1, len(weighed_hosts)], so the random.choice slice below can
        # never be empty or reach past the end of the list.
        if scheduler_host_subset_size > len(weighed_hosts):
            scheduler_host_subset_size = len(weighed_hosts)
        if scheduler_host_subset_size < 1:
            scheduler_host_subset_size = 1
        # Randomly pick one host from the best-weighed subset.
        chosen_host = random.choice(
            weighed_hosts[0:scheduler_host_subset_size])
        LOG.debug("Selected host: %(host)s", {'host': chosen_host})
        selected_hosts.append(chosen_host)
        # Now consume the resources so the filter/weights
        # will change for the next instance.
        chosen_host.obj.consume_from_instance(instance_properties)
        if update_group_hosts is True:
            if isinstance(filter_properties['group_hosts'], list):
                filter_properties['group_hosts'] = set(
                    filter_properties['group_hosts'])
            filter_properties['group_hosts'].add(chosen_host.obj.host)
    # After the loop has chosen a host for every instance (or run out
    # of candidates), return the list of selections.
    return selected_hosts
从上面的代码块了解到三个select的关键操作:
- get(_get_all_host_states):过滤可用机器
- filter(get_filtered_hosts):过滤符合条件的机器
- weigh(get_weighed_hosts):选出最优机器
继续往下挖。
scheduler/driver.py(从这里可以知道host_manager是依据配置文件动态导入的,而默认就是filter模式)
class Scheduler(object):
    """The base class that all Scheduler classes should inherit from."""

    def __init__(self):
        # The host manager implementation is imported dynamically based
        # on the ``scheduler_host_manager`` config option, so the choice
        # of manager (filter-based by default) is driven by configuration.
        self.host_manager = importutils.import_object(
            CONF.scheduler_host_manager)
        self.servicegroup_api = servicegroup.API()
去找定义函数的地方host_manager.py
def get_all_host_states(self, context):
    """Return an iterator of HostState objects, one per live compute node.

    Refreshes the in-memory ``host_state_map`` cache from the database
    records and prunes entries whose node is no longer present.
    """
    service_refs = {service.host: service
                    for service in objects.ServiceList.get_by_binary(
                        context, 'nova-compute')}
    # Fetch every compute-node resource record.
    # NOTE(review): this does hit the database, via
    # objects.ComputeNodeList.get_all -> db.compute_node_get_all.
    compute_nodes = objects.ComputeNodeList.get_all(context)
    seen_nodes = set()
    for compute in compute_nodes:
        service = service_refs.get(compute.host)
        if not service:
            LOG.warning(_LW(
                "No compute service record found for host %(host)s"),
                {'host': compute.host})
            continue
        host = compute.host
        node = compute.hypervisor_hostname
        state_key = (host, node)
        host_state = self.host_state_map.get(state_key)
        # Refresh an existing cached HostState from the freshly fetched
        # compute record, or create and cache a new one.  The update is
        # in-process memory only; nothing is written back to the DB here.
        if host_state:
            host_state.update_from_compute_node(compute)
        else:
            host_state = self.host_state_cls(host, node, compute=compute)
            self.host_state_map[state_key] = host_state
        # We force to update the aggregates info each time a new request
        # comes in, because some changes on the aggregates could have been
        # happening after setting this field for the first time
        host_state.aggregates = [self.aggs_by_id[agg_id] for agg_id in
                                 self.host_aggregates_map[
                                     host_state.host]]
        host_state.update_service(dict(service))
        self._add_instance_info(context, compute, host_state)
        seen_nodes.add(state_key)

    # remove compute nodes from host_state_map if they are not active
    # (i.e. they were not seen in the queries above).
    dead_nodes = set(self.host_state_map.keys()) - seen_nodes
    for state_key in dead_nodes:
        host, node = state_key
        LOG.info(_LI("Removing dead compute node %(host)s:%(node)s "
                     "from scheduler"), {'host': host, 'node': node})
        del self.host_state_map[state_key]

    return six.itervalues(self.host_state_map)
好了,get_all_host_states主要用来收集并刷新各计算节点的HostState,同时去除不活跃的节点。上面还留有一个疑问,compute到底有没有访问数据库?继续往下看
- compute_nodes = objects.ComputeNodeList.get_all(context)
移步nova/objects/compute_node.py,发现调用了db.compute_node_get_all
@base.remotable_classmethod
def get_all(cls, context):
    """Return a ComputeNodeList covering every compute-node DB record.

    Delegates directly to the DB API (db.compute_node_get_all), then
    wraps the rows into ComputeNode objects.
    """
    db_computes = db.compute_node_get_all(context)
    return base.obj_make_list(context, cls(context), objects.ComputeNode,
                              db_computes)
db函数的定义,至此,说明liberty版本的scheduler有权限去访问数据库,并没有发现从conductor去访问
def compute_node_get_all(context):
    """Return every non-deleted ComputeNode row from the database."""
    query = model_query(context, models.ComputeNode, read_deleted='no')
    return query.all()
那么除了读权限是否还有更高级权限呢,了解一下刚才那个主机信息update操作
- host_state.update_from_compute_node(compute)
def update_from_compute_node(self, compute):
    """Update information about a host from a ComputeNode object."""
    # Skip stale input: when our cached state is newer than the DB
    # record, there is nothing to learn from it.
    is_stale = (self.updated and compute.updated_at
                and self.updated > compute.updated_at)
    if is_stale:
        return

    usable_ram_mb = compute.memory_mb

    # Assume virtual size is all consumed by instances if use qcow2 disk.
    usable_disk_gb = compute.free_disk_gb
    available_least_gb = compute.disk_available_least
    if available_least_gb is not None:
        if available_least_gb > usable_disk_gb:
            # can occur when an instance in database is not on host
            LOG.warning(_LW("Host %(hostname)s has more disk space than "
                            "database expected "
                            "(%(physical)sgb > %(database)sgb)"),
                        {'physical': available_least_gb,
                         'database': usable_disk_gb,
                         'hostname': compute.hypervisor_hostname})
        usable_disk_gb = min(available_least_gb, usable_disk_gb)
    # NOTE(review): unused within this excerpt; presumably assigned to
    # self further down in the upstream method — confirm against source.
    free_disk_mb = usable_disk_gb * 1024

    self.disk_mb_used = compute.local_gb_used * 1024

    # NOTE(jogo) free_ram_mb can be negative
    self.free_ram_mb = compute.free_ram_mb
    self.total_usable_ram_mb = usable_ram_mb
    self.total_usable_disk_gb = compute.local_gb
从上面这块代码,看到并没有访问数据库,那是怎么来呢?
class HostState(object):
    """Mutable and immutable information tracked for a host.

    This is an attempt to remove the ad-hoc data structures
    previously used and lock down access.
    """

    def __init__(self, host, node, compute=None):
        # Identity of the compute service and hypervisor node tracked.
        self.host = host
        self.nodename = node

        # Mutable available resources.
        # These will change as resources are virtually "consumed".
        self.total_usable_ram_mb = 0
        self.total_usable_disk_gb = 0
        self.disk_mb_used = 0
        self.free_ram_mb = 0
        self.free_disk_mb = 0
        self.vcpus_total = 0
        self.vcpus_used = 0
        self.pci_stats = None
        self.numa_topology = None

        # Additional host information from the compute node stats:
        self.num_instances = 0
        self.num_io_ops = 0

        # Other information
        self.host_ip = None
        self.hypervisor_type = None
        self.hypervisor_version = None
        self.hypervisor_hostname = None
        self.cpu_info = None
        self.supported_instances = None
        # NOTE(review): all state above lives in plain instance
        # attributes (process memory), not in the database.  The
        # ``compute`` argument is unused in this excerpt; presumably the
        # upstream __init__ continues beyond these notes — confirm.
看下面的初始化值,说明这状态是记录在变量里的,再回头翻一下上面那句注释。
# We force to update the aggregates info each time a new request
# comes in, because some changes on the aggregates could have been
# happening after setting this field for the first time
大意是:每一次新请求进来时都强制更新主机聚合(aggregate)信息,因为在第一次设置这个字段之后,聚合信息可能已经发生了变化
恍然大悟,并没有走数据库去改变主机状态,而是把hoststate存在当前的进程内存里面。
scheduler服务会自己存了一份主机状态,同步主机信息(留待查看)
优点:不用去频繁对数据库进行io操作,存在内存里面会更快,另一方面考虑到安全性,可能在scheduler里限制了写操作(留待查看)
缺点:如果发生并发请求的时候,资源的竞争是否有缺陷(留着以后看)
接下来,还剩有两个关键步骤分析
filter(get_filtered_hosts)
def get_filtered_hosts(self, hosts, filter_properties,
                       filter_class_names=None, index=0):
    """Filter hosts and return only ones passing all filters."""

    # Several local helper functions are defined here first; they are
    # elided in these notes (the `。。。。` markers stand for omitted code).
    def _strip_ignore_hosts(host_map, hosts_to_ignore):
        ignored_hosts = []
        for host in hosts_to_ignore:
            。。。。

    # Resolve the validated, usable filter classes (from
    # filter_class_names or the configured defaults).
    filter_classes = self._choose_host_filters(filter_class_names)
    。。。。
    # Delegate the per-filter iteration to the filter handler.
    # NOTE(review): ``filters`` is built in the elided code above.
    return self.filter_handler.get_filtered_objects(filters,
                                                    hosts, filter_properties,
                                                    index)
继续打开get_filtered_objects
def get_filtered_objects(self, filters, objs, filter_properties, index=0):
    """Run *objs* through every filter in turn, returning the survivors.

    Returns None (implicitly) when a filter asks to stop filtering.
    """
    candidates = list(objs)
    LOG.debug("Starting with %d host(s)", len(candidates))
    part_filter_results = []
    full_filter_results = []
    log_msg = "%(cls_name)s: (start: %(start)s, end: %(end)s)"
    for filter_ in filters:
        # Each filter may opt out for this instance index.
        if not filter_.run_filter_for_index(index):
            continue
        cls_name = filter_.__class__.__name__
        count_before = len(candidates)
        # The key call: the filter prunes the current candidate set.
        survivors = filter_.filter_all(candidates, filter_properties)
        if survivors is None:
            LOG.debug("Filter %s says to stop filtering", cls_name)
            return
        candidates = list(survivors)
        count_after = len(candidates)
        part_filter_results.append(log_msg % {"cls_name": cls_name,
                                              "start": count_before,
                                              "end": count_after})
        if candidates:
            remaining = [(getattr(candidate, "host", candidate),
                          getattr(candidate, "nodename", ""))
                         for candidate in candidates]
            full_filter_results.append((cls_name, remaining))
    return candidates
objs的返回又调用了filter_.filter_all(list_objs, filter_properties)
def filter_all(self, filter_obj_list, filter_properties):
    """Lazily yield each object that passes this filter's check."""
    for candidate in filter_obj_list:
        if not self._filter_one(candidate, filter_properties):
            continue
        # Passed the per-object check: hand it to the caller.
        yield candidate
继续调用_filter_one
def _filter_one(self, obj, filter_properties):
# 如果通过过滤器则返回TRUE,否则返回FALSE
return self.host_passes(obj, filter_properties)
filter完成了,继续weigh(get_weighed_hosts)