1. 基本信息
代码位于./pkg/kubelet/cm/devicemanager/,对于kubelet来说其包的入口为manager.go
2. DeviceManager
DeviceManager通过ManagerImpl的podDevices成员的状态向kubelet发送设备分配的进度和分配信息,通过allocatedDevices记录已分配的设备,通过healthyDevices-allocatedDevices生成当前空闲的设备列表。
types.go定义了一个名为Manager的接口,其有如下几个成员,功能分别为:
// ManagerImpl is the structure in charge of managing Device Plugins.
type ManagerImpl struct {
socketname string
socketdir string
endpoints map[string]endpointInfo // Key is ResourceName
mutex sync.Mutex
server *grpc.Server
wg sync.WaitGroup
// activePods is a method for listing active pods on the node
// so the amount of pluginResources requested by existing pods
// could be counted when updating allocated devices
activePods ActivePodsFunc
// sourcesReady provides the readiness of kubelet configuration sources such as apiserver update readiness.
// We use it to determine when we can purge inactive pods from checkpointed state.
sourcesReady config.SourcesReady
// callback is used for updating devices' states in one time call.
// e.g. a new device is advertised, two old devices are deleted and a running device fails.
// It refreshes the state of every device that belongs to one resourceName.
callback monitorCallback
// healthyDevices contains all of the registered healthy resourceNames and their exported device IDs.
healthyDevices map[string]sets.String
// unhealthyDevices contains all of the unhealthy devices and their exported device IDs.
unhealthyDevices map[string]sets.String
// allocatedDevices contains allocated deviceIds, keyed by resourceName.
allocatedDevices map[string]sets.String
// podDevices contains pod to allocated device mapping.
// Declared as `type podDevices map[string]containerDevices`; it records which pod was given which devices.
podDevices podDevices
checkpointManager checkpointmanager.CheckpointManager
}
在manager中ManagerImpl实现了该接口,并在此基础之上定义了几个数据成员用来保存devicemanager的状态信息。
分别为
// ManagerImpl (annotated copy): the state the device manager keeps.
type ManagerImpl struct {
socketname string // file name of the manager socket (presumably kubelet.sock — confirm)
socketdir string // directory that holds kubelet.sock
// type endpointInfo struct {
// // defined in ./endpoint.go; an endpoint can run, stop, allocate, preStartContainer, etc.
// e endpoint
// // defined in the plugin API package; NOTE(review): said to carry a single option — confirm
// opts *pluginapi.DevicePluginOptions
// }
endpoints map[string]endpointInfo // every registered endpoint, keyed by resource name
mutex sync.Mutex // guards the shared maps: endpoints run in their own goroutines and call back into the manager
server *grpc.Server // gRPC server that serves plugin registration
wg sync.WaitGroup // NOTE(review): presumably used to wait for spawned goroutines to finish — confirm in Start/Stop
// activePods lists the active pods on the node.
activePods ActivePodsFunc
// sourcesReady reports the readiness of kubelet configuration sources (e.g. apiserver update readiness);
// it decides when inactive pods may be purged from checkpointed state.
sourcesReady config.SourcesReady
// callback refreshes the state of every device that belongs to one resource.
callback monitorCallback
// healthyDevices contains all of the registered healthy resourceNames and their exported device IDs.
healthyDevices map[string]sets.String
// unhealthyDevices contains all of the unhealthy devices and their exported device IDs.
unhealthyDevices map[string]sets.String
// allocatedDevices contains allocated deviceIds, keyed by resourceName.
allocatedDevices map[string]sets.String
// podDevices contains pod to allocated device mapping.
// Declared as `type podDevices map[string]containerDevices`; it records which pod was given which devices.
podDevices podDevices
// checkpointManager persists and restores checkpoints of the manager state.
checkpointManager checkpointmanager.CheckpointManager
}
首先看一下创建ManagerImpl实例的函数
// newManagerImpl builds a ManagerImpl (abridged excerpt: the `func` keyword and the
// code that derives `dir` and `file` from socketPath are elided by the article).
newManagerImpl(socketPath string) (*ManagerImpl, error) {
... // elided: per the article, checks whether a manager already exists
// Create the manager with empty endpoint, device and pod maps.
manager := &ManagerImpl{
endpoints: make(map[string]endpointInfo),
socketname: file,
socketdir: dir,
healthyDevices: make(map[string]sets.String),
unhealthyDevices: make(map[string]sets.String),
allocatedDevices: make(map[string]sets.String),
podDevices: make(podDevices),
}
// genericDeviceUpdateCallback resets the healthy/unhealthy sets of a resource,
// rebuilds them from the latest device list and then writes a checkpoint.
manager.callback = manager.genericDeviceUpdateCallback
// The following structs are populated with real implementations in manager.Start()
// Before that, initializes them to perform no-op operations.
manager.activePods = func() []*v1.Pod { return []*v1.Pod{} }
manager.sourcesReady = &sourcesReadyStub{} // stub until manager.Start() installs the real implementation
checkpointManager, err := checkpointmanager.NewCheckpointManager(dir)
if err != nil {
return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err)
}
manager.checkpointManager = checkpointManager
return manager, nil
}
这里特别注意一下manager.callback,该函数会被所有的endpoint调用,故用了互斥锁,从逻辑上我们可以看出其每次更新资源列表的时候都会清空之前的列表
// genericDeviceUpdateCallback replaces the recorded device state of resourceName
// with the snapshot in devices and then writes a checkpoint.
//
// Both the healthy and the unhealthy set of resourceName are rebuilt from scratch
// on every call, so a device missing from devices is dropped. A device whose
// Health equals pluginapi.Healthy goes into healthyDevices; every other device
// goes into unhealthyDevices.
//
// A checkpoint write failure is logged rather than silently discarded; the
// in-memory state is kept either way.
func (m *ManagerImpl) genericDeviceUpdateCallback(resourceName string, devices []pluginapi.Device) {
	m.mutex.Lock()
	m.healthyDevices[resourceName] = sets.NewString()
	m.unhealthyDevices[resourceName] = sets.NewString()
	for _, dev := range devices {
		if dev.Health == pluginapi.Healthy {
			m.healthyDevices[resourceName].Insert(dev.ID)
		} else {
			m.unhealthyDevices[resourceName].Insert(dev.ID)
		}
	}
	// The checkpoint is written after the mutex is released.
	m.mutex.Unlock()
	if err := m.writeCheckpoint(); err != nil {
		klog.Errorf("writing checkpoint for resource %q encountered %v", resourceName, err)
	}
}
Manager有一个Start函数,从名字上我们可以看出kubelet是调用该方法做device-manager的初始化的,其主要功能是创建kubelet.sock所在的路径并启动供plugin注册的grpc服务,该方法接受两个参数:获取所有活跃pod列表的函数,以及用于判断前置配置源是否准备好的sourcesReady。
3. Register()函数
通过pluginapi的grpc定义,我们可以知道plugin和devicemanager交互的开始是从Register函数开始的,那么ManagerImpl也一定会有一个Register函数:
// Register is the gRPC method a device plugin calls to register itself.
//
// It proceeds in three steps:
//  1. reject the request unless r.Version is one of pluginapi.SupportedVersions;
//  2. reject the request unless r.ResourceName is an extended resource name;
//  3. otherwise start addEndpoint for the request in a new goroutine and return
//     immediately, without waiting for the endpoint to be set up.
//
// An empty response is returned in every case; the error is non-nil only for a
// rejected request.
func (m *ManagerImpl) Register(ctx context.Context, r *pluginapi.RegisterRequest) (*pluginapi.Empty, error) {
	klog.Infof("Got registration request from device plugin with resource name %q", r.ResourceName)
	metrics.DevicePluginRegistrationCount.WithLabelValues(r.ResourceName).Inc()

	// Step 1: the plugin must speak a supported API version.
	var versionCompatible bool
	for _, v := range pluginapi.SupportedVersions {
		if r.Version == v {
			versionCompatible = true
			break
		}
	}
	if !versionCompatible {
		errorString := fmt.Sprintf(errUnsupportedVersion, r.Version, pluginapi.SupportedVersions)
		klog.Infof("Bad registration request from device plugin with resource name %q: %s", r.ResourceName, errorString)
		// errorString embeds plugin-supplied text, so it must never be used as
		// the format string itself: a '%' in it would corrupt the message.
		return &pluginapi.Empty{}, fmt.Errorf("%s", errorString)
	}

	// Step 2: the resource name must be a valid extended resource name.
	if !v1helper.IsExtendedResourceName(v1.ResourceName(r.ResourceName)) {
		errorString := fmt.Sprintf(errInvalidResourceName, r.ResourceName)
		klog.Infof("Bad registration request from device plugin: %s", errorString)
		return &pluginapi.Empty{}, fmt.Errorf("%s", errorString)
	}

	// TODO: for now, always accepts newest device plugin. Later may consider to
	// add some policies here, e.g., verify whether an old device plugin with the
	// same resource name is still alive to determine whether we want to accept
	// the new registration.

	// Step 3: set up the endpoint for this resource asynchronously.
	go m.addEndpoint(r)

	return &pluginapi.Empty{}, nil
}
这里用 go m.addEndpoint(r) 为每一个resource单独起了一个协程做处理,m.runEndpoint(r.ResourceName, new)也是单独起的协程
// addEndpoint sets up the endpoint for a newly registered device plugin:
//  1. newEndpointImpl creates the endpoint for the plugin socket found at
//     m.socketdir/r.Endpoint and hands it m.callback for device updates;
//  2. registerEndpoint records it for r.ResourceName together with r.Options;
//  3. runEndpoint is started in its own goroutine to drive the endpoint.
//
// If the endpoint cannot be created the failure is logged and nothing is registered.
func (m *ManagerImpl) addEndpoint(r *pluginapi.RegisterRequest) {
	pluginSocket := filepath.Join(m.socketdir, r.Endpoint)
	ep, err := newEndpointImpl(pluginSocket, r.ResourceName, m.callback)
	if err != nil {
		klog.Errorf("Failed to dial device plugin with request %v: %v", r, err)
		return
	}

	m.registerEndpoint(r.ResourceName, r.Options, ep)
	go m.runEndpoint(r.ResourceName, ep)
}
下面看一下m.runEndpoint(r.ResourceName, new)
// runEndpoint drives endpoint e for resourceName until it terminates.
//
// e.run() is called first and e.stop() once it has returned. Afterwards, under
// the manager mutex, the resource is marked unhealthy, but only if e is still
// the endpoint registered for resourceName. The closing log line is emitted
// regardless of that check.
func (m *ManagerImpl) runEndpoint(resourceName string, e endpoint) {
	e.run()
	// NOTE(review): per the article, stop only closes the gRPC connection to the plugin — confirm in endpoint.go.
	e.stop()

	m.mutex.Lock()
	defer m.mutex.Unlock()

	registered, found := m.endpoints[resourceName]
	stillCurrent := found && registered.e == e
	if stillCurrent {
		m.markResourceUnhealthy(resourceName)
	}

	klog.V(2).Infof("Endpoint (%s, %v) became unhealthy", resourceName, e)
}
这里e.run()实际上调用了该resource对应plugin的listandwatch函数,并且持续监控资源最新的信息,每当资源有更新时便调用m.callback更新device-manager的资源列表,包括m.healthyDevices[resourceName]和m.unhealthyDevices[resourceName],直到出错返回调用e.stop()并标定该资源的所有设备健康状态为不健康。
// run opens the plugin's ListAndWatch stream and forwards every device list
// received on it to e.callback, tagged with e.resourceName.
//
// It returns only when opening the stream or receiving from it fails; the
// failure is logged with errListAndWatch. Each received message is passed on
// as a complete device list.
func (e *endpointImpl) run() {
	watch, err := e.client.ListAndWatch(context.Background(), &pluginapi.Empty{})
	if err != nil {
		klog.Errorf(errListAndWatch, e.resourceName, err)
		return
	}

	for {
		update, recvErr := watch.Recv()
		if recvErr != nil {
			klog.Errorf(errListAndWatch, e.resourceName, recvErr)
			return
		}
		klog.V(2).Infof("State pushed for device plugin %s", e.resourceName)

		// Convert the received []*Device into the []Device the callback expects.
		var snapshot []pluginapi.Device
		for i := range update.Devices {
			snapshot = append(snapshot, *update.Devices[i])
		}
		e.callback(e.resourceName, snapshot)
	}
}
e.callback(e.resourceName, newDevs)实际上是调用了func (m *ManagerImpl) genericDeviceUpdateCallback(resourceName string, devices []pluginapi.Device)
// genericDeviceUpdateCallback replaces the recorded device state of resourceName
// with the snapshot in devices and then writes a checkpoint.
//
// Both the healthy and the unhealthy set of resourceName are rebuilt from scratch
// on every call, so a device missing from devices is dropped. A device whose
// Health equals pluginapi.Healthy goes into healthyDevices; every other device
// goes into unhealthyDevices.
//
// A checkpoint write failure is logged rather than silently discarded; the
// in-memory state is kept either way.
func (m *ManagerImpl) genericDeviceUpdateCallback(resourceName string, devices []pluginapi.Device) {
	m.mutex.Lock()
	m.healthyDevices[resourceName] = sets.NewString()
	m.unhealthyDevices[resourceName] = sets.NewString()
	for _, dev := range devices {
		if dev.Health == pluginapi.Healthy {
			m.healthyDevices[resourceName].Insert(dev.ID)
		} else {
			m.unhealthyDevices[resourceName].Insert(dev.ID)
		}
	}
	// The checkpoint is written after the mutex is released.
	m.mutex.Unlock()
	if err := m.writeCheckpoint(); err != nil {
		klog.Errorf("writing checkpoint for resource %q encountered %v", resourceName, err)
	}
}
从上述过程我们可以看到,manager和endpoint是一对多的关系,endpoint和plugin是一对一的关系,通过每个resource绑定到一个单独协程的endpoint上,每当plugin状态更新时通过访问manager的临界区代码callback回调实现manager的信息统计,充分利用了go语言高并发的特性,也保证了每个plugin的信息反馈可以得到及时的处理。
至此Register的实现结束,流程图如下:
4. Allocate() 函数
Allocate()方法的作用是在pod被准入(admit)到本节点时为其分配device。从后面函数的分析我们可以知道,kubelet拿到的设备需求信息十分简单,就是container的limits,实际上pod真正挂载哪些device的策略是由kubelet的device-manager决定的,scheduler只需要知道pod需要几个什么类型的设备就可以了。
首先它声明了一个devicesToReuse,这个map保存可重用的设备。为什么要声明这样一个map呢?原因是k8s支持在pod中创建init-container,init-container可以有多个,先于container执行,每个init-container按顺序依次执行完毕后container才会开始创建,而在为container或init-container分配设备的时候会优先利用devicesToReuse中的设备,代码作者可能是想利用这样的特性避免资源浪费。但是分析逻辑可以看到,如果devicesToReuse中的设备数量大于container所需的设备数量,那么这些多余的设备在pod运行结束之前无法被其他pod重用(这一点有待验证)。
所以我们可以在代码中看到,它首先为init-container分配设备,并且将这些设备加入devicesToReuse,以便被下一个init-container重用;当所有init-container处理完成以后,会将这些可重用设备交给container继续使用。最后更新nodeinfo。
// Allocate is the call that you can use to allocate a set of devices
// from the registered device plugins.
//
// attrs carries the pod being admitted. For reference:
// PodAdmitAttributes is the context for a pod admission decision.
// type PodAdmitAttributes struct {
// // the pod to evaluate for admission
// Pod *v1.Pod
// // all pods bound to the kubelet excluding the pod being evaluated
// OtherPods []*v1.Pod
// }
// Devices are allocated for every init container first and then for every
// regular container of attrs.Pod; the first allocation error is returned.
// If the pod ends up with an entry in m.podDevices, node is passed to
// sanitizeNodeAllocatable before returning.
func (m *ManagerImpl) Allocate(node *schedulercache.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
pod := attrs.Pod
devicesToReuse := make(map[string]sets.String) // devices that later containers of this pod may reuse
// Init containers run one at a time before the regular containers, so the
// devices given to one of them can be handed to whatever runs next.
for _, container := range pod.Spec.InitContainers {
// allocateContainerResources asks the device plugin(s) for the devices this init container needs.
if err := m.allocateContainerResources(pod, &container, devicesToReuse); err != nil {
return err
}
// Merge the devices recorded for this init container into devicesToReuse.
// Each init container must finish before the next one starts, so the same
// device may be handed to several init containers in turn.
m.podDevices.addContainerAllocatedResources(string(pod.UID), container.Name, devicesToReuse)
}
// Regular containers: allocate, then take the devices now recorded for the
// container out of the reusable pool.
for _, container := range pod.Spec.Containers {
if err := m.allocateContainerResources(pod, &container, devicesToReuse); err != nil {
return err
}
m.podDevices.removeContainerAllocatedResources(string(pod.UID), container.Name, devicesToReuse)
}
m.mutex.Lock()
defer m.mutex.Unlock()
// quick return if no pluginResources requested
if _, podRequireDevicePluginResource := m.podDevices[string(pod.UID)]; !podRequireDevicePluginResource {
return nil
}
m.sanitizeNodeAllocatable(node)
return nil
}
然后我们看一下其中的几个关键函数,首先是m.allocateContainerResources(pod, &container, devicesToReuse),该函数尝试和plugin做通信并为这个pod的某个container/init-container分配设备,最终的结果是更新了manager的podDevices成员信息,该成员包含着该node下所有pod包含的所有container各个resource的分配数量和分配方式,例如env,mount等等。
最后写入检查点保存当前状态。
// allocateContainerResources attempts to allocate all of required device
// plugin resources for the input container, issues an Allocate rpc request
// for each new device resource requirement, processes their AllocateResponses,
// and updates the cached containerDevices on success.
//
// pod and container identify the container being served. devicesToReuse maps a
// resource name to devices that may be reused, and is consulted before free
// devices. On success the allocation and the plugin's response are stored in
// m.podDevices and a checkpoint is written.
//
// On every failure after devices were reserved (unknown endpoint, failed rpc,
// empty rpc response) m.allocatedDevices is rebuilt from m.podDevices, so the
// reservation made by devicesToAllocate does not leak.
func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Container, devicesToReuse map[string]sets.String) error {
	podUID := string(pod.UID)
	contName := container.Name
	allocatedDevicesUpdated := false
	// Extended resources are not allowed to be overcommitted.
	// Since device plugin advertises extended resources,
	// therefore Requests must be equal to Limits and iterating
	// over the Limits should be sufficient.
	// Limits is the field a pod spec uses to request extended resources, e.g.
	//   resources:
	//     limits:
	//       nvidia.com/gpu: 1
	// so k is the resource name and v the requested quantity.
	for k, v := range container.Resources.Limits {
		resource := string(k)
		needed := int(v.Value())
		klog.V(3).Infof("needs %d %s", needed, resource)
		// Skip anything that is not a device plugin resource.
		if !m.isDevicePluginResource(resource) {
			continue
		}
		// Updates allocatedDevices to garbage collect any stranded resources
		// before doing the device plugin allocation.
		// Done at most once per call: pods that are no longer active are removed
		// from m.podDevices, which frees their devices.
		if !allocatedDevicesUpdated {
			m.updateAllocatedDevices(m.activePods())
			allocatedDevicesUpdated = true
		}
		// Pick the devices for this container and reserve them in m.allocatedDevices.
		allocDevices, err := m.devicesToAllocate(podUID, contName, resource, needed, devicesToReuse[resource])
		if err != nil {
			return err
		}
		// Nothing to request (len of a nil set is 0, so one check suffices).
		if len(allocDevices) == 0 {
			continue
		}

		startRPCTime := time.Now()
		// Manager.Allocate involves RPC calls to device plugin, which
		// could be heavy-weight. Therefore we want to perform this operation outside
		// mutex lock. Note if Allocate call fails, we may leave container resources
		// partially allocated for the failed container. We rely on updateAllocatedDevices()
		// to garbage collect these resources later. Another side effect is that if
		// we have X resource A and Y resource B in total, and two containers, container1
		// and container2 both require X resource A and Y resource B. Both allocation
		// requests may fail if we serve them in mixed order.
		// TODO: may revisit this part later if we see inefficient resource allocation
		// in real use as the result of this. Should also consider to parallize device
		// plugin Allocate grpc calls if it becomes common that a container may require
		// resources from multiple device plugins.
		m.mutex.Lock()
		// The endpoint holds the client used to talk to the device plugin.
		eI, ok := m.endpoints[resource]
		m.mutex.Unlock()
		if !ok {
			// No endpoint is registered for this resource: undo the reservation.
			m.mutex.Lock()
			m.allocatedDevices = m.podDevices.devices()
			m.mutex.Unlock()
			return fmt.Errorf("Unknown Device Plugin %s", resource)
		}

		devs := allocDevices.UnsortedList()
		// TODO: refactor this part of code to just append a ContainerAllocationRequest
		// in a passed in AllocateRequest pointer, and issues a single Allocate call per pod.
		klog.V(3).Infof("Making allocation request for devices %v for device plugin %s", devs, resource)
		// Ask the plugin to allocate the chosen devices.
		resp, err := eI.e.allocate(devs)
		metrics.DevicePluginAllocationLatency.WithLabelValues(resource).Observe(metrics.SinceInMicroseconds(startRPCTime))
		if err != nil {
			// In case of allocation failure, we want to restore m.allocatedDevices
			// to the actual allocated state from m.podDevices.
			m.mutex.Lock()
			m.allocatedDevices = m.podDevices.devices()
			m.mutex.Unlock()
			return err
		}

		if len(resp.ContainerResponses) == 0 {
			// An empty response is a failed allocation as well: restore
			// m.allocatedDevices just like the rpc error path does, otherwise the
			// devices reserved above would stay marked as in use.
			m.mutex.Lock()
			m.allocatedDevices = m.podDevices.devices()
			m.mutex.Unlock()
			return fmt.Errorf("No containers return in allocation response %v", resp)
		}

		// Update internal cached podDevices state.
		m.mutex.Lock()
		m.podDevices.insert(podUID, contName, resource, allocDevices, resp.ContainerResponses[0])
		m.mutex.Unlock()
	}

	// Checkpoints device to container allocation information.
	return m.writeCheckpoint()
}
updateAllocatedDevices(activePods []*v1.Pod)函数的功能是从m.podDevices中删除所有已不在活跃pod列表中的pod(即处于终结状态的pod),并回收其设备,反映到代码上是用m.podDevices.devices()重新生成一份已分配设备列表。
// updateAllocatedDevices gets a list of active pods and then frees any Devices
// that are bound to pods which are no longer in that list.
//
// It does nothing until m.sourcesReady reports all sources ready. Pods tracked
// in m.podDevices but absent from activePods are deleted from m.podDevices, and
// m.allocatedDevices is then regenerated from what remains.
func (m *ManagerImpl) updateAllocatedDevices(activePods []*v1.Pod) {
	if !m.sourcesReady.AllReady() {
		return
	}

	m.mutex.Lock()
	defer m.mutex.Unlock()

	alive := sets.NewString()
	for i := range activePods {
		alive.Insert(string(activePods[i].UID))
	}

	// stale = pods holding devices minus pods that are still active.
	stale := m.podDevices.pods().Difference(alive)
	if stale.Len() == 0 {
		return
	}

	staleUIDs := stale.List()
	klog.V(3).Infof("pods to be removed: %v", staleUIDs)
	m.podDevices.delete(staleUIDs)

	// Regenerated allocatedDevices after we update pod allocation information.
	m.allocatedDevices = m.podDevices.devices()
}
m.devicesToAllocate(podUID, contName, resource, needed, devicesToReuse[resource])函数用来生成需要向plugin请求的设备列表:如果该container已经分配过设备(例如container重启)或者没有设备需求,则返回nil,调用方不再向plugin发起Allocate请求;如果可重用设备已经够用,则直接返回这些可重用设备,不再占用新的空闲设备;否则再从空闲设备中补足,最后由调用方通过grpc向plugin申请分配返回的设备。
设备分配的逻辑是首先看container中是否已经分配了设备,如果设备够用则返回nil,否则查看reusableDevices,取出里面的设备分配,否则根据最终缺少的设备量返回healthdevice - inusedevice(m.allocatedDevices[resource]),中的前needed个,这便是其分配设备的策略。
// devicesToAllocate picks the devices of resource that <podUID, contName> should get.
// Returns list of device Ids we need to allocate with Allocate rpc call.
// Returns empty list in case we don't need to issue the Allocate rpc call.
//
// Selection order: devices already recorded for the container (e.g. after a
// container restart) satisfy the request as-is; otherwise reusableDevices are
// taken first and any remainder comes from healthy devices that are not yet in
// m.allocatedDevices. Newly picked free devices are inserted into
// m.allocatedDevices before returning. The whole function runs under m.mutex.
func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, required int, reusableDevices sets.String) (sets.String, error) {
m.mutex.Lock()
defer m.mutex.Unlock()
needed := required
// Gets list of devices that have already been allocated.
// This can happen if a container restarts for example.
// In that case the container keeps the devices it already has.
devices := m.podDevices.containerDevices(podUID, contName, resource)
if devices != nil {
klog.V(3).Infof("Found pre-allocated devices for resource %s container %q in Pod %q: %v", resource, contName, podUID, devices.List())
needed = needed - devices.Len()
// A pod's resource is not expected to change once admitted by the API server,
// so just fail loudly here. We can revisit this part if this no longer holds.
if needed != 0 {
return nil, fmt.Errorf("pod %q container %q changed request for resource %q from %d to %d", podUID, contName, resource, devices.Len(), required)
}
}
if needed == 0 {
// No change, no work.
return nil, nil
}
klog.V(3).Infof("Needs to allocate %d %q for pod %q container %q", needed, resource, podUID, contName)
// Needs to allocate additional devices.
if _, ok := m.healthyDevices[resource]; !ok {
return nil, fmt.Errorf("can't allocate unregistered device %s", resource)
}
devices = sets.NewString()
// Allocates from reusableDevices list first.
// Reusable devices are not inserted into m.allocatedDevices here.
for device := range reusableDevices {
devices.Insert(device)
needed--
if needed == 0 {
return devices, nil
}
}
// Needs to allocate additional devices.
if m.allocatedDevices[resource] == nil {
m.allocatedDevices[resource] = sets.NewString()
}
// Gets Devices in use.
devicesInUse := m.allocatedDevices[resource]
// Gets a list of available devices.
// available = healthy devices minus devices in use, recomputed on every call,
// so devices freed from removed pods become usable again.
available := m.healthyDevices[resource].Difference(devicesInUse)
if int(available.Len()) < needed {
return nil, fmt.Errorf("requested number of devices unavailable for %s. Requested: %d, Available: %d", resource, needed, available.Len())
}
// Simply take the first `needed` entries of the unsorted list; this is the
// place to change for a custom device selection policy.
allocated := available.UnsortedList()[:needed]
// Updates m.allocatedDevices with allocated devices to prevent them
// from being allocated to other pods/containers, given that we are
// not holding lock during the rpc call.
for _, device := range allocated {
m.allocatedDevices[resource].Insert(device)
devices.Insert(device)
}
// Return the chosen device set.
return devices, nil
}
然后是m.podDevices.addContainerAllocatedResources(string(pod.UID), container.Name, devicesToReuse)这个函数。
该函数实现将init-container中分配的设备不断的加入可重用设备中。
// addContainerAllocatedResources populates allocatedResources with the device
// resources allocated to the specified <podUID, contName>.
//
// For every resource recorded for that container, its device IDs are unioned
// into allocatedResources[resource]. allocatedResources is left untouched when
// the pod or the container is not tracked in pdev.
func (pdev podDevices) addContainerAllocatedResources(podUID, contName string, allocatedResources map[string]sets.String) {
	// Indexing through a missing pod yields a nil inner map, so one lookup
	// covers both "unknown pod" and "unknown container".
	byResource, tracked := pdev[podUID][contName]
	if !tracked {
		return
	}
	for name, info := range byResource {
		merged := allocatedResources[name].Union(info.deviceIds)
		allocatedResources[name] = merged
	}
}
然后是m.podDevices.removeContainerAllocatedResources(string(pod.UID), container.Name, devicesToReuse)函数,其实现了将从可重用设备列表删除本次分配给该container的设备。但是如果init-container生成的devicesToReuse数量大于container的所需设备那么这些多余的设备便还是处于已分配状态,故其他的pod便无法利用这个设备,直到该pod进入终结状态后才会被释放。举个例子,如果init-container每次请求4个gpu而container总共请求2个,那么剩余的两个gpu会被认为还在这个pod中被利用,为已分配状态,那么这个pod便会在运行时占用4个gpu,但是实际上他只需要2个,另外两个在pod终结前一直处于空闲状态而其他的pod无法再被分配这两块gpu。
// removeContainerAllocatedResources removes the device resources allocated to
// the specified <podUID, contName> from allocatedResources.
//
// For every resource recorded for that container, allocatedResources[resource]
// is replaced by its set difference with the container's device IDs.
// allocatedResources is left untouched when the pod or the container is not
// tracked in pdev.
func (pdev podDevices) removeContainerAllocatedResources(podUID, contName string, allocatedResources map[string]sets.String) {
	// Indexing through a missing pod yields a nil inner map, so one lookup
	// covers both "unknown pod" and "unknown container".
	byResource, tracked := pdev[podUID][contName]
	if !tracked {
		return
	}
	for name, info := range byResource {
		remaining := allocatedResources[name].Difference(info.deviceIds)
		allocatedResources[name] = remaining
	}
}
最后是m.sanitizeNodeAllocatable(node),该函数实现将最新的资源申请状况同步到node上去。
// sanitizeNodeAllocatable scans through allocatedDevices in the device manager
// and if necessary, updates allocatableResource in nodeInfo to at least equal to
// the allocated capacity. This allows pods that have already been scheduled on
// the node to pass GeneralPredicates admission checking even upon device plugin failure.
//
// A resource is raised only when it is missing from the node's scalar resources
// or its recorded quantity is smaller than the number of allocated devices.
// node is updated only when at least one resource had to be raised.
func (m *ManagerImpl) sanitizeNodeAllocatable(node *schedulercache.NodeInfo) {
	current := node.AllocatableResource()
	if current.ScalarResources == nil {
		current.ScalarResources = make(map[v1.ResourceName]int64)
	}

	// raised stays nil until the first resource needs a bump; it is cloned lazily.
	var raised *schedulercache.Resource
	for resource, devices := range m.allocatedDevices {
		name := v1.ResourceName(resource)
		inUse := devices.Len()
		if advertised, known := current.ScalarResources[name]; known && int(advertised) >= inUse {
			continue
		}
		// Needs to update nodeInfo.AllocatableResource to make sure
		// NodeInfo.allocatableResource at least equal to the capacity already allocated.
		if raised == nil {
			raised = current.Clone()
		}
		raised.ScalarResources[name] = int64(inUse)
	}

	if raised == nil {
		return
	}
	node.SetAllocatableResource(raised)
}
至此Allocate()函数分析完毕,流程图如下:
5. kubelet和scheduler的资源调度
上面我们分析了kubernetes如何注册设备插件、使用插件分配设备以及维护现有的设备信息,那kubelet是如何获取这些信息的,scheduler是如何获取信息并且感知调度的呢?
kubelet对于当前节点设备信息的获取依赖func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
该函数返回三个值,分别为当前node上设备总量、健康设备总量和删除资源,分别叫做capacity, allocatable, deletedResources,删除资源的意思是这个resource的endpoint不健康了,所以需要报告给master节点处理。Node.Status.Capacity和Node.Status.Allocatable记录了节点全部资源,包括可用于分配的资源。
kubelet调用该函数的传递关系大致是:kubelet有一个ContainerManager的类型,ContainerManager有一个接口函数,containerManagerImpl是通过devicemanager.ManagerImpl实现设备插件资源管理的。(调用关系参考https://www.kubernetes.org.cn/4391.html)
// GetCapacity is expected to be called when Kubelet updates its node status.
// The first returned variable contains the registered device plugin resource capacity.
// The second returned variable contains the registered device plugin resource allocatable.
// The third returned variable contains previously registered resources that are no longer active.
// Kubelet uses this information to update resource capacity/allocatable in its node status.
// After the call, device plugin can remove the inactive resources from its internal list as the
// change is already reflected in Kubelet node status.
// Note in the special case after Kubelet restarts, device plugin resource capacities can
// temporarily drop to zero till corresponding device plugins re-register. This is OK because
// cm.UpdatePluginResource() run during predicate Admit guarantees we adjust nodeinfo
// capacity for already allocated pods so that they can continue to run. However, new pods
// requiring device plugin resources will not be scheduled till device plugin re-registers.
//
// For each resource: allocatable is the number of healthy devices and capacity
// is healthy plus unhealthy devices. A resource whose endpoint is missing, or
// whose stop grace period has expired, is removed from the manager and reported
// as deleted. When anything was removed a checkpoint is written; a failure to
// write it is logged.
func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string) {
	needsUpdateCheckpoint := false
	var capacity = v1.ResourceList{}
	var allocatable = v1.ResourceList{}
	deletedResources := sets.NewString()
	m.mutex.Lock()
	// Healthy devices count towards both capacity and allocatable.
	for resourceName, devices := range m.healthyDevices {
		eI, ok := m.endpoints[resourceName]
		if !ok || eI.e.stopGracePeriodExpired() {
			// The resources contained in endpoints and (un)healthyDevices
			// should always be consistent. Otherwise, we run with the risk
			// of failing to garbage collect non-existing resources or devices.
			if !ok {
				klog.Errorf("unexpected: healthyDevices and endpoints are out of sync")
			}
			delete(m.endpoints, resourceName)
			delete(m.healthyDevices, resourceName)
			deletedResources.Insert(resourceName)
			needsUpdateCheckpoint = true
		} else {
			capacity[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(devices.Len()), resource.DecimalSI)
			allocatable[v1.ResourceName(resourceName)] = *resource.NewQuantity(int64(devices.Len()), resource.DecimalSI)
		}
	}
	// Unhealthy devices count towards capacity only.
	for resourceName, devices := range m.unhealthyDevices {
		eI, ok := m.endpoints[resourceName]
		if !ok || eI.e.stopGracePeriodExpired() {
			// The pass over healthyDevices above may already have removed this
			// resource's endpoint in this very call; a missing endpoint is only
			// unexpected when that did not happen.
			if !ok && !deletedResources.Has(resourceName) {
				klog.Errorf("unexpected: unhealthyDevices and endpoints are out of sync")
			}
			delete(m.endpoints, resourceName)
			delete(m.unhealthyDevices, resourceName)
			deletedResources.Insert(resourceName)
			needsUpdateCheckpoint = true
		} else {
			capacityCount := capacity[v1.ResourceName(resourceName)]
			unhealthyCount := *resource.NewQuantity(int64(devices.Len()), resource.DecimalSI)
			capacityCount.Add(unhealthyCount)
			capacity[v1.ResourceName(resourceName)] = capacityCount
		}
	}
	// The checkpoint is written after the mutex is released.
	m.mutex.Unlock()
	if needsUpdateCheckpoint {
		if err := m.writeCheckpoint(); err != nil {
			klog.Errorf("writing checkpoint after removing inactive resources encountered %v", err)
		}
	}
	// capacity, allocatable, and the names of the removed resources.
	return capacity, allocatable, deletedResources.UnsortedList()
}
那么scheduler是如何得知当前节点有多少设备可以被分配呢,还记得Allocate()有一个函数叫sanitizeNodeAllocatable(node *schedulercache.NodeInfo)吗,scheduler便是通过经由该函数维护的schedulercache.NodeInfo.ScalarResources[v1.ResourceName(resource)]获取当前节点的可用设备的。
总结一下,kubelet通过GetCapacity()获取当前节点所有资源信息,Scheduler通过NodeInfo获取资源可用量,以此为依据进行调度。