diff --git a/cmd/compute-domain-controller/computedomain.go b/cmd/compute-domain-controller/computedomain.go index eac709666..9fab203e4 100644 --- a/cmd/compute-domain-controller/computedomain.go +++ b/cmd/compute-domain-controller/computedomain.go @@ -55,7 +55,7 @@ type ComputeDomainManager struct { factory nvinformers.SharedInformerFactory informer cache.SharedIndexInformer - deploymentManager *DeploymentManager + DaemonSetManager *DaemonSetManager resourceClaimTemplateManager *WorkloadResourceClaimTemplateManager } @@ -69,7 +69,7 @@ func NewComputeDomainManager(config *ManagerConfig) *ComputeDomainManager { factory: factory, informer: informer, } - m.deploymentManager = NewDeploymentManager(config, m.Get) + m.DaemonSetManager = NewDaemonSetManager(config, m.Get) m.resourceClaimTemplateManager = NewWorkloadResourceClaimTemplateManager(config) return m @@ -117,8 +117,8 @@ func (m *ComputeDomainManager) Start(ctx context.Context) (rerr error) { return fmt.Errorf("informer cache sync for ComputeDomains failed") } - if err := m.deploymentManager.Start(ctx); err != nil { - return fmt.Errorf("error starting Deployment manager: %w", err) + if err := m.DaemonSetManager.Start(ctx); err != nil { + return fmt.Errorf("error starting DaemonSet manager: %w", err) } if err := m.resourceClaimTemplateManager.Start(ctx); err != nil { @@ -129,8 +129,8 @@ func (m *ComputeDomainManager) Start(ctx context.Context) (rerr error) { } func (m *ComputeDomainManager) Stop() error { - if err := m.deploymentManager.Stop(); err != nil { - return fmt.Errorf("error stopping Deployment manager: %w", err) + if err := m.DaemonSetManager.Stop(); err != nil { + return fmt.Errorf("error stopping DaemonSet manager: %w", err) } if err := m.resourceClaimTemplateManager.Stop(); err != nil { return fmt.Errorf("error stopping ResourceClaim manager: %w", err) @@ -220,8 +220,8 @@ func (m *ComputeDomainManager) onAddOrUpdate(ctx context.Context, obj any) error return fmt.Errorf("error deleting ResourceClaimTemplate: %w", err) } - if err := m.deploymentManager.Delete(ctx, string(cd.UID)); err != nil { - return fmt.Errorf("error deleting Deployment: %w", err) + if err := m.DaemonSetManager.Delete(ctx, string(cd.UID)); err != nil { + return fmt.Errorf("error deleting DaemonSet: %w", err) } // TODO: Condition the removal of these finalizers on there being no @@ -238,12 +238,12 @@ func (m *ComputeDomainManager) onAddOrUpdate(ctx context.Context, obj any) error return fmt.Errorf("error asserting removal of ResourceClaimTemplate: %w", err) } - if err := m.deploymentManager.RemoveFinalizer(ctx, string(cd.UID)); err != nil { - return fmt.Errorf("error removing finalizer on Deployment: %w", err) + if err := m.DaemonSetManager.RemoveFinalizer(ctx, string(cd.UID)); err != nil { + return fmt.Errorf("error removing finalizer on DaemonSet: %w", err) } - if err := m.deploymentManager.AssertRemoved(ctx, string(cd.UID)); err != nil { - return fmt.Errorf("error asserting removal of Deployment: %w", err) + if err := m.DaemonSetManager.AssertRemoved(ctx, string(cd.UID)); err != nil { + return fmt.Errorf("error asserting removal of DaemonSet: %w", err) } if err := m.RemoveFinalizer(ctx, string(cd.UID)); err != nil { @@ -258,8 +258,8 @@ func (m *ComputeDomainManager) onAddOrUpdate(ctx context.Context, obj any) error return fmt.Errorf("error adding finalizer: %w", err) } - if _, err := m.deploymentManager.Create(ctx, m.config.driverNamespace, cd); err != nil { - return fmt.Errorf("error creating Deployment: %w", err) + if _, err := m.DaemonSetManager.Create(ctx, m.config.driverNamespace, cd); err != nil { + return fmt.Errorf("error creating DaemonSet: %w", err) } if _, err := m.resourceClaimTemplateManager.Create(ctx, cd.Namespace, cd.Spec.ResourceClaimTemplate.Name, cd); err != nil { diff --git a/cmd/compute-domain-controller/deployment.go b/cmd/compute-domain-controller/daemonset.go similarity index 65% rename from cmd/compute-domain-controller/deployment.go rename to cmd/compute-domain-controller/daemonset.go index 17831f6d1..a3eeab069 100644 --- a/cmd/compute-domain-controller/deployment.go +++ b/cmd/compute-domain-controller/daemonset.go @@ -2,7 +2,7 @@ * Copyright (c) 2025 NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. +NumberReady * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 @@ -12,7 +12,7 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. - */ +*/ package main @@ -38,20 +38,19 @@ import ( ) const ( - DeploymentTemplatePath = "/templates/compute-domain-daemon.tmpl.yaml" + DaemonSetTemplatePath = "/templates/compute-domain-daemon.tmpl.yaml" ) -type DeploymentTemplateData struct { +type DaemonSetTemplateData struct { Namespace string GenerateName string Finalizer string ComputeDomainLabelKey string ComputeDomainLabelValue types.UID - Replicas int ResourceClaimTemplateName string } -type DeploymentManager struct { +type DaemonSetManager struct { sync.Mutex config *ManagerConfig @@ -62,11 +61,11 @@ type DeploymentManager struct { factory informers.SharedInformerFactory informer cache.SharedIndexInformer - resourceClaimTemplateManager *DeploymentResourceClaimTemplateManager - podManagers map[string]*DeploymentPodManager + resourceClaimTemplateManager *DaemonSetResourceClaimTemplateManager + podManagers map[string]*DaemonSetPodManager } -func NewDeploymentManager(config *ManagerConfig, getComputeDomain GetComputeDomainFunc) *DeploymentManager { +func NewDaemonSetManager(config *ManagerConfig, getComputeDomain GetComputeDomainFunc) *DaemonSetManager { labelSelector := &metav1.LabelSelector{ MatchExpressions: []metav1.LabelSelectorRequirement{ { @@ -85,33 +84,33 @@ func NewDeploymentManager(config *ManagerConfig, getComputeDomain GetComputeDoma }), ) - informer := factory.Apps().V1().Deployments().Informer() + informer := factory.Apps().V1().DaemonSets().Informer() - m := &DeploymentManager{ + m := &DaemonSetManager{ config: config, getComputeDomain: getComputeDomain, factory: factory, informer: informer, - podManagers: make(map[string]*DeploymentPodManager), + podManagers: make(map[string]*DaemonSetPodManager), } - m.resourceClaimTemplateManager = NewDeploymentResourceClaimTemplateManager(config) + m.resourceClaimTemplateManager = NewDaemonSetResourceClaimTemplateManager(config) return m } -func (m *DeploymentManager) Start(ctx context.Context) (rerr error) { +func (m *DaemonSetManager) Start(ctx context.Context) (rerr error) { ctx, cancel := context.WithCancel(ctx) m.cancelContext = cancel defer func() { if rerr != nil { if err := m.Stop(); err != nil { - klog.Errorf("error stopping Deployment manager: %v", err) + klog.Errorf("error stopping DaemonSet manager: %v", err) } } }() - if err := addComputeDomainLabelIndexer[*appsv1.Deployment](m.informer); err != nil { + if err := addComputeDomainLabelIndexer[*appsv1.DaemonSet](m.informer); err != nil { return fmt.Errorf("error adding indexer for MulitNodeEnvironment label: %w", err) } @@ -124,7 +123,7 @@ func (m *DeploymentManager) Start(ctx context.Context) (rerr error) { }, }) if err != nil { - return fmt.Errorf("error adding event handlers for Deployment informer: %w", err) + return fmt.Errorf("error adding event handlers for DaemonSet informer: %w", err) } m.waitGroup.Add(1) @@ -134,7 +133,7 @@ func (m *DeploymentManager) Start(ctx context.Context) (rerr error) { }() if !cache.WaitForCacheSync(ctx.Done(), m.informer.HasSynced) { - return fmt.Errorf("informer cache sync for Deployment failed") + return fmt.Errorf("informer cache sync for DaemonSet failed") } if err := m.resourceClaimTemplateManager.Start(ctx); err != nil { @@ -144,7 +143,7 @@ func (m *DeploymentManager) Start(ctx context.Context) (rerr error) { return nil } -func (m *DeploymentManager) Stop() error { +func (m *DaemonSetManager) Stop() error { if err := m.removeAllPodManagers(); err != nil { return fmt.Errorf("error removing all Pod managers: %w", err) } @@ -153,13 +152,13 @@ func (m *DeploymentManager) Stop() error { return nil } -func (m *DeploymentManager) Create(ctx context.Context, namespace string, cd *nvapi.ComputeDomain) (*appsv1.Deployment, error) { - ds, err := getByComputeDomainUID[*appsv1.Deployment](ctx, m.informer, string(cd.UID)) +func (m *DaemonSetManager) Create(ctx context.Context, namespace string, cd *nvapi.ComputeDomain) (*appsv1.DaemonSet, error) { + ds, err := getByComputeDomainUID[*appsv1.DaemonSet](ctx, m.informer, string(cd.UID)) if err != nil { - return nil, fmt.Errorf("error retrieving Deployment: %w", err) + return nil, fmt.Errorf("error retrieving DaemonSet: %w", err) } if len(ds) > 1 { - return nil, fmt.Errorf("more than one Deployment found with same ComputeDomain UID") + return nil, fmt.Errorf("more than one DaemonSet found with same ComputeDomain UID") } if len(ds) == 1 { return ds[0], nil @@ -170,17 +169,16 @@ func (m *DeploymentManager) Create(ctx context.Context, namespace string, cd *nv return nil, fmt.Errorf("error creating ResourceClaimTemplate: %w", err) } - templateData := DeploymentTemplateData{ + templateData := DaemonSetTemplateData{ Namespace: m.config.driverNamespace, GenerateName: fmt.Sprintf("%s-", cd.Name), Finalizer: computeDomainFinalizer, ComputeDomainLabelKey: computeDomainLabelKey, ComputeDomainLabelValue: cd.UID, - Replicas: 0, ResourceClaimTemplateName: rct.Name, } - tmpl, err := template.ParseFiles(DeploymentTemplatePath) + tmpl, err := template.ParseFiles(DaemonSetTemplatePath) if err != nil { return nil, fmt.Errorf("failed to parse template file: %w", err) } @@ -196,27 +194,27 @@ func (m *DeploymentManager) Create(ctx context.Context, namespace string, cd *nv return nil, fmt.Errorf("failed to unmarshal yaml: %w", err) } - var deployment appsv1.Deployment + var deployment appsv1.DaemonSet err = runtime.DefaultUnstructuredConverter.FromUnstructured(unstructuredObj.UnstructuredContent(), &deployment) if err != nil { return nil, fmt.Errorf("failed to convert unstructured data to typed object: %w", err) } - d, err := m.config.clientsets.Core.AppsV1().Deployments(deployment.Namespace).Create(ctx, &deployment, metav1.CreateOptions{}) + d, err := m.config.clientsets.Core.AppsV1().DaemonSets(deployment.Namespace).Create(ctx, &deployment, metav1.CreateOptions{}) if err != nil { - return nil, fmt.Errorf("error creating Deployment: %w", err) + return nil, fmt.Errorf("error creating DaemonSet: %w", err) } return d, nil } -func (m *DeploymentManager) Delete(ctx context.Context, cdUID string) error { - ds, err := getByComputeDomainUID[*appsv1.Deployment](ctx, m.informer, cdUID) +func (m *DaemonSetManager) Delete(ctx context.Context, cdUID string) error { + ds, err := getByComputeDomainUID[*appsv1.DaemonSet](ctx, m.informer, cdUID) if err != nil { - return fmt.Errorf("error retrieving Deployment: %w", err) + return fmt.Errorf("error retrieving DaemonSet: %w", err) } if len(ds) > 1 { - return fmt.Errorf("more than one Deployment found with same ComputeDomain UID") + return fmt.Errorf("more than one DaemonSet found with same ComputeDomain UID") } if len(ds) == 0 { return nil @@ -237,41 +235,41 @@ func (m *DeploymentManager) Delete(ctx context.Context, cdUID string) error { return nil } - err = m.config.clientsets.Core.AppsV1().Deployments(d.Namespace).Delete(ctx, d.Name, metav1.DeleteOptions{}) + err = m.config.clientsets.Core.AppsV1().DaemonSets(d.Namespace).Delete(ctx, d.Name, metav1.DeleteOptions{}) if err != nil && !errors.IsNotFound(err) { - return fmt.Errorf("erroring deleting Deployment: %w", err) + return fmt.Errorf("erroring deleting DaemonSet: %w", err) } return nil } -func (m *DeploymentManager) RemoveFinalizer(ctx context.Context, cdUID string) error { +func (m *DaemonSetManager) RemoveFinalizer(ctx context.Context, cdUID string) error { if err := m.resourceClaimTemplateManager.RemoveFinalizer(ctx, cdUID); err != nil { return fmt.Errorf("error removing finalizer on ResourceClaimTemplate: %w", err) } if err := m.removeFinalizer(ctx, cdUID); err != nil { - return fmt.Errorf("error removing finalizer on Deployment: %w", err) + return fmt.Errorf("error removing finalizer on DaemonSet: %w", err) } return nil } -func (m *DeploymentManager) AssertRemoved(ctx context.Context, cdUID string) error { +func (m *DaemonSetManager) AssertRemoved(ctx context.Context, cdUID string) error { if err := m.resourceClaimTemplateManager.AssertRemoved(ctx, cdUID); err != nil { return fmt.Errorf("error asserting ResourceClaimTemplate removed: %w", err) } if err := m.assertRemoved(ctx, cdUID); err != nil { - return fmt.Errorf("error asserting Deployment removal: %w", err) + return fmt.Errorf("error asserting DaemonSet removal: %w", err) } return nil } -func (m *DeploymentManager) removeFinalizer(ctx context.Context, cdUID string) error { - ds, err := getByComputeDomainUID[*appsv1.Deployment](ctx, m.informer, cdUID) +func (m *DaemonSetManager) removeFinalizer(ctx context.Context, cdUID string) error { + ds, err := getByComputeDomainUID[*appsv1.DaemonSet](ctx, m.informer, cdUID) if err != nil { - return fmt.Errorf("error retrieving Deployment: %w", err) + return fmt.Errorf("error retrieving DaemonSet: %w", err) } if len(ds) > 1 { - return fmt.Errorf("more than one Deployment found with same ComputeDomain UID") + return fmt.Errorf("more than one DaemonSet found with same ComputeDomain UID") } if len(ds) == 0 { return nil @@ -280,7 +278,7 @@ func (m *DeploymentManager) removeFinalizer(ctx context.Context, cdUID string) e d := ds[0] if d.GetDeletionTimestamp() == nil { - return fmt.Errorf("attempting to remove finalizer before Deployment marked for deletion") + return fmt.Errorf("attempting to remove finalizer before DaemonSet marked for deletion") } newD := d.DeepCopy() @@ -294,17 +292,17 @@ func (m *DeploymentManager) removeFinalizer(ctx context.Context, cdUID string) e return nil } - if _, err := m.config.clientsets.Core.AppsV1().Deployments(d.Namespace).Update(ctx, newD, metav1.UpdateOptions{}); err != nil { - return fmt.Errorf("error updating Deployment: %w", err) + if _, err := m.config.clientsets.Core.AppsV1().DaemonSets(d.Namespace).Update(ctx, newD, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("error updating DaemonSet: %w", err) } return nil } -func (m *DeploymentManager) assertRemoved(ctx context.Context, cdUID string) error { - ds, err := getByComputeDomainUID[*appsv1.Deployment](ctx, m.informer, cdUID) +func (m *DaemonSetManager) assertRemoved(ctx context.Context, cdUID string) error { + ds, err := getByComputeDomainUID[*appsv1.DaemonSet](ctx, m.informer, cdUID) if err != nil { - return fmt.Errorf("error retrieving Deployment: %w", err) + return fmt.Errorf("error retrieving DaemonSet: %w", err) } if len(ds) != 0 { return fmt.Errorf("still exists") @@ -312,13 +310,13 @@ func (m *DeploymentManager) assertRemoved(ctx context.Context, cdUID string) err return nil } -func (m *DeploymentManager) onAddOrUpdate(ctx context.Context, obj any) error { - d, ok := obj.(*appsv1.Deployment) +func (m *DaemonSetManager) onAddOrUpdate(ctx context.Context, obj any) error { + d, ok := obj.(*appsv1.DaemonSet) if !ok { - return fmt.Errorf("failed to cast to Deployment") + return fmt.Errorf("failed to cast to DaemonSet") } - klog.Infof("Processing added or updated Deployment: %s/%s", d.Namespace, d.Name) + klog.Infof("Processing added or updated DaemonSet: %s/%s", d.Namespace, d.Name) cd, err := m.getComputeDomain(d.Labels[computeDomainLabelKey]) if err != nil { @@ -332,7 +330,7 @@ func (m *DeploymentManager) onAddOrUpdate(ctx context.Context, obj any) error { return fmt.Errorf("error adding Pod manager '%s/%s': %w", d.Namespace, d.Name, err) } - if int(d.Status.AvailableReplicas) != cd.Spec.NumNodes { + if int(d.Status.NumberReady) != cd.Spec.NumNodes { return nil } @@ -345,14 +343,14 @@ func (m *DeploymentManager) onAddOrUpdate(ctx context.Context, obj any) error { return nil } -func (m *DeploymentManager) addPodManager(ctx context.Context, labelSelector *metav1.LabelSelector, numPods int) error { +func (m *DaemonSetManager) addPodManager(ctx context.Context, labelSelector *metav1.LabelSelector, numPods int) error { key := labelSelector.MatchLabels[computeDomainLabelKey] if _, exists := m.podManagers[key]; exists { return nil } - podManager := NewDeploymentPodManager(m.config, labelSelector, numPods, m.getComputeDomain) + podManager := NewDaemonSetPodManager(m.config, labelSelector, numPods, m.getComputeDomain) if err := podManager.Start(ctx); err != nil { return fmt.Errorf("error creating Pod manager: %w", err) @@ -365,7 +363,7 @@ func (m *DeploymentManager) addPodManager(ctx context.Context, labelSelector *me return nil } -func (m *DeploymentManager) removePodManager(key string) error { +func (m *DaemonSetManager) removePodManager(key string) error { if _, exists := m.podManagers[key]; !exists { return nil } @@ -385,7 +383,7 @@ func (m *DeploymentManager) removePodManager(key string) error { return nil } -func (m *DeploymentManager) removeAllPodManagers() error { +func (m *DaemonSetManager) removeAllPodManagers() error { m.Lock() for key, pm := range m.podManagers { m.Unlock() diff --git a/cmd/compute-domain-controller/deploymentpods.go b/cmd/compute-domain-controller/daemonsetpods.go similarity index 88% rename from cmd/compute-domain-controller/deploymentpods.go rename to cmd/compute-domain-controller/daemonsetpods.go index 38851f08b..73e369081 100644 --- a/cmd/compute-domain-controller/deploymentpods.go +++ b/cmd/compute-domain-controller/daemonsetpods.go @@ -37,7 +37,7 @@ const ( CliqueIDLabelKey = "nvidia.com/gpu.clique" ) -type DeploymentPodManager struct { +type DaemonSetPodManager struct { config *ManagerConfig waitGroup sync.WaitGroup cancelContext context.CancelFunc @@ -51,7 +51,7 @@ type DeploymentPodManager struct { numPods int } -func NewDeploymentPodManager(config *ManagerConfig, labelSelector *metav1.LabelSelector, numPods int, getComputeDomain GetComputeDomainFunc) *DeploymentPodManager { +func NewDaemonSetPodManager(config *ManagerConfig, labelSelector *metav1.LabelSelector, numPods int, getComputeDomain GetComputeDomainFunc) *DaemonSetPodManager { factory := informers.NewSharedInformerFactoryWithOptions( config.clientsets.Core, informerResyncPeriod, @@ -64,7 +64,7 @@ func NewDeploymentPodManager(config *ManagerConfig, labelSelector *metav1.LabelS informer := factory.Core().V1().Pods().Informer() lister := factory.Core().V1().Pods().Lister() - m := &DeploymentPodManager{ + m := &DaemonSetPodManager{ config: config, factory: factory, informer: informer, @@ -76,14 +76,14 @@ func NewDeploymentPodManager(config *ManagerConfig, labelSelector *metav1.LabelS return m } -func (m *DeploymentPodManager) Start(ctx context.Context) (rerr error) { +func (m *DaemonSetPodManager) Start(ctx context.Context) (rerr error) { ctx, cancel := context.WithCancel(ctx) m.cancelContext = cancel defer func() { if rerr != nil { if err := m.Stop(); err != nil { - klog.Errorf("error stopping DeploymentPod manager: %v", err) + klog.Errorf("error stopping DaemonSetPod manager: %v", err) } } }() @@ -113,13 +113,13 @@ func (m *DeploymentPodManager) Start(ctx context.Context) (rerr error) { return nil } -func (m *DeploymentPodManager) Stop() error { +func (m *DaemonSetPodManager) Stop() error { m.cancelContext() m.waitGroup.Wait() return nil } -func (m *DeploymentPodManager) onPodAddOrUpdate(ctx context.Context, obj any) error { +func (m *DaemonSetPodManager) onPodAddOrUpdate(ctx context.Context, obj any) error { p, ok := obj.(*corev1.Pod) if !ok { return fmt.Errorf("failed to cast to Pod") @@ -179,7 +179,7 @@ func (m *DeploymentPodManager) onPodAddOrUpdate(ctx context.Context, obj any) er return nil } -func (m *DeploymentPodManager) GetComputeDomainNode(ctx context.Context, nodeName string) (*nvapi.ComputeDomainNode, error) { +func (m *DaemonSetPodManager) GetComputeDomainNode(ctx context.Context, nodeName string) (*nvapi.ComputeDomainNode, error) { node, err := m.config.clientsets.Core.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) if err != nil { return nil, fmt.Errorf("error getting Node '%s': %w", nodeName, err) diff --git a/cmd/compute-domain-controller/indexers.go b/cmd/compute-domain-controller/indexers.go index 018aa61cd..6f9a1f58b 100644 --- a/cmd/compute-domain-controller/indexers.go +++ b/cmd/compute-domain-controller/indexers.go @@ -50,7 +50,7 @@ func addComputeDomainLabelIndexer[T metav1.ObjectMetaAccessor](informer cache.Sh func getByComputeDomainUID[T1 *T2, T2 any](ctx context.Context, informer cache.SharedIndexInformer, cdUID string) ([]T1, error) { if !cache.WaitForCacheSync(ctx.Done(), informer.HasSynced) { - return nil, fmt.Errorf("cache sync failed for Deployment") + return nil, fmt.Errorf("cache sync failed for %T", *new(T1)) } objs, err := informer.GetIndexer().ByIndex("computeDomainLabel", cdUID) diff --git a/cmd/compute-domain-controller/resourceclaimtemplate.go b/cmd/compute-domain-controller/resourceclaimtemplate.go index b4092cd07..af32c82b0 100644 --- a/cmd/compute-domain-controller/resourceclaimtemplate.go +++ b/cmd/compute-domain-controller/resourceclaimtemplate.go @@ -38,8 +38,8 @@ import ( ) const ( - DeploymentResourceClaimTemplateTemplatePath = "/templates/compute-domain-daemon-claim-template.tmpl.yaml" - WorkloadResourceClaimTemplateTemplatePath = "/templates/compute-domain-workload-claim-template.tmpl.yaml" + DaemonSetResourceClaimTemplateTemplatePath = "/templates/compute-domain-daemon-claim-template.tmpl.yaml" + WorkloadResourceClaimTemplateTemplatePath = "/templates/compute-domain-workload-claim-template.tmpl.yaml" ) type ResourceClaimTemplateTemplateData struct { @@ -66,7 +66,7 @@ type BaseResourceClaimTemplateManager struct { informer cache.SharedIndexInformer } -type DeploymentResourceClaimTemplateManager struct { +type DaemonSetResourceClaimTemplateManager struct { *BaseResourceClaimTemplateManager } @@ -234,7 +234,7 @@ func (m *BaseResourceClaimTemplateManager) AssertRemoved(ctx context.Context, cd return nil } -func NewDeploymentResourceClaimTemplateManager(config *ManagerConfig) *DeploymentResourceClaimTemplateManager { +func NewDaemonSetResourceClaimTemplateManager(config *ManagerConfig) *DaemonSetResourceClaimTemplateManager { labelSelector := &metav1.LabelSelector{ MatchExpressions: []metav1.LabelSelectorRequirement{ { @@ -251,14 +251,14 @@ func NewDeploymentResourceClaimTemplateManager(config *ManagerConfig) *Deploymen base := newBaseResourceClaimTemplateManager(config, labelSelector, config.driverNamespace) - m := &DeploymentResourceClaimTemplateManager{ + m := &DaemonSetResourceClaimTemplateManager{ BaseResourceClaimTemplateManager: base, } return m } -func (m *DeploymentResourceClaimTemplateManager) Create(ctx context.Context, namespace string, cd *nvapi.ComputeDomain) (*resourceapi.ResourceClaimTemplate, error) { +func (m *DaemonSetResourceClaimTemplateManager) Create(ctx context.Context, namespace string, cd *nvapi.ComputeDomain) (*resourceapi.ResourceClaimTemplate, error) { rcts, err := getByComputeDomainUID[*resourceapi.ResourceClaimTemplate](ctx, m.informer, string(cd.UID)) if err != nil { return nil, fmt.Errorf("error retrieving ResourceClaimTemplate: %w", err) @@ -290,7 +290,7 @@ func (m *DeploymentResourceClaimTemplateManager) Create(ctx context.Context, nam DaemonConfig: daemonConfig, } - rct, err := m.BaseResourceClaimTemplateManager.Create(ctx, DeploymentResourceClaimTemplateTemplatePath, &templateData) + rct, err := m.BaseResourceClaimTemplateManager.Create(ctx, DaemonSetResourceClaimTemplateTemplatePath, &templateData) if err != nil { return nil, fmt.Errorf("error creating ResourceClaimTemplate from base: %w", err) } diff --git a/cmd/compute-domain-kubelet-plugin/computedomain.go b/cmd/compute-domain-kubelet-plugin/computedomain.go index 05d5efe27..45c521ba6 100644 --- a/cmd/compute-domain-kubelet-plugin/computedomain.go +++ b/cmd/compute-domain-kubelet-plugin/computedomain.go @@ -26,12 +26,9 @@ import ( "text/template" "time" - appsv1 "k8s.io/api/apps/v1" - corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/tools/cache" "k8s.io/klog/v2" - "k8s.io/utils/ptr" cdiapi "tags.cncf.io/container-device-interface/pkg/cdi" cdispec "tags.cncf.io/container-device-interface/specs-go" @@ -231,10 +228,6 @@ func (s *ComputeDomainDaemonSettings) WriteNodesConfigFile(ctx context.Context) } func (m *ComputeDomainManager) AssertComputeDomainReady(ctx context.Context, cdUID string) error { - if err := m.UpdateComputeDomainDeployment(ctx, cdUID); err != nil { - return fmt.Errorf("error updating Deployment for ComputeDomain: %w", err) - } - cd, err := m.GetComputeDomain(ctx, cdUID) if err != nil { return fmt.Errorf("error getting ComputeDomain: %w", err) @@ -272,57 +265,45 @@ func (m *ComputeDomainManager) GetNodeIPs(ctx context.Context, cdUID string) ([] return ips, nil } -func (m *ComputeDomainManager) UpdateComputeDomainDeployment(ctx context.Context, cdUID string) error { - cd, err := m.GetComputeDomain(ctx, cdUID) +func (m *ComputeDomainManager) AddNodeLabel(ctx context.Context, cdUID string) error { + node, err := m.config.clientsets.Core.CoreV1().Nodes().Get(ctx, m.config.flags.nodeName, metav1.GetOptions{}) if err != nil { - return fmt.Errorf("error getting ComputeDomain: %w", err) + return fmt.Errorf("error retrieving Node: %w", err) } - if cd == nil { - return fmt.Errorf("ComputeDomain not found: %s", cdUID) + + currentValue, exists := node.Labels[computeDomainLabelKey] + if exists && currentValue == cdUID { + return nil } - d, err := m.GetComputeDomainDeployment(ctx, cdUID) - if err != nil || d == nil { - return fmt.Errorf("error getting Deployment for ComputeDomain: %w", err) - } - - newD := d.DeepCopy() - - if newD.Spec.Template.Spec.Affinity == nil { - newD.Spec.Template.Spec.Affinity = &corev1.Affinity{ - NodeAffinity: &corev1.NodeAffinity{ - RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ - NodeSelectorTerms: []corev1.NodeSelectorTerm{ - { - MatchExpressions: []corev1.NodeSelectorRequirement{ - { - Key: "kubernetes.io/hostname", - Operator: corev1.NodeSelectorOpIn, - Values: []string{}, - }, - }, - }, - }, - }, - }, - } + newNode := node.DeepCopy() + if newNode.Labels == nil { + newNode.Labels = make(map[string]string) } + newNode.Labels[computeDomainLabelKey] = cdUID - values := []string{m.config.flags.nodeName} - for _, value := range newD.Spec.Template.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms[0].MatchExpressions[0].Values { - if value == m.config.flags.nodeName { - return nil - } - values = append(values, value) + if _, err = m.config.clientsets.Core.CoreV1().Nodes().Update(ctx, newNode, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("error updating Node with label: %w", err) + } + + return nil +} + +func (m *ComputeDomainManager) RemoveNodeLabel(ctx context.Context, cdUID string) error { + node, err := m.config.clientsets.Core.CoreV1().Nodes().Get(ctx, m.config.flags.nodeName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("error retrieving Node: %w", err) } - newD.Spec.Template.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms[0].MatchExpressions[0].Values = values - if len(values) == cd.Spec.NumNodes { - newD.Spec.Replicas = ptr.To(int32(len(values))) + if _, exists := node.Labels[computeDomainLabelKey]; !exists { + return nil } - if _, err := m.config.clientsets.Core.AppsV1().Deployments(newD.Namespace).Update(ctx, newD, metav1.UpdateOptions{}); err != nil { - return fmt.Errorf("error updating Deployment for ComputeDomain: %w", err) + newNode := node.DeepCopy() + delete(newNode.Labels, computeDomainLabelKey) + + if _, err := m.config.clientsets.Core.CoreV1().Nodes().Update(ctx, newNode, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("error updating Node to remove label: %w", err) } return nil @@ -345,32 +326,3 @@ func (m *ComputeDomainManager) GetComputeDomain(ctx context.Context, cdUID strin } return cd, nil } - -func (m *ComputeDomainManager) GetComputeDomainDeployment(ctx context.Context, cdUID string) (*appsv1.Deployment, error) { - labelSelector := &metav1.LabelSelector{ - MatchExpressions: []metav1.LabelSelectorRequirement{ - { - Key: computeDomainLabelKey, - Operator: metav1.LabelSelectorOpIn, - Values: []string{cdUID}, - }, - }, - } - - listOptions := metav1.ListOptions{ - LabelSelector: metav1.FormatLabelSelector(labelSelector), - } - - ds, err := m.config.clientsets.Core.AppsV1().Deployments(m.config.flags.namespace).List(ctx, listOptions) - if err != nil { - return nil, fmt.Errorf("error listing Deployments for ComputeDomain: %w", err) - } - if len(ds.Items) == 0 { - return nil, nil - } - if len(ds.Items) != 1 { - return nil, fmt.Errorf("multiple Deployments for ComputeDomain with the same UID") - } - - return &ds.Items[0], nil -} diff --git a/cmd/compute-domain-kubelet-plugin/device_state.go b/cmd/compute-domain-kubelet-plugin/device_state.go index 96555d399..d69d61b3b 100644 --- a/cmd/compute-domain-kubelet-plugin/device_state.go +++ b/cmd/compute-domain-kubelet-plugin/device_state.go @@ -43,6 +43,7 @@ type OpaqueDeviceConfig struct { } type DeviceConfigState struct { + Type string ComputeDomain string containerEdits *cdiapi.ContainerEdits } @@ -341,17 +342,19 @@ func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.Res func (s *DeviceState) unprepareDevices(ctx context.Context, claimUID string, devices PreparedDevices) error { // Unprepare any ComputeDomain daemons prepared for each group of prepared devices. for _, group := range devices { - // Skip if no ComputeDomain daemon configured for this device. - if group.ConfigState.ComputeDomain == "" { - continue + // If a cannel type, remove the ComputeDomain label from the node + if group.ConfigState.Type == ComputeDomainChannelType { + if err := s.computeDomainManager.RemoveNodeLabel(ctx, group.ConfigState.ComputeDomain); err != nil { + return fmt.Errorf("error removing Node label for ComputeDomain: %w", err) + } } - // Create new ComputeDomain daemon settings from the ComputeDomainManager. - computeDomainDaemonSettings := s.computeDomainManager.NewSettings(group.ConfigState.ComputeDomain) - - // Unprepare the new ComputeDomain daemon. - if err := computeDomainDaemonSettings.Unprepare(ctx); err != nil { - return fmt.Errorf("error unpreparing ComputeDomain daemon settings: %w", err) + // If a daemon type, unprepare the new ComputeDomain daemon. + if group.ConfigState.Type == ComputeDomainDaemonType { + computeDomainDaemonSettings := s.computeDomainManager.NewSettings(group.ConfigState.ComputeDomain) + if err := computeDomainDaemonSettings.Unprepare(ctx); err != nil { + return fmt.Errorf("error unpreparing ComputeDomain daemon settings: %w", err) + } } } return nil @@ -375,12 +378,17 @@ func (s *DeviceState) applyComputeDomainChannelConfig(ctx context.Context, confi // Create any necessary ComputeDomain channels and gather their CDI container edits. for _, r := range results { channel := s.allocatable[r.Device].Channel + if err := s.computeDomainManager.AddNodeLabel(ctx, config.DomainID); err != nil { + return nil, fmt.Errorf("error adding Node label for ComputeDomain: %w", err) + } if err := s.computeDomainManager.AssertComputeDomainReady(ctx, config.DomainID); err != nil { return nil, fmt.Errorf("error asserting ComputeDomain Ready: %w", err) } if err := s.nvdevlib.createComputeDomainChannelDevice(channel.ID); err != nil { return nil, fmt.Errorf("error creating ComputeDomain channel device: %w", err) } + configState.Type = ComputeDomainChannelType + configState.ComputeDomain = config.DomainID configState.containerEdits = configState.containerEdits.Append(s.computeDomainManager.GetComputeDomainChannelContainerEdits(s.cdi.devRoot, channel)) } @@ -416,7 +424,8 @@ func (s *DeviceState) applyComputeDomainDaemonConfig(ctx context.Context, config } // Store information about the ComputeDomain daemon in the configState. - configState.ComputeDomain = computeDomainDaemonSettings.GetDomain() + configState.Type = ComputeDomainDaemonType + configState.ComputeDomain = config.DomainID configState.containerEdits = computeDomainDaemonSettings.GetCDIContainerEdits() return &configState, nil diff --git a/deployments/helm/nvidia-dra-driver-gpu/templates/clusterrole.yaml b/deployments/helm/nvidia-dra-driver-gpu/templates/clusterrole.yaml index b7c172706..971e357b4 100644 --- a/deployments/helm/nvidia-dra-driver-gpu/templates/clusterrole.yaml +++ b/deployments/helm/nvidia-dra-driver-gpu/templates/clusterrole.yaml @@ -23,15 +23,15 @@ rules: - apiGroups: ["resource.k8s.io"] resources: ["resourceslices"] verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] -- apiGroups: ["apps"] - resources: ["deployments"] - verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] - apiGroups: ["resource.k8s.io"] resources: ["resourceclaims/status"] verbs: ["update"] -- apiGroups: [""] - resources: ["pods"] - verbs: ["get", "list", "watch"] +- apiGroups: ["apps"] + resources: ["daemonsets"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] - apiGroups: [""] resources: ["nodes"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: [""] + resources: ["pods"] verbs: ["get", "list", "watch"] diff --git a/templates/compute-domain-daemon.tmpl.yaml b/templates/compute-domain-daemon.tmpl.yaml index deba90bc4..40ee5afa7 100644 --- a/templates/compute-domain-daemon.tmpl.yaml +++ b/templates/compute-domain-daemon.tmpl.yaml @@ -1,6 +1,6 @@ --- apiVersion: apps/v1 -kind: Deployment +kind: Daemonset metadata: namespace: {{ .Namespace }} generateName: {{ .GenerateName }} @@ -9,7 +9,6 @@ metadata: labels: {{ .ComputeDomainLabelKey }}: {{ .ComputeDomainLabelValue }} spec: - replicas: {{ .Replicas }} selector: matchLabels: {{ .ComputeDomainLabelKey }}: {{ .ComputeDomainLabelValue }} @@ -19,6 +18,8 @@ spec: {{ .ComputeDomainLabelKey }}: {{ .ComputeDomainLabelValue }} spec: hostNetwork: true + nodeSelector: + {{ .ComputeDomainLabelKey }}: {{ .ComputeDomainLabelValue }} containers: - name: compute-domain-daemon image: ubuntu:22.04