Skip to content

Commit

Permalink
Merge pull request #276 from klueska/remove-reliance-on-label
Browse files Browse the repository at this point in the history
Remove reliance on nvidia.com/gpu.clique label
  • Loading branch information
klueska authored Mar 11, 2025
2 parents 132ae6d + 03708e7 commit 8d34f97
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 275 deletions.
68 changes: 0 additions & 68 deletions cmd/compute-domain-controller/daemonset.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ type DaemonSetManager struct {

resourceClaimTemplateManager *DaemonSetResourceClaimTemplateManager
cleanupManager *CleanupManager[*appsv1.DaemonSet]
podManagers map[string]*DaemonSetPodManager
}

func NewDaemonSetManager(config *ManagerConfig, getComputeDomain GetComputeDomainFunc) *DaemonSetManager {
Expand Down Expand Up @@ -92,7 +91,6 @@ func NewDaemonSetManager(config *ManagerConfig, getComputeDomain GetComputeDomai
getComputeDomain: getComputeDomain,
factory: factory,
informer: informer,
podManagers: make(map[string]*DaemonSetPodManager),
}
m.resourceClaimTemplateManager = NewDaemonSetResourceClaimTemplateManager(config, getComputeDomain)
m.cleanupManager = NewCleanupManager[*appsv1.DaemonSet](informer, getComputeDomain, m.cleanup)
Expand Down Expand Up @@ -150,9 +148,6 @@ func (m *DaemonSetManager) Start(ctx context.Context) (rerr error) {
}

func (m *DaemonSetManager) Stop() error {
if err := m.removeAllPodManagers(); err != nil {
return fmt.Errorf("error removing all Pod managers: %w", err)
}
if err := m.resourceClaimTemplateManager.Stop(); err != nil {
return fmt.Errorf("error stopping ResourceClaimTemplate manager: %w", err)
}
Expand Down Expand Up @@ -230,16 +225,11 @@ func (m *DaemonSetManager) Delete(ctx context.Context, cdUID string) error {
}

d := ds[0]
key := d.Spec.Selector.MatchLabels[computeDomainLabelKey]

if err := m.resourceClaimTemplateManager.Delete(ctx, cdUID); err != nil {
return fmt.Errorf("error deleting ResourceClaimTemplate: %w", err)
}

if err := m.removePodManager(key); err != nil {
return fmt.Errorf("error removing Pod manager: %w", err)
}

if d.GetDeletionTimestamp() != nil {
return nil
}
Expand Down Expand Up @@ -335,10 +325,6 @@ func (m *DaemonSetManager) onAddOrUpdate(ctx context.Context, obj any) error {
return nil
}

if err := m.addPodManager(ctx, d.Spec.Selector, cd.Spec.NumNodes); err != nil {
return fmt.Errorf("error adding Pod manager '%s/%s': %w", d.Namespace, d.Name, err)
}

if int(d.Status.NumberReady) != cd.Spec.NumNodes {
return nil
}
Expand All @@ -352,60 +338,6 @@ func (m *DaemonSetManager) onAddOrUpdate(ctx context.Context, obj any) error {
return nil
}

// addPodManager starts a DaemonSetPodManager for the given label selector and
// registers it in m.podManagers, keyed by the selector's
// computeDomainLabelKey match label. If a manager is already registered under
// that key, the call is a no-op.
//
// The map is only touched while holding m's lock. The lock is deliberately
// not held across podManager.Start, so the registration is re-checked after
// Start returns: if another caller registered a manager for the same key in
// the meantime, the manager started here is stopped again rather than
// overwriting (and thereby leaking) the one already registered.
//
// It returns an error if the new manager fails to start, or if a redundant
// manager fails to stop.
func (m *DaemonSetManager) addPodManager(ctx context.Context, labelSelector *metav1.LabelSelector, numPods int) error {
	key := labelSelector.MatchLabels[computeDomainLabelKey]

	// Fast path: nothing to do when a manager is already registered.
	m.Lock()
	_, exists := m.podManagers[key]
	m.Unlock()
	if exists {
		return nil
	}

	podManager := NewDaemonSetPodManager(m.config, labelSelector, numPods, m.getComputeDomain)

	if err := podManager.Start(ctx); err != nil {
		return fmt.Errorf("error creating Pod manager: %w", err)
	}

	// Re-check under the lock: the first check and this insert are not
	// atomic because the lock was released while starting the manager.
	m.Lock()
	_, exists = m.podManagers[key]
	if !exists {
		m.podManagers[key] = podManager
	}
	m.Unlock()

	if exists {
		// Another caller won the race; discard the manager started here.
		if err := podManager.Stop(); err != nil {
			return fmt.Errorf("error stopping redundant Pod manager: %w", err)
		}
	}

	return nil
}

// removePodManager stops the Pod manager registered under key and removes it
// from m.podManagers. If no manager is registered under key, it returns nil
// without doing anything.
//
// The lookup is performed under m's lock, the same lock that guards writes to
// the map. The lock is released while the manager is being stopped and
// re-acquired only to delete the entry. If Stop fails, the entry is left in
// the map and the wrapped error is returned.
func (m *DaemonSetManager) removePodManager(key string) error {
	// Single locked lookup; the comma-ok form covers both the existence
	// check and the fetch.
	m.Lock()
	podManager, exists := m.podManagers[key]
	m.Unlock()

	if !exists {
		return nil
	}

	if err := podManager.Stop(); err != nil {
		return fmt.Errorf("error stopping Pod manager: %w", err)
	}

	m.Lock()
	delete(m.podManagers, key)
	m.Unlock()

	return nil
}

// removeAllPodManagers stops every Pod manager registered in m.podManagers
// and removes each one from the map once it has stopped.
//
// The registered managers are snapshotted under m's lock, and the snapshot is
// then processed without holding the lock across Stop. This keeps every
// Lock/Unlock pair local and avoids ranging over the guarded map while the
// lock is released. Managers registered after the snapshot is taken are not
// visited by this call.
//
// It returns on the first Stop failure, leaving that manager and any not yet
// processed in the map.
func (m *DaemonSetManager) removeAllPodManagers() error {
	m.Lock()
	snapshot := make(map[string]*DaemonSetPodManager, len(m.podManagers))
	for key, pm := range m.podManagers {
		snapshot[key] = pm
	}
	m.Unlock()

	for key, pm := range snapshot {
		if err := pm.Stop(); err != nil {
			return fmt.Errorf("error stopping Pod manager: %w", err)
		}

		m.Lock()
		delete(m.podManagers, key)
		m.Unlock()
	}

	return nil
}

func (m *DaemonSetManager) cleanup(ctx context.Context, cdUID string) error {
if err := m.Delete(ctx, cdUID); err != nil {
return fmt.Errorf("error deleting DaemonSet: %w", err)
Expand Down
203 changes: 0 additions & 203 deletions cmd/compute-domain-controller/daemonsetpods.go

This file was deleted.

Loading

0 comments on commit 8d34f97

Please sign in to comment.