Skip to content

Commit 8db89d1

Browse files
committed
ClusterTopology deletion prevention
1 parent 75c555c commit 8db89d1

File tree

8 files changed

+125
-80
lines changed

8 files changed

+125
-80
lines changed

operator/internal/controller/clustertopology/reconciledelete.go

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,18 @@ func (r *Reconciler) triggerDeletionFlow(ctx context.Context, logger logr.Logger
5656
// 2. Topology is enabled AND this specific topology is configured
5757
func (r *Reconciler) checkDeletionConditions(ctx context.Context, logger logr.Logger, ct *grovecorev1alpha1.ClusterTopology) ctrlcommon.ReconcileStepResult {
5858
logger.Info("Checking deletion conditions", "ClusterTopology", ct.Name)
59-
// Condition 1: Check for PodCliqueSet references
59+
60+
// Condition 1: Check if topology is enabled and configured to use this ClusterTopology
61+
if r.config.Enabled && r.config.Name == ct.Name {
62+
logger.Info("Cannot delete ClusterTopology: topology feature is enabled and configured to use this topology",
63+
"topologyName", r.config.Name)
64+
r.eventRecorder.Eventf(ct, corev1.EventTypeWarning, groveconstants.ReasonClusterTopologyDeleteBlocked,
65+
"Cannot delete ClusterTopology %s: topology feature is enabled and configured to use this ClusterTopology", ct.Name)
66+
// Don't requeue - this condition requires manual intervention (config change + operator restart)
67+
return ctrlcommon.DoNotRequeue()
68+
}
69+
70+
// Condition 2: Check for PodCliqueSet references
6071
pcsList := &grovecorev1alpha1.PodCliqueSetList{}
6172
labelSelector := client.MatchingLabels{
6273
apicommon.LabelClusterTopologyName: ct.Name,
@@ -74,16 +85,6 @@ func (r *Reconciler) checkDeletionConditions(ctx context.Context, logger logr.Lo
7485
return ctrlcommon.DoNotRequeue()
7586
}
7687

77-
// Condition 2: Check if topology is enabled and configured to use this ClusterTopology
78-
if r.config.ClusterTopology.Enabled && r.config.ClusterTopology.Name == ct.Name {
79-
logger.Info("Cannot delete ClusterTopology: topology feature is enabled and configured to use this topology",
80-
"topologyName", r.config.ClusterTopology.Name)
81-
r.eventRecorder.Eventf(ct, corev1.EventTypeWarning, groveconstants.ReasonClusterTopologyDeleteBlocked,
82-
"Cannot delete ClusterTopology %s: topology feature is enabled and configured to use this ClusterTopology", ct.Name)
83-
// Don't requeue - this condition requires manual intervention (config change + operator restart)
84-
return ctrlcommon.DoNotRequeue()
85-
}
86-
8788
logger.Info("ClusterTopology can be safely deleted", "topologyName", ct.Name)
8889
return ctrlcommon.ContinueReconcile()
8990
}

operator/internal/controller/clustertopology/reconciledelete_test.go

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -168,9 +168,7 @@ func TestCheckDeletionConditions(t *testing.T) {
168168
reconciler := &Reconciler{
169169
client: fakeClient,
170170
eventRecorder: record.NewFakeRecorder(10),
171-
config: configv1alpha1.OperatorConfiguration{
172-
ClusterTopology: tt.topologyConfig,
173-
},
171+
config: tt.topologyConfig,
174172
}
175173

176174
// Run checkDeletionConditions
@@ -318,9 +316,7 @@ func TestTriggerDeletionFlow(t *testing.T) {
318316
reconciler := &Reconciler{
319317
client: fakeClient,
320318
eventRecorder: record.NewFakeRecorder(10),
321-
config: configv1alpha1.OperatorConfiguration{
322-
ClusterTopology: tt.topologyConfig,
323-
},
319+
config: tt.topologyConfig,
324320
}
325321

326322
// Run triggerDeletionFlow

operator/internal/controller/clustertopology/reconciler.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,14 @@ import (
3333

3434
// Reconciler reconciles ClusterTopology resources.
3535
type Reconciler struct {
36-
config configv1alpha1.OperatorConfiguration
36+
config configv1alpha1.ClusterTopologyConfiguration
3737
client ctrlclient.Client
3838
eventRecorder record.EventRecorder
3939
reconcileStatusRecorder ctrlcommon.ReconcileErrorRecorder
4040
}
4141

4242
// NewReconciler creates a new reconciler for ClusterTopology.
43-
func NewReconciler(mgr ctrl.Manager, topologyCfg configv1alpha1.OperatorConfiguration) *Reconciler {
43+
func NewReconciler(mgr ctrl.Manager, topologyCfg configv1alpha1.ClusterTopologyConfiguration) *Reconciler {
4444
return &Reconciler{
4545
config: topologyCfg,
4646
client: mgr.GetClient(),

operator/internal/controller/clustertopology/register.go

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ package clustertopology
1919
import (
2020
"context"
2121

22-
apicommon "github.com/ai-dynamo/grove/operator/api/common"
2322
grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
23+
componentutils "github.com/ai-dynamo/grove/operator/internal/controller/common/component/utils"
2424

2525
"k8s.io/apimachinery/pkg/types"
2626
"k8s.io/client-go/util/workqueue"
@@ -68,8 +68,8 @@ func (h *podCliqueSetEventHandler) Update(_ context.Context, e event.TypedUpdate
6868
return
6969
}
7070

71-
oldTopology := getTopologyName(oldPCS)
72-
newTopology := getTopologyName(newPCS)
71+
oldTopology := componentutils.GetTopologyName(oldPCS)
72+
newTopology := componentutils.GetTopologyName(newPCS)
7373

7474
// Only reconcile if there was a change to the topology label
7575
if oldTopology == newTopology {
@@ -107,7 +107,7 @@ func (h *podCliqueSetEventHandler) Delete(_ context.Context, e event.TypedDelete
107107
return
108108
}
109109

110-
topology := getTopologyName(pcs)
110+
topology := componentutils.GetTopologyName(pcs)
111111
if topology == "" {
112112
return
113113
}
@@ -122,16 +122,3 @@ func (h *podCliqueSetEventHandler) Delete(_ context.Context, e event.TypedDelete
122122
func (h *podCliqueSetEventHandler) Generic(_ context.Context, _ event.TypedGenericEvent[client.Object], _ workqueue.TypedRateLimitingInterface[reconcile.Request]) {
123123
// No-op: don't reconcile on generic events
124124
}
125-
126-
// getTopologyName extracts the topology name from a PodCliqueSet's labels.
127-
// Returns empty string if the label doesn't exist or is empty.
128-
func getTopologyName(pcs *grovecorev1alpha1.PodCliqueSet) string {
129-
if pcs == nil {
130-
return ""
131-
}
132-
topology, exists := pcs.Labels[apicommon.LabelClusterTopologyName]
133-
if !exists || topology == "" {
134-
return ""
135-
}
136-
return topology
137-
}

operator/internal/controller/clustertopology/register_test.go

Lines changed: 0 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ import (
2020
"context"
2121
"testing"
2222

23-
grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
2423
testutils "github.com/ai-dynamo/grove/operator/test/utils"
2524

2625
"github.com/stretchr/testify/assert"
@@ -32,47 +31,6 @@ import (
3231
"sigs.k8s.io/controller-runtime/pkg/reconcile"
3332
)
3433

35-
func TestGetTopologyName(t *testing.T) {
36-
tests := []struct {
37-
name string
38-
pcs *grovecorev1alpha1.PodCliqueSet
39-
expected string
40-
}{
41-
{
42-
name: "PodCliqueSet with topology label",
43-
pcs: testutils.NewPodCliqueSetBuilder("test-pcs", "test-ns", uuid.NewUUID()).
44-
WithTopologyLabel("test-topology").
45-
Build(),
46-
expected: "test-topology",
47-
},
48-
{
49-
name: "PodCliqueSet without topology label",
50-
pcs: testutils.NewPodCliqueSetBuilder("test-pcs", "test-ns", uuid.NewUUID()).
51-
Build(),
52-
expected: "",
53-
},
54-
{
55-
name: "nil PodCliqueSet",
56-
pcs: nil,
57-
expected: "",
58-
},
59-
{
60-
name: "PodCliqueSet with empty topology label",
61-
pcs: testutils.NewPodCliqueSetBuilder("test-pcs", "test-ns", uuid.NewUUID()).
62-
WithTopologyLabel("").
63-
Build(),
64-
expected: "",
65-
},
66-
}
67-
68-
for _, tt := range tests {
69-
t.Run(tt.name, func(t *testing.T) {
70-
result := getTopologyName(tt.pcs)
71-
assert.Equal(t, tt.expected, result)
72-
})
73-
}
74-
}
75-
7634
func TestPodCliqueSetEventHandler_Create(t *testing.T) {
7735
handler := &podCliqueSetEventHandler{}
7836
queue := workqueue.NewTypedRateLimitingQueue(workqueue.DefaultTypedControllerRateLimiter[reconcile.Request]())
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
// /*
2+
// Copyright 2025 The Grove Authors.
3+
//
4+
// Licensed under the Apache License, Version 2.0 (the "License");
5+
// you may not use this file except in compliance with the License.
6+
// You may obtain a copy of the License at
7+
//
8+
// http://www.apache.org/licenses/LICENSE-2.0
9+
//
10+
// Unless required by applicable law or agreed to in writing, software
11+
// distributed under the License is distributed on an "AS IS" BASIS,
12+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
// See the License for the specific language governing permissions and
14+
// limitations under the License.
15+
// */
16+
17+
package utils
18+
19+
import (
20+
apicommon "github.com/ai-dynamo/grove/operator/api/common"
21+
grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
22+
)
23+
24+
// GetTopologyName extracts the topology name from a PodCliqueSet's labels.
25+
// Returns empty string if the label doesn't exist or is empty.
26+
func GetTopologyName(pcs *grovecorev1alpha1.PodCliqueSet) string {
27+
if pcs == nil {
28+
return ""
29+
}
30+
topology, exists := pcs.Labels[apicommon.LabelClusterTopologyName]
31+
if !exists || topology == "" {
32+
return ""
33+
}
34+
return topology
35+
}
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
// /*
2+
// Copyright 2025 The Grove Authors.
3+
//
4+
// Licensed under the Apache License, Version 2.0 (the "License");
5+
// you may not use this file except in compliance with the License.
6+
// You may obtain a copy of the License at
7+
//
8+
// http://www.apache.org/licenses/LICENSE-2.0
9+
//
10+
// Unless required by applicable law or agreed to in writing, software
11+
// distributed under the License is distributed on an "AS IS" BASIS,
12+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
// See the License for the specific language governing permissions and
14+
// limitations under the License.
15+
// */
16+
17+
package utils
18+
19+
import (
20+
"testing"
21+
22+
grovecorev1alpha1 "github.com/ai-dynamo/grove/operator/api/core/v1alpha1"
23+
testutils "github.com/ai-dynamo/grove/operator/test/utils"
24+
25+
"github.com/stretchr/testify/assert"
26+
"k8s.io/apimachinery/pkg/util/uuid"
27+
)
28+
29+
func TestGetTopologyName(t *testing.T) {
30+
tests := []struct {
31+
name string
32+
pcs *grovecorev1alpha1.PodCliqueSet
33+
expected string
34+
}{
35+
{
36+
name: "PodCliqueSet with topology label",
37+
pcs: testutils.NewPodCliqueSetBuilder("test-pcs", "test-ns", uuid.NewUUID()).
38+
WithTopologyLabel("test-topology").
39+
Build(),
40+
expected: "test-topology",
41+
},
42+
{
43+
name: "PodCliqueSet without topology label",
44+
pcs: testutils.NewPodCliqueSetBuilder("test-pcs", "test-ns", uuid.NewUUID()).
45+
Build(),
46+
expected: "",
47+
},
48+
{
49+
name: "nil PodCliqueSet",
50+
pcs: nil,
51+
expected: "",
52+
},
53+
{
54+
name: "PodCliqueSet with empty topology label",
55+
pcs: testutils.NewPodCliqueSetBuilder("test-pcs", "test-ns", uuid.NewUUID()).
56+
WithTopologyLabel("").
57+
Build(),
58+
expected: "",
59+
},
60+
}
61+
62+
for _, tt := range tests {
63+
t.Run(tt.name, func(t *testing.T) {
64+
result := GetTopologyName(tt.pcs)
65+
assert.Equal(t, tt.expected, result)
66+
})
67+
}
68+
}

operator/internal/controller/register.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ import (
2727
)
2828

2929
// RegisterControllers registers all controllers with the manager.
30-
func RegisterControllers(mgr ctrl.Manager, controllerConfig configv1alpha1.ControllerConfiguration, topologyConfig configv1alpha1.OperatorConfiguration) error {
30+
func RegisterControllers(mgr ctrl.Manager, controllerConfig configv1alpha1.ControllerConfiguration, operatorConfig configv1alpha1.OperatorConfiguration) error {
3131
pcsReconciler := podcliqueset.NewReconciler(mgr, controllerConfig.PodCliqueSet)
3232
if err := pcsReconciler.RegisterWithManager(mgr); err != nil {
3333
return err
@@ -40,7 +40,7 @@ func RegisterControllers(mgr ctrl.Manager, controllerConfig configv1alpha1.Contr
4040
if err := pcsgReconciler.RegisterWithManager(mgr); err != nil {
4141
return err
4242
}
43-
ctReconciler := clustertopology.NewReconciler(mgr, topologyConfig)
43+
ctReconciler := clustertopology.NewReconciler(mgr, operatorConfig.ClusterTopology)
4444
if err := ctReconciler.RegisterWithManager(mgr); err != nil {
4545
return err
4646
}

0 commit comments

Comments
 (0)