Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ vendor
*.swp
*.swo
*~
dev/
2 changes: 2 additions & 0 deletions api/core/v1alpha1/well_known_labels.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,6 @@ const (
TopologyLabelPrefix = "topology.core.apinet.ironcore.dev/"
TopologyPartitionLabel = TopologyLabelPrefix + "partition"
TopologyZoneLabel = TopologyLabelPrefix + "zone"

IPEphemeralLabel = "apinet.ironcore.dev/ip-ephemeral"
)
79 changes: 79 additions & 0 deletions internal/app/apiserver/apiserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ package apiserver

import (
"context"
"encoding/json"
"fmt"
"log/slog"
"net"
"net/netip"

Expand All @@ -16,12 +18,15 @@ import (
"github.com/ironcore-dev/ironcore-net/api/core/v1alpha1"
informers "github.com/ironcore-dev/ironcore-net/client-go/informers/externalversions"
clientset "github.com/ironcore-dev/ironcore-net/client-go/ironcorenet/versioned"
v1alpha1client "github.com/ironcore-dev/ironcore-net/client-go/ironcorenet/versioned/typed/core/v1alpha1"
"github.com/ironcore-dev/ironcore-net/internal/apiserver"
netflag "github.com/ironcore-dev/ironcore-net/utils/flag"
"github.com/spf13/cobra"
"github.com/spf13/pflag"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apiserver/pkg/admission"
genericapiserver "k8s.io/apiserver/pkg/server"
Expand Down Expand Up @@ -163,5 +168,79 @@ func (o *IronCoreNetServerOptions) Run(ctx context.Context) error {
return nil
})

// TODO: This is temporary migration code to strip OwnerReferences from legacy ephemeral IPs.
// Remove this hook once all clusters have been migrated.
server.GenericAPIServer.AddPostStartHookOrDie("migrate-ephemeral-ip-owner-references", func(hookContext genericapiserver.PostStartHookContext) error {
ipClient, err := v1alpha1client.NewForConfig(hookContext.LoopbackClientConfig)
if err != nil {
slog.Error("Failed to create client for IP migration", "error", err)
return nil
}

hookCtx, cancel := context.WithCancel(context.Background())
go func() {
<-hookContext.Done()
cancel()
}()
Comment on lines +180 to +184
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Missing defer cancel() — goroutine lives until server shutdown regardless of whether migration finished

After migrateEphemeralIPOwnerReferences returns, the spawned goroutine is still blocked on <-hookContext.Done(). cancel is never called on the normal exit path (only when the server shuts down), so hookCtx is never signalled as done after migration completes. go vet / staticcheck flag this as a context leak.

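Adding defer cancel() to the goroutine body (as shown in the proposed fix below) does not by itself achieve this: a deferred call runs only when the goroutine returns, and the goroutine still blocks on <-hookContext.Done() until the server shuts down. To cancel hookCtx and let the goroutine exit once migration is complete, also call cancel in the hook function after the migration returns (e.g. defer cancel() right after context.WithCancel), and have the goroutine select on hookCtx.Done() in addition to hookContext.Done().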

♻️ Proposed fix
 		hookCtx, cancel := context.WithCancel(context.Background())
 		go func() {
+			defer cancel()
 			<-hookContext.Done()
-			cancel()
 		}()
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@internal/app/apiserver/apiserver.go` around lines 180 - 184, The spawned
goroutine that waits on hookContext.Done() leaks because cancel() for hookCtx is
never invoked after migrateEphemeralIPOwnerReferences completes; update the
goroutine (the closure that currently does "<-hookContext.Done(); cancel()") to
call defer cancel() at the top of the goroutine body so hookCtx is cancelled and
the goroutine can exit when the migration finishes (references: hookCtx, cancel,
hookContext).


migrateEphemeralIPOwnerReferences(hookCtx, ipClient)
return nil
})
Comment on lines +173 to +188
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Migration runs synchronously — blocks server readiness until all IPs are processed

migrateEphemeralIPOwnerReferences is called directly (line 186). k8s post-start hooks delay the server's /readyz response until the hook function returns. On a cluster with thousands of legacy ephemeral IPs this can cause readiness probe timeouts and restart loops on the first deployment of this version.

The migration is best-effort by design (the TODO comment acknowledges it is temporary), so it is safe to fire it off in a background goroutine and return immediately.

♻️ Proposed fix — run migration asynchronously
 	server.GenericAPIServer.AddPostStartHookOrDie("migrate-ephemeral-ip-owner-references", func(hookContext genericapiserver.PostStartHookContext) error {
 		ipClient, err := v1alpha1client.NewForConfig(hookContext.LoopbackClientConfig)
 		if err != nil {
 			slog.Error("Failed to create client for IP migration", "error", err)
 			return nil
 		}
 
 		hookCtx, cancel := context.WithCancel(context.Background())
 		go func() {
+			defer cancel()
 			<-hookContext.Done()
-			cancel()
 		}()
 
-		migrateEphemeralIPOwnerReferences(hookCtx, ipClient)
+		go migrateEphemeralIPOwnerReferences(hookCtx, ipClient)
 		return nil
 	})
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
server.GenericAPIServer.AddPostStartHookOrDie("migrate-ephemeral-ip-owner-references", func(hookContext genericapiserver.PostStartHookContext) error {
ipClient, err := v1alpha1client.NewForConfig(hookContext.LoopbackClientConfig)
if err != nil {
slog.Error("Failed to create client for IP migration", "error", err)
return nil
}
hookCtx, cancel := context.WithCancel(context.Background())
go func() {
<-hookContext.Done()
cancel()
}()
migrateEphemeralIPOwnerReferences(hookCtx, ipClient)
return nil
})
server.GenericAPIServer.AddPostStartHookOrDie("migrate-ephemeral-ip-owner-references", func(hookContext genericapiserver.PostStartHookContext) error {
ipClient, err := v1alpha1client.NewForConfig(hookContext.LoopbackClientConfig)
if err != nil {
slog.Error("Failed to create client for IP migration", "error", err)
return nil
}
hookCtx, cancel := context.WithCancel(context.Background())
go func() {
defer cancel()
<-hookContext.Done()
}()
go migrateEphemeralIPOwnerReferences(hookCtx, ipClient)
return nil
})
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@internal/app/apiserver/apiserver.go` around lines 173 - 188, The post-start
hook currently calls migrateEphemeralIPOwnerReferences(hookCtx, ipClient)
synchronously which blocks readiness; change it to launch the migration in a
background goroutine so the hook returns immediately: after creating hookCtx and
cancel, start the migration with go migrateEphemeralIPOwnerReferences(hookCtx,
ipClient) (keeping the existing cancel-on-hookContext.Done() logic) and then
return nil from the hook; keep the existing NewForConfig error handling and
logging as-is.


return server.GenericAPIServer.PrepareRun().RunWithContext(ctx)
}

// migrateEphemeralIPOwnerReferences is temporary, best-effort migration code
// for legacy ephemeral IPs. It pages through IPs (500 per List call, using an
// empty namespace argument) and, for every IP that has a controller owner
// reference, sends a merge patch that sets metadata.ownerReferences to an empty
// list and sets the v1alpha1.IPEphemeralLabel label to "true".
//
// Errors are logged and never returned: a failed List ends the migration, a
// failed Patch skips that IP. Once ctx is done the migration stops instead of
// issuing further requests. The number of migrated IPs is logged on every exit
// path when it is greater than zero.
func migrateEphemeralIPOwnerReferences(ctx context.Context, ipClient v1alpha1client.CoreV1alpha1Interface) {
	// The patch body is the same for every IP, so build it once up front
	// rather than re-marshalling it per item.
	patchData, err := json.Marshal(map[string]any{
		"metadata": map[string]any{
			"ownerReferences": []any{},
			"labels": map[string]string{
				v1alpha1.IPEphemeralLabel: "true",
			},
		},
	})
	if err != nil {
		slog.Error("Failed to marshal migration patch", "error", err)
		return
	}

	var (
		continueToken string
		migrated      int
	)

	// Report progress on every exit path so partial work is not lost silently
	// when a later List call fails or the context is cancelled.
	defer func() {
		if migrated > 0 {
			slog.Info("Migrated ephemeral IPs: stripped OwnerReferences and ensured label", "count", migrated)
		}
	}()

	for {
		ipList, err := ipClient.IPs("").List(ctx, metav1.ListOptions{
			Limit:    500,
			Continue: continueToken,
		})
		if err != nil {
			slog.Error("Failed to list IPs for migration", "error", err)
			return
		}

		for i := range ipList.Items {
			// Stop as soon as the context is done; otherwise every remaining
			// IP on this page would issue a doomed Patch and log its own error.
			if ctxErr := ctx.Err(); ctxErr != nil {
				slog.Info("Stopping ephemeral IP migration", "reason", ctxErr)
				return
			}

			ip := &ipList.Items[i]
			// Only IPs that still carry a controller owner reference need migrating.
			if metav1.GetControllerOf(ip) == nil {
				continue
			}

			if _, err := ipClient.IPs(ip.Namespace).Patch(ctx, ip.Name, types.MergePatchType, patchData, metav1.PatchOptions{}); err != nil {
				slog.Error("Failed to migrate IP", "ip", ip.Name, "namespace", ip.Namespace, "error", err)
				continue
			}
			migrated++
		}

		// An empty continue token means the last page has been processed.
		continueToken = ipList.Continue
		if continueToken == "" {
			return
		}
	}
}
27 changes: 17 additions & 10 deletions internal/registry/ipallocator/allocators.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ type Allocators struct {
allocByFamily map[corev1.IPFamily]Interface

gv schema.GroupVersion
kind string
resource string

accessorFor func(obj runtime.Object) (Accessor, error)
Expand All @@ -67,13 +66,12 @@ type Allocators struct {
// NewAllocators returns a new Allocators whose fields are populated directly
// from the given arguments: the per-IP-family allocator map, the group/version,
// the resource name and the function used to obtain an Accessor for an object.
// The map is stored as passed, not copied.
func NewAllocators(
allocByIPFamily map[corev1.IPFamily]Interface,
gv schema.GroupVersion,
kind, resource string,
resource string,
accessorFor func(obj runtime.Object) (Accessor, error),
) *Allocators {
return &Allocators{
allocByFamily: allocByIPFamily,
gv: gv,
kind: kind,
resource: resource,
accessorFor: accessorFor,
}
}
Expand All @@ -100,14 +98,23 @@ func (a *Allocators) allocatorsForRequestIterator(it func(yield func(Request) bo
return allocs, err
}

func (a *Allocators) releaseIPs(allocByIPFamily map[corev1.IPFamily]Interface, namespace string, ips []netip.Addr) ([]netip.Addr, error) {
// claimRefFor builds the IPClaimRef identifying the object behind acc: the
// group comes from the allocators' group/version, the resource from the
// allocators' configured resource name, and the name and UID from acc itself.
func (a *Allocators) claimRefFor(acc Accessor) v1alpha1.IPClaimRef {
	var ref v1alpha1.IPClaimRef

	// Static parts, shared by every object handled by these allocators.
	ref.Group = a.gv.Group
	ref.Resource = a.resource

	// Per-object identity.
	ref.Name = acc.GetName()
	ref.UID = acc.GetUID()

	return ref
}

func (a *Allocators) releaseIPs(allocByIPFamily map[corev1.IPFamily]Interface, namespace string, ips []netip.Addr, claimRef v1alpha1.IPClaimRef) ([]netip.Addr, error) {
var (
released []netip.Addr
errs []error
)
for _, ip := range ips {
alloc := allocByIPFamily[core.IPFamilyForAddr(ip)]
if err := alloc.Release(namespace, ip); err != nil {
if err := alloc.Release(namespace, ip, claimRef); err != nil {
errs = append(errs, err)
continue
}
Expand All @@ -134,7 +141,7 @@ func (a *Allocators) allocateIPs(allocByFamily map[corev1.IPFamily]Interface, ac
return allocated, err
}
} else {
newAddr, err := alloc.AllocateNext(acc.GetNamespace(), claimRef, a.gv.Version, a.kind)
newAddr, err := alloc.AllocateNext(acc.GetNamespace(), claimRef)
if err != nil {
return allocated, err
}
Expand Down Expand Up @@ -182,7 +189,7 @@ func (a *Allocators) AllocateCreate(obj runtime.Object, dryRun bool) (Transactio
return
}

actuallyReleased, err := a.releaseIPs(allocs, acc.GetNamespace(), allocated)
actuallyReleased, err := a.releaseIPs(allocs, acc.GetNamespace(), allocated, a.claimRefFor(acc))
if err != nil {
klog.ErrorS(err, "Error releasing IPs",
"shouldRelease", allocated,
Expand Down Expand Up @@ -247,7 +254,7 @@ func (a *Allocators) AllocateUpdate(obj, oldObj runtime.Object, dryRun bool) (Tr
}

toRelease := toReleaseSet.UnsortedList()
if actuallyReleased, err := a.releaseIPs(allocs, acc.GetNamespace(), toRelease); err != nil {
if actuallyReleased, err := a.releaseIPs(allocs, acc.GetNamespace(), toRelease, a.claimRefFor(oldAcc)); err != nil {
klog.ErrorS(err, "Error releasing IPs",
"shouldRelease", toRelease,
"released", actuallyReleased,
Expand All @@ -259,7 +266,7 @@ func (a *Allocators) AllocateUpdate(obj, oldObj runtime.Object, dryRun bool) (Tr
return
}

if actuallyReleased, err := a.releaseIPs(allocs, acc.GetNamespace(), toReleaseSet.UnsortedList()); err != nil {
if actuallyReleased, err := a.releaseIPs(allocs, acc.GetNamespace(), toReleaseSet.UnsortedList(), a.claimRefFor(acc)); err != nil {
klog.ErrorS(err, "Error releasing IPs",
"shouldRelease", allocated,
"released", actuallyReleased,
Expand Down Expand Up @@ -288,7 +295,7 @@ func (a *Allocators) Release(obj runtime.Object, dryRun bool) {
}

allocated := utilslices.Map(reqs, func(r Request) netip.Addr { return r.Addr })
actuallyReleased, err := a.releaseIPs(allocs, acc.GetNamespace(), allocated)
actuallyReleased, err := a.releaseIPs(allocs, acc.GetNamespace(), allocated, a.claimRefFor(acc))
if err != nil {
klog.ErrorS(err, "Error releasing IPs",
"shouldRelease", allocated,
Expand Down
Loading
Loading