Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/agent-e2e-kind.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ on:
paths:
- cmd/agent/**
- cmd/kubectl-unbounded/**
- internal/cloudprovider/**
- pkg/agent/**
- internal/provision/**
- internal/kube/**
Expand All @@ -26,6 +27,7 @@ on:
paths:
- cmd/agent/**
- cmd/kubectl-unbounded/**
- internal/cloudprovider/**
- pkg/agent/**
- internal/provision/**
- internal/kube/**
Expand Down
6 changes: 3 additions & 3 deletions cmd/machina/machina/controller/ssh_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1100,7 +1100,7 @@ func TestProvisionMachine_ProviderLabelsOverride(t *testing.T) {
Client: fakeClient,
Scheme: s,
ClusterInfo: &ClusterInfo{
Provider: &cloudprovider.AKSProvider{ClusterName: "mc_rg_test_eastus"},
Provider: &cloudprovider.AKSProvider{},
},
}

Expand Down Expand Up @@ -1131,8 +1131,8 @@ func TestProvisionMachine_ProviderLabelsOverride(t *testing.T) {
// Provider label overrides user-specified value.
require.Equal(t, "false", agentConfig.Kubelet.Labels["kubernetes.azure.com/managed"])

// Provider label for cluster name is injected.
require.Equal(t, "mc_rg_test_eastus", agentConfig.Kubelet.Labels["kubernetes.azure.com/cluster"])
// kubernetes.azure.com/cluster must be absent from unbounded-managed nodes.
require.NotContains(t, agentConfig.Kubelet.Labels, "kubernetes.azure.com/cluster")
}

// ---------------------------------------------------------------------------
Expand Down
61 changes: 61 additions & 0 deletions cmd/unbounded-net-controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ import (
"syscall"
"time"

apierrors "k8s.io/apimachinery/pkg/api/errors"

"github.com/spf13/cobra"
"github.com/spf13/pflag"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -76,6 +78,7 @@ func main() {
RequireDashboardAuth: true,
StatusWSKeepaliveInterval: 10 * time.Second,
StatusWSKeepaliveFailureCount: 2,
ManagedKubeProxyEnabled: true,
NodeTokenLifetime: 4 * time.Hour,
ViewerTokenLifetime: 30 * time.Minute,
}
Expand Down Expand Up @@ -129,6 +132,8 @@ on site configuration, and maintain SiteNodeSlice and GatewayPool status.`,
flags.BoolVar(&cfg.RequireDashboardAuth, "require-dashboard-auth", true, "Require authentication and RBAC authorization for dashboard and status endpoints")
flags.DurationVar(&cfg.InformerResyncPeriod, "informer-resync-period", 300*time.Second, "Resync period for Kubernetes informers")
flags.DurationVar(&cfg.KubeProxyHealthInterval, "kube-proxy-health-interval", 30*time.Second, "Interval between kube-proxy health checks on the controller node (0 to disable)")
flags.BoolVar(&cfg.ManagedKubeProxyEnabled, "managed-kube-proxy", true, "Create kube-proxy DaemonSets for unbounded-managed site nodes not covered by provider kube-proxy")
flags.StringVar(&cfg.ManagedKubeProxyImage, "managed-kube-proxy-image", "", "kube-proxy image for managed site DaemonSets (default: registry.k8s.io/kube-proxy:<server version>)")
flags.DurationVar(&cfg.NodeTokenLifetime, "node-token-lifetime", 4*time.Hour, "Lifetime of HMAC tokens issued to node agents")
flags.DurationVar(&cfg.ViewerTokenLifetime, "viewer-token-lifetime", 30*time.Minute, "Lifetime of HMAC tokens issued to dashboard viewers")

Expand Down Expand Up @@ -209,6 +214,14 @@ func applyControllerRuntimeConfig(cmd *cobra.Command, cfg *config.Config, config
}
}

if !flags.Changed("managed-kube-proxy") && runtimeCfg.Controller.ManagedKubeProxy.Enabled != nil {
cfg.ManagedKubeProxyEnabled = *runtimeCfg.Controller.ManagedKubeProxy.Enabled
}

if !flags.Changed("managed-kube-proxy-image") && runtimeCfg.Controller.ManagedKubeProxy.Image != "" {
cfg.ManagedKubeProxyImage = runtimeCfg.Controller.ManagedKubeProxy.Image
}

if !flags.Changed("leader-elect") && runtimeCfg.Controller.LeaderElection.Enabled != nil {
cfg.LeaderElection.Enabled = *runtimeCfg.Controller.LeaderElection.Enabled
}
Expand Down Expand Up @@ -294,6 +307,8 @@ General Flags:
--informer-resync-period duration Resync period for Kubernetes informers (default 5m0s)
--kubeconfig string Path to kubeconfig file (uses in-cluster config if not specified)
--node-agent-health-port int Port where node agents serve their health/status endpoints (default 9998)
--managed-kube-proxy Create kube-proxy DaemonSets for unbounded-managed site nodes not covered by provider kube-proxy (default true)
--managed-kube-proxy-image string kube-proxy image for managed site DaemonSets
--status-stale-threshold duration Duration after which a node's pushed status is considered stale (default 90s)
--status-ws-keepalive-interval duration Interval between websocket keepalive pings on controller node status streams (0 to disable) (default 10s)
--status-ws-keepalive-failure-count int Sequential websocket keepalive ping failures before closing node status websocket (default 2)
Expand Down Expand Up @@ -573,6 +588,29 @@ func run(cfg *config.Config, forceNotLeader bool) error {
}()
}

if cfg.ManagedKubeProxyEnabled {
image := cfg.ManagedKubeProxyImage
if image == "" {
image = defaultKubeProxyImage(ctx, clientset)
}

kubeProxyCtrl, err := controller.NewManagedKubeProxyController(clientset, dynamicInformerFactory, informerFactory, controller.ManagedKubeProxyOptions{
Namespace: controllerNamespace,
Image: image,
})
if err != nil {
klog.Errorf("Failed to create managed kube-proxy controller: %v", err)
} else {
go func() {
if err := kubeProxyCtrl.Run(ctx, 2); err != nil {
klog.Errorf("Managed kube-proxy controller error: %v", err)
}
}()
}
} else {
klog.Info("Managed kube-proxy controller disabled")
}

// Create and start gateway pool controller (shares the node informer factory)
gatewayPoolCtrl, err := controller.NewGatewayPoolController(clientset, dynamicClient, dynamicInformerFactory, informerFactory)
if err != nil {
Expand Down Expand Up @@ -627,6 +665,29 @@ func run(cfg *config.Config, forceNotLeader bool) error {
return nil
}

// defaultKubeProxyImage returns the kube-proxy image to use for managed site
// DaemonSets when no explicit image is configured.
//
// Preference order:
//  1. The image of the existing kube-system/kube-proxy DaemonSet, so managed
//     site nodes run the same kube-proxy build as provider-managed nodes.
//  2. registry.k8s.io/kube-proxy:<server GitVersion>, using the API server's
//     discovered version.
//  3. registry.k8s.io/kube-proxy:latest if version discovery also fails.
func defaultKubeProxyImage(ctx context.Context, clientset kubernetes.Interface) string {
	// Prefer the provider's own kube-proxy image when one exists; this keeps
	// unbounded-managed nodes consistent with provider-managed nodes.
	ds, err := clientset.AppsV1().DaemonSets("kube-system").Get(ctx, "kube-proxy", metav1.GetOptions{})
	if err == nil {
		for _, container := range ds.Spec.Template.Spec.Containers {
			if container.Name == "kube-proxy" && container.Image != "" {
				return container.Image
			}
		}
	} else if !apierrors.IsNotFound(err) {
		// NotFound is expected on clusters without a provider kube-proxy
		// DaemonSet; only warn on other errors.
		klog.Warningf("Failed to read kube-system/kube-proxy for managed kube-proxy image: %v", err)
	}

	// Fall back to the upstream image tagged with the server version. The
	// discovery call is made only when actually needed, unlike before when it
	// ran on every invocation even if the DaemonSet image was used.
	version := "latest"
	if serverVersion, err := clientset.Discovery().ServerVersion(); err != nil {
		klog.Warningf("Failed to discover Kubernetes server version for managed kube-proxy image: %v", err)
	} else if serverVersion.GitVersion != "" {
		version = serverVersion.GitVersion
	}

	return "registry.k8s.io/kube-proxy:" + version
}

// injectCABundle updates the webhook and APIService configurations with the
// controller's self-signed CA bundle so the API server can verify TLS
// connections to the webhook/aggregated API endpoints.
Expand Down
3 changes: 3 additions & 0 deletions deploy/net/01-configmap.yaml.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ data:
statusWsKeepaliveFailureCount: {{ default "3" .ControllerStatusWsKeepaliveFailureCount }}
registerAggregatedAPIServer: {{ default "true" .ControllerRegisterAggregatedAPIServer }}
kubeProxyHealthInterval: "{{ default "30s" .ControllerKubeProxyHealthInterval }}"
managedKubeProxy:
enabled: {{ default "true" .ControllerManagedKubeProxyEnabled }}
image: "{{ default "" .ControllerManagedKubeProxyImage }}"
leaderElection:
enabled: {{ default "true" .ControllerLeaderElectionEnabled }}
leaseDuration: "{{ default "30s" .ControllerLeaderElectionLeaseDuration }}"
Expand Down
11 changes: 11 additions & 0 deletions deploy/net/controller/01-serviceaccount.yaml.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,14 @@ metadata:
app.kubernetes.io/name: unbounded-net-controller
app.kubernetes.io/component: controller
automountServiceAccountToken: true

---
apiVersion: v1
kind: ServiceAccount
metadata:
name: unbounded-net-kube-proxy
namespace: {{ .Namespace }}
labels:
app.kubernetes.io/name: unbounded-net-kube-proxy
app.kubernetes.io/component: kube-proxy
automountServiceAccountToken: true
21 changes: 21 additions & 0 deletions deploy/net/controller/02-rbac.yaml.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ rules:
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "list", "watch", "patch", "update"]
# DaemonSets: read provider kube-proxy DaemonSets and manage unbounded-owned
# per-site kube-proxy DaemonSets.
- apiGroups: ["apps"]
resources: ["daemonsets"]
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
# Events: emit events about cluster-scoped objects (Nodes)
- apiGroups: [""]
resources: ["events"]
Expand Down Expand Up @@ -68,6 +73,22 @@ rules:
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: unbounded-net-kube-proxy
labels:
app.kubernetes.io/name: unbounded-net-kube-proxy
app.kubernetes.io/component: kube-proxy
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:node-proxier
subjects:
- kind: ServiceAccount
name: unbounded-net-kube-proxy
namespace: {{ .Namespace }}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: unbounded-net-controller
labels:
Expand Down
159 changes: 159 additions & 0 deletions designs/managed-kube-proxy.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# Managed kube-proxy for Unbounded Sites

## Problem

Unbounded worker nodes can join a Kubernetes cluster outside the cloud
provider's managed node pools. On AKS, the provider-owned `kube-proxy`
DaemonSet selects AKS nodes with provider labels such as
`kubernetes.azure.com/cluster`. Externally joined unbounded nodes do not match
that selector, so no kube-proxy process programs ClusterIP service rules on
those hosts.

The unbounded-net node agent can still route to real pod and host endpoints, but
ClusterIP addresses do not work. This breaks direct status push to the
unbounded-net controller service because traffic to the service IP is never
DNATed to the controller endpoint.

## Goals

- Run kube-proxy on unbounded-managed site nodes that are not covered by the
cluster provider's kube-proxy DaemonSet.
- Avoid running two kube-proxy instances on the same node.
- Preserve provider-owned kube-proxy DaemonSets, especially managed AKS
resources that may be reconciled by addon managers.
- Keep kube-proxy configuration site-aware so local traffic detection uses the
site's pod CIDR.

## Non-goals

- Replacing provider-owned kube-proxy on managed cluster nodes.
- Supporting one kube-proxy process with multiple unrelated IPv4 pod CIDRs.
Kubernetes kube-proxy validates `--cluster-cidr` as either a single CIDR or a
dual-stack pair, so multiple IPv4 site CIDRs require separate DaemonSets.

## Behavior

The unbounded-net controller manages one kube-proxy DaemonSet per Site:

```text
unbounded-net-kube-proxy-<site>
```

Each DaemonSet is scheduled only to nodes with both labels:

```text
net.unbounded-cloud.io/site=<site>
net.unbounded-cloud.io/kube-proxy=managed
```

The controller adds `net.unbounded-cloud.io/kube-proxy=managed` to nodes when:

- the node has a canonical unbounded site label, and
- the node is not an AKS/provider-managed node. Currently this excludes nodes
with `kubernetes.azure.com/cluster` or `kubernetes.azure.com/managedby`, and
- no provider-owned kube-proxy DaemonSet appears to cover the node.

The controller removes the marker when those conditions stop being true.

Provider kube-proxy DaemonSets are identified by their kube-proxy container
name and image; DaemonSets that are unbounded-owned are excluded from this
detection. For each identified provider DaemonSet, the controller evaluates its
node selector and required node affinity against each node. Nodes already
covered by a provider kube-proxy DaemonSet are not labeled for
unbounded-managed kube-proxy.

The controller owns only DaemonSets labeled
`app.kubernetes.io/name=unbounded-net-kube-proxy`. It does not modify provider
DaemonSets such as AKS `kube-system/kube-proxy`.

Sites without an enabled pod CIDR assignment are skipped because kube-proxy
requires a valid `--cluster-cidr` for `ClusterCIDR` local traffic detection.
Unbounded-owned DaemonSets whose site no longer exists are deleted.

## DaemonSet Template

The managed DaemonSet uses the cluster's existing `kube-system/kube-proxy` image
when available. If that DaemonSet does not exist, it falls back to
`registry.k8s.io/kube-proxy:<server version>`.

Image selection can be overridden with:

```yaml
controller:
managedKubeProxy:
image: <image>
```

Managed kube-proxy can be disabled with:

```yaml
controller:
managedKubeProxy:
enabled: false
```

The equivalent controller flags are:

```text
--managed-kube-proxy=false
--managed-kube-proxy-image=<image>
```

The pod runs with:

- `hostNetwork: true`
- `system-node-critical` priority
- privileged security context
- broad tolerations
- `/run/xtables.lock`, `/etc/sysctl.d`, and `/lib/modules` host mounts
- `kubernetes.azure.com/set-kube-service-host-fqdn: "true"`

The AKS service host FQDN annotation is important for bootstrapping. Before
kube-proxy programs service rules, `KUBERNETES_SERVICE_HOST=10.0.0.1` may be
unreachable from the node. The FQDN override lets kube-proxy contact the API
server without relying on ClusterIP service NAT.

An init container runs before kube-proxy to set `nf_conntrack_max` using the
same CPU-scaled floor used by AKS kube-proxy. This preserves the provider's
expected conntrack sizing on externally joined nodes that do not run the
provider-owned DaemonSet.

The kube-proxy command uses the first enabled IPv4 pod CIDR for the Site, plus
the first enabled IPv6 pod CIDR if present:

```text
--cluster-cidr=<site IPv4>[,<site IPv6>]
--detect-local-mode=ClusterCIDR
```

For the `test` site this is:

```text
--cluster-cidr=100.125.0.0/16
```

## RBAC

The unbounded-net controller needs cluster-wide DaemonSet read/write access to
detect provider kube-proxy and manage unbounded-owned DaemonSets.

Managed kube-proxy pods use a dedicated service account:

```text
unbounded-net/unbounded-net-kube-proxy
```

That service account is bound to the built-in `system:node-proxier` ClusterRole.

The controller service account also needs cluster-wide DaemonSet permissions so
it can detect provider kube-proxy DaemonSets and create/update/delete the
unbounded-owned per-site DaemonSets.

## Operational Notes

- Deleting a Site deletes or stops updating its managed kube-proxy DaemonSet.
- Changing a Site's pod CIDR assignment updates the corresponding DaemonSet and
rolls kube-proxy for that site.
- If a provider later starts covering an unbounded node, the controller removes
the managed marker label, and the unbounded-owned kube-proxy pod drains from
that node.
- A single DaemonSet for all sites was tested but rejected because kube-proxy
does not accept multiple IPv4 `--cluster-cidr` values.
Original file line number Diff line number Diff line change
Expand Up @@ -96,14 +96,8 @@ done <<< "$token_names"
[[ -n "$token_id" && -n "$token_secret" ]] || die "no valid bootstrap token found in kube-system secrets"
bootstrap_token="${token_id}.${token_secret}"

# --- NODE RESOURCE GROUP (for Labels) ---
cluster_rg=$(kubectl get nodes -o jsonpath='{.items[0].metadata.labels.kubernetes\.azure\.com/cluster}' 2>/dev/null) || true

# --- Build labels JSON object ---
labels_json="\"kubernetes.azure.com/managed\": \"false\""
if [[ -n "$cluster_rg" ]]; then
labels_json+=", \"kubernetes.azure.com/cluster\": \"${cluster_rg}\""
fi

# --- Build taints JSON array ---
# REGISTER_WITH_TAINTS is optional; split comma-separated entries into a JSON array.
Expand Down
Loading
Loading