diff --git a/.github/workflows/agent-e2e-kind.yaml b/.github/workflows/agent-e2e-kind.yaml index 14172cab..d9018452 100644 --- a/.github/workflows/agent-e2e-kind.yaml +++ b/.github/workflows/agent-e2e-kind.yaml @@ -8,6 +8,7 @@ on: paths: - cmd/agent/** - cmd/kubectl-unbounded/** + - internal/cloudprovider/** - pkg/agent/** - internal/provision/** - internal/kube/** @@ -26,6 +27,7 @@ on: paths: - cmd/agent/** - cmd/kubectl-unbounded/** + - internal/cloudprovider/** - pkg/agent/** - internal/provision/** - internal/kube/** diff --git a/cmd/machina/machina/controller/ssh_integration_test.go b/cmd/machina/machina/controller/ssh_integration_test.go index f17dffad..4fc284a6 100644 --- a/cmd/machina/machina/controller/ssh_integration_test.go +++ b/cmd/machina/machina/controller/ssh_integration_test.go @@ -1100,7 +1100,7 @@ func TestProvisionMachine_ProviderLabelsOverride(t *testing.T) { Client: fakeClient, Scheme: s, ClusterInfo: &ClusterInfo{ - Provider: &cloudprovider.AKSProvider{ClusterName: "mc_rg_test_eastus"}, + Provider: &cloudprovider.AKSProvider{}, }, } @@ -1131,8 +1131,8 @@ func TestProvisionMachine_ProviderLabelsOverride(t *testing.T) { // Provider label overrides user-specified value. require.Equal(t, "false", agentConfig.Kubelet.Labels["kubernetes.azure.com/managed"]) - // Provider label for cluster name is injected. - require.Equal(t, "mc_rg_test_eastus", agentConfig.Kubelet.Labels["kubernetes.azure.com/cluster"]) + // kubernetes.azure.com/cluster must be absent from unbounded-managed nodes. + require.NotContains(t, agentConfig.Kubelet.Labels, "kubernetes.azure.com/cluster") } // --------------------------------------------------------------------------- diff --git a/cmd/unbounded-net-controller/main.go b/cmd/unbounded-net-controller/main.go index 76a48197..7e4c97d1 100644 --- a/cmd/unbounded-net-controller/main.go +++ b/cmd/unbounded-net-controller/main.go @@ -12,6 +12,8 @@ import ( "syscall" "time" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "github.com/spf13/cobra" "github.com/spf13/pflag" corev1 "k8s.io/api/core/v1" @@ -76,6 +78,7 @@ func main() { RequireDashboardAuth: true, StatusWSKeepaliveInterval: 10 * time.Second, StatusWSKeepaliveFailureCount: 2, + ManagedKubeProxyEnabled: true, NodeTokenLifetime: 4 * time.Hour, ViewerTokenLifetime: 30 * time.Minute, } @@ -129,6 +132,8 @@ on site configuration, and maintain SiteNodeSlice and GatewayPool status.`, flags.BoolVar(&cfg.RequireDashboardAuth, "require-dashboard-auth", true, "Require authentication and RBAC authorization for dashboard and status endpoints") flags.DurationVar(&cfg.InformerResyncPeriod, "informer-resync-period", 300*time.Second, "Resync period for Kubernetes informers") flags.DurationVar(&cfg.KubeProxyHealthInterval, "kube-proxy-health-interval", 30*time.Second, "Interval between kube-proxy health checks on the controller node (0 to disable)") + flags.BoolVar(&cfg.ManagedKubeProxyEnabled, "managed-kube-proxy", true, "Create kube-proxy DaemonSets for unbounded-managed site nodes not covered by provider kube-proxy") + flags.StringVar(&cfg.ManagedKubeProxyImage, "managed-kube-proxy-image", "", "kube-proxy image for managed site DaemonSets (default: registry.k8s.io/kube-proxy:<server version>)") flags.DurationVar(&cfg.NodeTokenLifetime, "node-token-lifetime", 4*time.Hour, "Lifetime of HMAC tokens issued to node agents") flags.DurationVar(&cfg.ViewerTokenLifetime, "viewer-token-lifetime", 30*time.Minute, "Lifetime of HMAC tokens issued to dashboard viewers") @@ -209,6 +214,14 @@ func applyControllerRuntimeConfig(cmd
*cobra.Command, cfg *config.Config, config } } + if !flags.Changed("managed-kube-proxy") && runtimeCfg.Controller.ManagedKubeProxy.Enabled != nil { + cfg.ManagedKubeProxyEnabled = *runtimeCfg.Controller.ManagedKubeProxy.Enabled + } + + if !flags.Changed("managed-kube-proxy-image") && runtimeCfg.Controller.ManagedKubeProxy.Image != "" { + cfg.ManagedKubeProxyImage = runtimeCfg.Controller.ManagedKubeProxy.Image + } + if !flags.Changed("leader-elect") && runtimeCfg.Controller.LeaderElection.Enabled != nil { cfg.LeaderElection.Enabled = *runtimeCfg.Controller.LeaderElection.Enabled } @@ -294,6 +307,8 @@ General Flags: --informer-resync-period duration Resync period for Kubernetes informers (default 5m0s) --kubeconfig string Path to kubeconfig file (uses in-cluster config if not specified) --node-agent-health-port int Port where node agents serve their health/status endpoints (default 9998) + --managed-kube-proxy Create kube-proxy DaemonSets for unbounded-managed site nodes not covered by provider kube-proxy (default true) + --managed-kube-proxy-image string kube-proxy image for managed site DaemonSets --status-stale-threshold duration Duration after which a node's pushed status is considered stale (default 90s) --status-ws-keepalive-interval duration Interval between websocket keepalive pings on controller node status streams (0 to disable) (default 10s) --status-ws-keepalive-failure-count int Sequential websocket keepalive ping failures before closing node status websocket (default 2) @@ -573,6 +588,29 @@ func run(cfg *config.Config, forceNotLeader bool) error { }() } + if cfg.ManagedKubeProxyEnabled { + image := cfg.ManagedKubeProxyImage + if image == "" { + image = defaultKubeProxyImage(ctx, clientset) + } + + kubeProxyCtrl, err := controller.NewManagedKubeProxyController(clientset, dynamicInformerFactory, informerFactory, controller.ManagedKubeProxyOptions{ + Namespace: controllerNamespace, + Image: image, + }) + if err != nil { + klog.Errorf("Failed to create managed kube-proxy controller: %v", err) + } else { + go func() { + if err := kubeProxyCtrl.Run(ctx, 2); err != nil { + klog.Errorf("Managed kube-proxy controller error: %v", err) + } + }() + } + } else { + klog.Info("Managed kube-proxy controller disabled") + } + // Create and start gateway pool controller (shares the node informer factory) gatewayPoolCtrl, err := controller.NewGatewayPoolController(clientset, dynamicClient, dynamicInformerFactory, informerFactory) if err != nil { @@ -627,6 +665,29 @@ func run(cfg *config.Config, forceNotLeader bool) error { return nil } +func defaultKubeProxyImage(ctx context.Context, clientset kubernetes.Interface) string { + version := "latest" + + if serverVersion, err := clientset.Discovery().ServerVersion(); err != nil { + klog.Warningf("Failed to discover Kubernetes server version for managed kube-proxy image: %v", err) + } else if serverVersion.GitVersion != "" { + version = serverVersion.GitVersion + } + + ds, err := clientset.AppsV1().DaemonSets("kube-system").Get(ctx, "kube-proxy", metav1.GetOptions{}) + if err == nil { + for _, container := range ds.Spec.Template.Spec.Containers { + if container.Name == "kube-proxy" && container.Image != "" { + return container.Image + } + } + } else if !apierrors.IsNotFound(err) { + klog.Warningf("Failed to read kube-system/kube-proxy for managed kube-proxy image: %v", err) + } + + return "registry.k8s.io/kube-proxy:" + version +} + // injectCABundle updates the webhook and APIService configurations with the // controller's self-signed CA bundle so the 
API server can verify TLS // connections to the webhook/aggregated API endpoints. diff --git a/deploy/net/01-configmap.yaml.tmpl b/deploy/net/01-configmap.yaml.tmpl index 10bca7ba..52868537 100644 --- a/deploy/net/01-configmap.yaml.tmpl +++ b/deploy/net/01-configmap.yaml.tmpl @@ -24,6 +24,9 @@ data: statusWsKeepaliveFailureCount: {{ default "3" .ControllerStatusWsKeepaliveFailureCount }} registerAggregatedAPIServer: {{ default "true" .ControllerRegisterAggregatedAPIServer }} kubeProxyHealthInterval: "{{ default "30s" .ControllerKubeProxyHealthInterval }}" + managedKubeProxy: + enabled: {{ default "true" .ControllerManagedKubeProxyEnabled }} + image: "{{ default "" .ControllerManagedKubeProxyImage }}" leaderElection: enabled: {{ default "true" .ControllerLeaderElectionEnabled }} leaseDuration: "{{ default "30s" .ControllerLeaderElectionLeaseDuration }}" diff --git a/deploy/net/controller/01-serviceaccount.yaml.tmpl b/deploy/net/controller/01-serviceaccount.yaml.tmpl index 6a8261b1..bee3adc3 100644 --- a/deploy/net/controller/01-serviceaccount.yaml.tmpl +++ b/deploy/net/controller/01-serviceaccount.yaml.tmpl @@ -11,3 +11,14 @@ metadata: app.kubernetes.io/name: unbounded-net-controller app.kubernetes.io/component: controller automountServiceAccountToken: true + +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: unbounded-net-kube-proxy + namespace: {{ .Namespace }} + labels: + app.kubernetes.io/name: unbounded-net-kube-proxy + app.kubernetes.io/component: kube-proxy +automountServiceAccountToken: true diff --git a/deploy/net/controller/02-rbac.yaml.tmpl b/deploy/net/controller/02-rbac.yaml.tmpl index 63309ca1..567e742f 100644 --- a/deploy/net/controller/02-rbac.yaml.tmpl +++ b/deploy/net/controller/02-rbac.yaml.tmpl @@ -18,6 +18,11 @@ rules: - apiGroups: [""] resources: ["nodes"] verbs: ["get", "list", "watch", "patch", "update"] + # DaemonSets: read provider kube-proxy DaemonSets and manage unbounded-owned + # per-site kube-proxy DaemonSets. + - apiGroups: ["apps"] + resources: ["daemonsets"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] # Events: emit events about cluster-scoped objects (Nodes) - apiGroups: [""] resources: ["events"] @@ -68,6 +73,22 @@ rules: --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding +metadata: + name: unbounded-net-kube-proxy + labels: + app.kubernetes.io/name: unbounded-net-kube-proxy + app.kubernetes.io/component: kube-proxy +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:node-proxier +subjects: + - kind: ServiceAccount + name: unbounded-net-kube-proxy + namespace: {{ .Namespace }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding metadata: name: unbounded-net-controller labels: diff --git a/designs/managed-kube-proxy.md b/designs/managed-kube-proxy.md new file mode 100644 index 00000000..8475c8a8 --- /dev/null +++ b/designs/managed-kube-proxy.md @@ -0,0 +1,159 @@ +# Managed kube-proxy for Unbounded Sites + +## Problem + +Unbounded worker nodes can join a Kubernetes cluster outside the cloud +provider's managed node pools. On AKS, the provider-owned `kube-proxy` +DaemonSet selects AKS nodes with provider labels such as +`kubernetes.azure.com/cluster`. Externally joined unbounded nodes do not match +that selector, so no kube-proxy process programs ClusterIP service rules on +those hosts. + +The unbounded-net node agent can still route to real pod and host endpoints, but +ClusterIP addresses do not work. 
This breaks direct status push to the +unbounded-net controller service because traffic to the service IP is never +DNATed to the controller endpoint. + +## Goals + +- Run kube-proxy on unbounded-managed site nodes that are not covered by the + cluster provider's kube-proxy DaemonSet. +- Avoid running two kube-proxy instances on the same node. +- Preserve provider-owned kube-proxy DaemonSets, especially managed AKS + resources that may be reconciled by addon managers. +- Keep kube-proxy configuration site-aware so local traffic detection uses the + site's pod CIDR. + +## Non-goals + +- Replacing provider-owned kube-proxy on managed cluster nodes. +- Supporting one kube-proxy process with multiple unrelated IPv4 pod CIDRs. + Kubernetes kube-proxy validates `--cluster-cidr` as either a single CIDR or a + dual-stack pair, so multiple IPv4 site CIDRs require separate DaemonSets. + +## Behavior + +The unbounded-net controller manages one kube-proxy DaemonSet per Site: + +```text +unbounded-net-kube-proxy-<site name> +``` + +Each DaemonSet is scheduled only to nodes with both labels: + +```text +net.unbounded-cloud.io/site=<site name> +net.unbounded-cloud.io/kube-proxy=managed +``` + +The controller adds `net.unbounded-cloud.io/kube-proxy=managed` to nodes when: + +- the node has a canonical unbounded site label, and +- the node is not an AKS/provider-managed node. Currently this excludes nodes + with `kubernetes.azure.com/cluster` or `kubernetes.azure.com/managedby`, and +- no provider-owned kube-proxy DaemonSet appears to cover the node. + +The controller removes the marker when those conditions stop being true. + +A DaemonSet is treated as a provider kube-proxy DaemonSet if it has a kube-proxy +container (matched by container name or image) and is not unbounded-owned. The +controller evaluates the node selectors and required node affinity of those +DaemonSets against each node. Nodes already matched by provider kube-proxy are +not labeled for unbounded-managed kube-proxy. + +The controller owns only DaemonSets labeled +`app.kubernetes.io/name=unbounded-net-kube-proxy`. It does not modify provider +DaemonSets such as AKS `kube-system/kube-proxy`. + +Sites without an enabled pod CIDR assignment are skipped because kube-proxy +requires a valid `--cluster-cidr` for `ClusterCIDR` local traffic detection. +Unbounded-owned DaemonSets whose site no longer exists are deleted. + +## DaemonSet Template + +The managed DaemonSet uses the cluster's existing `kube-system/kube-proxy` image +when available. If that DaemonSet does not exist, it falls back to +`registry.k8s.io/kube-proxy:<server version>`. + +Image selection can be overridden with: + +```yaml +controller: + managedKubeProxy: + image: <kube-proxy image> +``` + +Managed kube-proxy can be disabled with: + +```yaml +controller: + managedKubeProxy: + enabled: false +``` + +The equivalent controller flags are: + +```text +--managed-kube-proxy=false +--managed-kube-proxy-image=<kube-proxy image> +``` + +The pod runs with: + +- `hostNetwork: true` +- `system-node-critical` priority +- privileged security context +- broad tolerations +- `/run/xtables.lock`, `/etc/sysctl.d`, and `/lib/modules` host mounts +- `kubernetes.azure.com/set-kube-service-host-fqdn: "true"` + +The AKS service host FQDN annotation is important for bootstrapping. Before +kube-proxy programs service rules, `KUBERNETES_SERVICE_HOST=10.0.0.1` may be +unreachable from the node. The FQDN override lets kube-proxy contact the API +server without relying on ClusterIP service NAT. + +An init container runs before kube-proxy to set `nf_conntrack_max` using the +same CPU-scaled floor used by AKS kube-proxy.
This preserves the provider's +expected conntrack sizing on externally joined nodes that do not run the +provider-owned DaemonSet. + +The kube-proxy command uses the first enabled IPv4 pod CIDR for the Site, plus +the first enabled IPv6 pod CIDR if present: + +```text +--cluster-cidr=<ipv4 pod CIDR>[,<ipv6 pod CIDR>] +--detect-local-mode=ClusterCIDR +``` + +For the `test` site this is: + +```text +--cluster-cidr=100.125.0.0/16 +``` + +## RBAC + +The unbounded-net controller service account needs cluster-wide DaemonSet +read/write access so it can detect provider kube-proxy DaemonSets and +create/update/delete the unbounded-owned per-site DaemonSets. + +Managed kube-proxy pods use a dedicated service account: + +```text +unbounded-net/unbounded-net-kube-proxy +``` + +That service account is bound to the built-in `system:node-proxier` ClusterRole. + +## Operational Notes + +- Deleting a Site deletes or stops updating its managed kube-proxy DaemonSet. +- Changing a Site's pod CIDR assignment updates the corresponding DaemonSet and + rolls kube-proxy for that site. +- If a provider later starts covering an unbounded node, the controller removes + the managed marker label, and the unbounded-owned kube-proxy pod drains from + that node. +- A single DaemonSet for all sites was tested but rejected because kube-proxy + does not accept multiple IPv4 `--cluster-cidr` values. diff --git a/hack/agent/skills/unbounded-agent-qemu-vm-e2e/scripts/aks-config.sh b/hack/agent/skills/unbounded-agent-qemu-vm-e2e/scripts/aks-config.sh index b80837a3..c5d65845 100755 --- a/hack/agent/skills/unbounded-agent-qemu-vm-e2e/scripts/aks-config.sh +++ b/hack/agent/skills/unbounded-agent-qemu-vm-e2e/scripts/aks-config.sh @@ -96,14 +96,8 @@ done <<< "$token_names" [[ -n "$token_id" && -n "$token_secret" ]] || die "no valid bootstrap token found in kube-system secrets" bootstrap_token="${token_id}.${token_secret}" -# --- NODE RESOURCE GROUP (for Labels) --- -cluster_rg=$(kubectl get nodes -o jsonpath='{.items[0].metadata.labels.kubernetes\.azure\.com/cluster}' 2>/dev/null) || true - # --- Build labels JSON object --- labels_json="\"kubernetes.azure.com/managed\": \"false\"" -if [[ -n "$cluster_rg" ]]; then - labels_json+=", \"kubernetes.azure.com/cluster\": \"${cluster_rg}\"" -fi # --- Build taints JSON array --- # REGISTER_WITH_TAINTS is optional; split comma-separated entries into a JSON array. diff --git a/internal/cloudprovider/provider.go b/internal/cloudprovider/provider.go index f5f1607a..419e6d52 100644 --- a/internal/cloudprovider/provider.go +++ b/internal/cloudprovider/provider.go @@ -32,11 +32,7 @@ type Provider interface { } // AKSProvider implements Provider for Azure Kubernetes Service clusters. -type AKSProvider struct { - // ClusterName is the value of the kubernetes.azure.com/cluster label - // read from a system-mode node. - ClusterName string -} +type AKSProvider struct{} func (p *AKSProvider) ID() string { return "microsoft-aks" @@ -45,7 +41,6 @@ func (p *AKSProvider) ID() string { func (p *AKSProvider) DefaultLabels() map[string]string { return map[string]string{ "kubernetes.azure.com/managed": "false", - "kubernetes.azure.com/cluster": p.ClusterName, } } @@ -69,26 +64,5 @@ func DetectProvider(ctx context.Context, k kubernetes.Interface) (Provider, erro logger.Info("AKS provider detected") - // Read the kubernetes.azure.com/cluster label from a system-mode node.
- nodes, err := k.CoreV1().Nodes().List(ctx, metav1.ListOptions{ - LabelSelector: "kubernetes.azure.com/mode=system", - Limit: 1, - }) - if err != nil { - return nil, fmt.Errorf("list system-mode nodes: %w", err) - } - - if len(nodes.Items) == 0 { - return nil, fmt.Errorf("AKS detected but no nodes found with label kubernetes.azure.com/mode=system") - } - - clusterName, ok := nodes.Items[0].Labels["kubernetes.azure.com/cluster"] - if !ok || clusterName == "" { - return nil, fmt.Errorf("AKS detected but system-mode node %q is missing kubernetes.azure.com/cluster label", - nodes.Items[0].Name) - } - - logger.Info("Resolved AKS cluster name", "clusterName", clusterName) - - return &AKSProvider{ClusterName: clusterName}, nil + return &AKSProvider{}, nil } diff --git a/internal/cloudprovider/provider_test.go b/internal/cloudprovider/provider_test.go index dabe40c4..dc443056 100644 --- a/internal/cloudprovider/provider_test.go +++ b/internal/cloudprovider/provider_test.go @@ -23,15 +23,6 @@ func TestDetectProvider_AKS(t *testing.T) { Namespace: metav1.NamespacePublic, }, }, - &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "aks-system-0", - Labels: map[string]string{ - "kubernetes.azure.com/mode": "system", - "kubernetes.azure.com/cluster": "mc_rg_my-cluster_eastus", - }, - }, - }, ) provider, err := DetectProvider(context.Background(), kubeCli) @@ -41,7 +32,7 @@ func TestDetectProvider_AKS(t *testing.T) { labels := provider.DefaultLabels() require.Equal(t, "false", labels["kubernetes.azure.com/managed"]) - require.Equal(t, "mc_rg_my-cluster_eastus", labels["kubernetes.azure.com/cluster"]) + require.NotContains(t, labels, "kubernetes.azure.com/cluster") } func TestDetectProvider_NotAKS(t *testing.T) { @@ -58,6 +49,8 @@ func TestDetectProvider_NotAKS(t *testing.T) { func TestDetectProvider_AKS_NoSystemNodes(t *testing.T) { t.Parallel() + // AKS detection no longer requires system-mode nodes; only the + // aks-cluster-metadata ConfigMap is needed. kubeCli := fake.NewClientset( &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ @@ -69,49 +62,22 @@ func TestDetectProvider_AKS_NoSystemNodes(t *testing.T) { ) provider, err := DetectProvider(context.Background(), kubeCli) - require.Error(t, err) - require.Contains(t, err.Error(), "no nodes found") - require.Nil(t, provider) -} - -func TestDetectProvider_AKS_MissingClusterLabel(t *testing.T) { - t.Parallel() - - kubeCli := fake.NewClientset( - &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: "aks-cluster-metadata", - Namespace: metav1.NamespacePublic, - }, - }, - &corev1.Node{ - ObjectMeta: metav1.ObjectMeta{ - Name: "aks-system-0", - Labels: map[string]string{ - "kubernetes.azure.com/mode": "system", - // Missing kubernetes.azure.com/cluster label. 
- }, - }, - }, - ) - - provider, err := DetectProvider(context.Background(), kubeCli) - require.Error(t, err) - require.Contains(t, err.Error(), "missing kubernetes.azure.com/cluster label") - require.Nil(t, provider) + require.NoError(t, err) + require.NotNil(t, provider) + require.Equal(t, "microsoft-aks", provider.ID()) } func TestAKSProvider_DefaultLabels(t *testing.T) { t.Parallel() - p := &AKSProvider{ClusterName: "mc_rg_test_eastus"} + p := &AKSProvider{} require.Equal(t, "microsoft-aks", p.ID()) labels := p.DefaultLabels() - require.Len(t, labels, 2) + require.Len(t, labels, 1) require.Equal(t, "false", labels["kubernetes.azure.com/managed"]) - require.Equal(t, "mc_rg_test_eastus", labels["kubernetes.azure.com/cluster"]) + require.NotContains(t, labels, "kubernetes.azure.com/cluster") } func TestCommonDefaultLabels(t *testing.T) { diff --git a/internal/net/config/config.go b/internal/net/config/config.go index 663631d7..0b904952 100644 --- a/internal/net/config/config.go +++ b/internal/net/config/config.go @@ -51,6 +51,14 @@ type Config struct { // KubeProxyHealthInterval is the interval between kube-proxy health checks on the controller node. // Set to 0 to disable the check. KubeProxyHealthInterval time.Duration + // ManagedKubeProxyEnabled controls whether the controller creates kube-proxy + // DaemonSets for unbounded-managed site nodes that are not covered by the + // cloud provider's kube-proxy DaemonSet. + ManagedKubeProxyEnabled bool + // ManagedKubeProxyImage overrides the kube-proxy image used by managed + // per-site DaemonSets. Empty means reuse kube-system/kube-proxy if present, + // otherwise derive registry.k8s.io/kube-proxy from the Kubernetes server version. + ManagedKubeProxyImage string // NetlinkResyncPeriod is the interval between full netlink cache resyncs on node agents. NetlinkResyncPeriod time.Duration // NodeTokenLifetime is the lifetime of HMAC tokens issued to node agents. diff --git a/internal/net/config/runtime_config.go b/internal/net/config/runtime_config.go index 62de1535..d6f0bc40 100644 --- a/internal/net/config/runtime_config.go +++ b/internal/net/config/runtime_config.go @@ -27,16 +27,23 @@ type CommonRuntimeConfig struct { // ControllerRuntimeConfig contains controller-specific runtime settings. 
type ControllerRuntimeConfig struct { - InformerResyncPeriod string `yaml:"informerResyncPeriod"` - HealthPort *int `yaml:"healthPort"` - NodeAgentHealthPort *int `yaml:"nodeAgentHealthPort"` - StatusStaleThreshold string `yaml:"statusStaleThreshold"` - StatusWSKeepaliveInterval string `yaml:"statusWebsocketKeepaliveInterval"` - StatusWSKeepaliveFailCount *int `yaml:"statusWsKeepaliveFailureCount"` - RegisterAggregatedAPIServer *bool `yaml:"registerAggregatedAPIServer"` - RequireDashboardAuth *bool `yaml:"requireDashboardAuth"` - KubeProxyHealthInterval string `yaml:"kubeProxyHealthInterval"` - LeaderElection ControllerLeaderElectionYAML `yaml:"leaderElection"` + InformerResyncPeriod string `yaml:"informerResyncPeriod"` + HealthPort *int `yaml:"healthPort"` + NodeAgentHealthPort *int `yaml:"nodeAgentHealthPort"` + StatusStaleThreshold string `yaml:"statusStaleThreshold"` + StatusWSKeepaliveInterval string `yaml:"statusWebsocketKeepaliveInterval"` + StatusWSKeepaliveFailCount *int `yaml:"statusWsKeepaliveFailureCount"` + RegisterAggregatedAPIServer *bool `yaml:"registerAggregatedAPIServer"` + RequireDashboardAuth *bool `yaml:"requireDashboardAuth"` + KubeProxyHealthInterval string `yaml:"kubeProxyHealthInterval"` + ManagedKubeProxy ManagedKubeProxyRuntimeConfig `yaml:"managedKubeProxy"` + LeaderElection ControllerLeaderElectionYAML `yaml:"leaderElection"` +} + +// ManagedKubeProxyRuntimeConfig controls unbounded-managed kube-proxy DaemonSets. +type ManagedKubeProxyRuntimeConfig struct { + Enabled *bool `yaml:"enabled"` + Image string `yaml:"image"` } // ControllerLeaderElectionYAML configures controller leader election behavior. diff --git a/internal/net/config/runtime_config_test.go b/internal/net/config/runtime_config_test.go index 6b8376b8..855d5ae3 100644 --- a/internal/net/config/runtime_config_test.go +++ b/internal/net/config/runtime_config_test.go @@ -22,6 +22,9 @@ common: controller: informerResyncPeriod: 30s healthPort: 9080 + managedKubeProxy: + enabled: false + image: registry.example/kube-proxy:v1 node: nodeName: node-a wireGuardPort: 51820 @@ -44,6 +47,14 @@ node: t.Fatalf("unexpected controller.healthPort: %#v", cfg.Controller.HealthPort) } + if cfg.Controller.ManagedKubeProxy.Enabled == nil || *cfg.Controller.ManagedKubeProxy.Enabled { + t.Fatalf("unexpected controller.managedKubeProxy.enabled: %#v", cfg.Controller.ManagedKubeProxy.Enabled) + } + + if cfg.Controller.ManagedKubeProxy.Image != "registry.example/kube-proxy:v1" { + t.Fatalf("unexpected controller.managedKubeProxy.image: %q", cfg.Controller.ManagedKubeProxy.Image) + } + if cfg.Node.NodeName != "node-a" { t.Fatalf("unexpected node.nodeName: %q", cfg.Node.NodeName) } diff --git a/internal/net/controller/kubeproxy_controller.go b/internal/net/controller/kubeproxy_controller.go new file mode 100644 index 00000000..8658f7b7 --- /dev/null +++ b/internal/net/controller/kubeproxy_controller.go @@ -0,0 +1,576 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +package controller + +import ( + "context" + "fmt" + "strings" + "time" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/dynamic/dynamicinformer" + "k8s.io/client-go/informers" + "k8s.io/client-go/kubernetes" + appsv1listers "k8s.io/client-go/listers/apps/v1" + corev1listers "k8s.io/client-go/listers/core/v1" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/util/workqueue" + "k8s.io/klog/v2" + + unboundednetv1alpha1 "github.com/Azure/unbounded/api/net/v1alpha1" +) + +const ( + ManagedKubeProxyNodeLabelKey = "net.unbounded-cloud.io/kube-proxy" + ManagedKubeProxyNodeLabelValue = "managed" + + managedKubeProxyNamePrefix = "unbounded-net-kube-proxy-" + managedKubeProxyAppName = "unbounded-net-kube-proxy" +) + +var kubeProxySiteGVR = schema.GroupVersionResource{ + Group: unboundednetv1alpha1.GroupName, + Version: "v1alpha1", + Resource: "sites", +} + +// ManagedKubeProxyOptions configures the kube-proxy DaemonSets created for +// unbounded-managed nodes. +type ManagedKubeProxyOptions struct { + Namespace string + Image string +} + +// ManagedKubeProxyController reconciles kube-proxy DaemonSets for unbounded +// nodes that are not already covered by the cluster provider's kube-proxy. +type ManagedKubeProxyController struct { + clientset kubernetes.Interface + options ManagedKubeProxyOptions + + nodeLister corev1listers.NodeLister + nodeSynced cache.InformerSynced + dsLister appsv1listers.DaemonSetLister + dsSynced cache.InformerSynced + siteInformer cache.SharedIndexInformer + siteSynced cache.InformerSynced + workqueue workqueue.TypedRateLimitingInterface[string] + providerDSCache []*appsv1.DaemonSet +} + +// NewManagedKubeProxyController creates a controller for managed kube-proxy. 
+func NewManagedKubeProxyController( + clientset kubernetes.Interface, + dynamicInformerFactory dynamicinformer.DynamicSharedInformerFactory, + informerFactory informers.SharedInformerFactory, + options ManagedKubeProxyOptions, +) (*ManagedKubeProxyController, error) { + if options.Namespace == "" { + return nil, fmt.Errorf("managed kube-proxy namespace is required") + } + + if options.Image == "" { + return nil, fmt.Errorf("managed kube-proxy image is required") + } + + nodeInformer := informerFactory.Core().V1().Nodes() + dsInformer := informerFactory.Apps().V1().DaemonSets() + siteInformer := dynamicInformerFactory.ForResource(kubeProxySiteGVR).Informer() + + c := &ManagedKubeProxyController{ + clientset: clientset, + options: options, + nodeLister: nodeInformer.Lister(), + nodeSynced: nodeInformer.Informer().HasSynced, + dsLister: dsInformer.Lister(), + dsSynced: dsInformer.Informer().HasSynced, + siteInformer: siteInformer, + siteSynced: siteInformer.HasSynced, + workqueue: workqueue.NewTypedRateLimitingQueueWithConfig( + workqueue.DefaultTypedControllerRateLimiter[string](), + workqueue.TypedRateLimitingQueueConfig[string]{Name: "ManagedKubeProxy"}, + ), + } + + handler := cache.ResourceEventHandlerFuncs{ + AddFunc: func(any) { c.enqueueAll() }, + UpdateFunc: func(any, any) { c.enqueueAll() }, + DeleteFunc: func(any) { c.enqueueAll() }, + } + + if _, err := nodeInformer.Informer().AddEventHandler(handler); err != nil { + return nil, fmt.Errorf("add node event handler: %w", err) + } + + if _, err := dsInformer.Informer().AddEventHandler(handler); err != nil { + return nil, fmt.Errorf("add daemonset event handler: %w", err) + } + + if _, err := siteInformer.AddEventHandler(handler); err != nil { + return nil, fmt.Errorf("add site event handler: %w", err) + } + + return c, nil +} + +// Run starts the managed kube-proxy controller. 
+func (c *ManagedKubeProxyController) Run(ctx context.Context, workers int) error { + defer c.workqueue.ShutDown() + + if ok := cache.WaitForCacheSync(ctx.Done(), c.nodeSynced, c.dsSynced, c.siteSynced); !ok { + return fmt.Errorf("failed to wait for managed kube-proxy caches to sync") + } + + c.enqueueAll() + + for i := 0; i < workers; i++ { + go wait.UntilWithContext(ctx, c.runWorker, time.Second) + } + + <-ctx.Done() + + return nil +} + +func fromUnstructured(u *unstructured.Unstructured, out any) error { + return runtime.DefaultUnstructuredConverter.FromUnstructured(u.Object, out) +} + +func resourceMustParse(value string) resource.Quantity { + return resource.MustParse(value) +} + +func (c *ManagedKubeProxyController) enqueueAll() { + c.workqueue.Add("all") +} + +func (c *ManagedKubeProxyController) runWorker(ctx context.Context) { + for c.processNextWorkItem(ctx) { + } +} + +func (c *ManagedKubeProxyController) processNextWorkItem(ctx context.Context) bool { + key, shutdown := c.workqueue.Get() + if shutdown { + return false + } + defer c.workqueue.Done(key) + + if err := c.sync(ctx); err != nil { + c.workqueue.AddRateLimited(key) + klog.Errorf("managed kube-proxy sync failed: %v", err) + } else { + c.workqueue.Forget(key) + } + + return true +} + +func (c *ManagedKubeProxyController) sync(ctx context.Context) error { + sites, err := c.listSites() + if err != nil { + return err + } + + nodes, err := c.nodeLister.List(labels.Everything()) + if err != nil { + return fmt.Errorf("list nodes: %w", err) + } + + dsList, err := c.dsLister.List(labels.Everything()) + if err != nil { + return fmt.Errorf("list daemonsets: %w", err) + } + + c.providerDSCache = providerKubeProxyDaemonSets(dsList) + + siteNames := map[string]struct{}{} + + for i := range sites { + site := sites[i] + + siteNames[site.Name] = struct{}{} + if err := c.ensureDaemonSet(ctx, site); err != nil { + return err + } + } + + if err := c.deleteStaleDaemonSets(ctx, dsList, siteNames); err != nil { + return err + } + + for _, node := range nodes { + if err := c.reconcileNodeLabel(ctx, node); err != nil { + return err + } + } + + return nil +} + +func (c *ManagedKubeProxyController) listSites() ([]unboundednetv1alpha1.Site, error) { + objs := c.siteInformer.GetStore().List() + + sites := make([]unboundednetv1alpha1.Site, 0, len(objs)) + for _, obj := range objs { + u, ok := obj.(*unstructured.Unstructured) + if !ok { + continue + } + + var site unboundednetv1alpha1.Site + if err := fromUnstructured(u, &site); err != nil { + return nil, fmt.Errorf("decode Site %s: %w", u.GetName(), err) + } + + sites = append(sites, site) + } + + return sites, nil +} + +func (c *ManagedKubeProxyController) reconcileNodeLabel(ctx context.Context, node *corev1.Node) error { + current := "" + if node.Labels != nil { + current = node.Labels[ManagedKubeProxyNodeLabelKey] + } + + want := shouldManageKubeProxyForNode(node, c.providerDSCache) + + if want && current == ManagedKubeProxyNodeLabelValue { + return nil + } + + if !want && current == "" { + return nil + } + + if want { + patch := fmt.Sprintf(`{"metadata":{"labels":{%q:%q}}}`, ManagedKubeProxyNodeLabelKey, ManagedKubeProxyNodeLabelValue) + _, err := c.clientset.CoreV1().Nodes().Patch(ctx, node.Name, types.MergePatchType, []byte(patch), metav1.PatchOptions{}) + + return err + } + + patch := fmt.Sprintf(`[{"op":"remove","path":"/metadata/labels/%s"}]`, escapeJSONPointer(ManagedKubeProxyNodeLabelKey)) + + _, err := c.clientset.CoreV1().Nodes().Patch(ctx, node.Name, types.JSONPatchType, []byte(patch), 
metav1.PatchOptions{}) + if apierrors.IsNotFound(err) || apierrors.IsInvalid(err) { + return nil + } + + return err +} + +func shouldManageKubeProxyForNode(node *corev1.Node, providerDS []*appsv1.DaemonSet) bool { + if node.Labels == nil || node.Labels[SiteLabelKey] == "" { + return false + } + + if _, exists := node.Labels["kubernetes.azure.com/managedby"]; exists { + return false + } + + if node.Labels["kubernetes.azure.com/cluster"] != "" { + return false + } + + for _, ds := range providerDS { + if daemonSetCouldScheduleOnNode(ds, node) { + return false + } + } + + return true +} + +func providerKubeProxyDaemonSets(dsList []*appsv1.DaemonSet) []*appsv1.DaemonSet { + var out []*appsv1.DaemonSet + + for _, ds := range dsList { + if ds.Labels["app.kubernetes.io/name"] == managedKubeProxyAppName || strings.HasPrefix(ds.Name, managedKubeProxyNamePrefix) { + continue + } + + for _, c := range ds.Spec.Template.Spec.Containers { + if c.Name == "kube-proxy" || strings.Contains(c.Image, "kube-proxy") { + out = append(out, ds) + break + } + } + } + + return out +} + +func daemonSetCouldScheduleOnNode(ds *appsv1.DaemonSet, node *corev1.Node) bool { + for k, v := range ds.Spec.Template.Spec.NodeSelector { + if node.Labels[k] != v { + return false + } + } + + if affinity := ds.Spec.Template.Spec.Affinity; affinity != nil && affinity.NodeAffinity != nil && affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil { + terms := affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms + if len(terms) > 0 { + matched := false + + for _, term := range terms { + if nodeSelectorTermMatchesNode(term.MatchExpressions, node.Labels) { + matched = true + break + } + } + + if !matched { + return false + } + } + } + + return true +} + +func nodeSelectorTermMatchesNode(exprs []corev1.NodeSelectorRequirement, nodeLabels map[string]string) bool { + for _, expr := range exprs { + value, exists := nodeLabels[expr.Key] + switch expr.Operator { + case corev1.NodeSelectorOpIn: + if !exists || !stringInSlice(value, expr.Values) { + return false + } + case corev1.NodeSelectorOpNotIn: + if exists && stringInSlice(value, expr.Values) { + return false + } + case corev1.NodeSelectorOpExists: + if !exists { + return false + } + case corev1.NodeSelectorOpDoesNotExist: + if exists { + return false + } + } + } + + return true +} + +func stringInSlice(value string, values []string) bool { + for _, candidate := range values { + if value == candidate { + return true + } + } + + return false +} + +func (c *ManagedKubeProxyController) ensureDaemonSet(ctx context.Context, site unboundednetv1alpha1.Site) error { + clusterCIDR, ok := siteKubeProxyClusterCIDR(site) + if !ok { + return nil + } + + want := c.daemonSetForSite(site, clusterCIDR) + + existing, err := c.clientset.AppsV1().DaemonSets(c.options.Namespace).Get(ctx, want.Name, metav1.GetOptions{}) + if apierrors.IsNotFound(err) { + _, err = c.clientset.AppsV1().DaemonSets(c.options.Namespace).Create(ctx, want, metav1.CreateOptions{}) + return err + } + + if err != nil { + return err + } + + existing.Spec.Template = want.Spec.Template + existing.Spec.UpdateStrategy = want.Spec.UpdateStrategy + _, err = c.clientset.AppsV1().DaemonSets(c.options.Namespace).Update(ctx, existing, metav1.UpdateOptions{}) + + return err +} + +func siteKubeProxyClusterCIDR(site unboundednetv1alpha1.Site) (string, bool) { + var ipv4, ipv6 string + + for _, assignment := range site.Spec.PodCidrAssignments { + if !assignmentEnabled(assignment.AssignmentEnabled) { 
+ continue + } + + for _, cidr := range assignment.CidrBlocks { + if strings.Contains(cidr, ":") { + if ipv6 == "" { + ipv6 = cidr + } + + continue + } + + if ipv4 == "" { + ipv4 = cidr + } + } + } + + if ipv4 != "" && ipv6 != "" { + return ipv4 + "," + ipv6, true + } + + if ipv4 != "" { + return ipv4, true + } + + if ipv6 != "" { + return ipv6, true + } + + return "", false +} + +func (c *ManagedKubeProxyController) daemonSetForSite(site unboundednetv1alpha1.Site, clusterCIDR string) *appsv1.DaemonSet { + name := managedKubeProxyDaemonSetName(site.Name) + labels := map[string]string{ + "app.kubernetes.io/name": managedKubeProxyAppName, + "app.kubernetes.io/component": "kube-proxy", + SiteLabelKey: site.Name, + } + maxUnavailable := intstr.FromInt32(1) + + return &appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: c.options.Namespace, + Labels: labels, + }, + Spec: appsv1.DaemonSetSpec{ + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app.kubernetes.io/name": managedKubeProxyAppName, SiteLabelKey: site.Name}}, + UpdateStrategy: appsv1.DaemonSetUpdateStrategy{ + Type: appsv1.RollingUpdateDaemonSetStrategyType, + RollingUpdate: &appsv1.RollingUpdateDaemonSet{MaxUnavailable: &maxUnavailable}, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{"kubernetes.azure.com/set-kube-service-host-fqdn": "true"}, + Labels: labels, + }, + Spec: corev1.PodSpec{ + HostNetwork: true, + ServiceAccountName: "unbounded-net-kube-proxy", + PriorityClassName: "system-node-critical", + NodeSelector: map[string]string{ + ManagedKubeProxyNodeLabelKey: ManagedKubeProxyNodeLabelValue, + SiteLabelKey: site.Name, + }, + Tolerations: []corev1.Toleration{{Operator: corev1.TolerationOpExists}}, + InitContainers: []corev1.Container{{ + Name: "kube-proxy-bootstrap", + Image: c.options.Image, + ImagePullPolicy: corev1.PullIfNotPresent, + Command: []string{"/bin/sh", "-c", kubeProxyBootstrapScript}, + SecurityContext: &corev1.SecurityContext{Privileged: boolPtr(true)}, + Resources: corev1.ResourceRequirements{Requests: corev1.ResourceList{corev1.ResourceCPU: resourceMustParse("100m")}}, + VolumeMounts: kubeProxyBootstrapVolumeMounts(), + }}, + Containers: []corev1.Container{{ + Name: "kube-proxy", + Image: c.options.Image, + ImagePullPolicy: corev1.PullIfNotPresent, + Command: []string{ + "kube-proxy", + "--conntrack-max-per-core=0", + "--metrics-bind-address=0.0.0.0:10249", + "--cluster-cidr=" + clusterCIDR, + "--detect-local-mode=ClusterCIDR", + "--pod-interface-name-prefix=", + "--v=3", + }, + SecurityContext: &corev1.SecurityContext{Privileged: boolPtr(true)}, + Resources: corev1.ResourceRequirements{Requests: corev1.ResourceList{corev1.ResourceCPU: resourceMustParse("100m")}}, + VolumeMounts: []corev1.VolumeMount{ + {Name: "iptableslock", MountPath: "/run/xtables.lock"}, + {Name: "modules", MountPath: "/lib/modules"}, + }, + }}, + Volumes: []corev1.Volume{ + {Name: "iptableslock", VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: "/run/xtables.lock", Type: hostPathTypePtr(corev1.HostPathFileOrCreate)}}}, + {Name: "sysctls", VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: "/etc/sysctl.d", Type: hostPathTypePtr(corev1.HostPathDirectoryOrCreate)}}}, + {Name: "modules", VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: "/lib/modules", Type: hostPathTypePtr(corev1.HostPathDirectory)}}}, + }, + }, + }, + }, + } +} + +func (c 
*ManagedKubeProxyController) deleteStaleDaemonSets(ctx context.Context, dsList []*appsv1.DaemonSet, siteNames map[string]struct{}) error { + for _, ds := range dsList { + if ds.Namespace != c.options.Namespace || ds.Labels["app.kubernetes.io/name"] != managedKubeProxyAppName { + continue + } + + siteName := strings.TrimPrefix(ds.Name, managedKubeProxyNamePrefix) + if _, ok := siteNames[siteName]; ok { + continue + } + + if err := c.clientset.AppsV1().DaemonSets(c.options.Namespace).Delete(ctx, ds.Name, metav1.DeleteOptions{}); err != nil && !apierrors.IsNotFound(err) { + return err + } + } + + return nil +} + +func managedKubeProxyDaemonSetName(siteName string) string { + return managedKubeProxyNamePrefix + siteName +} + +func boolPtr(v bool) *bool { return &v } + +func hostPathTypePtr(v corev1.HostPathType) *corev1.HostPathType { return &v } + +func kubeProxyBootstrapVolumeMounts() []corev1.VolumeMount { + return []corev1.VolumeMount{ + {Name: "sysctls", MountPath: "/etc/sysctl.d"}, + {Name: "modules", MountPath: "/lib/modules"}, + } +} + +const kubeProxyBootstrapScript = `get_num_cpu() { + sys_cpu_online=$(cat /sys/devices/system/cpu/online) + result=0 + OLD_IFS="$IFS"; IFS="," + for rng in $sys_cpu_online; do + if echo "$rng" | grep -q -- "-"; then + min=${rng%-*}; max=${rng#*-} + if [ "$min" -le "$max" ]; then + result=$((result + (max - min + 1))) + fi + else + result=$((result + 1)) + fi + done + IFS="$OLD_IFS" + echo $result +} +SYSCTL=/proc/sys/net/netfilter/nf_conntrack_max +NUM_CPU=$(get_num_cpu) +DESIRED=$((32768*NUM_CPU)) +if [ "$DESIRED" -lt 131072 ]; then DESIRED=131072; fi +echo "$DESIRED" > "$SYSCTL" +` diff --git a/internal/net/controller/kubeproxy_controller_test.go b/internal/net/controller/kubeproxy_controller_test.go new file mode 100644 index 00000000..c651dd16 --- /dev/null +++ b/internal/net/controller/kubeproxy_controller_test.go @@ -0,0 +1,87 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +package controller + +import ( + "testing" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + unboundednetv1alpha1 "github.com/Azure/unbounded/api/net/v1alpha1" +) + +func TestShouldManageKubeProxyForNode(t *testing.T) { + providerDS := &appsv1.DaemonSet{Spec: appsv1.DaemonSetSpec{Template: corev1.PodTemplateSpec{Spec: corev1.PodSpec{Affinity: &corev1.Affinity{NodeAffinity: &corev1.NodeAffinity{RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{NodeSelectorTerms: []corev1.NodeSelectorTerm{{MatchExpressions: []corev1.NodeSelectorRequirement{{Key: "kubernetes.azure.com/cluster", Operator: corev1.NodeSelectorOpExists}}}}}}}}}}} + + tests := []struct { + name string + node *corev1.Node + want bool + }{ + {name: "site node without provider coverage", node: nodeWithLabels(map[string]string{SiteLabelKey: "test"}), want: true}, + {name: "no site label", node: nodeWithLabels(map[string]string{}), want: false}, + {name: "aks cluster node excluded", node: nodeWithLabels(map[string]string{SiteLabelKey: "cluster", "kubernetes.azure.com/cluster": "rg"}), want: false}, + {name: "provider managed node excluded", node: nodeWithLabels(map[string]string{SiteLabelKey: "cluster", "kubernetes.azure.com/managedby": "aks"}), want: false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := shouldManageKubeProxyForNode(tt.node, []*appsv1.DaemonSet{providerDS}); got != tt.want { + t.Fatalf("shouldManageKubeProxyForNode() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestProviderKubeProxyDaemonSetsIgnoresManagedDaemonSets(t *testing.T) { + dsList := []*appsv1.DaemonSet{ + {ObjectMeta: metav1.ObjectMeta{Name: "kube-proxy"}, Spec: appsv1.DaemonSetSpec{Template: corev1.PodTemplateSpec{Spec: corev1.PodSpec{Containers: []corev1.Container{{Name: "kube-proxy", Image: "kube-proxy:v1"}}}}}}, + {ObjectMeta: metav1.ObjectMeta{Name: "unbounded-net-kube-proxy-test", Labels: map[string]string{"app.kubernetes.io/name": managedKubeProxyAppName}}, Spec: appsv1.DaemonSetSpec{Template: corev1.PodTemplateSpec{Spec: corev1.PodSpec{Containers: []corev1.Container{{Name: "kube-proxy", Image: "kube-proxy:v1"}}}}}}, + } + + got := providerKubeProxyDaemonSets(dsList) + if len(got) != 1 || got[0].Name != "kube-proxy" { + t.Fatalf("providerKubeProxyDaemonSets() = %#v, want only kube-proxy", got) + } +} + +func TestSiteKubeProxyClusterCIDR(t *testing.T) { + falseValue := false + site := unboundednetv1alpha1.Site{Spec: unboundednetv1alpha1.SiteSpec{PodCidrAssignments: []unboundednetv1alpha1.PodCidrAssignment{ + {AssignmentEnabled: &falseValue, CidrBlocks: []string{"10.99.0.0/16"}}, + {CidrBlocks: []string{"100.125.0.0/16", "fd00:1::/64"}}, + }}} + + got, ok := siteKubeProxyClusterCIDR(site) + if !ok || got != "100.125.0.0/16,fd00:1::/64" { + t.Fatalf("siteKubeProxyClusterCIDR() = %q,%v", got, ok) + } +} + +func TestDaemonSetForSite(t *testing.T) { + c := &ManagedKubeProxyController{options: ManagedKubeProxyOptions{Namespace: "unbounded-net", Image: "kube-proxy:v1"}} + ds := c.daemonSetForSite(unboundednetv1alpha1.Site{ObjectMeta: metav1.ObjectMeta{Name: "test"}}, "100.125.0.0/16") + + if ds.Name != "unbounded-net-kube-proxy-test" { + t.Fatalf("unexpected daemonset name: %s", ds.Name) + } + + if ds.Spec.Template.Spec.NodeSelector[ManagedKubeProxyNodeLabelKey] != ManagedKubeProxyNodeLabelValue { + t.Fatalf("missing managed kube-proxy selector: %#v", ds.Spec.Template.Spec.NodeSelector) + } + + if 
ds.Spec.Template.Spec.NodeSelector[SiteLabelKey] != "test" { + t.Fatalf("missing site selector: %#v", ds.Spec.Template.Spec.NodeSelector) + } + + if got := ds.Spec.Template.Spec.Containers[0].Command[3]; got != "--cluster-cidr=100.125.0.0/16" { + t.Fatalf("unexpected cluster-cidr arg: %s", got) + } +} + +func nodeWithLabels(labels map[string]string) *corev1.Node { + return &corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node-a", Labels: labels}} +}
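+
+// TestDaemonSetCouldScheduleOnNode_NodeSelector is an illustrative sketch of
+// the coverage check: a provider kube-proxy DaemonSet that targets nodes with a
+// plain nodeSelector (the kubernetes.azure.com/mode=system selector here is a
+// hypothetical example, not taken from a real provider manifest) counts as
+// covering matching nodes, so they are not marked for unbounded-managed
+// kube-proxy, while non-matching site nodes stay eligible.
+func TestDaemonSetCouldScheduleOnNode_NodeSelector(t *testing.T) {
+    ds := &appsv1.DaemonSet{Spec: appsv1.DaemonSetSpec{Template: corev1.PodTemplateSpec{Spec: corev1.PodSpec{
+        NodeSelector: map[string]string{"kubernetes.azure.com/mode": "system"},
+    }}}}
+
+    // A node carrying the selector label is covered by the provider DaemonSet.
+    if !daemonSetCouldScheduleOnNode(ds, nodeWithLabels(map[string]string{"kubernetes.azure.com/mode": "system"})) {
+        t.Fatalf("expected nodeSelector match to count as provider coverage")
+    }
+
+    // A site node without that label is not covered and remains a candidate
+    // for the managed kube-proxy marker.
+    if daemonSetCouldScheduleOnNode(ds, nodeWithLabels(map[string]string{SiteLabelKey: "test"})) {
+        t.Fatalf("did not expect provider coverage for a plain site node")
+    }
+}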