diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 00000000..749c64d8
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,9 @@
+.git
+.github
+.dev
+*.md
+build/
+examples/
+assets/
+dist/
+!dist/kubesolo
diff --git a/.gitignore b/.gitignore
index b2f8e1a1..29d0b1c4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,4 @@ dist/
 .DS_Store
 .dev/
 CLAUDE.md
-.claude
\ No newline at end of file
+.claude
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..94ba81ad
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,47 @@
+# kubesolo Container Image
+#
+# Build:
+#   make image
+#
+# Run:
+#   docker run -d --privileged \
+#     --hostname kubesolo \
+#     --security-opt seccomp=unconfined \
+#     --security-opt apparmor=unconfined \
+#     --tmpfs /tmp --tmpfs /run \
+#     -v /lib/modules:/lib/modules:ro \
+#     -v kubesolo-data:/var/lib/kubesolo \
+#     -p 6443:6443 \
+#     --name kubesolo \
+#     portainer/kubesolo:latest
+#
+# Get kubeconfig:
+#   docker exec kubesolo cat /var/lib/kubesolo/pki/admin/admin.kubeconfig > kubeconfig.json
+#   sed -i 's|https://[^"]*:6443|https://127.0.0.1:6443|' kubeconfig.json
+#   export KUBECONFIG=$(pwd)/kubeconfig.json
+#
+# Stop:
+#   docker stop kubesolo && docker rm kubesolo
+
+FROM alpine:3.21
+
+RUN apk add --no-cache \
+    iptables \
+    ip6tables \
+    conntrack-tools \
+    iproute2 \
+    kmod \
+    e2fsprogs \
+    ca-certificates \
+    && mkdir -p /var/lib/kubesolo
+
+COPY --chmod=755 dist/kubesolo /usr/local/bin/kubesolo
+
+VOLUME ["/var/lib/kubesolo"]
+
+EXPOSE 6443 10250
+
+STOPSIGNAL SIGTERM
+
+ENTRYPOINT ["/usr/local/bin/kubesolo"]
+CMD ["--container-mode"]
diff --git a/Makefile b/Makefile
index 2e18e019..72aa5ac1 100644
--- a/Makefile
+++ b/Makefile
@@ -162,3 +162,28 @@ archive-musl:
 
 # Include custom make targets
 -include $(wildcard .dev/*.make)
+
+# ---------- Container Image targets ----------
+
+IMAGE_NAME ?= portainer/kubesolo
+IMAGE_TAG ?= $(VERSION)
+
+# Build the container image (downloads arch-specific deps, builds static binary via Alpine, then packages it)
+.PHONY: image
+image: deps build-using-alpine
	docker buildx build \
		--platform $(GOOS)/$(GOARCH) \
		-t $(IMAGE_NAME):$(IMAGE_TAG)-$(GOOS)-$(GOARCH) .
+
+# Build and push a container image for the target $(GOOS)/$(GOARCH) platform using buildx
+.PHONY: image-buildx
+image-buildx:
	docker buildx build --platform $(GOOS)/$(GOARCH) \
		-t $(IMAGE_NAME):$(IMAGE_TAG) -t $(IMAGE_NAME):latest \
		--push .
+
+# Push the container image
+.PHONY: image-push
+image-push:
	docker push $(IMAGE_NAME):$(IMAGE_TAG)
	docker push $(IMAGE_NAME):latest
diff --git a/cmd/kubesolo/main.go b/cmd/kubesolo/main.go
index dd3121d2..14b16292 100644
--- a/cmd/kubesolo/main.go
+++ b/cmd/kubesolo/main.go
@@ -189,7 +189,7 @@ func (s *kubesolo) run() {
 		{
 			name: "kubeproxy",
 			start: func() {
-				kubeproxyService := kubeproxy.NewService(ctx, cancel, kubeproxyReadyCh, s.embedded.AdminKubeconfigFile)
+				kubeproxyService := kubeproxy.NewService(ctx, cancel, kubeproxyReadyCh, s.embedded.AdminKubeconfigFile, s.embedded.ContainerMode)
 				s.wg.Go(func() {
 					kubeproxyService.Run(kubeletReadyCh)
 				})
@@ -207,7 +207,7 @@
 	}
 
 	log.Info().Str("component", "kubesolo").Msg("deploying coredns...")
-	if err := coredns.Deploy(s.embedded.AdminKubeconfigFile); err != nil {
+	if err := coredns.Deploy(s.embedded.AdminKubeconfigFile, s.embedded.ContainerMode); err != nil {
 		log.Fatal().Err(err).Msg("failed to deploy coredns")
 	}
 
@@ -283,6 +283,19 @@
 	// Setup paths
 	basePath := *flags.Path
 
+	containerMode := *flags.ContainerMode || system.IsRunningInContainer()
+	if containerMode {
+		log.Info().Str("component", "kubesolo").Msg("container mode detected, using cgroupfs driver and relaxed eviction thresholds")
+
+		if err := system.SetupContainerMounts(); err != nil {
+			log.Fatal().Err(err).Msg("failed to setup container mount propagation")
+		}
+
+		if err := system.SetupContainerCgroups(); err != nil {
+			log.Fatal().Err(err).Msg("failed to setup container cgroups")
+		}
+	}
+
 	s.embedded = types.Embedded{
 		// System Node IP
 		NodeIP: nodeIP,
@@ -403,5 +416,8 @@
 
 		// Portainer Edge
 		IsPortainerEdge: s.portainerEdgeID != "" && s.portainerEdgeKey != "",
+
+		// Container Mode
+		ContainerMode: containerMode,
 	}
 }
diff --git a/internal/config/flags/flags.go b/internal/config/flags/flags.go
index 52f37c6b..2bab36b7 100644
--- a/internal/config/flags/flags.go
+++ b/internal/config/flags/flags.go
@@ -25,4 +25,5 @@
 	LocalStorageSharedPath = Application.Flag("local-storage-shared-path", "Path to the shared file system for the local storage. Defaults to empty string.").Envar("KUBESOLO_LOCAL_STORAGE_SHARED_PATH").Default("").String()
 	Debug                  = Application.Flag("debug", "Enable debug logging. Defaults to false.").Envar("KUBESOLO_DEBUG").Default("false").Bool()
 	PprofServer            = Application.Flag("pprof-server", "Enable pprof server. Defaults to false.").Envar("KUBESOLO_PPROF_SERVER").Default("false").Bool()
+	ContainerMode          = Application.Flag("container-mode", "Run in container mode with cgroupfs driver and relaxed eviction thresholds. Auto-detected when running inside a container.").Envar("KUBESOLO_CONTAINER_MODE").Default("false").Bool()
 )
diff --git a/internal/runtime/network/ip.go b/internal/runtime/network/ip.go
index 3812f9ce..a7a9b3b6 100644
--- a/internal/runtime/network/ip.go
+++ b/internal/runtime/network/ip.go
@@ -79,8 +79,13 @@ var instanceMetadataServiceIP = net.ParseIP("169.254.169.254")
 // real upstream nameservers suitable for use by pods. It checks common resolv.conf
 // locations and validates that they contain usable nameservers (global unicast).
 // If no valid resolv.conf is found, it generates a fallback with public DNS servers.
-// This follows the same approach as k3s locateOrGenerateResolvConf.
-func GetHostResolvConf(dataDir string) string {
+func GetHostResolvConf(dataDir string, containerMode bool) string {
+	if containerMode {
+		log.Info().Str("component", "network").
+			Msg("running in container mode - using /dev/null for resolv.conf to prevent host DNS leakage into pods")
+		return "/dev/null"
+	}
+
 	resolvConfs := []string{"/etc/resolv.conf", "/run/systemd/resolve/resolv.conf"}
 	for _, conf := range resolvConfs {
 		if isValidResolvConf(conf) {
diff --git a/internal/system/host.go b/internal/system/host.go
index 3117ef78..7f11241c 100644
--- a/internal/system/host.go
+++ b/internal/system/host.go
@@ -1,12 +1,107 @@
 package system
 
 import (
+	"fmt"
 	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
 
 	"github.com/portainer/kubesolo/types"
 	"github.com/rs/zerolog/log"
 )
 
+// IsRunningInContainer detects if the process is running inside a container
+// by checking for common container indicators
+func IsRunningInContainer() bool {
+	// Check for Docker
+	if _, err := os.Stat("/.dockerenv"); err == nil {
+		return true
+	}
+
+	// Check for Podman
+	if _, err := os.Stat("/run/.containerenv"); err == nil {
+		return true
+	}
+
+	// Check for systemd container environment variable
+	if os.Getenv("container") != "" {
+		return true
+	}
+
+	return false
+}
+
+// SetupContainerMounts ensures mount propagation is set to rshared on the root
+// filesystem. Without this, kubelet cannot propagate volume mounts (including
+// projected service account tokens) into pod containers.
+func SetupContainerMounts() error {
+	if err := mountMakeRShared(); err != nil {
+		return err
+	}
+	log.Info().Str("component", "mount").Msg("set root filesystem to rshared propagation")
+	return nil
+}
+
+// SetupContainerCgroups prepares cgroup v2 for running nested containers.
+// In cgroupv2, a cgroup cannot both contain processes AND have domain controllers
+// delegated to child cgroups (the "no internal processes" rule).
+// This function creates a child cgroup (/sys/fs/cgroup/init), moves the current
+// process into it, and enables controller delegation on the root cgroup so that
+// containerd/runc can create child cgroups (like /sys/fs/cgroup/k8s.io).
+func SetupContainerCgroups() error {
+	const cgroupRoot = "/sys/fs/cgroup"
+
+	// Check if this is cgroupv2 by looking for cgroup.controllers
+	if _, err := os.Stat(filepath.Join(cgroupRoot, "cgroup.controllers")); err != nil {
+		log.Debug().Str("component", "cgroup").Msg("not cgroupv2, skipping cgroup setup")
+		return nil
+	}
+
+	// Create /sys/fs/cgroup/init if it doesn't exist
+	initCgroup := filepath.Join(cgroupRoot, "init")
+	if err := os.MkdirAll(initCgroup, 0o755); err != nil {
+		return fmt.Errorf("failed to create init cgroup: %w", err)
+	}
+
+	// Move current process (PID 1) to the init cgroup
+	pid := os.Getpid()
+	procsFile := filepath.Join(initCgroup, "cgroup.procs")
+	if err := os.WriteFile(procsFile, []byte(strconv.Itoa(pid)), 0o644); err != nil {
+		return fmt.Errorf("failed to move PID %d to init cgroup: %w", pid, err)
+	}
+	log.Info().Str("component", "cgroup").Int("pid", pid).Msg("moved process to /sys/fs/cgroup/init")
+
+	// Read available controllers and enable them on the root cgroup's subtree_control
+	controllersRaw, err := os.ReadFile(filepath.Join(cgroupRoot, "cgroup.controllers"))
+	if err != nil {
+		return fmt.Errorf("failed to read cgroup controllers: %w", err)
+	}
+
+	controllers := strings.Fields(strings.TrimSpace(string(controllersRaw)))
+	if len(controllers) > 0 {
+		// Build "+cpu +memory +pids +io ..." string
+		var enableList []string
+		for _, c := range controllers {
+			enableList = append(enableList, "+"+c)
+		}
+		subtreeControl := strings.Join(enableList, " ")
+
+		if err := os.WriteFile(filepath.Join(cgroupRoot, "cgroup.subtree_control"), []byte(subtreeControl), 0o644); err != nil {
+			log.Warn().Str("component", "cgroup").Err(err).Str("controllers", subtreeControl).Msg("failed to enable all controllers on subtree_control, trying one by one")
+			// Try enabling controllers one by one - some may not be delegatable
+			for _, entry := range enableList {
+				if err := os.WriteFile(filepath.Join(cgroupRoot, "cgroup.subtree_control"), []byte(entry), 0o644); err != nil {
+					log.Warn().Str("component", "cgroup").Err(err).Str("controller", entry).Msg("failed to enable controller")
+				}
+			}
+		}
+		log.Info().Str("component", "cgroup").Str("controllers", subtreeControl).Msg("enabled controller delegation on root cgroup")
+	}
+
+	return nil
+}
+
 // GetHostname returns the hostname of the machine
 // it returns the hostname of the machine
 // if it fails, it uses the default value "kubesolo-node"
diff --git a/internal/system/mount_linux.go b/internal/system/mount_linux.go
new file mode 100644
index 00000000..9af5764d
--- /dev/null
+++ b/internal/system/mount_linux.go
@@ -0,0 +1,15 @@
+//go:build linux
+
+package system
+
+import (
+	"fmt"
+	"syscall"
+)
+
+func mountMakeRShared() error {
+	if err := syscall.Mount("", "/", "", syscall.MS_REC|syscall.MS_SHARED, ""); err != nil {
+		return fmt.Errorf("failed to make / rshared: %w", err)
+	}
+	return nil
+}
diff --git a/pkg/components/coredns/configuration.go b/pkg/components/coredns/configuration.go
index 99bb1143..f028ef95 100644
--- a/pkg/components/coredns/configuration.go
+++ b/pkg/components/coredns/configuration.go
@@ -9,8 +9,16 @@
 	"k8s.io/client-go/kubernetes"
 )
 
-// CoreDNSConfig contains minimal CoreDNS Corefile configuration
-const CoreDNSConfig = `.:53 {
+// coreDNSConfig returns the minimal CoreDNS Corefile configuration.
+// In container mode, /etc/resolv.conf is empty (kubelet uses resolvConf: /dev/null)
+// so we use hardcoded upstream DNS servers instead.
+func coreDNSConfig(containerMode bool) string {
+	forward := "forward . /etc/resolv.conf"
+	if containerMode {
+		forward = "forward . 1.1.1.1 8.8.8.8"
+	}
+
+	return `.:53 {
 	errors
 	loop
 	cache 30 {
@@ -21,25 +29,26 @@
 		fallthrough in-addr.arpa ip6.arpa
 		ttl 30
 	}
-	forward . /etc/resolv.conf
+	` + forward + `
 	minimal
 	reload
 	health :8080
 	ready :8181
 }`
+}
 
 // createConfigMap creates a configMap with the bare minimum CoreDNS configuration
 // it creates a new configmap if it does not exist
 // it updates the configmap if it already exists
 // it returns an error if it fails
-func createConfigMap(ctx context.Context, clientset *kubernetes.Clientset) error {
+func createConfigMap(ctx context.Context, clientset *kubernetes.Clientset, containerMode bool) error {
 	configMap := &corev1.ConfigMap{
 		ObjectMeta: metav1.ObjectMeta{
 			Name:      coreDNSConfigMapName,
 			Namespace: coreDNSNamespace,
 		},
 		Data: map[string]string{
-			"Corefile": CoreDNSConfig,
+			"Corefile": coreDNSConfig(containerMode),
 		},
 	}
 
diff --git a/pkg/components/coredns/coredns.go b/pkg/components/coredns/coredns.go
index 67cb3a07..96acdeaf 100644
--- a/pkg/components/coredns/coredns.go
+++ b/pkg/components/coredns/coredns.go
@@ -22,7 +22,7 @@
 )
 
 // Deploy deploys all the necessary Kubernetes resources for CoreDNS
-func Deploy(adminKubeconfig string) error {
+func Deploy(adminKubeconfig string, containerMode bool) error {
 	time.Sleep(types.DefaultComponentSleep)
 
 	ctx, cancel := context.WithTimeout(context.Background(), types.DefaultContextTimeout)
@@ -33,7 +33,7 @@
 		return fmt.Errorf("failed to create kubernetes client: %v", err)
 	}
 
-	if err := createConfigMap(ctx, clientset); err != nil {
+	if err := createConfigMap(ctx, clientset, containerMode); err != nil {
 		return fmt.Errorf("failed to create CoreDNS ConfigMap: %v", err)
 	}
 
@@ -53,7 +53,7 @@
 		return fmt.Errorf("failed to create CoreDNS Service: %v", err)
 	}
 
-	if err := createDeployment(ctx, clientset); err != nil {
+	if err := createDeployment(ctx, clientset, containerMode); err != nil {
 		return fmt.Errorf("failed to create CoreDNS Deployment: %v", err)
 	}
 
diff --git a/pkg/components/coredns/deployment.go b/pkg/components/coredns/deployment.go
index 245dd5b6..29a3de9d 100644
--- a/pkg/components/coredns/deployment.go
+++ b/pkg/components/coredns/deployment.go
@@ -13,10 +13,29 @@
 	"k8s.io/client-go/kubernetes"
 )
 
-func createDeployment(ctx context.Context, clientset *kubernetes.Clientset) error {
+func createDeployment(ctx context.Context, clientset *kubernetes.Clientset, containerMode bool) error {
 	replicas := int32(1)
 	priorityClassName := "system-cluster-critical"
 
+	resources := corev1.ResourceRequirements{
+		Limits: corev1.ResourceList{
+			corev1.ResourceMemory: kubesolokubernetes.ParseResourceQuantity("20Mi"),
+		},
+		Requests: corev1.ResourceList{
+			corev1.ResourceMemory: kubesolokubernetes.ParseResourceQuantity("20Mi"),
+			corev1.ResourceCPU:    kubesolokubernetes.ParseResourceQuantity("50m"),
+		},
+	}
+
+	if containerMode {
+		resources = corev1.ResourceRequirements{
+			Requests: corev1.ResourceList{
+				corev1.ResourceMemory: kubesolokubernetes.ParseResourceQuantity("20Mi"),
+				corev1.ResourceCPU:    kubesolokubernetes.ParseResourceQuantity("50m"),
+			},
+		}
+	}
+
 	deployment := &appsv1.Deployment{
 		ObjectMeta: metav1.ObjectMeta{
 			Name:      coreDNSDeploymentName,
@@ -57,16 +76,8 @@ func createDeployment(ctx context.Context, clientset *kubernetes.Clientset) erro
 							Name:            "coredns",
 							Image:           types.DefaultCoreDNSImage,
 							ImagePullPolicy: corev1.PullIfNotPresent,
-							Resources: corev1.ResourceRequirements{
-								Limits: corev1.ResourceList{
-									corev1.ResourceMemory: kubesolokubernetes.ParseResourceQuantity("20Mi"),
-								},
-								Requests: corev1.ResourceList{
-									corev1.ResourceMemory: kubesolokubernetes.ParseResourceQuantity("20Mi"),
-									corev1.ResourceCPU:    kubesolokubernetes.ParseResourceQuantity("50m"),
-								},
-							},
-							Args: []string{"-conf", "/etc/coredns/Corefile"},
+							Resources: resources,
+							Args:      []string{"-conf", "/etc/coredns/Corefile"},
 							VolumeMounts: []corev1.VolumeMount{
 								{
 									Name: "config-volume",
diff --git a/pkg/kubernetes/kubelet/config.go b/pkg/kubernetes/kubelet/config.go
index cefb4dbb..b5786d2b 100644
--- a/pkg/kubernetes/kubelet/config.go
+++ b/pkg/kubernetes/kubelet/config.go
@@ -41,6 +41,38 @@ func (s *service) writeKubeletConfigFile() error {
 }
 
 func (s *service) generateKubeletConfig() map[string]any {
+	cgroupDriver := "systemd"
+	if s.containerMode {
+		cgroupDriver = "cgroupfs"
+	}
+
+	evictionHard := map[string]string{
+		"memory.available": "75Mi",
+		"nodefs.available": "50Mi",
+	}
+	imageGCHigh := 95
+	systemReserved := map[string]string{"memory": "25Mi"}
+	kubeReserved := map[string]string{"memory": "25Mi"}
+	enforceNodeAllocatable := []string{"pods"}
+	cgroupsPerQOS := true
+	if s.containerMode {
+		evictionHard = map[string]string{
+			"memory.available":  "50Mi",
+			"nodefs.available":  "0%",
+			"nodefs.inodesFree": "0%",
+			"imagefs.available": "0%",
+		}
+		imageGCHigh = 100
+		// In a container, we cannot create the kubepods/system/kube cgroup hierarchies
+		// because cgroupv2 domain controllers block subtree creation.
+		// Disable QoS cgroup management and node allocatable enforcement entirely.
+		// Per-container cgroups are still managed by containerd/runc.
+		cgroupsPerQOS = false
+		enforceNodeAllocatable = []string{}
+		systemReserved = map[string]string{}
+		kubeReserved = map[string]string{}
+	}
+
 	return map[string]any{
 		"kind":       "KubeletConfiguration",
 		"apiVersion": "kubelet.config.k8s.io/v1beta1",
@@ -72,11 +104,14 @@
 
 		"clusterDomain": "cluster.local",
 		"clusterDNS":    []string{types.DefaultCoreDNSIP},
-		"resolvConf": network.GetHostResolvConf(s.kubeletDir),
+		"resolvConf": network.GetHostResolvConf(s.kubeletDir, s.containerMode),
 
 		"tlsCertFile":       s.certFile,
 		"tlsPrivateKeyFile": s.keyFile,
-		"cgroupDriver": "systemd",
+		"cgroupDriver":  cgroupDriver,
+		"cgroupsPerQOS": cgroupsPerQOS,
+
+		"enforceNodeAllocatable": enforceNodeAllocatable,
 
 		"registerNode": true,
 		"readOnlyPort": 0,
@@ -89,7 +124,7 @@
 		"volumeStatsAggPeriod":        "5m0s",
 		"imageMinimumGCAge":           "10m0s",
 		"imageMaximumGCAge":           "0s",
-		"imageGCHighThresholdPercent": 95,
+		"imageGCHighThresholdPercent": imageGCHigh,
 		"imageGCLowThresholdPercent":  80,
 		"runtimeRequestTimeout":       "60s",
 		"cpuManagerReconcilePeriod":   "60s",
@@ -98,12 +133,9 @@
 
 		"registerWithTaints": []map[string]any{},
 
-		"evictionHard": map[string]string{
-			"memory.available": "75Mi",
-			"nodefs.available": "50Mi",
-		},
-		"systemReserved": map[string]string{"memory": "25Mi"},
-		"kubeReserved":   map[string]string{"memory": "25Mi"},
+		"evictionHard":   evictionHard,
+		"systemReserved": systemReserved,
+		"kubeReserved":   kubeReserved,
 
 		"failSwapOn": false,
 		"kubeAPIQPS": 10,
diff --git a/pkg/kubernetes/kubelet/service.go b/pkg/kubernetes/kubelet/service.go
index 609ebeb0..861ba526 100644
--- a/pkg/kubernetes/kubelet/service.go
+++ b/pkg/kubernetes/kubelet/service.go
@@ -28,6 +28,7 @@ type service struct {
 	nodeIP          string
 	kubeletCertPath string
 	adminKubeconfig string
+	containerMode   bool
 }
 
 // NewService creates a new kubelet service
@@ -49,5 +50,6 @@ func NewService(ctx context.Context, cancel context.CancelFunc, kubeletReady cha
 		keyFile:         embedded.KubeletCerts.Key,
 		nodeName:        system.GetHostname(),
 		adminKubeconfig: embedded.AdminKubeconfigFile,
+		containerMode:   embedded.ContainerMode,
 	}
 }
diff --git a/pkg/kubernetes/kubeproxy/flags.go b/pkg/kubernetes/kubeproxy/flags.go
index 434b3529..02a4440e 100644
--- a/pkg/kubernetes/kubeproxy/flags.go
+++ b/pkg/kubernetes/kubeproxy/flags.go
@@ -21,7 +21,15 @@
 	_ = flags.Set("iptables-masquerade-bit", "14")
 	_ = flags.Set("masquerade-all", "true")
 	_ = flags.Set("proxy-mode", "iptables")
-	_ = flags.Set("conntrack-max-per-core", "1024")
-	_ = flags.Set("conntrack-min", "1024")
 	_ = flags.Set("min-sync-period", "10s")
+
+	if s.containerMode {
+		// In container mode, avoid writing to /proc/sys/net/netfilter/nf_conntrack_max
+		// which may be read-only depending on the container runtime.
+		_ = flags.Set("conntrack-max-per-core", "0")
+		_ = flags.Set("conntrack-min", "0")
+	} else {
+		_ = flags.Set("conntrack-max-per-core", "1024")
+		_ = flags.Set("conntrack-min", "1024")
+	}
 }
diff --git a/pkg/kubernetes/kubeproxy/service.go b/pkg/kubernetes/kubeproxy/service.go
index 4409bcfc..d691e965 100644
--- a/pkg/kubernetes/kubeproxy/service.go
+++ b/pkg/kubernetes/kubeproxy/service.go
@@ -12,14 +12,16 @@ type service struct {
 	cancel              context.CancelFunc
 	kubeproxyReady      chan<- struct{}
 	adminKubeconfigFile string
+	containerMode       bool
 }
 
 // NewService creates a new kube proxy service
-func NewService(ctx context.Context, cancel context.CancelFunc, kubeproxyReady chan<- struct{}, adminKubeconfigFile string) *service {
+func NewService(ctx context.Context, cancel context.CancelFunc, kubeproxyReady chan<- struct{}, adminKubeconfigFile string, containerMode bool) *service {
 	return &service{
 		ctx:                 ctx,
 		cancel:              cancel,
 		kubeproxyReady:      kubeproxyReady,
 		adminKubeconfigFile: adminKubeconfigFile,
+		containerMode:       containerMode,
 	}
 }
diff --git a/types/types.go b/types/types.go
index a90e2b84..9cd35995 100644
--- a/types/types.go
+++ b/types/types.go
@@ -128,6 +128,9 @@ type Embedded struct {
 
 	// Portainer Edge
 	IsPortainerEdge bool
+
+	// Container Mode
+	ContainerMode bool
 }
 
 // EdgeAgentConfig contains configuration for Portainer Edge Agent