diff --git a/api/pkg/external-agent/hydra_executor.go b/api/pkg/external-agent/hydra_executor.go index 2bac40899b..08be99070e 100644 --- a/api/pkg/external-agent/hydra_executor.go +++ b/api/pkg/external-agent/hydra_executor.go @@ -1001,9 +1001,9 @@ func (h *HydraExecutor) buildMounts(agent *types.DesktopAgent, workspaceDir stri // The desktop's 17-start-dockerd.sh init script detects this mountpoint // and starts dockerd automatically. No docker.sock mount needed. mounts = append(mounts, hydra.MountConfig{ - Source: fmt.Sprintf("docker-data-%s", agent.SessionID), + Source: hydra.DockerDataVolumePrefix + agent.SessionID, Destination: "/var/lib/docker", - Type: "volume", // Docker named volume, backed by host ext4 + Type: "volume", // Requested as a named volume; Hydra's DevContainerManager.buildMounts rewrites it to a bind mount from /container-docker/sessions/{volumeName}/docker }) // NOTE: Shared BuildKit cache mount (/buildkit-cache) and BUILDKIT_HOST env var diff --git a/api/pkg/hydra/devcontainer.go b/api/pkg/hydra/devcontainer.go index 25e3dd1bd5..a011cfbcd7 100644 --- a/api/pkg/hydra/devcontainer.go +++ b/api/pkg/hydra/devcontainer.go @@ -72,6 +72,10 @@ type DevContainerManager struct { // NewDevContainerManager creates a new dev container manager func NewDevContainerManager(manager *Manager) *DevContainerManager { + if os.Getenv("CONTAINER_DOCKER_PATH") == "" { + log.Fatal().Msg("CONTAINER_DOCKER_PATH must be set — Hydra requires bind-mount-backed session storage") + } + dm := &DevContainerManager{ manager: manager, containers: make(map[string]*DevContainer), @@ -676,33 +680,24 @@ func (dm *DevContainerManager) buildHostConfig(req *CreateDevContainerRequest) * func (dm *DevContainerManager) buildMounts(req *CreateDevContainerRequest) []mount.Mount { var mounts []mount.Mount - // CONTAINER_DOCKER_PATH: if set, per-session inner dockerd data uses bind mounts - // from a ZFS-backed path instead of Docker named volumes. 
This keeps the sandbox's - // own Docker storage on the root disk (so provisioned desktop images persist) - // while inner dockerd data benefits from ZFS dedup+compression. - containerDockerPath := os.Getenv("CONTAINER_DOCKER_PATH") - for _, m := range req.Mounts { mountType := mount.TypeBind if m.Type == "volume" { mountType = mount.TypeVolume } - // Redirect inner dockerd volumes to ZFS-backed bind mounts when configured. - // The API sends docker-data-{sessionID} as a named volume for /var/lib/docker; - // we convert it to a bind mount from /container-docker/sessions/{volumeName}/docker/. + // Inner dockerd data: convert the named volume to a bind mount from + // /container-docker/sessions/{volumeName}/docker/. This keeps the + // sandbox's own Docker storage on the root disk while inner dockerd + // data lives on the CONTAINER_DOCKER_PATH filesystem. A mount qualifies when its Type is "volume" and its Source starts with DockerDataVolumePrefix; Destination is not checked. // - // When a golden Docker cache exists for the project, copy it into the - // session's Docker data directory. This pre-populates the inner dockerd - // with cached images so builds start warm instead of cold. - if containerDockerPath != "" && m.Destination == "/var/lib/docker" && m.Type == "volume" { + // If the session dir already exists (e.g. session restart), reuse it + // instead of re-copying 30+ GB of golden cache (~28s). + if strings.HasPrefix(m.Source, DockerDataVolumePrefix) && m.Type == "volume" { volumeName := m.Source // e.g. "docker-data-{sessionID}" sessionDir := filepath.Join("/container-docker/sessions", volumeName, "docker") - // Check if this session already has a Docker data directory from a - // previous run (e.g. session restart). If so, reuse it — copying - // 30+ GB of golden cache on every restart is wasteful (~28s) and - // overwrites any Docker state changes made during the session. + // Reuse existing session dir on restart (skip golden copy). 
// Golden builds are excluded: they need the latest golden snapshot // for incremental rebuilds. 
sessionDirExists := false @@ -717,12 +712,9 @@ func (dm *DevContainerManager) buildMounts(req *CreateDevContainerRequest) []mou m.Source = sessionDir log.Info(). Str("session_dir", sessionDir). - Str("volume", volumeName). Msg("Reusing existing session Docker data dir (skipping golden copy)") } else if GoldenExists(req.ProjectID) { - // Golden cache available — copy to session dir (works for both - // normal sessions AND golden builds; golden builds start from the - // previous golden for incremental rebuilds) + // Golden cache available — copy to session dir onProgress := func(copied, total int64) { dm.setGoldenCopyProgress(req.ProjectID, copied, total, false) } @@ -733,7 +725,6 @@ func (dm *DevContainerManager) buildMounts(req *CreateDevContainerRequest) []mou Str("project_id", req.ProjectID). Str("volume", volumeName). Msg("Failed to copy golden cache, falling back to empty dir") - // Fall back to plain directory if mkErr := os.MkdirAll(sessionDir, 0755); mkErr == nil { mountType = mount.TypeBind m.Source = sessionDir @@ -747,13 +738,12 @@ func (dm *DevContainerManager) buildMounts(req *CreateDevContainerRequest) []mou Msg("Using golden cache copy for inner dockerd") } } else { - // No golden — plain bind mount (existing behavior) + // No golden cache — empty bind mount. NOTE(review): when MkdirAll fails, this branch leaves mountType and m.Source untouched, so the mount presumably stays a named volume even though the warning no longer says so — confirm that is intended. 
if err := os.MkdirAll(sessionDir, 0755); err != nil { - log.Warn().Err(err).Str("path", sessionDir).Msg("Failed to create container-docker session dir, falling back to named volume") + log.Warn().Err(err).Str("path", sessionDir).Msg("Failed to create session docker dir") } else { mountType = mount.TypeBind m.Source = sessionDir - log.Debug().Str("source", sessionDir).Msg("Using ZFS-backed bind mount for inner dockerd") } } } @@ -1011,43 +1001,23 @@ func (dm *DevContainerManager) DeleteDevContainer(ctx context.Context, sessionID log.Warn().Err(err).Str("container_id", dc.ContainerID).Msg("Failed to remove container") } - // Remove the per-session docker-data volume that was mounted at 
/var/lib/docker. - // This volume is created by hydra_executor.go when the container starts and - // accumulates docker images/layers from in-desktop Docker usage. Without cleanup - // these orphaned volumes leak tens of GB each and fill the disk over time. - dockerDataVolume := fmt.Sprintf("docker-data-%s", sessionID) - if err := dockerClient.VolumeRemove(ctx, dockerDataVolume, true); err != nil { - log.Warn().Err(err).Str("volume", dockerDataVolume).Msg("Failed to remove session docker-data volume") + // Preserve the session Docker data dir so restarts can reuse it instead + // of re-copying 30+ GB of golden cache (~28s). Write a .last-active marker + // so GCOrphanedSessions() (every 10 min) can clean dirs inactive for >7 days. + // For golden builds, monitorGoldenBuild handles promotion and cleanup. + dockerDataVolume := DockerDataVolumePrefix + sessionID + if dc.IsGoldenBuild && dc.ProjectID != "" { + _ = SetGoldenBuildRunning(dc.ProjectID, false) // NOTE(review): return value is discarded — confirm releasing the lock is meant to be best-effort + log.Info(). + Str("project_id", dc.ProjectID). + Str("session_id", sessionID). + Msg("Golden build session stopped, lock released (monitorGoldenBuild handles cleanup)") } else { - log.Info().Str("volume", dockerDataVolume).Str("session_id", sessionID).Msg("Removed session docker-data volume") - } - - // CONTAINER_DOCKER_PATH session directory cleanup. - // For golden builds, monitorGoldenBuild handles promotion and cleanup — - // we must NOT delete the Docker data here or it'll be gone before promotion. - // For normal sessions, we intentionally do NOT clean up the session Docker - // data directory here. This allows session restarts to reuse the existing - // Docker data instead of re-copying 30+ GB of golden cache (~28s). - // The periodic GCOrphanedSessions() (every 10 min) handles cleanup of - // session dirs that no longer have a running container. 
- if os.Getenv("CONTAINER_DOCKER_PATH") != "" { - if dc.IsGoldenBuild && dc.ProjectID != "" { - _ = SetGoldenBuildRunning(dc.ProjectID, false) - log.Info(). 
- Str("project_id", dc.ProjectID). - Str("session_id", sessionID). - Msg("Golden build session stopped, lock released (monitorGoldenBuild handles cleanup)") - } else { - // Write a timestamp so GC knows when this session was last active. - // Directory mtime doesn't update when files deep inside are modified, - // so we use an explicit marker file. - sessionDir := filepath.Join("/container-docker/sessions", dockerDataVolume) - TouchSessionLastActive(sessionDir) - log.Info(). - Str("session_id", sessionID). - Str("volume", dockerDataVolume). - Msg("Session Docker data dir preserved for potential restart (GC will clean orphans)") - } + sessionDir := filepath.Join("/container-docker/sessions", dockerDataVolume) + TouchSessionLastActive(sessionDir) + log.Info(). + Str("session_id", sessionID). + Msg("Session Docker data dir preserved for potential restart (GC will clean orphans)") } // Update status @@ -1074,10 +1044,6 @@ func (dm *DevContainerManager) DeleteDevContainer(ctx context.Context, sessionID // running containers, and cleans up stale golden cache state. // Should be called periodically and on startup. func (dm *DevContainerManager) GCOrphanedSessions() { - if os.Getenv("CONTAINER_DOCKER_PATH") == "" { - return // Not using per-session docker dirs - } - dm.mu.RLock() active := make(map[string]bool, len(dm.containers)) for sessionID := range dm.containers { @@ -1411,7 +1377,7 @@ func (dm *DevContainerManager) monitorGoldenBuild(dc *DevContainer) { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute) defer cancel() - dockerDataVolume := fmt.Sprintf("docker-data-%s", dc.SessionID) + dockerDataVolume := DockerDataVolumePrefix + dc.SessionID resultFile := filepath.Join(sessionsBaseDir, dockerDataVolume, "docker", ".golden-build-result") // Poll for the result file. 
The workspace-setup script writes this after diff --git a/api/pkg/hydra/golden.go b/api/pkg/hydra/golden.go index c6b80bf58b..c6d76f0d3a 100644 --- a/api/pkg/hydra/golden.go +++ b/api/pkg/hydra/golden.go @@ -302,7 +302,7 @@ func PromoteSessionToGolden(projectID, volumeName string) error { } // CleanupSessionDockerDir removes the per-session Docker data directory. -// Works for both golden-seeded and plain sessions that use CONTAINER_DOCKER_PATH. +// Only used for failed golden builds; normal sessions are cleaned by GC. func CleanupSessionDockerDir(volumeName string) error { base := sessionOverlayDir(volumeName) @@ -561,7 +561,7 @@ func GCOrphanedSessionDirs(activeSessions map[string]bool) (int, int64, error) { // Session dirs are named "docker-data-ses_xxxxx" name := entry.Name() - sessionID := strings.TrimPrefix(name, "docker-data-") + sessionID := strings.TrimPrefix(name, DockerDataVolumePrefix) dir := filepath.Join(sessionsBaseDir, name) if activeSessions[sessionID] { diff --git a/api/pkg/hydra/manager.go b/api/pkg/hydra/manager.go index 3dab27947b..c97e0ca8ae 100644 --- a/api/pkg/hydra/manager.go +++ b/api/pkg/hydra/manager.go @@ -135,18 +135,12 @@ func (m *Manager) setupSharedBuildKit(ctx context.Context) error { Str("cache_dir", buildkitCacheDir). Msg("Creating shared BuildKit container") - // BuildKit state volume: use ZFS-backed bind mount if CONTAINER_DOCKER_PATH is set, - // otherwise use a Docker named volume. ZFS provides dedup for content-addressed data. 
- buildkitStateMount := "buildkit_state:/var/lib/buildkit" - if containerDockerPath := os.Getenv("CONTAINER_DOCKER_PATH"); containerDockerPath != "" { - buildkitStateDir := "/container-docker/buildkit" - if err := os.MkdirAll(buildkitStateDir, 0755); err != nil { - log.Warn().Err(err).Msg("Failed to create buildkit state dir on ZFS, using named volume") - } else { - buildkitStateMount = buildkitStateDir + ":/var/lib/buildkit" - log.Info().Str("path", buildkitStateDir).Msg("Using ZFS-backed bind mount for BuildKit state") - } + // BuildKit state: bind mount from /container-docker/ filesystem (typically ZFS). NOTE(review): the failure path below uses log.Fatal although setupSharedBuildKit returns error — presumably this ends the process instead of letting the caller handle it; confirm that is intended. + buildkitStateDir := "/container-docker/buildkit" + if err := os.MkdirAll(buildkitStateDir, 0755); err != nil { + log.Fatal().Err(err).Msg("Failed to create buildkit state dir") } + buildkitStateMount := buildkitStateDir + ":/var/lib/buildkit" createCmd := exec.CommandContext(ctx, "docker", "run", "-d", "--name", SharedBuildKitContainerName, diff --git a/api/pkg/hydra/types.go b/api/pkg/hydra/types.go index 1c02aeeab7..2d0b8265fd 100644 --- a/api/pkg/hydra/types.go +++ b/api/pkg/hydra/types.go @@ -37,6 +37,11 @@ const ( DevContainerStatusError DevContainerStatus = "error" ) +// DockerDataVolumePrefix is the name prefix of per-session inner dockerd +// data volumes (prefix + session ID). hydra_executor builds the mount source +// with it; devcontainer (bind-mount conversion, session dir paths) and golden (GC of orphaned session dirs) match on it. +const DockerDataVolumePrefix = "docker-data-" + // MountConfig represents a volume mount configuration type MountConfig struct { Source string `json:"source"`