From 5809adaae9d4986d27d9667a1a5779a3b7c3232f Mon Sep 17 00:00:00 2001 From: Pranay Shah Date: Mon, 27 Apr 2026 03:47:32 +0000 Subject: [PATCH] cubemastercli: add --probe-timeout and bump default budget to 120s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a `--probe-timeout` flag to `cubemastercli tpl create-from-image` so callers can tune the readiness-probe wall-clock budget without recompiling, and bumps the built-in default from 30s to 120s. Suggested in PR review by @chenhengqi (add `--probe-timeout`; use `DurationFlag`). The default Probe spec generated by `--probe <port>` previously gave the in-guest application 30s wall-clock (TimeoutMs=30000, PeriodMs=500, FailureThreshold=60) to bind the probe port and answer /health. On nested-KVM hosts (cloud-hypervisor inside a QEMU dev VM, e.g. on a GCP n2 with nested virtualization), 30s isn't enough — the s6 service tree in `ags-image/sandbox-code:latest` consistently needs longer than 30s to bring the code-interpreter HTTP server up on :49999. The upstream quickstart command then fails at `CREATING_TEMPLATE` with "connection refused", and the workaround of omitting --probe takes the snapshot before the code interpreter is ready, producing 502 Bad Gateway from cube-proxy on every later run_code() call. 1. New `--probe-timeout` flag declared as `cli.DurationFlag` (default `120 * time.Second`). When set, controls the total wall-clock probe budget. Internally PeriodMs is fixed at 1000ms; FailureThreshold derives from ceil(budget_in_seconds); TimeoutMs is set to the same total budget so the per-attempt timeout never trips before the failure-count ceiling does. 2. The default budget when `--probe-timeout` is not passed is 120s (vs. the old 30s). Bare-metal callers see no functional change since the probe succeeds on the first poll; slow / nested-KVM hosts get the longer ceiling they need without having to opt in. 3. 
Update `--probe` help text to point at `--probe-timeout` and `--probe-path` rather than baking specific numeric defaults into the help string. Avoids drift between defaults and docs. - `gofmt -l` clean. - `go vet ./cmd/cubemastercli/...` clean. - `go build -ldflags="-s -w" -o cubemastercli ./cmd/cubemastercli` succeeds. - `cubemastercli tpl create-from-image --help` shows the new flag with `(default: 2m0s)`. - After deploying the patched binary on a nested-KVM dev host, the upstream `tpl create-from-image --probe 49999` quickstart command reaches READY cleanly (status: SUCCEEDED, template_status: READY, artifact_status: READY). - `tpl create-from-image --probe 49999 --probe-timeout 30s` (i.e. reverting to the old budget on a slow host) fails as expected; `--probe-timeout 5m` succeeds with extra headroom. - Confirmed end-to-end with `e2b-code-interpreter` Python SDK that sandboxes spawned from the resulting template have the code interpreter up at first call (warm `run_code` ~80–200 ms; no 502s). - Tested on: GCP n2-standard-4, nested virtualization enabled, OpenCloudOS 9 dev VM (per `dev-env/run_vm.sh`), cube-sandbox-one-click v0.1.1. - Hosts where the in-guest application binds the probe port quickly see no functional change. The probe succeeds on the first poll; the ceiling is irrelevant. - Callers who want the old 30s ceiling can pass `--probe-timeout 30s` explicitly. - A genuinely-broken in-guest application (e.g. wrong probe port, missing service) now fails after 120s instead of 30s by default. The 30s ceiling tended to mask "slow but eventually correct" boots as flaky template builds rather than producing useful fast failures. 
Closes #95 Signed-off-by: Pranay Shah --- .../commands/cubebox/template.go | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/CubeMaster/cmd/cubemastercli/commands/cubebox/template.go b/CubeMaster/cmd/cubemastercli/commands/cubebox/template.go index dc70a393..843ff673 100644 --- a/CubeMaster/cmd/cubemastercli/commands/cubebox/template.go +++ b/CubeMaster/cmd/cubemastercli/commands/cubebox/template.go @@ -766,8 +766,9 @@ var TemplateCreateFromImageCommand = cli.Command{ cli.StringSliceFlag{Name: "arg", Usage: "override container CMD (args); repeat for multiple elements"}, cli.StringSliceFlag{Name: "env", Usage: "set environment variable, KEY=VALUE format; repeat for multiple envs"}, cli.StringSliceFlag{Name: "dns", Usage: "set container DNS nameserver; repeat for multiple servers"}, - cli.IntFlag{Name: "probe", Usage: "enable HTTP GET probe on the specified port (e.g. --probe 9000); sets timeout_ms=30000, period_ms=500"}, + cli.IntFlag{Name: "probe", Usage: "enable HTTP GET probe on the specified port (e.g. --probe 9000); see --probe-timeout, --probe-path"}, cli.StringFlag{Name: "probe-path", Value: "/health", Usage: "HTTP path for the readiness probe (default: /health); only effective when --probe is set"}, + cli.DurationFlag{Name: "probe-timeout", Value: 120 * time.Second, Usage: "total probe wall-clock budget (e.g. 30s, 2m); only effective when --probe is set; default 120s suits nested-KVM hosts where the in-guest application takes longer than 30s to bind the probe port"}, cli.IntFlag{Name: "cpu", Value: 2000, Usage: "CPU millicores for the template container (default: 2000, i.e. 
2 cores)"}, cli.IntFlag{Name: "memory", Value: 2000, Usage: "Memory for the template container in MB (default: 2000 MB)"}, cli.BoolFlag{Name: "json", Usage: "print raw json response"}, @@ -1315,6 +1316,16 @@ func parseContainerOverrides(c *cli.Context) (*types.ContainerOverrides, error) if probePath == "" { probePath = "/health" } + probeTimeout := c.Duration("probe-timeout") + if probeTimeout < time.Second { + return nil, fmt.Errorf("--probe-timeout must be at least 1s, got %s", probeTimeout) + } + // PeriodMs is fixed at 1s; FailureThreshold derives from the requested + // total budget (rounded up to whole seconds). TimeoutMs == total budget + // so it never trips before FailureThreshold does. + periodMs := int32(1000) + timeoutMs := int32(probeTimeout.Milliseconds()) + failureThreshold := int32((probeTimeout + time.Second - 1) / time.Second) host := "" overrides.Probe = &types.Probe{ ProbeHandler: &types.ProbeHandler{ @@ -1324,9 +1335,9 @@ func parseContainerOverrides(c *cli.Context) (*types.ContainerOverrides, error) Host: &host, }, }, - TimeoutMs: 30000, - PeriodMs: 500, - FailureThreshold: 60, + TimeoutMs: timeoutMs, + PeriodMs: periodMs, + FailureThreshold: failureThreshold, SuccessThreshold: 1, } }