Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@
"description": "HTTP health path. Use an empty string for non-HTTP or one-shot services whose readiness is represented by container state/startup_check."
},
"type": { "type": "string", "enum": ["docker", "host-systemd"] },
"health_type": {
"type": "string",
"enum": ["http", "tcp", "none"],
"description": "Health check type. http = HTTP probe (default), tcp = TCP port check, none = no network probe (CLI/one-shot services only)"
},
"gpu_backends": {
"type": "array",
"items": { "type": "string", "enum": ["amd", "nvidia", "apple", "all", "none"] },
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ service:
external_port_env: ''
external_port_default: 0
health: ""
health_type: none
type: docker
startup_check: false # one-shot CLI tool; container exits 0 by design
gpu_backends: [amd, nvidia, apple]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ service:
external_port_env: PIPER_PORT
external_port_default: 10200
health: ""
health_type: tcp
health_timeout: 5
type: docker
gpu_backends: [amd, nvidia, apple]
compose_file: compose.yaml
Expand Down
5 changes: 5 additions & 0 deletions dream-server/extensions/schema/service-manifest.v1.json
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,11 @@
"maximum": 300,
"description": "Health check timeout in seconds (default: 10)"
},
"health_type": {
"type": "string",
"enum": ["http", "tcp", "none"],
"description": "Health check type. http = HTTP probe (default), tcp = TCP port check, none = no network probe (CLI/one-shot services only)"
},
"ui_path": {
"type": "string",
"pattern": "^/.*",
Expand Down
2 changes: 2 additions & 0 deletions dream-server/extensions/services/dashboard-api/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ def load_extension_manifests(
"port": int(service.get("port", 0)),
"external_port": external_port,
"health": service.get("health", "/health"),
"health_type": service.get("health_type", "http"),
"health_timeout": int(service.get("health_timeout", 10)),
"name": service.get("name", service_id),
"ui_path": service.get("ui_path", "/"),
"external_link": bool(service.get("external_link", True)),
Expand Down
28 changes: 28 additions & 0 deletions dream-server/extensions/services/dashboard-api/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,34 @@ async def check_service_health(
return _service_status_from_config(service_id, config, "not_deployed")

host = config.get('host', 'localhost')
health_type = config.get('health_type', 'http')

# health_type=none: no network probe, report skipped
if health_type == 'none':
return _service_status_from_config(service_id, config, "skipped")

# health_type=tcp: check TCP port instead of HTTP
if health_type == 'tcp':
health_port = config.get('health_port', config['port'])
tcp_timeout = config.get('health_timeout', 5)
try:
start = asyncio.get_event_loop().time()
reader, writer = await asyncio.wait_for(
asyncio.open_connection(host, int(health_port)),
timeout=tcp_timeout
)
writer.close()
await writer.wait_closed()
response_time = (asyncio.get_event_loop().time() - start) * 1000
return ServiceStatus(
id=service_id, name=config["name"], port=config["port"],
external_port=config.get("external_port", config["port"]),
status="healthy", response_time_ms=round(response_time, 1)
)
except (asyncio.TimeoutError, OSError):
return _service_status_from_config(service_id, config, "down")

# health_type=http (default): HTTP health check
health_port = config.get('health_port', config['port'])
url = f"http://{host}:{health_port}{config['health']}"
status = "unknown"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,17 @@ def _is_one_shot_extension(ext: dict) -> bool:
return ext.get("port") == 0


def _get_health_type(ext: dict, user_cfg: dict | None = None) -> str:
"""Return the health_type for an extension.

Prefer the runtime user config (which may have been regenerated), fall back
to the catalog entry, then default to http.
"""
if user_cfg:
return user_cfg.get("health_type", "http")
return ext.get("health_type", "http")


def _compute_extension_status(ext: dict, services_by_id: dict) -> str:
"""Compute the runtime status of an extension."""
ext_id = ext["id"]
Expand Down Expand Up @@ -207,6 +218,13 @@ def _compute_extension_status(ext: dict, services_by_id: dict) -> str:
# Core service loaded from manifests
if ext_id in SERVICES:
svc = services_by_id.get(ext_id)
health_type = _get_health_type(ext)
if health_type == "none":
# health_type=none: no network probe, rely on container state
# If we have a service status and it's "skipped", treat as enabled
if svc and svc.status in ("skipped", "healthy"):
return "enabled"
return "disabled"
if svc and svc.status == "healthy":
return "enabled"
return "disabled"
Expand All @@ -221,6 +239,11 @@ def _compute_extension_status(ext: dict, services_by_id: dict) -> str:
if one_shot:
return "cli_installed"
svc = services_by_id.get(ext_id)
health_type = _get_health_type(ext)
if health_type == "none":
if svc and svc.status in ("skipped", "healthy"):
return "enabled"
return "disabled"
if svc and svc.status == "healthy":
return "enabled"
# HTTP 4xx/5xx from the health endpoint is the clearest "container
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ def scan_user_extension_services(
"port": int(port),
"external_port": int(svc.get("external_port_default", port)),
"health": health,
"health_type": svc.get("health_type", "http"),
"health_timeout": int(svc.get("health_timeout", 10)),
"name": name,
# Optional: extensions whose health endpoint lives on a
# secondary port (e.g. milvus 9091) need an explicit
Expand Down
3 changes: 3 additions & 0 deletions dream-server/lib/service-registry.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ declare -A SERVICE_CATEGORIES # service_id → core|recommended|optional
declare -A SERVICE_DEPENDS # service_id → space-separated dependency IDs
declare -A SERVICE_HEALTH # service_id → health endpoint path
declare -A SERVICE_HEALTH_TIMEOUTS # service_id → health check timeout in seconds
declare -A SERVICE_HEALTH_TYPES # service_id → health check type (http/tcp/none)
declare -A SERVICE_PORTS # service_id → external port (what the user hits on localhost)
declare -A SERVICE_PORT_ENVS # service_id → env var name for the external port
# Services with `host_network: true` in their manifest (Docker
Expand Down Expand Up @@ -177,11 +178,13 @@ for service_dir in _all_service_dirs:
print(f'SERVICE_DEPENDS["{_esc(sid)}"]="{_esc(" ".join(str(d) for d in depends))}"')
health = s.get("health", "/health")
health_timeout = s.get("health_timeout", 5) # Default 5 seconds
health_type = s.get("health_type", "http") # http (default), tcp, or none
port = s.get("external_port_default", s.get("port", 0))
port_env = s.get("external_port_env", "")
host_network = "1" if s.get("host_network") else ""
print(f'SERVICE_HEALTH["{_esc(sid)}"]="{_esc(health)}"')
print(f'SERVICE_HEALTH_TIMEOUTS["{_esc(sid)}"]="{_esc(health_timeout)}"')
print(f'SERVICE_HEALTH_TYPES["{_esc(sid)}"]="{_esc(health_type)}"')
print(f'SERVICE_PORTS["{_esc(sid)}"]="{_esc(port)}"')
print(f'SERVICE_PORT_ENVS["{_esc(sid)}"]="{_esc(port_env)}"')
if host_network:
Expand Down
72 changes: 69 additions & 3 deletions dream-server/scripts/audit-extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,14 +500,80 @@ def validate_records(
record.add_issue("error", "service-port-invalid", "service.port must be a positive integer", path=record.manifest_path)

health = str(service.get("health") or "")
if not health.startswith("/") and not host_network:
health_type = str(service.get("health_type") or "http")
valid_health_types = ("http", "tcp", "none")
if health_type not in valid_health_types:
record.add_issue(
"error",
"service-health-invalid",
"service.health must start with '/'",
"service-health-type-invalid",
f"service.health_type must be one of {valid_health_types}, got '{health_type}'",
path=record.manifest_path,
)

if health_type == "none":
# health_type=none should only be used with port=0 and startup_check=false
if port and port > 0:
record.add_issue(
"error",
"service-health-type-none-port",
"service.health_type=none requires port=0 (CLI/one-shot service)",
path=record.manifest_path,
)
startup_check = service.get("startup_check")
if startup_check is not False:
record.add_issue(
"error",
"service-health-type-none-startup-check",
"service.health_type=none requires startup_check=false",
path=record.manifest_path,
)
# health endpoint should be empty for none type
if health and health != "":
record.add_issue(
"warning",
"service-health-type-none-health",
"service.health should be empty when health_type=none",
path=record.manifest_path,
)
elif health_type == "tcp":
# tcp type needs a valid port
if not port or port <= 0:
record.add_issue(
"error",
"service-health-type-tcp-port",
"service.health_type=tcp requires a valid port",
path=record.manifest_path,
)
# health endpoint should be empty for tcp type
if health and health != "":
record.add_issue(
"warning",
"service-health-type-tcp-health",
"service.health should be empty when health_type=tcp (port check only)",
path=record.manifest_path,
)
else:
# http type (default)
if not health.startswith("/") and not host_network:
record.add_issue(
"error",
"service-health-invalid",
"service.health must be a non-empty path starting with '/' for health_type=http",
path=record.manifest_path,
)

# Validate health_timeout if present
health_timeout = service.get("health_timeout")
if health_timeout is not None:
ht = parse_positive_int(health_timeout)
if ht is None or ht <= 0 or ht > 300:
record.add_issue(
"error",
"service-health-timeout-invalid",
"service.health_timeout must be an integer between 1 and 300",
path=record.manifest_path,
)

ext_port_default = service.get("external_port_default")
if ext_port_default not in (None, "") and parse_non_negative_int(ext_port_default) is None:
record.add_issue(
Expand Down
19 changes: 17 additions & 2 deletions dream-server/scripts/dream-doctor.sh
Original file line number Diff line number Diff line change
Expand Up @@ -242,8 +242,23 @@ collect_extension_diagnostics() {
if [[ "$container_state" == "running" ]]; then
local port="${SERVICE_PORTS[$sid]:-0}"
local health="${SERVICE_HEALTH[$sid]:-}"
if [[ "$port" != "0" && -n "$health" ]]; then
if curl -sf --max-time 5 "http://127.0.0.1:${port}${health}" >/dev/null 2>&1; then
local health_type="${SERVICE_HEALTH_TYPES[$sid]:-http}"
local health_timeout="${SERVICE_HEALTH_TIMEOUTS[$sid]:-5}"

if [[ "$health_type" == "none" ]]; then
# CLI/one-shot service: no network probe, container running = healthy
health_status="healthy"
elif [[ "$health_type" == "tcp" ]]; then
# TCP health check: verify port accepts connections
if timeout "$health_timeout" bash -c "echo >/dev/tcp/127.0.0.1/$port" 2>/dev/null; then
health_status="healthy"
else
health_status="unhealthy"
issues+=("health_check_failed")
fi
elif [[ "$port" != "0" && -n "$health" ]]; then
# HTTP health check (default)
if curl -sf --max-time "$health_timeout" "http://127.0.0.1:${port}${health}" >/dev/null 2>&1; then
health_status="healthy"
else
health_status="unhealthy"
Expand Down
3 changes: 3 additions & 0 deletions dream-server/scripts/generate-extensions-catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ def extract_entry(manifest: dict) -> dict | None:
"port": service.get("port", 0),
"external_port_default": service.get("external_port_default", 0),
"health_endpoint": service.get("health", ""),
"health_type": service.get("health_type", "http"),
"env_vars": strip_secrets(env_vars),
"tags": manifest.get("tags") or service.get("tags", []),
"features": manifest.get("features") or service.get("features", []),
Expand All @@ -111,6 +112,8 @@ def extract_entry(manifest: dict) -> dict | None:
entry["startup_check"] = service.get("startup_check")
if "startup_timeout" in service:
entry["startup_timeout"] = service.get("startup_timeout")
if "health_timeout" in service:
entry["health_timeout"] = service.get("health_timeout")

return entry

Expand Down
51 changes: 48 additions & 3 deletions dream-server/scripts/health-check.sh
Original file line number Diff line number Diff line change
Expand Up @@ -157,22 +157,49 @@ test_service() {
local default_port="${SERVICE_PORTS[$sid]}"
local health="${SERVICE_HEALTH[$sid]}"
local timeout="${SERVICE_HEALTH_TIMEOUTS[$sid]:-$TIMEOUT}"
local health_type="${SERVICE_HEALTH_TYPES[$sid]:-http}"

# Resolve port
local port="$default_port"
[[ -n "$port_env" ]] && port="${!port_env:-$default_port}"

[[ -z "$health" || "$port" == "0" ]] && return 1

# Check container state first (if docker available)
# Check container state first (if docker available) — for ALL health types
local container_state
container_state=$(check_container_state "$sid")
if [[ -n "$container_state" && "$container_state" != "running" ]]; then
# For health_type=none, container state is the only signal we have
if [[ "$health_type" == "none" ]]; then
result_set "$sid" "fail"
ANY_FAIL=true
return 1
fi
# For http/tcp, fail early — no point probing a stopped container
result_set "$sid" "fail"
ANY_FAIL=true
return 1
fi

# Skip network probe for CLI/one-shot services (health_type=none)
if [[ "$health_type" == "none" ]]; then
result_set "$sid" "skipped"
return 0
fi

[[ -z "$health" || "$port" == "0" ]] && return 1

# TCP health check with timeout — verify the port accepts connections
if [[ "$health_type" == "tcp" ]]; then
local tcp_timeout="${timeout:-5}"
if timeout "$tcp_timeout" bash -c "echo >/dev/tcp/127.0.0.1/\"$port\"" 2>/dev/null; then
result_set "$sid" "ok"
return 0
fi
result_set "$sid" "fail"
ANY_FAIL=true
return 1
fi

# HTTP health check (default)
if curl -sf --max-time "$timeout" "http://127.0.0.1:${port}${health}" >/dev/null 2>&1; then
result_set "$sid" "ok"
return 0
Expand Down Expand Up @@ -229,6 +256,13 @@ test_disk() {
check_service() {
local sid="$1"
local name="${SERVICE_NAMES[$sid]:-$sid}"
local health_type="${SERVICE_HEALTH_TYPES[$sid]:-http}"

# Skip display for CLI tools
if [[ "$health_type" == "none" ]]; then
return 0
fi

if test_service "$sid" 2>/dev/null; then
log " ${GREEN}✓${NC} $name - healthy"
return 0
Expand All @@ -243,6 +277,13 @@ check_service_async() {
local sid="$1"
local result_file="$2"

# Skip CLI tools (health_type=none)
local health_type="${SERVICE_HEALTH_TYPES[$sid]:-http}"
if [[ "$health_type" == "none" ]]; then
echo "skipped:$sid:" > "$result_file"
return 0
fi

# Check container state first
local container_state
container_state=$(check_container_state "$sid")
Expand Down Expand Up @@ -303,6 +344,8 @@ for sid in "${CORE_SIDS[@]}"; do

if [[ "$status" == "ok" ]]; then
log " ${GREEN}✓${NC} $name - healthy"
elif [[ "$status" == "skipped" ]]; then
log " ${CYAN}-${NC} $name - skipped (CLI tool)"
else
# Use container state for better error message
case "$container_state" in
Expand Down Expand Up @@ -357,6 +400,8 @@ for sid in "${EXT_SIDS[@]}"; do

if [[ "$status" == "ok" ]]; then
log " ${GREEN}✓${NC} $name - healthy"
elif [[ "$status" == "skipped" ]]; then
log " ${CYAN}-${NC} $name - skipped (CLI tool)"
else
# Use container state for better error message
case "$container_state" in
Expand Down
Loading