diff --git a/src/config/mod.rs b/src/config/mod.rs index 3d5b660e..217324eb 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -49,8 +49,17 @@ pub struct GeneralConfig { /// Port for local Whisper STT server (used by voice calls) #[serde(default)] pub whisper_port: Option<u16>, + /// Maximum 1-second retries when waiting for the inference provider to + /// become healthy at startup. Default 60 (matches legacy hardcoded value). + /// Bump for setups where model load is genuinely slower than 60s — e.g. + /// >20B models on a single GPU, or any model split across multiple + /// backends via llama.cpp RPC where layer transfer takes time. + #[serde(default = "default_provider_health_check_retries")] + pub provider_health_check_retries: u32, } +fn default_provider_health_check_retries() -> u32 { 60 } + impl Default for GeneralConfig { fn default() -> Self { Self { @@ -59,6 +68,7 @@ impl Default for GeneralConfig { kokoro_port: Some(8880), flux_port: Some(8890), whisper_port: Some(8891), + provider_health_check_retries: 60, } } } @@ -90,6 +100,13 @@ pub struct LlamaCppConfig { pub visual_token_budget: usize, /// Optional LoRA adapter to load at inference time pub lora_adapter: Option<String>, + /// Context length passed to llama-server `-c`. 0 = auto-detect from GGUF + /// (legacy behavior — stalls on long-default-context models like + /// Qwen3.5/3.6 which advertise 256K). Set explicitly (e.g. 32768) to + /// bound KV cache allocation and pass health-check timeouts on + /// VRAM-limited hardware. 
+ #[serde(default)] + pub context_length: u32, } impl Default for LlamaCppConfig { @@ -107,6 +124,7 @@ impl Default for LlamaCppConfig { sae_embed_port: 8082, visual_token_budget: 560, lora_adapter: None, + context_length: 0, } } } @@ -374,4 +392,36 @@ mod tests { let deserialized: AppConfig = toml::from_str(&serialized).unwrap(); assert_eq!(deserialized.general.active_provider, "llamacpp"); } + + #[test] + fn test_provider_health_check_retries_default_is_legacy_60() { + // Default must match the value previously hardcoded in main.rs so + // that existing tomls without this field deserialize to identical + // pre-patch behavior. + let config = AppConfig::default(); + assert_eq!(config.general.provider_health_check_retries, 60); + } + + #[test] + fn test_provider_health_check_retries_missing_field_deserializes_to_60() { + // A toml file written before this field existed must still parse, + // and the field must default to the legacy hardcoded value. + let toml_without_field = r#" + active_provider = "llamacpp" + data_dir = "data" + "#; + let general: GeneralConfig = toml::from_str(toml_without_field).unwrap(); + assert_eq!(general.provider_health_check_retries, 60); + } + + #[test] + fn test_provider_health_check_retries_explicit_value_honored() { + let toml_with_field = r#" + active_provider = "llamacpp" + data_dir = "data" + provider_health_check_retries = 480 + "#; + let general: GeneralConfig = toml::from_str(toml_with_field).unwrap(); + assert_eq!(general.provider_health_check_retries, 480); + } } diff --git a/src/main.rs b/src/main.rs index 900157fd..5d80dc04 100644 --- a/src/main.rs +++ b/src/main.rs @@ -272,16 +272,20 @@ async fn create_and_verify_provider( .context("Failed to create provider")? 
); - tracing::info!("Waiting for provider health check..."); - let mut retries = 0; + let max_retries = config.general.provider_health_check_retries; + tracing::info!(max_retries, "Waiting for provider health check..."); + let mut retries: u32 = 0; while !provider.health().await { retries += 1; - if retries > 60 { - anyhow::bail!("Provider failed health check after 60 attempts. Is the server running?"); + if retries > max_retries { + anyhow::bail!( + "Provider failed health check after {} attempts. Is the server running?", + max_retries + ); } tokio::time::sleep(tokio::time::Duration::from_secs(1)).await; if retries % 10 == 0 { - tracing::info!(retries, "Still waiting for provider..."); + tracing::info!(retries, max_retries, "Still waiting for provider..."); } } tracing::info!("Provider healthy"); diff --git a/src/provider/llamacpp.rs b/src/provider/llamacpp.rs index 12c7bdd3..cdf948ac 100644 --- a/src/provider/llamacpp.rs +++ b/src/provider/llamacpp.rs @@ -29,6 +29,18 @@ impl LlamaCppProvider { /// Build the llama-server command-line arguments. pub fn build_server_args(&self) -> Vec<String> { + let ctx = if self.config.context_length > 0 { + // §2.7: behavior changes are never silent. When the operator + // sets a non-zero context_length, the auto-derived GGUF value + // is overridden — log it loudly at server-startup time. 
+ tracing::info!( + context_length = self.config.context_length, + "Operator-configured context_length overrides GGUF default" + ); + self.config.context_length.to_string() + } else { + "0".to_string() // 0 = auto-detect from GGUF (legacy default) + }; let mut args = vec![ "--model".to_string(), self.config.model_path.clone(), @@ -36,7 +48,7 @@ impl LlamaCppProvider { self.config.port.to_string(), "--jinja".to_string(), // Use model's built-in Jinja chat template for tool calling "-c".to_string(), - "0".to_string(), // Auto-detect context from GGUF + ctx, "-np".to_string(), "1".to_string(), // Single slot — prevents unused slots wasting KV cache "-ngl".to_string(), @@ -515,4 +527,31 @@ mod tests { "Error classification must match: {}", msg); } } + + /// Helper: extract the value passed to the `-c` flag from build_server_args. + fn ctx_arg(args: &[String]) -> &str { + let pos = args.iter().position(|a| a == "-c") + .expect("build_server_args must always emit -c"); + &args[pos + 1] + } + + #[test] + fn test_build_server_args_default_context_passes_zero() { + // Default config has context_length = 0, which must produce `-c 0` + // (legacy behavior — llama-server interprets 0 as auto-detect from GGUF). + let config = LlamaCppConfig::default(); + let provider = LlamaCppProvider::new(&config); + let args = provider.build_server_args(); + assert_eq!(ctx_arg(&args), "0"); + } + + #[test] + fn test_build_server_args_explicit_context_overrides() { + // Setting context_length must produce `-c <value>` instead of `-c 0`. + let mut config = LlamaCppConfig::default(); + config.context_length = 32768; + let provider = LlamaCppProvider::new(&config); + let args = provider.build_server_args(); + assert_eq!(ctx_arg(&args), "32768"); + } }