MettaMazza · Scooter-DeJean · May 5, 2026 · May 5, 2026
diff --git a/src/config/mod.rs b/src/config/mod.rs
@@ -49,8 +49,17 @@ pub struct GeneralConfig {
     /// Port for local Whisper STT server (used by voice calls)
     #[serde(default)]
     pub whisper_port: Option<u16>,
+    /// Maximum number of 1-second-spaced retries while waiting for the
+    /// inference provider to become healthy at startup. Default 60 (matches
+    /// the legacy hardcoded value). Raise it where model load is genuinely
+    /// slower than 60s — e.g. >20B models on a single GPU, or any model split
+    /// across multiple backends via llama.cpp RPC (layer transfer takes time).
+    #[serde(default = "default_provider_health_check_retries")]
+    pub provider_health_check_retries: u32,
 }
 
+fn default_provider_health_check_retries() -> u32 { 60 }
+
 impl Default for GeneralConfig {
     fn default() -> Self {
         Self {
@@ -59,6 +68,7 @@ impl Default for GeneralConfig {
             kokoro_port: Some(8880),
             flux_port: Some(8890),
             whisper_port: Some(8891),
+            provider_health_check_retries: default_provider_health_check_retries(),
         }
     }
 }
@@ -90,6 +100,13 @@ pub struct LlamaCppConfig {
     pub visual_token_budget: usize,
     /// Optional LoRA adapter to load at inference time
     pub lora_adapter: Option<String>,
+    /// Context length passed to llama-server `-c`. 0 = auto-detect from GGUF
+    /// (legacy behavior — stalls on long-default-context models like
+    /// Qwen3.5/3.6, which advertise 256K). Set it explicitly (e.g. 32768) to
+    /// bound KV cache allocation so startup completes within the health-check
+    /// timeout on VRAM-limited hardware.
+    #[serde(default)]
+    pub context_length: u32,
 }
 
 impl Default for LlamaCppConfig {
@@ -107,6 +124,7 @@ impl Default for LlamaCppConfig {
             sae_embed_port: 8082,
             visual_token_budget: 560,
             lora_adapter: None,
+            context_length: 0,
         }
     }
 }
@@ -374,4 +392,36 @@ mod tests {
         let deserialized: AppConfig = toml::from_str(&serialized).unwrap();
         assert_eq!(deserialized.general.active_provider, "llamacpp");
     }
+
+    #[test]
+    fn test_provider_health_check_retries_default_is_legacy_60() {
+        // Default must match the value previously hardcoded in main.rs so
+        // that existing tomls without this field deserialize to identical
+        // pre-patch behavior.
+        let config = AppConfig::default();
+        assert_eq!(config.general.provider_health_check_retries, 60);
+    }
+
+    #[test]
+    fn test_provider_health_check_retries_missing_field_deserializes_to_60() {
+        // A toml file written before this field existed must still parse,
+        // and the field must default to the legacy hardcoded value.
+        let toml_without_field = r#"
+            active_provider = "llamacpp"
+            data_dir = "data"
+        "#;
+        let general: GeneralConfig = toml::from_str(toml_without_field).unwrap();
+        assert_eq!(general.provider_health_check_retries, 60);
+    }
+
+    #[test]
+    fn test_provider_health_check_retries_explicit_value_honored() {
+        let toml_with_field = r#"
+            active_provider = "llamacpp"
+            data_dir = "data"
+            provider_health_check_retries = 480
+        "#;
+        let general: GeneralConfig = toml::from_str(toml_with_field).unwrap();
+        assert_eq!(general.provider_health_check_retries, 480);
+    }
 }
diff --git a/src/main.rs b/src/main.rs
@@ -272,16 +272,20 @@ async fn create_and_verify_provider(
             .context("Failed to create provider")?
     );
 
-    tracing::info!("Waiting for provider health check...");
-    let mut retries = 0;
+    let max_retries = config.general.provider_health_check_retries;
+    tracing::info!(max_retries, "Waiting for provider health check...");
+    let mut retries: u32 = 0;
     while !provider.health().await {
         retries += 1;
-        if retries > 60 {
-            anyhow::bail!("Provider failed health check after 60 attempts. Is the server running?");
+        if retries > max_retries {
+            anyhow::bail!(
+                "Provider failed health check after {} attempts. Is the server running?",
+                max_retries
+            );
         }
         tokio::time::sleep(tokio::time::Duration::from_secs(1)).await;
         if retries % 10 == 0 {
-            tracing::info!(retries, "Still waiting for provider...");
+            tracing::info!(retries, max_retries, "Still waiting for provider...");
         }
     }
     tracing::info!("Provider healthy");

diff --git a/src/provider/llamacpp.rs b/src/provider/llamacpp.rs
@@ -29,14 +29,26 @@ impl LlamaCppProvider {
 
     /// Build the llama-server command-line arguments.
     pub fn build_server_args(&self) -> Vec<String> {
+        let ctx = if self.config.context_length > 0 {
+            // §2.7: behavior changes are never silent. When the operator
+            // sets a non-zero context_length, the auto-derived GGUF value
+            // is overridden — log it loudly at server-startup time.
+            tracing::info!(
+                context_length = self.config.context_length,
+                "Operator-configured context_length overrides GGUF default"
+            );
+            self.config.context_length.to_string()
+        } else {
+            "0".to_string() // 0 = auto-detect from GGUF (legacy default)
+        };
         let mut args = vec![
             "--model".to_string(),
             self.config.model_path.clone(),
             "--port".to_string(),
             self.config.port.to_string(),
             "--jinja".to_string(), // Use model's built-in Jinja chat template for tool calling
             "-c".to_string(),
-            "0".to_string(), // Auto-detect context from GGUF
+            ctx,
             "-np".to_string(),
             "1".to_string(), // Single slot — prevents unused slots wasting KV cache
             "-ngl".to_string(),
@@ -515,4 +527,31 @@ mod tests {
                 "Error classification must match: {}", msg);
         }
     }
+
+    /// Helper: value following `-c` in build_server_args output; panics if absent.
+    fn ctx_arg(args: &[String]) -> &str {
+        let pos = args.iter().position(|a| a == "-c")
+            .expect("build_server_args must always emit -c");
+        args.get(pos + 1).map(String::as_str).expect("`-c` must be followed by a value")
+    }
+
+    #[test]
+    fn test_build_server_args_default_context_passes_zero() {
+        // Default config has context_length = 0, which must produce `-c 0`
+        // (legacy behavior — llama-server interprets 0 as auto-detect from GGUF).
+        let config = LlamaCppConfig::default();
+        let provider = LlamaCppProvider::new(&config);
+        let args = provider.build_server_args();
+        assert_eq!(ctx_arg(&args), "0");
+    }
+
+    #[test]
+    fn test_build_server_args_explicit_context_overrides() {
+        // Setting context_length must produce `-c <value>` instead of `-c 0`.
+        let mut config = LlamaCppConfig::default();
+        config.context_length = 32768;
+        let provider = LlamaCppProvider::new(&config);
+        let args = provider.build_server_args();
+        assert_eq!(ctx_arg(&args), "32768");
+    }
 }