Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions src/config/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,17 @@ pub struct GeneralConfig {
/// Port for local Whisper STT server (used by voice calls)
#[serde(default)]
pub whisper_port: Option<u16>,
/// Maximum number of health-check retries, spaced one second apart, while
/// waiting for the inference provider to become healthy at startup.
/// Default 60 (matches legacy hardcoded value).
/// Bump for setups where model load is genuinely slower than 60s — e.g.
/// >20B models on a single GPU, or any model split across multiple
/// backends via llama.cpp RPC where layer transfer takes time.
#[serde(default = "default_provider_health_check_retries")]
pub provider_health_check_retries: u32,
}

/// Serde fallback for `provider_health_check_retries` when the field is
/// absent from the config file.
fn default_provider_health_check_retries() -> u32 {
    // Same retry limit that was hardcoded before the field became configurable.
    const LEGACY_RETRY_LIMIT: u32 = 60;
    LEGACY_RETRY_LIMIT
}

impl Default for GeneralConfig {
fn default() -> Self {
Self {
Expand All @@ -59,6 +68,7 @@ impl Default for GeneralConfig {
kokoro_port: Some(8880),
flux_port: Some(8890),
whisper_port: Some(8891),
provider_health_check_retries: 60,
}
}
}
Expand Down Expand Up @@ -90,6 +100,13 @@ pub struct LlamaCppConfig {
pub visual_token_budget: usize,
/// Optional LoRA adapter to load at inference time
pub lora_adapter: Option<String>,
/// Context length passed to llama-server `-c`. 0 = auto-detect from GGUF
/// (legacy behavior — stalls on long-default-context models like
/// Qwen3.5/3.6 which advertise 256K). Set explicitly (e.g. 32768) to
/// bound KV cache allocation and pass health-check timeouts on
/// VRAM-limited hardware.
#[serde(default)]
pub context_length: u32,
}

impl Default for LlamaCppConfig {
Expand All @@ -107,6 +124,7 @@ impl Default for LlamaCppConfig {
sae_embed_port: 8082,
visual_token_budget: 560,
lora_adapter: None,
context_length: 0,
}
}
}
Expand Down Expand Up @@ -374,4 +392,36 @@ mod tests {
let deserialized: AppConfig = toml::from_str(&serialized).unwrap();
assert_eq!(deserialized.general.active_provider, "llamacpp");
}

#[test]
fn test_provider_health_check_retries_default_is_legacy_60() {
    // The built-in default has to equal the retry count that main.rs used
    // to hardcode; otherwise configs that never mention this field would
    // start behaving differently from before the field was introduced.
    let default_retries = AppConfig::default().general.provider_health_check_retries;
    assert_eq!(default_retries, 60);
}

#[test]
fn test_provider_health_check_retries_missing_field_deserializes_to_60() {
    // Config files that predate this field have to keep parsing, and the
    // absent field has to fall back to the legacy hardcoded value.
    let legacy_toml = r#"
active_provider = "llamacpp"
data_dir = "data"
"#;
    let parsed = toml::from_str::<GeneralConfig>(legacy_toml).unwrap();
    assert_eq!(parsed.provider_health_check_retries, 60);
}

#[test]
fn test_provider_health_check_retries_explicit_value_honored() {
    // A value written in the config file must win over the default.
    let customized_toml = r#"
active_provider = "llamacpp"
data_dir = "data"
provider_health_check_retries = 480
"#;
    let parsed = toml::from_str::<GeneralConfig>(customized_toml).unwrap();
    assert_eq!(parsed.provider_health_check_retries, 480);
}
}
14 changes: 9 additions & 5 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -272,16 +272,20 @@ async fn create_and_verify_provider(
.context("Failed to create provider")?
);

tracing::info!("Waiting for provider health check...");
let mut retries = 0;
let max_retries = config.general.provider_health_check_retries;
tracing::info!(max_retries, "Waiting for provider health check...");
let mut retries: u32 = 0;
while !provider.health().await {
retries += 1;
if retries > 60 {
anyhow::bail!("Provider failed health check after 60 attempts. Is the server running?");
if retries > max_retries {
anyhow::bail!(
"Provider failed health check after {} attempts. Is the server running?",
max_retries
);
}
tokio::time::sleep(tokio::time::Duration::from_secs(1)).await;
if retries % 10 == 0 {
tracing::info!(retries, "Still waiting for provider...");
tracing::info!(retries, max_retries, "Still waiting for provider...");
}
}
tracing::info!("Provider healthy");
Expand Down
41 changes: 40 additions & 1 deletion src/provider/llamacpp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,26 @@ impl LlamaCppProvider {

/// Build the llama-server command-line arguments.
pub fn build_server_args(&self) -> Vec<String> {
let ctx = if self.config.context_length > 0 {
// §2.7: behavior changes are never silent. When the operator
// sets a non-zero context_length, the auto-derived GGUF value
// is overridden — log it loudly at server-startup time.
tracing::info!(
context_length = self.config.context_length,
"Operator-configured context_length overrides GGUF default"
);
self.config.context_length.to_string()
} else {
"0".to_string() // 0 = auto-detect from GGUF (legacy default)
};
let mut args = vec![
"--model".to_string(),
self.config.model_path.clone(),
"--port".to_string(),
self.config.port.to_string(),
"--jinja".to_string(), // Use model's built-in Jinja chat template for tool calling
"-c".to_string(),
"0".to_string(), // Auto-detect context from GGUF
ctx,
"-np".to_string(),
"1".to_string(), // Single slot — prevents unused slots wasting KV cache
"-ngl".to_string(),
Expand Down Expand Up @@ -515,4 +527,31 @@ mod tests {
"Error classification must match: {}", msg);
}
}

/// Helper: extract the value passed to the `-c` flag from build_server_args.
///
/// Returns the element that immediately follows the first `-c` in `args`.
///
/// # Panics
///
/// Panics if `args` contains no `-c` entry, or if `-c` is the final element
/// and therefore has no value after it. Both cases report a descriptive
/// message so a failing test points at the malformed argument list rather
/// than at a bare index-out-of-bounds.
fn ctx_arg(args: &[String]) -> &str {
    let flag_index = args
        .iter()
        .position(|arg| arg == "-c")
        .expect("build_server_args must always emit -c");
    // Checked lookup: a trailing `-c` must fail with a clear message.
    args.get(flag_index + 1)
        .map(String::as_str)
        .expect("build_server_args must emit a value after -c")
}

#[test]
fn test_build_server_args_default_context_passes_zero() {
    // An untouched config leaves context_length at 0, so the generated
    // command line must carry `-c 0` (legacy behavior — llama-server
    // interprets 0 as auto-detect from GGUF).
    let default_config = LlamaCppConfig::default();
    let server_args = LlamaCppProvider::new(&default_config).build_server_args();
    assert_eq!(ctx_arg(&server_args), "0");
}

#[test]
fn test_build_server_args_explicit_context_overrides() {
    // A non-zero context_length must be forwarded as `-c <value>` in place
    // of the `-c 0` default.
    let mut custom_config = LlamaCppConfig::default();
    custom_config.context_length = 32768;
    let server_args = LlamaCppProvider::new(&custom_config).build_server_args();
    assert_eq!(ctx_arg(&server_args), "32768");
}
}