diff --git a/crates/openfang-api/src/middleware.rs b/crates/openfang-api/src/middleware.rs
index 4efb5bc310..094296fc40 100644
--- a/crates/openfang-api/src/middleware.rs
+++ b/crates/openfang-api/src/middleware.rs
@@ -234,14 +234,15 @@ pub async fn security_headers(request: Request, next: Next) -> Response {
[security_headers hunk body not recoverable from the source]

[the diff/hunk headers for the metering module are likewise missing; the hunk
below changes MeteringEngine::estimate_cost_with_catalog, whose signature gains
a provider: &str parameter ahead of the token counts]
     ) -> f64 {
-    let (input_per_m, output_per_m) = catalog.pricing(model).unwrap_or((1.0, 3.0));
+    let fallback = if is_local_provider(provider) {
+        (0.0, 0.0)
+    } else {
+        (1.0, 3.0)
+    };
+    let (input_per_m, output_per_m) = catalog.pricing(model).unwrap_or(fallback);
     let input_cost = (input_tokens as f64 / 1_000_000.0) * input_per_m;
     let output_cost = (output_tokens as f64 / 1_000_000.0) * output_per_m;
     input_cost + output_cost
@@ -212,6 +225,26 @@ impl MeteringEngine {
     }
 }
 
+/// True when the provider runs inference locally (zero per-token cost).
+///
+/// Used by `estimate_cost_with_catalog` to pick a $0/$0 fallback for
+/// models that aren't explicitly registered in the catalog. A custom
+/// Ollama Modelfile like `my-model:latest` will miss the catalog but
+/// still cost nothing to run, so it should not trip budget quotas.
+fn is_local_provider(provider: &str) -> bool {
+    matches!(
+        provider.to_lowercase().as_str(),
+        "ollama"
+            | "vllm"
+            | "lmstudio"
+            | "lm-studio"
+            | "lemonade"
+            | "llamacpp"
+            | "llama.cpp"
+            | "local"
+    )
+}
+
 /// Budget status snapshot — current spend vs limits for all time windows.
 #[derive(Debug, Clone, serde::Serialize)]
 pub struct BudgetStatus {
@@ -758,6 +791,7 @@ mod tests {
         let cost = MeteringEngine::estimate_cost_with_catalog(
             &catalog,
             "claude-sonnet-4-20250514",
+            "anthropic",
             1_000_000,
             1_000_000,
         );
@@ -768,24 +802,90 @@ mod tests {
     fn test_estimate_cost_with_catalog_alias() {
         let catalog = openfang_runtime::model_catalog::ModelCatalog::new();
         // "sonnet" alias should resolve to same pricing
-        let cost =
-            MeteringEngine::estimate_cost_with_catalog(&catalog, "sonnet", 1_000_000, 1_000_000);
+        let cost = MeteringEngine::estimate_cost_with_catalog(
+            &catalog,
+            "sonnet",
+            "anthropic",
+            1_000_000,
+            1_000_000,
+        );
         assert!((cost - 18.0).abs() < 0.01);
     }
 
     #[test]
-    fn test_estimate_cost_with_catalog_unknown_uses_default() {
+    fn test_estimate_cost_with_catalog_unknown_cloud_uses_default() {
         let catalog = openfang_runtime::model_catalog::ModelCatalog::new();
-        // Unknown model falls back to $1/$3
+        // Unknown cloud model falls back to $1/$3 — surfaces cost, doesn't hide it.
         let cost = MeteringEngine::estimate_cost_with_catalog(
             &catalog,
             "totally-unknown-model",
+            "openai",
             1_000_000,
             1_000_000,
         );
         assert!((cost - 4.0).abs() < 0.01);
     }
 
+    #[test]
+    fn test_estimate_cost_with_catalog_unknown_local_is_free() {
+        let catalog = openfang_runtime::model_catalog::ModelCatalog::new();
+        // Unknown local model (e.g. custom Ollama Modelfile) → $0.
+        // This prevents false budget-quota trips on zero-cost inference.
+        for provider in [
+            "ollama",
+            "Ollama",
+            "OLLAMA",
+            "vllm",
+            "lmstudio",
+            "lm-studio",
+            "lemonade",
+            "llamacpp",
+            "llama.cpp",
+            "local",
+        ] {
+            let cost = MeteringEngine::estimate_cost_with_catalog(
+                &catalog,
+                "gemma4-agent",
+                provider,
+                1_000_000,
+                1_000_000,
+            );
+            assert_eq!(cost, 0.0, "provider {provider} must default to $0");
+        }
+    }
+
+    #[test]
+    fn test_estimate_cost_with_catalog_known_model_ignores_provider_hint() {
+        let catalog = openfang_runtime::model_catalog::ModelCatalog::new();
+        // When the model IS in the catalog, catalog pricing wins regardless
+        // of the provider hint. This guards against a caller mislabeling a
+        // known cloud model with a "local" provider tag.
+        let cost = MeteringEngine::estimate_cost_with_catalog(
+            &catalog,
+            "claude-sonnet-4-20250514",
+            "ollama",
+            1_000_000,
+            1_000_000,
+        );
+        assert!((cost - 18.0).abs() < 0.01);
+    }
+
+    #[test]
+    fn test_is_local_provider() {
+        assert!(super::is_local_provider("ollama"));
+        assert!(super::is_local_provider("OLLAMA"));
+        assert!(super::is_local_provider("vllm"));
+        assert!(super::is_local_provider("lmstudio"));
+        assert!(super::is_local_provider("lm-studio"));
+        assert!(super::is_local_provider("lemonade"));
+        assert!(super::is_local_provider("llamacpp"));
+        assert!(super::is_local_provider("llama.cpp"));
+        assert!(super::is_local_provider("local"));
+        assert!(!super::is_local_provider("anthropic"));
+        assert!(!super::is_local_provider("openai"));
+        assert!(!super::is_local_provider(""));
+    }
+
     #[test]
     fn test_get_summary() {
         let engine = setup();
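For review convenience, here is a self-contained sketch of the fallback decision the metering hunks introduce. `Catalog` below is a toy stand-in for `openfang_runtime::model_catalog::ModelCatalog`, and the $3/$15 Sonnet pricing is assumed for illustration; only the control flow mirrors the patch.

```rust
use std::collections::HashMap;

/// Toy stand-in for openfang_runtime's ModelCatalog (illustrative only).
struct Catalog(HashMap<&'static str, (f64, f64)>);

impl Catalog {
    /// Per-million-token (input, output) pricing, if the model is registered.
    fn pricing(&self, model: &str) -> Option<(f64, f64)> {
        self.0.get(model).copied()
    }
}

/// Mirrors the provider check added by this patch.
fn is_local_provider(provider: &str) -> bool {
    matches!(
        provider.to_lowercase().as_str(),
        "ollama" | "vllm" | "lmstudio" | "lm-studio" | "lemonade"
            | "llamacpp" | "llama.cpp" | "local"
    )
}

/// Same shape as the patched estimate_cost_with_catalog body.
fn estimate_cost(
    catalog: &Catalog,
    model: &str,
    provider: &str,
    input_tokens: u64,
    output_tokens: u64,
) -> f64 {
    let fallback = if is_local_provider(provider) {
        (0.0, 0.0) // local inference: no per-token cost
    } else {
        (1.0, 3.0) // unknown cloud model: conservative $1/$3 default
    };
    let (input_per_m, output_per_m) = catalog.pricing(model).unwrap_or(fallback);
    (input_tokens as f64 / 1_000_000.0) * input_per_m
        + (output_tokens as f64 / 1_000_000.0) * output_per_m
}

fn main() {
    // Assumed pricing for illustration: $3/M input, $15/M output.
    let catalog = Catalog(HashMap::from([("claude-sonnet-4-20250514", (3.0, 15.0))]));

    // Known model: catalog pricing wins even with a "local" provider hint.
    assert_eq!(
        estimate_cost(&catalog, "claude-sonnet-4-20250514", "ollama", 1_000_000, 1_000_000),
        18.0
    );
    // Unknown model on a local provider: $0, so no budget-quota trip.
    assert_eq!(estimate_cost(&catalog, "my-model:latest", "ollama", 1_000_000, 1_000_000), 0.0);
    // Unknown model on a cloud provider: falls back to $1/$3, i.e. $4 total here.
    assert_eq!(estimate_cost(&catalog, "totally-unknown-model", "openai", 1_000_000, 1_000_000), 4.0);
}
```

Catalog pricing always wins when the lookup hits; the provider hint only selects which fallback applies on a miss, so a mislabeled provider can never discount a known cloud model.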
diff --git a/crates/openfang-runtime/src/drivers/openai.rs b/crates/openfang-runtime/src/drivers/openai.rs
index 8b927fa8d1..2cceb24e25 100644
--- a/crates/openfang-runtime/src/drivers/openai.rs
+++ b/crates/openfang-runtime/src/drivers/openai.rs
@@ -253,6 +253,8 @@ struct OaiResponseMessage {
     tool_calls: Option<Vec<OaiToolCall>>,
     /// Reasoning/thinking content returned by some models (DeepSeek-R1, Qwen3, etc.)
     /// via LM Studio, Ollama, and other local inference servers.
+    /// Ollama uses "reasoning" for Gemma 4; others use "reasoning_content".
+    #[serde(alias = "reasoning")]
     reasoning_content: Option<String>,
 }
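The new `#[serde(alias = "reasoning")]` attribute lets either key populate the same field. A minimal sketch of that behavior, assuming `serde` (with the `derive` feature) and `serde_json` as dependencies, and using a trimmed-down struct rather than the driver's real `OaiResponseMessage`:

```rust
use serde::Deserialize;

/// Trimmed-down stand-in for OaiResponseMessage (illustrative only).
#[derive(Debug, Deserialize)]
struct Message {
    content: Option<String>,
    /// Accepts the common "reasoning_content" key as well as Ollama's "reasoning".
    #[serde(alias = "reasoning")]
    reasoning_content: Option<String>,
}

fn main() -> Result<(), serde_json::Error> {
    // DeepSeek-R1 / LM Studio style payload.
    let a: Message = serde_json::from_str(
        r#"{"content": "42", "reasoning_content": "chain of thought..."}"#,
    )?;
    // Ollama-style payload using the aliased key.
    let b: Message = serde_json::from_str(
        r#"{"content": "42", "reasoning": "chain of thought..."}"#,
    )?;
    assert_eq!(a.reasoning_content, b.reasoning_content);
    assert!(b.reasoning_content.is_some());
    Ok(())
}
```

serde's `alias` applies only during deserialization, which is all a response-side struct needs; if a payload ever carried both keys, serde would reject it as a duplicate field.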