diff --git a/frontend_multi_user/src/admin_routes.py b/frontend_multi_user/src/admin_routes.py
index b5f8d0f2..0bdb9fc3 100644
--- a/frontend_multi_user/src/admin_routes.py
+++ b/frontend_multi_user/src/admin_routes.py
@@ -370,40 +370,50 @@ def admin_database_backup():
         return jsonify({"error": str(e)}), 502
 
 
-@admin_routes_bp.route("/ping/stream")
+@admin_routes_bp.route("/ping/list")
 @login_required
-def ping_stream():
+def ping_list():
     worker_plan_url = current_app.config["WORKER_PLAN_URL"]
+    url = f"{worker_plan_url}/llm-list"
+    try:
+        resp = requests.get(url, timeout=(5, 30))
+    except Exception as exc:
+        logger.error("LLM ping list proxy exception: %s", exc)
+        return jsonify({"error": str(exc)}), 502
+    if resp.status_code != 200:
+        return jsonify({"error": f"worker_plan responded with {resp.status_code}"}), 502
+    return jsonify(resp.json())
 
-    def generate():
-        url = f"{worker_plan_url}/llm-ping"
-        logger.info("Proxying LLM ping stream from %s", url)
-        try:
-            with requests.get(
-                url,
-                stream=True,
-                timeout=(5, 300),
-                headers={"Accept": "text/event-stream"},
-            ) as resp:
-                if resp.status_code != 200:
-                    msg = f"worker_plan responded with {resp.status_code}"
-                    logger.error("LLM ping proxy error: %s", msg)
-                    yield f"data: {json.dumps({'name': 'worker_plan', 'status': 'error', 'response_time': 0, 'response': msg})}\n\n"
-                    yield f"data: {json.dumps({'name': 'server', 'status': 'done', 'response_time': 0, 'response': ''})}\n\n"
-                    return
-                for line in resp.iter_lines(decode_unicode=True):
-                    if line is None or line.strip() == "":
-                        continue
-                    yield f"{line}\n\n"
-        except Exception as exc:
-            logger.error("LLM ping proxy exception: %s", exc)
-            error_payload = {"name": "worker_plan", "status": "error", "response_time": 0, "response": str(exc)}
-            yield f"data: {json.dumps(error_payload)}\n\n"
-            yield f"data: {json.dumps({'name': 'server', 'status': 'done', 'response_time': 0, 'response': ''})}\n\n"
-
-    response = Response(generate(), mimetype="text/event-stream")
-    response.headers["X-Accel-Buffering"] = "no"
-    return response
+
+@admin_routes_bp.route("/ping/one")
+@login_required
+def ping_one():
+    worker_plan_url = current_app.config["WORKER_PLAN_URL"]
+    profile = request.args.get("profile", "")
+    llm_name = request.args.get("llm_name", "")
+    url = f"{worker_plan_url}/llm-ping-one"
+    try:
+        resp = requests.get(
+            url,
+            params={"profile": profile, "llm_name": llm_name},
+            timeout=(5, 300),
+        )
+    except Exception as exc:
+        logger.error("LLM ping-one proxy exception: %s", exc)
+        return jsonify({
+            "name": f"{profile}:{llm_name}",
+            "status": "error",
+            "response_time": 0,
+            "response": str(exc),
+        }), 502
+    if resp.status_code != 200:
+        return jsonify({
+            "name": f"{profile}:{llm_name}",
+            "status": "error",
+            "response_time": 0,
+            "response": f"worker_plan responded with {resp.status_code}",
+        }), 502
+    return jsonify(resp.json())
 
 
 @admin_routes_bp.route("/admin/demo_run")
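The streaming proxy is replaced by two plain JSON routes: `/ping/list` enumerates every configured `(profile, llm_name)` pair, and `/ping/one` pings a single model. A minimal sketch of a client driving them, assuming the blueprint is mounted at the site root, the frontend listens on `localhost:5000`, and the session already holds an admin login (all three are assumptions; the query parameters and response fields are the ones defined in the routes above):

```python
import requests

BASE = "http://localhost:5000"  # assumed frontend address
session = requests.Session()    # assumed: already authenticated as an admin

# 1. One cheap call to enumerate the configured models.
models = session.get(f"{BASE}/ping/list", timeout=30).json()["models"]

# 2. One slow call per model; a dead model costs at most the 300 s read timeout.
for model in models:
    result = session.get(
        f"{BASE}/ping/one",
        params={"profile": model["profile"], "llm_name": model["llm_name"]},
        timeout=300,
    ).json()
    print(result["name"], result["status"], f'{result["response_time"]} ms')
```

Splitting list and ping keeps each HTTP exchange short-lived, so no buffering proxy workaround such as `X-Accel-Buffering: no` is needed anymore.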
diff --git a/frontend_multi_user/templates/ping.html b/frontend_multi_user/templates/ping.html
index e5d3d4cc..20af1453 100644
--- a/frontend_multi_user/templates/ping.html
+++ b/frontend_multi_user/templates/ping.html
@@ -16,6 +16,7 @@
             border: 1px solid #ddd;
             padding: 8px;
             text-align: left;
+            vertical-align: top;
         }
         th {
             background-color: #f2f2f2;
@@ -23,28 +24,19 @@
         tr:nth-child(even) {
             background-color: #f9f9f9;
         }
-        .status-success {
-            color: green;
-        }
-        .status-error {
-            color: red;
-        }
-        .status-pinging {
-            color: #666;
-            font-style: italic;
-        }
+        .status-success { color: green; }
+        .status-error { color: red; }
+        .status-pinging { color: #666; font-style: italic; }
+        .status-idle { color: #888; }
         .back-link {
             margin-bottom: 20px;
             display: inline-block;
         }
-        .loading {
-            color: #666;
-            font-style: italic;
-        }
+        .loading { color: #666; font-style: italic; }
         @keyframes ellipsis {
-            0% { content: '.'; }
-            33% { content: '..'; }
-            66% { content: '...'; }
+            0% { content: '.'; }
+            33% { content: '..'; }
+            66% { content: '...'; }
             100% { content: '.'; }
         }
         .pinging::after {
@@ -54,96 +46,166 @@
             width: 1em;
             text-align: left;
         }
-        .server-status {
-            font-size: 0.8em;
-            font-weight: normal;
-            padding: 4px 8px;
-            border-radius: 4px;
-            margin-left: 10px;
-            display: inline-block;
+        .toolbar {
+            margin-bottom: 12px;
         }
-        .server-status.working {
-            background-color: #2196F3;
-            color: white;
+        button.ping-btn,
+        button.ping-all-btn {
+            cursor: pointer;
+            padding: 4px 10px;
         }
-        .server-status.done {
-            background-color: #4CAF50;
-            color: white;
+        button.ping-all-btn {
+            font-size: 1em;
+            padding: 8px 16px;
         }
-        .server-status.error {
-            background-color: #f44336;
-            color: white;
+        button:disabled {
+            cursor: not-allowed;
+            opacity: 0.6;
         }
     <a href="…" class="back-link">← Back to Admin</a>
-    <h1>LLM Ping Results <span class="server-status working">Working...</span></h1>
+    <h1>LLM Ping Results</h1>
+    <div class="toolbar">
+        <button class="ping-all-btn" …>…</button>
+        <span …>Loading models…</span>
+    </div>
     <table>
         <tr>
-            …
+            <th>LLM Name</th>
+            <th>Priority</th>
+            <th>Action</th>
             <th>Status</th>
             <th>Response Time</th>
             <th>Response</th>
         </tr>
         …
         Loading results...
         …
-…
\ No newline at end of file
+…
diff --git a/llm_config/baseline.json b/llm_config/baseline.json
index 48c60576..968976f4 100644
--- a/llm_config/baseline.json
+++ b/llm_config/baseline.json
@@ -86,27 +86,27 @@
         },
         "pricing_kind": "paid"
     },
-    "openrouter-elephant-alpha": {
-        "comment": "Released Apr 13, 2026. 262,144 context. $0/M input tokens. $0/M output tokens",
+    "openrouter-ling-2.6-flash": {
+        "comment": "Production name for what was previously the 'Elephant Alpha' stealth release (revealed Apr 21, 2026). 262,144 context. $0.08/M input tokens. $0.24/M output tokens.",
         "luigi_workers": 4,
-        "model_info_url": "https://openrouter.ai/openrouter/elephant-alpha",
+        "model_info_url": "https://openrouter.ai/inclusionai/ling-2.6-flash",
         "class": "OpenRouter",
         "arguments": {
-            "model": "openrouter/elephant-alpha",
+            "model": "inclusionai/ling-2.6-flash",
             "api_key": "${OPENROUTER_API_KEY}",
             "temperature": 0.1,
             "timeout": 60.0,
-            "context_window": 1048576,
+            "context_window": 262144,
             "is_function_calling_model": false,
             "is_chat_model": true,
             "max_tokens": 8192,
             "max_retries": 5
         },
         "pricing": {
-            "input_per_million_tokens": 0,
-            "output_per_million_tokens": 0
+            "input_per_million_tokens": 0.08,
+            "output_per_million_tokens": 0.24
         },
-        "pricing_kind": "free"
+        "pricing_kind": "paid"
     },
     "openrouter-ling-2.6-1t-free": {
         "comment": "Released Apr 23, 2026. Scheduled to be removed from OpenRouter on Apr 30, 2026. 262,144 context. $0/M input tokens. $0/M output tokens.",
@@ -130,6 +130,72 @@
         },
         "pricing_kind": "free"
     },
+    "openrouter-granite-4.1-8b": {
+        "comment": "IBM Granite 4.1 8B. Released Apr 30, 2026. Dense decoder-only 8B model, Apache 2.0. 131,072 context. $0.05/M input tokens. $0.10/M output tokens.",
+        "luigi_workers": 4,
+        "model_info_url": "https://openrouter.ai/ibm-granite/granite-4.1-8b",
+        "class": "OpenRouter",
+        "arguments": {
+            "model": "ibm-granite/granite-4.1-8b",
+            "api_key": "${OPENROUTER_API_KEY}",
+            "temperature": 0.1,
+            "timeout": 60.0,
+            "context_window": 131072,
+            "is_function_calling_model": false,
+            "is_chat_model": true,
+            "max_tokens": 8192,
+            "max_retries": 5
+        },
+        "pricing": {
+            "input_per_million_tokens": 0.05,
+            "output_per_million_tokens": 0.10
+        },
+        "pricing_kind": "paid"
+    },
+    "openrouter-laguna-xs.2-free": {
+        "comment": "Poolside Laguna XS.2 (free). Released Apr 28, 2026. Compact fp8 model targeted at agentic coding. 128,000 context. $0/M input tokens. $0/M output tokens.",
+        "luigi_workers": 4,
+        "model_info_url": "https://openrouter.ai/poolside/laguna-xs.2",
+        "class": "OpenRouter",
+        "arguments": {
+            "model": "poolside/laguna-xs.2:free",
+            "api_key": "${OPENROUTER_API_KEY}",
+            "temperature": 0.1,
+            "timeout": 60.0,
+            "context_window": 128000,
+            "is_function_calling_model": false,
+            "is_chat_model": true,
+            "max_tokens": 8000,
+            "max_retries": 5
+        },
+        "pricing": {
+            "input_per_million_tokens": 0,
+            "output_per_million_tokens": 0
+        },
+        "pricing_kind": "free"
+    },
+    "openrouter-nemotron-3-nano-omni-30b-reasoning-free": {
+        "comment": "NVIDIA Nemotron 3 Nano Omni 30B A3B reasoning (free). Released Apr 28, 2026. Reasoning model — uses more tokens than non-reasoning models. 256,000 context. $0/M input tokens. $0/M output tokens.",
+        "luigi_workers": 4,
+        "model_info_url": "https://openrouter.ai/nvidia/nemotron-3-nano-omni-30b-a3b-reasoning",
+        "class": "OpenRouter",
+        "arguments": {
+            "model": "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free",
+            "api_key": "${OPENROUTER_API_KEY}",
+            "temperature": 0.1,
+            "timeout": 60.0,
+            "context_window": 256000,
+            "is_function_calling_model": false,
+            "is_chat_model": true,
+            "max_tokens": 16384,
+            "max_retries": 5
+        },
+        "pricing": {
+            "input_per_million_tokens": 0,
+            "output_per_million_tokens": 0
+        },
+        "pricing_kind": "free"
+    },
     "openrouter-gemini-2.5-flash-lite-preview-09-2025": {
         "comment": "Created Sep 25, 2025. 1,048,576 context. $0.10/M input tokens. $0.40/M output tokens.",
         "priority": 1,
@@ -308,6 +374,34 @@
         },
         "pricing_kind": "paid"
     },
+    "deepseek-v4-flash-thinking-disabled": {
+        "comment": "DeepSeek's flash tier via the native DeepSeek API. Requires a DEEPSEEK_API_KEY in the .env file. 1,000,000 context, 384,000 max output. Thinking mode is explicitly disabled (defaults to enabled). $0.14/M input (cache miss). $0.28/M output. See https://api-docs.deepseek.com/quick_start/pricing/",
+        "luigi_workers": 4,
+        "class": "OpenAILike",
+        "arguments": {
+            "model": "deepseek-v4-flash",
+            "api_key": "${DEEPSEEK_API_KEY}",
+            "api_base": "https://api.deepseek.com/v1",
+            "temperature": 0.1,
+            "timeout": 60.0,
+            "context_window": 1000000,
+            "is_function_calling_model": false,
+            "is_chat_model": true,
+            "max_tokens": 16384,
+            "max_retries": 5,
+            "additional_kwargs": {
+                "extra_body": {
+                    "thinking": {"type": "disabled"}
+                }
+            }
+        },
+        "model_info_url": "https://api-docs.deepseek.com/quick_start/pricing/",
+        "pricing": {
+            "input_per_million_tokens": 0.14,
+            "output_per_million_tokens": 0.28
+        },
+        "pricing_kind": "paid"
+    },
     "ollama-llama3.1": {
         "comment": "This runs on your own computer. Requires Ollama to be installed. PlanExe runs in .venv on the host computer. No use of docker.",
         "luigi_workers": 1,
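Each entry's `pricing` block is what turns token counts into dollars. Worked through with the `openrouter-ling-2.6-flash` rates above: 100,000 input tokens cost 0.1 × $0.08 = $0.008 and 20,000 output tokens cost 0.02 × $0.24 = $0.0048, about $0.013 for the call. A sketch of that arithmetic (the helper is hypothetical, not a PlanExe function):

```python
def estimate_cost_usd(pricing: dict, input_tokens: int, output_tokens: int) -> float:
    """Hypothetical helper: price one call from a baseline.json 'pricing' block."""
    return (
        input_tokens / 1_000_000 * pricing["input_per_million_tokens"]
        + output_tokens / 1_000_000 * pricing["output_per_million_tokens"]
    )

# Rates copied from the openrouter-ling-2.6-flash entry above.
pricing = {"input_per_million_tokens": 0.08, "output_per_million_tokens": 0.24}
print(estimate_cost_usd(pricing, input_tokens=100_000, output_tokens=20_000))  # ≈ 0.0128
```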
diff --git a/worker_plan/app.py b/worker_plan/app.py
index d7e6802e..9f58e7ef 100644
--- a/worker_plan/app.py
+++ b/worker_plan/app.py
@@ -1,7 +1,6 @@
 import logging
 import os
 import subprocess
-import json
 import sys
 import tempfile
 import threading
@@ -18,7 +17,7 @@
 PlanExeDotEnv.load().update_os_environ()
 
 from fastapi import BackgroundTasks, FastAPI, HTTPException
-from fastapi.responses import FileResponse, StreamingResponse
+from fastapi.responses import FileResponse
 from pydantic import BaseModel, Field
 
 from worker_plan_api.filenames import FilenameEnum, ExtraFilenameEnum
@@ -29,7 +28,7 @@
 from worker_plan_internal.plan.pipeline_environment import PipelineEnvironmentEnum
 from worker_plan_api.plan_file import PlanFile
 from worker_plan_api.start_time import StartTime
-from worker_plan_internal.llm_factory import obtain_llm_info, get_llm_names_by_priority, get_llm
+from worker_plan_internal.llm_factory import obtain_llm_info, get_llm_names_by_priority, get_all_llm_names_with_priority, get_llm
 from worker_plan_internal.utils.time_since_last_modification import time_since_last_modification
 from worker_plan_internal.utils.purge_old_runs import purge_old_runs, start_purge_scheduler
 from llama_index.core.llms import ChatMessage, MessageRole
@@ -422,78 +421,68 @@ def llm_info() -> LLMInfo:
     return obtain_llm_info()
 
 
-@app.get("/llm-ping")
-def llm_ping() -> StreamingResponse:
-    """
-    Stream ping results for each configured LLM model.
-    """
+_PING_SYSTEM_PROMPT = "You are a healthcheck endpoint. Reply with exactly OK. Do not add any other words."
+_PING_USER_PROMPT = "Reply with exactly OK."
 
-    def event_stream():
-        logger.info("Starting llm-ping stream")
-        ping_system_prompt = "You are a healthcheck endpoint. Reply with exactly OK. Do not add any other words."
-        ping_user_prompt = "Reply with exactly OK."
-        ping_targets: list[tuple[ModelProfileEnum, str, str]] = []
-        try:
-            for profile in ModelProfileEnum:
-                llm_names = get_llm_names_by_priority(model_profile=profile)
-                for llm_name in llm_names:
-                    display_name = f"{profile.value}:{llm_name}"
-                    ping_targets.append((profile, llm_name, display_name))
-        except Exception as exc:  # pragma: no cover - runtime probe
-            logger.error("llm-ping failed to enumerate llm names: %s", exc)
-            yield f"data: {json.dumps({'name': 'worker_plan', 'status': 'error', 'response_time': 0, 'response': str(exc)})}\n\n"
-            yield f"data: {json.dumps({'name': 'server', 'status': 'done', 'response_time': 0, 'response': ''})}\n\n"
-            return
-
-        if len(ping_targets) == 0:
-            yield f"data: {json.dumps({'name': 'worker_plan', 'status': 'error', 'response_time': 0, 'response': 'No models found in whitelisted llm_config profiles.'})}\n\n"
-            yield f"data: {json.dumps({'name': 'server', 'status': 'done', 'response_time': 0, 'response': ''})}\n\n"
-            return
-
-        for model_profile, llm_name, display_name in ping_targets:
-            yield f"data: {json.dumps({'name': display_name, 'status': 'pinging', 'response_time': 0, 'response': 'Pinging model…'})}\n\n"
-            try:
-                start_time = time.time()
-                llm = get_llm(llm_name, model_profile=model_profile)
-                chat_message_list = [
-                    ChatMessage(
-                        role=MessageRole.SYSTEM,
-                        content=ping_system_prompt,
-                    ),
-                    ChatMessage(
-                        role=MessageRole.USER,
-                        content=ping_user_prompt,
-                    )
-                ]
-                response = llm.chat(chat_message_list)
-                end_time = time.time()
-
-                response_text = getattr(getattr(response, "message", None), "content", None)
-                if response_text is None:
-                    response_text = str(response)
-                response_text = str(response_text).strip()
-                is_exact_ok = response_text == "OK"
-
-                payload = {
-                    "name": display_name,
-                    "status": "success" if is_exact_ok else "error",
-                    "response_time": int((end_time - start_time) * 1000),
-                    "response": "OK" if is_exact_ok else f"Expected exact 'OK', got: {response_text}"
-                }
-            except Exception as exc:  # pragma: no cover - runtime probe
-                logger.error("llm-ping error for %s: %s", llm_name, exc)
-                payload = {
-                    "name": display_name,
-                    "status": "error",
-                    "response_time": 0,
-                    "response": str(exc)
-                }
-            yield f"data: {json.dumps(payload)}\n\n"
-
-        logger.info("llm-ping stream complete")
-        yield f"data: {json.dumps({'name': 'server', 'status': 'done', 'response_time': 0, 'response': ''})}\n\n"
-
-    return StreamingResponse(event_stream(), media_type="text/event-stream")
+
+def _enumerate_ping_targets() -> list[dict]:
+    targets: list[dict] = []
+    for profile in ModelProfileEnum:
+        for llm_name, priority in get_all_llm_names_with_priority(model_profile=profile):
+            targets.append({
+                "profile": profile.value,
+                "llm_name": llm_name,
+                "display_name": f"{profile.value}:{llm_name}",
+                "priority": priority,
+            })
+    return targets
+
+
+@app.get("/llm-list")
+def llm_list() -> dict:
+    """List every (profile, llm_name) pair configured in the active llm_config profiles."""
+    try:
+        return {"models": _enumerate_ping_targets()}
+    except Exception as exc:
+        logger.error("llm-list failed to enumerate llm names: %s", exc)
+        raise HTTPException(status_code=500, detail=str(exc)) from exc
+
+
+@app.get("/llm-ping-one")
+def llm_ping_one(profile: str, llm_name: str) -> dict:
+    """Ping a single configured model and return the result as JSON."""
+    model_profile = normalize_model_profile(profile)
+    display_name = f"{model_profile.value}:{llm_name}"
+    try:
+        start_time = time.time()
+        llm = get_llm(llm_name, model_profile=model_profile)
+        chat_message_list = [
+            ChatMessage(role=MessageRole.SYSTEM, content=_PING_SYSTEM_PROMPT),
+            ChatMessage(role=MessageRole.USER, content=_PING_USER_PROMPT),
+        ]
+        response = llm.chat(chat_message_list)
+        elapsed_ms = int((time.time() - start_time) * 1000)
+
+        response_text = getattr(getattr(response, "message", None), "content", None)
+        if response_text is None:
+            response_text = str(response)
+        response_text = str(response_text).strip()
+        is_exact_ok = response_text == "OK"
+
+        return {
+            "name": display_name,
+            "status": "success" if is_exact_ok else "error",
+            "response_time": elapsed_ms,
+            "response": "OK" if is_exact_ok else f"Expected exact 'OK', got: {response_text}",
+        }
+    except Exception as exc:
+        logger.error("llm-ping-one error for %s:%s: %s", model_profile.value, llm_name, exc)
+        return {
+            "name": display_name,
+            "status": "error",
+            "response_time": 0,
+            "response": str(exc),
+        }
 
 
 @app.post("/purge-runs", response_model=PurgeRunsResponse)
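FastAPI's test client gives a quick way to exercise the two new worker endpoints. The import path, and the assumption that the app module loads cleanly outside its normal deployment, are guesses; the endpoint paths and payload fields are taken from the code above:

```python
from fastapi.testclient import TestClient

from worker_plan.app import app  # assumed import path for the FastAPI app

client = TestClient(app)

models = client.get("/llm-list").json()["models"]
assert all({"profile", "llm_name", "display_name", "priority"} <= m.keys() for m in models)

if models:
    first = models[0]
    result = client.get(
        "/llm-ping-one",
        params={"profile": first["profile"], "llm_name": first["llm_name"]},
    ).json()
    # /llm-ping-one reports failures inside the payload rather than via HTTP status.
    assert result["status"] in {"success", "error"}
```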
diff --git a/worker_plan/worker_plan_internal/llm_factory.py b/worker_plan/worker_plan_internal/llm_factory.py
index d7d5c2bb..19de8487 100644
--- a/worker_plan/worker_plan_internal/llm_factory.py
+++ b/worker_plan/worker_plan_internal/llm_factory.py
@@ -38,7 +38,7 @@
 logger = logging.getLogger(__name__)
 
 
-__all__ = ["get_llm", "LLMInfo", "get_llm_names_by_priority", "SPECIAL_AUTO_ID", "is_valid_llm_name", "obtain_llm_info"]
+__all__ = ["get_llm", "LLMInfo", "get_llm_names_by_priority", "get_all_llm_names_with_priority", "SPECIAL_AUTO_ID", "is_valid_llm_name", "obtain_llm_info"]
 
 
 def _resolve_model_profile(model_profile: Optional[ModelProfileEnum | str]) -> ModelProfileEnum:
@@ -174,6 +174,26 @@ def get_llm_names_by_priority(model_profile: Optional[ModelProfileEnum | str] = None) -> list[str]:
     configs.sort(key=lambda x: x[1].get("priority", 0))
     return [name for name, _ in configs]
 
+
+def get_all_llm_names_with_priority(model_profile: Optional[ModelProfileEnum | str] = None) -> list[tuple[str, Optional[int]]]:
+    """
+    Return every configured LLM name in the profile and its priority (None when unset).
+
+    Models with a priority come first, sorted ascending. Models without a priority
+    follow, in the order the config file declares them.
+    """
+    planexe_llmconfig = _load_llm_config(model_profile)
+    prioritized: list[tuple[str, Optional[int]]] = []
+    unprioritized: list[tuple[str, Optional[int]]] = []
+    for name, config in planexe_llmconfig.llm_config_dict.items():
+        priority = config.get("priority")
+        if priority is None:
+            unprioritized.append((name, None))
+        else:
+            prioritized.append((name, int(priority)))
+    prioritized.sort(key=lambda x: x[1])
+    return prioritized + unprioritized
+
 
 def is_valid_llm_name(llm_name: str, model_profile: Optional[ModelProfileEnum | str] = None) -> bool:
     """
     Returns True if the LLM name is valid, False otherwise.
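The ordering contract of `get_all_llm_names_with_priority` is easiest to see on a toy config. This standalone sketch applies the same partition-and-sort to a plain dict (the entry names and priorities are invented):

```python
from typing import Optional


def order_names(llm_config_dict: dict) -> list[tuple[str, Optional[int]]]:
    # Same rule as get_all_llm_names_with_priority: prioritized entries first,
    # sorted ascending; entries without a priority keep declaration order.
    prioritized = [
        (name, int(cfg["priority"]))
        for name, cfg in llm_config_dict.items()
        if cfg.get("priority") is not None
    ]
    unprioritized = [
        (name, None) for name, cfg in llm_config_dict.items() if cfg.get("priority") is None
    ]
    prioritized.sort(key=lambda x: x[1])
    return prioritized + unprioritized


config = {
    "model-c": {},                # no priority: listed last, in declaration order
    "model-a": {"priority": 2},
    "model-b": {"priority": 1},
}
print(order_names(config))
# [('model-b', 1), ('model-a', 2), ('model-c', None)]
```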