From bd4656fb9dacbb1c8b372b7c141f7cb7c6accc52 Mon Sep 17 00:00:00 2001
From: Jack Neil
Date: Thu, 9 Apr 2026 10:26:52 -0400
Subject: [PATCH] fix: graceful fallback when model has no chat_template (MedGemma)

Models like MedGemma have apply_chat_template() as an inherited method
but no chat_template configured, causing a ValueError on every request.
The engine now catches the ValueError and falls back to a plain-text
prompt format in both the BatchedEngine and SimpleEngine paths.

Fixes: MedGemma crashes with "Cannot use apply_chat_template"
Co-Authored-By: Claude Opus 4.6 (1M context)
---
 tests/test_missing_chat_template.py | 94 +++++++++++++++++++++++++++++
 vllm_mlx/engine/batched.py          | 20 +++---
 vllm_mlx/engine/simple.py           | 22 ++++++-
 3 files changed, 126 insertions(+), 10 deletions(-)
 create mode 100644 tests/test_missing_chat_template.py

diff --git a/tests/test_missing_chat_template.py b/tests/test_missing_chat_template.py
new file mode 100644
index 000000000..56eca90ab
--- /dev/null
+++ b/tests/test_missing_chat_template.py
@@ -0,0 +1,94 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for models with no chat_template (e.g., MedGemma).
+
+MedGemma's HuggingFace processor has apply_chat_template() as a method
+(inherited from the base class) but no chat_template configured. Calling
+apply_chat_template() raises a ValueError. vllm-mlx should fall back to
+a plain-text prompt format instead of crashing.
+"""
+
+import pytest
+
+
+class FakeProcessorNoTemplate:
+    """Simulates a HuggingFace processor with no chat_template set."""
+
+    chat_template = None  # No template configured
+
+    def apply_chat_template(self, messages, **kwargs):
+        """Raises like the real processor does when no template is set."""
+        raise ValueError(
+            "Cannot use apply_chat_template when no chat_template is set. "
+            "Provide a chat_template or use a model with one."
+        )
+
+
+class FakeProcessorWithTemplate:
+    """Simulates a working processor for comparison."""
+
+    chat_template = "{% for m in messages %}{{ m['role'] }}: {{ m['content'] }}\n{% endfor %}"
+
+    def apply_chat_template(self, messages, **kwargs):
+        return "\n".join(f"{m['role']}: {m['content']}" for m in messages) + "\nassistant:"
+
+
+class TestBatchedEngineChatTemplateFallback:
+    """Test that _apply_chat_template handles missing templates gracefully."""
+
+    def _make_engine_stub(self, processor, is_mllm=True):
+        """Create a minimal stub with the fields _apply_chat_template needs."""
+        from vllm_mlx.engine.batched import BatchedEngine
+
+        # Create a bare object that has the method but not the full engine
+        stub = object.__new__(BatchedEngine)
+        stub._is_mllm = is_mllm
+        stub._processor = processor
+        stub._model_name = "test-model"
+        # _apply_chat_template falls through to the tokenizer if the processor fails.
+        # Give it no tokenizer so there is no secondary template either.
+        stub._tokenizer = None
+        return stub
+
+    def test_no_template_processor_does_not_crash(self):
+        """A processor with no chat_template should fall back, not raise."""
+        processor = FakeProcessorNoTemplate()
+        stub = self._make_engine_stub(processor)
+        messages = [{"role": "user", "content": "What is this image?"}]
+
+        # This should NOT raise — it should fall back gracefully
+        result = stub._apply_chat_template(messages)
+        assert isinstance(result, str)
+        assert "What is this image?" in result
+
+    def test_no_template_produces_readable_prompt(self):
+        """Fallback prompt should include role and content."""
+        processor = FakeProcessorNoTemplate()
+        stub = self._make_engine_stub(processor)
+        messages = [
+            {"role": "system", "content": "You are a medical assistant."},
+            {"role": "user", "content": "Describe this X-ray."},
+        ]
+
+        result = stub._apply_chat_template(messages)
+        assert "medical assistant" in result
+        assert "X-ray" in result
+
+    def test_working_processor_still_works(self):
+        """A processor WITH a template should still use it normally."""
+        processor = FakeProcessorWithTemplate()
+        stub = self._make_engine_stub(processor)
+        messages = [{"role": "user", "content": "Hello"}]
+
+        result = stub._apply_chat_template(messages)
+        assert "Hello" in result
+
+    def test_no_template_with_tools_does_not_crash(self):
+        """Missing template + tools should also fall back gracefully."""
+        processor = FakeProcessorNoTemplate()
+        stub = self._make_engine_stub(processor)
+        messages = [{"role": "user", "content": "Check vitals"}]
+        tools = [{"type": "function", "function": {"name": "get_vitals"}}]
+
+        result = stub._apply_chat_template(messages, tools=tools)
+        assert isinstance(result, str)
+        assert "Check vitals" in result
diff --git a/vllm_mlx/engine/batched.py b/vllm_mlx/engine/batched.py
index 3ac52b4b0..24368bf98 100644
--- a/vllm_mlx/engine/batched.py
+++ b/vllm_mlx/engine/batched.py
@@ -380,13 +380,19 @@ def _apply_chat_template(
             for key in ["tools"]:
                 if key in template_kwargs:
                     del template_kwargs[key]
-            return template_applicator.apply_chat_template(
-                messages, **template_kwargs
-            )
-        else:
-            # Fallback for models without apply_chat_template
-            prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
-            return prompt + "\nassistant:"
+            try:
+                return template_applicator.apply_chat_template(
+                    messages, **template_kwargs
+                )
+            except (TypeError, ValueError) as e:
+                # No chat_template configured (e.g., MedGemma) or incompatible kwargs.
+ logger.warning(f"No chat template available: {e}, using plain-text fallback") + + # Fallback for models without apply_chat_template or no template configured + prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages) + return prompt + "\nassistant:" @staticmethod def _prepare_mllm_messages( diff --git a/vllm_mlx/engine/simple.py b/vllm_mlx/engine/simple.py index da3ccfc18..27b7860bc 100644 --- a/vllm_mlx/engine/simple.py +++ b/vllm_mlx/engine/simple.py @@ -480,8 +480,14 @@ async def chat( } if template_tools: template_kwargs["tools"] = template_tools - prompt_ids = tokenizer.apply_chat_template(messages, **template_kwargs) - prompt_token_count = len(prompt_ids) + try: + prompt_ids = tokenizer.apply_chat_template(messages, **template_kwargs) + prompt_token_count = len(prompt_ids) + except (TypeError, ValueError): + # No chat template (e.g., MedGemma) — estimate prompt tokens + prompt_token_count = sum( + len(m.get("content", "")) // 4 for m in messages + ) return GenerationOutput( text=text, tokens=output.tokens, @@ -607,7 +613,17 @@ def run_stream(): for key in ["tools", "enable_thinking"]: if key in template_kwargs: del template_kwargs[key] - prompt = tokenizer.apply_chat_template(messages, **template_kwargs) + try: + prompt = tokenizer.apply_chat_template(messages, **template_kwargs) + except (TypeError, ValueError): + prompt = None # Fall through to plain-text fallback + except ValueError: + # No chat_template configured (e.g., MedGemma) + prompt = None + + if prompt is None: + prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages) + prompt += "\nassistant:" else: prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages) prompt += "\nassistant:"