From bd4656fb9dacbb1c8b372b7c141f7cb7c6accc52 Mon Sep 17 00:00:00 2001
From: Jack Neil
Date: Thu, 9 Apr 2026 10:26:52 -0400
Subject: [PATCH] fix: graceful fallback when model has no chat_template (MedGemma)

Models like MedGemma have apply_chat_template() as an inherited method
but no chat_template configured, causing a ValueError on every request.
The engine now catches the ValueError and falls back to a plain-text
prompt format in both the BatchedEngine and SimpleEngine paths.

Fixes: MedGemma crashes with "Cannot use apply_chat_template"
Co-Authored-By: Claude Opus 4.6 (1M context)
---
 tests/test_missing_chat_template.py | 94 +++++++++++++++++++++++++++++
 vllm_mlx/engine/batched.py          | 20 +++---
 vllm_mlx/engine/simple.py           | 22 ++++++-
 3 files changed, 126 insertions(+), 10 deletions(-)
 create mode 100644 tests/test_missing_chat_template.py

diff --git a/tests/test_missing_chat_template.py b/tests/test_missing_chat_template.py
new file mode 100644
index 000000000..56eca90ab
--- /dev/null
+++ b/tests/test_missing_chat_template.py
@@ -0,0 +1,94 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Tests for models with no chat_template (e.g., MedGemma).
+
+MedGemma's HuggingFace processor has apply_chat_template() as a method
+(inherited from the base class) but no chat_template configured. Calling
+apply_chat_template() raises a ValueError. vllm-mlx should fall back to
+a plain-text prompt format instead of crashing.
+"""
+
+import pytest
+
+
+class FakeProcessorNoTemplate:
+    """Simulates a HuggingFace processor with no chat_template set."""
+
+    chat_template = None  # No template configured
+
+    def apply_chat_template(self, messages, **kwargs):
+        """Raises like the real processor does when no template is set."""
+        raise ValueError(
+            "Cannot use apply_chat_template when no chat_template is set. "
+            "Provide a chat_template or use a model with one."
+        )
+
+
+class FakeProcessorWithTemplate:
+    """Simulates a working processor for comparison."""
+
+    chat_template = "{% for m in messages %}{{ m['role'] }}: {{ m['content'] }}\n{% endfor %}"
+
+    def apply_chat_template(self, messages, **kwargs):
+        return "\n".join(f"{m['role']}: {m['content']}" for m in messages) + "\nassistant:"
+
+
+class TestBatchedEngineChatTemplateFallback:
+    """Test that _apply_chat_template handles missing templates gracefully."""
+
+    def _make_engine_stub(self, processor, is_mllm=True):
+        """Create a minimal stub with the fields _apply_chat_template needs."""
+        from vllm_mlx.engine.batched import BatchedEngine
+
+        # Create a bare object that has the method but not the full engine
+        stub = object.__new__(BatchedEngine)
+        stub._is_mllm = is_mllm
+        stub._processor = processor
+        stub._model_name = "test-model"
+        # _apply_chat_template falls through to the tokenizer if the processor fails.
+        # Give it no tokenizer so there is no secondary template either.
+        stub._tokenizer = None
+        return stub
+
+    def test_no_template_processor_does_not_crash(self):
+        """A processor with no chat_template should fall back, not raise."""
+        processor = FakeProcessorNoTemplate()
+        stub = self._make_engine_stub(processor)
+        messages = [{"role": "user", "content": "What is this image?"}]
+
+        # This should NOT raise — it should fall back gracefully
+        result = stub._apply_chat_template(messages)
+        assert isinstance(result, str)
+        assert "What is this image?" in result
+
+    def test_no_template_produces_readable_prompt(self):
+        """Fallback prompt should include role and content."""
+        processor = FakeProcessorNoTemplate()
+        stub = self._make_engine_stub(processor)
+        messages = [
+            {"role": "system", "content": "You are a medical assistant."},
+            {"role": "user", "content": "Describe this X-ray."},
+        ]
+
+        result = stub._apply_chat_template(messages)
+        assert "medical assistant" in result
+        assert "X-ray" in result
+
+    def test_working_processor_still_works(self):
+        """A processor WITH a template should still use it normally."""
+        processor = FakeProcessorWithTemplate()
+        stub = self._make_engine_stub(processor)
+        messages = [{"role": "user", "content": "Hello"}]
+
+        result = stub._apply_chat_template(messages)
+        assert "Hello" in result
+
+    def test_no_template_with_tools_does_not_crash(self):
+        """Missing template + tools should also fall back gracefully."""
+        processor = FakeProcessorNoTemplate()
+        stub = self._make_engine_stub(processor)
+        messages = [{"role": "user", "content": "Check vitals"}]
+        tools = [{"type": "function", "function": {"name": "get_vitals"}}]
+
+        result = stub._apply_chat_template(messages, tools=tools)
+        assert isinstance(result, str)
+        assert "Check vitals" in result
diff --git a/vllm_mlx/engine/batched.py b/vllm_mlx/engine/batched.py
index 3ac52b4b0..24368bf98 100644
--- a/vllm_mlx/engine/batched.py
+++ b/vllm_mlx/engine/batched.py
@@ -380,13 +380,19 @@ def _apply_chat_template(
             for key in ["tools"]:
                 if key in template_kwargs:
                     del template_kwargs[key]
-            return template_applicator.apply_chat_template(
-                messages, **template_kwargs
-            )
-        else:
-            # Fallback for models without apply_chat_template
-            prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
-            return prompt + "\nassistant:"
+            try:
+                return template_applicator.apply_chat_template(
+                    messages, **template_kwargs
+                )
+            except (TypeError, ValueError) as e:
+                # No chat_template configured (e.g., MedGemma) or incompatible kwargs.
+ logger.warning(f"No chat template available: {e}, using plain-text fallback") + + # Fallback for models without apply_chat_template or no template configured + prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages) + return prompt + "\nassistant:" @staticmethod def _prepare_mllm_messages( diff --git a/vllm_mlx/engine/simple.py b/vllm_mlx/engine/simple.py index da3ccfc18..27b7860bc 100644 --- a/vllm_mlx/engine/simple.py +++ b/vllm_mlx/engine/simple.py @@ -480,8 +480,14 @@ async def chat( } if template_tools: template_kwargs["tools"] = template_tools - prompt_ids = tokenizer.apply_chat_template(messages, **template_kwargs) - prompt_token_count = len(prompt_ids) + try: + prompt_ids = tokenizer.apply_chat_template(messages, **template_kwargs) + prompt_token_count = len(prompt_ids) + except (TypeError, ValueError): + # No chat template (e.g., MedGemma) — estimate prompt tokens + prompt_token_count = sum( + len(m.get("content", "")) // 4 for m in messages + ) return GenerationOutput( text=text, tokens=output.tokens, @@ -607,7 +613,17 @@ def run_stream(): for key in ["tools", "enable_thinking"]: if key in template_kwargs: del template_kwargs[key] - prompt = tokenizer.apply_chat_template(messages, **template_kwargs) + try: + prompt = tokenizer.apply_chat_template(messages, **template_kwargs) + except (TypeError, ValueError): + prompt = None # Fall through to plain-text fallback + except ValueError: + # No chat_template configured (e.g., MedGemma) + prompt = None + + if prompt is None: + prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages) + prompt += "\nassistant:" else: prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages) prompt += "\nassistant:"