94 changes: 94 additions & 0 deletions tests/test_missing_chat_template.py
@@ -0,0 +1,94 @@
# SPDX-License-Identifier: Apache-2.0
"""Tests for models with no chat_template (e.g., MedGemma).

MedGemma's HuggingFace processor has apply_chat_template() as a method
(inherited from base class) but no chat_template configured. Calling
apply_chat_template() raises ValueError. vllm-mlx should fall back to
a plain-text prompt format instead of crashing.
"""

import pytest


class FakeProcessorNoTemplate:
    """Simulates a HuggingFace processor with no chat_template set."""

    chat_template = None  # No template configured

    def apply_chat_template(self, messages, **kwargs):
        """Raises like the real processor does when no template is set."""
        raise ValueError(
            "Cannot use apply_chat_template when no chat_template is set. "
            "Provide a chat_template or use a model with one."
        )


class FakeProcessorWithTemplate:
    """Simulates a working processor for comparison."""

    chat_template = "{% for m in messages %}{{ m['role'] }}: {{ m['content'] }}\n{% endfor %}"

    def apply_chat_template(self, messages, **kwargs):
        return "\n".join(f"{m['role']}: {m['content']}" for m in messages) + "\nassistant:"


class TestBatchedEngineChatTemplateFallback:
    """Test that _apply_chat_template handles missing templates gracefully."""

    def _make_engine_stub(self, processor, is_mllm=True):
        """Create a minimal stub with the fields _apply_chat_template needs."""
        from vllm_mlx.engine.batched import BatchedEngine

        # Create a bare object that has the method but not the full engine
        stub = object.__new__(BatchedEngine)
        stub._is_mllm = is_mllm
        stub._processor = processor
        stub._model_name = "test-model"
        # If the processor path fails, _apply_chat_template may try the
        # tokenizer next; None here means no tokenizer fallback exists,
        # so the plain-text fallback must handle the request.
        stub._tokenizer = None
        return stub

    def test_no_template_processor_does_not_crash(self):
        """A processor with no chat_template should fall back, not raise."""
        processor = FakeProcessorNoTemplate()
        stub = self._make_engine_stub(processor)
        messages = [{"role": "user", "content": "What is this image?"}]

        # This should NOT raise; it should fall back gracefully
        result = stub._apply_chat_template(messages)
        assert isinstance(result, str)
        assert "What is this image?" in result

    def test_no_template_produces_readable_prompt(self):
        """Fallback prompt should include role and content."""
        processor = FakeProcessorNoTemplate()
        stub = self._make_engine_stub(processor)
        messages = [
            {"role": "system", "content": "You are a medical assistant."},
            {"role": "user", "content": "Describe this X-ray."},
        ]

        result = stub._apply_chat_template(messages)
        assert "medical assistant" in result
        assert "X-ray" in result

    def test_working_processor_still_works(self):
        """A processor WITH a template should still use it normally."""
        processor = FakeProcessorWithTemplate()
        stub = self._make_engine_stub(processor)
        messages = [{"role": "user", "content": "Hello"}]

        result = stub._apply_chat_template(messages)
        assert "Hello" in result

    def test_no_template_with_tools_does_not_crash(self):
        """Missing template + tools should also fall back gracefully."""
        processor = FakeProcessorNoTemplate()
        stub = self._make_engine_stub(processor)
        messages = [{"role": "user", "content": "Check vitals"}]
        tools = [{"type": "function", "function": {"name": "get_vitals"}}]

        result = stub._apply_chat_template(messages, tools=tools)
        assert isinstance(result, str)
        assert "Check vitals" in result
20 changes: 13 additions & 7 deletions vllm_mlx/engine/batched.py
@@ -380,13 +380,19 @@ def _apply_chat_template(
for key in ["tools"]:
if key in template_kwargs:
del template_kwargs[key]
return template_applicator.apply_chat_template(
messages, **template_kwargs
)
else:
# Fallback for models without apply_chat_template
prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
return prompt + "\nassistant:"
try:
return template_applicator.apply_chat_template(
messages, **template_kwargs
)
except (TypeError, ValueError):
pass # Fall through to plain-text fallback below
except ValueError as e:
# No chat_template configured (e.g., MedGemma processor).
logger.warning(f"No chat template available: {e}, using plain-text fallback")

# Fallback for models without apply_chat_template or no template configured
prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
return prompt + "\nassistant:"

     @staticmethod
     def _prepare_mllm_messages(
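For reference, a minimal standalone sketch (not part of the diff) of the string the plain-text fallback produces; the message list is illustrative:

    messages = [
        {"role": "system", "content": "You are a medical assistant."},
        {"role": "user", "content": "Describe this X-ray."},
    ]
    prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages) + "\nassistant:"
    # system: You are a medical assistant.
    # user: Describe this X-ray.
    # assistant: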
22 changes: 19 additions & 3 deletions vllm_mlx/engine/simple.py
@@ -480,8 +480,14 @@ async def chat(
             }
             if template_tools:
                 template_kwargs["tools"] = template_tools
-            prompt_ids = tokenizer.apply_chat_template(messages, **template_kwargs)
-            prompt_token_count = len(prompt_ids)
+            try:
+                prompt_ids = tokenizer.apply_chat_template(messages, **template_kwargs)
+                prompt_token_count = len(prompt_ids)
+            except (TypeError, ValueError):
+                # No chat template (e.g., MedGemma): estimate ~4 chars per token
+                prompt_token_count = sum(
+                    len(m.get("content", "")) // 4 for m in messages
+                )
         return GenerationOutput(
             text=text,
             tokens=output.tokens,
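As a quick check of the four-characters-per-token estimate above (a standalone sketch; the sample message is illustrative):

    messages = [{"role": "user", "content": "What is this image?"}]
    # "What is this image?" is 19 characters, so the estimate is 19 // 4 = 4.
    prompt_token_count = sum(len(m.get("content", "")) // 4 for m in messages)
    assert prompt_token_count == 4

Note the heuristic assumes string content; a multimodal message whose content is a list of parts would contribute only len(parts) // 4.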
@@ -607,7 +613,17 @@ def run_stream():
for key in ["tools", "enable_thinking"]:
if key in template_kwargs:
del template_kwargs[key]
prompt = tokenizer.apply_chat_template(messages, **template_kwargs)
try:
prompt = tokenizer.apply_chat_template(messages, **template_kwargs)
except (TypeError, ValueError):
prompt = None # Fall through to plain-text fallback
except ValueError:
# No chat_template configured (e.g., MedGemma)
prompt = None

if prompt is None:
prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
prompt += "\nassistant:"
else:
prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
prompt += "\nassistant:"