From 5197cc6043a712dc6d4e77e03743a82858304318 Mon Sep 17 00:00:00 2001 From: Kyle Zheng <126034466+KyleZheng1284@users.noreply.github.com> Date: Thu, 11 Jun 2026 22:15:19 +0000 Subject: [PATCH 1/2] routing mechanisms for hf embedding models --- .../src/nemo_retriever/models/__init__.py | 52 ++++++++++++++++++- .../models/hf_model_registry.py | 7 +++ .../models/inference/processor.py | 19 ++++++- 3 files changed, 75 insertions(+), 3 deletions(-) diff --git a/nemo_retriever/src/nemo_retriever/models/__init__.py b/nemo_retriever/src/nemo_retriever/models/__init__.py index df6c72d45d..2e71e9a254 100644 --- a/nemo_retriever/src/nemo_retriever/models/__init__.py +++ b/nemo_retriever/src/nemo_retriever/models/__init__.py @@ -4,6 +4,7 @@ from __future__ import annotations +import os from typing import TYPE_CHECKING, Any if TYPE_CHECKING: @@ -59,6 +60,36 @@ def is_vl_rerank_model(model_name: str | None) -> bool: return (model_name or "") in _VL_RERANK_MODEL_IDS +LOCAL_EMBED_ARCH_ENV = "NRL_LOCAL_EMBED_ARCH" +_VALID_LOCAL_EMBED_ARCHS = frozenset({"vl", "text"}) + + +def _is_local_checkpoint_dir(model_name: str | None) -> bool: + """Return True if *model_name* points at an on-disk checkpoint directory.""" + return bool(model_name) and os.path.isdir(str(model_name)) + + +def _resolve_local_embed_arch(model_arch: str | None) -> bool: + """Return True (VL) / False (text) for a local checkpoint directory. + + The architecture is never inferred. It must be declared explicitly via the + *model_arch* argument or the ``NRL_LOCAL_EMBED_ARCH`` environment variable, + so a local checkpoint can never be silently routed to the wrong embedder. + + Raises: + ValueError: when the architecture is unset or not one of ``vl``/``text``. 
+ """ + raw = model_arch if model_arch is not None else os.getenv(LOCAL_EMBED_ARCH_ENV) + arch = (raw or "").strip().lower() + if arch not in _VALID_LOCAL_EMBED_ARCHS: + raise ValueError( + "A local embedding checkpoint directory requires its architecture to be " + f"declared explicitly: set {LOCAL_EMBED_ARCH_ENV}='vl'|'text' (or pass " + f"model_arch) so it routes to the correct embedder. Got {raw!r}." + ) + return arch == "vl" + + def create_local_embedder( model_name: str | None = None, *, @@ -71,6 +102,7 @@ def create_local_embedder( normalize: bool = True, max_length: int = 8192, query_max_length: int = 128, + model_arch: str | None = None, ) -> Any: """Create the appropriate local embedding model (VL or non-VL). @@ -92,13 +124,26 @@ def create_local_embedder( Note: ``gpu_memory_utilization``, ``enforce_eager``, ``dimensions``, ``normalize``, and ``max_length`` apply to vLLM paths only; the HF VL path ignores them. + + A local checkpoint *directory* (e.g. a fine-tuned drop-in or proxy model) + is supported on both the text and VL paths. Because a directory carries no + registry entry, its architecture (``vl``/``text``) must be declared via + *model_arch* or ``NRL_LOCAL_EMBED_ARCH``; it is never inferred. """ b = (backend or "vllm").strip().lower() if b not in ("vllm", "hf"): raise ValueError(f"backend must be 'vllm' or 'hf', got {backend!r}") model_id = resolve_embed_model(model_name) - if is_vl_embed_model(model_name): + # Registered Hub ids select VL vs text by the id allow-list (unchanged). A + # local checkpoint dir is not in the allow-list, so it must declare its + # architecture explicitly (fail-loud rather than guess). 
+ if _is_local_checkpoint_dir(model_name): + use_vl = _resolve_local_embed_arch(model_arch) + else: + use_vl = is_vl_embed_model(model_name) + + if use_vl: if b == "hf": from nemo_retriever.models.local.llama_nemotron_embed_vl_1b_v2_embedder import ( LlamaNemotronEmbedVL1BV2Embedder, @@ -181,6 +226,7 @@ def create_local_query_embedder( normalize: bool = True, max_length: int = 8192, query_max_length: int = 128, + model_arch: str | None = None, ) -> Any: """Create a local embedder for *query* vectors in retrieval (Retriever / recall). @@ -188,6 +234,9 @@ def create_local_query_embedder( - ``backend="hf"``: HuggingFace for both VL and non-VL models. - ``backend="vllm"``: vLLM for both VL and non-VL models. + + *model_arch* (``vl``/``text``) declares the architecture of a local + checkpoint directory; see :func:`create_local_embedder`. """ b = normalize_backend(backend, _LOCAL_QUERY_BACKENDS, field_name="backend", default="hf") @@ -202,6 +251,7 @@ def create_local_query_embedder( normalize=normalize, max_length=int(max_length), query_max_length=int(query_max_length), + model_arch=model_arch, ) diff --git a/nemo_retriever/src/nemo_retriever/models/hf_model_registry.py b/nemo_retriever/src/nemo_retriever/models/hf_model_registry.py index 4daedeb62c..f598d3fcb6 100644 --- a/nemo_retriever/src/nemo_retriever/models/hf_model_registry.py +++ b/nemo_retriever/src/nemo_retriever/models/hf_model_registry.py @@ -64,6 +64,13 @@ def get_hf_revision(model_id: str, *, strict: bool = True) -> str | None: if revision is not None: return revision + # A local filesystem checkpoint has no Hub commit to pin, so the revision + # gate does not apply: load the on-disk files as-is. This is scoped to + # directories only -- unregistered *Hub* ids still hit the strict gate + # below, preserving the supply-chain pin for remote models. + if model_id and os.path.isdir(str(model_id)): + return None + msg = ( f"No pinned HuggingFace revision for model '{model_id}'. 
" "Add an entry to HF_MODEL_REVISIONS in hf_model_registry.py to pin it." diff --git a/nemo_retriever/src/nemo_retriever/models/inference/processor.py b/nemo_retriever/src/nemo_retriever/models/inference/processor.py index 798b805107..168b4b37fd 100644 --- a/nemo_retriever/src/nemo_retriever/models/inference/processor.py +++ b/nemo_retriever/src/nemo_retriever/models/inference/processor.py @@ -90,7 +90,13 @@ def maybe_inject_local_hf_embedder(task_config: Dict[str, Any], transform_config if has_endpoint or not use_local: return - from nemo_retriever.models import create_local_embedder, resolve_embed_model, is_vl_embed_model + from nemo_retriever.models import ( + _is_local_checkpoint_dir, + _resolve_local_embed_arch, + create_local_embedder, + is_vl_embed_model, + resolve_embed_model, + ) embed_model = resolve_embed_model( task_config.get("embed_model_name") @@ -103,6 +109,7 @@ def maybe_inject_local_hf_embedder(task_config: Dict[str, Any], transform_config ingest_backend = (task_config.get("local_ingest_embed_backend") or "vllm").strip().lower() + model_arch = task_config.get("embed_model_arch") embedder_instance = create_local_embedder( embed_model, backend=ingest_backend, @@ -112,11 +119,19 @@ def maybe_inject_local_hf_embedder(task_config: Dict[str, Any], transform_config enforce_eager=_to_bool(task_config.get("enforce_eager"), default=False), dimensions=task_config.get("dimensions"), query_max_length=int(task_config.get("query_max_length", 128)), + model_arch=model_arch, ) prefix = f"{transform_config.input_type}: " if getattr(transform_config, "input_type", None) else "" - if is_vl_embed_model(embed_model): + # A local checkpoint dir declares vl/text explicitly (same resolver as the + # factory); registered ids fall back to the id allow-list. 
+ if _is_local_checkpoint_dir(embed_model): + use_vl = _resolve_local_embed_arch(model_arch) + else: + use_vl = is_vl_embed_model(embed_model) + + if use_vl: def _embed(texts): vecs = embedder_instance.embed(texts, batch_size=local_batch_size) From 21b9df8e4ec6e5c0d0fe9b6ca1b42823f203e2f0 Mon Sep 17 00:00:00 2001 From: Kyle Zheng <126034466+KyleZheng1284@users.noreply.github.com> Date: Fri, 12 Jun 2026 22:02:40 +0000 Subject: [PATCH 2/2] Add local embed checkpoint routing --- .../src/nemo_retriever/models/__init__.py | 20 +- .../models/hf_model_registry.py | 18 +- .../models/inference/processor.py | 11 +- .../llama_nemotron_embed_1b_v2_embedder.py | 2 +- .../llama_nemotron_embed_1b_v2_hf_embedder.py | 2 +- .../llama_nemotron_embed_vl_1b_v2_embedder.py | 4 +- .../tests/test_local_embed_checkpoint.py | 190 ++++++++++++++++++ 7 files changed, 221 insertions(+), 26 deletions(-) create mode 100644 nemo_retriever/tests/test_local_embed_checkpoint.py diff --git a/nemo_retriever/src/nemo_retriever/models/__init__.py b/nemo_retriever/src/nemo_retriever/models/__init__.py index 2e71e9a254..0b66a4c18b 100644 --- a/nemo_retriever/src/nemo_retriever/models/__init__.py +++ b/nemo_retriever/src/nemo_retriever/models/__init__.py @@ -90,6 +90,18 @@ def _resolve_local_embed_arch(model_arch: str | None) -> bool: return arch == "vl" +def resolve_embed_model_use_vl(model_name: str | None, *, model_arch: str | None = None) -> bool: + """Return whether *model_name* should use the VL embedder path. + + Registered Hub IDs use the existing VL model allow-list. Local checkpoint + directories do not have a stable Hub ID to match against, so they must + declare their architecture via *model_arch* or ``NRL_LOCAL_EMBED_ARCH``. 
+ """ + if _is_local_checkpoint_dir(model_name): + return _resolve_local_embed_arch(model_arch) + return is_vl_embed_model(model_name) + + def create_local_embedder( model_name: str | None = None, *, @@ -135,13 +147,7 @@ def create_local_embedder( raise ValueError(f"backend must be 'vllm' or 'hf', got {backend!r}") model_id = resolve_embed_model(model_name) - # Registered Hub ids select VL vs text by the id allow-list (unchanged). A - # local checkpoint dir is not in the allow-list, so it must declare its - # architecture explicitly (fail-loud rather than guess). - if _is_local_checkpoint_dir(model_name): - use_vl = _resolve_local_embed_arch(model_arch) - else: - use_vl = is_vl_embed_model(model_name) + use_vl = resolve_embed_model_use_vl(model_name, model_arch=model_arch) if use_vl: if b == "hf": diff --git a/nemo_retriever/src/nemo_retriever/models/hf_model_registry.py b/nemo_retriever/src/nemo_retriever/models/hf_model_registry.py index f598d3fcb6..4a85ee53fe 100644 --- a/nemo_retriever/src/nemo_retriever/models/hf_model_registry.py +++ b/nemo_retriever/src/nemo_retriever/models/hf_model_registry.py @@ -47,7 +47,14 @@ } -def get_hf_revision(model_id: str, *, strict: bool = True) -> str | None: +def _is_local_model_dir(model_id: str) -> bool: + """Return whether *model_id* is a local HF model directory.""" + return ( + bool(model_id) and os.path.isdir(str(model_id)) and os.path.isfile(os.path.join(str(model_id), "config.json")) + ) + + +def get_hf_revision(model_id: str, *, strict: bool = True, allow_local_path: bool = False) -> str | None: """Return the pinned commit SHA for *model_id*. Parameters @@ -59,16 +66,15 @@ def get_hf_revision(model_id: str, *, strict: bool = True) -> str | None: no pinned revision. When ``False``, log a warning and return ``None`` so that ``from_pretrained`` falls back to the ``main`` branch. 
+ allow_local_path: + When ``True``, a local model directory containing ``config.json`` is + allowed to return ``None`` because it has no Hub commit SHA to pin. """ revision = HF_MODEL_REVISIONS.get(model_id) if revision is not None: return revision - # A local filesystem checkpoint has no Hub commit to pin, so the revision - # gate does not apply: load the on-disk files as-is. This is scoped to - # directories only -- unregistered *Hub* ids still hit the strict gate - # below, preserving the supply-chain pin for remote models. - if model_id and os.path.isdir(str(model_id)): + if allow_local_path and _is_local_model_dir(model_id): return None msg = ( diff --git a/nemo_retriever/src/nemo_retriever/models/inference/processor.py b/nemo_retriever/src/nemo_retriever/models/inference/processor.py index 168b4b37fd..d0c1c58653 100644 --- a/nemo_retriever/src/nemo_retriever/models/inference/processor.py +++ b/nemo_retriever/src/nemo_retriever/models/inference/processor.py @@ -91,11 +91,9 @@ def maybe_inject_local_hf_embedder(task_config: Dict[str, Any], transform_config return from nemo_retriever.models import ( - _is_local_checkpoint_dir, - _resolve_local_embed_arch, create_local_embedder, - is_vl_embed_model, resolve_embed_model, + resolve_embed_model_use_vl, ) embed_model = resolve_embed_model( @@ -124,12 +122,7 @@ def maybe_inject_local_hf_embedder(task_config: Dict[str, Any], transform_config prefix = f"{transform_config.input_type}: " if getattr(transform_config, "input_type", None) else "" - # A local checkpoint dir declares vl/text explicitly (same resolver as the - # factory); registered ids fall back to the id allow-list. 
- if _is_local_checkpoint_dir(embed_model): - use_vl = _resolve_local_embed_arch(model_arch) - else: - use_vl = is_vl_embed_model(embed_model) + use_vl = resolve_embed_model_use_vl(embed_model, model_arch=model_arch) if use_vl: diff --git a/nemo_retriever/src/nemo_retriever/models/local/llama_nemotron_embed_1b_v2_embedder.py b/nemo_retriever/src/nemo_retriever/models/local/llama_nemotron_embed_1b_v2_embedder.py index ec9dcf7cbe..3bd5fb6698 100644 --- a/nemo_retriever/src/nemo_retriever/models/local/llama_nemotron_embed_1b_v2_embedder.py +++ b/nemo_retriever/src/nemo_retriever/models/local/llama_nemotron_embed_1b_v2_embedder.py @@ -68,7 +68,7 @@ def _ensure_loaded(self) -> None: max_model_len = int(self.max_length) if int(self.max_length) > 0 else None self._llm = create_vllm_llm( str(model_id), - revision=get_hf_revision(model_id), + revision=get_hf_revision(model_id, allow_local_path=True), dimensions=self.dimensions, gpu_memory_utilization=self.gpu_memory_utilization, enforce_eager=self.enforce_eager, diff --git a/nemo_retriever/src/nemo_retriever/models/local/llama_nemotron_embed_1b_v2_hf_embedder.py b/nemo_retriever/src/nemo_retriever/models/local/llama_nemotron_embed_1b_v2_hf_embedder.py index 7f9f6a46c2..830cf5de35 100644 --- a/nemo_retriever/src/nemo_retriever/models/local/llama_nemotron_embed_1b_v2_hf_embedder.py +++ b/nemo_retriever/src/nemo_retriever/models/local/llama_nemotron_embed_1b_v2_hf_embedder.py @@ -56,7 +56,7 @@ def _ensure_loaded(self) -> None: model_id = self.model_id or _DEFAULT_EMBED_MODEL dev = torch.device(self.device or ("cuda" if torch.cuda.is_available() else "cpu")) hf_cache_dir = configure_global_hf_cache_base(self.hf_cache_dir) - _revision = get_hf_revision(model_id) + _revision = get_hf_revision(model_id, allow_local_path=True) self._tokenizer = AutoTokenizer.from_pretrained( model_id, revision=_revision, diff --git a/nemo_retriever/src/nemo_retriever/models/local/llama_nemotron_embed_vl_1b_v2_embedder.py 
b/nemo_retriever/src/nemo_retriever/models/local/llama_nemotron_embed_vl_1b_v2_embedder.py index 891ad25c43..727826a2fc 100644 --- a/nemo_retriever/src/nemo_retriever/models/local/llama_nemotron_embed_vl_1b_v2_embedder.py +++ b/nemo_retriever/src/nemo_retriever/models/local/llama_nemotron_embed_vl_1b_v2_embedder.py @@ -70,7 +70,7 @@ def _ensure_loaded(self) -> None: # device_map when requesting it. Fall back to sdpa/eager on CPU or # when flash-attn is not installed. use_gpu = dev.type == "cuda" - _revision = get_hf_revision(model_id) + _revision = get_hf_revision(model_id, allow_local_path=True) for attn_impl in ("flash_attention_2", "sdpa", "eager"): try: kwargs: dict[str, Any] = { @@ -234,7 +234,7 @@ def _ensure_loaded(self) -> None: model_id = self.model_id or "nvidia/llama-nemotron-embed-vl-1b-v2" self._llm = create_vllm_llm( str(model_id), - revision=get_hf_revision(model_id), + revision=get_hf_revision(model_id, allow_local_path=True), gpu_memory_utilization=self.gpu_memory_utilization, enforce_eager=self.enforce_eager, limit_mm_per_prompt={"image": 1}, diff --git a/nemo_retriever/tests/test_local_embed_checkpoint.py b/nemo_retriever/tests/test_local_embed_checkpoint.py new file mode 100644 index 0000000000..50e0f7b0e9 --- /dev/null +++ b/nemo_retriever/tests/test_local_embed_checkpoint.py @@ -0,0 +1,190 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Tests for dropping in a local embedding checkpoint directory. + +Two mechanisms are exercised, both scoped to on-disk checkpoints only so that +registered Hub ids keep their existing behavior: + +- The revision pin is bypassed for local model directories only when the + caller explicitly opts into local-path loading. 
+- ``create_local_embedder`` routes a local dir to the VL or text embedder based + on an *explicit* ``vl``/``text`` declaration (arg or ``NRL_LOCAL_EMBED_ARCH``), + failing loudly rather than inferring. +""" + +import sys +from types import ModuleType +from unittest.mock import MagicMock + +import pytest + +from nemo_retriever.models import ( + LOCAL_EMBED_ARCH_ENV, + create_local_embedder, + create_local_query_embedder, + resolve_embed_model_use_vl, +) +from nemo_retriever.models.hf_model_registry import get_hf_revision + +# --------------------------------------------------------------------------- +# Lock: get_hf_revision is bypassed for local dirs, unchanged for Hub ids +# --------------------------------------------------------------------------- + + +def test_local_dir_requires_explicit_revision_pin_bypass(tmp_path): + (tmp_path / "config.json").write_text("{}", encoding="utf-8") + + with pytest.raises(ValueError, match="No pinned HuggingFace revision"): + get_hf_revision(str(tmp_path)) + + assert get_hf_revision(str(tmp_path), allow_local_path=True) is None + assert get_hf_revision(str(tmp_path), strict=True, allow_local_path=True) is None + + +def test_local_dir_without_model_config_does_not_bypass_revision_pin(tmp_path): + with pytest.raises(ValueError, match="No pinned HuggingFace revision"): + get_hf_revision(str(tmp_path), allow_local_path=True) + + +def test_registered_hub_id_still_pinned(): + assert get_hf_revision("nvidia/llama-nemotron-embed-1b-v2") == "b4caa8456edd360b3b4e938d94ed4398dd437fad" + + +def test_unregistered_hub_id_still_raises(): + with pytest.raises(ValueError, match="No pinned HuggingFace revision"): + get_hf_revision("some-org/not-registered") + + +def test_unregistered_hub_id_non_strict_returns_none(): + assert get_hf_revision("some-org/not-registered", strict=False) is None + + +# --------------------------------------------------------------------------- +# Routing: local dir -> VL or text embedder by explicit declaration +# 
--------------------------------------------------------------------------- + + +def test_resolver_routes_registered_vl_id_without_arch(_patch_embedders): + assert resolve_embed_model_use_vl("nvidia/llama-nemotron-embed-vl-1b-v2") is True + + +def test_resolver_routes_registered_text_id_without_arch(_patch_embedders): + assert resolve_embed_model_use_vl("nvidia/llama-nemotron-embed-1b-v2") is False + + +def test_resolver_routes_local_dir_from_arch(tmp_path, _patch_embedders): + assert resolve_embed_model_use_vl(str(tmp_path), model_arch="vl") is True + assert resolve_embed_model_use_vl(str(tmp_path), model_arch="text") is False + + +@pytest.fixture(autouse=True) +def _patch_embedders(monkeypatch): + """Stub the four embedder classes so no real model is loaded. + + Mirrors test_create_local_embedder.py: the ``models.local`` package exposes + classes lazily, so inject fake submodules directly into ``sys.modules``. + """ + fake_text_vllm = MagicMock(name="LlamaNemotronEmbed1BV2Embedder") + fake_text_hf = MagicMock(name="LlamaNemotronEmbed1BV2HFEmbedder") + fake_vl_hf = MagicMock(name="LlamaNemotronEmbedVL1BV2Embedder") + fake_vl_vllm = MagicMock(name="LlamaNemotronEmbedVL1BV2VLLMEmbedder") + + text_mod = ModuleType("nemo_retriever.models.local.llama_nemotron_embed_1b_v2_embedder") + text_mod.LlamaNemotronEmbed1BV2Embedder = fake_text_vllm + + text_hf_mod = ModuleType("nemo_retriever.models.local.llama_nemotron_embed_1b_v2_hf_embedder") + text_hf_mod.LlamaNemotronEmbed1BV2HFEmbedder = fake_text_hf + + vl_mod = ModuleType("nemo_retriever.models.local.llama_nemotron_embed_vl_1b_v2_embedder") + vl_mod.LlamaNemotronEmbedVL1BV2Embedder = fake_vl_hf + vl_mod.LlamaNemotronEmbedVL1BV2VLLMEmbedder = fake_vl_vllm + + monkeypatch.setitem(sys.modules, "nemo_retriever.models.local.llama_nemotron_embed_1b_v2_embedder", text_mod) + monkeypatch.setitem(sys.modules, "nemo_retriever.models.local.llama_nemotron_embed_1b_v2_hf_embedder", text_hf_mod) + monkeypatch.setitem(sys.modules, 
"nemo_retriever.models.local.llama_nemotron_embed_vl_1b_v2_embedder", vl_mod) + monkeypatch.delenv(LOCAL_EMBED_ARCH_ENV, raising=False) + + yield fake_text_vllm, fake_text_hf, fake_vl_hf, fake_vl_vllm + + +def test_local_dir_arch_vl_routes_to_vl_vllm(tmp_path, _patch_embedders): + _, _, _, fake_vl_vllm = _patch_embedders + result = create_local_embedder(str(tmp_path), model_arch="vl") # default backend vllm + fake_vl_vllm.assert_called_once() + assert fake_vl_vllm.call_args.kwargs["model_id"] == str(tmp_path) + assert result is fake_vl_vllm.return_value + + +def test_local_dir_arch_vl_hf_routes_to_vl_hf(tmp_path, _patch_embedders): + _, _, fake_vl_hf, _ = _patch_embedders + result = create_local_embedder(str(tmp_path), backend="hf", model_arch="vl") + fake_vl_hf.assert_called_once() + assert result is fake_vl_hf.return_value + + +def test_local_dir_arch_text_hf_routes_to_text_hf(tmp_path, _patch_embedders): + _, fake_text_hf, _, _ = _patch_embedders + result = create_local_embedder(str(tmp_path), backend="hf", model_arch="text") + fake_text_hf.assert_called_once() + assert fake_text_hf.call_args.kwargs["model_id"] == str(tmp_path) + assert result is fake_text_hf.return_value + + +def test_local_dir_arch_text_vllm_routes_to_text_vllm(tmp_path, _patch_embedders): + fake_text_vllm, _, _, _ = _patch_embedders + result = create_local_embedder(str(tmp_path), model_arch="text") + fake_text_vllm.assert_called_once() + assert result is fake_text_vllm.return_value + + +def test_local_dir_without_arch_fails_loud(tmp_path, _patch_embedders): + with pytest.raises(ValueError, match=LOCAL_EMBED_ARCH_ENV): + create_local_embedder(str(tmp_path), backend="hf") + + +def test_local_dir_invalid_arch_fails_loud(tmp_path, _patch_embedders): + with pytest.raises(ValueError, match=LOCAL_EMBED_ARCH_ENV): + create_local_embedder(str(tmp_path), backend="hf", model_arch="multimodal") + + +def test_local_dir_arch_from_env(tmp_path, monkeypatch, _patch_embedders): + _, _, fake_vl_hf, _ = 
_patch_embedders + monkeypatch.setenv(LOCAL_EMBED_ARCH_ENV, "vl") + result = create_local_embedder(str(tmp_path), backend="hf") + fake_vl_hf.assert_called_once() + assert result is fake_vl_hf.return_value + + +def test_explicit_arg_overrides_env(tmp_path, monkeypatch, _patch_embedders): + _, fake_text_hf, fake_vl_hf, _ = _patch_embedders + monkeypatch.setenv(LOCAL_EMBED_ARCH_ENV, "vl") + create_local_embedder(str(tmp_path), backend="hf", model_arch="text") + fake_text_hf.assert_called_once() + fake_vl_hf.assert_not_called() + + +def test_query_embedder_forwards_arch_for_local_dir(tmp_path, _patch_embedders): + _, _, fake_vl_hf, _ = _patch_embedders + result = create_local_query_embedder(str(tmp_path), backend="hf", model_arch="vl") + fake_vl_hf.assert_called_once() + assert result is fake_vl_hf.return_value + + +def test_query_embedder_local_dir_without_arch_fails_loud(tmp_path, _patch_embedders): + with pytest.raises(ValueError, match=LOCAL_EMBED_ARCH_ENV): + create_local_query_embedder(str(tmp_path), backend="hf") + + +# --------------------------------------------------------------------------- +# Registered Hub ids: routing is unchanged (no arch needed) +# --------------------------------------------------------------------------- + + +def test_registered_id_ignores_arch_and_uses_allowlist(_patch_embedders): + _, _, _, fake_vl_vllm = _patch_embedders + # Pass a contradictory arch hint on purpose: a registered VL id must still route to VL. + result = create_local_embedder("nvidia/llama-nemotron-embed-vl-1b-v2", model_arch="text") + fake_vl_vllm.assert_called_once() + assert result is fake_vl_vllm.return_value