From 105d904c5df21940f3bc9d42e3c9bb9dbcda1a24 Mon Sep 17 00:00:00 2001 From: TKaltofen Date: Wed, 10 Jun 2026 09:47:14 +0200 Subject: [PATCH 1/9] feat: RAG connector families (retrieve, rerank, generate, graph_rag, structured, orchestrator) (#31) feat: RAG connector families (retrieve, rerank, generate, graph_rag, structured, orchestrator) --- README.md | 100 +++ pyproject.toml | 29 + .../feature_groups/connectors/__init__.py | 12 + .../connectors/generate/__init__.py | 9 + .../connectors/generate/_text.py | 21 + .../connectors/generate/base.py | 166 ++++ .../generate/extractive_responder.py | 69 ++ .../connectors/generate/template_responder.py | 86 ++ .../connectors/graph_rag/__init__.py | 9 + .../graph_rag/adjacency_graph_rag.py | 79 ++ .../connectors/graph_rag/base.py | 220 ++++++ .../graph_rag/networkx_graph_rag.py | 71 ++ .../connectors/orchestrator/__init__.py | 9 + .../connectors/orchestrator/base.py | 168 ++++ .../orchestrator/fixtures/r2r_responses.json | 22 + .../orchestrator/haystack_orchestrator.py | 75 ++ .../orchestrator/r2r_fixture_orchestrator.py | 125 +++ .../connectors/rerank/__init__.py | 9 + .../feature_groups/connectors/rerank/base.py | 181 +++++ .../connectors/rerank/flashrank_reranker.py | 71 ++ .../connectors/rerank/lexical_reranker.py | 50 ++ .../connectors/retrieve/__init__.py | 9 + .../connectors/retrieve/base.py | 262 ++++++ .../connectors/retrieve/bm25s_retriever.py | 65 ++ .../connectors/retrieve/tfidf_retriever.py | 73 ++ .../connectors/structured/__init__.py | 9 + .../connectors/structured/aggregate_sql.py | 144 ++++ .../connectors/structured/base.py | 199 +++++ .../connectors/structured/rule_based_sql.py | 84 ++ tests/conftest.py | 5 + tests/connectors/__init__.py | 1 + tests/connectors/generate/__init__.py | 1 + .../connectors/generate/generate_contract.py | 178 +++++ tests/connectors/generate/test_base_guards.py | 155 ++++ .../generate/test_extractive_responder.py | 48 ++ .../generate/test_template_responder.py | 76 ++ 
tests/connectors/graph_rag/__init__.py | 1 + .../graph_rag/graph_rag_contract.py | 228 ++++++ .../graph_rag/test_adjacency_graph_rag.py | 57 ++ .../graph_rag/test_backend_parity.py | 56 ++ tests/connectors/graph_rag/test_base_edges.py | 96 +++ .../graph_rag/test_networkx_graph_rag.py | 60 ++ tests/connectors/orchestrator/__init__.py | 1 + .../orchestrator/orchestrator_contract.py | 191 +++++ .../orchestrator/test_base_safety.py | 42 + .../test_haystack_orchestrator.py | 48 ++ .../test_r2r_fixture_orchestrator.py | 98 +++ tests/connectors/rerank/__init__.py | 1 + tests/connectors/rerank/rerank_contract.py | 173 ++++ .../rerank/test_flashrank_reranker.py | 47 ++ .../rerank/test_lexical_reranker.py | 35 + tests/connectors/retrieve/__init__.py | 7 + .../connectors/retrieve/retrieve_contract.py | 285 +++++++ .../retrieve/test_base_validation.py | 105 +++ .../retrieve/test_bm25s_retriever.py | 44 ++ .../retrieve/test_tfidf_retriever.py | 46 ++ tests/connectors/structured/__init__.py | 1 + .../structured/structured_contract.py | 210 +++++ .../structured/test_aggregate_sql.py | 116 +++ .../connectors/structured/test_base_safety.py | 131 +++ .../structured/test_rule_based_sql.py | 79 ++ tox.ini | 8 +- uv.lock | 744 +++++++++++++++++- 63 files changed, 5798 insertions(+), 2 deletions(-) create mode 100644 rag_integration/feature_groups/connectors/__init__.py create mode 100644 rag_integration/feature_groups/connectors/generate/__init__.py create mode 100644 rag_integration/feature_groups/connectors/generate/_text.py create mode 100644 rag_integration/feature_groups/connectors/generate/base.py create mode 100644 rag_integration/feature_groups/connectors/generate/extractive_responder.py create mode 100644 rag_integration/feature_groups/connectors/generate/template_responder.py create mode 100644 rag_integration/feature_groups/connectors/graph_rag/__init__.py create mode 100644 rag_integration/feature_groups/connectors/graph_rag/adjacency_graph_rag.py create mode 100644 
rag_integration/feature_groups/connectors/graph_rag/base.py create mode 100644 rag_integration/feature_groups/connectors/graph_rag/networkx_graph_rag.py create mode 100644 rag_integration/feature_groups/connectors/orchestrator/__init__.py create mode 100644 rag_integration/feature_groups/connectors/orchestrator/base.py create mode 100644 rag_integration/feature_groups/connectors/orchestrator/fixtures/r2r_responses.json create mode 100644 rag_integration/feature_groups/connectors/orchestrator/haystack_orchestrator.py create mode 100644 rag_integration/feature_groups/connectors/orchestrator/r2r_fixture_orchestrator.py create mode 100644 rag_integration/feature_groups/connectors/rerank/__init__.py create mode 100644 rag_integration/feature_groups/connectors/rerank/base.py create mode 100644 rag_integration/feature_groups/connectors/rerank/flashrank_reranker.py create mode 100644 rag_integration/feature_groups/connectors/rerank/lexical_reranker.py create mode 100644 rag_integration/feature_groups/connectors/retrieve/__init__.py create mode 100644 rag_integration/feature_groups/connectors/retrieve/base.py create mode 100644 rag_integration/feature_groups/connectors/retrieve/bm25s_retriever.py create mode 100644 rag_integration/feature_groups/connectors/retrieve/tfidf_retriever.py create mode 100644 rag_integration/feature_groups/connectors/structured/__init__.py create mode 100644 rag_integration/feature_groups/connectors/structured/aggregate_sql.py create mode 100644 rag_integration/feature_groups/connectors/structured/base.py create mode 100644 rag_integration/feature_groups/connectors/structured/rule_based_sql.py create mode 100644 tests/connectors/__init__.py create mode 100644 tests/connectors/generate/__init__.py create mode 100644 tests/connectors/generate/generate_contract.py create mode 100644 tests/connectors/generate/test_base_guards.py create mode 100644 tests/connectors/generate/test_extractive_responder.py create mode 100644 
tests/connectors/generate/test_template_responder.py create mode 100644 tests/connectors/graph_rag/__init__.py create mode 100644 tests/connectors/graph_rag/graph_rag_contract.py create mode 100644 tests/connectors/graph_rag/test_adjacency_graph_rag.py create mode 100644 tests/connectors/graph_rag/test_backend_parity.py create mode 100644 tests/connectors/graph_rag/test_base_edges.py create mode 100644 tests/connectors/graph_rag/test_networkx_graph_rag.py create mode 100644 tests/connectors/orchestrator/__init__.py create mode 100644 tests/connectors/orchestrator/orchestrator_contract.py create mode 100644 tests/connectors/orchestrator/test_base_safety.py create mode 100644 tests/connectors/orchestrator/test_haystack_orchestrator.py create mode 100644 tests/connectors/orchestrator/test_r2r_fixture_orchestrator.py create mode 100644 tests/connectors/rerank/__init__.py create mode 100644 tests/connectors/rerank/rerank_contract.py create mode 100644 tests/connectors/rerank/test_flashrank_reranker.py create mode 100644 tests/connectors/rerank/test_lexical_reranker.py create mode 100644 tests/connectors/retrieve/__init__.py create mode 100644 tests/connectors/retrieve/retrieve_contract.py create mode 100644 tests/connectors/retrieve/test_base_validation.py create mode 100644 tests/connectors/retrieve/test_bm25s_retriever.py create mode 100644 tests/connectors/retrieve/test_tfidf_retriever.py create mode 100644 tests/connectors/structured/__init__.py create mode 100644 tests/connectors/structured/structured_contract.py create mode 100644 tests/connectors/structured/test_aggregate_sql.py create mode 100644 tests/connectors/structured/test_base_safety.py create mode 100644 tests/connectors/structured/test_rule_based_sql.py diff --git a/README.md b/README.md index 26663cc..e6d8620 100644 --- a/README.md +++ b/README.md @@ -121,6 +121,105 @@ feature = Feature( | Deduplication | `ExactHashImageDeduplicator`, `PerceptualHashImageDeduplicator`, `DifferenceHashImageDeduplicator` 
| | Embedding | `MockImageEmbedder`, `HashImageEmbedder`, `CLIPImageEmbedder` | +## Connector families + +Alongside the build-your-own stage pipeline, the `connectors/` package wraps +whole external open-source RAG tools under one mloda surface, organized into +families by query-contract shape (see issue #25 for the taxonomy and the +backend-selection rationale). Each family is a thin `BaseConnector` +FeatureGroup plus one or more concrete backends, with an inheritable +contract-test suite so a new backend's test is a handful of adapter methods. + +The first family is `retrieve` (`query_text + corpus + top_k -> ranked +passages`). Its canonical backend is `Bm25sRetriever` (BM25 lexical retrieval +via `bm25s`): zero-download, deterministic, MIT/numpy-only. + +```python +from mloda.user import mlodaAPI, Feature, Options, PluginCollector +from mloda_plugins.compute_framework.base_implementations.python_dict.python_dict_framework import ( + PythonDictFramework, +) +from rag_integration.feature_groups.connectors.retrieve import Bm25sRetriever + +feature = Feature( + "retrieved_passages", + options=Options(context={ + "retrieve_backend": "bm25s", + "query_text": "cat pet", + "corpus": [ + {"doc_id": "d1", "text": "A cat is an independent and curious pet."}, + {"doc_id": "d2", "text": "Cars need regular engine oil and maintenance."}, + ], + "top_k": 3, + }), +) +results = mlodaAPI.run_all( + [feature], + compute_frameworks={PythonDictFramework}, + plugin_collector=PluginCollector.enabled_feature_groups({Bm25sRetriever}), +) +``` + +A second backend, `TfidfRetriever` (`retrieve_backend="tfidf"`), ranks the same +corpus by TF-IDF cosine similarity (a vector-space lexical counterpart to the +probabilistic `bm25s`): it vectorizes the corpus and query with the repo's +deterministic TF-IDF embedder and needs no extra dependency, so it is also +zero-download and a CI anchor. + +Install the family's backend with `uv sync --extra connectors`. 
+ +The `rerank` family (`query_text + candidates + top_k -> reordered passages`) +reorders already-retrieved candidates. Its canonical backend is +`LexicalReranker` (`rerank_backend="lexical"`): pure-Python token overlap, +zero-download and deterministic. `FlashRankReranker` (`rerank_backend="flashrank"`, +`uv sync --extra rerank`) adds a real ONNX cross-encoder; its model downloads on +first use, so its test runs locally and is skipped on CI. + +The `generate` family (`query_text + passages -> answer + citations`) produces a +grounded answer from supporting passages. Its canonical backend is +`ExtractiveResponder` (`generate_backend="extractive"`): pure-Python sentence +extraction, zero-download and deterministic, and grounded by construction (every +citation is one of the supplied passages). A second backend, `TemplateResponder` +(`generate_backend="template"`), selects the top query-relevant sentences across +passages, joins them into a fixed template, and cites every passage it drew from +(multi-citation, vs the extractive responder's single citation); it is likewise +pure-Python, zero-download, and grounded by construction. LLM-backed generators +are pedigree backends for later. + +The `graph_rag` family (`query_text + nodes + edges + top_k -> ranked passages`) +scores nodes by query overlap plus a one-hop neighbour bonus: a passage +connected to a relevant one is surfaced even with no query-term overlap. Its canonical backend is `NetworkxGraphRag` +(`graph_backend="networkx"`, `uv sync --extra graph`): zero-download, +deterministic, BSD/pure-Python. A second backend, `AdjacencyGraphRag` +(`graph_backend="adjacency"`), applies the same overlap + neighbour-bonus +scoring over a hand-built adjacency map with no networkx (stdlib only), +demonstrating that the family contract is not tied to one graph library. + +The `structured` family (`question + table -> SQL -> typed rows`) answers a +natural-language question over a relational table. 
Its canonical backend is +`RuleBasedSql` (`structured_backend="rule_based"`, `uv sync --extra structured`): +rule-based NL->SQL executed on stdlib `sqlite3`, with `sqlglot` validating the +generated SQL is a single top-level `SELECT` statement. Zero-download, +deterministic, no LLM; values are always bound parameters and identifiers are +whitelisted. A second backend, `AggregateSql` (`structured_backend="aggregate"`), +adds aggregation intents (avg/min/max/sum over a column named in the question; +numericness is not validated, and SQLite's coercion means e.g. `AVG` over a text +column returns `0.0`) on top of the count/filter/list intents, reusing the same +identifier whitelist, SQL guard, and sqlite execution. + +The `orchestrator` family (`query_text + corpus + top_k -> answer + documents`) +wraps a whole external RAG framework as one connector (bring your existing +pipeline). Its canonical backend is `HaystackOrchestrator` +(`orchestrator_backend="haystack"`, `uv sync --extra orchestrator`): a real +Haystack 2.x in-memory BM25 pipeline, zero-download (no model, no server) so it +runs in CI. A second backend, `R2RFixtureOrchestrator` +(`orchestrator_backend="r2r"`), covers a different integration mode: it models a +server-shaped tool (R2R) over a static JSON fixture of canned responses (the +open-kgo `rest_public` pattern), answering with honest-surface narrowing +(surfacing only canned documents that are in the supplied corpus). No server, no +network, zero-dependency, deterministic. Other server-shaped tools (e.g. +RAGFlow) can follow the same fixture-stub pattern. 
+ ## Installation Clone the repository and install with uv: @@ -140,6 +239,7 @@ To install only specific extras, use `uv sync --extra `: | `faiss` | FAISS vector indexing (`faiss-cpu`) | | `advanced` | Presidio, sentence-transformers, joblib, Pillow, FAISS| | `eval` | BEIR benchmark datasets, pandas, numpy | +| `graph` | networkx graph-RAG backend (`NetworkxGraphRag`) | | `dev` | tox, pytest, ruff, mypy, bandit | ## CLI diff --git a/pyproject.toml b/pyproject.toml index 5467439..4bad2e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,11 +36,40 @@ eval = [ "pandas>=1.3.0", "numpy>=1.21.0", ] +# Connector families: wrap external open-source RAG tools (see issue #25). +# retrieve family canonical backend: bm25s (MIT, numpy-only, zero-download). +connectors = [ + "bm25s>=0.2.0", +] +# rerank family pedigree backend: FlashRank (Apache-2.0, ONNX, no torch). The +# canonical lexical reranker is pure-Python and needs no extra; this adds the +# neural cross-encoder backend (model downloads on first use). +rerank = [ + "flashrank>=0.2.0", +] +# graph_rag family canonical backend: networkx (BSD, pure-Python, zero-download). +graph = [ + "networkx>=3.0", +] +# structured family: sqlglot (MIT, pure-Python, zero-download) parses/validates +# the generated SQL; execution is on the stdlib sqlite3 module. +structured = [ + "sqlglot>=25", +] +# orchestrator family canonical backend: Haystack 2.x (Apache-2.0). Its in-memory +# BM25 pipeline is zero-download (no model, no server), so it runs in CI. +orchestrator = [ + "haystack-ai>=2.0", +] [tool.setuptools.packages.find] where = ["."] include = ["rag_integration*"] +[tool.setuptools.package-data] +# Ship the orchestrator R2R fixture-stub's canned-response JSON in the wheel. 
+"rag_integration.feature_groups.connectors.orchestrator" = ["fixtures/*.json"] + [tool.pytest.ini_options] testpaths = ["rag_integration", "tests"] python_files = ["test_*.py"] diff --git a/rag_integration/feature_groups/connectors/__init__.py b/rag_integration/feature_groups/connectors/__init__.py new file mode 100644 index 0000000..14f2feb --- /dev/null +++ b/rag_integration/feature_groups/connectors/__init__.py @@ -0,0 +1,12 @@ +"""Connector families: wrap external open-source RAG tools under one mloda surface. + +Each family is a thin ``Base<Family>Connector`` FeatureGroup plus one or more +concrete backends, paired with an inheritable contract-test suite. Unlike the +stage pipeline under ``rag_pipeline/`` (build-your-own RAG from chained +FeatureGroups), a connector exposes a whole external retrieval/rerank/generate +tool through a single feature. See issue #25 for the family taxonomy and the +selection rationale. + +Family axis is the query-contract shape; the canonical concrete per family is +the zero-download, deterministic backend that anchors the CI contract suite. 
+""" diff --git a/rag_integration/feature_groups/connectors/generate/__init__.py b/rag_integration/feature_groups/connectors/generate/__init__.py new file mode 100644 index 0000000..ede8bce --- /dev/null +++ b/rag_integration/feature_groups/connectors/generate/__init__.py @@ -0,0 +1,9 @@ +"""The ``generate`` connector family: query + passages -> answer + citations.""" + +from __future__ import annotations + +from rag_integration.feature_groups.connectors.generate.base import BaseGenerateConnector +from rag_integration.feature_groups.connectors.generate.extractive_responder import ExtractiveResponder +from rag_integration.feature_groups.connectors.generate.template_responder import TemplateResponder + +__all__ = ["BaseGenerateConnector", "ExtractiveResponder", "TemplateResponder"] diff --git a/rag_integration/feature_groups/connectors/generate/_text.py b/rag_integration/feature_groups/connectors/generate/_text.py new file mode 100644 index 0000000..f732ef0 --- /dev/null +++ b/rag_integration/feature_groups/connectors/generate/_text.py @@ -0,0 +1,21 @@ +"""Shared text helpers for the ``generate`` family's no-LLM responders. + +Both deterministic baselines (extractive and template) tokenize and +sentence-split the same way; this private module holds the single copy so the +two backends cannot drift apart. The helpers are intentionally tiny: an +ASCII-lowercase token set and a punctuation-based sentence splitter. 
# Runs of ASCII lowercase letters/digits; every other character is a separator.
_TOKEN_RE = re.compile(r"[a-z0-9]+")

# A sentence is a run of non-terminator characters followed by at most one of
# ``.``, ``!`` or ``?``; leading whitespace is kept, so callers strip matches.
SENTENCE_RE = re.compile(r"[^.!?]+[.!?]?")
"""Punctuation-based sentence splitter (over-splits abbreviations)."""


def tokenize(text: str) -> set[str]:
    """Lowercase ``text`` and collect its distinct ``[a-z0-9]+`` tokens.

    Characters outside ``[a-z0-9]`` (after lowercasing) only separate tokens,
    so non-ASCII letters are dropped rather than matched.
    """
    lowered = text.lower()
    return {match.group() for match in _TOKEN_RE.finditer(lowered)}
class BaseGenerateConnector(FeatureGroup):
    """Shared root FeatureGroup behind every ``generate`` backend.

    Contract: ``query_text + passages -> answer + citations``, delivered as a
    single row ``{"generated_answer": {"answer": ..., "citations": [...]}}``.

    A concrete backend lists its selector value(s) in ``GENERATE_BACKENDS`` and
    implements :meth:`_generate`. This base reads the options, rejects
    duplicate passage ids, checks that every citation names a supplied passage
    (and is cited once), and shapes the output row. A concrete is selected by
    :meth:`match_feature_group_criteria`, which requires the root feature name
    and ``generate_backend in cls.GENERATE_BACKENDS``.
    """

    ROOT_FEATURE_NAME = "generated_answer"

    # Names of the option keys read by this family.
    GENERATE_BACKEND = "generate_backend"
    QUERY_TEXT = "query_text"
    PASSAGES = "passages"

    # Selector value -> description. Left empty here so the base itself never
    # matches; each concrete overrides it.
    GENERATE_BACKENDS: Dict[str, str] = {}

    # Option documentation only. Matching is done by
    # ``match_feature_group_criteria``, not by the FeatureChainParser.
    PROPERTY_MAPPING = {
        GENERATE_BACKEND: {"explanation": "Which generate-connector backend to use"},
        QUERY_TEXT: {"explanation": "The question to answer"},
        PASSAGES: {"explanation": "Supporting passages: a list of {doc_id, text} dicts"},
    }

    @classmethod
    def compute_framework_rule(cls) -> Optional[Set[Type[ComputeFramework]]]:
        """Restrict the family to the ``PythonDictFramework`` compute framework."""
        return {PythonDictFramework}

    @classmethod
    def input_data(cls) -> DataCreator:
        """Declare the root feature name as the data this group creates."""
        return DataCreator({cls.ROOT_FEATURE_NAME})

    @classmethod
    def match_feature_group_criteria(
        cls,
        feature_name: Union[FeatureName, str],
        options: Options,
        data_access_collection: Any = None,
    ) -> bool:
        """Accept only the root feature name paired with a backend this class declares."""
        # Short-circuit: the options are not consulted when the name differs.
        return (
            str(feature_name) == cls.ROOT_FEATURE_NAME
            and options.get(cls.GENERATE_BACKEND) in cls.GENERATE_BACKENDS
        )

    def input_features(self, options: Options, feature_name: FeatureName) -> None:
        """Return ``None``: a root feature has no inputs (passages come from Options)."""
        return None

    @classmethod
    def _get_passages(cls, options: Options) -> List[Dict[str, Any]]:
        """Read the passages option as a list and reject repeated ``doc_id`` values.

        A passage without a ``doc_id`` is identified by its list position.

        Raises:
            ValueError: if the option is missing or two passages share an id.
        """
        supplied = options.get(cls.PASSAGES)
        if supplied is None:
            raise ValueError(f"{cls.__name__} requires '{cls.PASSAGES}' in options: a list of {{doc_id, text}} dicts.")
        supplied = list(supplied)
        taken: Set[str] = set()
        for position, entry in enumerate(supplied):
            identifier = str(entry.get("doc_id", str(position)))
            if identifier not in taken:
                taken.add(identifier)
                continue
            raise ValueError(
                f"{cls.__name__} received duplicate passage doc_id '{identifier}'; doc_ids must be unique."
            )
        return supplied

    @classmethod
    @abstractmethod
    def _generate(cls, query: str, passages: List[Dict[str, Any]]) -> Tuple[str, List[str]]:
        """Produce ``(answer, citations)`` for ``query`` from ``passages``.

        ``answer`` is the answer text; ``citations`` lists the ``doc_id`` of
        each passage the answer draws from. :meth:`_answer` validates the
        citations against the supplied passages and short-circuits on an empty
        passage list, so it never calls this hook with no passages.
        """
        ...

    @classmethod
    def _validate_citations(cls, citations: List[str], passages: List[Dict[str, Any]]) -> None:
        """Raise ``ValueError`` for a citation outside the supplied passages, or a repeated one."""
        supplied_ids: Set[str] = set()
        for position, entry in enumerate(passages):
            supplied_ids.add(str(entry.get("doc_id", str(position))))
        # Unknown ids are reported before duplicates.
        for cited in citations:
            if cited in supplied_ids:
                continue
            raise ValueError(f"{cls.__name__}._generate cited '{cited}', which is not among the supplied passages.")
        if len(citations) > len(set(citations)):
            raise ValueError(f"{cls.__name__}._generate returned duplicate citations; each doc_id may be cited once.")

    @classmethod
    def _answer(cls, query: str, passages: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Wrap :meth:`_generate` with the grounding checks and return the answer object.

        Raises:
            ValueError: if the backend cites an unknown or repeated passage,
                answers without citing, or cites without answering.
        """
        if not passages:
            return {"answer": "", "citations": []}
        text, cited = cls._generate(query, passages)
        cls._validate_citations(cited, passages)
        has_text = bool(text.strip())
        # Grounding is enforced both ways: text needs a source, and a source
        # needs text.
        if has_text and not cited:
            raise ValueError(
                f"{cls.__name__}._generate returned a non-empty answer with no citations; "
                f"a grounded answer must cite at least one supplied passage."
            )
        if cited and not has_text:
            raise ValueError(
                f"{cls.__name__}._generate returned citations with an empty answer; "
                f"citations are only valid for a non-empty answer."
            )
        if has_text:
            return {"answer": text, "citations": cited}
        # A whitespace-only answer collapses to the canonical empty shape.
        return {"answer": "", "citations": []}

    @classmethod
    def calculate_feature(cls, data: Any, features: FeatureSet) -> List[Dict[str, Any]]:
        """Answer from the first feature's options and return it as a single row.

        Returns an empty list when the feature set holds no features.

        Raises:
            ValueError: if the query or passages option is missing.
        """
        for requested in features.features:
            opts = requested.options
            question = opts.get(cls.QUERY_TEXT)
            if question is None:
                raise ValueError(f"{cls.__name__} requires '{cls.QUERY_TEXT}' in options.")
            supporting = cls._get_passages(opts)
            answer_object = cls._answer(str(question), supporting)
            return [{cls.ROOT_FEATURE_NAME: answer_object}]
        return []
class ExtractiveResponder(BaseGenerateConnector):
    """Single-sentence extractive backend (``generate_backend="extractive"``).

    Every passage is cut into sentences with ``SENTENCE_RE``; each sentence is
    scored by how many distinct query tokens it shares (via ``tokenize``), and
    the top-scoring sentence is returned verbatim, cited to its passage. On a
    tie the earliest sentence (passage order, then sentence order) wins, so
    the result is deterministic. When nothing overlaps the query, the answer
    is empty and nothing is cited.

    Known limits of this baseline: matching is ASCII ``[a-z0-9]+`` only, the
    splitter is purely punctuation based (abbreviations are over-split and
    embedded newlines are kept), and there is no stopword handling, so a
    sentence echoing the question's function words can win.
    """

    GENERATE_BACKENDS = {
        "extractive": "Extractive sentence selection (pure Python, no LLM)",
    }

    PROPERTY_MAPPING = {
        BaseGenerateConnector.GENERATE_BACKEND: {"explanation": "Use 'extractive' for no-LLM sentence extraction"},
        BaseGenerateConnector.QUERY_TEXT: {"explanation": "The question to answer"},
        BaseGenerateConnector.PASSAGES: {"explanation": "Supporting passages: a list of {doc_id, text} dicts"},
    }

    @classmethod
    def _generate(cls, query: str, passages: List[Dict[str, Any]]) -> Tuple[str, List[str]]:
        """Return the best-overlapping sentence and its passage id, or ``("", [])``."""
        wanted = tokenize(query)

        def _candidates() -> Any:
            # Yields (overlap, sentence, doc_id) in passage-then-sentence order.
            for position, passage in enumerate(passages):
                source_id = str(passage.get("doc_id", str(position)))
                body = str(passage.get("text", ""))
                for fragment in SENTENCE_RE.findall(body):
                    sentence = fragment.strip()
                    if sentence:
                        yield len(wanted & tokenize(sentence)), sentence, source_id

        # ``max`` returns the first maximal item, so the earliest sentence
        # wins a tie; the default covers the no-sentence case.
        overlap, sentence, source_id = max(_candidates(), key=lambda item: item[0], default=(0, "", ""))
        if overlap == 0:
            return "", []
        return sentence, [source_id]
# Constant lead-in placed before the selected sentences. Everything after it is
# copied verbatim from the passages, which keeps the answer grounded.
_TEMPLATE_PREFIX = "Based on the retrieved passages: "


class TemplateResponder(BaseGenerateConnector):
    """Multi-sentence templated backend (``generate_backend="template"``).

    Each passage is cut into sentences with ``SENTENCE_RE`` and each sentence
    is scored by its count of distinct shared query tokens (via ``tokenize``).
    The ``MAX_SENTENCES`` best sentences with a non-zero score are appended,
    best first, to a fixed lead-in, and every passage that supplied one of
    them is cited.

    Equal scores are ordered by passage position and then sentence position,
    which makes the selection, the answer text and the citation order
    deterministic. With no overlapping sentence the answer is empty and
    nothing is cited.

    Known limits: ASCII-only token matching, punctuation-based sentence
    splitting, and no stopword handling.
    """

    # Upper bound on the number of sentences joined into one answer.
    MAX_SENTENCES = 3

    GENERATE_BACKENDS = {
        "template": "Top-N sentence templating with multi-passage citation (pure Python, no LLM)",
    }

    PROPERTY_MAPPING = {
        BaseGenerateConnector.GENERATE_BACKEND: {"explanation": "Use 'template' for multi-sentence templated answers"},
        BaseGenerateConnector.QUERY_TEXT: {"explanation": "The question to answer"},
        BaseGenerateConnector.PASSAGES: {"explanation": "Supporting passages: a list of {doc_id, text} dicts"},
    }

    @classmethod
    def _generate(cls, query: str, passages: List[Dict[str, Any]]) -> Tuple[str, List[str]]:
        """Return the templated answer and its cited passage ids, or ``("", [])``."""
        wanted = tokenize(query)

        # One (-overlap, passage_pos, sentence_pos, sentence, doc_id) record
        # per sentence that shares a query token. Negating the overlap lets a
        # plain ascending sort put the best sentence first.
        hits: List[Tuple[int, int, int, str, str]] = []
        for passage_pos, passage in enumerate(passages):
            source_id = str(passage.get("doc_id", str(passage_pos)))
            fragments = SENTENCE_RE.findall(str(passage.get("text", "")))
            for sentence_pos, fragment in enumerate(fragments):
                sentence = fragment.strip()
                if not sentence:
                    continue
                overlap = len(wanted & tokenize(sentence))
                if overlap > 0:
                    hits.append((-overlap, passage_pos, sentence_pos, sentence, source_id))

        if not hits:
            return "", []

        # (passage_pos, sentence_pos) is unique per record, so the tuple order
        # is settled by the first three fields and never compares the text.
        chosen = sorted(hits)[: cls.MAX_SENTENCES]

        answer = _TEMPLATE_PREFIX + " ".join(record[3] for record in chosen)
        # Cite each contributing passage once, in order of first appearance.
        citations: List[str] = []
        for record in chosen:
            if record[4] not in citations:
                citations.append(record[4])
        return answer, citations
class AdjacencyGraphRag(BaseGraphRagConnector):
    """Graph-expansion retrieval walked over a plain dict-of-sets (``graph_backend="adjacency"``).

    Each node is scored as ``lexical_overlap(node) + 0.5 * (relevant neighbours)``.
    A relevant neighbour is a directly connected node whose text shares at
    least one token with the query. Edges are treated as undirected, so each
    one is recorded in both directions. Equal scores are ordered by node
    index, which keeps the ranking deterministic. The formula mirrors the one
    documented for :class:`NetworkxGraphRag`; only the graph engine differs.
    """

    GRAPH_BACKENDS = {
        "adjacency": "Graph-expansion retrieval over a hand-built adjacency map (no networkx)",
    }

    PROPERTY_MAPPING = {
        BaseGraphRagConnector.GRAPH_BACKEND: {
            "explanation": "Use 'adjacency' for graph-expansion retrieval (no networkx)"
        },
        BaseGraphRagConnector.QUERY_TEXT: {"explanation": "Raw text query to search the graph"},
        BaseGraphRagConnector.TOP_K: {
            "explanation": f"Number of passages to return (default {BaseGraphRagConnector.DEFAULT_TOP_K})"
        },
        BaseGraphRagConnector.NODES: {"explanation": "Graph nodes: a list of {doc_id, text} dicts"},
        BaseGraphRagConnector.EDGES: {
            "explanation": "Graph edges: a list of [doc_id_a, doc_id_b] pairs."
            " Optional: omitting it degrades scoring to lexical-only (no neighbour bonus)"
        },
    }

    @staticmethod
    def _tokenize(text: str) -> set[str]:
        """Return the distinct lowercase tokens matched by ``_TOKEN_RE`` in ``text``."""
        lowered = text.lower()
        return set(_TOKEN_RE.findall(lowered))

    @classmethod
    def _rank(cls, query: str, texts: List[str], edges: List[Tuple[int, int]], top_k: int) -> List[Tuple[int, float]]:
        """Score every node and return the ``top_k`` best ``(node_index, score)`` pairs.

        ``edges`` are node-index pairs. Each pair is wired both ways so the
        graph behaves as undirected.
        """
        # One neighbour set per node index; repeated edges collapse naturally.
        neighbours_of: dict[int, set[int]] = {index: set() for index, _ in enumerate(texts)}
        for left, right in edges:
            neighbours_of[left].add(right)
            neighbours_of[right].add(left)

        wanted = cls._tokenize(query)
        overlap = [len(wanted & cls._tokenize(text)) for text in texts]
        # Nodes that match the query themselves; only these earn a bonus for
        # the nodes adjacent to them.
        relevant = {index for index, hits in enumerate(overlap) if hits > 0}

        results: List[Tuple[int, float]] = [
            (index, float(overlap[index]) + _NEIGHBOUR_BONUS * len(neighbours_of[index] & relevant))
            for index in range(len(texts))
        ]

        # Best score first, lower node index winning ties.
        ordered = sorted(results, key=lambda entry: (-entry[1], entry[0]))
        return ordered[:top_k]
class BaseGraphRagConnector(FeatureGroup):
    """Shared root FeatureGroup behind every graph-RAG backend.

    A concrete backend registers its selector value in ``GRAPH_BACKENDS`` and
    implements :meth:`_rank`. This base handles everything around it: reading
    options, normalising edges, clamping ``top_k``, validating the returned
    ranking, and assembling the passage dicts. A backend is selected by
    :meth:`match_feature_group_criteria`, which requires
    ``graph_backend in cls.GRAPH_BACKENDS``.
    """

    ROOT_FEATURE_NAME = "graph_passages"

    # Names of the keys read from ``Options``.
    GRAPH_BACKEND = "graph_backend"
    QUERY_TEXT = "query_text"
    TOP_K = "top_k"
    NODES = "nodes"
    EDGES = "edges"

    DEFAULT_TOP_K = 5

    # Empty here so the base never matches; each concrete fills in its own.
    GRAPH_BACKENDS: Dict[str, str] = {}

    # Documentation of the options only. Backend selection happens in
    # ``match_feature_group_criteria`` (not the FeatureChainParser).
    PROPERTY_MAPPING = {
        GRAPH_BACKEND: {"explanation": "Which graph-RAG backend to use"},
        QUERY_TEXT: {"explanation": "Raw text query to search the graph"},
        TOP_K: {"explanation": f"Number of passages to return (default {DEFAULT_TOP_K})"},
        NODES: {"explanation": "Graph nodes: a list of {doc_id, text} dicts"},
        EDGES: {
            "explanation": "Graph edges: a list of [doc_id_a, doc_id_b] pairs."
            " Optional: omitting it degrades scoring to lexical-only (no neighbour bonus)"
        },
    }

    @classmethod
    def compute_framework_rule(cls) -> Optional[Set[Type[ComputeFramework]]]:
        """Restrict the family to the python-dict compute framework."""
        return {PythonDictFramework}

    @classmethod
    def input_data(cls) -> DataCreator:
        """Declare the root feature as data this group creates itself."""
        return DataCreator({cls.ROOT_FEATURE_NAME})

    @classmethod
    def match_feature_group_criteria(
        cls,
        feature_name: Union[FeatureName, str],
        options: Options,
        data_access_collection: Any = None,
    ) -> bool:
        """Accept only the root feature name combined with a backend this class declares."""
        if str(feature_name) == cls.ROOT_FEATURE_NAME:
            return options.get(cls.GRAPH_BACKEND) in cls.GRAPH_BACKENDS
        return False

    def input_features(self, options: Options, feature_name: FeatureName) -> None:
        """Root feature: there are no input features (the graph arrives via Options)."""
        return None

    @classmethod
    def _get_top_k(cls, options: Options) -> int:
        """Read ``top_k`` from the options, using ``DEFAULT_TOP_K`` when it is unset."""
        requested = options.get(cls.TOP_K)
        if requested is None:
            return cls.DEFAULT_TOP_K
        return int(requested)

    @classmethod
    def _get_nodes(cls, options: Options) -> List[Dict[str, Any]]:
        """Return the node list from the options; raise ``ValueError`` when it is absent."""
        supplied = options.get(cls.NODES)
        if supplied is not None:
            return list(supplied)
        raise ValueError(f"{cls.__name__} requires '{cls.NODES}' in options: a list of {{doc_id, text}} dicts.")

    @classmethod
    def _resolve_edges(cls, options: Options) -> List[Tuple[str, str]]:
        """Normalise the optional ``EDGES`` option into ``(doc_id_a, doc_id_b)`` string pairs.

        A missing option yields an empty list, which leaves scoring
        lexical-only. A value that is not a list or tuple raises
        ``ValueError``. Elements that are not two-item lists/tuples are
        ignored, and so are self-loops (both ends equal after ``str()``).
        """
        supplied = options.get(cls.EDGES)
        if supplied is None:
            return []
        if not isinstance(supplied, (list, tuple)):
            raise ValueError(
                f"{cls.__name__} '{cls.EDGES}' must be a list of [doc_id_a, doc_id_b] pairs, "
                f"got {type(supplied).__name__}."
            )

        pairs: List[Tuple[str, str]] = []
        for candidate in supplied:
            # Only genuine two-item sequences count: a two-character string
            # must not become an edge, and len() must not be called on a
            # non-sequence.
            if isinstance(candidate, (list, tuple)) and len(candidate) == 2:
                left, right = str(candidate[0]), str(candidate[1])
                if left != right:
                    pairs.append((left, right))
        return pairs

    @classmethod
    @abstractmethod
    def _rank(cls, query: str, texts: List[str], edges: List[Tuple[int, int]], top_k: int) -> List[Tuple[int, float]]:
        """Rank nodes against the query using the graph structure.

        ``edges`` holds node-index pairs already mapped from doc_ids. The
        result is up to ``top_k`` ``(node_index, score)`` pairs, best first.
        Indices must be in range and unique; the base checks both. The base
        passes ``top_k`` clamped to ``1 <= top_k <= len(texts)`` and does not
        re-sort the result, so best-first ordering is mandatory.
        """
        ...

    @classmethod
    def _validate_ranking(cls, ranked: List[Tuple[int, float]], n_nodes: int) -> None:
        """Raise ``ValueError`` for an out-of-range or repeated index in a backend ranking."""
        already_used: Set[int] = set()
        for position, _unused_score in ranked:
            in_range = 0 <= position < n_nodes
            if not in_range:
                raise ValueError(f"{cls.__name__}._rank returned out-of-range index {position} for {n_nodes} nodes.")
            if position in already_used:
                raise ValueError(f"{cls.__name__}._rank returned duplicate index {position}.")
            already_used.add(position)

    @classmethod
    def _retrieve(
        cls,
        query: str,
        nodes: List[Dict[str, Any]],
        edges: List[Tuple[str, str]],
        top_k: int,
    ) -> List[Dict[str, Any]]:
        """Wrap the backend's :meth:`_rank` in the ranked-passage contract.

        ``edges`` are doc_id pairs as produced by :meth:`_resolve_edges`.
        Node ids are compared after ``str()`` coercion, and a node without a
        ``doc_id`` is identified by its position. A repeated id raises
        ``ValueError`` because edges could no longer be attributed to one
        node. Edges that name an id outside ``nodes`` are dropped. An empty
        node list or a non-positive effective ``top_k`` returns ``[]``
        without calling the backend.
        """
        if not nodes:
            return []
        limit = min(top_k, len(nodes))
        if limit <= 0:
            return []

        texts = [str(node.get("text", "")) for node in nodes]
        doc_ids = [str(node.get("doc_id", str(position))) for position, node in enumerate(nodes)]

        index_of: Dict[str, int] = {}
        for position, identifier in enumerate(doc_ids):
            if identifier in index_of:
                raise ValueError(f"{cls.__name__}: duplicate doc_id '{identifier}': edges would be ambiguous.")
            index_of[identifier] = position

        # Translate doc_id pairs to index pairs, keeping only edges whose two
        # ends are both known nodes.
        index_edges: List[Tuple[int, int]] = []
        for left, right in edges:
            if left in index_of and right in index_of:
                index_edges.append((index_of[left], index_of[right]))

        ranking = cls._rank(query, texts, index_edges, limit)
        cls._validate_ranking(ranking, len(nodes))

        return [
            {"doc_id": doc_ids[position], "text": texts[position], "score": float(score), "rank": place}
            for place, (position, score) in enumerate(ranking)
        ]

    @classmethod
    def calculate_feature(cls, data: Any, features: FeatureSet) -> List[Dict[str, Any]]:
        """Produce the single-row result for the first feature in ``features``.

        Raises ``ValueError`` when the query option is missing. Returns ``[]``
        when ``features`` holds no feature at all.
        """
        for feature in features.features:
            opts = feature.options
            raw_query = opts.get(cls.QUERY_TEXT)
            if raw_query is None:
                raise ValueError(f"{cls.__name__} requires '{cls.QUERY_TEXT}' in options.")
            node_list = cls._get_nodes(opts)
            edge_pairs = cls._resolve_edges(opts)
            limit = cls._get_top_k(opts)
            result = cls._retrieve(str(raw_query), node_list, edge_pairs, limit)
            # Only the first feature is evaluated; its options drive the row.
            return [{cls.ROOT_FEATURE_NAME: result}]
        return []
class NetworkxGraphRag(BaseGraphRagConnector):
    """Graph-expansion retrieval built on networkx (``graph_backend="networkx"``).

    Each node is scored as ``lexical_overlap(node) + 0.5 * (relevant neighbours)``.
    A neighbour is relevant when its own text shares at least one token with
    the query. Equal scores are ordered by node index, so the ranking is
    deterministic.
    """

    GRAPH_BACKENDS = {
        "networkx": "Graph-expansion retrieval over networkx",
    }

    PROPERTY_MAPPING = {
        BaseGraphRagConnector.GRAPH_BACKEND: {"explanation": "Use 'networkx' for graph-expansion retrieval"},
        BaseGraphRagConnector.QUERY_TEXT: {"explanation": "Raw text query to search the graph"},
        BaseGraphRagConnector.TOP_K: {
            "explanation": f"Number of passages to return (default {BaseGraphRagConnector.DEFAULT_TOP_K})"
        },
        BaseGraphRagConnector.NODES: {"explanation": "Graph nodes: a list of {doc_id, text} dicts"},
        BaseGraphRagConnector.EDGES: {
            "explanation": "Graph edges: a list of [doc_id_a, doc_id_b] pairs."
            " Optional: omitting it degrades scoring to lexical-only (no neighbour bonus)"
        },
    }

    @staticmethod
    def _tokenize(text: str) -> set[str]:
        """Return the distinct lowercase tokens matched by ``_TOKEN_RE`` in ``text``."""
        lowered = text.lower()
        return set(_TOKEN_RE.findall(lowered))

    @classmethod
    def _rank(cls, query: str, texts: List[str], edges: List[Tuple[int, int]], top_k: int) -> List[Tuple[int, float]]:
        """Score every node and return the ``top_k`` best ``(node_index, score)`` pairs."""
        # Imported lazily so the module can be loaded without networkx installed.
        import networkx as nx

        node_count = len(texts)
        graph: Any = nx.Graph()
        # Register every node up front so isolated nodes are still scored.
        graph.add_nodes_from(range(node_count))
        graph.add_edges_from(edges)

        wanted = cls._tokenize(query)
        overlap = [len(wanted & cls._tokenize(text)) for text in texts]
        relevant = {index for index, hits in enumerate(overlap) if hits > 0}

        results: List[Tuple[int, float]] = []
        for index in range(node_count):
            # Count the adjacent nodes that themselves match the query.
            bonus_count = sum(1 for other in graph.neighbors(index) if other in relevant)
            results.append((index, float(overlap[index]) + _NEIGHBOUR_BONUS * bonus_count))

        # Best score first, lower node index winning ties.
        ordered = sorted(results, key=lambda entry: (-entry[1], entry[0]))
        return ordered[:top_k]
__future__ import annotations + +from rag_integration.feature_groups.connectors.orchestrator.base import BaseOrchestratorConnector +from rag_integration.feature_groups.connectors.orchestrator.haystack_orchestrator import HaystackOrchestrator +from rag_integration.feature_groups.connectors.orchestrator.r2r_fixture_orchestrator import R2RFixtureOrchestrator + +__all__ = ["BaseOrchestratorConnector", "HaystackOrchestrator", "R2RFixtureOrchestrator"] diff --git a/rag_integration/feature_groups/connectors/orchestrator/base.py b/rag_integration/feature_groups/connectors/orchestrator/base.py new file mode 100644 index 0000000..8e41d84 --- /dev/null +++ b/rag_integration/feature_groups/connectors/orchestrator/base.py @@ -0,0 +1,168 @@ +"""Base class for the ``orchestrator`` connector family. + +Contract: ``query_text + corpus + top_k -> answer + documents`` (internals opaque). + +An orchestrator connector wraps a whole external RAG framework (LlamaIndex, +Haystack, txtai, ...) as a single connector: you hand it a query and a corpus, +it runs the framework's own pipeline, and you get an answer plus the documents +the pipeline surfaced. Unlike the retrieve/rerank/generate families, the +internals are the framework's, not ours; this family is about the *integration +surface* (bring your existing pipeline), not the algorithm. + +It is a ROOT FeatureGroup: the corpus is passed inline through ``Options`` and +the framework runs fully in-memory, so the family is self-contained and +contract-testable without a server. + +Output (single row, keyed by the root feature name):: + + {"orchestrated_answer": {"answer": "...", "documents": [{"doc_id": ..., "text": ..., "score": ...}, ...]}} + +The base owns option extraction, single-row assembly, and validation that every +returned document came from the supplied corpus (no fabricated sources). A +backend implements only :meth:`_run` (driving its framework's pipeline). 
class BaseOrchestratorConnector(FeatureGroup):
    """Root FeatureGroup for orchestrator connector backends.

    A concrete backend declares its selector value in ``ORCHESTRATOR_BACKENDS``
    and implements :meth:`_run`; selection is via
    :meth:`match_feature_group_criteria`, gating on
    ``orchestrator_backend in cls.ORCHESTRATOR_BACKENDS``. The base reads the
    options, rejects duplicate corpus ids, checks that every surfaced document
    names a corpus id, and assembles the single-row answer object.
    """

    ROOT_FEATURE_NAME = "orchestrated_answer"

    # Option keys.
    ORCHESTRATOR_BACKEND = "orchestrator_backend"
    QUERY_TEXT = "query_text"
    TOP_K = "top_k"
    CORPUS = "corpus"

    DEFAULT_TOP_K = 5

    # Filled per concrete; empty on the base so it never matches.
    ORCHESTRATOR_BACKENDS: Dict[str, str] = {}

    PROPERTY_MAPPING = {
        ORCHESTRATOR_BACKEND: {"explanation": "Which orchestrator (external framework) backend to use"},
        QUERY_TEXT: {"explanation": "The query to run through the framework pipeline"},
        TOP_K: {"explanation": f"Number of documents the pipeline should surface (default {DEFAULT_TOP_K})"},
        CORPUS: {"explanation": "Inline corpus: a list of {doc_id, text} dicts"},
    }

    @classmethod
    def compute_framework_rule(cls) -> Optional[Set[Type[ComputeFramework]]]:
        """Restrict the family to the python-dict compute framework."""
        return {PythonDictFramework}

    @classmethod
    def input_data(cls) -> DataCreator:
        """Declare the root feature as data this group creates itself."""
        return DataCreator({cls.ROOT_FEATURE_NAME})

    @classmethod
    def match_feature_group_criteria(
        cls,
        feature_name: Union[FeatureName, str],
        options: Options,
        data_access_collection: Any = None,
    ) -> bool:
        """Match the root feature name only for a backend this concrete declares."""
        if str(feature_name) != cls.ROOT_FEATURE_NAME:
            return False
        backend = options.get(cls.ORCHESTRATOR_BACKEND)
        return backend in cls.ORCHESTRATOR_BACKENDS

    def input_features(self, options: Options, feature_name: FeatureName) -> None:
        """Root feature: no input features (the corpus arrives via Options)."""
        return None

    @classmethod
    def _get_top_k(cls, options: Options) -> int:
        """Read ``top_k`` from the options, using ``DEFAULT_TOP_K`` when it is unset."""
        val = options.get(cls.TOP_K)
        return int(val) if val is not None else cls.DEFAULT_TOP_K

    @classmethod
    def _get_corpus(cls, options: Options) -> List[Dict[str, Any]]:
        """Return the corpus list from the options; raise ``ValueError`` when it is absent."""
        corpus = options.get(cls.CORPUS)
        if corpus is None:
            raise ValueError(f"{cls.__name__} requires '{cls.CORPUS}' in options: a list of {{doc_id, text}} dicts.")
        return list(corpus)

    @classmethod
    @abstractmethod
    def _run(cls, query: str, corpus: List[Dict[str, Any]], top_k: int) -> Tuple[str, List[Dict[str, Any]]]:
        """Run the framework's pipeline for ``query`` over ``corpus``.

        Returns ``(answer, documents)`` where ``documents`` is a list of
        ``{doc_id, text, score}`` dicts the pipeline surfaced (best first) and
        ``answer`` is the framework's answer. Each document must carry a
        ``doc_id`` that is one of the supplied corpus ids (the base validates
        this). The base short-circuits an empty corpus before dispatching, so
        :meth:`_run` is never called with one. ``top_k`` is passed through as
        read from the options; the base does not clamp it.
        """
        ...

    @classmethod
    def _validate_unique_doc_ids(cls, corpus: List[Dict[str, Any]]) -> None:
        """Reject duplicate effective doc_ids, uniformly across backends.

        An entry without ``doc_id`` defaults to its positional index, so an
        explicit ``doc_id`` ``"1"`` collides with a missing ``doc_id`` at
        index 1; the check runs on the effective ids.

        Raises:
            ValueError: if two corpus entries share an effective doc_id.
        """
        seen: Set[str] = set()
        for i, doc in enumerate(corpus):
            doc_id = str(doc.get("doc_id", str(i)))
            if doc_id in seen:
                raise ValueError(f"{cls.__name__}: duplicate doc_id {doc_id!r} in corpus; ids must be unique.")
            seen.add(doc_id)

    @classmethod
    def _validate_documents(cls, documents: List[Dict[str, Any]], corpus: List[Dict[str, Any]]) -> None:
        """Reject any surfaced document whose doc_id is missing or not in the supplied corpus.

        Ids are compared after ``str()`` coercion, using the same effective-id
        rule as :meth:`_validate_unique_doc_ids` for corpus entries.

        Raises:
            ValueError: if a document has no ``doc_id`` key, or its id does not
                name a corpus entry.
        """
        known = {str(doc.get("doc_id", str(i))) for i, doc in enumerate(corpus)}
        for document in documents:
            # The key must be present: coercing an absent id would yield the
            # string "None", which would wrongly match a corpus entry whose
            # effective id is "None" and let an unattributed document through.
            if "doc_id" not in document or str(document["doc_id"]) not in known:
                raise ValueError(
                    f"{cls.__name__}._run surfaced document {document.get('doc_id')!r}, "
                    f"which is not in the supplied corpus."
                )

    @classmethod
    def _answer(cls, query: str, corpus: List[Dict[str, Any]], top_k: int) -> Dict[str, Any]:
        """Assemble the answer contract around the backend's :meth:`_run`.

        An empty corpus returns an empty answer object without calling the
        backend.

        Raises:
            ValueError: on duplicate corpus ids, on a surfaced document that is
                not in the corpus, or on a non-empty answer with no documents.
        """
        if not corpus:
            return {"answer": "", "documents": []}
        cls._validate_unique_doc_ids(corpus)
        answer, documents = cls._run(query, corpus, top_k)
        cls._validate_documents(documents, corpus)
        # A non-empty answer must rest on surfaced documents. An empty answer
        # with no documents is a valid retrieve-only / no-match result; an empty
        # answer alongside documents is fine too (retrieve-only pipeline).
        if answer.strip() and not documents:
            raise ValueError(f"{cls.__name__}._run returned a non-empty answer with no supporting documents.")
        return {"answer": answer, "documents": documents}

    @classmethod
    def calculate_feature(cls, data: Any, features: FeatureSet) -> List[Dict[str, Any]]:
        """Run the framework pipeline for the first feature and return the answer object.

        Raises ``ValueError`` when the query option is missing. Returns ``[]``
        when ``features`` holds no feature at all.
        """
        for feature in features.features:
            options = feature.options
            query = options.get(cls.QUERY_TEXT)
            if query is None:
                raise ValueError(f"{cls.__name__} requires '{cls.QUERY_TEXT}' in options.")
            corpus = cls._get_corpus(options)
            top_k = cls._get_top_k(options)
            # Only the first feature is evaluated; its options drive the row.
            return [{cls.ROOT_FEATURE_NAME: cls._answer(str(query), corpus, top_k)}]
        return []
class HaystackOrchestrator(BaseOrchestratorConnector):
    """Whole-pipeline retrieval through Haystack (``orchestrator_backend="haystack"``).

    The corpus is written into an in-memory document store and queried with a
    BM25 retrieval pipeline. No LLM is involved: the answer is the content of
    the top-ranked document, and each surfaced document carries the score the
    pipeline assigned to it.
    """

    ORCHESTRATOR_BACKENDS = {
        "haystack": "Haystack 2.x in-memory BM25 pipeline",
    }

    PROPERTY_MAPPING = {
        BaseOrchestratorConnector.ORCHESTRATOR_BACKEND: {"explanation": "Use 'haystack' for a Haystack BM25 pipeline"},
        BaseOrchestratorConnector.QUERY_TEXT: {"explanation": "The query to run through the pipeline"},
        BaseOrchestratorConnector.TOP_K: {
            "explanation": f"Number of documents to surface (default {BaseOrchestratorConnector.DEFAULT_TOP_K})"
        },
        BaseOrchestratorConnector.CORPUS: {"explanation": "Inline corpus: a list of {doc_id, text} dicts"},
    }

    @classmethod
    def _run(cls, query: str, corpus: List[Dict[str, Any]], top_k: int) -> Tuple[str, List[Dict[str, Any]]]:
        """Retrieve up to ``top_k`` corpus documents for ``query`` with Haystack BM25.

        Returns ``(answer, documents)``. The result is ``("", [])`` when there
        is nothing to rank: a blank query, a corpus whose texts are all blank,
        or a non-positive effective ``top_k``.
        """
        # Telemetry has to be switched off before haystack is first imported,
        # so the opt-out is set ahead of the lazy imports below. setdefault
        # leaves a value the caller already exported untouched.
        os.environ.setdefault("HAYSTACK_TELEMETRY_ENABLED", "False")

        from haystack import Document, Pipeline
        from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
        from haystack.document_stores.in_memory import InMemoryDocumentStore

        # (doc_id, text) pairs; a missing doc_id falls back to the position.
        pairs = [(str(item.get("doc_id", str(position))), str(item.get("text", ""))) for position, item in enumerate(corpus)]
        limit = min(top_k, len(pairs))

        # Bail out early instead of surfacing a framework error when nothing
        # is rankable.
        if not query.strip():
            return "", []
        if not any(content.strip() for _, content in pairs):
            return "", []
        if limit <= 0:
            return "", []

        document_store = InMemoryDocumentStore()
        document_store.write_documents([Document(id=identifier, content=content) for identifier, content in pairs])

        retriever = InMemoryBM25Retriever(document_store=document_store, top_k=limit)
        flow = Pipeline()
        flow.add_component("retriever", retriever)
        outcome = flow.run({"retriever": {"query": query}})

        surfaced: List[Dict[str, Any]] = []
        for hit in outcome["retriever"]["documents"]:
            # The score is converted unconditionally, which relies on the
            # retriever attaching a numeric score to every returned document.
            surfaced.append({"doc_id": hit.id, "text": hit.content, "score": float(hit.score)})

        if surfaced:
            return surfaced[0]["text"], surfaced
        return "", surfaced
class R2RFixtureOrchestrator(BaseOrchestratorConnector):
    """Fixture-backed stand-in for an R2R server (``orchestrator_backend="r2r"``).

    Responses come from a bundled JSON file of canned R2R-style results and
    are narrowed to the documents present in the supplied corpus. The fixture
    is read once and kept in a class-level cache guarded by a lock.
    """

    ORCHESTRATOR_BACKENDS = {
        "r2r": "R2R-shaped server stub over a static JSON fixture (honest-surface narrowing)",
    }

    PROPERTY_MAPPING = {
        BaseOrchestratorConnector.ORCHESTRATOR_BACKEND: {"explanation": "Use 'r2r' for the R2R fixture-stub pipeline"},
        BaseOrchestratorConnector.QUERY_TEXT: {"explanation": "The query to look up in the canned R2R responses"},
        BaseOrchestratorConnector.TOP_K: {
            "explanation": f"Number of documents to surface (default {BaseOrchestratorConnector.DEFAULT_TOP_K})"
        },
        BaseOrchestratorConnector.CORPUS: {"explanation": "Inline corpus (the documents ingested into R2R)"},
    }

    # Parsed "responses" table from the fixture; None until first use.
    _responses: Dict[str, Any] | None = None
    # Serialises the first load so the fixture is parsed once.
    _cache_lock = threading.Lock()

    @classmethod
    def _get_responses(cls) -> Dict[str, Any]:
        """Return the canned-response table, loading the fixture on first use.

        The returned dict is the shared cache itself, so callers must only
        read from it.

        Raises:
            RuntimeError: if the fixture file cannot be opened or is not
                valid JSON.
        """
        table = cls._responses
        if table is None:
            with cls._cache_lock:
                # Re-check under the lock: another thread may have finished
                # loading while this one was waiting.
                if cls._responses is None:
                    try:
                        with _FIXTURE_PATH.open(encoding="utf-8") as handle:
                            parsed = json.load(handle)
                    except (OSError, json.JSONDecodeError) as exc:
                        raise RuntimeError(
                            f"{cls.__name__}: failed to load bundled R2R fixture {_FIXTURE_PATH}: {exc}"
                        ) from exc
                    cls._responses = dict(parsed.get("responses", {}))
                table = cls._responses
        return table

    @classmethod
    def _run(cls, query: str, corpus: List[Dict[str, Any]], top_k: int) -> Tuple[str, List[Dict[str, Any]]]:
        """Look up the canned response for ``query`` and narrow it to ``corpus``.

        The lookup key is the query stripped and lowercased. Returns
        ``("", [])`` for a blank query, a non-positive effective ``top_k``, an
        unknown query, or when none of the canned documents is in the corpus.
        Otherwise returns up to ``top_k`` documents built from the corpus's
        own text. The canned answer is included only when its
        ``answer_doc_id`` is among the returned documents.
        """
        limit = min(top_k, len(corpus))
        if limit <= 0 or not query.strip():
            return "", []

        canned = cls._get_responses().get(query.strip().lower())
        if canned is None:
            # Nothing is recorded for this query, so nothing is surfaced.
            return "", []

        # Effective doc_id -> corpus text; a missing doc_id is the position.
        corpus_text = {str(item.get("doc_id", str(position))): str(item.get("text", "")) for position, item in enumerate(corpus)}

        # Keep canned entries in fixture order, but only those whose id is in
        # the corpus, and always with the corpus's text rather than the
        # fixture's. Stop as soon as the limit is reached.
        surfaced: List[Dict[str, Any]] = []
        for record in canned.get("documents", []):
            identifier = str(record.get("doc_id"))
            if identifier not in corpus_text:
                continue
            surfaced.append(
                {"doc_id": identifier, "text": corpus_text[identifier], "score": float(record.get("score", 0.0))}
            )
            if len(surfaced) >= limit:
                break

        # With no surviving document there is nothing to support an answer.
        if not surfaced:
            return "", []

        # The answer is tied to one source document. If narrowing or the
        # top_k cut removed that document, return the documents alone.
        source_id = str(canned.get("answer_doc_id", ""))
        if source_id in {entry["doc_id"] for entry in surfaced}:
            return str(canned.get("answer", "")), surfaced
        return "", surfaced
+ +Output (single row, keyed by the root feature name):: + + {"reranked_passages": [{"doc_id": ..., "text": ..., "score": ..., "rank": ...}, ...]} + +``score`` is the rerank score (higher is more relevant); ``rank`` is 0-based, +ascending, best first. + +This mirrors the ``retrieve`` family by design (selector-gated matching, the +``_rank`` hoist, base-side validation of returned indices). It deliberately +copies that pattern rather than subclassing ``BaseRetrieveConnector``: the +input is ``candidates`` not ``corpus``, and keeping the families decoupled lets +each evolve its own contract. +""" + +from __future__ import annotations + +from abc import abstractmethod +from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union + +from mloda.provider import DataCreator, FeatureGroup, ComputeFramework, FeatureSet +from mloda.user import Options, FeatureName +from mloda_plugins.compute_framework.base_implementations.python_dict.python_dict_framework import ( + PythonDictFramework, +) + + +class BaseRerankConnector(FeatureGroup): + """Root FeatureGroup for rerank-connector backends. + + A concrete backend declares its selector value in ``RERANK_BACKENDS`` and + implements :meth:`_rank` (the only per-backend logic); the base owns the + empty-input / ``top_k`` clamping, the passage-assembly contract, and the + validation of returned indices. Selection is done entirely by + :meth:`match_feature_group_criteria`, gating on + ``rerank_backend in cls.RERANK_BACKENDS``; disjoint selector values keep + backends mutually exclusive. The base keeps ``RERANK_BACKENDS`` empty so it + never matches. + """ + + ROOT_FEATURE_NAME = "reranked_passages" + + # Option keys. + RERANK_BACKEND = "rerank_backend" + QUERY_TEXT = "query_text" + TOP_K = "top_k" + CANDIDATES = "candidates" + + DEFAULT_TOP_K = 5 + + # Filled per concrete: {backend_value: human-readable description}. Disjoint + # across backends; empty on the base so it never matches. 
+ RERANK_BACKENDS: Dict[str, str] = {} + + # Declarative option documentation only; selection is via + # ``match_feature_group_criteria`` (not the FeatureChainParser). + PROPERTY_MAPPING = { + RERANK_BACKEND: {"explanation": "Which rerank-connector backend to use"}, + QUERY_TEXT: {"explanation": "Query the candidates are reranked against"}, + TOP_K: {"explanation": f"Number of passages to return after reranking (default {DEFAULT_TOP_K})"}, + CANDIDATES: {"explanation": "Candidate passages to rerank: a list of {doc_id, text} dicts"}, + } + + @classmethod + def compute_framework_rule(cls) -> Optional[Set[Type[ComputeFramework]]]: + return {PythonDictFramework} + + @classmethod + def input_data(cls) -> DataCreator: + return DataCreator({cls.ROOT_FEATURE_NAME}) + + @classmethod + def match_feature_group_criteria( + cls, + feature_name: Union[FeatureName, str], + options: Options, + data_access_collection: Any = None, + ) -> bool: + """Match the root feature name only for a backend this concrete declares. + + Gating on ``rerank_backend`` keeps backends mutually exclusive; an + unknown backend matches nothing (honest surface). + """ + if str(feature_name) != cls.ROOT_FEATURE_NAME: + return False + backend = options.get(cls.RERANK_BACKEND) + return backend in cls.RERANK_BACKENDS + + def input_features(self, options: Options, feature_name: FeatureName) -> None: + """Root feature: no input features (candidates arrive via Options).""" + return None + + @classmethod + def _get_top_k(cls, options: Options) -> int: + val = options.get(cls.TOP_K) + return int(val) if val is not None else cls.DEFAULT_TOP_K + + @classmethod + def _get_candidates(cls, options: Options) -> List[Dict[str, Any]]: + candidates = options.get(cls.CANDIDATES) + if candidates is None: + raise ValueError( + f"{cls.__name__} requires '{cls.CANDIDATES}' in options: a list of {{doc_id, text}} dicts." 
+ ) + return list(candidates) + + @classmethod + @abstractmethod + def _rank(cls, query: str, texts: List[str], top_k: int) -> List[Tuple[int, float]]: + """Reorder ``texts`` by relevance to ``query``. + + Returns up to ``top_k`` ``(candidate_index, score)`` pairs, ordered + best-first, where ``score`` is higher-is-more-relevant. Indices must be + in range (``0 <= candidate_index < len(texts)``) and unique (validated + by the base). ``top_k`` is already clamped to ``1 <= top_k <= + len(texts)``. The base does not re-sort, so best-first is required. + """ + ... + + @classmethod + def _validate_ranking(cls, ranked: List[Tuple[int, float]], n_candidates: int) -> None: + """Reject out-of-range or duplicate indices from a backend's ``_rank``.""" + seen: Set[int] = set() + for idx, _score in ranked: + if not 0 <= idx < n_candidates: + raise ValueError( + f"{cls.__name__}._rank returned out-of-range index {idx} for {n_candidates} candidates." + ) + if idx in seen: + raise ValueError(f"{cls.__name__}._rank returned duplicate index {idx}.") + seen.add(idx) + + @classmethod + def _rerank( + cls, + query: str, + candidates: List[Dict[str, Any]], + top_k: int, + ) -> List[Dict[str, Any]]: + """Assemble the reranked-passage contract around the backend's :meth:`_rank`.""" + if not candidates: + return [] + effective_k = min(top_k, len(candidates)) + if effective_k <= 0: + return [] + + texts = [str(doc.get("text", "")) for doc in candidates] + doc_ids = [str(doc.get("doc_id", str(i))) for i, doc in enumerate(candidates)] + + ranked = cls._rank(query, texts, effective_k) + cls._validate_ranking(ranked, len(candidates)) + + passages: List[Dict[str, Any]] = [] + for rank, (idx, score) in enumerate(ranked): + passages.append({"doc_id": doc_ids[idx], "text": texts[idx], "score": float(score), "rank": rank}) + return passages + + @classmethod + def calculate_feature(cls, data: Any, features: FeatureSet) -> List[Dict[str, Any]]: + """Rerank the candidates against the query, return 
reordered passages.""" + for feature in features.features: + options = feature.options + query = options.get(cls.QUERY_TEXT) + if query is None: + raise ValueError(f"{cls.__name__} requires '{cls.QUERY_TEXT}' in options.") + candidates = cls._get_candidates(options) + top_k = cls._get_top_k(options) + passages = cls._rerank(str(query), candidates, top_k) + return [{cls.ROOT_FEATURE_NAME: passages}] + return [] diff --git a/rag_integration/feature_groups/connectors/rerank/flashrank_reranker.py b/rag_integration/feature_groups/connectors/rerank/flashrank_reranker.py new file mode 100644 index 0000000..95d4b04 --- /dev/null +++ b/rag_integration/feature_groups/connectors/rerank/flashrank_reranker.py @@ -0,0 +1,71 @@ +"""FlashRank cross-encoder reranker. + +Pedigree concrete for the ``rerank`` family: a real neural cross-encoder that +exercises the distinguishing semantics of reranking, kept light by FlashRank's +ONNX runtime (no torch). Behind the ``rerank`` extra. The model (~4 MB for the +default ``ms-marco-TinyBERT-L-2-v2``) downloads on first use and is cached, so +its contract test is skipped on CI (network) but runs locally; the zero-download +``LexicalReranker`` is the always-on CI anchor. +""" + +from __future__ import annotations + +import threading +from typing import Any, List, Tuple + +from rag_integration.feature_groups.connectors.rerank.base import BaseRerankConnector + + +class FlashRankReranker(BaseRerankConnector): + """Cross-encoder reranking via FlashRank (``rerank_backend="flashrank"``). + + The default model is ``ms-marco-TinyBERT-L-2-v2`` (~4 MB, Apache-2.0). The + ranker is cached at class level since constructing it loads the ONNX model; + loading is guarded by a lock so concurrent callers do not build it twice. + """ + + # Fixed to the small default model. A configurable model option is omitted + # deliberately: the base `_rank(query, texts, top_k)` contract passes no + # options, so advertising a model option here would be a surface lie. 
If a + # future need arises, plumb it through `calculate_feature`, not here. + DEFAULT_MODEL = "ms-marco-TinyBERT-L-2-v2" + + RERANK_BACKENDS = { + "flashrank": "Cross-encoder reranking (FlashRank, ONNX)", + } + + PROPERTY_MAPPING = { + BaseRerankConnector.RERANK_BACKEND: {"explanation": "Use 'flashrank' for cross-encoder reranking"}, + BaseRerankConnector.QUERY_TEXT: {"explanation": "Query the candidates are reranked against"}, + BaseRerankConnector.TOP_K: { + "explanation": f"Number of passages to return after reranking (default {BaseRerankConnector.DEFAULT_TOP_K})" + }, + BaseRerankConnector.CANDIDATES: {"explanation": "Candidate passages: a list of {doc_id, text} dicts"}, + } + + # Single-slot cache: only DEFAULT_MODEL is ever loaded. + _ranker: Any | None = None + _cache_lock = threading.Lock() + + @classmethod + def _get_ranker(cls) -> Any: + from flashrank import Ranker + + ranker = cls._ranker + if ranker is not None: + return ranker + with cls._cache_lock: + if cls._ranker is None: + cls._ranker = Ranker(model_name=cls.DEFAULT_MODEL) + return cls._ranker + + @classmethod + def _rank(cls, query: str, texts: List[str], top_k: int) -> List[Tuple[int, float]]: + from flashrank import RerankRequest + + ranker = cls._get_ranker() + # Use the candidate's list index as the passage id so results map back + # to positions regardless of how FlashRank reorders them. + passages = [{"id": str(idx), "text": text} for idx, text in enumerate(texts)] + ranked = ranker.rerank(RerankRequest(query=query, passages=passages)) + return [(int(item["id"]), float(item["score"])) for item in ranked[:top_k]] diff --git a/rag_integration/feature_groups/connectors/rerank/lexical_reranker.py b/rag_integration/feature_groups/connectors/rerank/lexical_reranker.py new file mode 100644 index 0000000..303d2f0 --- /dev/null +++ b/rag_integration/feature_groups/connectors/rerank/lexical_reranker.py @@ -0,0 +1,50 @@ +"""Lexical token-overlap reranker. 
+ +Canonical concrete for the ``rerank`` family: zero-download, zero-dependency +(pure Python stdlib), deterministic. Scores each candidate by the number of +query tokens it contains and reorders best-first. A cheap, honest reranker that +anchors the CI contract suite with no model, network, or third-party library. +""" + +from __future__ import annotations + +import re +from typing import List, Tuple + +from rag_integration.feature_groups.connectors.rerank.base import BaseRerankConnector + +_TOKEN_RE = re.compile(r"[a-z0-9]+") + + +class LexicalReranker(BaseRerankConnector): + """Token-overlap reranker (``rerank_backend="lexical"``). + + Score = number of distinct query tokens present in the candidate. Ties are + broken by the candidate's original position, so the ordering is stable and + deterministic. + """ + + RERANK_BACKENDS = { + "lexical": "Token-overlap lexical reranking (pure Python)", + } + + PROPERTY_MAPPING = { + BaseRerankConnector.RERANK_BACKEND: {"explanation": "Use 'lexical' for token-overlap reranking"}, + BaseRerankConnector.QUERY_TEXT: {"explanation": "Query the candidates are reranked against"}, + BaseRerankConnector.TOP_K: { + "explanation": f"Number of passages to return after reranking (default {BaseRerankConnector.DEFAULT_TOP_K})" + }, + BaseRerankConnector.CANDIDATES: {"explanation": "Candidate passages: a list of {doc_id, text} dicts"}, + } + + @staticmethod + def _tokenize(text: str) -> set[str]: + return set(_TOKEN_RE.findall(text.lower())) + + @classmethod + def _rank(cls, query: str, texts: List[str], top_k: int) -> List[Tuple[int, float]]: + query_tokens = cls._tokenize(query) + scored = [(idx, float(len(query_tokens & cls._tokenize(text)))) for idx, text in enumerate(texts)] + # Best score first; ties broken by original index for a stable order. 
+ scored.sort(key=lambda pair: (-pair[1], pair[0])) + return scored[:top_k] diff --git a/rag_integration/feature_groups/connectors/retrieve/__init__.py b/rag_integration/feature_groups/connectors/retrieve/__init__.py new file mode 100644 index 0000000..7b68a8f --- /dev/null +++ b/rag_integration/feature_groups/connectors/retrieve/__init__.py @@ -0,0 +1,9 @@ +"""The ``retrieve`` connector family: query + corpus -> ranked passages.""" + +from __future__ import annotations + +from rag_integration.feature_groups.connectors.retrieve.base import BaseRetrieveConnector +from rag_integration.feature_groups.connectors.retrieve.bm25s_retriever import Bm25sRetriever +from rag_integration.feature_groups.connectors.retrieve.tfidf_retriever import TfidfRetriever + +__all__ = ["BaseRetrieveConnector", "Bm25sRetriever", "TfidfRetriever"] diff --git a/rag_integration/feature_groups/connectors/retrieve/base.py b/rag_integration/feature_groups/connectors/retrieve/base.py new file mode 100644 index 0000000..9b0ad37 --- /dev/null +++ b/rag_integration/feature_groups/connectors/retrieve/base.py @@ -0,0 +1,262 @@ +"""Base class for the ``retrieve`` connector family. + +Contract: ``query_text + corpus + top_k -> ranked passages with scores``. + +A retrieve connector is a ROOT FeatureGroup (no input features): it takes an +inline corpus and a query through ``Options`` and returns the passages ranked +best-first. Concrete backends (lexical, dense, hybrid, late-interaction) differ +only in the ranking they apply behind this one contract; they declare their +selector value in ``RETRIEVE_BACKENDS`` and implement :meth:`_rank`. + +Output (single row, keyed by the root feature name):: + + {"retrieved_passages": [{"doc_id": ..., "text": ..., "score": ..., "rank": ...}, ...]} + +``score`` is higher-is-more-relevant; ``rank`` is 0-based, ascending, best +first. 
Backends return at most ``top_k`` passages and only those with a +positive score, so a degenerate query (empty, or sharing no terms with the +corpus) yields no passages. ``PythonDictFramework`` slices the result to the +requested feature, so the ranked-passage list is the whole contract. +""" + +from __future__ import annotations + +from abc import abstractmethod +from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union + +from mloda.provider import DataCreator, FeatureGroup, ComputeFramework, FeatureSet +from mloda.user import Options, FeatureName +from mloda_plugins.compute_framework.base_implementations.python_dict.python_dict_framework import ( + PythonDictFramework, +) + + +class BaseRetrieveConnector(FeatureGroup): + """Root FeatureGroup for retrieve-connector backends. + + A concrete backend declares its selector value in ``RETRIEVE_BACKENDS`` and + implements :meth:`_rank` (the only per-backend logic); the base owns the + empty-corpus / ``top_k`` clamping and the passage-assembly contract so every + backend returns an identically shaped result. + + Selection is unambiguous and done entirely by + :meth:`match_feature_group_criteria`, which gates on + ``retrieve_backend in cls.RETRIEVE_BACKENDS``. Because each backend declares + a disjoint selector value and mloda raises when more than one feature group + matches, at most one backend ever claims a given ``Options``. The base keeps + ``RETRIEVE_BACKENDS`` empty so it never matches. + + Reuse note for sibling families (rerank, generate, ...): the carry-over is + the *shape* (the ``RETRIEVE_BACKENDS`` selector dict, the + ``match_feature_group_criteria`` gating, and the ``_rank`` hoist), not this + class. The root/``DataCreator``/``_get_corpus`` triad below is + retrieve-specific: rerank consumes candidate passages as input features + rather than an inline corpus, so a sibling family copies this pattern, it + does not subclass ``BaseRetrieveConnector``. 
+ """ + + ROOT_FEATURE_NAME = "retrieved_passages" + + # Option keys. + RETRIEVE_BACKEND = "retrieve_backend" + QUERY_TEXT = "query_text" + TOP_K = "top_k" + CORPUS = "corpus" + + DEFAULT_TOP_K = 5 + + # Filled per concrete: {backend_value: human-readable description}. The base + # stays empty so it never matches a feature. Values must be disjoint across + # backends (see the class docstring). + RETRIEVE_BACKENDS: Dict[str, str] = {} + + # Declarative option documentation only. These root connector groups select + # by ``match_feature_group_criteria`` (not the FeatureChainParser), so the + # ``context``/``default``/``strict_validation`` flags that the parser would + # consume are intentionally omitted here; defaulting and validation live in + # the code below (``_get_top_k``) and in ``match_feature_group_criteria``. + PROPERTY_MAPPING = { + RETRIEVE_BACKEND: {"explanation": "Which retrieve-connector backend to use"}, + QUERY_TEXT: {"explanation": "Raw text query to search the corpus"}, + TOP_K: {"explanation": f"Number of passages to return (default {DEFAULT_TOP_K})"}, + CORPUS: {"explanation": "Inline corpus: a list of {doc_id, text} dicts"}, + } + + @classmethod + def compute_framework_rule(cls) -> Optional[Set[Type[ComputeFramework]]]: + return {PythonDictFramework} + + @classmethod + def input_data(cls) -> DataCreator: + return DataCreator({cls.ROOT_FEATURE_NAME}) + + @classmethod + def match_feature_group_criteria( + cls, + feature_name: Union[FeatureName, str], + options: Options, + data_access_collection: Any = None, + ) -> bool: + """Match the root feature name only for a backend this concrete declares. + + Gating on ``retrieve_backend`` (rather than name alone) is what keeps + concrete backends mutually exclusive, so enabling several at once is + unambiguous. An unknown backend matches nothing (honest surface: the + connector does not silently claim a backend it cannot serve). 
+ """ + if str(feature_name) != cls.ROOT_FEATURE_NAME: + return False + backend = options.get(cls.RETRIEVE_BACKEND) + return backend in cls.RETRIEVE_BACKENDS + + def input_features(self, options: Options, feature_name: FeatureName) -> None: + """Root feature: no input features.""" + return None + + @classmethod + def _get_top_k(cls, options: Options) -> int: + val = options.get(cls.TOP_K) + if val is None: + return cls.DEFAULT_TOP_K + try: + return int(val) + except (ValueError, TypeError) as exc: + raise ValueError(f"{cls.__name__} option '{cls.TOP_K}' must be an integer, got {val!r}.") from exc + + @classmethod + def _get_corpus(cls, options: Options) -> List[Dict[str, Any]]: + corpus = options.get(cls.CORPUS) + if corpus is None: + raise ValueError(f"{cls.__name__} requires '{cls.CORPUS}' in options: a list of {{doc_id, text}} dicts.") + return list(corpus) + + @classmethod + @abstractmethod + def _rank(cls, query: str, texts: List[str], top_k: int) -> List[Tuple[int, float]]: + """Rank ``texts`` against ``query``. + + Returns at most ``top_k`` ``(corpus_index, score)`` pairs, ordered + best-first, where ``score`` is higher-is-more-relevant. The unified + family rule: only pairs with a positive score are returned, so a + degenerate query (empty, or sharing no terms with the corpus) yields no + pairs. Requirements the base relies on and enforces (see + :meth:`_validate_ranking`): indices are in range + (``0 <= corpus_index < len(texts)``) and unique, at most ``top_k`` + pairs come back, and scores are non-increasing. ``top_k`` is already + clamped to ``1 <= top_k <= len(texts)``, so backends need not re-check + it. The base does not re-sort, so returning best-first is a hard + requirement. + """ + ... + + @classmethod + def _validate_ranking(cls, ranked: List[Tuple[int, float]], corpus_size: int, top_k: int) -> None: + """Reject a ``_rank`` result that breaks the contract. 
+ + Enforces all four :meth:`_rank` requirements: indices in range, indices + unique, at most ``top_k`` pairs, and scores non-increasing (best-first). + """ + if len(ranked) > top_k: + raise ValueError(f"{cls.__name__}._rank returned {len(ranked)} pairs for top_k={top_k}.") + seen: Set[int] = set() + previous_score: Optional[float] = None + for corpus_idx, score in ranked: + if not 0 <= corpus_idx < corpus_size: + raise ValueError( + f"{cls.__name__}._rank returned out-of-range index {corpus_idx} for a corpus of size {corpus_size}." + ) + if corpus_idx in seen: + raise ValueError(f"{cls.__name__}._rank returned duplicate index {corpus_idx}.") + seen.add(corpus_idx) + if previous_score is not None and score > previous_score: + raise ValueError( + f"{cls.__name__}._rank returned scores out of order: {score} after {previous_score} " + f"(scores must be non-increasing, best-first)." + ) + previous_score = score + + @classmethod + def _retrieve( + cls, + query: str, + corpus: List[Dict[str, Any]], + top_k: int, + ) -> List[Dict[str, Any]]: + """Assemble the ranked-passage contract around the backend's :meth:`_rank`. + + Owns the cross-backend invariants: empty corpus and non-positive + ``top_k`` return ``[]``; ``top_k`` is clamped to the corpus size; + ``doc_id``/``text`` are read from the corpus; ``rank`` is assigned + 0-based ascending; ``score`` is coerced to ``float``. Corpus entries + must be dicts, and the effective ``doc_id`` values (after ``str()`` + coercion and the positional-index fallback for a missing ``doc_id``) + must be unique; either violation raises ``ValueError``. A missing + ``text`` key stays lenient and coerces to ``""``. The ranking a backend + returns is validated against the :meth:`_rank` contract, so a buggy + ``_rank`` fails loudly here (for any input) instead of silently + dropping or duplicating a passage. 
+ """ + if not corpus: + return [] + + for i, doc in enumerate(corpus): + if not isinstance(doc, dict): + raise ValueError( + f"{cls.__name__} corpus entry at index {i} is not a dict: {doc!r}. " + f"Each entry must be a {{doc_id, text}} dict." + ) + + doc_ids = [str(doc.get("doc_id", str(i))) for i, doc in enumerate(corpus)] + seen_doc_ids: Set[str] = set() + for doc_id in doc_ids: + if doc_id in seen_doc_ids: + raise ValueError( + f"{cls.__name__} corpus contains duplicate doc_id {doc_id!r} " + f"(after str() coercion and the positional-index fallback)." + ) + seen_doc_ids.add(doc_id) + + effective_k = min(top_k, len(corpus)) + if effective_k <= 0: + return [] + + texts = [str(doc.get("text", "")) for doc in corpus] + + ranked = cls._rank(query, texts, effective_k) + cls._validate_ranking(ranked, len(corpus), effective_k) + + passages: List[Dict[str, Any]] = [] + for rank, (corpus_idx, score) in enumerate(ranked): + passages.append( + { + "doc_id": doc_ids[corpus_idx], + "text": texts[corpus_idx], + "score": float(score), + "rank": rank, + } + ) + return passages + + @classmethod + def calculate_feature(cls, data: Any, features: FeatureSet) -> List[Dict[str, Any]]: + """Rank the corpus against the query, return ranked passages. + + The FeatureSet must contain exactly one feature: the family answers one + query per run, so a set with several features would silently drop all + but the first and instead raises ``ValueError``. + """ + feature_list = list(features.features) + if len(feature_list) > 1: + raise ValueError( + f"{cls.__name__} answers one query per run, but the FeatureSet contains {len(feature_list)} features." 
+ ) + for feature in feature_list: + options = feature.options + query = options.get(cls.QUERY_TEXT) + if query is None: + raise ValueError(f"{cls.__name__} requires '{cls.QUERY_TEXT}' in options.") + corpus = cls._get_corpus(options) + top_k = cls._get_top_k(options) + passages = cls._retrieve(str(query), corpus, top_k) + return [{cls.ROOT_FEATURE_NAME: passages}] + return [] diff --git a/rag_integration/feature_groups/connectors/retrieve/bm25s_retriever.py b/rag_integration/feature_groups/connectors/retrieve/bm25s_retriever.py new file mode 100644 index 0000000..b47d443 --- /dev/null +++ b/rag_integration/feature_groups/connectors/retrieve/bm25s_retriever.py @@ -0,0 +1,65 @@ +"""BM25 lexical retrieve connector, backed by the ``bm25s`` library. + +Canonical concrete for the ``retrieve`` family: zero-download (no model, no +network), deterministic, and contract-canonical (bm25s ``retrieve`` returns +ranked indices with scores directly). MIT-licensed, numpy-only. +""" + +from __future__ import annotations + +from typing import List, Tuple + +from rag_integration.feature_groups.connectors.retrieve.base import BaseRetrieveConnector + + +class Bm25sRetriever(BaseRetrieveConnector): + """Lexical BM25 retrieval over an inline corpus. + + Selected with ``retrieve_backend="bm25s"``. Builds an in-memory BM25 index + per call (the corpus is per-call), so there is no shared state to cache and + repeated calls are idempotent. Family rule: at most ``top_k`` passages come + back and only those scoring positively, so a degenerate query (empty, or + all out-of-vocabulary) yields no passages. + """ + + RETRIEVE_BACKENDS = { + "bm25s": "BM25 lexical retrieval (bm25s)", + } + + # Declarative option documentation; selection is via + # ``match_feature_group_criteria`` (see BaseRetrieveConnector). The allowed + # backend value is the single key of RETRIEVE_BACKENDS above. 
+ PROPERTY_MAPPING = { + BaseRetrieveConnector.RETRIEVE_BACKEND: {"explanation": "Use 'bm25s' for BM25 lexical retrieval"}, + BaseRetrieveConnector.QUERY_TEXT: {"explanation": "Raw text query to search the corpus"}, + BaseRetrieveConnector.TOP_K: { + "explanation": f"Number of passages to return (default {BaseRetrieveConnector.DEFAULT_TOP_K})" + }, + BaseRetrieveConnector.CORPUS: {"explanation": "Inline corpus: a list of {doc_id, text} dicts"}, + } + + @classmethod + def _rank(cls, query: str, texts: List[str], top_k: int) -> List[Tuple[int, float]]: + import bm25s + + corpus_tokens = bm25s.tokenize(texts, stopwords="en", show_progress=False) + # Degenerate corpus (e.g. every doc is only stopwords) tokenizes to an + # empty vocabulary; bm25s would raise on retrieve. Nothing is rankable. + if len(corpus_tokens.vocab) == 0: + return [] + + retriever = bm25s.BM25() + retriever.index(corpus_tokens, show_progress=False) + + query_tokens = bm25s.tokenize([query], stopwords="en", show_progress=False) + # index() is called WITHOUT a corpus, so retrieve() returns integer + # corpus indices (not document objects); keep it that way so the int() + # cast below stays valid. + indices, scores = retriever.retrieve(query_tokens, k=top_k, show_progress=False) + + pairs = [(int(indices[0][rank]), float(scores[0][rank])) for rank in range(top_k)] + # Family rule: only positively scoring passages are returned. Without + # this filter a degenerate query (empty, or all out-of-vocabulary) + # would pad the result with top_k zero-scored passages in arbitrary + # order instead of returning nothing. 
class TfidfRetriever(BaseRetrieveConnector):
    """Lexical TF-IDF retrieve backend, selected by ``retrieve_backend="tfidf"``.

    The corpus texts and the query are vectorized together in one
    :class:`TfidfEmbedder` call, and each document is scored by its dot
    product with the query vector. Only strictly positive scores are kept;
    the survivors are ordered best score first (equal scores by ascending
    corpus index) and truncated to ``top_k``. A query that scores no document
    above zero therefore yields an empty result.
    """

    # Vector width requested from the embedder on every call.
    _TFIDF_DIM = 384

    RETRIEVE_BACKENDS = {
        "tfidf": "Vector-space TF-IDF retrieval (cosine over hashed TF-IDF vectors)",
    }

    PROPERTY_MAPPING = {
        BaseRetrieveConnector.RETRIEVE_BACKEND: {"explanation": "Use 'tfidf' for vector-space TF-IDF retrieval"},
        BaseRetrieveConnector.QUERY_TEXT: {"explanation": "Raw text query to search the corpus"},
        BaseRetrieveConnector.TOP_K: {
            "explanation": f"Number of passages to return (default {BaseRetrieveConnector.DEFAULT_TOP_K})"
        },
        BaseRetrieveConnector.CORPUS: {"explanation": "Inline corpus: a list of {doc_id, text} dicts"},
    }

    @staticmethod
    def _cosine(query_vector: List[float], doc_vector: List[float]) -> float:
        """Return the dot product of ``query_vector`` and ``doc_vector``.

        The product equals the cosine similarity only for unit-length inputs.
        NOTE(review): this relies on ``TfidfEmbedder`` L2-normalizing its
        vectors, which is not visible from this module; confirm against the
        embedder. ``zip`` stops at the shorter vector.
        """
        return sum(left * right for left, right in zip(query_vector, doc_vector))

    @classmethod
    def _rank(cls, query: str, texts: List[str], top_k: int) -> List[Tuple[int, float]]:
        """Score ``texts`` against ``query`` and return the best matches.

        Returns ``(corpus_index, score)`` pairs with ``score > 0.0``, sorted
        by descending score and then ascending index, at most ``top_k`` long.
        """
        # One batch for corpus + query (query last) so both are embedded by
        # the same call. NOTE(review): "default" fills the embedder's third
        # argument; presumably a model name the TF-IDF embedder ignores.
        embedded = TfidfEmbedder._embed_texts([*texts, query], cls._TFIDF_DIM, "default")
        query_vector = embedded[-1]

        hits = []
        for position, doc_vector in enumerate(embedded[:-1]):
            similarity = cls._cosine(query_vector, doc_vector)
            # Family rule: zero (or negative) scores never come back.
            if similarity > 0.0:
                hits.append((position, similarity))

        # Negated score gives best-first; the index breaks ties stably.
        ranked = sorted(hits, key=lambda hit: (-hit[1], hit[0]))
        return ranked[:top_k]
# Question tokenizer: runs of lowercase letters, digits and underscores
# (snake_case column names stay whole), optionally followed by a decimal part
# ("2.5"). A leading "-" never matches, so negative numbers lose their sign.
_TOKEN_RE = re.compile(r"[a-z0-9_]+(?:\.[0-9]+)?")

# Natural-language cue word -> SQL aggregate function. The function names are
# fixed literals (never user text), so interpolating them into SQL is safe.
_AGGREGATIONS = {
    cue: function
    for function, cues in (
        ("AVG", ("average", "avg", "mean")),
        ("MIN", ("minimum", "min", "lowest", "smallest")),
        ("MAX", ("maximum", "max", "highest", "largest")),
        ("SUM", ("sum", "total")),
    )
    for cue in cues
}


class AggregateSql(BaseStructuredConnector):
    """Rule-based NL->SQL with aggregation (``structured_backend="aggregate"``).

    The question is lowercased and tokenized, then the first matching intent
    wins:

    1. Aggregate: the first aggregation cue (``average``, ``min``, ``max``,
       ``sum``, ...) together with a column named in the question, preferring
       a column mentioned after the cue:
       ``SELECT FUNC("column") AS result FROM "table"``.
    2. Count (``count``, or both ``how`` and ``many``):
       ``SELECT COUNT(*) AS cnt FROM "table"``, with
       ``WHERE LOWER("column") = ?`` appended when a filter is found.
    3. Equality filter (a column name followed by another token):
       ``SELECT * FROM "table" WHERE LOWER("column") = ?``.
    4. Otherwise: ``SELECT * FROM "table"``.

    Identifiers are interpolated double-quoted; filter values are always
    returned as bound parameters. Filter values are taken from the lowercased
    tokens, so they are lowercase and carry no sign.
    """

    STRUCTURED_BACKENDS = {
        "aggregate": "Aggregation-aware rule-based natural-language-to-SQL (no LLM)",
    }

    PROPERTY_MAPPING = {
        BaseStructuredConnector.STRUCTURED_BACKEND: {
            "explanation": "Use 'aggregate' for aggregation-aware text-to-SQL"
        },
        BaseStructuredConnector.QUESTION: {"explanation": "Natural-language question to answer over the table"},
        BaseStructuredConnector.TABLE: {"explanation": "Table name (a simple SQL identifier)"},
        BaseStructuredConnector.COLUMNS: {"explanation": "Column names (simple SQL identifiers)"},
        BaseStructuredConnector.ROWS: {"explanation": "Table rows: a list of {column: value} dicts"},
    }

    @classmethod
    def _find_column(cls, tokens: List[str], columns: List[str]) -> Optional[str]:
        """Return the column matched by the earliest token in ``tokens``.

        Matching is case-insensitive on the column side; the column is
        returned in its declared spelling. Returns None when no token names a
        column.
        """
        by_lowercase = {column.lower(): column for column in columns}
        return next((by_lowercase[token] for token in tokens if token in by_lowercase), None)

    @classmethod
    def _find_filter(cls, tokens: List[str], columns: List[str]) -> Optional[Tuple[str, str]]:
        """Return ``(column, value)`` for the first usable filter, or None.

        Columns are tried in declaration order. A column qualifies when its
        lowercased name occurs in ``tokens`` and its first occurrence is not
        the last token; the token right after that occurrence is the value.
        """
        last_index = len(tokens) - 1
        for column in columns:
            try:
                found_at = tokens.index(column.lower())
            except ValueError:
                continue  # column is not mentioned in the question
            if found_at < last_index:
                return column, tokens[found_at + 1]
        return None

    @classmethod
    def _to_sql(cls, question: str, table: str, columns: List[str]) -> Tuple[str, List[Any]]:
        """Translate ``question`` into ``(sql, params)`` by the intent rules.

        ``table`` and ``columns`` are interpolated double-quoted and are
        expected to be validated identifiers already; ``params`` holds at most
        the single filter value.
        """
        tokens = _TOKEN_RE.findall(question.lower())

        # 1. Aggregation is decided by the FIRST cue word only. It is checked
        # before the other intents so "the average age" aggregates instead of
        # being read as a filter; the price is that a filter whose value is
        # itself a cue word aggregates too.
        cue_index = next((index for index, word in enumerate(tokens) if word in _AGGREGATIONS), None)
        if cue_index is not None:
            function = _AGGREGATIONS[tokens[cue_index]]
            # Prefer a column named after the cue ("species with the highest
            # age" -> age); fall back to any column named in the question.
            column = cls._find_column(tokens[cue_index + 1 :], columns)
            if column is None:
                column = cls._find_column(tokens, columns)
            if column is not None:
                return f'SELECT {function}("{column}") AS result FROM "{table}"', []  # nosec B608

        token_set = set(tokens)
        wants_count = "count" in token_set or token_set.issuperset(("how", "many"))
        filter_match = cls._find_filter(tokens, columns)

        # 2./4. No filter: plain count or plain listing.
        if filter_match is None:
            if wants_count:
                return f'SELECT COUNT(*) AS cnt FROM "{table}"', []  # nosec B608
            return f'SELECT * FROM "{table}"', []  # nosec B608

        # 2./3. Filtered count or filtered listing; the value is bound.
        column, value = filter_match
        if wants_count:
            return f'SELECT COUNT(*) AS cnt FROM "{table}" WHERE LOWER("{column}") = ?', [value]  # nosec B608
        return f'SELECT * FROM "{table}" WHERE LOWER("{column}") = ?', [value]  # nosec B608
# A "simple SQL identifier": a letter or underscore followed by letters,
# digits or underscores. Quotes, spaces and punctuation never match.
_IDENT_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")


class BaseStructuredConnector(FeatureGroup):
    """Root FeatureGroup shared by all structured (text-to-SQL) backends.

    The table (name, columns, rows) and the question arrive through
    ``Options``. The base validates the identifiers, asks the backend for SQL
    via :meth:`_to_sql`, checks that the SQL is a single ``SELECT``, runs it
    against an in-memory SQLite table and returns
    ``{"sql": ..., "rows": [...]}`` under ``ROOT_FEATURE_NAME``.

    A concrete backend lists its selector value in ``STRUCTURED_BACKENDS`` and
    implements :meth:`_to_sql`.
    """

    ROOT_FEATURE_NAME = "structured_rows"

    # Option keys.
    STRUCTURED_BACKEND = "structured_backend"
    QUESTION = "question"
    TABLE = "table_name"
    COLUMNS = "columns"
    ROWS = "rows"

    # Selector value -> description; empty on the base, filled by concretes.
    STRUCTURED_BACKENDS: Dict[str, str] = {}

    PROPERTY_MAPPING = {
        STRUCTURED_BACKEND: {"explanation": "Which structured (text-to-SQL) backend to use"},
        QUESTION: {"explanation": "Natural-language question to answer over the table"},
        TABLE: {"explanation": "Table name (a simple SQL identifier)"},
        COLUMNS: {"explanation": "Column names (simple SQL identifiers)"},
        ROWS: {"explanation": "Table rows: a list of {column: value} dicts"},
    }

    @classmethod
    def compute_framework_rule(cls) -> Optional[Set[Type[ComputeFramework]]]:
        """Restrict this family to the ``PythonDictFramework``."""
        return {PythonDictFramework}

    @classmethod
    def input_data(cls) -> DataCreator:
        """Declare the root feature as data this group creates itself."""
        return DataCreator({cls.ROOT_FEATURE_NAME})

    @classmethod
    def match_feature_group_criteria(
        cls,
        feature_name: Union[FeatureName, str],
        options: Options,
        data_access_collection: Any = None,
    ) -> bool:
        """Return True only for the root feature with a backend this class declares."""
        return (
            str(feature_name) == cls.ROOT_FEATURE_NAME
            and options.get(cls.STRUCTURED_BACKEND) in cls.STRUCTURED_BACKENDS
        )

    def input_features(self, options: Options, feature_name: FeatureName) -> None:
        """Return None: a root feature has no inputs (the table comes via Options)."""
        return None

    @classmethod
    def _require(cls, options: Options, key: str) -> Any:
        """Return ``options[key]``; raise ValueError when it is missing or None."""
        value = options.get(key)
        if value is not None:
            return value
        raise ValueError(f"{cls.__name__} requires '{key}' in options.")

    @classmethod
    def _validate_identifier(cls, name: str, kind: str) -> str:
        """Return ``name`` unchanged if it is a simple SQL identifier.

        Identifiers cannot be bound as parameters, so this whitelist is what
        keeps interpolated table/column names injection-safe.

        Raises:
            ValueError: ``name`` does not fully match the identifier pattern;
                ``kind`` ("table"/"column") is used in the message.
        """
        if _IDENT_RE.fullmatch(name):
            return name
        raise ValueError(f"{cls.__name__}: invalid {kind} identifier {name!r}; expected a simple SQL identifier.")

    @classmethod
    @abstractmethod
    def _to_sql(cls, question: str, table: str, columns: List[str]) -> Tuple[str, List[Any]]:
        """Translate ``question`` into a read-only SQL statement (backend hook).

        Must return ``(sql, params)``: ``sql`` is one ``SELECT`` over ``table``
        with ``?`` placeholders for values, ``params`` the bound values in
        order. ``table`` and ``columns`` have already been validated. The
        result is re-checked by :meth:`_validate_select`, which rejects
        anything that is not a single top-level ``SELECT``.
        """
        ...

    @classmethod
    def _validate_select(cls, sql: str) -> None:
        """Raise ValueError unless ``sql`` parses to exactly one ``SELECT``.

        The statement is parsed with sqlglot's SQLite dialect. Unparseable
        input, several statements, or a single statement whose top-level node
        is not a ``Select`` expression are all rejected.
        """
        import sqlglot
        import sqlglot.expressions as exp
        from sqlglot.errors import SqlglotError

        try:
            parsed = sqlglot.parse(sql, read="sqlite")
        except SqlglotError as error:
            raise ValueError(f"{cls.__name__}._to_sql produced unparseable SQL: {sql!r}") from error

        is_single_select = len(parsed) == 1 and isinstance(parsed[0], exp.Select)
        if not is_single_select:
            raise ValueError(f"{cls.__name__}._to_sql must produce a single top-level SELECT statement, got: {sql!r}")

    @classmethod
    def _query(
        cls,
        question: str,
        table: str,
        columns: List[str],
        rows: List[Dict[str, Any]],
    ) -> Dict[str, Any]:
        """Translate, validate and run the question over an in-memory table.

        Returns ``{"sql": sql, "rows": [...]}`` where each result row is a
        dict keyed by the result column names.

        Raises:
            ValueError: invalid table/column identifier, no columns, column
                names that collide case-insensitively, or backend SQL that is
                not a single ``SELECT``.
        """
        table = cls._validate_identifier(table, "table")
        checked = []
        for candidate in columns:
            checked.append(cls._validate_identifier(candidate, "column"))
        columns = checked
        if not columns:
            raise ValueError(f"{cls.__name__}: at least one column is required.")
        distinct_lowercase = {name.lower() for name in columns}
        if len(distinct_lowercase) < len(columns):
            raise ValueError(
                f"{cls.__name__}: duplicate column names (SQLite is case-insensitive) are not allowed: {columns}."
            )

        sql, params = cls._to_sql(question, table, columns)
        cls._validate_select(sql)

        # Identifiers were whitelisted above (no quotes possible) and are
        # double-quoted so reserved words work; every value is bound.
        column_ddl = ", ".join(f'"{c}"' for c in columns)
        placeholders = ", ".join(["?"] * len(columns))
        insert_sql = f'INSERT INTO "{table}" ({column_ddl}) VALUES ({placeholders})'  # nosec B608
        # A key missing from a row is inserted as NULL (``dict.get`` -> None).
        records = [[row.get(c) for c in columns] for row in rows]

        connection = sqlite3.connect(":memory:")
        try:
            connection.execute(f'CREATE TABLE "{table}" ({column_ddl})')
            connection.executemany(insert_sql, records)

            # Defense in depth: flip the connection to read-only before the
            # backend's SQL runs, independent of the sqlglot check above.
            connection.execute("PRAGMA query_only = ON")

            cursor = connection.execute(sql, params)
            result_columns = [entry[0] for entry in cursor.description]
            result_rows = []
            for record in cursor.fetchall():
                result_rows.append(dict(zip(result_columns, record)))
        finally:
            connection.close()

        return {"sql": sql, "rows": result_rows}

    @classmethod
    def calculate_feature(cls, data: Any, features: FeatureSet) -> List[Dict[str, Any]]:
        """Answer the first feature's question and return the SQL plus rows.

        Only the first feature in ``features.features`` is evaluated; an empty
        feature set yields ``[]``.

        Raises:
            ValueError: a required option is missing, ``columns`` is not a
                list/tuple, or ``rows`` is not a list/tuple of dicts.
        """
        feature = next(iter(features.features), None)
        if feature is None:
            return []

        options = feature.options
        question = str(cls._require(options, cls.QUESTION))
        table = str(cls._require(options, cls.TABLE))

        raw_columns = cls._require(options, cls.COLUMNS)
        if not isinstance(raw_columns, (list, tuple)):
            raise ValueError(f"{cls.__name__}: '{cls.COLUMNS}' must be a list or tuple of column names.")
        columns = [str(name) for name in raw_columns]

        raw_rows = cls._require(options, cls.ROWS)
        rows_are_dicts = isinstance(raw_rows, (list, tuple)) and all(isinstance(row, dict) for row in raw_rows)
        if not rows_are_dicts:
            raise ValueError(f"{cls.__name__}: '{cls.ROWS}' must be a list or tuple of dicts.")
        # Shallow-copy each row so the caller's dicts are never handed on.
        rows = [dict(row) for row in raw_rows]

        return [{cls.ROOT_FEATURE_NAME: cls._query(question, table, columns, rows)}]
# Question tokenizer: runs of lowercase letters, digits and underscores
# (snake_case column names stay whole), optionally followed by a decimal part
# ("2.5"). A leading "-" never matches, so negative numbers lose their sign.
_TOKEN_RE = re.compile(r"[a-z0-9_]+(?:\.[0-9]+)?")


class RuleBasedSql(BaseStructuredConnector):
    """Rule-based NL->SQL (``structured_backend="rule_based"``).

    The question is lowercased and tokenized, then the first matching intent
    wins:

    1. Count (``count``, or both ``how`` and ``many``):
       ``SELECT COUNT(*) AS cnt FROM "table"``, with
       ``WHERE LOWER("column") = ?`` appended when a filter is found.
    2. Equality filter (a column name followed by another token):
       ``SELECT * FROM "table" WHERE LOWER("column") = ?``.
    3. Otherwise: ``SELECT * FROM "table"``.

    Identifiers are interpolated double-quoted; filter values are always
    returned as bound parameters. Filter values are taken from the lowercased
    tokens, so they are lowercase and carry no sign.
    """

    STRUCTURED_BACKENDS = {
        "rule_based": "Rule-based natural-language-to-SQL (no LLM)",
    }

    PROPERTY_MAPPING = {
        BaseStructuredConnector.STRUCTURED_BACKEND: {"explanation": "Use 'rule_based' for rule-based text-to-SQL"},
        BaseStructuredConnector.QUESTION: {"explanation": "Natural-language question to answer over the table"},
        BaseStructuredConnector.TABLE: {"explanation": "Table name (a simple SQL identifier)"},
        BaseStructuredConnector.COLUMNS: {"explanation": "Column names (simple SQL identifiers)"},
        BaseStructuredConnector.ROWS: {"explanation": "Table rows: a list of {column: value} dicts"},
    }

    @classmethod
    def _find_filter(cls, tokens: List[str], columns: List[str]) -> Optional[Tuple[str, str]]:
        """Return ``(column, value)`` for the first usable filter, or None.

        Columns are tried in declaration order. A column qualifies when its
        lowercased name occurs in ``tokens`` and its first occurrence is not
        the last token; the token right after that occurrence is the value.
        """
        last_index = len(tokens) - 1
        for column in columns:
            try:
                found_at = tokens.index(column.lower())
            except ValueError:
                continue  # column is not mentioned in the question
            if found_at < last_index:
                return column, tokens[found_at + 1]
        return None

    @classmethod
    def _to_sql(cls, question: str, table: str, columns: List[str]) -> Tuple[str, List[Any]]:
        """Translate ``question`` into ``(sql, params)`` by the intent rules.

        ``table`` and ``columns`` are interpolated double-quoted and are
        expected to be validated identifiers already; ``params`` holds at most
        the single filter value.
        """
        tokens = _TOKEN_RE.findall(question.lower())
        token_set = set(tokens)
        wants_count = "count" in token_set or token_set.issuperset(("how", "many"))
        filter_match = cls._find_filter(tokens, columns)

        # No filter: plain count or plain listing.
        if filter_match is None:
            if wants_count:
                return f'SELECT COUNT(*) AS cnt FROM "{table}"', []  # nosec B608
            return f'SELECT * FROM "{table}"', []  # nosec B608

        # Filtered count or filtered listing; the value is a bound parameter.
        column, value = filter_match
        if wants_count:
            return f'SELECT COUNT(*) AS cnt FROM "{table}" WHERE LOWER("{column}") = ?', [value]  # nosec B608
        return f'SELECT * FROM "{table}" WHERE LOWER("{column}") = ?', [value]  # nosec B608
class GenerateConnectorContractBase(ABC):
    """Inheritable contract checks for every ``generate`` connector backend.

    A concrete test class implements the six adapter classmethods below and
    inherits every ``test_*`` method. The class name intentionally does not
    start with ``Test``.
    """

    # -- Adapter methods ------------------------------------------------------

    @classmethod
    @abstractmethod
    def connector_class(cls) -> Type[BaseGenerateConnector]:
        """Return the concrete ``BaseGenerateConnector`` subclass under test."""

    @classmethod
    @abstractmethod
    def backend_value(cls) -> str:
        """Return the ``generate_backend`` option value selecting this backend."""

    @classmethod
    @abstractmethod
    def sample_passages(cls) -> List[Dict[str, Any]]:
        """Return supporting passages (``{doc_id, text}`` dicts).

        At least one passage must be clearly relevant to ``sample_query``;
        ``expected_citation_doc_id`` names the one that has to be cited. More
        than one passage may be relevant.
        """

    @classmethod
    @abstractmethod
    def sample_query(cls) -> str:
        """Return a query that ``sample_passages`` can answer and for which
        ``expected_citation_doc_id`` is unambiguous."""

    @classmethod
    @abstractmethod
    def expected_citation_doc_id(cls) -> str:
        """Return the ``doc_id`` that must appear in the citations."""

    @classmethod
    @abstractmethod
    def expected_answer_substring(cls) -> str:
        """Return a distinctive substring of the relevant passage that the
        answer must contain (shows the answer comes from the passage)."""

    # -- Helpers --------------------------------------------------------------

    @classmethod
    def _answer(cls, query: str, passages: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Call the connector's ``_answer`` directly (no framework run)."""
        connector = cls.connector_class()
        return connector._answer(query, passages)

    @classmethod
    def _run_all(cls, query: str, passages: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Run the connector through ``mlodaAPI.run_all`` and return its answer.

        Raises AssertionError when no result row carries the root feature.
        """
        connector = cls.connector_class()
        context = {
            connector.GENERATE_BACKEND: cls.backend_value(),
            connector.QUERY_TEXT: query,
            connector.PASSAGES: passages,
        }
        feature = Feature(connector.ROOT_FEATURE_NAME, options=Options(context=context))
        result = mlodaAPI.run_all(
            [feature],
            compute_frameworks={PythonDictFramework},
            plugin_collector=PluginCollector.enabled_feature_groups({connector}),
        )
        # The first row (in partition order) holding the root feature wins.
        all_rows = (row for partition in result for row in partition)
        for row in all_rows:
            if connector.ROOT_FEATURE_NAME in row:
                answer: Dict[str, Any] = row[connector.ROOT_FEATURE_NAME]
                return answer
        raise AssertionError(f"run_all returned no '{connector.ROOT_FEATURE_NAME}' row: {result!r}")

    # -- Matching / honest surface --------------------------------------------

    def test_matches_root_feature_for_declared_backend(self) -> None:
        """The root feature name plus the declared backend value matches."""
        connector = self.connector_class()
        declared = Options(context={connector.GENERATE_BACKEND: self.backend_value()})
        matched = connector.match_feature_group_criteria(connector.ROOT_FEATURE_NAME, declared)
        assert matched is True

    def test_does_not_match_other_feature_name(self) -> None:
        """A different feature name never matches, even with the right backend."""
        connector = self.connector_class()
        declared = Options(context={connector.GENERATE_BACKEND: self.backend_value()})
        matched = connector.match_feature_group_criteria("docs", declared)
        assert matched is False

    def test_unknown_backend_does_not_match(self) -> None:
        """An undeclared backend value never matches the root feature."""
        connector = self.connector_class()
        unknown = Options(context={connector.GENERATE_BACKEND: "definitely_not_a_backend_xyz"})
        matched = connector.match_feature_group_criteria(connector.ROOT_FEATURE_NAME, unknown)
        assert matched is False

    def test_backend_declared_in_supported_set(self) -> None:
        """The adapter's backend value is listed in ``GENERATE_BACKENDS``."""
        supported = self.connector_class().GENERATE_BACKENDS
        assert self.backend_value() in supported

    # -- Output contract ------------------------------------------------------

    def test_answer_object_shape(self) -> None:
        """The result has a string ``answer`` and a list of string ``citations``."""
        result = self._answer(self.sample_query(), self.sample_passages())
        assert {"answer", "citations"}.issubset(set(result))
        assert isinstance(result["answer"], str)
        citations = result["citations"]
        assert isinstance(citations, list)
        assert all(isinstance(citation, str) for citation in citations)

    def test_answer_nonempty_for_canonical_query(self) -> None:
        """Guard against vacuous checks: the canonical query yields text."""
        result = self._answer(self.sample_query(), self.sample_passages())
        assert result["answer"].strip(), "canonical query produced an empty answer; assertions would be vacuous"

    def test_citations_are_grounded(self) -> None:
        """Every citation is a supplied passage doc_id (no invented sources).

        A passage without an explicit ``doc_id`` is identified by its position
        (as a string), so such passages do not break this check.
        """
        passages = self.sample_passages()
        known = set()
        for position, passage in enumerate(passages):
            known.add(str(passage.get("doc_id", str(position))))
        result = self._answer(self.sample_query(), passages)
        assert set(result["citations"]).issubset(known)

    def test_nonempty_answer_is_cited(self) -> None:
        """The canonical query yields a non-empty answer, and that answer
        carries at least one citation."""
        result = self._answer(self.sample_query(), self.sample_passages())
        assert result["answer"].strip(), "canonical query produced an empty answer"
        assert result["citations"], "non-empty answer returned no citations"

    def test_relevant_passage_cited(self) -> None:
        """Not-a-stub proof: the expected passage is among the citations."""
        result = self._answer(self.sample_query(), self.sample_passages())
        assert self.expected_citation_doc_id() in result["citations"]

    def test_answer_grounded_in_passage(self) -> None:
        """Not-a-stub proof: the answer contains the distinctive substring,
        and that substring really occurs in the expected passage's text."""
        passages = self.sample_passages()
        wanted_id = self.expected_citation_doc_id()
        source = next(
            (
                passage
                for position, passage in enumerate(passages)
                if str(passage.get("doc_id", str(position))) == wanted_id
            ),
            None,
        )
        assert source is not None, "expected_citation_doc_id is not among sample_passages"
        assert self.expected_answer_substring() in str(source.get("text", "")), (
            "expected_answer_substring must occur in the expected-citation passage's text"
        )
        result = self._answer(self.sample_query(), passages)
        assert self.expected_answer_substring() in result["answer"]

    def test_empty_passages_returns_empty(self) -> None:
        """No passages in, canonical empty answer out."""
        result = self._answer(self.sample_query(), [])
        assert result == {"answer": "", "citations": []}

    def test_idempotent(self) -> None:
        """Two identical calls return equal results."""
        passages = self.sample_passages()
        query = self.sample_query()
        assert self._answer(query, passages) == self._answer(query, passages)

    # -- End to end -----------------------------------------------------------

    def test_end_to_end_run_all(self) -> None:
        """The full ``run_all`` path cites the expected passage and quotes it."""
        result = self._run_all(self.sample_query(), self.sample_passages())
        assert self.expected_citation_doc_id() in result["citations"]
        assert self.expected_answer_substring() in result["answer"]
# Two well-formed passages shared by every guard test below.
_PASSAGES: List[Dict[str, Any]] = [
    {"doc_id": "d0", "text": "Alpha fact."},
    {"doc_id": "d1", "text": "Beta fact."},
]


class _UnknownCitationStub(BaseGenerateConnector):
    """Broken backend: its only citation is a doc_id that was never supplied."""

    @classmethod
    def _generate(cls, query: str, passages: List[Dict[str, Any]]) -> Tuple[str, List[str]]:
        citations = ["not_a_supplied_doc"]
        return "Alpha fact.", citations


class _UncitedAnswerStub(BaseGenerateConnector):
    """Broken backend: returns text but cites nothing."""

    @classmethod
    def _generate(cls, query: str, passages: List[Dict[str, Any]]) -> Tuple[str, List[str]]:
        citations: List[str] = []
        return "Alpha fact.", citations


class _CitationsWithoutAnswerStub(BaseGenerateConnector):
    """Broken backend: cites a passage although the answer is only whitespace."""

    @classmethod
    def _generate(cls, query: str, passages: List[Dict[str, Any]]) -> Tuple[str, List[str]]:
        citations = ["d0"]
        return " ", citations


class _DuplicateCitationsStub(BaseGenerateConnector):
    """Broken backend: lists the same passage twice in its citations."""

    @classmethod
    def _generate(cls, query: str, passages: List[Dict[str, Any]]) -> Tuple[str, List[str]]:
        citations = ["d0", "d0"]
        return "Alpha fact.", citations


class _WhitespaceAnswerStub(BaseGenerateConnector):
    """Degenerate but legal backend: whitespace-only answer and no citations."""

    @classmethod
    def _generate(cls, query: str, passages: List[Dict[str, Any]]) -> Tuple[str, List[str]]:
        citations: List[str] = []
        return " \n\t ", citations


class _WellBehavedStub(BaseGenerateConnector):
    """Minimal correct backend; used where only the option guards matter."""

    @classmethod
    def _generate(cls, query: str, passages: List[Dict[str, Any]]) -> Tuple[str, List[str]]:
        citations = ["d0"]
        return "Alpha fact.", citations


def _feature_set(context: Dict[str, Any]) -> Any:
    """Return a FeatureSet stand-in holding one feature with ``context`` options.

    Only the attributes read by ``calculate_feature`` are set: ``features``
    on the set and ``options`` on its single feature.
    """
    feature = MagicMock(options=Options(context=context))
    return MagicMock(features=[feature])


class TestGenerateGuards:
    """Each broken ``_generate`` stub is rejected by ``_answer`` with ValueError."""

    def test_unknown_citation_raises(self) -> None:
        """A citation outside the supplied passages is refused."""
        with pytest.raises(ValueError, match="not among the supplied passages"):
            _UnknownCitationStub._answer("query", _PASSAGES)

    def test_nonempty_answer_without_citations_raises(self) -> None:
        """Text without any citation is refused."""
        with pytest.raises(ValueError, match="non-empty answer with no citations"):
            _UncitedAnswerStub._answer("query", _PASSAGES)

    def test_citations_without_answer_raises(self) -> None:
        """Citations attached to a blank answer are refused."""
        with pytest.raises(ValueError, match="citations with an empty answer"):
            _CitationsWithoutAnswerStub._answer("query", _PASSAGES)

    def test_duplicate_citations_raise(self) -> None:
        """Citing the same passage twice is refused."""
        with pytest.raises(ValueError, match="duplicate citations"):
            _DuplicateCitationsStub._answer("query", _PASSAGES)

    def test_whitespace_answer_normalized_to_canonical_empty_shape(self) -> None:
        """A whitespace-only answer without citations comes back as the
        canonical empty result instead of raising."""
        outcome = _WhitespaceAnswerStub._answer("query", _PASSAGES)
        assert outcome == {"answer": "", "citations": []}


class TestGenerateOptionGuards:
    """Missing or malformed options are rejected with ValueError."""

    def test_missing_query_text_raises(self) -> None:
        """Options without ``query_text`` fail, naming the missing key."""
        context = {
            BaseGenerateConnector.GENERATE_BACKEND: "stub",
            BaseGenerateConnector.PASSAGES: _PASSAGES,
        }
        with pytest.raises(ValueError, match=BaseGenerateConnector.QUERY_TEXT):
            _WellBehavedStub.calculate_feature(None, _feature_set(context))

    def test_missing_passages_raises(self) -> None:
        """Options without ``passages`` fail, naming the missing key."""
        context = {
            BaseGenerateConnector.GENERATE_BACKEND: "stub",
            BaseGenerateConnector.QUERY_TEXT: "query",
        }
        with pytest.raises(ValueError, match=BaseGenerateConnector.PASSAGES):
            _WellBehavedStub.calculate_feature(None, _feature_set(context))

    def test_duplicate_explicit_doc_ids_raise(self) -> None:
        """Two passages that both claim ``doc_id`` "d0" are refused."""
        twins = [
            {"doc_id": "d0", "text": "Alpha."},
            {"doc_id": "d0", "text": "Beta."},
        ]
        options = Options(context={BaseGenerateConnector.PASSAGES: twins})
        with pytest.raises(ValueError, match="duplicate passage doc_id 'd0'"):
            _WellBehavedStub._get_passages(options)

    def test_explicit_doc_id_colliding_with_positional_fallback_raises(self) -> None:
        """A passage without ``doc_id`` at position 0 and a second passage
        with the explicit integer id ``0`` are expected to collide on "0"."""
        colliding = [
            {"text": "Alpha."},
            {"doc_id": 0, "text": "Beta."},
        ]
        options = Options(context={BaseGenerateConnector.PASSAGES: colliding})
        with pytest.raises(ValueError, match="duplicate passage doc_id '0'"):
            _WellBehavedStub._get_passages(options)


# --- tests/connectors/generate/test_extractive_responder.py -----------------


class TestExtractiveResponder(GenerateConnectorContractBase):
    """Runs the shared generate contract against :class:`ExtractiveResponder`
    and adds one backend-specific check."""

    @classmethod
    def connector_class(cls) -> Type[BaseGenerateConnector]:
        return ExtractiveResponder

    @classmethod
    def backend_value(cls) -> str:
        return "extractive"

    @classmethod
    def sample_passages(cls) -> List[Dict[str, Any]]:
        texts = (
            "Cars need regular engine oil and maintenance.",
            "Cats need fresh water, a clean litter box, and daily play.",
            "Dogs are loyal companions.",
        )
        return [{"doc_id": f"d{position}", "text": text} for position, text in enumerate(texts)]

    @classmethod
    def sample_query(cls) -> str:
        return "what do cats need"

    @classmethod
    def expected_citation_doc_id(cls) -> str:
        return "d1"

    @classmethod
    def expected_answer_substring(cls) -> str:
        return "fresh water"

    # -- Backend-specific proof: no invented answers ---------------------------

    def test_no_relevant_sentence_returns_empty(self) -> None:
        """With passages present but a query that is expected to match none of
        them, the responder returns the empty answer with no citations rather
        than inventing one."""
        outcome = self._answer("zzz nonmatching query", self.sample_passages())
        assert outcome == {"answer": "", "citations": []}
+""" + +from __future__ import annotations + +from typing import Any, Dict, List, Type + +from rag_integration.feature_groups.connectors.generate.base import BaseGenerateConnector +from rag_integration.feature_groups.connectors.generate.template_responder import TemplateResponder +from tests.connectors.generate.generate_contract import GenerateConnectorContractBase + + +class TestTemplateResponder(GenerateConnectorContractBase): + @classmethod + def connector_class(cls) -> Type[BaseGenerateConnector]: + return TemplateResponder + + @classmethod + def backend_value(cls) -> str: + return "template" + + @classmethod + def sample_passages(cls) -> List[Dict[str, Any]]: + return [ + {"doc_id": "d0", "text": "Cars need regular engine oil and maintenance."}, + {"doc_id": "d1", "text": "A cat needs fresh water every day. The cat also needs a clean box."}, + {"doc_id": "d2", "text": "Good cat food keeps a cat strong."}, + ] + + @classmethod + def sample_query(cls) -> str: + # Deliberately relevant to both d1 and d2 (not just one passage) so the + # multi-citation behaviour below is exercised. 
+ return "cat water food" + + @classmethod + def expected_citation_doc_id(cls) -> str: + return "d1" + + @classmethod + def expected_answer_substring(cls) -> str: + return "fresh water" + + # -- Backend-specific proof: multi-passage citation ----------------------- + + def test_cites_every_contributing_passage(self) -> None: + """The distinguishing behaviour vs the extractive responder: when the + top sentences span several passages, each is cited (not just one).""" + result = self._answer(self.sample_query(), self.sample_passages()) + assert set(result["citations"]) == {"d1", "d2"}, result["citations"] + assert len(result["citations"]) > 1, "template responder must cite every contributing passage" + + def test_answer_uses_fixed_template_and_cited_sources(self) -> None: + """The answer is exactly the fixed template lead-in followed by the + deterministic best-first sentence selection, verbatim. An exact-equality + check on the residual body, so no invented text can sneak in.""" + result = self._answer(self.sample_query(), self.sample_passages()) + prefix = "Based on the retrieved passages: " + assert result["answer"].startswith(prefix) + body = result["answer"].removeprefix(prefix) + # Score 2 sentences first (passage order: d1 before d2), then score 1. + assert body == ( + "A cat needs fresh water every day. Good cat food keeps a cat strong. The cat also needs a clean box." 
+ ) + + def test_no_relevant_sentence_returns_empty(self) -> None: + """Passages present but no sentence shares a query token: the responder + returns an empty answer with no citations (never a bare template), so the + base's 'non-empty answer requires citations' guard is never tripped.""" + result = self._answer("zzz nonmatching query", self.sample_passages()) + assert result == {"answer": "", "citations": []} diff --git a/tests/connectors/graph_rag/__init__.py b/tests/connectors/graph_rag/__init__.py new file mode 100644 index 0000000..dfa716f --- /dev/null +++ b/tests/connectors/graph_rag/__init__.py @@ -0,0 +1 @@ +"""graph_rag connector-family tests.""" diff --git a/tests/connectors/graph_rag/graph_rag_contract.py b/tests/connectors/graph_rag/graph_rag_contract.py new file mode 100644 index 0000000..cc16496 --- /dev/null +++ b/tests/connectors/graph_rag/graph_rag_contract.py @@ -0,0 +1,228 @@ +"""Inheritable contract-test suite for the ``graph_rag`` connector family. + +Beyond the shared ranked-passage assertions, this suite adds the graph-specific +not-a-stub proof: a node with zero query-term overlap is surfaced because it +neighbours a relevant node, while an equally non-overlapping but *isolated* node +is not. A plain lexical retriever could not produce that result. 
+""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Type + +import pytest + +from mloda.user import mlodaAPI, Feature, Options, PluginCollector +from mloda_plugins.compute_framework.base_implementations.python_dict.python_dict_framework import ( + PythonDictFramework, +) + +from rag_integration.feature_groups.connectors.graph_rag.base import BaseGraphRagConnector + + +class GraphRagConnectorContractBase(ABC): + """Contract every graph-RAG backend must satisfy.""" + + # -- Adapter methods ------------------------------------------------------ + + @classmethod + @abstractmethod + def connector_class(cls) -> Type[BaseGraphRagConnector]: + """Return the concrete ``BaseGraphRagConnector`` subclass under test.""" + + @classmethod + @abstractmethod + def backend_value(cls) -> str: + """Return the ``graph_backend`` value that selects this concrete.""" + + @classmethod + @abstractmethod + def sample_nodes(cls) -> List[Dict[str, Any]]: + """Nodes (``{doc_id, text}``). 
Must include a relevant node, a zero-overlap + node connected to it, and a zero-overlap isolated node.""" + + @classmethod + @abstractmethod + def sample_edges(cls) -> List[List[str]]: + """Edges (``[doc_id_a, doc_id_b]``) connecting the relevant and context nodes.""" + + @classmethod + @abstractmethod + def sample_query(cls) -> str: + """A query whose best match is determinate.""" + + @classmethod + @abstractmethod + def expected_top_doc_id(cls) -> str: + """The doc_id that must rank first.""" + + @classmethod + @abstractmethod + def expected_connected_doc_id(cls) -> str: + """A zero-overlap doc_id that must be retrieved via its edge to the top node.""" + + @classmethod + @abstractmethod + def expected_isolated_doc_id(cls) -> str: + """A zero-overlap doc_id with no edge that must NOT be retrieved at top_k=2.""" + + # -- Helpers -------------------------------------------------------------- + + @classmethod + def _passages( + cls, query: str, nodes: List[Dict[str, Any]], edges: List[List[str]], top_k: int + ) -> List[Dict[str, Any]]: + connector = cls.connector_class() + edge_pairs = [(str(a), str(b)) for a, b in edges] + return connector._retrieve(query, nodes, edge_pairs, top_k) + + @classmethod + def _run_all( + cls, query: str, nodes: List[Dict[str, Any]], edges: List[List[str]], top_k: int + ) -> List[Dict[str, Any]]: + connector = cls.connector_class() + feature = Feature( + connector.ROOT_FEATURE_NAME, + options=Options( + context={ + connector.GRAPH_BACKEND: cls.backend_value(), + connector.QUERY_TEXT: query, + connector.NODES: nodes, + connector.EDGES: edges, + connector.TOP_K: top_k, + } + ), + ) + result = mlodaAPI.run_all( + [feature], + compute_frameworks={PythonDictFramework}, + plugin_collector=PluginCollector.enabled_feature_groups({connector}), + ) + for partition in result: + for row in partition: + if connector.ROOT_FEATURE_NAME in row: + passages: List[Dict[str, Any]] = row[connector.ROOT_FEATURE_NAME] + return passages + raise 
AssertionError(f"run_all returned no '{connector.ROOT_FEATURE_NAME}' row: {result!r}") + + # -- Matching / honest surface -------------------------------------------- + + def test_matches_root_feature_for_declared_backend(self) -> None: + connector = self.connector_class() + opts = Options(context={connector.GRAPH_BACKEND: self.backend_value()}) + assert connector.match_feature_group_criteria(connector.ROOT_FEATURE_NAME, opts) is True + + def test_does_not_match_other_feature_name(self) -> None: + connector = self.connector_class() + opts = Options(context={connector.GRAPH_BACKEND: self.backend_value()}) + assert connector.match_feature_group_criteria("docs", opts) is False + + def test_unknown_backend_does_not_match(self) -> None: + connector = self.connector_class() + opts = Options(context={connector.GRAPH_BACKEND: "definitely_not_a_backend_xyz"}) + assert connector.match_feature_group_criteria(connector.ROOT_FEATURE_NAME, opts) is False + + def test_backend_declared_in_supported_set(self) -> None: + connector = self.connector_class() + assert self.backend_value() in connector.GRAPH_BACKENDS + + # -- Output contract ------------------------------------------------------ + + def test_returns_ranked_passage_shape(self) -> None: + nodes = self.sample_nodes() + passages = self._passages(self.sample_query(), nodes, self.sample_edges(), top_k=len(nodes)) + assert isinstance(passages, list) + assert passages, "canonical query returned no passages; assertions would be vacuous" + for passage in passages: + assert set(passage) >= {"doc_id", "text", "score", "rank"} + assert isinstance(passage["doc_id"], str) + assert isinstance(passage["text"], str) + assert isinstance(passage["score"], float) + assert isinstance(passage["rank"], int) + + def test_scores_non_increasing_and_ranks_ascending(self) -> None: + nodes = self.sample_nodes() + passages = self._passages(self.sample_query(), nodes, self.sample_edges(), top_k=len(nodes)) + ranks = [p["rank"] for p in passages] + 
scores = [p["score"] for p in passages] + assert ranks == list(range(len(passages))) + assert scores == sorted(scores, reverse=True) + + def test_relevant_doc_ranked_first(self) -> None: + nodes = self.sample_nodes() + passages = self._passages(self.sample_query(), nodes, self.sample_edges(), top_k=len(nodes)) + assert passages[0]["doc_id"] == self.expected_top_doc_id() + assert len(passages) >= 2 + assert passages[0]["score"] > passages[1]["score"] + + def test_connected_context_retrieved(self) -> None: + """Graph not-a-stub proof: a zero-overlap node neighbouring the relevant + node must *outscore* an equally non-overlapping isolated node, purely + because of the edge. A lexical-only backend gives the two equal scores + and fails this regardless of how it breaks ties. + + The score comparison is the load-bearing assertion (tie-break + independent); the top_k=2 membership check then confirms the connected + node is actually surfaced and the isolated one dropped. The proof is + scoped to backends that score on query relevance (the family contract): + the two zero-overlap nodes are only separable via the edge.""" + nodes = self.sample_nodes() + full = self._passages(self.sample_query(), nodes, self.sample_edges(), top_k=len(nodes)) + score = {p["doc_id"]: p["score"] for p in full} + assert score[self.expected_connected_doc_id()] > score[self.expected_isolated_doc_id()] + + top2 = [p["doc_id"] for p in self._passages(self.sample_query(), nodes, self.sample_edges(), top_k=2)] + assert self.expected_top_doc_id() in top2 + assert self.expected_connected_doc_id() in top2 + assert self.expected_isolated_doc_id() not in top2 + + def test_passage_text_matches_nodes(self) -> None: + nodes = self.sample_nodes() + text_by_doc_id = {str(n["doc_id"]): str(n["text"]) for n in nodes} + passages = self._passages(self.sample_query(), nodes, self.sample_edges(), top_k=len(nodes)) + for passage in passages: + assert passage["text"] == text_by_doc_id[passage["doc_id"]] + + def 
test_doc_ids_unique_and_cover_nodes(self) -> None: + nodes = self.sample_nodes() + passages = self._passages(self.sample_query(), nodes, self.sample_edges(), top_k=len(nodes)) + returned = [p["doc_id"] for p in passages] + assert len(returned) == len(set(returned)) + assert set(returned) == {str(n["doc_id"]) for n in nodes} + + def test_top_k_respected(self) -> None: + passages = self._passages(self.sample_query(), self.sample_nodes(), self.sample_edges(), top_k=2) + assert len(passages) == 2 + assert passages[0]["doc_id"] == self.expected_top_doc_id() + + def test_duplicate_doc_ids_rejected(self) -> None: + """Duplicate doc_ids make edges ambiguous; the base must refuse them loudly + instead of silently last-wins overwriting the doc_id -> index map.""" + nodes = self.sample_nodes() + duplicated = nodes + [{"doc_id": nodes[0]["doc_id"], "text": "an unrelated duplicate"}] + with pytest.raises(ValueError, match="duplicate doc_id"): + self._passages(self.sample_query(), duplicated, self.sample_edges(), top_k=len(duplicated)) + + def test_top_k_clamped_to_nodes(self) -> None: + nodes = self.sample_nodes() + passages = self._passages(self.sample_query(), nodes, self.sample_edges(), top_k=len(nodes) + 50) + assert len(passages) == len(nodes) + + def test_empty_nodes_returns_empty(self) -> None: + assert self._passages(self.sample_query(), [], [], top_k=5) == [] + + def test_idempotent(self) -> None: + nodes = self.sample_nodes() + edges = self.sample_edges() + first = self._passages(self.sample_query(), nodes, edges, top_k=len(nodes)) + second = self._passages(self.sample_query(), nodes, edges, top_k=len(nodes)) + assert first == second + + # -- End to end ----------------------------------------------------------- + + def test_end_to_end_run_all(self) -> None: + nodes = self.sample_nodes() + passages = self._run_all(self.sample_query(), nodes, self.sample_edges(), top_k=len(nodes)) + assert passages + assert passages[0]["doc_id"] == self.expected_top_doc_id() diff --git 
a/tests/connectors/graph_rag/test_adjacency_graph_rag.py b/tests/connectors/graph_rag/test_adjacency_graph_rag.py new file mode 100644 index 0000000..c16eea8 --- /dev/null +++ b/tests/connectors/graph_rag/test_adjacency_graph_rag.py @@ -0,0 +1,57 @@ +"""Contract test for :class:`AdjacencyGraphRag` (zero-download CI anchor). + +Inherits the whole graph_rag contract suite, including the graph not-a-stub +proof: a zero-overlap node connected to the relevant node must outscore an +equally non-overlapping isolated node, purely because of the edge. The fixture +is the same shape as the networkx backend's, so the proof holds for this +engine-free adjacency implementation too. +""" + +from __future__ import annotations + +from typing import Any, Dict, List, Type + +from rag_integration.feature_groups.connectors.graph_rag.adjacency_graph_rag import AdjacencyGraphRag +from rag_integration.feature_groups.connectors.graph_rag.base import BaseGraphRagConnector +from tests.connectors.graph_rag.graph_rag_contract import GraphRagConnectorContractBase + + +class TestAdjacencyGraphRag(GraphRagConnectorContractBase): + @classmethod + def connector_class(cls) -> Type[BaseGraphRagConnector]: + return AdjacencyGraphRag + + @classmethod + def backend_value(cls) -> str: + return "adjacency" + + @classmethod + def sample_nodes(cls) -> List[Dict[str, Any]]: + return [ + # Relevant: shares "photosynthesis" and "plants" with the query. + {"doc_id": "rel", "text": "Photosynthesis lets plants make energy from sunlight."}, + # Connected context: zero query overlap, but edged to "rel". + {"doc_id": "ctx", "text": "It happens inside the chloroplast organelle."}, + # Isolated: zero query overlap and no edges. 
+ {"doc_id": "iso", "text": "The stock market fell sharply on Tuesday."}, + ] + + @classmethod + def sample_edges(cls) -> List[List[str]]: + return [["rel", "ctx"]] + + @classmethod + def sample_query(cls) -> str: + return "photosynthesis plants" + + @classmethod + def expected_top_doc_id(cls) -> str: + return "rel" + + @classmethod + def expected_connected_doc_id(cls) -> str: + return "ctx" + + @classmethod + def expected_isolated_doc_id(cls) -> str: + return "iso" diff --git a/tests/connectors/graph_rag/test_backend_parity.py b/tests/connectors/graph_rag/test_backend_parity.py new file mode 100644 index 0000000..829df0c --- /dev/null +++ b/tests/connectors/graph_rag/test_backend_parity.py @@ -0,0 +1,56 @@ +"""Backend parity for the ``graph_rag`` family. + +:class:`NetworkxGraphRag` and :class:`AdjacencyGraphRag` document identical +scoring (query overlap + neighbour bonus, index tie-break) but implement it +independently; nothing in the code enforces the parity beyond copy-discipline. +This suite pins it: both backends must return identical passage lists for the +same inputs, so a drift in either implementation fails loudly here. +""" + +from __future__ import annotations + +from typing import Any, Dict, List, Tuple + +import pytest + +from rag_integration.feature_groups.connectors.graph_rag.adjacency_graph_rag import AdjacencyGraphRag +from rag_integration.feature_groups.connectors.graph_rag.networkx_graph_rag import NetworkxGraphRag + +# Clean skip (not an error) when the `graph` extra is not installed. 
+pytest.importorskip("networkx") + +_NODES: List[Dict[str, Any]] = [ + {"doc_id": "rel", "text": "Photosynthesis lets plants make energy from sunlight."}, + {"doc_id": "ctx", "text": "It happens inside the chloroplast organelle."}, + {"doc_id": "alt", "text": "Plants also respire and grow at night."}, + {"doc_id": "iso", "text": "The stock market fell sharply on Tuesday."}, +] + +_QUERY = "photosynthesis plants" + + +def _assert_parity(query: str, nodes: List[Dict[str, Any]], edges: List[Tuple[str, str]], top_k: int) -> None: + networkx_passages = NetworkxGraphRag._retrieve(query, nodes, edges, top_k) + adjacency_passages = AdjacencyGraphRag._retrieve(query, nodes, edges, top_k) + assert networkx_passages == adjacency_passages + assert networkx_passages, "parity assertion would be vacuous on an empty result" + + +def test_parity_connected_graph() -> None: + _assert_parity(_QUERY, _NODES, [("rel", "ctx"), ("ctx", "alt"), ("alt", "iso")], top_k=len(_NODES)) + + +def test_parity_duplicate_and_reversed_edges() -> None: + _assert_parity(_QUERY, _NODES, [("rel", "ctx"), ("rel", "ctx"), ("ctx", "rel")], top_k=len(_NODES)) + + +def test_parity_zero_overlap_query() -> None: + _assert_parity("quantum entanglement", _NODES, [("rel", "ctx"), ("ctx", "alt")], top_k=len(_NODES)) + + +def test_parity_isolated_nodes() -> None: + _assert_parity(_QUERY, _NODES, [("rel", "ctx")], top_k=len(_NODES)) + + +def test_parity_empty_edges() -> None: + _assert_parity(_QUERY, _NODES, [], top_k=len(_NODES)) diff --git a/tests/connectors/graph_rag/test_base_edges.py b/tests/connectors/graph_rag/test_base_edges.py new file mode 100644 index 0000000..70c645c --- /dev/null +++ b/tests/connectors/graph_rag/test_base_edges.py @@ -0,0 +1,96 @@ +"""Base-level edge handling and ranking edge cases for the ``graph_rag`` family. 
+ +Exercises the ``BaseGraphRagConnector`` invariants through the stdlib-only +:class:`AdjacencyGraphRag` (no extras required): ``_resolve_edges`` sanitising +(container guard, malformed elements, self-loops), duplicate doc_id rejection, +unknown doc_id edges, neighbour-bonus counting under duplicate/reversed edges, +and deterministic ordering on a zero-overlap query. +""" + +from __future__ import annotations + +from typing import Any, Dict, List, Tuple + +import pytest + +from mloda.user import Options + +from rag_integration.feature_groups.connectors.graph_rag.adjacency_graph_rag import AdjacencyGraphRag + +_NODES: List[Dict[str, Any]] = [ + {"doc_id": "rel", "text": "Photosynthesis lets plants make energy from sunlight."}, + {"doc_id": "ctx", "text": "It happens inside the chloroplast organelle."}, + {"doc_id": "iso", "text": "The stock market fell sharply on Tuesday."}, +] + +_QUERY = "photosynthesis plants" + + +def _resolve(raw_edges: Any) -> List[Tuple[str, str]]: + options = Options(context={AdjacencyGraphRag.EDGES: raw_edges}) + return AdjacencyGraphRag._resolve_edges(options) + + +def _passages(edges: List[Tuple[str, str]], query: str = _QUERY) -> List[Dict[str, Any]]: + return AdjacencyGraphRag._retrieve(query, _NODES, edges, len(_NODES)) + + +# -- _resolve_edges: option extraction and sanitising -------------------------- + + +def test_edges_omitted_resolve_to_empty() -> None: + assert AdjacencyGraphRag._resolve_edges(Options(context={})) == [] + + +def test_non_sequence_edges_container_rejected() -> None: + # A string EDGES previously dropped all edges silently; a non-iterable + # crashed with a bare TypeError. Both must be a loud ValueError. 
+ with pytest.raises(ValueError, match="must be a list"): + _resolve("rel,ctx") + with pytest.raises(ValueError, match="must be a list"): + _resolve(42) + + +def test_malformed_edge_elements_skipped() -> None: + raw = [ + ["rel", "ctx", "extra"], # length-3 list + None, # not a sequence + {"a": "rel", "b": "ctx"}, # dict + "ab", # plain string element + ["rel", "ctx"], # the only real pair + ] + assert _resolve(raw) == [("rel", "ctx")] + + +def test_self_loop_edges_skipped() -> None: + assert _resolve([["rel", "rel"]]) == [] + + +# -- _retrieve: doc_id mapping --------------------------------------------------- + + +def test_unknown_doc_id_edges_skipped() -> None: + with_ghost_edges = _passages([("rel", "ghost"), ("ghost", "ctx")]) + assert with_ghost_edges == _passages([]) + + +def test_duplicate_doc_ids_rejected_even_after_str_coercion() -> None: + nodes: List[Dict[str, Any]] = [{"doc_id": 1, "text": "first"}, {"doc_id": "1", "text": "second"}] + with pytest.raises(ValueError, match="duplicate doc_id"): + AdjacencyGraphRag._retrieve(_QUERY, nodes, [], len(nodes)) + + +# -- Ranking edge cases ---------------------------------------------------------- + + +def test_duplicate_and_reversed_edges_count_bonus_once() -> None: + single = {p["doc_id"]: p["score"] for p in _passages([("rel", "ctx")])} + repeated = {p["doc_id"]: p["score"] for p in _passages([("rel", "ctx"), ("ctx", "rel"), ("rel", "ctx")])} + assert repeated == single + assert repeated["ctx"] == pytest.approx(0.5) + + +def test_zero_overlap_query_returns_deterministic_index_order() -> None: + passages = _passages([("rel", "ctx")], query="quantum entanglement") + assert [p["doc_id"] for p in passages] == ["rel", "ctx", "iso"] + assert all(p["score"] == 0.0 for p in passages) diff --git a/tests/connectors/graph_rag/test_networkx_graph_rag.py b/tests/connectors/graph_rag/test_networkx_graph_rag.py new file mode 100644 index 0000000..fcad52a --- /dev/null +++ 
b/tests/connectors/graph_rag/test_networkx_graph_rag.py @@ -0,0 +1,60 @@ +"""Contract test for :class:`NetworkxGraphRag` (canonical networkx backend). + +Skipped cleanly when the ``graph`` extra (networkx) is not installed; the +stdlib-only :class:`AdjacencyGraphRag` test is the family's zero-download CI +anchor. +""" + +from __future__ import annotations + +from typing import Any, Dict, List, Type + +import pytest + +from rag_integration.feature_groups.connectors.graph_rag.base import BaseGraphRagConnector +from rag_integration.feature_groups.connectors.graph_rag.networkx_graph_rag import NetworkxGraphRag +from tests.connectors.graph_rag.graph_rag_contract import GraphRagConnectorContractBase + +# Clean skip (not an error) when the `graph` extra is not installed. +pytest.importorskip("networkx") + + +class TestNetworkxGraphRag(GraphRagConnectorContractBase): + @classmethod + def connector_class(cls) -> Type[BaseGraphRagConnector]: + return NetworkxGraphRag + + @classmethod + def backend_value(cls) -> str: + return "networkx" + + @classmethod + def sample_nodes(cls) -> List[Dict[str, Any]]: + return [ + # Relevant: shares "photosynthesis" and "plants" with the query. + {"doc_id": "rel", "text": "Photosynthesis lets plants make energy from sunlight."}, + # Connected context: zero query overlap, but edged to "rel". + {"doc_id": "ctx", "text": "It happens inside the chloroplast organelle."}, + # Isolated: zero query overlap and no edges. 
+ {"doc_id": "iso", "text": "The stock market fell sharply on Tuesday."}, + ] + + @classmethod + def sample_edges(cls) -> List[List[str]]: + return [["rel", "ctx"]] + + @classmethod + def sample_query(cls) -> str: + return "photosynthesis plants" + + @classmethod + def expected_top_doc_id(cls) -> str: + return "rel" + + @classmethod + def expected_connected_doc_id(cls) -> str: + return "ctx" + + @classmethod + def expected_isolated_doc_id(cls) -> str: + return "iso" diff --git a/tests/connectors/orchestrator/__init__.py b/tests/connectors/orchestrator/__init__.py new file mode 100644 index 0000000..65b0d36 --- /dev/null +++ b/tests/connectors/orchestrator/__init__.py @@ -0,0 +1 @@ +"""orchestrator connector-family tests.""" diff --git a/tests/connectors/orchestrator/orchestrator_contract.py b/tests/connectors/orchestrator/orchestrator_contract.py new file mode 100644 index 0000000..94dfe4a --- /dev/null +++ b/tests/connectors/orchestrator/orchestrator_contract.py @@ -0,0 +1,191 @@ +"""Inheritable contract-test suite for the ``orchestrator`` connector family. + +A concrete backend's test implements six adapter methods and inherits every +assertion. The base is not named ``Test*`` so pytest does not collect it. 
+""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Type + +import pytest +from mloda.user import mlodaAPI, Feature, Options, PluginCollector +from mloda_plugins.compute_framework.base_implementations.python_dict.python_dict_framework import ( + PythonDictFramework, +) + +from rag_integration.feature_groups.connectors.orchestrator.base import BaseOrchestratorConnector + + +class OrchestratorConnectorContractBase(ABC): + """Contract every orchestrator backend must satisfy.""" + + # -- Adapter methods ------------------------------------------------------ + + @classmethod + @abstractmethod + def connector_class(cls) -> Type[BaseOrchestratorConnector]: + """Return the concrete ``BaseOrchestratorConnector`` subclass under test.""" + + @classmethod + @abstractmethod + def backend_value(cls) -> str: + """Return the ``orchestrator_backend`` value that selects this concrete.""" + + @classmethod + @abstractmethod + def sample_corpus(cls) -> List[Dict[str, Any]]: + """Return a corpus (``{doc_id, text}``) with one clearly relevant doc.""" + + @classmethod + @abstractmethod + def sample_query(cls) -> str: + """Return a query whose best match is determinate.""" + + @classmethod + @abstractmethod + def expected_top_doc_id(cls) -> str: + """Return the doc_id the pipeline must surface first.""" + + @classmethod + @abstractmethod + def expected_answer_substring(cls) -> str: + """Return a distinctive substring the answer must contain (drawn from the top doc).""" + + # -- Helpers -------------------------------------------------------------- + + @classmethod + def _answer(cls, query: str, corpus: List[Dict[str, Any]], top_k: int) -> Dict[str, Any]: + return cls.connector_class()._answer(query, corpus, top_k) + + @classmethod + def _run_all(cls, query: str, corpus: List[Dict[str, Any]], top_k: int) -> Dict[str, Any]: + connector = cls.connector_class() + feature = Feature( + connector.ROOT_FEATURE_NAME, + 
options=Options( + context={ + connector.ORCHESTRATOR_BACKEND: cls.backend_value(), + connector.QUERY_TEXT: query, + connector.CORPUS: corpus, + connector.TOP_K: top_k, + } + ), + ) + result = mlodaAPI.run_all( + [feature], + compute_frameworks={PythonDictFramework}, + plugin_collector=PluginCollector.enabled_feature_groups({connector}), + ) + for partition in result: + for row in partition: + if connector.ROOT_FEATURE_NAME in row: + answer: Dict[str, Any] = row[connector.ROOT_FEATURE_NAME] + return answer + raise AssertionError(f"run_all returned no '{connector.ROOT_FEATURE_NAME}' row: {result!r}") + + # -- Matching / honest surface -------------------------------------------- + + def test_matches_root_feature_for_declared_backend(self) -> None: + connector = self.connector_class() + opts = Options(context={connector.ORCHESTRATOR_BACKEND: self.backend_value()}) + assert connector.match_feature_group_criteria(connector.ROOT_FEATURE_NAME, opts) is True + + def test_does_not_match_other_feature_name(self) -> None: + connector = self.connector_class() + opts = Options(context={connector.ORCHESTRATOR_BACKEND: self.backend_value()}) + assert connector.match_feature_group_criteria("docs", opts) is False + + def test_unknown_backend_does_not_match(self) -> None: + connector = self.connector_class() + opts = Options(context={connector.ORCHESTRATOR_BACKEND: "definitely_not_a_backend_xyz"}) + assert connector.match_feature_group_criteria(connector.ROOT_FEATURE_NAME, opts) is False + + def test_backend_declared_in_supported_set(self) -> None: + connector = self.connector_class() + assert self.backend_value() in connector.ORCHESTRATOR_BACKENDS + + # -- Output contract ------------------------------------------------------ + + def test_answer_object_shape(self) -> None: + result = self._answer(self.sample_query(), self.sample_corpus(), top_k=len(self.sample_corpus())) + assert set(result) >= {"answer", "documents"} + assert isinstance(result["answer"], str) + assert 
isinstance(result["documents"], list) + for document in result["documents"]: + assert set(document) >= {"doc_id", "text", "score"} + assert isinstance(document["doc_id"], str) + assert isinstance(document["text"], str) + assert isinstance(document["score"], float) + + def test_documents_are_grounded(self) -> None: + """Every surfaced document came from the supplied corpus (no fabrication).""" + corpus = self.sample_corpus() + known = {str(doc["doc_id"]) for doc in corpus} + result = self._answer(self.sample_query(), corpus, top_k=len(corpus)) + assert {document["doc_id"] for document in result["documents"]} <= known + + def test_relevant_doc_surfaced_first(self) -> None: + """Not-a-stub proof: the pipeline ranks the relevant doc first.""" + result = self._answer(self.sample_query(), self.sample_corpus(), top_k=len(self.sample_corpus())) + assert result["documents"], "pipeline surfaced no documents" + assert result["documents"][0]["doc_id"] == self.expected_top_doc_id() + + def test_answer_drawn_from_top_document(self) -> None: + """The answer contains a distinctive substring of the relevant document.""" + result = self._answer(self.sample_query(), self.sample_corpus(), top_k=len(self.sample_corpus())) + assert self.expected_answer_substring() in result["answer"] + + def test_top_k_respected(self) -> None: + result = self._answer(self.sample_query(), self.sample_corpus(), top_k=1) + assert len(result["documents"]) == 1 + assert result["documents"][0]["doc_id"] == self.expected_top_doc_id() + + def test_empty_corpus_returns_empty(self) -> None: + result = self._answer(self.sample_query(), [], top_k=5) + assert result == {"answer": "", "documents": []} + + def test_empty_query_returns_empty(self) -> None: + """An empty/whitespace query yields no answer and no documents (no framework error leak).""" + result = self._answer(" ", self.sample_corpus(), top_k=len(self.sample_corpus())) + assert result["answer"] == "" + assert result["documents"] == [] + + def 
test_nonpositive_top_k_returns_empty(self) -> None: + """A non-positive top_k yields no answer and no documents (no framework error leak).""" + result = self._answer(self.sample_query(), self.sample_corpus(), top_k=0) + assert result["answer"] == "" + assert result["documents"] == [] + + def test_duplicate_doc_id_raises(self) -> None: + """Duplicate corpus doc_ids are rejected uniformly by the base (no silent dedup).""" + corpus = [ + {"doc_id": "dup", "text": "first entry"}, + {"doc_id": "dup", "text": "second entry"}, + ] + with pytest.raises(ValueError, match="duplicate doc_id"): + self._answer(self.sample_query(), corpus, top_k=2) + + def test_positional_default_doc_id_collision_raises(self) -> None: + """An entry without doc_id defaults to its index, so it collides with an explicit id '1'.""" + corpus = [ + {"doc_id": "1", "text": "explicit id one"}, + {"text": "no doc_id: defaults to positional index '1'"}, + ] + with pytest.raises(ValueError, match="duplicate doc_id"): + self._answer(self.sample_query(), corpus, top_k=2) + + def test_idempotent(self) -> None: + corpus = self.sample_corpus() + first = self._answer(self.sample_query(), corpus, top_k=len(corpus)) + second = self._answer(self.sample_query(), corpus, top_k=len(corpus)) + assert first == second + + # -- End to end ----------------------------------------------------------- + + def test_end_to_end_run_all(self) -> None: + corpus = self.sample_corpus() + result = self._run_all(self.sample_query(), corpus, top_k=len(corpus)) + assert result["documents"][0]["doc_id"] == self.expected_top_doc_id() + assert self.expected_answer_substring() in result["answer"] diff --git a/tests/connectors/orchestrator/test_base_safety.py b/tests/connectors/orchestrator/test_base_safety.py new file mode 100644 index 0000000..24b1c5c --- /dev/null +++ b/tests/connectors/orchestrator/test_base_safety.py @@ -0,0 +1,42 @@ +"""Base-level safety tests for the orchestrator family, through the production path. 
# tests/connectors/orchestrator/test_base_safety.py

# Single-document corpus that both stub backends below are run against.
_CORPUS = [{"doc_id": "d0", "text": "a real document"}]


class _FabricatingBackend(BaseOrchestratorConnector):
    # Test-only backend: ``_run`` surfaces a doc_id ("not_in_corpus") that ``_CORPUS`` does not contain.
    ORCHESTRATOR_BACKENDS = {"_fabricating_stub": "test-only stub"}

    @classmethod
    def _run(cls, query: str, corpus: List[Dict[str, Any]], top_k: int) -> Tuple[str, List[Dict[str, Any]]]:
        # The arguments are deliberately ignored; the fixed payload is the whole point of the stub.
        fabricated: Dict[str, Any] = {"doc_id": "not_in_corpus", "text": "fabricated", "score": 1.0}
        surfaced = [fabricated]
        return "answer", surfaced


class _AnswerWithoutDocumentsBackend(BaseOrchestratorConnector):
    # Test-only backend: ``_run`` returns a non-empty answer together with an empty document list.
    ORCHESTRATOR_BACKENDS = {"_answer_no_docs_stub": "test-only stub"}

    @classmethod
    def _run(cls, query: str, corpus: List[Dict[str, Any]], top_k: int) -> Tuple[str, List[Dict[str, Any]]]:
        # The arguments are deliberately ignored; the answer is returned with nothing to ground it.
        no_documents: List[Dict[str, Any]] = []
        return "an ungrounded answer", no_documents


def test_query_rejects_fabricated_document() -> None:
    """``_answer`` raises ``ValueError`` when the backend surfaces a doc_id outside the corpus."""
    expect_rejection = pytest.raises(ValueError)
    with expect_rejection:
        _FabricatingBackend._answer("q", _CORPUS, 5)


def test_query_rejects_nonempty_answer_without_documents() -> None:
    """``_answer`` raises ``ValueError`` when the backend returns an answer but no documents."""
    expect_rejection = pytest.raises(ValueError)
    with expect_rejection:
        _AnswerWithoutDocumentsBackend._answer("q", _CORPUS, 5)
# tests/connectors/orchestrator/test_haystack_orchestrator.py

# Skip this module cleanly (instead of erroring) when the `orchestrator` extra is not installed.
pytest.importorskip("haystack")


class TestHaystackOrchestrator(OrchestratorConnectorContractBase):
    """Run the inherited orchestrator contract against :class:`HaystackOrchestrator`."""

    @classmethod
    def connector_class(cls) -> Type[BaseOrchestratorConnector]:
        """Return the connector class under test."""
        connector: Type[BaseOrchestratorConnector] = HaystackOrchestrator
        return connector

    @classmethod
    def backend_value(cls) -> str:
        """Return the backend identifier used to select this connector."""
        backend = "haystack"
        return backend

    @classmethod
    def sample_corpus(cls) -> List[Dict[str, Any]]:
        """Return three ``{doc_id, text}`` documents; only ``d1`` mentions a cat and a pet."""
        entries = (
            ("d0", "Cars need regular engine oil and maintenance."),
            ("d1", "A cat is an independent and curious pet."),
            ("d2", "Dogs are loyal and energetic companions."),
        )
        return [{"doc_id": doc_id, "text": text} for doc_id, text in entries]

    @classmethod
    def sample_query(cls) -> str:
        """Return the query whose tokens appear in document ``d1``."""
        query = "cat pet"
        return query

    @classmethod
    def expected_top_doc_id(cls) -> str:
        """Return the doc_id expected to be surfaced first for ``sample_query``."""
        top_doc_id = "d1"
        return top_doc_id

    @classmethod
    def expected_answer_substring(cls) -> str:
        """Return a substring of document ``d1`` that the answer is expected to contain."""
        substring = "curious pet"
        return substring
# tests/connectors/orchestrator/test_r2r_fixture_orchestrator.py


class TestR2RFixtureOrchestrator(OrchestratorConnectorContractBase):
    """Run the inherited orchestrator contract against :class:`R2RFixtureOrchestrator`.

    On top of the inherited assertions, the tests below pin how canned
    responses are narrowed to the supplied corpus and when the answer is
    dropped.
    """

    @classmethod
    def connector_class(cls) -> Type[BaseOrchestratorConnector]:
        """Return the connector class under test."""
        return R2RFixtureOrchestrator

    @classmethod
    def backend_value(cls) -> str:
        """Return the backend identifier used to select this connector."""
        return "r2r"

    @classmethod
    def sample_corpus(cls) -> List[Dict[str, Any]]:
        """Return three ``{doc_id, text}`` documents; only ``d1`` mentions a cat and a pet."""
        return [
            {"doc_id": "d0", "text": "Cars need regular engine oil and maintenance."},
            {"doc_id": "d1", "text": "A cat is an independent and curious pet."},
            {"doc_id": "d2", "text": "Dogs are loyal and energetic companions."},
        ]

    @classmethod
    def sample_query(cls) -> str:
        """Return the query used by the inherited contract tests."""
        return "cat pet"

    @classmethod
    def expected_top_doc_id(cls) -> str:
        """Return the doc_id expected to be surfaced first for ``sample_query``."""
        return "d1"

    @classmethod
    def expected_answer_substring(cls) -> str:
        """Return a substring of document ``d1`` that the answer is expected to contain."""
        return "curious pet"

    # -- Backend-specific proof: honest-surface narrowing ---------------------

    def test_narrows_canned_docs_to_corpus(self) -> None:
        """The fixture's canned response for this query also ranks ``d2``, but a
        corpus that omits ``d2`` must not surface it: the stub narrows to the
        ingested corpus rather than echoing the fixture verbatim."""
        corpus = [
            {"doc_id": "d0", "text": "Cars need regular engine oil and maintenance."},
            {"doc_id": "d1", "text": "A cat is an independent and curious pet."},
        ]
        result = self._answer(self.sample_query(), corpus, top_k=len(corpus))
        surfaced = {document["doc_id"] for document in result["documents"]}
        assert "d2" not in surfaced, "narrowing failed: surfaced a canned doc not in the corpus"
        assert surfaced == {"d0", "d1"}
        # The relevant doc survives narrowing, so the answer stays grounded.
        assert result["documents"][0]["doc_id"] == "d1"
        assert self.expected_answer_substring() in result["answer"]

    def test_answer_dropped_when_supporting_doc_narrowed_away(self) -> None:
        """If the document the answer is drawn from (``d1``) is not in the corpus,
        the answer is dropped even when other canned docs survive: the result is
        retrieve-only, never an answer grounded on documents it did not come from."""
        corpus = [
            {"doc_id": "d0", "text": "Cars need regular engine oil and maintenance."},
            {"doc_id": "d2", "text": "Dogs are loyal and energetic companions."},
        ]
        result = self._answer(self.sample_query(), corpus, top_k=len(corpus))
        surfaced = {document["doc_id"] for document in result["documents"]}
        assert "d1" not in surfaced
        assert result["documents"], "surviving canned docs should still be surfaced (retrieve-only)"
        assert result["answer"] == "", "answer must be dropped when its supporting doc is narrowed away"

    def test_answer_dropped_when_supporting_doc_truncated_by_top_k(self) -> None:
        """The 'loyal companion' canned response ranks ``d1`` above the answer's
        source ``d2``, so ``top_k=1`` truncates ``d2`` away: the answer is
        suppressed (retrieve-only) even though ``d2`` is in the corpus, because
        suppression keys on the SURFACED documents, not corpus membership."""
        corpus = self.sample_corpus()
        full = self._answer("loyal companion", corpus, top_k=len(corpus))
        # Guard before indexing so an empty result fails with a message, not an IndexError.
        assert full["documents"], "sanity: the untruncated response surfaced no documents"
        assert full["documents"][0]["doc_id"] == "d1"
        assert "loyal" in full["answer"], "sanity: answer surfaces when its source doc is surfaced"

        truncated = self._answer("loyal companion", corpus, top_k=1)
        assert [document["doc_id"] for document in truncated["documents"]] == ["d1"]
        assert truncated["answer"] == "", "answer must be dropped when top_k truncation removes its source doc"

    def test_unknown_query_has_no_canned_response(self) -> None:
        """A query absent from the fixture yields an empty result (the server has
        nothing indexed for it), never a fabricated answer."""
        result = self._answer("a query the fixture has never seen", self.sample_corpus(), top_k=3)
        assert result == {"answer": "", "documents": []}


# tests/connectors/rerank/rerank_contract.py


class RerankConnectorContractBase(ABC):
    """Contract every rerank-connector backend must satisfy."""

    # -- Adapter methods (a concrete test implements these five) --------------

    @classmethod
    @abstractmethod
    def connector_class(cls) -> Type[BaseRerankConnector]:
        """Return the concrete ``BaseRerankConnector`` subclass under test."""

    @classmethod
    @abstractmethod
    def backend_value(cls) -> str:
        """Return the ``rerank_backend`` value that selects this concrete."""

    @classmethod
    @abstractmethod
    def sample_candidates(cls) -> List[Dict[str, Any]]:
        """Return candidate passages (``{doc_id, text}``) with one determinate best match."""

    @classmethod
    @abstractmethod
    def sample_query(cls) -> str:
        """Return a query whose best match in ``sample_candidates`` is determinate."""

    @classmethod
    @abstractmethod
    def expected_top_doc_id(cls) -> str:
        """Return the ``doc_id`` that must rank first after reranking."""

    # -- Helpers --------------------------------------------------------------

    @classmethod
    def _rerank(cls, query: str, candidates: List[Dict[str, Any]], top_k: int) -> List[Dict[str, Any]]:
        """Call the connector's ``_rerank`` directly, bypassing the mloda runtime."""
        return cls.connector_class()._rerank(query, candidates, top_k)

    @classmethod
    def _run_all(cls, query: str, candidates: List[Dict[str, Any]], top_k: int) -> List[Dict[str, Any]]:
        """Run the connector through ``mlodaAPI.run_all`` and return the root feature's passages.

        Raises:
            AssertionError: if no returned row contains ``ROOT_FEATURE_NAME``.
        """
        connector = cls.connector_class()
        feature = Feature(
            connector.ROOT_FEATURE_NAME,
            options=Options(
                context={
                    connector.RERANK_BACKEND: cls.backend_value(),
                    connector.QUERY_TEXT: query,
                    connector.CANDIDATES: candidates,
                    connector.TOP_K: top_k,
                }
            ),
        )
        result = mlodaAPI.run_all(
            [feature],
            compute_frameworks={PythonDictFramework},
            plugin_collector=PluginCollector.enabled_feature_groups({connector}),
        )
        # Return the first row that carries the root feature.
        for partition in result:
            for row in partition:
                if connector.ROOT_FEATURE_NAME in row:
                    passages: List[Dict[str, Any]] = row[connector.ROOT_FEATURE_NAME]
                    return passages
        raise AssertionError(f"run_all returned no '{connector.ROOT_FEATURE_NAME}' row: {result!r}")

    # -- Matching / honest surface --------------------------------------------

    def test_matches_root_feature_for_declared_backend(self) -> None:
        """The connector matches its root feature name when its own backend is requested."""
        connector = self.connector_class()
        opts = Options(context={connector.RERANK_BACKEND: self.backend_value()})
        assert connector.match_feature_group_criteria(connector.ROOT_FEATURE_NAME, opts) is True

    def test_does_not_match_other_feature_name(self) -> None:
        """A feature name other than the root feature is not matched."""
        connector = self.connector_class()
        opts = Options(context={connector.RERANK_BACKEND: self.backend_value()})
        assert connector.match_feature_group_criteria("docs", opts) is False

    def test_unknown_backend_does_not_match(self) -> None:
        """An unknown backend value is not matched."""
        connector = self.connector_class()
        opts = Options(context={connector.RERANK_BACKEND: "definitely_not_a_backend_xyz"})
        assert connector.match_feature_group_criteria(connector.ROOT_FEATURE_NAME, opts) is False

    def test_backend_declared_in_supported_set(self) -> None:
        """The backend value under test is listed in the connector's ``RERANK_BACKENDS``."""
        connector = self.connector_class()
        assert self.backend_value() in connector.RERANK_BACKENDS

    # -- Output contract ------------------------------------------------------

    def test_returns_ranked_passage_shape(self) -> None:
        """Every passage carries ``doc_id``/``text``/``score``/``rank`` with the expected types."""
        passages = self._rerank(self.sample_query(), self.sample_candidates(), top_k=len(self.sample_candidates()))
        assert isinstance(passages, list)
        assert passages, "canonical query returned no passages; contract assertions would be vacuous"
        for passage in passages:
            assert set(passage) >= {"doc_id", "text", "score", "rank"}
            assert isinstance(passage["doc_id"], str)
            assert isinstance(passage["text"], str)
            assert isinstance(passage["score"], float)
            assert isinstance(passage["rank"], int)

    def test_scores_non_increasing_and_ranks_ascending(self) -> None:
        """Ranks are ``0..n-1`` in order and scores never increase down the list."""
        passages = self._rerank(self.sample_query(), self.sample_candidates(), top_k=len(self.sample_candidates()))
        ranks = [p["rank"] for p in passages]
        scores = [p["score"] for p in passages]
        assert ranks == list(range(len(passages)))
        assert scores == sorted(scores, reverse=True)

    def test_relevant_doc_ranked_first(self) -> None:
        """Not-a-stub proof: the crafted relevant doc must rank #1, by a margin."""
        passages = self._rerank(self.sample_query(), self.sample_candidates(), top_k=len(self.sample_candidates()))
        # Length guard first: a too-short result then fails with the descriptive
        # message instead of an IndexError on ``passages[0]`` / ``passages[1]``.
        assert len(passages) >= 2, "contract candidates must have >=2 docs to prove score separation"
        assert passages[0]["doc_id"] == self.expected_top_doc_id()
        assert passages[0]["score"] > passages[1]["score"]

    def test_passage_text_matches_candidates(self) -> None:
        """Guards the base assembly: each passage's text is the candidate text for its doc_id."""
        candidates = self.sample_candidates()
        text_by_doc_id = {str(doc["doc_id"]): str(doc["text"]) for doc in candidates}
        passages = self._rerank(self.sample_query(), candidates, top_k=len(candidates))
        for passage in passages:
            assert passage["text"] == text_by_doc_id[passage["doc_id"]]

    def test_doc_ids_unique_and_cover_candidates(self) -> None:
        """No silent drop or duplicate: with top_k >= candidate count the returned
        doc_ids are unique and cover exactly the candidates."""
        candidates = self.sample_candidates()
        passages = self._rerank(self.sample_query(), candidates, top_k=len(candidates))
        returned = [p["doc_id"] for p in passages]
        assert len(returned) == len(set(returned)), "rerank returned duplicate doc_ids"
        assert set(returned) == {str(doc["doc_id"]) for doc in candidates}

    def test_top_k_respected(self) -> None:
        """``top_k=1`` returns exactly one passage, and it is the expected top doc."""
        passages = self._rerank(self.sample_query(), self.sample_candidates(), top_k=1)
        assert len(passages) == 1
        assert passages[0]["doc_id"] == self.expected_top_doc_id()

    def test_top_k_clamped_to_candidates(self) -> None:
        """A ``top_k`` larger than the candidate list returns every candidate, no more."""
        candidates = self.sample_candidates()
        passages = self._rerank(self.sample_query(), candidates, top_k=len(candidates) + 50)
        assert len(passages) == len(candidates)

    def test_empty_candidates_returns_empty(self) -> None:
        """An empty candidate list yields an empty result."""
        assert self._rerank(self.sample_query(), [], top_k=5) == []

    def test_idempotent(self) -> None:
        """Two identical calls return equal results."""
        candidates = self.sample_candidates()
        first = self._rerank(self.sample_query(), candidates, top_k=len(candidates))
        second = self._rerank(self.sample_query(), candidates, top_k=len(candidates))
        assert first == second

    # -- End to end -----------------------------------------------------------

    def test_end_to_end_run_all(self) -> None:
        """Through ``mlodaAPI.run_all`` the expected top doc is still ranked first."""
        candidates = self.sample_candidates()
        passages = self._run_all(self.sample_query(), candidates, top_k=len(candidates))
        assert passages, "run_all produced no passages"
        assert passages[0]["doc_id"] == self.expected_top_doc_id()
# tests/connectors/rerank/test_flashrank_reranker.py

# Skip this module cleanly when the ``flashrank`` package cannot be imported.
pytest.importorskip("flashrank")


@requires_flashrank_model
class TestFlashRankReranker(RerankConnectorContractBase):
    """Run the inherited rerank contract against :class:`FlashRankReranker`."""

    @classmethod
    def connector_class(cls) -> Type[BaseRerankConnector]:
        """Return the connector class under test."""
        connector: Type[BaseRerankConnector] = FlashRankReranker
        return connector

    @classmethod
    def backend_value(cls) -> str:
        """Return the backend identifier used to select this connector."""
        backend = "flashrank"
        return backend

    @classmethod
    def sample_candidates(cls) -> List[Dict[str, Any]]:
        """Return three ``{doc_id, text}`` candidates; ``d1`` is the one about looking after cats."""
        entries = (
            ("d0", "Cars need regular engine oil and maintenance."),
            ("d1", "Cats need fresh water, a clean litter box, and regular vet visits."),
            ("d2", "Dogs are loyal and energetic companions."),
        )
        return [{"doc_id": doc_id, "text": text} for doc_id, text in entries]

    @classmethod
    def sample_query(cls) -> str:
        """Return the query the candidates are reranked against."""
        query = "how to care for a cat"
        return query

    @classmethod
    def expected_top_doc_id(cls) -> str:
        """Return the doc_id expected to rank first for ``sample_query``."""
        top_doc_id = "d1"
        return top_doc_id


# tests/connectors/rerank/test_lexical_reranker.py


class TestLexicalReranker(RerankConnectorContractBase):
    """Run the inherited rerank contract against :class:`LexicalReranker`."""

    @classmethod
    def connector_class(cls) -> Type[BaseRerankConnector]:
        """Return the connector class under test."""
        connector: Type[BaseRerankConnector] = LexicalReranker
        return connector

    @classmethod
    def backend_value(cls) -> str:
        """Return the backend identifier used to select this connector."""
        backend = "lexical"
        return backend

    @classmethod
    def sample_candidates(cls) -> List[Dict[str, Any]]:
        """Return three ``{doc_id, text}`` candidates; only ``d1`` shares the query's literal words."""
        entries = (
            ("d0", "Cars need regular engine oil and maintenance."),
            ("d1", "A complete guide to cat care: water, litter, and vet visits."),
            ("d2", "Dogs are loyal and energetic companions."),
        )
        return [{"doc_id": doc_id, "text": text} for doc_id, text in entries]

    @classmethod
    def sample_query(cls) -> str:
        """Return the query the candidates are reranked against."""
        query = "cat care guide"
        return query

    @classmethod
    def expected_top_doc_id(cls) -> str:
        """Return the doc_id expected to rank first for ``sample_query``."""
        top_doc_id = "d1"
        return top_doc_id


# tests/connectors/retrieve/__init__.py

# The contract suite lives in a non-test module imported by the concrete test
# files; register it so its inherited asserts get pytest's assertion rewriting.
pytest.register_assert_rewrite("tests.connectors.retrieve.retrieve_contract")
# tests/connectors/retrieve/retrieve_contract.py


class RetrieveConnectorContractBase(ABC):
    """Contract every retrieve-connector backend must satisfy."""

    # -- Adapter methods (a concrete test implements these five) --------------

    @classmethod
    @abstractmethod
    def connector_class(cls) -> Type[BaseRetrieveConnector]:
        """Return the concrete ``BaseRetrieveConnector`` subclass under test."""

    @classmethod
    @abstractmethod
    def backend_value(cls) -> str:
        """Return the ``retrieve_backend`` value that selects this concrete."""

    @classmethod
    @abstractmethod
    def sample_corpus(cls) -> List[Dict[str, Any]]:
        """Return a small corpus of ``{doc_id, text}`` dicts.

        Craft it so the query has one determinate best match. For a lexical
        backend the query must share literal tokens with the intended top doc,
        and the distractors must share none.
        """

    @classmethod
    @abstractmethod
    def sample_query(cls) -> str:
        """Return a query whose best match in ``sample_corpus`` is determinate."""

    @classmethod
    @abstractmethod
    def expected_top_doc_id(cls) -> str:
        """Return the ``doc_id`` that must rank first for ``sample_query``."""

    # -- Default fixtures (overridable, shared by all backends) ----------------

    @classmethod
    def matching_query(cls) -> str:
        """Return a query that every doc in :meth:`matching_corpus` matches."""
        return "zebra"

    @classmethod
    def matching_corpus(cls) -> List[Dict[str, Any]]:
        """Return a corpus in which every doc positively matches
        :meth:`matching_query`.

        Every doc shares the literal token ``zebra``, so any lexical or
        vector-space backend scores all of them positively. Tests about corpus
        coverage and the default ``top_k`` use this fixture because the family
        returns only positively scoring passages, which makes the regular
        distractor-heavy ``sample_corpus`` unsuitable for them. The corpus is
        deliberately larger than ``DEFAULT_TOP_K``.
        """
        return [{"doc_id": f"m{i}", "text": f"zebra fact number {i} from the zebra herd"} for i in range(7)]

    # -- Helpers --------------------------------------------------------------

    @classmethod
    def _retrieve(cls, query: str, corpus: List[Dict[str, Any]], top_k: int) -> List[Dict[str, Any]]:
        """Call the connector's ``_retrieve`` directly, bypassing the mloda runtime."""
        return cls.connector_class()._retrieve(query, corpus, top_k)

    @classmethod
    def _options(cls, query: str, corpus: List[Dict[str, Any]], top_k: Optional[int]) -> Options:
        """Build the family Options; ``top_k=None`` omits the key (default applies)."""
        connector = cls.connector_class()
        context: Dict[str, Any] = {
            connector.RETRIEVE_BACKEND: cls.backend_value(),
            connector.QUERY_TEXT: query,
            connector.CORPUS: corpus,
        }
        if top_k is not None:
            context[connector.TOP_K] = top_k
        return Options(context=context)

    @classmethod
    def _feature_set(cls, options: Options) -> Any:
        """Build a minimal FeatureSet stand-in holding one feature with ``options``."""
        feature = MagicMock()
        feature.options = options
        features = MagicMock()
        features.features = [feature]
        return features

    @classmethod
    def _run_all(cls, query: str, corpus: List[Dict[str, Any]], top_k: int) -> List[Dict[str, Any]]:
        """Run the connector through ``mlodaAPI.run_all`` and return the root feature's passages.

        Raises:
            AssertionError: if no returned row contains ``ROOT_FEATURE_NAME``.
        """
        connector = cls.connector_class()
        feature = Feature(connector.ROOT_FEATURE_NAME, options=cls._options(query, corpus, top_k))
        result = mlodaAPI.run_all(
            [feature],
            compute_frameworks={PythonDictFramework},
            plugin_collector=PluginCollector.enabled_feature_groups({connector}),
        )
        # Return the first row that carries the root feature.
        for partition in result:
            for row in partition:
                if connector.ROOT_FEATURE_NAME in row:
                    passages: List[Dict[str, Any]] = row[connector.ROOT_FEATURE_NAME]
                    return passages
        raise AssertionError(f"run_all returned no '{connector.ROOT_FEATURE_NAME}' row: {result!r}")

    # -- Matching / honest surface --------------------------------------------

    def test_matches_root_feature_for_declared_backend(self) -> None:
        """The connector matches its root feature name when its own backend is requested."""
        connector = self.connector_class()
        opts = Options(context={connector.RETRIEVE_BACKEND: self.backend_value()})
        assert connector.match_feature_group_criteria(connector.ROOT_FEATURE_NAME, opts) is True

    def test_does_not_match_other_feature_name(self) -> None:
        """A feature name other than the root feature is not matched."""
        connector = self.connector_class()
        opts = Options(context={connector.RETRIEVE_BACKEND: self.backend_value()})
        assert connector.match_feature_group_criteria("docs", opts) is False

    def test_unknown_backend_does_not_match(self) -> None:
        """Honest surface: the connector does not claim a backend it cannot serve."""
        connector = self.connector_class()
        opts = Options(context={connector.RETRIEVE_BACKEND: "definitely_not_a_backend_xyz"})
        assert connector.match_feature_group_criteria(connector.ROOT_FEATURE_NAME, opts) is False

    def test_backend_declared_in_supported_set(self) -> None:
        """The backend value under test is listed in the connector's ``RETRIEVE_BACKENDS``."""
        connector = self.connector_class()
        assert self.backend_value() in connector.RETRIEVE_BACKENDS

    # -- Output contract ------------------------------------------------------

    def test_returns_ranked_passage_shape(self) -> None:
        """Every passage carries ``doc_id``/``text``/``score``/``rank`` with the expected types."""
        passages = self._retrieve(self.sample_query(), self.sample_corpus(), top_k=len(self.sample_corpus()))
        assert isinstance(passages, list)
        assert passages, "canonical query returned no passages; contract assertions would be vacuous"
        for passage in passages:
            assert set(passage) >= {"doc_id", "text", "score", "rank"}
            assert isinstance(passage["doc_id"], str)
            assert isinstance(passage["text"], str)
            assert isinstance(passage["score"], float)
            assert isinstance(passage["rank"], int)

    def test_scores_non_increasing_and_ranks_ascending(self) -> None:
        """Ranks are ``0..n-1`` in order and scores never increase down the list."""
        passages = self._retrieve(self.sample_query(), self.sample_corpus(), top_k=len(self.sample_corpus()))
        ranks = [p["rank"] for p in passages]
        scores = [p["score"] for p in passages]
        assert ranks == list(range(len(passages)))
        assert scores == sorted(scores, reverse=True)

    def test_relevant_doc_ranked_first(self) -> None:
        """Not-a-stub proof: the crafted relevant doc must rank #1, by a margin.

        The strict ``score`` separation rules out a backend that ignores the
        query and happens to return the corpus in an order (e.g. alphabetical)
        that puts the expected doc first, and a backend that returns all-equal
        scores.
        """
        passages = self._retrieve(self.sample_query(), self.sample_corpus(), top_k=len(self.sample_corpus()))
        # Length guard first: a too-short result then fails with the descriptive
        # message instead of an IndexError on ``passages[0]`` / ``passages[1]``.
        assert len(passages) >= 2, "contract corpus must have >=2 docs to prove score separation"
        assert passages[0]["doc_id"] == self.expected_top_doc_id()
        assert passages[0]["score"] > passages[1]["score"]

    def test_passage_text_matches_corpus(self) -> None:
        """Guards the base assembly: each passage's text is the corpus text for
        its doc_id (catches a doc_id<->text pairing regression in the base, not
        a wrong-index backend, which ``test_relevant_doc_ranked_first`` covers)."""
        corpus = self.sample_corpus()
        text_by_doc_id = {str(doc["doc_id"]): str(doc["text"]) for doc in corpus}
        passages = self._retrieve(self.sample_query(), corpus, top_k=len(corpus))
        for passage in passages:
            assert passage["text"] == text_by_doc_id[passage["doc_id"]]

    def test_doc_ids_unique_and_cover_corpus(self) -> None:
        """No silent drop or duplicate: with an all-matching corpus and
        ``top_k >= corpus size`` the returned doc_ids are unique and cover
        exactly the corpus.

        A backend that drops or duplicates a doc (e.g. returns ``[(2,..),(2,..),
        (0,..)]``) passes ranking/score/mapping checks but fails here. This is
        the assertion that keeps the silent-corruption class out of every
        sibling family that copies this suite. Uses ``matching_corpus`` because
        the family drops zero-scored passages, so only a corpus where every doc
        matches can be covered in full.
        """
        corpus = self.matching_corpus()
        passages = self._retrieve(self.matching_query(), corpus, top_k=len(corpus))
        returned = [p["doc_id"] for p in passages]
        assert len(returned) == len(set(returned)), "ranking returned duplicate doc_ids"
        assert set(returned) == {str(doc["doc_id"]) for doc in corpus}

    def test_top_k_respected(self) -> None:
        """``top_k=1`` returns exactly one passage, and it is the expected top doc."""
        passages = self._retrieve(self.sample_query(), self.sample_corpus(), top_k=1)
        assert len(passages) == 1
        assert passages[0]["doc_id"] == self.expected_top_doc_id()

    def test_top_k_clamped_to_corpus(self) -> None:
        """A ``top_k`` larger than the all-matching corpus returns every doc, no more."""
        corpus = self.matching_corpus()
        passages = self._retrieve(self.matching_query(), corpus, top_k=len(corpus) + 50)
        assert len(passages) == len(corpus)

    def test_top_k_zero_returns_empty(self) -> None:
        """``top_k=0`` yields an empty result."""
        assert self._retrieve(self.sample_query(), self.sample_corpus(), top_k=0) == []

    def test_top_k_negative_returns_empty(self) -> None:
        """A negative ``top_k`` yields an empty result."""
        assert self._retrieve(self.sample_query(), self.sample_corpus(), top_k=-3) == []

    def test_default_top_k_is_five(self) -> None:
        """With ``top_k`` absent from the options, ``DEFAULT_TOP_K`` (5) applies.

        The fixture proves it: ``matching_corpus`` has more than 5 docs that
        all positively match, so exactly 5 coming back can only be the default
        at work (the only-positive-scores rule cannot have trimmed the list).
        """
        connector = self.connector_class()
        corpus = self.matching_corpus()
        assert len(corpus) > connector.DEFAULT_TOP_K, "fixture must exceed the default to prove the cut"
        options = self._options(self.matching_query(), corpus, top_k=None)
        result = connector.calculate_feature([], self._feature_set(options))
        passages = result[0][connector.ROOT_FEATURE_NAME]
        assert len(passages) == connector.DEFAULT_TOP_K

    def test_empty_corpus_returns_empty(self) -> None:
        """An empty corpus yields an empty result."""
        assert self._retrieve(self.sample_query(), [], top_k=5) == []

    def test_degenerate_query_returns_empty(self) -> None:
        """Family rule: only positively scoring passages are returned, so a
        query sharing no terms with the corpus yields no passages (instead of
        ``top_k`` arbitrary zero-scored ones)."""
        assert self._retrieve("zzzz qqqq", self.sample_corpus(), top_k=3) == []

    def test_missing_query_text_raises(self) -> None:
        """Options without ``QUERY_TEXT`` raise a ``ValueError`` that names the missing key."""
        connector = self.connector_class()
        options = Options(
            context={
                connector.RETRIEVE_BACKEND: self.backend_value(),
                connector.CORPUS: self.sample_corpus(),
            }
        )
        with pytest.raises(ValueError, match=connector.QUERY_TEXT):
            connector.calculate_feature([], self._feature_set(options))

    def test_missing_corpus_raises(self) -> None:
        """Options without ``CORPUS`` raise a ``ValueError`` that names the missing key."""
        connector = self.connector_class()
        options = Options(
            context={
                connector.RETRIEVE_BACKEND: self.backend_value(),
                connector.QUERY_TEXT: self.sample_query(),
            }
        )
        with pytest.raises(ValueError, match=connector.CORPUS):
            connector.calculate_feature([], self._feature_set(options))

    def test_idempotent(self) -> None:
        """Two identical calls return equal results."""
        corpus = self.sample_corpus()
        first = self._retrieve(self.sample_query(), corpus, top_k=len(corpus))
        second = self._retrieve(self.sample_query(), corpus, top_k=len(corpus))
        assert first == second

    # -- End to end -----------------------------------------------------------

    def test_end_to_end_run_all(self) -> None:
        """Through ``mlodaAPI.run_all`` the expected top doc is still ranked first."""
        corpus = self.sample_corpus()
        passages = self._run_all(self.sample_query(), corpus, top_k=len(corpus))
        assert passages, "run_all produced no passages"
        assert passages[0]["doc_id"] == self.expected_top_doc_id()
+""" + +from __future__ import annotations + +from typing import Any, Dict, List, Tuple, Type +from unittest.mock import MagicMock + +import pytest + +from mloda.user import Options + +from rag_integration.feature_groups.connectors.retrieve.base import BaseRetrieveConnector + + +def _stub_returning(pairs: List[Tuple[int, float]]) -> Type[BaseRetrieveConnector]: + """Build a stub backend whose ``_rank`` returns ``pairs`` verbatim.""" + + class _MisbehavingRetriever(BaseRetrieveConnector): + RETRIEVE_BACKENDS = {"misbehaving_stub": "Deliberately misbehaving _rank for validation tests"} + + @classmethod + def _rank(cls, query: str, texts: List[str], top_k: int) -> List[Tuple[int, float]]: + return pairs + + return _MisbehavingRetriever + + +def _corpus() -> List[Dict[str, Any]]: + return [ + {"doc_id": "d0", "text": "alpha"}, + {"doc_id": "d1", "text": "beta"}, + {"doc_id": "d2", "text": "gamma"}, + ] + + +class TestRankValidation: + """Each documented ``_rank`` requirement is enforced, not just trusted.""" + + def test_duplicate_indices_raise(self) -> None: + stub = _stub_returning([(0, 1.0), (0, 0.5)]) + with pytest.raises(ValueError, match="duplicate index"): + stub._retrieve("alpha", _corpus(), top_k=2) + + def test_out_of_range_index_raises(self) -> None: + stub = _stub_returning([(99, 1.0)]) + with pytest.raises(ValueError, match="out-of-range"): + stub._retrieve("alpha", _corpus(), top_k=2) + + def test_increasing_scores_raise(self) -> None: + stub = _stub_returning([(0, 0.1), (1, 0.9)]) + with pytest.raises(ValueError, match="non-increasing"): + stub._retrieve("alpha", _corpus(), top_k=2) + + def test_more_pairs_than_top_k_raise(self) -> None: + stub = _stub_returning([(0, 0.9), (1, 0.5)]) + with pytest.raises(ValueError, match="pairs for top_k"): + stub._retrieve("alpha", _corpus(), top_k=1) + + +class TestCorpusValidation: + """Malformed corpora raise a clear ValueError instead of an AttributeError.""" + + def test_non_dict_corpus_entry_raises(self) -> None: 
+ stub = _stub_returning([(0, 1.0)]) + corpus: List[Any] = [{"doc_id": "d0", "text": "alpha"}, "not-a-dict"] + with pytest.raises(ValueError, match="not a dict"): + stub._retrieve("alpha", corpus, top_k=1) + + def test_duplicate_effective_doc_ids_raise(self) -> None: + # "1" collides with the positional-index fallback of the second entry, + # which has no doc_id and sits at index 1. + stub = _stub_returning([(0, 1.0)]) + corpus: List[Dict[str, Any]] = [{"doc_id": "1", "text": "alpha"}, {"text": "beta"}] + with pytest.raises(ValueError, match="duplicate doc_id"): + stub._retrieve("alpha", corpus, top_k=1) + + +class TestCalculateFeatureLimits: + """The family answers one query per run; a multi-feature set raises.""" + + def test_more_than_one_feature_raises(self) -> None: + stub = _stub_returning([(0, 1.0)]) + features = MagicMock() + features.features = [MagicMock(), MagicMock()] + with pytest.raises(ValueError, match="one query per run"): + stub.calculate_feature([], features) + + +class TestTopKParsing: + """A non-integer ``top_k`` option raises naming the key and the value.""" + + def test_garbage_top_k_raises(self) -> None: + options = Options(context={BaseRetrieveConnector.TOP_K: "not-an-int"}) + with pytest.raises(ValueError, match="top_k.*not-an-int"): + BaseRetrieveConnector._get_top_k(options) diff --git a/tests/connectors/retrieve/test_bm25s_retriever.py b/tests/connectors/retrieve/test_bm25s_retriever.py new file mode 100644 index 0000000..23c1703 --- /dev/null +++ b/tests/connectors/retrieve/test_bm25s_retriever.py @@ -0,0 +1,44 @@ +"""Contract test for :class:`Bm25sRetriever`. + +The whole suite is inherited from :class:`RetrieveConnectorContractBase`; this +class only wires up the five adapter methods. 
The corpus is crafted for a +lexical backend: the query shares both literal tokens (``cat``, ``pet``) only +with ``d2`` and just ``pet`` with ``d1``, so ``d2`` ranks first and ``d1`` is a +positively scoring runner-up (the family drops zero-scored passages, so the +score-margin assertion needs one); the distractors (mat, car) share none. +""" + +from __future__ import annotations + +from typing import Any, Dict, List, Type + +from rag_integration.feature_groups.connectors.retrieve.base import BaseRetrieveConnector +from rag_integration.feature_groups.connectors.retrieve.bm25s_retriever import Bm25sRetriever +from tests.connectors.retrieve.retrieve_contract import RetrieveConnectorContractBase + + +class TestBm25sRetriever(RetrieveConnectorContractBase): + @classmethod + def connector_class(cls) -> Type[BaseRetrieveConnector]: + return Bm25sRetriever + + @classmethod + def backend_value(cls) -> str: + return "bm25s" + + @classmethod + def sample_corpus(cls) -> List[Dict[str, Any]]: + return [ + {"doc_id": "d0", "text": "The mat lay flat on the floor by the window."}, + {"doc_id": "d1", "text": "A dog can be a loyal and energetic pet."}, + {"doc_id": "d2", "text": "A cat is an independent and curious pet."}, + {"doc_id": "d3", "text": "Cars need regular engine oil and maintenance."}, + ] + + @classmethod + def sample_query(cls) -> str: + return "cat pet" + + @classmethod + def expected_top_doc_id(cls) -> str: + return "d2" diff --git a/tests/connectors/retrieve/test_tfidf_retriever.py b/tests/connectors/retrieve/test_tfidf_retriever.py new file mode 100644 index 0000000..032efd5 --- /dev/null +++ b/tests/connectors/retrieve/test_tfidf_retriever.py @@ -0,0 +1,46 @@ +"""Contract test for :class:`TfidfRetriever` (zero-download CI anchor). + +The whole suite is inherited from :class:`RetrieveConnectorContractBase`; this +class only wires up the five adapter methods. 
The corpus is crafted so the +TF-IDF vectorizer separates the answer: the query shares both distinctive terms +``cat``/``pet`` only with ``d2`` and just ``pet`` with ``d1`` (the embedder +drops tokens of length <= 2, so short stop-ish words do not muddy the vectors), +so ``d2`` ranks first and ``d1`` is a positively scoring runner-up (the family +drops zero-scored passages, so the score-margin assertion needs one); the +distractors share none. +""" + +from __future__ import annotations + +from typing import Any, Dict, List, Type + +from rag_integration.feature_groups.connectors.retrieve.base import BaseRetrieveConnector +from rag_integration.feature_groups.connectors.retrieve.tfidf_retriever import TfidfRetriever +from tests.connectors.retrieve.retrieve_contract import RetrieveConnectorContractBase + + +class TestTfidfRetriever(RetrieveConnectorContractBase): + @classmethod + def connector_class(cls) -> Type[BaseRetrieveConnector]: + return TfidfRetriever + + @classmethod + def backend_value(cls) -> str: + return "tfidf" + + @classmethod + def sample_corpus(cls) -> List[Dict[str, Any]]: + return [ + {"doc_id": "d0", "text": "The mat lay flat on the floor by the window."}, + {"doc_id": "d1", "text": "A dog can be a loyal and energetic pet."}, + {"doc_id": "d2", "text": "A cat is an independent and curious pet."}, + {"doc_id": "d3", "text": "Cars need regular engine oil and maintenance."}, + ] + + @classmethod + def sample_query(cls) -> str: + return "cat pet" + + @classmethod + def expected_top_doc_id(cls) -> str: + return "d2" diff --git a/tests/connectors/structured/__init__.py b/tests/connectors/structured/__init__.py new file mode 100644 index 0000000..fb3abd0 --- /dev/null +++ b/tests/connectors/structured/__init__.py @@ -0,0 +1 @@ +"""structured connector-family tests.""" diff --git a/tests/connectors/structured/structured_contract.py b/tests/connectors/structured/structured_contract.py new file mode 100644 index 0000000..9ec41c1 --- /dev/null +++ 
b/tests/connectors/structured/structured_contract.py @@ -0,0 +1,210 @@ +"""Inheritable contract-test suite for the ``structured`` connector family. + +Beyond matching/shape, this suite proves the SQL actually ran (a count returns +the real row count) and the filter is precise (it returns exactly the matching +subset, not all rows), and that the base's SQL safety holds (non-SELECT and bad +identifiers are rejected). +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Set, Type + +import pytest + +from mloda.user import mlodaAPI, Feature, Options, PluginCollector +from mloda_plugins.compute_framework.base_implementations.python_dict.python_dict_framework import ( + PythonDictFramework, +) + +from rag_integration.feature_groups.connectors.structured.base import BaseStructuredConnector + + +class StructuredConnectorContractBase(ABC): + """Contract every structured (text-to-SQL) backend must satisfy.""" + + # -- Adapter methods ------------------------------------------------------ + + @classmethod + @abstractmethod + def connector_class(cls) -> Type[BaseStructuredConnector]: + """Return the concrete ``BaseStructuredConnector`` subclass under test.""" + + @classmethod + @abstractmethod + def backend_value(cls) -> str: + """Return the ``structured_backend`` value that selects this concrete.""" + + @classmethod + @abstractmethod + def table_name(cls) -> str: ... + + @classmethod + @abstractmethod + def columns(cls) -> List[str]: ... + + @classmethod + @abstractmethod + def rows(cls) -> List[Dict[str, Any]]: ... 
+ + @classmethod + @abstractmethod + def key_column(cls) -> str: + """A column whose values uniquely identify rows (used to check filter results).""" + + @classmethod + @abstractmethod + def count_question(cls) -> str: + """A natural-language 'how many' question over the whole table.""" + + @classmethod + @abstractmethod + def filter_question(cls) -> str: + """A question that should filter to a strict, non-empty subset of rows.""" + + @classmethod + @abstractmethod + def expected_filter_keys(cls) -> Set[str]: + """The ``key_column`` values expected from ``filter_question`` (a strict subset).""" + + @classmethod + @abstractmethod + def filter_value(cls) -> str: + """The literal value that ``filter_question`` filters on (used to prove binding).""" + + # -- Helpers -------------------------------------------------------------- + + @classmethod + def _query(cls, question: str) -> Dict[str, Any]: + connector = cls.connector_class() + return connector._query(question, cls.table_name(), cls.columns(), cls.rows()) + + @classmethod + def _run_all(cls, question: str) -> Dict[str, Any]: + connector = cls.connector_class() + feature = Feature( + connector.ROOT_FEATURE_NAME, + options=Options( + context={ + connector.STRUCTURED_BACKEND: cls.backend_value(), + connector.QUESTION: question, + connector.TABLE: cls.table_name(), + connector.COLUMNS: cls.columns(), + connector.ROWS: cls.rows(), + } + ), + ) + result = mlodaAPI.run_all( + [feature], + compute_frameworks={PythonDictFramework}, + plugin_collector=PluginCollector.enabled_feature_groups({connector}), + ) + for partition in result: + for row in partition: + if connector.ROOT_FEATURE_NAME in row: + answer: Dict[str, Any] = row[connector.ROOT_FEATURE_NAME] + return answer + raise AssertionError(f"run_all returned no '{connector.ROOT_FEATURE_NAME}' row: {result!r}") + + # -- Matching / honest surface -------------------------------------------- + + def test_matches_root_feature_for_declared_backend(self) -> None: + connector 
= self.connector_class() + opts = Options(context={connector.STRUCTURED_BACKEND: self.backend_value()}) + assert connector.match_feature_group_criteria(connector.ROOT_FEATURE_NAME, opts) is True + + def test_does_not_match_other_feature_name(self) -> None: + connector = self.connector_class() + opts = Options(context={connector.STRUCTURED_BACKEND: self.backend_value()}) + assert connector.match_feature_group_criteria("docs", opts) is False + + def test_unknown_backend_does_not_match(self) -> None: + connector = self.connector_class() + opts = Options(context={connector.STRUCTURED_BACKEND: "definitely_not_a_backend_xyz"}) + assert connector.match_feature_group_criteria(connector.ROOT_FEATURE_NAME, opts) is False + + def test_backend_declared_in_supported_set(self) -> None: + connector = self.connector_class() + assert self.backend_value() in connector.STRUCTURED_BACKENDS + + # -- Output contract ------------------------------------------------------ + + def test_result_shape(self) -> None: + result = self._query(self.count_question()) + assert set(result) >= {"sql", "rows"} + assert isinstance(result["sql"], str) + assert isinstance(result["rows"], list) + assert all(isinstance(row, dict) for row in result["rows"]) + + def test_count_question_returns_true_row_count(self) -> None: + """Not-a-stub proof: the SQL actually ran against the data.""" + result = self._query(self.count_question()) + assert len(result["rows"]) == 1 + (only_value,) = result["rows"][0].values() + assert only_value == len(self.rows()) + + def test_filter_question_is_precise(self) -> None: + """Not-a-stub proof: the filter returns exactly the matching subset, not all rows.""" + result = self._query(self.filter_question()) + keys = {row[self.key_column()] for row in result["rows"]} + assert keys == self.expected_filter_keys() + assert len(result["rows"]) < len(self.rows()), "filter did not narrow the result" + + def test_filter_value_is_bound_not_interpolated(self) -> None: + """Safety: the filter 
value reaches SQL as a ``?`` placeholder, never as a literal.""" + result = self._query(self.filter_question()) + assert "?" in result["sql"] + assert self.filter_value() not in result["sql"] + + def test_unmatched_question_lists_all_rows(self) -> None: + """Pin the list-all fallback: a question matching no intent returns every row.""" + result = self._query("") + keys = {row[self.key_column()] for row in result["rows"]} + assert keys == {row[self.key_column()] for row in self.rows()} + + def test_count_on_empty_table_returns_zero(self) -> None: + """Pin behavior on an empty table: a count question returns 0, not an error.""" + connector = self.connector_class() + result = connector._query(self.count_question(), self.table_name(), self.columns(), []) + (only_value,) = result["rows"][0].values() + assert only_value == 0 + + def test_rejects_non_select_sql(self) -> None: + """Safety: the base rejects any generated statement that is not a SELECT.""" + connector = self.connector_class() + with pytest.raises(ValueError): + connector._validate_select("DELETE FROM whatever") + + def test_rejects_bad_identifier(self) -> None: + """Safety: a non-identifier table/column is rejected (injection guard).""" + connector = self.connector_class() + with pytest.raises(ValueError): + connector._validate_identifier("a; DROP TABLE x", "column") + + def test_query_rejects_bad_table_identifier_end_to_end(self) -> None: + """Safety through the production path: a malicious table name is rejected + by ``_query`` itself, not only by the isolated validator.""" + connector = self.connector_class() + with pytest.raises(ValueError): + connector._query("anything", "pets; DROP TABLE pets", self.columns(), self.rows()) + + def test_query_rejects_bad_column_identifier_end_to_end(self) -> None: + """Safety through the production path: a malicious column name is rejected + by ``_query`` itself, not only by the isolated validator.""" + connector = self.connector_class() + bad_columns = 
[*self.columns(), 'evil" FROM x; --'] + with pytest.raises(ValueError): + connector._query("anything", self.table_name(), bad_columns, self.rows()) + + def test_idempotent(self) -> None: + first = self._query(self.filter_question()) + second = self._query(self.filter_question()) + assert first == second + + # -- End to end ----------------------------------------------------------- + + def test_end_to_end_run_all(self) -> None: + result = self._run_all(self.count_question()) + (only_value,) = result["rows"][0].values() + assert only_value == len(self.rows()) diff --git a/tests/connectors/structured/test_aggregate_sql.py b/tests/connectors/structured/test_aggregate_sql.py new file mode 100644 index 0000000..8ac4123 --- /dev/null +++ b/tests/connectors/structured/test_aggregate_sql.py @@ -0,0 +1,116 @@ +"""Contract test for :class:`AggregateSql` (zero-download CI anchor). + +Inherits the whole structured contract suite (count/filter/safety), then adds a +backend-specific proof: an aggregation question runs a real aggregate query and +returns a known computed value (avg of ``[2, 3, 5, 2]`` = ``3.0``), which the +count/filter-only sibling cannot answer. 
+""" + +from __future__ import annotations + +from typing import Any, Dict, List, Set, Type + +from rag_integration.feature_groups.connectors.structured.aggregate_sql import AggregateSql +from rag_integration.feature_groups.connectors.structured.base import BaseStructuredConnector +from tests.connectors.structured.structured_contract import StructuredConnectorContractBase + + +class TestAggregateSql(StructuredConnectorContractBase): + @classmethod + def connector_class(cls) -> Type[BaseStructuredConnector]: + return AggregateSql + + @classmethod + def backend_value(cls) -> str: + return "aggregate" + + @classmethod + def table_name(cls) -> str: + return "pets" + + @classmethod + def columns(cls) -> List[str]: + return ["name", "species", "age"] + + @classmethod + def rows(cls) -> List[Dict[str, Any]]: + return [ + {"name": "Whiskers", "species": "cat", "age": 2}, + {"name": "Rex", "species": "dog", "age": 3}, + {"name": "Felix", "species": "cat", "age": 5}, + {"name": "Tom", "species": "cat", "age": 2}, + ] + + @classmethod + def key_column(cls) -> str: + return "name" + + @classmethod + def count_question(cls) -> str: + return "how many pets are there" + + @classmethod + def filter_question(cls) -> str: + return "which pets have species cat" + + @classmethod + def expected_filter_keys(cls) -> Set[str]: + return {"Whiskers", "Felix", "Tom"} + + @classmethod + def filter_value(cls) -> str: + return "cat" + + # -- Backend-specific proof: aggregation ---------------------------------- + + def test_average_question_returns_computed_value(self) -> None: + """Not-a-stub proof for this backend: an aggregation question runs a real + AVG over the column and returns the known value (avg of [2,3,5,2] = 3.0).""" + result = self._query("what is the average age") + assert 'AVG("age")' in result["sql"] + assert len(result["rows"]) == 1 + (only_value,) = result["rows"][0].values() + assert only_value == 3.0 + + def test_max_question_returns_computed_value(self) -> None: + """A 
second aggregate intent: MAX over the column returns the known max.""" + result = self._query("what is the maximum age") + assert 'MAX("age")' in result["sql"] + (only_value,) = result["rows"][0].values() + assert only_value == 5 + + def test_aggregate_prefers_column_after_the_cue(self) -> None: + """A non-target column named before the cue must not win: "the species with + the highest age" aggregates age, not species.""" + result = self._query("what is the species with the highest age") + assert 'MAX("age")' in result["sql"] + (only_value,) = result["rows"][0].values() + assert only_value == 5 + + def test_aggregate_falls_back_to_column_before_the_cue(self) -> None: + """If no column follows the cue, the any-position match still aggregates.""" + result = self._query("what is the age maximum") + assert 'MAX("age")' in result["sql"] + (only_value,) = result["rows"][0].values() + assert only_value == 5 + + def test_count_with_filter_counts_only_matching_rows(self) -> None: + """Count intent must keep the filter: COUNT over cats only, not all rows.""" + result = self._query("how many pets have species cat") + assert "?" 
in result["sql"] + (only_value,) = result["rows"][0].values() + assert only_value == 3 + + def test_snake_case_column_aggregates(self) -> None: + """The tokenizer keeps underscores, so snake_case columns are recognised.""" + rows = [{"pet_name": "Rex", "unit_price": 2}, {"pet_name": "Felix", "unit_price": 4}] + result = AggregateSql._query("what is the average unit_price", "items", ["pet_name", "unit_price"], rows) + assert 'AVG("unit_price")' in result["sql"] + (only_value,) = result["rows"][0].values() + assert only_value == 3.0 + + def test_average_over_empty_table_returns_none(self) -> None: + """Pin behavior on an empty table: AVG returns a single row holding None.""" + result = AggregateSql._query("what is the average age", self.table_name(), self.columns(), []) + (only_value,) = result["rows"][0].values() + assert only_value is None diff --git a/tests/connectors/structured/test_base_safety.py b/tests/connectors/structured/test_base_safety.py new file mode 100644 index 0000000..aae99e0 --- /dev/null +++ b/tests/connectors/structured/test_base_safety.py @@ -0,0 +1,131 @@ +"""Base-level safety tests for the structured family, through the production path. + +Pins that ``BaseStructuredConnector._query`` itself rejects a non-SELECT or +stacked statement produced by a backend (not just the isolated +``_validate_select``), so a regression that dropped the guard from ``_query`` +would fail here. Also pins identifier handling: reserved-word identifiers work +(double-quoted interpolation) and case-insensitive duplicate columns are +rejected, plus the ``calculate_feature`` option-type guards. 
+""" + +from __future__ import annotations + +from typing import Any, Dict, List, Tuple +from unittest.mock import MagicMock + +import pytest + +from rag_integration.feature_groups.connectors.structured.base import BaseStructuredConnector +from rag_integration.feature_groups.connectors.structured.rule_based_sql import RuleBasedSql + + +class _DeleteBackend(BaseStructuredConnector): + """A deliberately malicious backend that emits a non-SELECT statement.""" + + STRUCTURED_BACKENDS = {"_delete_stub": "test-only stub"} + + @classmethod + def _to_sql(cls, question: str, table: str, columns: List[str]) -> Tuple[str, List[Any]]: + return "DELETE FROM pets", [] + + +class _StackedBackend(BaseStructuredConnector): + """A deliberately malicious backend that stacks a write after a SELECT.""" + + STRUCTURED_BACKENDS = {"_stacked_stub": "test-only stub"} + + @classmethod + def _to_sql(cls, question: str, table: str, columns: List[str]) -> Tuple[str, List[Any]]: + return "SELECT 1; DROP TABLE pets", [] + + +class _UnterminatedBackend(BaseStructuredConnector): + """A broken backend whose SQL fails sqlglot tokenization (unterminated string).""" + + STRUCTURED_BACKENDS = {"_unterminated_stub": "test-only stub"} + + @classmethod + def _to_sql(cls, question: str, table: str, columns: List[str]) -> Tuple[str, List[Any]]: + return "SELECT 'unterminated FROM pets", [] + + +def test_query_rejects_non_select_through_production_path() -> None: + with pytest.raises(ValueError): + _DeleteBackend._query("delete everything", "pets", ["name"], [{"name": "Rex"}]) + + +def test_query_rejects_stacked_statements_through_production_path() -> None: + """A stacked query ("SELECT 1; DROP TABLE pets") is rejected by ``_query`` itself.""" + with pytest.raises(ValueError): + _StackedBackend._query("anything", "pets", ["name"], [{"name": "Rex"}]) + + +def test_validate_select_rejects_stacked_statements() -> None: + """Regression: older sqlglot ``parse_one`` silently accepted stacked queries.""" + with 
pytest.raises(ValueError): + BaseStructuredConnector._validate_select("SELECT 1; DROP TABLE pets") + + +def test_validate_select_rejects_tokenize_errors() -> None: + """A tokenizer failure (unterminated string literal) surfaces as ValueError, not raw.""" + with pytest.raises(ValueError): + BaseStructuredConnector._validate_select("SELECT 'unterminated FROM pets") + + +def test_query_rejects_tokenize_errors_through_production_path() -> None: + with pytest.raises(ValueError): + _UnterminatedBackend._query("anything", "pets", ["name"], [{"name": "Rex"}]) + + +def test_validate_select_accepts_quoted_identifiers() -> None: + """The double-quoted identifiers the family generates pass the SQL guard.""" + BaseStructuredConnector._validate_select('SELECT * FROM "order" WHERE LOWER("from") = ?') + + +def test_reserved_word_identifiers_work_end_to_end() -> None: + """A table named "order" with a column named "from" works (quoted interpolation).""" + rows = [{"from": "berlin", "value": 1}, {"from": "munich", "value": 2}] + result = RuleBasedSql._query("which entries have from berlin", "order", ["from", "value"], rows) + assert [row["value"] for row in result["rows"]] == [1] + + +def test_query_rejects_duplicate_columns_case_insensitively() -> None: + """SQLite column names are case-insensitive, so ["Name", "name"] must be rejected.""" + with pytest.raises(ValueError): + RuleBasedSql._query("anything", "pets", ["Name", "name"], [{"Name": "Rex"}]) + + +def _make_features(context: Dict[str, Any]) -> Any: + """Build a minimal FeatureSet mock whose options resolve from ``context``.""" + feature = MagicMock() + feature.options.get.side_effect = context.get + features = MagicMock() + features.features = [feature] + return features + + +def test_calculate_feature_rejects_non_list_columns() -> None: + """A plain-string COLUMNS must not be iterated into characters.""" + features = _make_features( + { + BaseStructuredConnector.QUESTION: "how many", + BaseStructuredConnector.TABLE: 
"pets", + BaseStructuredConnector.COLUMNS: "name", + BaseStructuredConnector.ROWS: [{"name": "Rex"}], + } + ) + with pytest.raises(ValueError, match="columns"): + RuleBasedSql.calculate_feature(None, features) + + +def test_calculate_feature_rejects_non_dict_rows() -> None: + features = _make_features( + { + BaseStructuredConnector.QUESTION: "how many", + BaseStructuredConnector.TABLE: "pets", + BaseStructuredConnector.COLUMNS: ["name"], + BaseStructuredConnector.ROWS: ["Rex"], + } + ) + with pytest.raises(ValueError, match="rows"): + RuleBasedSql.calculate_feature(None, features) diff --git a/tests/connectors/structured/test_rule_based_sql.py b/tests/connectors/structured/test_rule_based_sql.py new file mode 100644 index 0000000..9e5fdc1 --- /dev/null +++ b/tests/connectors/structured/test_rule_based_sql.py @@ -0,0 +1,79 @@ +"""Contract test for :class:`RuleBasedSql` (zero-download CI anchor).""" + +from __future__ import annotations + +from typing import Any, Dict, List, Set, Type + +from rag_integration.feature_groups.connectors.structured.base import BaseStructuredConnector +from rag_integration.feature_groups.connectors.structured.rule_based_sql import RuleBasedSql +from tests.connectors.structured.structured_contract import StructuredConnectorContractBase + + +class TestRuleBasedSql(StructuredConnectorContractBase): + @classmethod + def connector_class(cls) -> Type[BaseStructuredConnector]: + return RuleBasedSql + + @classmethod + def backend_value(cls) -> str: + return "rule_based" + + @classmethod + def table_name(cls) -> str: + return "pets" + + @classmethod + def columns(cls) -> List[str]: + return ["name", "species", "age"] + + @classmethod + def rows(cls) -> List[Dict[str, Any]]: + return [ + {"name": "Whiskers", "species": "cat", "age": 3}, + {"name": "Rex", "species": "dog", "age": 5}, + {"name": "Felix", "species": "cat", "age": 2}, + ] + + @classmethod + def key_column(cls) -> str: + return "name" + + @classmethod + def count_question(cls) -> str: + 
return "how many pets are there" + + @classmethod + def filter_question(cls) -> str: + return "which pets have species cat" + + @classmethod + def expected_filter_keys(cls) -> Set[str]: + return {"Whiskers", "Felix"} + + @classmethod + def filter_value(cls) -> str: + return "cat" + + # -- Backend-specific behavior --------------------------------------------- + + def test_count_with_filter_counts_only_matching_rows(self) -> None: + """Count intent must keep the filter: COUNT over cats only, not all rows.""" + result = self._query("how many pets have species cat") + assert "?" in result["sql"] + (only_value,) = result["rows"][0].values() + assert only_value == 2 + + def test_snake_case_column_maps_to_filter(self) -> None: + """The tokenizer keeps underscores, so snake_case columns are recognised.""" + rows = [{"pet_name": "Rex", "unit_price": 5}, {"pet_name": "Felix", "unit_price": 7}] + result = RuleBasedSql._query("which items have unit_price 5", "items", ["pet_name", "unit_price"], rows) + assert 'LOWER("unit_price") = ?' in result["sql"] + assert [row["pet_name"] for row in result["rows"]] == ["Rex"] + + def test_decimal_filter_value_binds_full_number(self) -> None: + """The tokenizer keeps decimals, so "2.5" binds whole (not "2" then "5").""" + rows = [{"name": "Rex", "age": 2.5}, {"name": "Felix", "age": 3}] + result = RuleBasedSql._query("which pets have age 2.5", "pets", ["name", "age"], rows) + assert "?" 
in result["sql"] + assert "2.5" not in result["sql"] + assert [row["name"] for row in result["rows"]] == ["Rex"] diff --git a/tox.ini b/tox.ini index 4bfcac6..881397d 100644 --- a/tox.ini +++ b/tox.ini @@ -7,12 +7,18 @@ usedevelop = true extras = dev advanced + connectors + rerank + graph + structured + orchestrator passenv = CI UV_CACHE_DIR setenv = KMP_DUPLICATE_LIB_OK = TRUE OMP_NUM_THREADS = 1 + HAYSTACK_TELEMETRY_ENABLED = False allowlist_externals = sh commands = pytest @@ -24,7 +30,7 @@ commands = [testenv:security] # CVE scanning environment - builds and scans the local package usedevelop = true -extras = dev,advanced +extras = dev,advanced,connectors,rerank,graph,structured,orchestrator deps = pip-audit allowlist_externals = sh, mkdir diff --git a/uv.lock b/uv.lock index c6e4120..818f336 100644 --- a/uv.lock +++ b/uv.lock @@ -263,6 +263,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/64/b4/17d4b0b2a2dc85a6df63d1157e028ed19f90d4cd97c36717afef2bc2f395/attrs-26.1.0-py3-none-any.whl", hash = "sha256:c647aa4a12dfbad9333ca4e71fe62ddc36f4e63b2d260a37a8b83d2f043ac309", size = 67548, upload-time = "2026-03-19T14:22:23.645Z" }, ] +[[package]] +name = "backoff" +version = "2.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/47/d7/5bbeb12c44d7c4f2fb5b56abce497eb5ed9f34d85701de869acedd602619/backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba", size = 17001, upload-time = "2022-10-05T19:19:32.061Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/df/73/b6e24bd22e6720ca8ee9a85a0c4a2971af8497d8f3193fa05390cbd46e09/backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8", size = 15148, upload-time = "2022-10-05T19:19:30.546Z" }, +] + [[package]] name = "bandit" version = "1.9.4" @@ -339,6 +348,19 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/0a/de/acae8e9f9a1f4bb393d41c8265898b0f29772e38eac14e9f69d191e2c006/blis-1.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:9e5fdf4211b1972400f8ff6dafe87cb689c5d84f046b4a76b207c0bd2270faaf", size = 6324695, upload-time = "2025-11-17T12:28:28.401Z" }, ] +[[package]] +name = "bm25s" +version = "0.3.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.4.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7c/ab/39d08d4589dad6f5735a6b03ee083088dc37fba57fa45d4a0749393b9295/bm25s-0.3.9.tar.gz", hash = "sha256:895c679d952b7de8355edb5f3e1a620a1e2f294d1d42b919bf0821cce2e2f597", size = 80528, upload-time = "2026-05-13T23:08:20.952Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/48/83/06582913e2af0cf498cc478350070490b019c6f6b523cb8b6cfafd32f009/bm25s-0.3.9-py3-none-any.whl", hash = "sha256:db997475e82c9e81262b83fefe6a71304f2fdd0dafd037c8929828865fa6d465", size = 74475, upload-time = "2026-05-13T23:08:19.557Z" }, +] + [[package]] name = "cachetools" version = "7.1.4" @@ -695,6 +717,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = "2025-07-17T16:51:58.613Z" }, ] +[[package]] +name = "distro" +version = "1.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, 
upload-time = "2023-12-24T09:54:32.31Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" }, +] + +[[package]] +name = "docstring-parser" +version = "0.18.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e0/4d/f332313098c1de1b2d2ff91cf2674415cc7cddab2ca1b01ae29774bd5fdf/docstring_parser-0.18.0.tar.gz", hash = "sha256:292510982205c12b1248696f44959db3cdd1740237a968ea1e2e7a900eeb2015", size = 29341, upload-time = "2026-04-14T04:09:19.867Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/5f/ed01f9a3cdffbd5a008556fc7b2a08ddb1cc6ace7effa7340604b1d16699/docstring_parser-0.18.0-py3-none-any.whl", hash = "sha256:b3fcbed555c47d8479be0796ef7e19c2670d428d72e96da63f3a40122860374b", size = 22484, upload-time = "2026-04-14T04:09:18.638Z" }, +] + [[package]] name = "exceptiongroup" version = "1.3.1" @@ -737,6 +777,41 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/47/dd9a212ef6e343a6857485ffe25bba537304f1913bdbed446a23f7f592e1/filelock-3.29.0-py3-none-any.whl", hash = "sha256:96f5f6344709aa1572bbf631c640e4ebeeb519e08da902c39a001882f30ac258", size = 39812, upload-time = "2026-04-19T15:39:08.752Z" }, ] +[[package]] +name = "filetype" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bb/29/745f7d30d47fe0f251d3ad3dc2978a23141917661998763bebb6da007eb1/filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb", size = 998020, upload-time = "2022-11-02T17:34:04.141Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/18/79/1b8fa1bb3568781e84c9200f951c735f3f157429f44be0495da55894d620/filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25", size = 19970, upload-time = "2022-11-02T17:34:01.425Z" }, +] + +[[package]] +name = "flashrank" +version = "0.2.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.4.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "onnxruntime", version = "1.24.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "onnxruntime", version = "1.26.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "requests" }, + { name = "tokenizers" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/55/1f/176cb4a857a70c3538f637e19389ab6aed21548a1ba1d1424fccc8bba108/FlashRank-0.2.10.tar.gz", hash = "sha256:f8f82a25c32fdfc668a09dc4089421d6aab8e7f71308424b541f40bb3f01d9db", size = 18905, upload-time = "2025-01-06T13:33:01.657Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/99/72639cc1c9221c5bc77a2df1c2d352fe11965553bdf7d3e0856e7fcc8fd6/FlashRank-0.2.10-py3-none-any.whl", hash = "sha256:5d3272ae657d793c132d1e7917ed9e2adf49e0e1c60735583a67b051c6f0434a", size = 14511, upload-time = "2025-01-06T13:32:59.42Z" }, +] + +[[package]] +name = "flatbuffers" +version = "25.12.19" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e8/2d/d2a548598be01649e2d46231d151a6c56d10b964d94043a335ae56ea2d92/flatbuffers-25.12.19-py2.py3-none-any.whl", hash = 
"sha256:7634f50c427838bb021c2d66a3d1168e9d199b0607e6329399f04846d42e20b4", size = 26661, upload-time = "2025-12-19T23:16:13.622Z" }, +] + [[package]] name = "frozenlist" version = "1.8.0" @@ -881,6 +956,50 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, ] +[[package]] +name = "haystack-ai" +version = "2.30.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "docstring-parser" }, + { name = "filetype" }, + { name = "haystack-experimental" }, + { name = "httpx" }, + { name = "jinja2" }, + { name = "jsonschema" }, + { name = "lazy-imports" }, + { name = "markupsafe" }, + { name = "more-itertools" }, + { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "networkx", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.4.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "openai" }, + { name = "posthog" }, + { name = "pydantic" }, + { name = "python-dateutil" }, + { name = "pyyaml" }, + { name = "tenacity" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f4/64/6addccb6335cf551f2e2fa1ffc05769939a8d652246d8d6ca7de24a0a1de/haystack_ai-2.30.0.tar.gz", hash = "sha256:e9f677c5e32eb0217d9dac5f13cc0d940517a50f35bdcf02fc9e619e60128bce", size = 498234, upload-time = "2026-06-03T10:21:16.21Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/28/8d/73ae92b456dd2b33ffc0fc8c0b59fc6dbf30d4942b2adec21ed3058a7bd9/haystack_ai-2.30.0-py3-none-any.whl", hash = "sha256:7342544218c331ebcf2444ba43da5971f7376baf412c37617dbafd6332151975", size = 719742, upload-time = "2026-06-03T10:21:14.43Z" }, +] + +[[package]] +name = "haystack-experimental" +version = "0.19.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "haystack-ai" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/71/a3/02eb86716f4856072feb852555f2d23855bb20c993264dcf4e83dfe87a8a/haystack_experimental-0.19.0.tar.gz", hash = "sha256:194f9074f9184a20d2f4efa7b5082dd33118bc886f87937d13e33616cd549067", size = 45328, upload-time = "2026-02-05T08:31:33.862Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/58/c4/6719e1b03b72ab31729b556a47fe3df1ab7fa233b1d01182f0c5ddadfda6/haystack_experimental-0.19.0-py3-none-any.whl", hash = "sha256:ebe1691a4b8d06f934bad792ff95fb4b6858a2ae08f08519e9cf35b7b439b4bd", size = 63150, upload-time = "2026-02-05T08:31:34.683Z" }, +] + [[package]] name = "hf-xet" version = "1.5.0" @@ -991,6 +1110,109 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, ] +[[package]] +name = "jiter" +version = "0.15.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/66/b5/55f06bb281d92fb3cc86d14e1def2bd908bb77693183e7cb1f5a3c388b0c/jiter-0.15.0.tar.gz", hash = "sha256:4251acc80e2b7c9b7b8823456ea0fceeb0734dac2df7636d3c711b38476b5a76", size = 166640, upload-time = "2026-05-19T10:09:48.361Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/1d/da/76a2c7e510ba15fe323d9509c223ab272da79ea59f54488f4a78da6426db/jiter-0.15.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:edebcf7d1f601199084bb6e844d7dc67e03e04f6ac786b0332d616635c4ff7a4", size = 310849, upload-time = "2026-05-19T10:06:51.944Z" }, + { url = "https://files.pythonhosted.org/packages/5d/8e/827be942883a4dc0862c48626ff41af3320b1902d136a0bf4b9041f2c567/jiter-0.15.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9f924585cdacf631cd382b657966847bb537bf9ed0a6f9b991da5f05a631480f", size = 314991, upload-time = "2026-05-19T10:06:53.522Z" }, + { url = "https://files.pythonhosted.org/packages/6d/38/be2832be361ba1b9517c76f46d30b64e985be1dd43c974f4c3a4b1844436/jiter-0.15.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abbf258599526ad0326fe51e252e24f2bd6f24f1852681b4b78feda3808f1d18", size = 340843, upload-time = "2026-05-19T10:06:55.071Z" }, + { url = "https://files.pythonhosted.org/packages/6d/d8/90f01fb83c0c7ba509303ec93e32a308fbfa167d264860b01c0fd0dbbd06/jiter-0.15.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7c468136b8bd6bb18c8786e4236a1fa27362f24cb23450ba0cb204ab379b8e6f", size = 365116, upload-time = "2026-05-19T10:06:56.893Z" }, + { url = "https://files.pythonhosted.org/packages/91/38/94593d34f8c67a0b6f6cbc027f016ffa9780b3a858a7a86f6fd7a15bcc1e/jiter-0.15.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05906b93d72f03339e6bb7cf8dc10ebda64a0266126eed6beba79e20abcf5fd4", size = 457970, upload-time = "2026-05-19T10:06:58.707Z" }, + { url = "https://files.pythonhosted.org/packages/df/04/d79962dd49d00c97e2a9b4cacea1947904d02135936960351f9a96d4c1a6/jiter-0.15.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:30ce785d2adb8e32c3f7741442370a74834ec4c01f3c48f0750227a0b4ef27d6", size = 375744, upload-time = "2026-05-19T10:07:00.471Z" }, + { url = 
"https://files.pythonhosted.org/packages/c3/2e/5d37abe2be0e819c21e2338bebd410e481763ce526a9138c8c3652fa0123/jiter-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fd73e3da91a0a722d67165e849ce2cdc10de0e0d48738c142be8c6c5f310f4c", size = 349609, upload-time = "2026-05-19T10:07:01.829Z" }, + { url = "https://files.pythonhosted.org/packages/7a/90/98768ad2ed90c1fda15d64157de2dfbf73c1c074d4b1bfaca915480bc7cf/jiter-0.15.0-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:ceb8fc27d38793f9c97149be8302720c5b22e5c195a37bf2c45dc36c4600a512", size = 354366, upload-time = "2026-05-19T10:07:03.587Z" }, + { url = "https://files.pythonhosted.org/packages/d6/c4/fbfb806209f1fe4b7dccdfb07bc62bb044300734a945b06fd64db446ef6a/jiter-0.15.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d726e3ceeb337191324b49de298142f27c3ad10886341555d1d5315b5f252c6a", size = 393519, upload-time = "2026-05-19T10:07:05.08Z" }, + { url = "https://files.pythonhosted.org/packages/37/1c/b9c257cd70cb453b6d10f3ebf0402cdb11669ab455389096f09839670290/jiter-0.15.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:2c8aea7781d2a372227871de4e1a1332aa96f5a89fd76c5e835dafdbad102887", size = 519952, upload-time = "2026-05-19T10:07:06.589Z" }, + { url = "https://files.pythonhosted.org/packages/a9/1a/aa85027db7ab15829c12feebbc33b404f53fc399bd559d85fd0d6365ff0d/jiter-0.15.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cf4bd113a69c0a740e27cb962ce10630c36d2b8f59d759a651b955ee9d18a823", size = 550770, upload-time = "2026-05-19T10:07:08.228Z" }, + { url = "https://files.pythonhosted.org/packages/d4/54/8c3f65c8a5687925e84708f19d63f7f37d28e2b86a48d951702ad94424d8/jiter-0.15.0-cp310-cp310-win32.whl", hash = "sha256:d92a5cd21fdb083931d546c207aa29633787c5dc5b02daab2d32b843f88a2c53", size = 209303, upload-time = "2026-05-19T10:07:10.006Z" }, + { url = 
"https://files.pythonhosted.org/packages/d5/72/0528a1eb9f42dd2d8228a0711458628f35924d131f623eaebc35fd23d3d4/jiter-0.15.0-cp310-cp310-win_amd64.whl", hash = "sha256:e58585a58209d72691ce2d62a9147445f5a87beb0bde97fde284c96ae392a3d1", size = 200404, upload-time = "2026-05-19T10:07:11.426Z" }, + { url = "https://files.pythonhosted.org/packages/e4/13/daa722f5765c393576f466378f9dfd29d77c9bed939e0688f96afa3601ea/jiter-0.15.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:0f862193b8696249d22ec433e85fd2ab0ad9596bc3e45e6c0bc55e8aeba97be2", size = 310899, upload-time = "2026-05-19T10:07:12.89Z" }, + { url = "https://files.pythonhosted.org/packages/7f/82/2d2551829b082f4b6d82b9f939b031fb808a10aab1ec0664f82e150bb9a2/jiter-0.15.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1303d4d68a9b051ea90502402063ecf3807da00ad2affa19ca1ae3b90b3c5f67", size = 314963, upload-time = "2026-05-19T10:07:14.539Z" }, + { url = "https://files.pythonhosted.org/packages/2a/0a/8b1a51466f7fe9f31dbe4bc7e0ca848674f9825e0f737b929b97e8c60aa7/jiter-0.15.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:392b8ab019e5502d08aff85c6272209c24bc2cbe706ea82a56368f524236614a", size = 341730, upload-time = "2026-05-19T10:07:15.869Z" }, + { url = "https://files.pythonhosted.org/packages/f6/2a/e71dea19822e2e404e83992a08c1d6b9b617bb944f28c9c2fbd85d02c91e/jiter-0.15.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:773b6eb282ce11ee19f05f6b2d4404fa308e5bbd353b0b80a0262caad6db2cd7", size = 366214, upload-time = "2026-05-19T10:07:17.259Z" }, + { url = "https://files.pythonhosted.org/packages/c4/59/97e1fa539d124a509a00ab7f669289d1c1d236ecabf12948a18f16c91082/jiter-0.15.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8d2c0c44d569ce0f2850f5c926f8caeb5f245fbc84475aeb36efccc2103e6dbd", size = 459527, upload-time = "2026-05-19T10:07:18.741Z" }, + { url = 
"https://files.pythonhosted.org/packages/d1/7a/4a68d331aef8cf2e2393c14a3aacb635c62aa86071b0229899fb5baaa907/jiter-0.15.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:032396229564bca02440396bd327710719f724f5e7b7e9f7a8eb3faa4a2c2281", size = 375451, upload-time = "2026-05-19T10:07:20.208Z" }, + { url = "https://files.pythonhosted.org/packages/7b/7e/1c445c2b6f0e30a274dc8082e0c3c7825411cce80d726bccd697c98cc8d3/jiter-0.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3d37768fce7f88dd2a8c6091f2325dea27d30d30d5c6e7a1c0f0af77723b708", size = 349428, upload-time = "2026-05-19T10:07:22.372Z" }, + { url = "https://files.pythonhosted.org/packages/00/94/e20d38984fc17a636371bffd2ae0f698124fdc8e75ef969cd2da6ba7cea7/jiter-0.15.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:2c9cb907439d20bd0c7d7565ca01ee52234203208433749bae5b516907526928", size = 355405, upload-time = "2026-05-19T10:07:23.916Z" }, + { url = "https://files.pythonhosted.org/packages/94/fa/4d09f814779d0ea80a28ed8e4c6662ec9a4a8ecef0ac52190ebac6262d14/jiter-0.15.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9100ddbec09741cc66feb0fc6773f8bdbd0e3c345689368f260082ff85dcc0cd", size = 393688, upload-time = "2026-05-19T10:07:25.854Z" }, + { url = "https://files.pythonhosted.org/packages/54/9d/8eb5d4fb8bf7e93a75964a5da71a75c67c864baf7fa3f98598187b3c7e57/jiter-0.15.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ae1b0d82ac2d987f9ea512b1c9adfcc71a28de3dea3a6039b54d76cffda9901e", size = 520853, upload-time = "2026-05-19T10:07:27.303Z" }, + { url = "https://files.pythonhosted.org/packages/e7/2c/5e07874e59e623a943a0acf1552a80d05b70f31b402287a8fc6d7ec634c7/jiter-0.15.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8020c99ec13a7db2b6f96cbe82ef4721c88b426a4892f27478044af0284615ef", size = 551016, upload-time = "2026-05-19T10:07:28.846Z" }, + { url = 
"https://files.pythonhosted.org/packages/22/ed/d2d34422143474cadc15b60d482b1c35683dbc5c63c24346ddd0df09bcaf/jiter-0.15.0-cp311-cp311-win32.whl", hash = "sha256:42bfb257930800cf43e7c62c832402c704ab60797c992faf88d20e903eac8f32", size = 209518, upload-time = "2026-05-19T10:07:30.431Z" }, + { url = "https://files.pythonhosted.org/packages/1d/7d/52778b930e5cc3e52a37d950b1c10494244308b4329b25a0ff0d88303a81/jiter-0.15.0-cp311-cp311-win_amd64.whl", hash = "sha256:860a74063284a2ae9bfedd694f299cc2c68e2696c5f3d440cc9d18bb81b9dd04", size = 200565, upload-time = "2026-05-19T10:07:32.125Z" }, + { url = "https://files.pythonhosted.org/packages/3b/4f/d9b4067feb69b3fa6eb0488e1b59e2ad5b463fe39f59e527eab2aca00bb0/jiter-0.15.0-cp311-cp311-win_arm64.whl", hash = "sha256:37a10c377ce3a4a85f4a67f28b7afe093154cde77eaf248a72e856aa08b4d865", size = 195488, upload-time = "2026-05-19T10:07:33.846Z" }, + { url = "https://files.pythonhosted.org/packages/44/53/4f6bddbcde3c71e56d0aa1337ec95950f3d27dd4153e25aadf0feac71751/jiter-0.15.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:0e90a1c315a0226ec822d973817967f9223b7701546c8c2a7913e7ab0926294d", size = 308793, upload-time = "2026-05-19T10:07:35.25Z" }, + { url = "https://files.pythonhosted.org/packages/01/84/c01099b59a285a1ebba64ae93f62bfa036675340fd1b0045ae65890a0442/jiter-0.15.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8c9004af7c8d67cce7f1aae1026fb55607f4aa600710d08ede3a3ce4aeefe7e0", size = 309570, upload-time = "2026-05-19T10:07:36.919Z" }, + { url = "https://files.pythonhosted.org/packages/58/64/8fb7f9d45bb98190355454cd04dad8d8f27223d6bd52f83af07f637168a6/jiter-0.15.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c210f8b35dc6f30aafd4b4365ca89b9d1189f21ab49b8e68fa6322a847aef138", size = 336783, upload-time = "2026-05-19T10:07:38.694Z" }, + { url = 
"https://files.pythonhosted.org/packages/c3/b6/f5739011d009b3a30f6a53c5240979030ba29ae46a8c67e3a15759f7c37d/jiter-0.15.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5f30bae8bc1c2d613e28e5af3e8cceb09b742f1c8a8a5f839fb67afaffc03b61", size = 363555, upload-time = "2026-05-19T10:07:40.832Z" }, + { url = "https://files.pythonhosted.org/packages/e5/12/98a9d9f766665e8a3b6252454e17cb0c464606a28cf2fa09399b003345fa/jiter-0.15.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c60e71b6d10cfc284c9bf36bd885e8d44c46f688ce50aa91b5edd90181dea687", size = 452255, upload-time = "2026-05-19T10:07:42.62Z" }, + { url = "https://files.pythonhosted.org/packages/e8/d5/60f972840f79c5e7544fce567c56f1e4e50468f996baba3e78d823dd62a6/jiter-0.15.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ab068bce62a45aa3e7367eceaffb5dde60b7eb853be8dece45132e3d0ff4879", size = 373559, upload-time = "2026-05-19T10:07:44.201Z" }, + { url = "https://files.pythonhosted.org/packages/ee/cf/d46ef1234ba335aabc2f013210db8e0821a22f5e644a2e9449df199ecc23/jiter-0.15.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa248c9eb220197d363f688818dac2fd4b2f0cd7d843ca7105d652034823427d", size = 346055, upload-time = "2026-05-19T10:07:46.005Z" }, + { url = "https://files.pythonhosted.org/packages/f0/63/4d2749d8d54d230bad9b3a6b0d00cc28c6ff6b2fdffc26a8ccf76cc5a974/jiter-0.15.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:2a77aadd57cac1682e4401a72724d2796d89a4ba129b1a5812aa94ee480826eb", size = 351406, upload-time = "2026-05-19T10:07:47.855Z" }, + { url = "https://files.pythonhosted.org/packages/d9/b9/9965b990035d8773328e0a8c8b457a87bf2b19f6c4126d9d99296be5d16a/jiter-0.15.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2ae901f3a55bfafdde31d289590fa25e3245735a2b1e8c7cc15871710a002871", size = 389357, upload-time = "2026-05-19T10:07:49.665Z" }, + { url = 
"https://files.pythonhosted.org/packages/2d/55/9ddf903deda1413e87fed792f416b7123daee5b8efbad6a202a7421c36a5/jiter-0.15.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:f0b271b462769543716f92d3a4f90527df6ef5ed05ee95ec4137f513e21e1b77", size = 517263, upload-time = "2026-05-19T10:07:51.537Z" }, + { url = "https://files.pythonhosted.org/packages/e8/76/a0c40ad064d3a20a4fde231e35d56e9a01ce82164278180e82d5daf85469/jiter-0.15.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:2fb6a5d26af81fc0f00f9360a891e05cf755e149bba391c4d563adc54812973d", size = 548646, upload-time = "2026-05-19T10:07:53.196Z" }, + { url = "https://files.pythonhosted.org/packages/23/4f/eca9b954942916ba2f453891b8593ab444cd872396fe66a3936616f236f3/jiter-0.15.0-cp312-cp312-win32.whl", hash = "sha256:c2f6bb8b5216ab9e7873bc08b5d7bef2b8abbb578a3069bf1cd14a45d71d771d", size = 206427, upload-time = "2026-05-19T10:07:55.307Z" }, + { url = "https://files.pythonhosted.org/packages/95/bf/8ead82a87495149542748e828d153fd232a512a22c83b02c4815c1a9c7d8/jiter-0.15.0-cp312-cp312-win_amd64.whl", hash = "sha256:40b2c7e92c44a84d748d21706c68dc6ff8161d80b59c99d774721a0d2317d7c7", size = 197300, upload-time = "2026-05-19T10:07:56.651Z" }, + { url = "https://files.pythonhosted.org/packages/f4/e4/9b8a78fb2d894471bc344e37f1949bdd784bd914d031dba0ba3a40c71dd7/jiter-0.15.0-cp312-cp312-win_arm64.whl", hash = "sha256:cc0bc345cf2df9d1c00ac443f50d543c1ccfa8b0422cb85b1ab70d681c0b255b", size = 192702, upload-time = "2026-05-19T10:07:58.307Z" }, + { url = "https://files.pythonhosted.org/packages/e5/f4/f708c900ecee41b2025ef8413d5351e5649eb2125c506f6720cc69b06f5c/jiter-0.15.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1c11465f97e2abf45a014b83b730222f8f1c5335e802c7055a67d50de6f1f4e3", size = 307829, upload-time = "2026-05-19T10:07:59.704Z" }, + { url = "https://files.pythonhosted.org/packages/86/59/db537c0949e83668c38481d426b9f2fd5ab758c4ee53a811dd0a510626a0/jiter-0.15.0-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:d1e7b1776f0797956c509e123d0952d10d293a9492dea9f288ab9570ec01d1a5", size = 308445, upload-time = "2026-05-19T10:08:01.184Z" }, + { url = "https://files.pythonhosted.org/packages/37/38/ea0e13b18c30ef951da0d47d39e7fa9edb82a93a62990ffbd7cea9b622d4/jiter-0.15.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:351a341c2105aa430b7047e30f1bf7975f6313b00165d3fc07be2edaf741f279", size = 336181, upload-time = "2026-05-19T10:08:02.688Z" }, + { url = "https://files.pythonhosted.org/packages/58/fc/2303901b16c4ba05865588990a420c0b4156270b44379c20931544a1d962/jiter-0.15.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4ab395feec8d249ec4044e228e98a7033f043426a265df439dc3698823f0a4e4", size = 362985, upload-time = "2026-05-19T10:08:04.394Z" }, + { url = "https://files.pythonhosted.org/packages/5b/6f/11bace093c52e7d4d26c8e606ccd7ae8c972189622469ec0d9e28161e28b/jiter-0.15.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a2a438005b6f22d0273413484d6094d7c2c5d10ec1b3a3bf128e0d1d3ba53258", size = 453292, upload-time = "2026-05-19T10:08:05.967Z" }, + { url = "https://files.pythonhosted.org/packages/22/db/987f2f086ca4d7a6582eb4ccd513f9b26b42d9e4243a087609a3137a8fc7/jiter-0.15.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f18f85e4218d1b40f000f42a92239a7a61a902cd42c65e6c360dbd17dcb20894", size = 373501, upload-time = "2026-05-19T10:08:07.857Z" }, + { url = "https://files.pythonhosted.org/packages/8f/7c/89fbcabb2739b7a5b8dc959a1b6c5761f6484f5fed3486854b3c789bb1de/jiter-0.15.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1aa62e277fc1cbd80e6deacae6f4d983b41b3d7728e0645c5d741a6149bba45", size = 344683, upload-time = "2026-05-19T10:08:09.431Z" }, + { url = "https://files.pythonhosted.org/packages/30/6f/6cca7692e7dddfec6d8d76c54dc97f2af2a41df4ac0674b999df1f09a5f3/jiter-0.15.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = 
"sha256:6550fa135c7deb8ead6af49ed7ff648532ea8334a1447fe34a36315ef79c5c29", size = 350892, upload-time = "2026-05-19T10:08:11.352Z" }, + { url = "https://files.pythonhosted.org/packages/39/14/0338d6190cb8e6d22e677ab1d4eabd4117f67cca70c54cd04b82ff64e068/jiter-0.15.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:066f8f33f18b2419cd8213b2436fa7fbc9c499f315971cfa3ce1f9820c001b1b", size = 388723, upload-time = "2026-05-19T10:08:12.912Z" }, + { url = "https://files.pythonhosted.org/packages/90/31/cc19f4a1bdb6afb09ce6a2f2615aa8d44d994eba0d8e6105ed1af920e736/jiter-0.15.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:75e8a04e91432dde9f1838373cf93d23726c79d3e908d319acf0e796f85592e7", size = 516648, upload-time = "2026-05-19T10:08:14.808Z" }, + { url = "https://files.pythonhosted.org/packages/49/9f/833c541512cd091b63c10c0381973dfe11bc7a503a818c16384417e0c81e/jiter-0.15.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:a97261f1fccb8e50ecd2890a96e46efdc3f57c80a197324c6777827231eca712", size = 547382, upload-time = "2026-05-19T10:08:16.927Z" }, + { url = "https://files.pythonhosted.org/packages/d2/11/e7b70e91f90bc4477e8eee9e8a5f7cf3cb41b4525d6394dc98a714eb8f7f/jiter-0.15.0-cp313-cp313-win32.whl", hash = "sha256:c77496cb10bd7549690fbbab3e5ec05857b83e49276f4a9423a766ddd2afcd4c", size = 205845, upload-time = "2026-05-19T10:08:18.401Z" }, + { url = "https://files.pythonhosted.org/packages/4b/23/5c20d9ad6f02c493e4023e5d2d09e1c1f15fe2753c9102c544aff068a88e/jiter-0.15.0-cp313-cp313-win_amd64.whl", hash = "sha256:b15741f501469009ae0ae90b7147958a664a7dede40aa7ff174a8a4645f546d0", size = 196842, upload-time = "2026-05-19T10:08:20.131Z" }, + { url = "https://files.pythonhosted.org/packages/6b/11/1eb400ef248e8c925fd883fbe325daf5e42cd1b0d308539dd332bd4f7ffc/jiter-0.15.0-cp313-cp313-win_arm64.whl", hash = "sha256:5d6a60072b44c3c2b797a7ddcbcbbf2b34ea3cfd4721580fbfd2a09d9d9b84ba", size = 192212, upload-time = "2026-05-19T10:08:21.807Z" }, + { url = 
"https://files.pythonhosted.org/packages/8a/60/2fd8d7c79da8acf9b7b277c7616847773779356b92acfc9bb158452174da/jiter-0.15.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:ef1fd24d9413f6209e00d3d5a453e67acfe004a25cc6c8e8484faed4311ab9e8", size = 315065, upload-time = "2026-05-19T10:08:23.218Z" }, + { url = "https://files.pythonhosted.org/packages/46/f4/008fb7d65e8ac2abf00811651a661e025c4ba80bbc6f378450384ddd3aed/jiter-0.15.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:144f8e72cb53dab146347b91cceac01f5481237f2b93b4a339a1ee8f8878b67c", size = 339444, upload-time = "2026-05-19T10:08:24.701Z" }, + { url = "https://files.pythonhosted.org/packages/00/55/90b0c7b9c6896c0f2a591dd36d36b71d22e09674bfef178fa03ba3f81499/jiter-0.15.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:553fcac2ef2cb990877f9fc0833b8b629a3e6a5670b6b5fd58219b41a653ddc4", size = 347779, upload-time = "2026-05-19T10:08:26.408Z" }, + { url = "https://files.pythonhosted.org/packages/51/6b/69666cec5000fd57734c118437394516c749ae8dbeea9fb66d6fef9c4775/jiter-0.15.0-cp313-cp313t-win_amd64.whl", hash = "sha256:774f93f65031856bf14ad9f59bdcab8b8cad501e5ceabd51ba3525f76937a25b", size = 200395, upload-time = "2026-05-19T10:08:28.055Z" }, + { url = "https://files.pythonhosted.org/packages/39/04/a6aa62cd27e8149b0d28df5561f10f6cceaf7935a9ccf3f1c5a05f9a0cd8/jiter-0.15.0-cp313-cp313t-win_arm64.whl", hash = "sha256:f1e1754960f38ec40613a07e5e372df67acb3b890fb383b6fb3de3e49ddbf3c7", size = 190516, upload-time = "2026-05-19T10:08:29.35Z" }, + { url = "https://files.pythonhosted.org/packages/eb/d2/079f350ebf7859d081de30aa890f9e3be68516f754f3ba32366ffff4dcee/jiter-0.15.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:ac0d9ddea4350974be7a221fc25895f251a8fee748c889bdced2141c0fec1a49", size = 308884, upload-time = "2026-05-19T10:08:31.667Z" }, + { url = 
"https://files.pythonhosted.org/packages/04/4e/a2c30a7f69b48c03b20935d647479106fe932f6e63f75faf53937197e05d/jiter-0.15.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:01a8222cf05ab1128e239421156c207949808acaaea2bdfd33130ae666786e86", size = 310028, upload-time = "2026-05-19T10:08:33.304Z" }, + { url = "https://files.pythonhosted.org/packages/40/90/2e7cdfd3cf8ca967be38c48f5cf474d79f089efaf559a40f15984a77ae69/jiter-0.15.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:182226cbc930c9fab81bc2e41a4da672f89539906dadb05e75670ac07b94f71f", size = 337485, upload-time = "2026-05-19T10:08:35.259Z" }, + { url = "https://files.pythonhosted.org/packages/9b/11/15a1aa28b120b8ee5b4f1fb894c125046225f09847738bd64233d3b84883/jiter-0.15.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:71683c38c825452999b5717fcae07ea708e8c93003e808be4319c1b02e3d176e", size = 364223, upload-time = "2026-05-19T10:08:36.694Z" }, + { url = "https://files.pythonhosted.org/packages/b7/25/f442e8af5f3d0dcf47b39e83a0efd9ee45ea946aa6d04625dc3181eae3b6/jiter-0.15.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:30f2218e6a9e5c18bc10fe6d41ac189c442c88eacf11bad9f28ef95a9bef00e6", size = 456387, upload-time = "2026-05-19T10:08:38.143Z" }, + { url = "https://files.pythonhosted.org/packages/da/f4/37f2d2c9f64f49af7da652ed7532bb5a2372e588e6927c3fdd76f911db65/jiter-0.15.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5157de9f76eb4bc5ea74a1219366a25f945ad305641d74e04f59c54087091aa9", size = 374461, upload-time = "2026-05-19T10:08:39.869Z" }, + { url = "https://files.pythonhosted.org/packages/60/28/edcfbbbf0cb15436f36664a8908a0df47ab9006298d4cd937dc08ea932d6/jiter-0.15.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90c5db5527c221249a876160663ab891ace358c17f7b9c93ec1478b7f0550e5c", size = 345924, upload-time = "2026-05-19T10:08:41.668Z" }, + { url = 
"https://files.pythonhosted.org/packages/47/13/89fba6398dab7f202b7278c4b4aac122399d2c0183971c4a57a3b7088df5/jiter-0.15.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:3e4540b8e74e4268811ac05db226a6a128ff572e7e0ce3f1163b693cadb184cd", size = 352283, upload-time = "2026-05-19T10:08:43.091Z" }, + { url = "https://files.pythonhosted.org/packages/1b/da/0f6af8cef2c565a1ab44d970f268c43ccaa72707386ea6388e6fe2b6cd26/jiter-0.15.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:62ebd14e47e9aed9df4472afcb2663668ce4d74891cd54f86bf6e44029d6dc89", size = 389985, upload-time = "2026-05-19T10:08:44.915Z" }, + { url = "https://files.pythonhosted.org/packages/a1/ec/b9cb7d6d29e24ee14910266157d2a279d7a8f60ee0df7fa840882976ba64/jiter-0.15.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:0be6f5ad41a809f303f416d17cec92a7a725902fb9b4f3de3d19362ac0ef8554", size = 517695, upload-time = "2026-05-19T10:08:46.486Z" }, + { url = "https://files.pythonhosted.org/packages/64/5e/6d1bda880723aae0ad86b4b763f044362448efe31e3e819635d41cb03451/jiter-0.15.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:813dfbb17d65328bf86e5f0905dd277ba2265d3ca20556e86c0c7035b7182e5a", size = 548868, upload-time = "2026-05-19T10:08:48.026Z" }, + { url = "https://files.pythonhosted.org/packages/0c/72/7de501cf38dcacaf35098796f3a50e0f2e338baba18a58946c618544b809/jiter-0.15.0-cp314-cp314-win32.whl", hash = "sha256:50e51156192722a9c58db112837d3f8ef96fb3c5ecc14e95f409134b08b158ec", size = 206380, upload-time = "2026-05-19T10:08:49.738Z" }, + { url = "https://files.pythonhosted.org/packages/1e/a9/e19addf4b0c1bdce52c6da12351e6bc42c340c45e7c09e2158e46d293ccc/jiter-0.15.0-cp314-cp314-win_amd64.whl", hash = "sha256:30ce1a5d16b5641dc935d50ef775af6a0871e3d14ab05d6fc54dff371b78e558", size = 197687, upload-time = "2026-05-19T10:08:51.088Z" }, + { url = 
"https://files.pythonhosted.org/packages/f2/c9/776b1db01db25fc6c1d58d1979a37b0a9fe787e5f5b1d062d2eaacb77923/jiter-0.15.0-cp314-cp314-win_arm64.whl", hash = "sha256:510c8b3c17a0ed9ac69850c0438dada3c9b82d9c4d589fcb62002a5a9cf3a866", size = 192571, upload-time = "2026-05-19T10:08:52.451Z" }, + { url = "https://files.pythonhosted.org/packages/a0/f6/45bb4670bacf300fd2c7abadbfb3af376e5f1b6ae75fd9bc069891d15870/jiter-0.15.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7553333dd0930c104a5a0db8df72bf7219fe663d731383b576bb6ed6351c984d", size = 317151, upload-time = "2026-05-19T10:08:53.867Z" }, + { url = "https://files.pythonhosted.org/packages/d7/68/ed635ad5acd7b73e454283083bbb7c8205ad10e88b0d9d7d793b09fe8226/jiter-0.15.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2143ab06181d2b029eedcb6af3cebe95f11bbac62441781860f98ee9330a6a6", size = 341243, upload-time = "2026-05-19T10:08:55.383Z" }, + { url = "https://files.pythonhosted.org/packages/5d/db/3ff4176b817b8ea33879e71e13d8bc2b0d481a7ed3fe9e080f333d415c16/jiter-0.15.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6eac374c5c975709b69c10f09afd199df74150172156ad10c8d4fd785b7da995", size = 363629, upload-time = "2026-05-19T10:08:56.928Z" }, + { url = "https://files.pythonhosted.org/packages/ab/24/5f8270e0ba9c883582f96f722f8a0b58015c7ce1f8c6d4571cf394e99b6b/jiter-0.15.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b3b3b775e33d3bfaec9899edc526ae97b0da0bf9d071a46124ba419149a414f8", size = 456198, upload-time = "2026-05-19T10:08:58.618Z" }, + { url = "https://files.pythonhosted.org/packages/45/5b/76fc02b0b5c54c3d18c60653156e2f76fde1816f9b4722db68d6ee2c897e/jiter-0.15.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eda3071db3346334beae1360b46da4606da57bf3528c167b3c38533afaf9f2c5", size = 373710, upload-time = "2026-05-19T10:09:00.151Z" }, + { url = 
"https://files.pythonhosted.org/packages/c4/52/4310821b0ea9277994d3e1f49fc6a4b34e4800caebacb2c0af81da59a454/jiter-0.15.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c6694a173ecabc12eb60efbc0b474464ead1951ff65cd8b1e72100715c64512b", size = 349901, upload-time = "2026-05-19T10:09:01.621Z" }, + { url = "https://files.pythonhosted.org/packages/93/fe/67648c35b3594fba8854ac64cc8a826d8bcd18324bbdb53d77697c60b6ef/jiter-0.15.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:a254e10b593624d230c365b6d616b22ca0ad65e63a16e6631c2b3466022e6ba8", size = 352438, upload-time = "2026-05-19T10:09:03.216Z" }, + { url = "https://files.pythonhosted.org/packages/cb/28/0a1879d07ad6b3e025a2750027363452ced93c2d16d1c9d4b153ffd51c91/jiter-0.15.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d8d2955167274e15d79a7a020afdd9b39c990eb80b2d89fca695d92dcfdd38ec", size = 388152, upload-time = "2026-05-19T10:09:04.741Z" }, + { url = "https://files.pythonhosted.org/packages/c1/78/46c6f6b56ba85c90021f4afd72ed42f691f8f84daacb5fe27277070e3858/jiter-0.15.0-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:acf4ee4d1fc55917239fe72972fb292dd773055d05eb040d36f4326e02cc2c0e", size = 517707, upload-time = "2026-05-19T10:09:06.231Z" }, + { url = "https://files.pythonhosted.org/packages/ca/cb/720662d4c88fcad606e826fef5424365527ba43ce4868a479aed8f8c507e/jiter-0.15.0-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:e7196e56f1cd69af1dbb07dff02dcfb260a50b45a82d409d92a06fedb32473b5", size = 548241, upload-time = "2026-05-19T10:09:08.093Z" }, + { url = "https://files.pythonhosted.org/packages/60/e3/935b8034fd143f21125c87d51404a9e0e1449186a494405721ff5d1d695e/jiter-0.15.0-cp314-cp314t-win32.whl", hash = "sha256:7f6163c0f10b055245f814dcc59f4818da60dfe72f3e72ab89fc24b6bd5e9c52", size = 207950, upload-time = "2026-05-19T10:09:09.616Z" }, + { url = 
"https://files.pythonhosted.org/packages/93/59/984fd9ece895953dad3e0880a650e766f5a2da2c5514f0eafdaaabbeb5f9/jiter-0.15.0-cp314-cp314t-win_amd64.whl", hash = "sha256:980c256edb05b78a111b99c4de3b1d32e31634b867fd1fc2cf726e7b7bba9854", size = 200055, upload-time = "2026-05-19T10:09:11.367Z" }, + { url = "https://files.pythonhosted.org/packages/0e/a4/cf8d779feb133a27a2e3bc833bccb9e13aa332cdf820497ebf72c10ce8c3/jiter-0.15.0-cp314-cp314t-win_arm64.whl", hash = "sha256:66b1880df2d01e206e8339769d1c7c1753bcb653efd6289e203f6f24ebada0c0", size = 191244, upload-time = "2026-05-19T10:09:12.74Z" }, + { url = "https://files.pythonhosted.org/packages/65/43/1fc62172aa98b50a7de9a25554060db510f85c89cfbed0dfe13e1907a139/jiter-0.15.0-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:411fa4dfa5a7ae3d11491027ffb9beadec3996010a986862db70d91abba1c750", size = 305585, upload-time = "2026-05-19T10:09:35.995Z" }, + { url = "https://files.pythonhosted.org/packages/e8/c4/dd58fcd9e2df83666e5c1c1347bef58ce919cd8efc3ffa38aeea62ce493b/jiter-0.15.0-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:2b0074e2f56eb2dacca1689760fd2852a068f85a0547a157b82cb4cafeb6768b", size = 306936, upload-time = "2026-05-19T10:09:37.435Z" }, + { url = "https://files.pythonhosted.org/packages/39/86/b695e16f1180c07f43ea98e73ecd21cf63fa2e1b0c1103739013784d11ae/jiter-0.15.0-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:913d02d29c9606643418d9ccfc3b72492ab25a6bf7889934e09a3490f8d3438b", size = 342453, upload-time = "2026-05-19T10:09:39.294Z" }, + { url = "https://files.pythonhosted.org/packages/34/56/55d76614af37fe3f22a3347d1e410d2a15da581997cb2da499a625000bb5/jiter-0.15.0-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b15d3ec9b0449c40e85319bdb4caa8b77ab526e74f5532ed94bec15e2f66822c", size = 345606, upload-time = "2026-05-19T10:09:40.727Z" }, + { url = 
"https://files.pythonhosted.org/packages/73/38/505941b2b092fd5bbbd60a52a880db1173f1690ae6751bed3af1c9ddcb4e/jiter-0.15.0-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:631f13a3d04e97d4e083993b10f4b99530e3a10d953e2eb5e196b7dc7f812ce0", size = 303769, upload-time = "2026-05-19T10:09:42.203Z" }, + { url = "https://files.pythonhosted.org/packages/e7/95/a06692b29e77473f286e1ec1f426d3ca44d7b5843be8ad21d7a5f3fcdcc0/jiter-0.15.0-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:b6c0ffae686c39bf3737be60793783267628783ea42545632c10b291105aee45", size = 305128, upload-time = "2026-05-19T10:09:43.657Z" }, + { url = "https://files.pythonhosted.org/packages/23/85/7270d7ad41d6061a25b950c6bf91d638bd9aacb113200a8c8d57a055fd67/jiter-0.15.0-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d54fb5b31dea401a41af3f8a7d2512e9b6a6a005491e6166c7e4ffab9639a9c", size = 340459, upload-time = "2026-05-19T10:09:45.452Z" }, + { url = "https://files.pythonhosted.org/packages/c8/8d/302cb2057b7513327b4d575cff6b1d066ee6431a5357fc3f8867cd684406/jiter-0.15.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54d5d6090cdc1b7c9e780dfb04949a990adb1e301a2fc0bbcee7de4638d33f9a", size = 344469, upload-time = "2026-05-19T10:09:46.864Z" }, +] + [[package]] name = "joblib" version = "1.5.3" @@ -1000,6 +1222,43 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" }, ] +[[package]] +name = "jsonschema" +version = "4.26.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "jsonschema-specifications" }, + { name = "referencing" }, + { name = "rpds-py", version = "0.30.0", source 
= { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "rpds-py", version = "2026.5.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/fc/e067678238fa451312d4c62bf6e6cf5ec56375422aee02f9cb5f909b3047/jsonschema-4.26.0.tar.gz", hash = "sha256:0c26707e2efad8aa1bfc5b7ce170f3fccc2e4918ff85989ba9ffa9facb2be326", size = 366583, upload-time = "2026-01-07T13:41:07.246Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/69/90/f63fb5873511e014207a475e2bb4e8b2e570d655b00ac19a9a0ca0a385ee/jsonschema-4.26.0-py3-none-any.whl", hash = "sha256:d489f15263b8d200f8387e64b4c3a75f06629559fb73deb8fdfb525f2dab50ce", size = 90630, upload-time = "2026-01-07T13:41:05.306Z" }, +] + +[[package]] +name = "jsonschema-specifications" +version = "2025.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "referencing" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/19/74/a633ee74eb36c44aa6d1095e7cc5569bebf04342ee146178e2d36600708b/jsonschema_specifications-2025.9.1.tar.gz", hash = "sha256:b540987f239e745613c7a9176f3edb72b832a4ac465cf02712288397832b5e8d", size = 32855, upload-time = "2025-09-08T01:34:59.186Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" }, +] + +[[package]] +name = "lazy-imports" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/25/67/04432aae0c1e2729bff14e1841f4a3fb63a9e354318e66622251487760c3/lazy_imports-1.2.0.tar.gz", hash = "sha256:3c546b3c1e7c4bf62a07f897f6179d9feda6118e71ef6ecc47a339cab3d2e2d9", size = 24470, 
upload-time = "2025-12-28T13:51:51.218Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cd/62/60ed24fa8707f10c1c5aef94791252b820be3dd6bdfc6e2fcdb08bc8912f/lazy_imports-1.2.0-py3-none-any.whl", hash = "sha256:97134d6552e2ba16f1a278e316f05313ab73b360e848e40d593d08a5c2406fdf", size = 18681, upload-time = "2025-12-28T13:51:49.802Z" }, +] + [[package]] name = "librt" version = "0.11.0" @@ -1211,6 +1470,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/05/22/e8d25f5291c24cf9cc3add81516b19e1e4aeef29231f9f32ccf107e428d0/mloda_testing-0.3.2-py3-none-any.whl", hash = "sha256:f20db2d425d3c0372d15d5bd1a9b13d3d3769e37343124c6548a5cb848f5b911", size = 81356, upload-time = "2026-05-17T17:19:24.04Z" }, ] +[[package]] +name = "more-itertools" +version = "11.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/de/1d/f4da6f02cdffe04d6362210b807146a26044c88d839208aec273bb0d9184/more_itertools-11.1.0.tar.gz", hash = "sha256:48e8f4d9e7e5878571ecf6f2b4e57634f93cd474cc8cfbd2376f2d11b396e30d", size = 145772, upload-time = "2026-05-22T14:14:29.909Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e8/3d/1087453384dbde46a8c7f9356eead2c58be8a7bf156bca40243377c85715/more_itertools-11.1.0-py3-none-any.whl", hash = "sha256:4b65538ae22f6fed0ce4874efd317463a7489796a0939fa66824dd542125a192", size = 72226, upload-time = "2026-05-22T14:14:28.824Z" }, +] + [[package]] name = "mpmath" version = "1.3.0" @@ -1852,6 +2120,114 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a8/64/3708a90d1ebe202ffdeb7185f878a3c84d15c2b2c31858da2ce0583e2def/nvidia_nvtx-13.0.85-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cb7780edb6b14107373c835bf8b72e7a178bac7367e23da7acb108f973f157a6", size = 148878, upload-time = "2025-09-04T08:28:53.627Z" }, ] +[[package]] +name = "onnxruntime" +version = "1.24.3" +source = { registry = "https://pypi.org/simple" } +resolution-markers 
= [ + "python_full_version < '3.11'", +] +dependencies = [ + { name = "flatbuffers", marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "packaging", marker = "python_full_version < '3.11'" }, + { name = "protobuf", marker = "python_full_version < '3.11'" }, + { name = "sympy", marker = "python_full_version < '3.11'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/15/41/3253db975a90c3ce1d475e2a230773a21cd7998537f0657947df6fb79861/onnxruntime-1.24.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3e6456801c66b095c5cd68e690ca25db970ea5202bd0c5b84a2c3ef7731c5a3c", size = 17332766, upload-time = "2026-03-05T17:18:59.714Z" }, + { url = "https://files.pythonhosted.org/packages/7e/c5/3af6b325f1492d691b23844d88ed26844c1164620860c5efe95c0e22782d/onnxruntime-1.24.3-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b2ebc54c6d8281dccff78d4b06e47d4cf07535937584ab759448390a70f4978", size = 15130330, upload-time = "2026-03-05T16:34:53.831Z" }, + { url = "https://files.pythonhosted.org/packages/03/4b/f96b46c1866a293ed23ca2cf5e5a63d413ad3a951da60dd877e3c56cbbca/onnxruntime-1.24.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fb56575d7794bf0781156955610c9e651c9504c64d42ec880784b6106244882d", size = 17213247, upload-time = "2026-03-05T17:17:59.812Z" }, + { url = "https://files.pythonhosted.org/packages/36/13/27cf4d8df2578747584e8758aeb0b673b60274048510257f1f084b15e80e/onnxruntime-1.24.3-cp311-cp311-win_amd64.whl", hash = "sha256:c958222ef9eff54018332beecd32d5d94a3ab079d8821937b333811bf4da0d39", size = 12595530, upload-time = "2026-03-05T17:18:49.356Z" }, + { url = "https://files.pythonhosted.org/packages/19/8c/6d9f31e6bae72a8079be12ed8ba36c4126a571fad38ded0a1b96f60f6896/onnxruntime-1.24.3-cp311-cp311-win_arm64.whl", hash = 
"sha256:a8f761857ebaf58a85b9e42422d03207f1d39e6bb8fecfdbf613bac5b9710723", size = 12261715, upload-time = "2026-03-05T17:18:39.699Z" }, + { url = "https://files.pythonhosted.org/packages/d0/7f/dfdc4e52600fde4c02d59bfe98c4b057931c1114b701e175aee311a9bc11/onnxruntime-1.24.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:0d244227dc5e00a9ae15a7ac1eba4c4460d7876dfecafe73fb00db9f1d914d91", size = 17342578, upload-time = "2026-03-05T17:19:02.403Z" }, + { url = "https://files.pythonhosted.org/packages/1c/dc/1f5489f7b21817d4ad352bf7a92a252bd5b438bcbaa7ad20ea50814edc79/onnxruntime-1.24.3-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0a9847b870b6cb462652b547bc98c49e0efb67553410a082fde1918a38707452", size = 15150105, upload-time = "2026-03-05T16:34:56.897Z" }, + { url = "https://files.pythonhosted.org/packages/28/7c/fd253da53594ab8efbefdc85b3638620ab1a6aab6eb7028a513c853559ce/onnxruntime-1.24.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b354afce3333f2859c7e8706d84b6c552beac39233bcd3141ce7ab77b4cabb5d", size = 17237101, upload-time = "2026-03-05T17:18:02.561Z" }, + { url = "https://files.pythonhosted.org/packages/71/5f/eaabc5699eeed6a9188c5c055ac1948ae50138697a0428d562ac970d7db5/onnxruntime-1.24.3-cp312-cp312-win_amd64.whl", hash = "sha256:44ea708c34965439170d811267c51281d3897ecfc4aa0087fa25d4a4c3eb2e4a", size = 12597638, upload-time = "2026-03-05T17:18:52.141Z" }, + { url = "https://files.pythonhosted.org/packages/cc/5c/d8066c320b90610dbeb489a483b132c3b3879b2f93f949fb5d30cfa9b119/onnxruntime-1.24.3-cp312-cp312-win_arm64.whl", hash = "sha256:48d1092b44ca2ba6f9543892e7c422c15a568481403c10440945685faf27a8d8", size = 12270943, upload-time = "2026-03-05T17:18:42.006Z" }, + { url = "https://files.pythonhosted.org/packages/51/8d/487ece554119e2991242d4de55de7019ac6e47ee8dfafa69fcf41d37f8ed/onnxruntime-1.24.3-cp313-cp313-macosx_14_0_arm64.whl", hash = 
"sha256:34a0ea5ff191d8420d9c1332355644148b1bf1a0d10c411af890a63a9f662aa7", size = 17342706, upload-time = "2026-03-05T16:35:10.813Z" }, + { url = "https://files.pythonhosted.org/packages/dd/25/8b444f463c1ac6106b889f6235c84f01eec001eaf689c3eff8c69cf48fae/onnxruntime-1.24.3-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1fd2ec7bb0fabe42f55e8337cfc9b1969d0d14622711aac73d69b4bd5abb5ed7", size = 15149956, upload-time = "2026-03-05T16:34:59.264Z" }, + { url = "https://files.pythonhosted.org/packages/34/fc/c9182a3e1ab46940dd4f30e61071f59eee8804c1f641f37ce6e173633fb6/onnxruntime-1.24.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:df8e70e732fe26346faaeec9147fa38bef35d232d2495d27e93dd221a2d473a9", size = 17237370, upload-time = "2026-03-05T17:18:05.258Z" }, + { url = "https://files.pythonhosted.org/packages/05/7e/3b549e1f4538514118bff98a1bcd6481dd9a17067f8c9af77151621c9a5c/onnxruntime-1.24.3-cp313-cp313-win_amd64.whl", hash = "sha256:2d3706719be6ad41d38a2250998b1d87758a20f6ea4546962e21dc79f1f1fd2b", size = 12597939, upload-time = "2026-03-05T17:18:54.772Z" }, + { url = "https://files.pythonhosted.org/packages/80/41/9696a5c4631a0caa75cc8bc4efd30938fd483694aa614898d087c3ee6d29/onnxruntime-1.24.3-cp313-cp313-win_arm64.whl", hash = "sha256:b082f3ba9519f0a1a1e754556bc7e635c7526ef81b98b3f78da4455d25f0437b", size = 12270705, upload-time = "2026-03-05T17:18:44.774Z" }, + { url = "https://files.pythonhosted.org/packages/b7/65/a26c5e59e3b210852ee04248cf8843c81fe7d40d94cf95343b66efe7eec9/onnxruntime-1.24.3-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72f956634bc2e4bd2e8b006bef111849bd42c42dea37bd0a4c728404fdaf4d34", size = 15161796, upload-time = "2026-03-05T16:35:02.871Z" }, + { url = "https://files.pythonhosted.org/packages/f3/25/2035b4aa2ccb5be6acf139397731ec507c5f09e199ab39d3262b22ffa1ac/onnxruntime-1.24.3-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:78d1f25eed4ab9959db70a626ed50ee24cf497e60774f59f1207ac8556399c4d", size = 17240936, upload-time = "2026-03-05T17:18:09.534Z" }, + { url = "https://files.pythonhosted.org/packages/f9/a4/b3240ea84b92a3efb83d49cc16c04a17ade1ab47a6a95c4866d15bf0ac35/onnxruntime-1.24.3-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:a6b4bce87d96f78f0a9bf5cefab3303ae95d558c5bfea53d0bf7f9ea207880a8", size = 17344149, upload-time = "2026-03-05T16:35:13.382Z" }, + { url = "https://files.pythonhosted.org/packages/bb/4a/4b56757e51a56265e8c56764d9c36d7b435045e05e3b8a38bedfc5aedba3/onnxruntime-1.24.3-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d48f36c87b25ab3b2b4c88826c96cf1399a5631e3c2c03cc27d6a1e5d6b18eb4", size = 15151571, upload-time = "2026-03-05T16:35:05.679Z" }, + { url = "https://files.pythonhosted.org/packages/cf/14/c6fb84980cec8f682a523fcac7c2bdd6b311e7f342c61ce48d3a9cb87fc6/onnxruntime-1.24.3-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e104d33a409bf6e3f30f0e8198ec2aaf8d445b8395490a80f6e6ad56da98e400", size = 17238951, upload-time = "2026-03-05T17:18:12.394Z" }, + { url = "https://files.pythonhosted.org/packages/57/14/447e1400165aca8caf35dabd46540eb943c92f3065927bb4d9bcbc91e221/onnxruntime-1.24.3-cp314-cp314-win_amd64.whl", hash = "sha256:e785d73fbd17421c2513b0bb09eb25d88fa22c8c10c3f5d6060589efa5537c5b", size = 12903820, upload-time = "2026-03-05T17:18:57.123Z" }, + { url = "https://files.pythonhosted.org/packages/1d/ec/6b2fa5702e4bbba7339ca5787a9d056fc564a16079f8833cc6ba4798da1c/onnxruntime-1.24.3-cp314-cp314-win_arm64.whl", hash = "sha256:951e897a275f897a05ffbcaa615d98777882decaeb80c9216c68cdc62f849f53", size = 12594089, upload-time = "2026-03-05T17:18:47.169Z" }, + { url = "https://files.pythonhosted.org/packages/12/dc/cd06cba3ddad92ceb17b914a8e8d49836c79e38936e26bde6e368b62c1fe/onnxruntime-1.24.3-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:4d4e70ce578aa214c74c7a7a9226bc8e229814db4a5b2d097333b81279ecde36", size = 15162789, upload-time = "2026-03-05T16:35:08.282Z" }, + { url = "https://files.pythonhosted.org/packages/a6/d6/413e98ab666c6fb9e8be7d1c6eb3bd403b0bea1b8d42db066dab98c7df07/onnxruntime-1.24.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:02aaf6ddfa784523b6873b4176a79d508e599efe12ab0ea1a3a6e7314408b7aa", size = 17240738, upload-time = "2026-03-05T17:18:15.203Z" }, +] + +[[package]] +name = "onnxruntime" +version = "1.26.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.15' and sys_platform == 'win32'", + "python_full_version >= '3.15' and sys_platform == 'emscripten'", + "python_full_version >= '3.15' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version == '3.14.*' and sys_platform == 'win32'", + "python_full_version == '3.14.*' and sys_platform == 'emscripten'", + "python_full_version == '3.14.*' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", +] +dependencies = [ + { name = "flatbuffers", marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.4.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "packaging", marker = "python_full_version >= '3.11'" }, + { name = "protobuf", marker = "python_full_version >= '3.11'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/d4/81/29a9eb470994a75eb7b3ccf32be314d7c66675a00ac7b50294816cc2db27/onnxruntime-1.26.0-cp311-cp311-macosx_14_0_arm64.whl", hash = 
"sha256:ee1109ef4ef27cad90e823399e61e03b3c6c7bfe0fb820b4baf3678c15be8b3c", size = 18005108, upload-time = "2026-05-08T19:08:11.728Z" }, + { url = "https://files.pythonhosted.org/packages/66/c7/73efa6c8a4000c38fcc14947d84f234a17e5d66f203b37b7f1ad4a7b46eb/onnxruntime-1.26.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:35c7c7b0ac2e02001d28fab6c9fc24e9abc5e6faa35e6e19c63cecf1406ba89f", size = 16043752, upload-time = "2026-05-08T19:07:10.707Z" }, + { url = "https://files.pythonhosted.org/packages/b6/3f/8de630f595daf6ce884d4dd95afd2a60e70ec6572e52bfee3aa2229befab/onnxruntime-1.26.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:11a8df4dcfe9ad5ff0bd71a7571dbed019fabc7594676c89fe8b86ea029c246f", size = 18176043, upload-time = "2026-05-08T19:07:33.735Z" }, + { url = "https://files.pythonhosted.org/packages/9c/21/9f041de20787cd85498bd48e0ec4d098bf2a6c486e25b24b8dae1bf492b2/onnxruntime-1.26.0-cp311-cp311-win_amd64.whl", hash = "sha256:e6456718125fd777c673f3b78d4a9ab58d6adea641e9afae85ee6444f0e0e9a9", size = 13023165, upload-time = "2026-05-08T19:08:00.633Z" }, + { url = "https://files.pythonhosted.org/packages/0e/82/3b9fe0ead2557cc3adf74c74c141bd1c7c4c6a9548c610af37df199f4512/onnxruntime-1.26.0-cp311-cp311-win_arm64.whl", hash = "sha256:cd920e45b730e4a87833e2910d8ca375aaca9da6ccc09e24bce463b3356d637f", size = 12789514, upload-time = "2026-05-08T19:07:49.433Z" }, + { url = "https://files.pythonhosted.org/packages/81/b1/d111b1df656761f980d9e298a60039a9cb66036b1d039e777537743d0ac3/onnxruntime-1.26.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:05b028781b322ad74b57ce5b50aa5280bb1fe96ceec334628ade681e0b24c1ac", size = 18016624, upload-time = "2026-05-12T00:41:01.735Z" }, + { url = "https://files.pythonhosted.org/packages/f6/a0/3f9d896a0385a36bd04345d6d0b802821a5782adde562e7e135f6bb71c73/onnxruntime-1.26.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:91f2bb870a4b9224eba0a6728c1fa7a9e552b8e59e1083c51fbbc3d013f2b5c0", size = 16052692, upload-time = "2026-05-08T19:07:13.829Z" }, + { url = "https://files.pythonhosted.org/packages/7c/43/2a4e04f8dbeffad19bbcced4bcd4289bf478921518437404d6b92bdf213b/onnxruntime-1.26.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9b6dd70599005bd1bf29779f04a91978b92b5e719c11a20068a8f8e535f725b6", size = 18185439, upload-time = "2026-05-08T19:07:36.299Z" }, + { url = "https://files.pythonhosted.org/packages/44/fc/026d0a7162b9c2153dac292baea9e027c42304dc1d9dc6f8ff5b4cfbaedd/onnxruntime-1.26.0-cp312-cp312-win_amd64.whl", hash = "sha256:a26374dc7fbcaae593601086b242120e13f2310558df0991da6dd8b8fac00414", size = 13026427, upload-time = "2026-05-08T19:08:03.503Z" }, + { url = "https://files.pythonhosted.org/packages/3e/27/1dcf88e45e4c69db5f7b106f2dacc3801ba98994e082ca03e1dfdf7bfe57/onnxruntime-1.26.0-cp312-cp312-win_arm64.whl", hash = "sha256:54a8053410fd31fd66469bd754fcfe8a4df9f7eb44756b4b5479bf50c842d948", size = 12796647, upload-time = "2026-05-08T19:07:52.108Z" }, + { url = "https://files.pythonhosted.org/packages/cf/a2/c801242685e0ce48a4ca51dfafbb588765e0446397e123be53ba5598f3f5/onnxruntime-1.26.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:ccce19c5f771b8268902f77d9fed9e88f9499465d6780808faa6611a789d33f0", size = 18016563, upload-time = "2026-05-08T19:07:28.081Z" }, + { url = "https://files.pythonhosted.org/packages/e2/64/0492c0b1db04e29b2630c87cfa36f9d6872b1ca8614b90c5cad58fac7d76/onnxruntime-1.26.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bdbed8cf3b672b66acb032f33a253bc27f42bce6ece48ae3fab4fa483a5e96e0", size = 16052634, upload-time = "2026-05-08T19:07:16.885Z" }, + { url = "https://files.pythonhosted.org/packages/3d/26/4d09ddc755a84fc8d5e192991626b0e0680e8f6c5d58f4f1d05c42bc48cf/onnxruntime-1.26.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:c07af6fc6d5557835f2b6ee7a96d8b3235d0c57a8e230efdedaee106a8a3cbc6", size = 18185632, upload-time = "2026-05-08T19:07:38.756Z" }, + { url = "https://files.pythonhosted.org/packages/77/89/3e52249aa08fa301e217ecba07b5246a8338fa2b401e109326e3fc5be0f9/onnxruntime-1.26.0-cp313-cp313-win_amd64.whl", hash = "sha256:61bec80655efa460591c2bc655392d57d2650ce85533a6b9b3b7a790d7ea7916", size = 13026751, upload-time = "2026-05-08T19:08:06.2Z" }, + { url = "https://files.pythonhosted.org/packages/06/b3/c1c8782b14af6797c303de132d6eef26a9fb80dfacd3750ce57911d11c6b/onnxruntime-1.26.0-cp313-cp313-win_arm64.whl", hash = "sha256:a6677545ff451e3539a02746d2f207d8c5baa4a0a818886bb9d6a6eb9511ee89", size = 12796807, upload-time = "2026-05-08T19:07:54.879Z" }, + { url = "https://files.pythonhosted.org/packages/c3/f5/47b0676408abec652c14b84d7173e389837832d850c24f87184277313e8d/onnxruntime-1.26.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5e016edc15d3c19f36807e1c6b10be5b27807688c32720f91b5ae480a95215d0", size = 16057265, upload-time = "2026-05-08T19:07:19.603Z" }, + { url = "https://files.pythonhosted.org/packages/3b/45/33ab6deeef010ca844c877dd618cebc079590bbe52d2a3678e7223b1b908/onnxruntime-1.26.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f5fc48a91a046a6a5c9b147f83fb41d65d24d24923373b222cdd248f0f4f4aac", size = 18197590, upload-time = "2026-05-08T19:07:41.422Z" }, + { url = "https://files.pythonhosted.org/packages/40/89/17546c1c20f6bfc3ae41c22152378a26edfea918af3129e2139dcd7c99f3/onnxruntime-1.26.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:33a791f31432a3af1a96db5e54818b37aba5e5eefc2e6af5794c10a9118a9993", size = 18019724, upload-time = "2026-05-08T19:07:30.723Z" }, + { url = "https://files.pythonhosted.org/packages/bb/24/89457a35f6af29538a76647f2c18c3a28277e6c19234c847e7b4b7c19860/onnxruntime-1.26.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:e90c00732c4553618103149d93f688e8c3063017938f8983e21a71d9f3b6d22e", size = 16054821, upload-time = "2026-05-08T19:07:22.348Z" }, + { url = "https://files.pythonhosted.org/packages/12/f9/15b2e1815cf570d238e0135529f80d2dce64e8e8818a1489cae83823c5c6/onnxruntime-1.26.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01498e80ba8988428d08c2d51b1338f89e3de2a93e6ffe555f79c68f26a5c06b", size = 18185815, upload-time = "2026-05-08T19:07:44.179Z" }, + { url = "https://files.pythonhosted.org/packages/d7/65/2e11055faf015e4b07f45b513fa49b391baf2e19d92d77d73ebee13c1004/onnxruntime-1.26.0-cp314-cp314-win_amd64.whl", hash = "sha256:7ead61450d8405167c87dd3a31d8da1d576b490a57dab1aa8b82a7da6825f5aa", size = 13349887, upload-time = "2026-05-08T19:08:08.671Z" }, + { url = "https://files.pythonhosted.org/packages/19/e4/0f9d1a5718b1781c610c1e354765a3820597081754277a6a9a2b50705702/onnxruntime-1.26.0-cp314-cp314-win_arm64.whl", hash = "sha256:31d71a53490e46910877d0902b5ad99c69a5955e5c7ea6c82863519410e1ba7c", size = 13140121, upload-time = "2026-05-08T19:07:57.804Z" }, + { url = "https://files.pythonhosted.org/packages/1c/42/3b8e635f067d06d9f45bede470b8d539d101a4166c272213158dfd08b6ce/onnxruntime-1.26.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d7b6d258fb78fdfcf049795bcfaa74dcb90ae7baa277afd21e6fd28b83f2c496", size = 16057240, upload-time = "2026-05-08T19:07:25.163Z" }, + { url = "https://files.pythonhosted.org/packages/93/99/f2be40a31b908d96b861ae0ce98582fa376c18a7f816b9d5eb4cd6aa0a4c/onnxruntime-1.26.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4eefd386a45202aefb7a5132b94f32df9d506c9edcc7faf2fc60d65183f4b183", size = 18197382, upload-time = "2026-05-08T19:07:46.965Z" }, +] + +[[package]] +name = "openai" +version = "2.41.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "distro" }, + { name = "httpx" }, + { name = "jiter" }, + { 
name = "pydantic" }, + { name = "sniffio" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3c/a6/5815fe2e2aca74b36c650d1bd43b69827cee568073d0d2d9b6fc5aaac80c/openai-2.41.0.tar.gz", hash = "sha256:db5c362acd6604b84f076abbefa66826ea4b46ecba2954ed866e6a149a1352c0", size = 783525, upload-time = "2026-06-03T22:39:40.719Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/be/51/d82bb424e8aa372190c5233253a2ceb399a778747d18b42cff487411e663/openai-2.41.0-py3-none-any.whl", hash = "sha256:20cc7952e8501c7e5773dd2ef7be437bae9cb549044902e1041a83a54516e375", size = 1353378, upload-time = "2026-06-03T22:39:38.964Z" }, +] + [[package]] name = "packaging" version = "26.2" @@ -2130,6 +2506,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, ] +[[package]] +name = "posthog" +version = "7.18.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "backoff" }, + { name = "distro" }, + { name = "requests" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a6/41/d7922b23d9f3dec427286b13f8f15b12c08eea3950a6cfd53979a54aed91/posthog-7.18.0.tar.gz", hash = "sha256:560230cf9616ac1fde5e3bfba1c666ab69e8488e5417005f86f91f7c6ef74252", size = 229913, upload-time = "2026-06-05T13:56:22.577Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/45/33/cb300af867a22c5e1680f6d1ebe5242ebe7de8280a82a5759e16211e28ee/posthog-7.18.0-py3-none-any.whl", hash = "sha256:9913586a958e5e81dbb1cdc4d4bedc8f95a268807b17ad03b5c55d2adff55dd7", size = 268498, upload-time = "2026-06-05T13:56:20.855Z" }, +] + [[package]] name = "preshed" version = "3.0.13" @@ -2334,6 +2725,21 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/3a/ed/1cdcab6ba3d6ab7feca11fc14f0eeea80755bb53ef4e892079f31b10a25f/propcache-0.5.2-py3-none-any.whl", hash = "sha256:be1ddfcbb376e3de5d2e2db1d58d6d67463e6b4f9f040c000de8e300295465fe", size = 14036, upload-time = "2026-05-08T21:02:10.673Z" }, ] +[[package]] +name = "protobuf" +version = "7.35.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/60/fd/5b1491d9e4b586d621c54f4c36b888714164b6875f8d6afa3f9072906a51/protobuf-7.35.0.tar.gz", hash = "sha256:a2efd84605f41e559f1881b0912b44099d0a2ac9bf46b3474823f10fb393b0e6", size = 458677, upload-time = "2026-05-19T23:02:29.197Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/83/ee/93d06e358a4aa32280b00e722d3ea0a1f25fc3cc5778d80581c9cca2c10e/protobuf-7.35.0-cp310-abi3-macosx_10_9_universal2.whl", hash = "sha256:66be6c513931c794fa92c080ffee41671390da3d79da219cf9c0c0907f035dda", size = 433225, upload-time = "2026-05-19T23:02:19.884Z" }, + { url = "https://files.pythonhosted.org/packages/8b/39/1c76c2da93f3c507e958e0aecee2391cc44d4625de6c728bbc555195b5a8/protobuf-7.35.0-cp310-abi3-manylinux2014_aarch64.whl", hash = "sha256:fcbe42a4ac09d3ec9c987ddfcd956afd0b15f1ff613bd8371bde9405ffd5c8e5", size = 328847, upload-time = "2026-05-19T23:02:22.3Z" }, + { url = "https://files.pythonhosted.org/packages/91/1a/39f7ce90a238c1a987a4d81ec26379e02ca0aff367de68e4a1fa474215b9/protobuf-7.35.0-cp310-abi3-manylinux2014_s390x.whl", hash = "sha256:4cbf5cc286130e06a6c9bbefac442431173906dfcc979712183d4adcc01b37ee", size = 344030, upload-time = "2026-05-19T23:02:23.591Z" }, + { url = "https://files.pythonhosted.org/packages/70/5b/6baf9008817964454055ff3fe65f1de0b5f1e26c80c82f7fb108b7cd4ea3/protobuf-7.35.0-cp310-abi3-manylinux2014_x86_64.whl", hash = "sha256:6c0f98f10c8a05ea30f8993dfef2de093d27b490fdae78bb60c8343795d55011", size = 327130, upload-time = "2026-05-19T23:02:24.637Z" }, + { url = 
"https://files.pythonhosted.org/packages/8e/e5/e46adb0badc388bfb84877a5f9f026aff63f60e611016cf64dbe77e05446/protobuf-7.35.0-cp310-abi3-win32.whl", hash = "sha256:4c4617b83ade0e279d1d2bfe04025a1adb87f9ed657de038620dc0ff959357f6", size = 428946, upload-time = "2026-05-19T23:02:25.741Z" }, + { url = "https://files.pythonhosted.org/packages/a7/ab/547fbd9e16d879dd13c167478f8ae0a83a428008ca07a5e06acdc23ad473/protobuf-7.35.0-cp310-abi3-win_amd64.whl", hash = "sha256:f05bcadf9a2a6b8dda047007075135fb7d08c73d9177aabc067e1be46881a201", size = 439996, upload-time = "2026-05-19T23:02:26.808Z" }, + { url = "https://files.pythonhosted.org/packages/b8/ef/50433d346c56657a70d27f156c7b349ac59a068b01de4eb796e747eecc43/protobuf-7.35.0-py3-none-any.whl", hash = "sha256:c13f325cf242bad135c350629eeb5d54b24228eb472fb3e2e9ebbd4c5dc20ca0", size = 171659, upload-time = "2026-05-19T23:02:27.842Z" }, +] + [[package]] name = "pyarrow" version = "24.0.0" @@ -2713,6 +3119,9 @@ advanced = [ { name = "presidio-analyzer" }, { name = "sentence-transformers" }, ] +connectors = [ + { name = "bm25s" }, +] dev = [ { name = "bandit" }, { name = "mypy" }, @@ -2731,17 +3140,34 @@ eval = [ faiss = [ { name = "faiss-cpu" }, ] +graph = [ + { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "networkx", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, +] +orchestrator = [ + { name = "haystack-ai" }, +] +rerank = [ + { name = "flashrank" }, +] +structured = [ + { name = "sqlglot" }, +] [package.metadata] requires-dist = [ { name = "bandit", marker = "extra == 'dev'" }, { name = "beir", marker = "extra == 'eval'", specifier = ">=2.0.0" }, + { name = "bm25s", marker = "extra == 'connectors'", specifier = ">=0.2.0" }, { name = "faiss-cpu", marker = "extra == 'advanced'", specifier = ">=1.7.0" }, { name = "faiss-cpu", marker = "extra == 'faiss'", specifier = 
">=1.7.0" }, + { name = "flashrank", marker = "extra == 'rerank'", specifier = ">=0.2.0" }, + { name = "haystack-ai", marker = "extra == 'orchestrator'", specifier = ">=2.0" }, { name = "joblib", marker = "extra == 'advanced'", specifier = ">=1.0.0" }, { name = "mloda", specifier = ">=0.8.0" }, { name = "mloda-testing", specifier = ">=0.3.2" }, { name = "mypy", marker = "extra == 'dev'" }, + { name = "networkx", marker = "extra == 'graph'", specifier = ">=3.0" }, { name = "numpy", marker = "extra == 'eval'", specifier = ">=1.21.0" }, { name = "pandas", marker = "extra == 'eval'", specifier = ">=1.3.0" }, { name = "pillow", marker = "extra == 'advanced'", specifier = ">=9.0.0" }, @@ -2749,10 +3175,26 @@ requires-dist = [ { name = "pytest", marker = "extra == 'dev'" }, { name = "ruff", marker = "extra == 'dev'" }, { name = "sentence-transformers", marker = "extra == 'advanced'", specifier = ">=2.2.0" }, + { name = "sqlglot", marker = "extra == 'structured'", specifier = ">=25" }, { name = "tox", marker = "extra == 'dev'" }, { name = "tox-uv", marker = "extra == 'dev'" }, ] -provides-extras = ["dev", "faiss", "advanced", "eval"] +provides-extras = ["dev", "faiss", "advanced", "eval", "connectors", "rerank", "graph", "structured", "orchestrator"] + +[[package]] +name = "referencing" +version = "0.37.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "rpds-py", version = "0.30.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "rpds-py", version = "2026.5.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/22/f5/df4e9027acead3ecc63e50fe1e36aca1523e1719559c499951bb4b53188f/referencing-0.37.0.tar.gz", hash = "sha256:44aefc3142c5b842538163acb373e24cce6632bd54bdb01b21ad5863489f50d8", 
size = 78036, upload-time = "2025-10-13T15:30:48.871Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/58/ca301544e1fa93ed4f80d724bf5b194f6e4b945841c5bfd555878eea9fcb/referencing-0.37.0-py3-none-any.whl", hash = "sha256:381329a9f99628c9069361716891d34ad94af76e461dcb0335825aecc7692231", size = 26766, upload-time = "2025-10-13T15:30:47.625Z" }, +] [[package]] name = "regex" @@ -2915,6 +3357,279 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/82/3b/64d4899d73f91ba49a8c18a8ff3f0ea8f1c1d75481760df8c68ef5235bf5/rich-15.0.0-py3-none-any.whl", hash = "sha256:33bd4ef74232fb73fe9279a257718407f169c09b78a87ad3d296f548e27de0bb", size = 310654, upload-time = "2026-04-12T08:24:02.83Z" }, ] +[[package]] +name = "rpds-py" +version = "0.30.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.11'", +] +sdist = { url = "https://files.pythonhosted.org/packages/20/af/3f2f423103f1113b36230496629986e0ef7e199d2aa8392452b484b38ced/rpds_py-0.30.0.tar.gz", hash = "sha256:dd8ff7cf90014af0c0f787eea34794ebf6415242ee1d6fa91eaba725cc441e84", size = 69469, upload-time = "2025-11-30T20:24:38.837Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/06/0c/0c411a0ec64ccb6d104dcabe0e713e05e153a9a2c3c2bd2b32ce412166fe/rpds_py-0.30.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:679ae98e00c0e8d68a7fda324e16b90fd5260945b45d3b824c892cec9eea3288", size = 370490, upload-time = "2025-11-30T20:21:33.256Z" }, + { url = "https://files.pythonhosted.org/packages/19/6a/4ba3d0fb7297ebae71171822554abe48d7cab29c28b8f9f2c04b79988c05/rpds_py-0.30.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4cc2206b76b4f576934f0ed374b10d7ca5f457858b157ca52064bdfc26b9fc00", size = 359751, upload-time = "2025-11-30T20:21:34.591Z" }, + { url = "https://files.pythonhosted.org/packages/cd/7c/e4933565ef7f7a0818985d87c15d9d273f1a649afa6a52ea35ad011195ea/rpds_py-0.30.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", 
hash = "sha256:389a2d49eded1896c3d48b0136ead37c48e221b391c052fba3f4055c367f60a6", size = 389696, upload-time = "2025-11-30T20:21:36.122Z" }, + { url = "https://files.pythonhosted.org/packages/5e/01/6271a2511ad0815f00f7ed4390cf2567bec1d4b1da39e2c27a41e6e3b4de/rpds_py-0.30.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:32c8528634e1bf7121f3de08fa85b138f4e0dc47657866630611b03967f041d7", size = 403136, upload-time = "2025-11-30T20:21:37.728Z" }, + { url = "https://files.pythonhosted.org/packages/55/64/c857eb7cd7541e9b4eee9d49c196e833128a55b89a9850a9c9ac33ccf897/rpds_py-0.30.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f207f69853edd6f6700b86efb84999651baf3789e78a466431df1331608e5324", size = 524699, upload-time = "2025-11-30T20:21:38.92Z" }, + { url = "https://files.pythonhosted.org/packages/9c/ed/94816543404078af9ab26159c44f9e98e20fe47e2126d5d32c9d9948d10a/rpds_py-0.30.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:67b02ec25ba7a9e8fa74c63b6ca44cf5707f2fbfadae3ee8e7494297d56aa9df", size = 412022, upload-time = "2025-11-30T20:21:40.407Z" }, + { url = "https://files.pythonhosted.org/packages/61/b5/707f6cf0066a6412aacc11d17920ea2e19e5b2f04081c64526eb35b5c6e7/rpds_py-0.30.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c0e95f6819a19965ff420f65578bacb0b00f251fefe2c8b23347c37174271f3", size = 390522, upload-time = "2025-11-30T20:21:42.17Z" }, + { url = "https://files.pythonhosted.org/packages/13/4e/57a85fda37a229ff4226f8cbcf09f2a455d1ed20e802ce5b2b4a7f5ed053/rpds_py-0.30.0-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:a452763cc5198f2f98898eb98f7569649fe5da666c2dc6b5ddb10fde5a574221", size = 404579, upload-time = "2025-11-30T20:21:43.769Z" }, + { url = "https://files.pythonhosted.org/packages/f9/da/c9339293513ec680a721e0e16bf2bac3db6e5d7e922488de471308349bba/rpds_py-0.30.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:e0b65193a413ccc930671c55153a03ee57cecb49e6227204b04fae512eb657a7", size = 421305, upload-time = "2025-11-30T20:21:44.994Z" }, + { url = "https://files.pythonhosted.org/packages/f9/be/522cb84751114f4ad9d822ff5a1aa3c98006341895d5f084779b99596e5c/rpds_py-0.30.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:858738e9c32147f78b3ac24dc0edb6610000e56dc0f700fd5f651d0a0f0eb9ff", size = 572503, upload-time = "2025-11-30T20:21:46.91Z" }, + { url = "https://files.pythonhosted.org/packages/a2/9b/de879f7e7ceddc973ea6e4629e9b380213a6938a249e94b0cdbcc325bb66/rpds_py-0.30.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:da279aa314f00acbb803da1e76fa18666778e8a8f83484fba94526da5de2cba7", size = 598322, upload-time = "2025-11-30T20:21:48.709Z" }, + { url = "https://files.pythonhosted.org/packages/48/ac/f01fc22efec3f37d8a914fc1b2fb9bcafd56a299edbe96406f3053edea5a/rpds_py-0.30.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7c64d38fb49b6cdeda16ab49e35fe0da2e1e9b34bc38bd78386530f218b37139", size = 560792, upload-time = "2025-11-30T20:21:50.024Z" }, + { url = "https://files.pythonhosted.org/packages/e2/da/4e2b19d0f131f35b6146425f846563d0ce036763e38913d917187307a671/rpds_py-0.30.0-cp310-cp310-win32.whl", hash = "sha256:6de2a32a1665b93233cde140ff8b3467bdb9e2af2b91079f0333a0974d12d464", size = 221901, upload-time = "2025-11-30T20:21:51.32Z" }, + { url = "https://files.pythonhosted.org/packages/96/cb/156d7a5cf4f78a7cc571465d8aec7a3c447c94f6749c5123f08438bcf7bc/rpds_py-0.30.0-cp310-cp310-win_amd64.whl", hash = "sha256:1726859cd0de969f88dc8673bdd954185b9104e05806be64bcd87badbe313169", size = 235823, upload-time = "2025-11-30T20:21:52.505Z" }, + { url = "https://files.pythonhosted.org/packages/4d/6e/f964e88b3d2abee2a82c1ac8366da848fce1c6d834dc2132c3fda3970290/rpds_py-0.30.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a2bffea6a4ca9f01b3f8e548302470306689684e61602aa3d141e34da06cf425", size = 370157, upload-time = "2025-11-30T20:21:53.789Z" }, + { url = 
"https://files.pythonhosted.org/packages/94/ba/24e5ebb7c1c82e74c4e4f33b2112a5573ddc703915b13a073737b59b86e0/rpds_py-0.30.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dc4f992dfe1e2bc3ebc7444f6c7051b4bc13cd8e33e43511e8ffd13bf407010d", size = 359676, upload-time = "2025-11-30T20:21:55.475Z" }, + { url = "https://files.pythonhosted.org/packages/84/86/04dbba1b087227747d64d80c3b74df946b986c57af0a9f0c98726d4d7a3b/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:422c3cb9856d80b09d30d2eb255d0754b23e090034e1deb4083f8004bd0761e4", size = 389938, upload-time = "2025-11-30T20:21:57.079Z" }, + { url = "https://files.pythonhosted.org/packages/42/bb/1463f0b1722b7f45431bdd468301991d1328b16cffe0b1c2918eba2c4eee/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:07ae8a593e1c3c6b82ca3292efbe73c30b61332fd612e05abee07c79359f292f", size = 402932, upload-time = "2025-11-30T20:21:58.47Z" }, + { url = "https://files.pythonhosted.org/packages/99/ee/2520700a5c1f2d76631f948b0736cdf9b0acb25abd0ca8e889b5c62ac2e3/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12f90dd7557b6bd57f40abe7747e81e0c0b119bef015ea7726e69fe550e394a4", size = 525830, upload-time = "2025-11-30T20:21:59.699Z" }, + { url = "https://files.pythonhosted.org/packages/e0/ad/bd0331f740f5705cc555a5e17fdf334671262160270962e69a2bdef3bf76/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:99b47d6ad9a6da00bec6aabe5a6279ecd3c06a329d4aa4771034a21e335c3a97", size = 412033, upload-time = "2025-11-30T20:22:00.991Z" }, + { url = "https://files.pythonhosted.org/packages/f8/1e/372195d326549bb51f0ba0f2ecb9874579906b97e08880e7a65c3bef1a99/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33f559f3104504506a44bb666b93a33f5d33133765b0c216a5bf2f1e1503af89", size = 390828, upload-time = "2025-11-30T20:22:02.723Z" }, + { url = 
"https://files.pythonhosted.org/packages/ab/2b/d88bb33294e3e0c76bc8f351a3721212713629ffca1700fa94979cb3eae8/rpds_py-0.30.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:946fe926af6e44f3697abbc305ea168c2c31d3e3ef1058cf68f379bf0335a78d", size = 404683, upload-time = "2025-11-30T20:22:04.367Z" }, + { url = "https://files.pythonhosted.org/packages/50/32/c759a8d42bcb5289c1fac697cd92f6fe01a018dd937e62ae77e0e7f15702/rpds_py-0.30.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:495aeca4b93d465efde585977365187149e75383ad2684f81519f504f5c13038", size = 421583, upload-time = "2025-11-30T20:22:05.814Z" }, + { url = "https://files.pythonhosted.org/packages/2b/81/e729761dbd55ddf5d84ec4ff1f47857f4374b0f19bdabfcf929164da3e24/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d9a0ca5da0386dee0655b4ccdf46119df60e0f10da268d04fe7cc87886872ba7", size = 572496, upload-time = "2025-11-30T20:22:07.713Z" }, + { url = "https://files.pythonhosted.org/packages/14/f6/69066a924c3557c9c30baa6ec3a0aa07526305684c6f86c696b08860726c/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8d6d1cc13664ec13c1b84241204ff3b12f9bb82464b8ad6e7a5d3486975c2eed", size = 598669, upload-time = "2025-11-30T20:22:09.312Z" }, + { url = "https://files.pythonhosted.org/packages/5f/48/905896b1eb8a05630d20333d1d8ffd162394127b74ce0b0784ae04498d32/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3896fa1be39912cf0757753826bc8bdc8ca331a28a7c4ae46b7a21280b06bb85", size = 561011, upload-time = "2025-11-30T20:22:11.309Z" }, + { url = "https://files.pythonhosted.org/packages/22/16/cd3027c7e279d22e5eb431dd3c0fbc677bed58797fe7581e148f3f68818b/rpds_py-0.30.0-cp311-cp311-win32.whl", hash = "sha256:55f66022632205940f1827effeff17c4fa7ae1953d2b74a8581baaefb7d16f8c", size = 221406, upload-time = "2025-11-30T20:22:13.101Z" }, + { url = 
"https://files.pythonhosted.org/packages/fa/5b/e7b7aa136f28462b344e652ee010d4de26ee9fd16f1bfd5811f5153ccf89/rpds_py-0.30.0-cp311-cp311-win_amd64.whl", hash = "sha256:a51033ff701fca756439d641c0ad09a41d9242fa69121c7d8769604a0a629825", size = 236024, upload-time = "2025-11-30T20:22:14.853Z" }, + { url = "https://files.pythonhosted.org/packages/14/a6/364bba985e4c13658edb156640608f2c9e1d3ea3c81b27aa9d889fff0e31/rpds_py-0.30.0-cp311-cp311-win_arm64.whl", hash = "sha256:47b0ef6231c58f506ef0b74d44e330405caa8428e770fec25329ed2cb971a229", size = 229069, upload-time = "2025-11-30T20:22:16.577Z" }, + { url = "https://files.pythonhosted.org/packages/03/e7/98a2f4ac921d82f33e03f3835f5bf3a4a40aa1bfdc57975e74a97b2b4bdd/rpds_py-0.30.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a161f20d9a43006833cd7068375a94d035714d73a172b681d8881820600abfad", size = 375086, upload-time = "2025-11-30T20:22:17.93Z" }, + { url = "https://files.pythonhosted.org/packages/4d/a1/bca7fd3d452b272e13335db8d6b0b3ecde0f90ad6f16f3328c6fb150c889/rpds_py-0.30.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6abc8880d9d036ecaafe709079969f56e876fcf107f7a8e9920ba6d5a3878d05", size = 359053, upload-time = "2025-11-30T20:22:19.297Z" }, + { url = "https://files.pythonhosted.org/packages/65/1c/ae157e83a6357eceff62ba7e52113e3ec4834a84cfe07fa4b0757a7d105f/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca28829ae5f5d569bb62a79512c842a03a12576375d5ece7d2cadf8abe96ec28", size = 390763, upload-time = "2025-11-30T20:22:21.661Z" }, + { url = "https://files.pythonhosted.org/packages/d4/36/eb2eb8515e2ad24c0bd43c3ee9cd74c33f7ca6430755ccdb240fd3144c44/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a1010ed9524c73b94d15919ca4d41d8780980e1765babf85f9a2f90d247153dd", size = 408951, upload-time = "2025-11-30T20:22:23.408Z" }, + { url = 
"https://files.pythonhosted.org/packages/d6/65/ad8dc1784a331fabbd740ef6f71ce2198c7ed0890dab595adb9ea2d775a1/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f8d1736cfb49381ba528cd5baa46f82fdc65c06e843dab24dd70b63d09121b3f", size = 514622, upload-time = "2025-11-30T20:22:25.16Z" }, + { url = "https://files.pythonhosted.org/packages/63/8e/0cfa7ae158e15e143fe03993b5bcd743a59f541f5952e1546b1ac1b5fd45/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d948b135c4693daff7bc2dcfc4ec57237a29bd37e60c2fabf5aff2bbacf3e2f1", size = 414492, upload-time = "2025-11-30T20:22:26.505Z" }, + { url = "https://files.pythonhosted.org/packages/60/1b/6f8f29f3f995c7ffdde46a626ddccd7c63aefc0efae881dc13b6e5d5bb16/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47f236970bccb2233267d89173d3ad2703cd36a0e2a6e92d0560d333871a3d23", size = 394080, upload-time = "2025-11-30T20:22:27.934Z" }, + { url = "https://files.pythonhosted.org/packages/6d/d5/a266341051a7a3ca2f4b750a3aa4abc986378431fc2da508c5034d081b70/rpds_py-0.30.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:2e6ecb5a5bcacf59c3f912155044479af1d0b6681280048b338b28e364aca1f6", size = 408680, upload-time = "2025-11-30T20:22:29.341Z" }, + { url = "https://files.pythonhosted.org/packages/10/3b/71b725851df9ab7a7a4e33cf36d241933da66040d195a84781f49c50490c/rpds_py-0.30.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a8fa71a2e078c527c3e9dc9fc5a98c9db40bcc8a92b4e8858e36d329f8684b51", size = 423589, upload-time = "2025-11-30T20:22:31.469Z" }, + { url = "https://files.pythonhosted.org/packages/00/2b/e59e58c544dc9bd8bd8384ecdb8ea91f6727f0e37a7131baeff8d6f51661/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:73c67f2db7bc334e518d097c6d1e6fed021bbc9b7d678d6cc433478365d1d5f5", size = 573289, upload-time = "2025-11-30T20:22:32.997Z" }, + { url = 
"https://files.pythonhosted.org/packages/da/3e/a18e6f5b460893172a7d6a680e86d3b6bc87a54c1f0b03446a3c8c7b588f/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5ba103fb455be00f3b1c2076c9d4264bfcb037c976167a6047ed82f23153f02e", size = 599737, upload-time = "2025-11-30T20:22:34.419Z" }, + { url = "https://files.pythonhosted.org/packages/5c/e2/714694e4b87b85a18e2c243614974413c60aa107fd815b8cbc42b873d1d7/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7cee9c752c0364588353e627da8a7e808a66873672bcb5f52890c33fd965b394", size = 563120, upload-time = "2025-11-30T20:22:35.903Z" }, + { url = "https://files.pythonhosted.org/packages/6f/ab/d5d5e3bcedb0a77f4f613706b750e50a5a3ba1c15ccd3665ecc636c968fd/rpds_py-0.30.0-cp312-cp312-win32.whl", hash = "sha256:1ab5b83dbcf55acc8b08fc62b796ef672c457b17dbd7820a11d6c52c06839bdf", size = 223782, upload-time = "2025-11-30T20:22:37.271Z" }, + { url = "https://files.pythonhosted.org/packages/39/3b/f786af9957306fdc38a74cef405b7b93180f481fb48453a114bb6465744a/rpds_py-0.30.0-cp312-cp312-win_amd64.whl", hash = "sha256:a090322ca841abd453d43456ac34db46e8b05fd9b3b4ac0c78bcde8b089f959b", size = 240463, upload-time = "2025-11-30T20:22:39.021Z" }, + { url = "https://files.pythonhosted.org/packages/f3/d2/b91dc748126c1559042cfe41990deb92c4ee3e2b415f6b5234969ffaf0cc/rpds_py-0.30.0-cp312-cp312-win_arm64.whl", hash = "sha256:669b1805bd639dd2989b281be2cfd951c6121b65e729d9b843e9639ef1fd555e", size = 230868, upload-time = "2025-11-30T20:22:40.493Z" }, + { url = "https://files.pythonhosted.org/packages/ed/dc/d61221eb88ff410de3c49143407f6f3147acf2538c86f2ab7ce65ae7d5f9/rpds_py-0.30.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:f83424d738204d9770830d35290ff3273fbb02b41f919870479fab14b9d303b2", size = 374887, upload-time = "2025-11-30T20:22:41.812Z" }, + { url = "https://files.pythonhosted.org/packages/fd/32/55fb50ae104061dbc564ef15cc43c013dc4a9f4527a1f4d99baddf56fe5f/rpds_py-0.30.0-cp313-cp313-macosx_11_0_arm64.whl", 
hash = "sha256:e7536cd91353c5273434b4e003cbda89034d67e7710eab8761fd918ec6c69cf8", size = 358904, upload-time = "2025-11-30T20:22:43.479Z" }, + { url = "https://files.pythonhosted.org/packages/58/70/faed8186300e3b9bdd138d0273109784eea2396c68458ed580f885dfe7ad/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2771c6c15973347f50fece41fc447c054b7ac2ae0502388ce3b6738cd366e3d4", size = 389945, upload-time = "2025-11-30T20:22:44.819Z" }, + { url = "https://files.pythonhosted.org/packages/bd/a8/073cac3ed2c6387df38f71296d002ab43496a96b92c823e76f46b8af0543/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0a59119fc6e3f460315fe9d08149f8102aa322299deaa5cab5b40092345c2136", size = 407783, upload-time = "2025-11-30T20:22:46.103Z" }, + { url = "https://files.pythonhosted.org/packages/77/57/5999eb8c58671f1c11eba084115e77a8899d6e694d2a18f69f0ba471ec8b/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:76fec018282b4ead0364022e3c54b60bf368b9d926877957a8624b58419169b7", size = 515021, upload-time = "2025-11-30T20:22:47.458Z" }, + { url = "https://files.pythonhosted.org/packages/e0/af/5ab4833eadc36c0a8ed2bc5c0de0493c04f6c06de223170bd0798ff98ced/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:692bef75a5525db97318e8cd061542b5a79812d711ea03dbc1f6f8dbb0c5f0d2", size = 414589, upload-time = "2025-11-30T20:22:48.872Z" }, + { url = "https://files.pythonhosted.org/packages/b7/de/f7192e12b21b9e9a68a6d0f249b4af3fdcdff8418be0767a627564afa1f1/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9027da1ce107104c50c81383cae773ef5c24d296dd11c99e2629dbd7967a20c6", size = 394025, upload-time = "2025-11-30T20:22:50.196Z" }, + { url = "https://files.pythonhosted.org/packages/91/c4/fc70cd0249496493500e7cc2de87504f5aa6509de1e88623431fec76d4b6/rpds_py-0.30.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = 
"sha256:9cf69cdda1f5968a30a359aba2f7f9aa648a9ce4b580d6826437f2b291cfc86e", size = 408895, upload-time = "2025-11-30T20:22:51.87Z" }, + { url = "https://files.pythonhosted.org/packages/58/95/d9275b05ab96556fefff73a385813eb66032e4c99f411d0795372d9abcea/rpds_py-0.30.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a4796a717bf12b9da9d3ad002519a86063dcac8988b030e405704ef7d74d2d9d", size = 422799, upload-time = "2025-11-30T20:22:53.341Z" }, + { url = "https://files.pythonhosted.org/packages/06/c1/3088fc04b6624eb12a57eb814f0d4997a44b0d208d6cace713033ff1a6ba/rpds_py-0.30.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5d4c2aa7c50ad4728a094ebd5eb46c452e9cb7edbfdb18f9e1221f597a73e1e7", size = 572731, upload-time = "2025-11-30T20:22:54.778Z" }, + { url = "https://files.pythonhosted.org/packages/d8/42/c612a833183b39774e8ac8fecae81263a68b9583ee343db33ab571a7ce55/rpds_py-0.30.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ba81a9203d07805435eb06f536d95a266c21e5b2dfbf6517748ca40c98d19e31", size = 599027, upload-time = "2025-11-30T20:22:56.212Z" }, + { url = "https://files.pythonhosted.org/packages/5f/60/525a50f45b01d70005403ae0e25f43c0384369ad24ffe46e8d9068b50086/rpds_py-0.30.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:945dccface01af02675628334f7cf49c2af4c1c904748efc5cf7bbdf0b579f95", size = 563020, upload-time = "2025-11-30T20:22:58.2Z" }, + { url = "https://files.pythonhosted.org/packages/0b/5d/47c4655e9bcd5ca907148535c10e7d489044243cc9941c16ed7cd53be91d/rpds_py-0.30.0-cp313-cp313-win32.whl", hash = "sha256:b40fb160a2db369a194cb27943582b38f79fc4887291417685f3ad693c5a1d5d", size = 223139, upload-time = "2025-11-30T20:23:00.209Z" }, + { url = "https://files.pythonhosted.org/packages/f2/e1/485132437d20aa4d3e1d8b3fb5a5e65aa8139f1e097080c2a8443201742c/rpds_py-0.30.0-cp313-cp313-win_amd64.whl", hash = "sha256:806f36b1b605e2d6a72716f321f20036b9489d29c51c91f4dd29a3e3afb73b15", size = 240224, upload-time = "2025-11-30T20:23:02.008Z" }, + { 
url = "https://files.pythonhosted.org/packages/24/95/ffd128ed1146a153d928617b0ef673960130be0009c77d8fbf0abe306713/rpds_py-0.30.0-cp313-cp313-win_arm64.whl", hash = "sha256:d96c2086587c7c30d44f31f42eae4eac89b60dabbac18c7669be3700f13c3ce1", size = 230645, upload-time = "2025-11-30T20:23:03.43Z" }, + { url = "https://files.pythonhosted.org/packages/ff/1b/b10de890a0def2a319a2626334a7f0ae388215eb60914dbac8a3bae54435/rpds_py-0.30.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:eb0b93f2e5c2189ee831ee43f156ed34e2a89a78a66b98cadad955972548be5a", size = 364443, upload-time = "2025-11-30T20:23:04.878Z" }, + { url = "https://files.pythonhosted.org/packages/0d/bf/27e39f5971dc4f305a4fb9c672ca06f290f7c4e261c568f3dea16a410d47/rpds_py-0.30.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:922e10f31f303c7c920da8981051ff6d8c1a56207dbdf330d9047f6d30b70e5e", size = 353375, upload-time = "2025-11-30T20:23:06.342Z" }, + { url = "https://files.pythonhosted.org/packages/40/58/442ada3bba6e8e6615fc00483135c14a7538d2ffac30e2d933ccf6852232/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cdc62c8286ba9bf7f47befdcea13ea0e26bf294bda99758fd90535cbaf408000", size = 383850, upload-time = "2025-11-30T20:23:07.825Z" }, + { url = "https://files.pythonhosted.org/packages/14/14/f59b0127409a33c6ef6f5c1ebd5ad8e32d7861c9c7adfa9a624fc3889f6c/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:47f9a91efc418b54fb8190a6b4aa7813a23fb79c51f4bb84e418f5476c38b8db", size = 392812, upload-time = "2025-11-30T20:23:09.228Z" }, + { url = "https://files.pythonhosted.org/packages/b3/66/e0be3e162ac299b3a22527e8913767d869e6cc75c46bd844aa43fb81ab62/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1f3587eb9b17f3789ad50824084fa6f81921bbf9a795826570bda82cb3ed91f2", size = 517841, upload-time = "2025-11-30T20:23:11.186Z" }, + { url = 
"https://files.pythonhosted.org/packages/3d/55/fa3b9cf31d0c963ecf1ba777f7cf4b2a2c976795ac430d24a1f43d25a6ba/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:39c02563fc592411c2c61d26b6c5fe1e51eaa44a75aa2c8735ca88b0d9599daa", size = 408149, upload-time = "2025-11-30T20:23:12.864Z" }, + { url = "https://files.pythonhosted.org/packages/60/ca/780cf3b1a32b18c0f05c441958d3758f02544f1d613abf9488cd78876378/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51a1234d8febafdfd33a42d97da7a43f5dcb120c1060e352a3fbc0c6d36e2083", size = 383843, upload-time = "2025-11-30T20:23:14.638Z" }, + { url = "https://files.pythonhosted.org/packages/82/86/d5f2e04f2aa6247c613da0c1dd87fcd08fa17107e858193566048a1e2f0a/rpds_py-0.30.0-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:eb2c4071ab598733724c08221091e8d80e89064cd472819285a9ab0f24bcedb9", size = 396507, upload-time = "2025-11-30T20:23:16.105Z" }, + { url = "https://files.pythonhosted.org/packages/4b/9a/453255d2f769fe44e07ea9785c8347edaf867f7026872e76c1ad9f7bed92/rpds_py-0.30.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6bdfdb946967d816e6adf9a3d8201bfad269c67efe6cefd7093ef959683c8de0", size = 414949, upload-time = "2025-11-30T20:23:17.539Z" }, + { url = "https://files.pythonhosted.org/packages/a3/31/622a86cdc0c45d6df0e9ccb6becdba5074735e7033c20e401a6d9d0e2ca0/rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c77afbd5f5250bf27bf516c7c4a016813eb2d3e116139aed0096940c5982da94", size = 565790, upload-time = "2025-11-30T20:23:19.029Z" }, + { url = "https://files.pythonhosted.org/packages/1c/5d/15bbf0fb4a3f58a3b1c67855ec1efcc4ceaef4e86644665fff03e1b66d8d/rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:61046904275472a76c8c90c9ccee9013d70a6d0f73eecefd38c1ae7c39045a08", size = 590217, upload-time = "2025-11-30T20:23:20.885Z" }, + { url = 
"https://files.pythonhosted.org/packages/6d/61/21b8c41f68e60c8cc3b2e25644f0e3681926020f11d06ab0b78e3c6bbff1/rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4c5f36a861bc4b7da6516dbdf302c55313afa09b81931e8280361a4f6c9a2d27", size = 555806, upload-time = "2025-11-30T20:23:22.488Z" }, + { url = "https://files.pythonhosted.org/packages/f9/39/7e067bb06c31de48de3eb200f9fc7c58982a4d3db44b07e73963e10d3be9/rpds_py-0.30.0-cp313-cp313t-win32.whl", hash = "sha256:3d4a69de7a3e50ffc214ae16d79d8fbb0922972da0356dcf4d0fdca2878559c6", size = 211341, upload-time = "2025-11-30T20:23:24.449Z" }, + { url = "https://files.pythonhosted.org/packages/0a/4d/222ef0b46443cf4cf46764d9c630f3fe4abaa7245be9417e56e9f52b8f65/rpds_py-0.30.0-cp313-cp313t-win_amd64.whl", hash = "sha256:f14fc5df50a716f7ece6a80b6c78bb35ea2ca47c499e422aa4463455dd96d56d", size = 225768, upload-time = "2025-11-30T20:23:25.908Z" }, + { url = "https://files.pythonhosted.org/packages/86/81/dad16382ebbd3d0e0328776d8fd7ca94220e4fa0798d1dc5e7da48cb3201/rpds_py-0.30.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:68f19c879420aa08f61203801423f6cd5ac5f0ac4ac82a2368a9fcd6a9a075e0", size = 362099, upload-time = "2025-11-30T20:23:27.316Z" }, + { url = "https://files.pythonhosted.org/packages/2b/60/19f7884db5d5603edf3c6bce35408f45ad3e97e10007df0e17dd57af18f8/rpds_py-0.30.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:ec7c4490c672c1a0389d319b3a9cfcd098dcdc4783991553c332a15acf7249be", size = 353192, upload-time = "2025-11-30T20:23:29.151Z" }, + { url = "https://files.pythonhosted.org/packages/bf/c4/76eb0e1e72d1a9c4703c69607cec123c29028bff28ce41588792417098ac/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f251c812357a3fed308d684a5079ddfb9d933860fc6de89f2b7ab00da481e65f", size = 384080, upload-time = "2025-11-30T20:23:30.785Z" }, + { url = 
"https://files.pythonhosted.org/packages/72/87/87ea665e92f3298d1b26d78814721dc39ed8d2c74b86e83348d6b48a6f31/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ac98b175585ecf4c0348fd7b29c3864bda53b805c773cbf7bfdaffc8070c976f", size = 394841, upload-time = "2025-11-30T20:23:32.209Z" }, + { url = "https://files.pythonhosted.org/packages/77/ad/7783a89ca0587c15dcbf139b4a8364a872a25f861bdb88ed99f9b0dec985/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3e62880792319dbeb7eb866547f2e35973289e7d5696c6e295476448f5b63c87", size = 516670, upload-time = "2025-11-30T20:23:33.742Z" }, + { url = "https://files.pythonhosted.org/packages/5b/3c/2882bdac942bd2172f3da574eab16f309ae10a3925644e969536553cb4ee/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4e7fc54e0900ab35d041b0601431b0a0eb495f0851a0639b6ef90f7741b39a18", size = 408005, upload-time = "2025-11-30T20:23:35.253Z" }, + { url = "https://files.pythonhosted.org/packages/ce/81/9a91c0111ce1758c92516a3e44776920b579d9a7c09b2b06b642d4de3f0f/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47e77dc9822d3ad616c3d5759ea5631a75e5809d5a28707744ef79d7a1bcfcad", size = 382112, upload-time = "2025-11-30T20:23:36.842Z" }, + { url = "https://files.pythonhosted.org/packages/cf/8e/1da49d4a107027e5fbc64daeab96a0706361a2918da10cb41769244b805d/rpds_py-0.30.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:b4dc1a6ff022ff85ecafef7979a2c6eb423430e05f1165d6688234e62ba99a07", size = 399049, upload-time = "2025-11-30T20:23:38.343Z" }, + { url = "https://files.pythonhosted.org/packages/df/5a/7ee239b1aa48a127570ec03becbb29c9d5a9eb092febbd1699d567cae859/rpds_py-0.30.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4559c972db3a360808309e06a74628b95eaccbf961c335c8fe0d590cf587456f", size = 415661, upload-time = "2025-11-30T20:23:40.263Z" }, + { url = 
"https://files.pythonhosted.org/packages/70/ea/caa143cf6b772f823bc7929a45da1fa83569ee49b11d18d0ada7f5ee6fd6/rpds_py-0.30.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:0ed177ed9bded28f8deb6ab40c183cd1192aa0de40c12f38be4d59cd33cb5c65", size = 565606, upload-time = "2025-11-30T20:23:42.186Z" }, + { url = "https://files.pythonhosted.org/packages/64/91/ac20ba2d69303f961ad8cf55bf7dbdb4763f627291ba3d0d7d67333cced9/rpds_py-0.30.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:ad1fa8db769b76ea911cb4e10f049d80bf518c104f15b3edb2371cc65375c46f", size = 591126, upload-time = "2025-11-30T20:23:44.086Z" }, + { url = "https://files.pythonhosted.org/packages/21/20/7ff5f3c8b00c8a95f75985128c26ba44503fb35b8e0259d812766ea966c7/rpds_py-0.30.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:46e83c697b1f1c72b50e5ee5adb4353eef7406fb3f2043d64c33f20ad1c2fc53", size = 553371, upload-time = "2025-11-30T20:23:46.004Z" }, + { url = "https://files.pythonhosted.org/packages/72/c7/81dadd7b27c8ee391c132a6b192111ca58d866577ce2d9b0ca157552cce0/rpds_py-0.30.0-cp314-cp314-win32.whl", hash = "sha256:ee454b2a007d57363c2dfd5b6ca4a5d7e2c518938f8ed3b706e37e5d470801ed", size = 215298, upload-time = "2025-11-30T20:23:47.696Z" }, + { url = "https://files.pythonhosted.org/packages/3e/d2/1aaac33287e8cfb07aab2e6b8ac1deca62f6f65411344f1433c55e6f3eb8/rpds_py-0.30.0-cp314-cp314-win_amd64.whl", hash = "sha256:95f0802447ac2d10bcc69f6dc28fe95fdf17940367b21d34e34c737870758950", size = 228604, upload-time = "2025-11-30T20:23:49.501Z" }, + { url = "https://files.pythonhosted.org/packages/e8/95/ab005315818cc519ad074cb7784dae60d939163108bd2b394e60dc7b5461/rpds_py-0.30.0-cp314-cp314-win_arm64.whl", hash = "sha256:613aa4771c99f03346e54c3f038e4cc574ac09a3ddfb0e8878487335e96dead6", size = 222391, upload-time = "2025-11-30T20:23:50.96Z" }, + { url = 
"https://files.pythonhosted.org/packages/9e/68/154fe0194d83b973cdedcdcc88947a2752411165930182ae41d983dcefa6/rpds_py-0.30.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:7e6ecfcb62edfd632e56983964e6884851786443739dbfe3582947e87274f7cb", size = 364868, upload-time = "2025-11-30T20:23:52.494Z" }, + { url = "https://files.pythonhosted.org/packages/83/69/8bbc8b07ec854d92a8b75668c24d2abcb1719ebf890f5604c61c9369a16f/rpds_py-0.30.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:a1d0bc22a7cdc173fedebb73ef81e07faef93692b8c1ad3733b67e31e1b6e1b8", size = 353747, upload-time = "2025-11-30T20:23:54.036Z" }, + { url = "https://files.pythonhosted.org/packages/ab/00/ba2e50183dbd9abcce9497fa5149c62b4ff3e22d338a30d690f9af970561/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d08f00679177226c4cb8c5265012eea897c8ca3b93f429e546600c971bcbae7", size = 383795, upload-time = "2025-11-30T20:23:55.556Z" }, + { url = "https://files.pythonhosted.org/packages/05/6f/86f0272b84926bcb0e4c972262f54223e8ecc556b3224d281e6598fc9268/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5965af57d5848192c13534f90f9dd16464f3c37aaf166cc1da1cae1fd5a34898", size = 393330, upload-time = "2025-11-30T20:23:57.033Z" }, + { url = "https://files.pythonhosted.org/packages/cb/e9/0e02bb2e6dc63d212641da45df2b0bf29699d01715913e0d0f017ee29438/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9a4e86e34e9ab6b667c27f3211ca48f73dba7cd3d90f8d5b11be56e5dbc3fb4e", size = 518194, upload-time = "2025-11-30T20:23:58.637Z" }, + { url = "https://files.pythonhosted.org/packages/ee/ca/be7bca14cf21513bdf9c0606aba17d1f389ea2b6987035eb4f62bd923f25/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5d3e6b26f2c785d65cc25ef1e5267ccbe1b069c5c21b8cc724efee290554419", size = 408340, upload-time = "2025-11-30T20:24:00.2Z" }, + { url = 
"https://files.pythonhosted.org/packages/c2/c7/736e00ebf39ed81d75544c0da6ef7b0998f8201b369acf842f9a90dc8fce/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:626a7433c34566535b6e56a1b39a7b17ba961e97ce3b80ec62e6f1312c025551", size = 383765, upload-time = "2025-11-30T20:24:01.759Z" }, + { url = "https://files.pythonhosted.org/packages/4a/3f/da50dfde9956aaf365c4adc9533b100008ed31aea635f2b8d7b627e25b49/rpds_py-0.30.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:acd7eb3f4471577b9b5a41baf02a978e8bdeb08b4b355273994f8b87032000a8", size = 396834, upload-time = "2025-11-30T20:24:03.687Z" }, + { url = "https://files.pythonhosted.org/packages/4e/00/34bcc2565b6020eab2623349efbdec810676ad571995911f1abdae62a3a0/rpds_py-0.30.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fe5fa731a1fa8a0a56b0977413f8cacac1768dad38d16b3a296712709476fbd5", size = 415470, upload-time = "2025-11-30T20:24:05.232Z" }, + { url = "https://files.pythonhosted.org/packages/8c/28/882e72b5b3e6f718d5453bd4d0d9cf8df36fddeb4ddbbab17869d5868616/rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:74a3243a411126362712ee1524dfc90c650a503502f135d54d1b352bd01f2404", size = 565630, upload-time = "2025-11-30T20:24:06.878Z" }, + { url = "https://files.pythonhosted.org/packages/3b/97/04a65539c17692de5b85c6e293520fd01317fd878ea1995f0367d4532fb1/rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:3e8eeb0544f2eb0d2581774be4c3410356eba189529a6b3e36bbbf9696175856", size = 591148, upload-time = "2025-11-30T20:24:08.445Z" }, + { url = "https://files.pythonhosted.org/packages/85/70/92482ccffb96f5441aab93e26c4d66489eb599efdcf96fad90c14bbfb976/rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:dbd936cde57abfee19ab3213cf9c26be06d60750e60a8e4dd85d1ab12c8b1f40", size = 556030, upload-time = "2025-11-30T20:24:10.956Z" }, + { url = 
"https://files.pythonhosted.org/packages/20/53/7c7e784abfa500a2b6b583b147ee4bb5a2b3747a9166bab52fec4b5b5e7d/rpds_py-0.30.0-cp314-cp314t-win32.whl", hash = "sha256:dc824125c72246d924f7f796b4f63c1e9dc810c7d9e2355864b3c3a73d59ade0", size = 211570, upload-time = "2025-11-30T20:24:12.735Z" }, + { url = "https://files.pythonhosted.org/packages/d0/02/fa464cdfbe6b26e0600b62c528b72d8608f5cc49f96b8d6e38c95d60c676/rpds_py-0.30.0-cp314-cp314t-win_amd64.whl", hash = "sha256:27f4b0e92de5bfbc6f86e43959e6edd1425c33b5e69aab0984a72047f2bcf1e3", size = 226532, upload-time = "2025-11-30T20:24:14.634Z" }, + { url = "https://files.pythonhosted.org/packages/69/71/3f34339ee70521864411f8b6992e7ab13ac30d8e4e3309e07c7361767d91/rpds_py-0.30.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c2262bdba0ad4fc6fb5545660673925c2d2a5d9e2e0fb603aad545427be0fc58", size = 372292, upload-time = "2025-11-30T20:24:16.537Z" }, + { url = "https://files.pythonhosted.org/packages/57/09/f183df9b8f2d66720d2ef71075c59f7e1b336bec7ee4c48f0a2b06857653/rpds_py-0.30.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:ee6af14263f25eedc3bb918a3c04245106a42dfd4f5c2285ea6f997b1fc3f89a", size = 362128, upload-time = "2025-11-30T20:24:18.086Z" }, + { url = "https://files.pythonhosted.org/packages/7a/68/5c2594e937253457342e078f0cc1ded3dd7b2ad59afdbf2d354869110a02/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3adbb8179ce342d235c31ab8ec511e66c73faa27a47e076ccc92421add53e2bb", size = 391542, upload-time = "2025-11-30T20:24:20.092Z" }, + { url = "https://files.pythonhosted.org/packages/49/5c/31ef1afd70b4b4fbdb2800249f34c57c64beb687495b10aec0365f53dfc4/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:250fa00e9543ac9b97ac258bd37367ff5256666122c2d0f2bc97577c60a1818c", size = 404004, upload-time = "2025-11-30T20:24:22.231Z" }, + { url = 
"https://files.pythonhosted.org/packages/e3/63/0cfbea38d05756f3440ce6534d51a491d26176ac045e2707adc99bb6e60a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9854cf4f488b3d57b9aaeb105f06d78e5529d3145b1e4a41750167e8c213c6d3", size = 527063, upload-time = "2025-11-30T20:24:24.302Z" }, + { url = "https://files.pythonhosted.org/packages/42/e6/01e1f72a2456678b0f618fc9a1a13f882061690893c192fcad9f2926553a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:993914b8e560023bc0a8bf742c5f303551992dcb85e247b1e5c7f4a7d145bda5", size = 413099, upload-time = "2025-11-30T20:24:25.916Z" }, + { url = "https://files.pythonhosted.org/packages/b8/25/8df56677f209003dcbb180765520c544525e3ef21ea72279c98b9aa7c7fb/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58edca431fb9b29950807e301826586e5bbf24163677732429770a697ffe6738", size = 392177, upload-time = "2025-11-30T20:24:27.834Z" }, + { url = "https://files.pythonhosted.org/packages/4a/b4/0a771378c5f16f8115f796d1f437950158679bcd2a7c68cf251cfb00ed5b/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:dea5b552272a944763b34394d04577cf0f9bd013207bc32323b5a89a53cf9c2f", size = 406015, upload-time = "2025-11-30T20:24:29.457Z" }, + { url = "https://files.pythonhosted.org/packages/36/d8/456dbba0af75049dc6f63ff295a2f92766b9d521fa00de67a2bd6427d57a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ba3af48635eb83d03f6c9735dfb21785303e73d22ad03d489e88adae6eab8877", size = 423736, upload-time = "2025-11-30T20:24:31.22Z" }, + { url = "https://files.pythonhosted.org/packages/13/64/b4d76f227d5c45a7e0b796c674fd81b0a6c4fbd48dc29271857d8219571c/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:dff13836529b921e22f15cb099751209a60009731a68519630a24d61f0b1b30a", size = 573981, upload-time = "2025-11-30T20:24:32.934Z" }, + { url 
= "https://files.pythonhosted.org/packages/20/91/092bacadeda3edf92bf743cc96a7be133e13a39cdbfd7b5082e7ab638406/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:1b151685b23929ab7beec71080a8889d4d6d9fa9a983d213f07121205d48e2c4", size = 599782, upload-time = "2025-11-30T20:24:35.169Z" }, + { url = "https://files.pythonhosted.org/packages/d1/b7/b95708304cd49b7b6f82fdd039f1748b66ec2b21d6a45180910802f1abf1/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:ac37f9f516c51e5753f27dfdef11a88330f04de2d564be3991384b2f3535d02e", size = 562191, upload-time = "2025-11-30T20:24:36.853Z" }, +] + +[[package]] +name = "rpds-py" +version = "2026.5.1" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.15' and sys_platform == 'win32'", + "python_full_version >= '3.15' and sys_platform == 'emscripten'", + "python_full_version >= '3.15' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version == '3.14.*' and sys_platform == 'win32'", + "python_full_version == '3.14.*' and sys_platform == 'emscripten'", + "python_full_version == '3.14.*' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", +] +sdist = { url = "https://files.pythonhosted.org/packages/2e/43/25a8dcd3feedd735039a8f0b5b7e3b118232b5eae288c4fd9ab200d41094/rpds_py-2026.5.1.tar.gz", hash = "sha256:07b24fea40541e28570e5b795a4a38fbdcd12550c06bd0748005ecc8116ca256", size = 64459, upload-time = "2026-05-28T12:02:13.232Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/4f/a0/acf8b6fc20bfdcd3a45bd3f57680fb198e157b7e997b9123b10763798bd2/rpds_py-2026.5.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:3397a5ed7174dc2786bb214030232fc36fe8e5584fec43a9952cc542b1a12036", size = 355609, upload-time = "2026-05-28T11:58:50.78Z" }, + { url = "https://files.pythonhosted.org/packages/b6/95/f8203fd997484b1690a6869cd0e503b6c3c6be55b0ecc36d1a491fe742f0/rpds_py-2026.5.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:99ab6ba7bfa2cb0f96a04e3652355bf04e3f51aceb1e943b8541dab7ba4828cc", size = 348460, upload-time = "2026-05-28T11:58:52.374Z" }, + { url = "https://files.pythonhosted.org/packages/33/8c/b47326ad2f0be545a5e5c1a55937a12afaea7d392ba2837bb9680f57e6c9/rpds_py-2026.5.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0efbe45632665e53e3db8fe1e5692db58fc5cb9bab4459d570b83efefe11164", size = 381031, upload-time = "2026-05-28T11:58:53.775Z" }, + { url = "https://files.pythonhosted.org/packages/22/0b/e83bbd97ffac6f6389b605cd4e1c8ac5761dc7e977769c9255d8c5adb7bd/rpds_py-2026.5.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:01d17b29c0c23d82b1f4751147ec49cf451f1fc2554eb9ef5f957e55d2656ead", size = 387121, upload-time = "2026-05-28T11:58:55.243Z" }, + { url = "https://files.pythonhosted.org/packages/fd/0e/d285d1bc8864245919c61e1ca82263e4a66d337759c3a4cef72766ff9afc/rpds_py-2026.5.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7559f72b94ae52659086c595dfa017cde03155f7832071d30959049052cb3ece", size = 501026, upload-time = "2026-05-28T11:58:56.788Z" }, + { url = "https://files.pythonhosted.org/packages/86/06/ccb2109a1e543437b5e43816f2b43b9554cc6783145528a4e3711e05c011/rpds_py-2026.5.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9e25b7088f9ccbfc0dfcaa52bf969300ca229e10ecf758974ebcbb080a4b37bb", size = 391865, upload-time = "2026-05-28T11:58:58.298Z" }, + { url = 
"https://files.pythonhosted.org/packages/3d/33/237173db1cfef10105b3839a24de00eb8d2a523711add4632447cdf0aedd/rpds_py-2026.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:613fc4ee9eaef26dc5840666214dd6fbcebcf32f46e76f4abc473059f4e13dda", size = 378012, upload-time = "2026-05-28T11:58:59.589Z" }, + { url = "https://files.pythonhosted.org/packages/97/64/1eae54e34d5161f9969295e80bd6b62a55f2b6ac5f2a5b60d02c2140e758/rpds_py-2026.5.1-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:85264a90ff4c05c1568dd65f5921c837614b67c60358fb4c17df3b7f2e90690a", size = 391111, upload-time = "2026-05-28T11:59:01.104Z" }, + { url = "https://files.pythonhosted.org/packages/d8/34/5bb334a5a0f65d77869217c4654f34c78a7d11b93938a3c076a2edeafc52/rpds_py-2026.5.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fe71bca7d547acb17027c7fd1624ff8aae623499c498d3e7011182c4de5c25e0", size = 409225, upload-time = "2026-05-28T11:59:02.433Z" }, + { url = "https://files.pythonhosted.org/packages/16/0f/007ec21283b5b040b4ec3bd95e0402591e22bfa7d5c93dfe01c465c2d2d7/rpds_py-2026.5.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a05fa4f41f37ec97c9c260441a940450a192f78d774d2b097eee1379f1e1246a", size = 556487, upload-time = "2026-05-28T11:59:04.012Z" }, + { url = "https://files.pythonhosted.org/packages/ff/10/5437c94508169b6b22d8418fef7a66e9ffb5f3b9e9c94460f2eedafe06ff/rpds_py-2026.5.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:df1d2a1996755b24b9ecee92cb4d36c28f86f464a6a173349c26bab41e94b8c2", size = 620798, upload-time = "2026-05-28T11:59:05.485Z" }, + { url = "https://files.pythonhosted.org/packages/e0/d5/9937dce4d6bda74157b954e7d1460db05a22f5929dccfeeba1ed27a93df0/rpds_py-2026.5.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:8895840ac4809e5f60c88fd07617cd71326e73d6e5a8aa783c5c0f7c24985de2", size = 584053, upload-time = "2026-05-28T11:59:06.837Z" }, + { url = 
"https://files.pythonhosted.org/packages/6c/31/750617dd0ae1752471bf43f9e41d263398fae7cde7849d23b8574a70e617/rpds_py-2026.5.1-cp311-cp311-win32.whl", hash = "sha256:3684a59b158a7683aaeb8e25352e9a9dd2122cec78f2d8530266e4f91b4c7b3f", size = 214390, upload-time = "2026-05-28T11:59:08.402Z" }, + { url = "https://files.pythonhosted.org/packages/3c/bb/3dcab0e1d9516303f2eb672a5d6f62eca5a69e2886301e9c8c54b520c39b/rpds_py-2026.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:7bd530e6a530bb3ea892f194fafa455f3516ac25ecf7143fd33c09be62b0470a", size = 231097, upload-time = "2026-05-28T11:59:09.786Z" }, + { url = "https://files.pythonhosted.org/packages/49/d6/c6bbf5cb1cf12b9732df8074b57f6ef8341ba884c95d40632ae8bddb44e4/rpds_py-2026.5.1-cp311-cp311-win_arm64.whl", hash = "sha256:0a5ae4dbe43c1076983b72616496919872ae7bbe7a1e21cc48336bc3154d130b", size = 226361, upload-time = "2026-05-28T11:59:11.079Z" }, + { url = "https://files.pythonhosted.org/packages/d4/e7/a78582dc57caa592dcc7d4fb69b61390561e908eb3d2f5df5928a8e354c0/rpds_py-2026.5.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:3abe24a66e57adcfa645d718063a5fa5103ecc71ddbf26d78af8f9368018ff1d", size = 353040, upload-time = "2026-05-28T11:59:12.531Z" }, + { url = "https://files.pythonhosted.org/packages/a3/43/35e3f136343aef451e545ce8c38d36c2f93c0ed88703db8b64ba2b205c68/rpds_py-2026.5.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:58b1d94308ddf0b1982f61f2eb54bf92997c9ece8a8093ef014250f4a517906c", size = 345775, upload-time = "2026-05-28T11:59:13.827Z" }, + { url = "https://files.pythonhosted.org/packages/20/e1/0f2160c5982d3157734d5cb3ed63d8b2d583a73c9864f77b666449f32cf8/rpds_py-2026.5.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fa92420128dadce7f54bd73ba1825a273e9268fe9e35dbf7e6362890efa4e08", size = 376329, upload-time = "2026-05-28T11:59:15.271Z" }, + { url = 
"https://files.pythonhosted.org/packages/d0/11/ee0ba42aff83bf4effdbc576673c6be64c5e173978c3f6d537e94482f77d/rpds_py-2026.5.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ca653c6546386227cd9800d1bef6a348099acf8db4250341da6d90f663d6dfcb", size = 383539, upload-time = "2026-05-28T11:59:16.665Z" }, + { url = "https://files.pythonhosted.org/packages/11/df/d94aa6a499d4ac40afe2d7620f2c597fd3c0f182e854ad7cf3f596a81cb6/rpds_py-2026.5.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:66c93681c4729e4e3ecba31b8179fae083ff3118841672835140338b4b9867c1", size = 494674, upload-time = "2026-05-28T11:59:17.991Z" }, + { url = "https://files.pythonhosted.org/packages/1f/75/33d30f43bb2f458de11979486a591b1bf6e5651765ed1704c6197c2dc773/rpds_py-2026.5.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:40ff257542e04796880e011e15cd4dc21c2599975df2aaa8f2c8495ca574e1a5", size = 389268, upload-time = "2026-05-28T11:59:19.434Z" }, + { url = "https://files.pythonhosted.org/packages/f4/1e/2c9096fc19d5fd084b0184ca2b651e659aa0a37e6fdbecf6ece47f147fe1/rpds_py-2026.5.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6825cc329b290e93c5f6a9be2393118a763f6ccf6abd83704e0c102ca583644", size = 376280, upload-time = "2026-05-28T11:59:21Z" }, + { url = "https://files.pythonhosted.org/packages/b9/e5/61ec9f8be8211ea7f48448195549e4aaf02004083475493b0e137702ecb2/rpds_py-2026.5.1-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:de42116e69cb53b911cc34aee5ab98f36c597b822545045d49e938818b99e5e4", size = 387233, upload-time = "2026-05-28T11:59:22.454Z" }, + { url = "https://files.pythonhosted.org/packages/0d/ca/bcec1005c4f4a234f92a29078631fee49206c7265ccae966f18fd332e80e/rpds_py-2026.5.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c0f920015df2a504bebaba6d4c31ccf3fcf942f92655c086da30b671aad19aa6", size = 405009, upload-time = "2026-05-28T11:59:23.845Z" }, + { url = 
"https://files.pythonhosted.org/packages/72/e6/4d5718c5cf26c522dc7c9999e238da1e77380b81d0c5d1df11e271ddfeb1/rpds_py-2026.5.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0408a24e44feb919423dc6d9da677cb5cddb894d2ca9e763967d156d9c60fab4", size = 553113, upload-time = "2026-05-28T11:59:25.184Z" }, + { url = "https://files.pythonhosted.org/packages/d4/25/2ee807bdb3e1f0b7eddf7782acd5665a8b5205a331a7d7244a52c4812fd9/rpds_py-2026.5.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:cea68bcd53467561ae2f96a6bdad1544299ba97b5b0ddcd5ac3d376e5c781c24", size = 618838, upload-time = "2026-05-28T11:59:26.749Z" }, + { url = "https://files.pythonhosted.org/packages/6a/c1/7d4c26f167f8c41501cc073d30ee22082b16ce358cf5b00ec97cbc7804ea/rpds_py-2026.5.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4be8b1d2a705cc37d08256004e1d07de143fa0075c8e85a3df020b776f62b732", size = 582436, upload-time = "2026-05-28T11:59:28.11Z" }, + { url = "https://files.pythonhosted.org/packages/04/1d/9d12b0a337bab46f4769f8857f4007e3b2d639e14f9a44a0efe157696e64/rpds_py-2026.5.1-cp312-cp312-win32.whl", hash = "sha256:6736718bd4fc49cbcb538ba30516fdbef161522acefb739657d48b97bd864fed", size = 212734, upload-time = "2026-05-28T11:59:29.689Z" }, + { url = "https://files.pythonhosted.org/packages/c5/93/e4116f2de7f56bc7406a76033dc501811ddeb22b7f056b92d632871ebb0c/rpds_py-2026.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:0a7d1eec967df0e9b22614a5e177622e0c89611d03727fa0cb48e45028907870", size = 229045, upload-time = "2026-05-28T11:59:31.033Z" }, + { url = "https://files.pythonhosted.org/packages/cb/53/6c3419d85eb2ec5938a37627c585b42d76a63bb731d6e42ed4b079ebf486/rpds_py-2026.5.1-cp312-cp312-win_arm64.whl", hash = "sha256:1841d067089e117142d79b98aa0df2f08b52f2ecc1819dd2700636c0db74a473", size = 223967, upload-time = "2026-05-28T11:59:32.318Z" }, + { url = 
"https://files.pythonhosted.org/packages/6c/32/14c961ad295f490eb0849ada8b79683e93a59b9de3afdd983eaf55fa6867/rpds_py-2026.5.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:efef4ac29c6ff495531eb17ee705b62841ecaa291b7c7077e848ea03e237164d", size = 352787, upload-time = "2026-05-28T11:59:33.655Z" }, + { url = "https://files.pythonhosted.org/packages/ca/bb/d1b85117967c11191441a7274ae616c65d93901d082c588f89a50a8da5ae/rpds_py-2026.5.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:c39f5b67a8a2e67179ada2a954227d670fe65fa9098457f698f56ddf248709b3", size = 345179, upload-time = "2026-05-28T11:59:35Z" }, + { url = "https://files.pythonhosted.org/packages/7c/46/d84105f062e626a1b233f863907288a4708c2d833b8b4c6fb2764bc080c0/rpds_py-2026.5.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5c30f3f04eef4fbd362226a6f31d7c8895ca4fbb6e0b790f6890a98d8da8559", size = 376173, upload-time = "2026-05-28T11:59:36.43Z" }, + { url = "https://files.pythonhosted.org/packages/e2/ae/469d7959ce5b1201e1de135dc735b86db3b35dd0d1734f6a44246d5f061c/rpds_py-2026.5.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:277f6c82f0580848796c7ecc8a7173aa3bfb928e4ff831261c2f60a81dc270db", size = 383162, upload-time = "2026-05-28T11:59:37.995Z" }, + { url = "https://files.pythonhosted.org/packages/dc/a2/57853d31a1116a561aa072794602ad3f6341e18d70a8523f1bd5b9fc1e5a/rpds_py-2026.5.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:63c2c4c213f1a4e3f3de28ecab029dbdee976324e729c0d7a55211be72576b02", size = 495093, upload-time = "2026-05-28T11:59:39.453Z" }, + { url = "https://files.pythonhosted.org/packages/99/63/3a8eabcad9314b7daf5c65f451d2c33d989235cd8a5762186cf2c3f5a4f8/rpds_py-2026.5.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3350ec808fb538fe71a1f94dfaa0e29c598dfad805ce49f0caec5ae3183c652b", size = 389829, upload-time = "2026-05-28T11:59:40.896Z" }, + { url = 
"https://files.pythonhosted.org/packages/4b/25/05678d97fc25e2622df14dc530fb82023174ecfff6733991ed0d78f167bd/rpds_py-2026.5.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1b964e3ab599e718dc46c018d104b1ebc007cbc6567d827c94a687fca56d77e", size = 374786, upload-time = "2026-05-28T11:59:42.626Z" }, + { url = "https://files.pythonhosted.org/packages/88/d1/8c90b6431e80a3b91b284a5c7c8c0c4f9c006444d90477a740d6e0f9c694/rpds_py-2026.5.1-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:19cb09fab7b7fc96b2a6e28f2e34b72a3705ff27b37edb77455316e5d3f3dc9b", size = 386920, upload-time = "2026-05-28T11:59:44.124Z" }, + { url = "https://files.pythonhosted.org/packages/ff/99/4638f672ab356682d633ee0da9255f5b67ce6efd0b85eb94ad3e255e65a5/rpds_py-2026.5.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:abe76bcdba31e576cb83eeb8797aa0d882b738fef6dc65d0601fc753806a5b46", size = 405059, upload-time = "2026-05-28T11:59:47.177Z" }, + { url = "https://files.pythonhosted.org/packages/66/3f/3546524b6eb4cc2e1f363a3d638fa52f6c24faae3500c25fb488b02f1740/rpds_py-2026.5.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8bff7073db3899158fff55ebf57b113a67030af26f80a18978f9f0aa60250ddf", size = 553030, upload-time = "2026-05-28T11:59:48.603Z" }, + { url = "https://files.pythonhosted.org/packages/c6/c3/7b3388c796fcf471bd17194242d4dc1a7608567c0fa422bcc1c5e79f9c1e/rpds_py-2026.5.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:8ba264fa49be666cd9cc56bf34ec7002fb3d27a4aee5bcb4d43d0d18feb1bb6f", size = 618975, upload-time = "2026-05-28T11:59:50.314Z" }, + { url = "https://files.pythonhosted.org/packages/61/1e/a3cb07f2795075d1d88efddae2f541359fde5f08c81ee114c29c2949c90a/rpds_py-2026.5.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4860b603ddda0475a8885499b3729e90229d480105b42651962a5397d995fa89", size = 581178, upload-time = "2026-05-28T11:59:51.673Z" }, + { url = 
"https://files.pythonhosted.org/packages/a1/74/e758c03a5ef46f04c37f2651a2893db846d569ba8a7bca469d4b58939bcd/rpds_py-2026.5.1-cp313-cp313-win32.whl", hash = "sha256:7944270ae71383f6e2657dd7d5ce4eeb4ac2d0059a6738f0510583d462ab4842", size = 212481, upload-time = "2026-05-28T11:59:53.148Z" }, + { url = "https://files.pythonhosted.org/packages/70/ec/a2aca432db9c7359b40fa393eeeaa0d166c2f70175be956e75fa24197c44/rpds_py-2026.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:88647f43a73c4e01be19b04ceef0c8d3a1958153604d13c773becd8016f2a0cf", size = 228519, upload-time = "2026-05-28T11:59:54.505Z" }, + { url = "https://files.pythonhosted.org/packages/29/60/a73bfdd45b096574556acf303bbd9fa9eed36ca8a818b514e2a5d5fe2b9d/rpds_py-2026.5.1-cp313-cp313-win_arm64.whl", hash = "sha256:453895624ecf7db7063b1004e44037522bbaef9ff6a945e59bc71662d7a03abd", size = 223446, upload-time = "2026-05-28T11:59:56.081Z" }, + { url = "https://files.pythonhosted.org/packages/18/e2/408105fd611823f00882aea810f3989a30d26b1bab8b6beb20f98c724e0e/rpds_py-2026.5.1-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:b4e4bc98639ec915f512fde3aa7a95e0041d95d9c3cc86eea841fa63cb1e8600", size = 355287, upload-time = "2026-05-28T11:59:57.448Z" }, + { url = "https://files.pythonhosted.org/packages/8d/58/5c4a43436843c90d0f6d19f82c200c80e3843ca9fa07b237623327f6d384/rpds_py-2026.5.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:cacedb7a6e167680acba45ad5716e89067d225dc80da0d7040cae8c81d4572fa", size = 347033, upload-time = "2026-05-28T11:59:58.881Z" }, + { url = "https://files.pythonhosted.org/packages/fb/c2/1a71acdacaf4e259b10278fb87b039ded3cf80041bcd89dd8a3ea702ded6/rpds_py-2026.5.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68700371c5d7ae1412862ddfa719090925c93ecf351c566d66f09d04b136ea00", size = 376891, upload-time = "2026-05-28T12:00:00.516Z" }, + { url = 
"https://files.pythonhosted.org/packages/c2/c8/535f3d9b65addd8e28aa87b83c6e526799c3717a88273db8ea795beeef7a/rpds_py-2026.5.1-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:296c799becfa849c779c8725494fe9ed94959ed886787df4364b058465bad7f0", size = 385646, upload-time = "2026-05-28T12:00:02.394Z" }, + { url = "https://files.pythonhosted.org/packages/1c/91/dc033f313345c354ade914dbe73cdb90b615a4409ea02430d5356794f3d8/rpds_py-2026.5.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d3858b908218ee108d0bbfb2095ccc237648053c9bf98affad7cb079acaf1d97", size = 498830, upload-time = "2026-05-28T12:00:04.189Z" }, + { url = "https://files.pythonhosted.org/packages/27/fc/90fcbea459dbb8ddc18a2e0fd1de9412b48bc84ffff2db771cf714bacfd6/rpds_py-2026.5.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4fb8d2e7cb2f850b169806d61d1b991738acec96500a75c30f49caf064ce7cef", size = 392830, upload-time = "2026-05-28T12:00:05.797Z" }, + { url = "https://files.pythonhosted.org/packages/b2/1d/46cd11a228c9750684a798d98f878be6f614aa762438da7378f035e79e35/rpds_py-2026.5.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27b74c10ed6a8f190f4287f53bcfea348b92a84a9c9f70d30183d1e6172d580d", size = 379613, upload-time = "2026-05-28T12:00:07.433Z" }, + { url = "https://files.pythonhosted.org/packages/24/4a/d9b0c6af3a1de03eb93741bbe8be2bdce84d8fda8224f3005451d86df389/rpds_py-2026.5.1-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:b9a6528956191c48c52294a592dbd4a8386d7048bdb25c0efcb6b966466c6d83", size = 388183, upload-time = "2026-05-28T12:00:09.227Z" }, + { url = "https://files.pythonhosted.org/packages/c5/b4/db7aaabdda6d020afc87d981bcc2f57a434c7dec60ecfc2ab3dd50b20351/rpds_py-2026.5.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:af03e34e860047bc7a352b842856fcf78798fbb81132cc98bd2f907ab4eb9cd2", size = 408578, upload-time = "2026-05-28T12:00:10.779Z" }, + { url = 
"https://files.pythonhosted.org/packages/08/d6/070f6a41cbb343e2ac4171859bf3f3623e0ab002f72619d6d505313ec2de/rpds_py-2026.5.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:fea6e836d10abbe191d557d33bd58bd5987725fe63aa1eefe557d230209855bd", size = 553573, upload-time = "2026-05-28T12:00:12.443Z" }, + { url = "https://files.pythonhosted.org/packages/75/ab/1a71ea3589c4345dac0a0518f0e6a031cb42689277851b683c46d27463a5/rpds_py-2026.5.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:fc0c0f878ea770a0a8a462456c5ad36fc9fe6358e6b76fdadc7f17575e0b8bf1", size = 620861, upload-time = "2026-05-28T12:00:14.09Z" }, + { url = "https://files.pythonhosted.org/packages/8a/22/9bf80a56069c0c443fcfefac639a86a744550a2898817a6dfd3e26654924/rpds_py-2026.5.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e0b360f316d966b048b085857630b3cc51f3db2f07b06f440eac8f695374d1e3", size = 585633, upload-time = "2026-05-28T12:00:15.66Z" }, + { url = "https://files.pythonhosted.org/packages/da/68/3b2c0a75c9e04125696f84ebdbbf304acf5a40b58ba4481cdb98a922c3ba/rpds_py-2026.5.1-cp313-cp313t-win32.whl", hash = "sha256:a2999883eedf72fdfb7520b92c7d4ec2572a71ff40239377aa604cc529eecafc", size = 210074, upload-time = "2026-05-28T12:00:17.291Z" }, + { url = "https://files.pythonhosted.org/packages/e7/8b/609157d5a25d37d4f29f92840ba531f416907c34ae5c5739dd21fc2bef98/rpds_py-2026.5.1-cp313-cp313t-win_amd64.whl", hash = "sha256:e07be2a9d7122bd6e82dea89814ef8dc893feb1aae97fec1630f3263bbb30e55", size = 228635, upload-time = "2026-05-28T12:00:18.73Z" }, + { url = "https://files.pythonhosted.org/packages/d4/6f/19c1918a4b590d8de87e712e4abe4b3875771eff60216fb6153cf6665c68/rpds_py-2026.5.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:1f2c391c3059798093b65df23aca2cac150460ae9c630d99dec83d703d9485b9", size = 349756, upload-time = "2026-05-28T12:00:20.217Z" }, + { url = 
"https://files.pythonhosted.org/packages/e5/60/a06fe7da34eca79dacbf958a2ba0c6eea85bc2b29de20080bf40f72f66fa/rpds_py-2026.5.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:413b424f7c4ee65ab5e5be91f5731be0f8b41a1ee2b12dfe810d716312e95a78", size = 343831, upload-time = "2026-05-28T12:00:21.711Z" }, + { url = "https://files.pythonhosted.org/packages/bf/ec/b2333b97b90e2a6ef6ca8ad386ee284968e74bcfe113b3f1a8d9036429a9/rpds_py-2026.5.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2c595a1d9255dce0599e13130d1440ab2506654f2b50294226ee06402f8fef63", size = 375127, upload-time = "2026-05-28T12:00:23.326Z" }, + { url = "https://files.pythonhosted.org/packages/14/7f/e00aae54067f2b488c4637961d5f58204d470795fc791085fa3f15060d2e/rpds_py-2026.5.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1c27c5f6102eac8c03e7595a00827a53b271ba40a53b59ff8709170e0855ea4a", size = 379034, upload-time = "2026-05-28T12:00:24.89Z" }, + { url = "https://files.pythonhosted.org/packages/be/cc/423999bbb8ae8dc93c77fc1d5e984ade5eb89d237d3bb884ccfa72ae2890/rpds_py-2026.5.1-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6c7fcf61d44cacecaf3aea542b0e053db77972a4573e7ceda16fb2b399161195", size = 490823, upload-time = "2026-05-28T12:00:26.676Z" }, + { url = "https://files.pythonhosted.org/packages/0f/aa/c671bf660f12e68d3c52ff86c7066ed1372df5a0f4f2ff584e419b8207e7/rpds_py-2026.5.1-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2c817a189d4ee14290420e5ff051e4dd6baa13f3edf84685071dee07a6d538ee", size = 388144, upload-time = "2026-05-28T12:00:28.577Z" }, + { url = "https://files.pythonhosted.org/packages/19/c8/d63bb75b68afe77b229e3021c6031bcaf01da5db5b0e69d0d10f9ba679a7/rpds_py-2026.5.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21846aac0ed2e0589f38c12dc44e77bb64e494b771eadbcf169cba00566ba7ba", size = 371959, upload-time = "2026-05-28T12:00:30.304Z" }, + { url = 
"https://files.pythonhosted.org/packages/82/35/c51122014d8274ff37dc606d60049c3db7d83da02b5b282511e5a906a9a6/rpds_py-2026.5.1-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:b317c87a13f769a4e787819bd508aaa5d69aa09b0880de9af6d3a8a54571cdec", size = 383558, upload-time = "2026-05-28T12:00:31.764Z" }, + { url = "https://files.pythonhosted.org/packages/e3/f9/2790cb99c136a5363acdeacf5c27c56f3de0d4118a1f48fca83404c99c89/rpds_py-2026.5.1-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ce87129d9f2c14fa6c4a8601fb80eb4488c80d38a20cd13758ef11123e14995d", size = 402789, upload-time = "2026-05-28T12:00:33.247Z" }, + { url = "https://files.pythonhosted.org/packages/e5/1b/e4fb584f8c75d35c38150ff6a332cda949e6f97acba1f4fd123b14ab56fe/rpds_py-2026.5.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:9cdddb6c1207d284d94fd1530adf57fbd797fe7c4b8704ba85f49414f2557e7d", size = 551405, upload-time = "2026-05-28T12:00:34.819Z" }, + { url = "https://files.pythonhosted.org/packages/d8/f7/a6731b4216cb3793ea1af5391da240f5683dacc0d13e034fe5fc3503f240/rpds_py-2026.5.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:4e237e139f94d3c036fd28eb9f564c99055476ff4ff05cd42be55ce349b5aa02", size = 616975, upload-time = "2026-05-28T12:00:36.268Z" }, + { url = "https://files.pythonhosted.org/packages/2c/ea/2e051a81d95d8e63f4b35a1c463a87e8766bc3d083c067c5dfb6bf220747/rpds_py-2026.5.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ed0954b524873214369184a9c82b0eaa45a3fbb9a798cd95b17e0d98499e7ea0", size = 578701, upload-time = "2026-05-28T12:00:37.82Z" }, + { url = "https://files.pythonhosted.org/packages/65/56/b5f6fdb2083e32bca8a8993d89e70db114b4756c9e2c38421328126689d2/rpds_py-2026.5.1-cp314-cp314-win32.whl", hash = "sha256:2d88621d6a7d4dfa633d21abe90f280bb205274e16b1d1e61c6ad4640b2453b7", size = 209806, upload-time = "2026-05-28T12:00:39.492Z" }, + { url = 
"https://files.pythonhosted.org/packages/fb/80/65a5aa96c155e611d1ed844e4e1f57f3e36b021f396d9f8585d756e6b90d/rpds_py-2026.5.1-cp314-cp314-win_amd64.whl", hash = "sha256:cef8ac28d26f4dda3533060c20fbf80a325458fa9fd23ea72a73cdfa8e978838", size = 225985, upload-time = "2026-05-28T12:00:40.94Z" }, + { url = "https://files.pythonhosted.org/packages/27/7c/ad185212e87b05f196daef92bc5f3caf07298eb47c295b5585c3dd3093ac/rpds_py-2026.5.1-cp314-cp314-win_arm64.whl", hash = "sha256:eaaea962c68cdc68d4a533ba985ab8e9484277910bbfaa2ab3ef7732667bfed8", size = 221219, upload-time = "2026-05-28T12:00:43.15Z" }, + { url = "https://files.pythonhosted.org/packages/23/58/e14ae18759020334646b031e708ab4158d653a938822bfb7b95ef2e93aa3/rpds_py-2026.5.1-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:21942f52dbbd5f8758bf021213d28bd45c39e873e65e2407faf5f1846f5761ad", size = 352148, upload-time = "2026-05-28T12:00:44.638Z" }, + { url = "https://files.pythonhosted.org/packages/31/9b/5f4a1e2f960bca3ac5d052b139dd31eed97b259f9d909173821760d542e8/rpds_py-2026.5.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f414556f6e3958300ff941e40c9f97e3dc9774ddd1b3434c475d73dd354bbed3", size = 345196, upload-time = "2026-05-28T12:00:46.14Z" }, + { url = "https://files.pythonhosted.org/packages/1a/71/1d9574d6a2fa20ab60eaa55c7467f5aa20cbc770f341a05f09c0876f59e2/rpds_py-2026.5.1-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ef1013a8625c74043210190b246f5b1551e09757c1f356c6e4160ef96c5bc081", size = 374981, upload-time = "2026-05-28T12:00:47.531Z" }, + { url = "https://files.pythonhosted.org/packages/0c/9a/37e99f4915a80aa71670263c1267f7ae0af95f53a3f61e6c3bdc016d4515/rpds_py-2026.5.1-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cc68e231a77a5f0d774ae278a1f8e55c0456501820847c1e4efb3829f3441df6", size = 379961, upload-time = "2026-05-28T12:00:49.216Z" }, + { url = 
"https://files.pythonhosted.org/packages/a8/ff/6e73f74b89d2e0715e0fc86b7dde893f9a61ae2f9b256ff3bdfe41ac4e94/rpds_py-2026.5.1-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9baffb505aff33acc69b422a19f77806680f3c8632227d79f48de8a810d1c2c5", size = 495965, upload-time = "2026-05-28T12:00:51.111Z" }, + { url = "https://files.pythonhosted.org/packages/ea/e0/425faba25f59d74d4638b267f7c7a80e8649d2ef4db10a19b0c4a71e6e6f/rpds_py-2026.5.1-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b8d2f912928d426e8cfa396f7f3f8d29a59e6689c86dcca3c420730c1096322b", size = 389526, upload-time = "2026-05-28T12:00:52.77Z" }, + { url = "https://files.pythonhosted.org/packages/c6/76/7a41960e3fddae47fab43a28684d5da981401dffd88253de0944148654cb/rpds_py-2026.5.1-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90f628283be835db980c941767d41c9a27b5239e54ba0a9c1335247e82406964", size = 376190, upload-time = "2026-05-28T12:00:54.215Z" }, + { url = "https://files.pythonhosted.org/packages/27/60/5f38dc70824fc6951b51d35377e577a3a3a4c81a6769cc5a2de25ebe0ad1/rpds_py-2026.5.1-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:1ebb2f0ab7e16132995a72de805170e0203df0c3dd22e1ef1cd1fdd90bd7a131", size = 383921, upload-time = "2026-05-28T12:00:55.673Z" }, + { url = "https://files.pythonhosted.org/packages/60/1a/d60a38caa1505f4b9483c3fbbde12c94e1079154f4f401a6da96f7e77621/rpds_py-2026.5.1-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f3df3d16ded76f1f8c9cdebd0e1ea55fdf4c23b812de189814da7cf229c22a81", size = 404766, upload-time = "2026-05-28T12:00:57.518Z" }, + { url = "https://files.pythonhosted.org/packages/87/ff/602fd3f174d6425f0bce05ad0dfbec0e96b38d0f7d08a79af5aa20083885/rpds_py-2026.5.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:9af8905b8f854990e40d5206aa5ac58d9b0fe0b7f351ff2bb086c20f6c8c6a47", size = 551343, upload-time = "2026-05-28T12:00:58.978Z" }, + { url = 
"https://files.pythonhosted.org/packages/b8/c1/1be13327acdbead3eca1fde03b6a34dbb011f1e864e217f0d32cc1779a7f/rpds_py-2026.5.1-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:036a36a87fb1cd3b214d11c4b3c4f7d2ddad933625dca1c900b56a057c07740a", size = 618502, upload-time = "2026-05-28T12:01:00.656Z" }, + { url = "https://files.pythonhosted.org/packages/f3/d7/afb49b49d7f2be8b7ba1a9f0977fa5168003437b93086726f066544e8351/rpds_py-2026.5.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:62ae3853454fe9ef283a03c96c2d835d39e84b14643a9d62c82ef0fb87d702ca", size = 581916, upload-time = "2026-05-28T12:01:02.22Z" }, + { url = "https://files.pythonhosted.org/packages/25/d1/dbef8c1f8a10f07beb62b5f054e20099fd9924b3ec001b8f0b6ac7813a85/rpds_py-2026.5.1-cp314-cp314t-win32.whl", hash = "sha256:6c3d771a46ec18b12af06ce36243a9a80b07a5d0515236332d90863ca8bb326a", size = 207855, upload-time = "2026-05-28T12:01:03.821Z" }, + { url = "https://files.pythonhosted.org/packages/2a/72/bfa4e61ab8e7dc1c8adf397e05e6cbdd4239357bd72b248d3de662f23915/rpds_py-2026.5.1-cp314-cp314t-win_amd64.whl", hash = "sha256:c93c629be4636cf54337bd5f06c104d55e42ced54d681f6fe21ae510a65116f6", size = 225422, upload-time = "2026-05-28T12:01:05.194Z" }, + { url = "https://files.pythonhosted.org/packages/27/3a/7b5da92b640f67b6717ccafc83cdd06bfa7ff2395c3685c68922bb54d703/rpds_py-2026.5.1-cp315-cp315-macosx_10_12_x86_64.whl", hash = "sha256:3574b55c604b8f75dacb007136508bbc0db406e626301778096a133327e7f2fb", size = 349576, upload-time = "2026-05-28T12:01:06.722Z" }, + { url = "https://files.pythonhosted.org/packages/d7/8a/2aafd7ad355a1bd48ca76e2262b74b15e6432b5a1efe150efd4d779cd55d/rpds_py-2026.5.1-cp315-cp315-macosx_11_0_arm64.whl", hash = "sha256:94068eb3ae6d43f5a786b7db96a406a34e6d5c24489feef32fd6e8946ea7b291", size = 343640, upload-time = "2026-05-28T12:01:08.441Z" }, + { url = 
"https://files.pythonhosted.org/packages/f7/7d/6c9523c1abbe840a1b7fba3c516d48e1d3487cc80fea4366c4071cf56784/rpds_py-2026.5.1-cp315-cp315-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a5b10e8ce894825f380a8f1b6444cf73c294dfea62afbb2d13e3a9e630cec1", size = 375322, upload-time = "2026-05-28T12:01:09.934Z" }, + { url = "https://files.pythonhosted.org/packages/5a/5d/0b7b03fb1dc509321f01de3149784ab773e34c8573022029af8076afcb9c/rpds_py-2026.5.1-cp315-cp315-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fc09f82e63d4bcd58149572f857a431bae851dc747e313c3b5bdf7abb907fda8", size = 379066, upload-time = "2026-05-28T12:01:11.48Z" }, + { url = "https://files.pythonhosted.org/packages/d7/e2/8ef6012999ebf1cb1c22f876d9ce5e63d960fd4631d2af3202d3f480aa25/rpds_py-2026.5.1-cp315-cp315-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e10464d17df3b582745c25cec695cb9558bca2cb6ddb631aee1787fc72c767b2", size = 494586, upload-time = "2026-05-28T12:01:13.051Z" }, + { url = "https://files.pythonhosted.org/packages/80/af/1eeb029bec67582c226b7809172207cd005073af4ebd906e65ff494f4983/rpds_py-2026.5.1-cp315-cp315-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ba05adbf15d994c38ec0b7ab32e858e5110c21e9009a00a86545fd220f84e038", size = 388415, upload-time = "2026-05-28T12:01:14.631Z" }, + { url = "https://files.pythonhosted.org/packages/18/23/ffbe10711c4d766c1cab0557d6906c074f795814863c67b351355d29354a/rpds_py-2026.5.1-cp315-cp315-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:77c004fdc7b891967106f78ddfd7b076bfe6813c6139c6fff6aed3bcaa960b26", size = 372427, upload-time = "2026-05-28T12:01:16.153Z" }, + { url = "https://files.pythonhosted.org/packages/bd/3a/30ba4a6ad457e5b070c18d742a33fb77d8d922b565cc881f8a5313d63bfe/rpds_py-2026.5.1-cp315-cp315-manylinux_2_31_riscv64.whl", hash = "sha256:83bcf894486c9d78dd290d3c0124ff6dd8875d3025e2090a8ec49fcc37c55fdd", size = 383615, upload-time = "2026-05-28T12:01:17.809Z" }, + { 
url = "https://files.pythonhosted.org/packages/d3/69/62e242b53ce39c0814bd24e1a6e6eba6c92be716277745f317f9540a2e7b/rpds_py-2026.5.1-cp315-cp315-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c3df104083952a0e0c6f10de33e440eabe98fb6317d23e1a58c68f6df08d01b9", size = 402786, upload-time = "2026-05-28T12:01:19.419Z" }, + { url = "https://files.pythonhosted.org/packages/38/c1/a770b9c186928a1ed0f7e6d7ae50e7f3950ed23e3f9e366dbc8e38cb55de/rpds_py-2026.5.1-cp315-cp315-musllinux_1_2_aarch64.whl", hash = "sha256:980450826cf22e133c57e0835070bdd0dd3f73b9b708c3ce223def2cb9469e14", size = 551583, upload-time = "2026-05-28T12:01:21.013Z" }, + { url = "https://files.pythonhosted.org/packages/21/7c/68e8579b95375b70d2a963103c42e705856cdb98569258bd807f4423891c/rpds_py-2026.5.1-cp315-cp315-musllinux_1_2_i686.whl", hash = "sha256:205dde846f24332ab0c1188699a043b8d165b79bb84529ce272c45048ff6be01", size = 616941, upload-time = "2026-05-28T12:01:22.548Z" }, + { url = "https://files.pythonhosted.org/packages/70/a1/a6135aed5730ff03ab957182259987ac11e55fb392a28dc6f0592048a280/rpds_py-2026.5.1-cp315-cp315-musllinux_1_2_x86_64.whl", hash = "sha256:3966b82dd563176396df030f3dd52a6e54cb69b718e95e78bd555ed3d1e0185d", size = 578349, upload-time = "2026-05-28T12:01:24.118Z" }, + { url = "https://files.pythonhosted.org/packages/09/6e/f24201a76a84e6c49d0bdfdfcb735210e21701e9b21c5bfc0ba497dd62f6/rpds_py-2026.5.1-cp315-cp315-win32.whl", hash = "sha256:7818f8d0a415be74d2be3590b0a1c1f463a642f4d0217e7d10602dceef5b79aa", size = 209922, upload-time = "2026-05-28T12:01:25.522Z" }, + { url = "https://files.pythonhosted.org/packages/9e/e4/966bc240bb0485fc265278f6de44d05834bf0b3618886e0b22e33d54c49a/rpds_py-2026.5.1-cp315-cp315-win_amd64.whl", hash = "sha256:b3cc20c0d800af78fd0fac68086e28c1856cec51ea528bb81ea851aa40d39325", size = 226003, upload-time = "2026-05-28T12:01:27.062Z" }, + { url = 
"https://files.pythonhosted.org/packages/5c/5c/a15a59269cd5e74472734516c73795c15eccfc841b3d4b0228c3f53f19d0/rpds_py-2026.5.1-cp315-cp315-win_arm64.whl", hash = "sha256:3609e9939a8a76cd904cf98a3f1f13b5dc7e150adeaee89e0ea09652ea213e16", size = 221245, upload-time = "2026-05-28T12:01:28.51Z" }, + { url = "https://files.pythonhosted.org/packages/e0/22/135ce03804e179a71ceb13be095deda4a279bc88f7a6b8fa161c5ad44e12/rpds_py-2026.5.1-cp315-cp315t-macosx_10_12_x86_64.whl", hash = "sha256:5d333a7127d4b307601ac37792bee01bb95c867cbfacf21b6375b804d6bbd723", size = 352015, upload-time = "2026-05-28T12:01:30.214Z" }, + { url = "https://files.pythonhosted.org/packages/3b/5f/f1f6d2652eb9d848f6eb369d8db83a2da6249bb49ad2c2a48f45d54538d3/rpds_py-2026.5.1-cp315-cp315t-macosx_11_0_arm64.whl", hash = "sha256:b5f077b44a4f7808520f66dae234988d867deb9aed9be5da057ce9ba831b2a41", size = 345016, upload-time = "2026-05-28T12:01:31.656Z" }, + { url = "https://files.pythonhosted.org/packages/88/66/b74182775691ea2290c99e52ac8d5db844e56fbec90ce421f107658c8314/rpds_py-2026.5.1-cp315-cp315t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55d8f9b7b78c9538fc9e04e82ec0e888ff0c3cffcfad152c77e57cd09351a98a", size = 374775, upload-time = "2026-05-28T12:01:33.136Z" }, + { url = "https://files.pythonhosted.org/packages/ff/8f/15e5a61d9f0a43902d36561d4f07cae6ae9f4716be825159fd72717f33af/rpds_py-2026.5.1-cp315-cp315t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e3a8ae58895ac107ed934a6bf51e5846f95c53b9b940c2c6d310838fd5846358", size = 380270, upload-time = "2026-05-28T12:01:34.574Z" }, + { url = "https://files.pythonhosted.org/packages/02/c3/f859b12763a80540cdf2af0f15b19904cf756a71d7bdd3f82ff3e5b1bbf9/rpds_py-2026.5.1-cp315-cp315t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0957cf3c2b8632ec7aaebffebea8005b353cc2a237b6e2ae3c2cac0820704cfb", size = 495285, upload-time = "2026-05-28T12:01:36.127Z" }, + { url = 
"https://files.pythonhosted.org/packages/1c/c7/ff27c2ac8411d30b03b1829fd88cae8dad1a4d0da48dd25e57c4038042e6/rpds_py-2026.5.1-cp315-cp315t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c396c1304de421050b3681ea70f371874b54d41b0151e96109758144c231e30b", size = 389581, upload-time = "2026-05-28T12:01:37.635Z" }, + { url = "https://files.pythonhosted.org/packages/6e/67/fe92ee32a6cc05c77228a2f8b1762e7124f386ec20ff83d0757b762d58d0/rpds_py-2026.5.1-cp315-cp315t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aad1bff7f666b9598e573815affd666aac6a13a585dde336f843e33350c7fadc", size = 376041, upload-time = "2026-05-28T12:01:39.307Z" }, + { url = "https://files.pythonhosted.org/packages/f8/91/b4d6685c27aba55bd82f25b278be8237038117d05f9659a6213ad3408130/rpds_py-2026.5.1-cp315-cp315t-manylinux_2_31_riscv64.whl", hash = "sha256:656a042550878f12d45752452d47094b7cfe5ad1e9d7b87b5a22ad3ae5ff8015", size = 383946, upload-time = "2026-05-28T12:01:41.043Z" }, + { url = "https://files.pythonhosted.org/packages/bd/79/2c1d832a53c8e0f8e98fc970ec257b950fecd4f62be2ab7182b500a0cbc8/rpds_py-2026.5.1-cp315-cp315t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:73c4bd4f70294737b5206a3e8e30ccadbf8a60301831c8ea23eec5dbeea1ecfa", size = 405526, upload-time = "2026-05-28T12:01:43.032Z" }, + { url = "https://files.pythonhosted.org/packages/78/c4/c98117b03c6a8581ab2c2dfccfe9a5ad82bd8128a3c28b46a6ad2d97c393/rpds_py-2026.5.1-cp315-cp315t-musllinux_1_2_aarch64.whl", hash = "sha256:43bca78665423cabae77146f2fe7ce55272b6c8d55d82cca83effd42c7e13972", size = 551165, upload-time = "2026-05-28T12:01:44.648Z" }, + { url = "https://files.pythonhosted.org/packages/3b/c1/bc479ca069200af730881b1bd525e3114b2b391a351509fcb1b772f28086/rpds_py-2026.5.1-cp315-cp315t-musllinux_1_2_i686.whl", hash = "sha256:42d0f20e85e549c870749d0e247f0c10d318a45b7e9676d575d2dcb04a1b2e66", size = 618778, upload-time = "2026-05-28T12:01:46.337Z" }, + { url = 
"https://files.pythonhosted.org/packages/77/65/38ab2f90df44c2febfb63cc10ced40763d9b4bc94d173e734528663fe7f5/rpds_py-2026.5.1-cp315-cp315t-musllinux_1_2_x86_64.whl", hash = "sha256:b1be5c35683684d5331b93600c210e8367c254683d8a6df6bd21bd2da3a334fb", size = 581839, upload-time = "2026-05-28T12:01:48.109Z" }, + { url = "https://files.pythonhosted.org/packages/15/2d/ce1f605fe036aadd460e5822e578c6c7ec3a860936cca37d6e0f299daa77/rpds_py-2026.5.1-cp315-cp315t-win32.whl", hash = "sha256:75808f6c38ce7749bb68cc2770161aae5045e6c6f6781a9782e74b93304399df", size = 207866, upload-time = "2026-05-28T12:01:49.648Z" }, + { url = "https://files.pythonhosted.org/packages/79/cb/966040123eb102371559746908ef2c9471f4d43e17ec9a645a2258dab64b/rpds_py-2026.5.1-cp315-cp315t-win_amd64.whl", hash = "sha256:90bd6630002a1c7f09e7843dd79f0d24f3d2897cc25a753480917865d14f15b3", size = 225441, upload-time = "2026-05-28T12:01:51.408Z" }, + { url = "https://files.pythonhosted.org/packages/42/56/3fe0fb34820ff667be791b3a3c22b85e8bcba54e9c832f47438c191fa7be/rpds_py-2026.5.1-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:edf2765d84e42447f112ad877af8fe1db0089aaec5b28e88d6eab45e7fe99cea", size = 357151, upload-time = "2026-05-28T12:01:53.43Z" }, + { url = "https://files.pythonhosted.org/packages/8b/f2/3eb9ccdb9f143b8c9b003978898cb497f942a324c077401e6b8834238e63/rpds_py-2026.5.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:ad3773236e95f7f33991eb125224b7da66f206504d032a253a02da7e134519fb", size = 350195, upload-time = "2026-05-28T12:01:54.901Z" }, + { url = "https://files.pythonhosted.org/packages/a7/24/dbda232bc4f3ed732120692ab0d2c8402cb020516556d8bee622dcef2413/rpds_py-2026.5.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a04df86b3f0fade39ec8fd0e0aab089b1da9fbd2b48df778a57ef96f5e7d38df", size = 381850, upload-time = "2026-05-28T12:01:56.601Z" }, + { url = 
"https://files.pythonhosted.org/packages/40/30/32e769839a358f78810c234f160f2cc21d1e4e47e1c0e0e0d535be5a0219/rpds_py-2026.5.1-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6142dbd80c4df62a5d899f0d616d417f84e0bc8d32526c8e5589019d75d028a7", size = 387899, upload-time = "2026-05-28T12:01:58.212Z" }, + { url = "https://files.pythonhosted.org/packages/ab/86/ec84d243aadb3b34b71dd26a010d0930b2d284ff5fc9a69fec53810ee6fd/rpds_py-2026.5.1-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0b35217adefe87f2fe4db7e9766cabe84744bfe9616d9667be18988928c7f2dc", size = 501618, upload-time = "2026-05-28T12:01:59.888Z" }, + { url = "https://files.pythonhosted.org/packages/74/25/b60e52686bbff777a64f9e4f4d3dd57980dc846913777177a2c92e4937aa/rpds_py-2026.5.1-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b95d5e11fc712b752081183a55a244c03cd00570489edd7014d8899f8ceb8162", size = 394003, upload-time = "2026-05-28T12:02:01.482Z" }, + { url = "https://files.pythonhosted.org/packages/9b/c7/b3a6a588cc2219510ef3f42e207483a93950bedd1e3a0fd4015c95cff9e5/rpds_py-2026.5.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:141c9498daf2ace9eda35d2b0e376f9ea8b058d84f2aef4f96fccfd449a2f251", size = 379778, upload-time = "2026-05-28T12:02:03.197Z" }, + { url = "https://files.pythonhosted.org/packages/31/00/c7dba3fc8a3da8cb3f6db1eb3386be4d79c2e97c6890d20eb9ac66ae8c43/rpds_py-2026.5.1-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:6f249f8b860a200ad35193af961183ebe9132710484e6f6ce0cf89fd83c63a9a", size = 392359, upload-time = "2026-05-28T12:02:04.817Z" }, + { url = "https://files.pythonhosted.org/packages/93/dd/472ba494c70753f93745992c99855bee0636daf74e6984e5e003f150316f/rpds_py-2026.5.1-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e4abbf391a70be864920858bf360f4fb380577c9a0f732438a1996726e2c195b", size = 412820, upload-time = 
"2026-05-28T12:02:06.401Z" }, + { url = "https://files.pythonhosted.org/packages/1d/6f/93831a3bfe789542ed0c1d0d74b78b440f055d6dc3ea4640eba2d95e6e23/rpds_py-2026.5.1-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:c74005a7bb87752acf351c93897ec63ad77a07a0da7ecad9c050e32e7286ba34", size = 557243, upload-time = "2026-05-28T12:02:08.013Z" }, + { url = "https://files.pythonhosted.org/packages/1f/ff/0b3d604614ffc77522c6b288fdbce68957eb583da1002aa65ba38ac0ee40/rpds_py-2026.5.1-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:8213afbe8a3a906fb9acb2014423fe3359ee783d0bf90995f70623a3217bfa6c", size = 623541, upload-time = "2026-05-28T12:02:09.661Z" }, + { url = "https://files.pythonhosted.org/packages/ea/ea/e7b0251441da9adfeaebcf29601d10f2a1455fcf0772fae9e7e19032bd96/rpds_py-2026.5.1-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:8c43a8a973270fd173bf48cdf80bbe66312421cba68d40845034f174f2389049", size = 586326, upload-time = "2026-05-28T12:02:11.47Z" }, +] + [[package]] name = "ruff" version = "0.15.16" @@ -3276,6 +3991,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/14/78/0f68b93564b8c6b6987a0696c582ba2591a381ab2f733a501909e949f241/smart_open-7.6.1-py3-none-any.whl", hash = "sha256:b4de6aebef023aca91cc9fb372052e1343ba3f152de215bd22391a663e3ddd21", size = 64845, upload-time = "2026-05-09T06:23:35.386Z" }, ] +[[package]] +name = "sniffio" +version = "1.3.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = 
"sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, +] + [[package]] name = "spacy" version = "3.8.14" @@ -3353,6 +4077,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/33/78/d1a1a026ef3af911159398c939b1509d5c36fe524c7b644f34a5146c4e16/spacy_loggers-1.0.5-py3-none-any.whl", hash = "sha256:196284c9c446cc0cdb944005384270d775fdeaf4f494d8e269466cfa497ef645", size = 22343, upload-time = "2023-09-11T12:26:50.586Z" }, ] +[[package]] +name = "sqlglot" +version = "30.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/04/1d/b1380b3ee8fb63c50d7507e956a1228ddefff1a2601d79806c998ef6547b/sqlglot-30.9.0.tar.gz", hash = "sha256:20bed04b6482bf13560206cae517f451f46c321e04956ad71271ed1f12ce8802", size = 5885862, upload-time = "2026-06-04T15:33:52.268Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/13/4c/41b222c130950a077e1a0ba311df84a29b818db7c136ecc1aafbfad42e26/sqlglot-30.9.0-py3-none-any.whl", hash = "sha256:59b5f74f4d391e32e6980e8cd23cca8d47beac3c0140b711ead9ed05a824a8b5", size = 695762, upload-time = "2026-06-04T15:33:49.526Z" }, +] + [[package]] name = "srsly" version = "2.5.3" @@ -3433,6 +4166,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, ] +[[package]] +name = "tenacity" +version = "9.1.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/47/c6/ee486fd809e357697ee8a44d3d69222b344920433d3b6666ccd9b374630c/tenacity-9.1.4.tar.gz", hash = "sha256:adb31d4c263f2bd041081ab33b498309a57c77f9acf2db65aadf0898179cf93a", size = 49413, upload-time = "2026-02-07T10:45:33.841Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/d7/c1/eb8f9debc45d3b7918a32ab756658a0904732f75e555402972246b0b8e71/tenacity-9.1.4-py3-none-any.whl", hash = "sha256:6095a360c919085f28c6527de529e76a06ad89b23659fa881ae0649b867a9d55", size = 28926, upload-time = "2026-02-07T10:45:32.24Z" }, +] + [[package]] name = "thinc" version = "8.3.13" From fbe6a8b93a2a5d509c2d9590cd748e04355f68f1 Mon Sep 17 00:00:00 2001 From: TKaltofen Date: Wed, 10 Jun 2026 09:57:52 +0200 Subject: [PATCH 2/9] docs: Phase 0 RAG connector survey and family map (closes #32) (#38) * docs: add Phase 0 RAG connector survey and family map --- docs/rag-connector-base-classes.md | 249 +++++++++++++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100644 docs/rag-connector-base-classes.md diff --git a/docs/rag-connector-base-classes.md b/docs/rag-connector-base-classes.md new file mode 100644 index 0000000..0fe58b2 --- /dev/null +++ b/docs/rag-connector-base-classes.md @@ -0,0 +1,249 @@ +# RAG connector base classes + +The `connectors/` package wraps whole external open-source RAG tools under one +mloda surface, organized into families by query-contract shape. Each family is a +thin `BaseConnector` FeatureGroup plus one or more concrete backends +gated by a `_backend` selector, with an inheritable contract-test suite. +A user swaps retrievers, rerankers, or generators by changing options, not by +rewriting a pipeline. + +This sits alongside the build-your-own stage pipeline +(`feature_groups/rag_pipeline/`): the stages let a user assemble a pipeline step +by step, the connectors let a user drop in one external tool that subsumes +several steps. + +## How families are cut + +Families are cut by **query-contract shape**, not by paradigm (lexical / dense / +hybrid / late-interaction) and not by vendor (LlamaIndex / Haystack / ...). + +> Same in/out shape -> a **backend** inside an existing family. Different in/out +> shape -> its **own family**. 
+ +BM25, dense bi-encoder, hybrid/RRF, and ColBERT late-interaction all share one +contract, `query + top_k -> ranked passages with scores`, so they are four +backends of a single `retrieve` family, not four families. The contract +boundaries are where the in/out signature actually changes: rerank takes +candidates in, generate returns prose plus citations, structured returns typed +rows, graph_rag traverses a subgraph, orchestrator is opaque end-to-end. + +## The landscape survey + +The open-source RAG systems the families were clustered from, grouped by the +contract cluster they fall into. Per row: the query contract (in -> out), the +state it needs, the no-Docker answer, the family it maps to, and a pedigree tag +(`real-lib-inmem` / `real-lib-server` / `fixture-stub` / `research-prototype`). + +Rows count tool *surfaces*, not unique repositories: a few projects appear in +more than one family because they expose more than one contract (LangChain and +LlamaIndex span orchestrator + generate + structured; Haystack spans +orchestrator + generate; ColBERT spans retrieve + rerank). A handful of entries +are hosted or not strictly OSS (Cohere-rerank, Canopy) and are labeled as such. + +No-Docker legend: **in-mem** = runs in-process after at most a pip install +(possibly a model download, noted); **fixture** = exercisable only via a static +fixture / REST stub; **server** = genuinely needs a running server or Docker. + +### Orchestration frameworks and RAG applications (-> `orchestrator`) + +| System | Query contract (in -> out) | State needed | No-Docker? 
| Family | Pedigree | +|---|---|---|---|---|---| +| LlamaIndex | `index.as_query_engine().query(str)` -> answer + `source_nodes` | in-mem `VectorStoreIndex`; LLM + embed model | in-mem (basic) | orchestrator | real-lib-inmem | +| Haystack 2.x | `Pipeline.run({"query"})` -> answers + retrieved documents | DocumentStore (InMemory available) | in-mem (basic) | orchestrator | real-lib-inmem | +| txtai | `embeddings.search(q)` / `rag(q)` -> results / answer | in-mem embeddings index (SQLite + ANN) | in-mem | orchestrator | real-lib-inmem | +| LangChain (RAG chains) | `RetrievalQA.invoke({"query"})` -> result + source_documents | in-mem vectorstore (FAISS/Chroma); LLM | in-mem (basic) | orchestrator | real-lib-inmem | +| DSPy | `module(question=...)` -> `Prediction.answer` | retriever + LM clients | in-mem (basic) | orchestrator | real-lib-inmem | +| Embedchain (mem0) | `app.add(src)`; `app.query(q)` -> answer | in-mem Chroma default; LLM API | in-mem (basic) | orchestrator | real-lib-inmem | +| LLMWare | `Query(library).query(text)` -> results; `Prompt` -> answer | library store (SQLite/Mongo); small model | in-mem (basic) | orchestrator | real-lib-inmem | +| LocalGPT | `run_localGPT.py` query -> answer + sources | local Chroma from ingest; local model | in-mem (basic) | orchestrator | real-lib-inmem | +| FlashRAG | `pipeline.run(dataset)` -> answers + eval | corpus + prebuilt index; model downloads | in-mem (basic) | orchestrator | research-prototype | +| AutoRAG | evaluator over QA data -> best pipeline; `query` -> answer | corpus + QA dataset + YAML; model APIs | in-mem (basic) | orchestrator | research-prototype | +| FLARE | `query` -> answer via iterative look-ahead retrieval | retriever + LM | in-mem (basic) | orchestrator | research-prototype | +| Self-RAG | `query` -> answer with retrieval/critique reflection tokens | fine-tuned Llama checkpoint; retriever | in-mem (basic) | generate / orchestrator | research-prototype | +| Canopy (Pinecone) | `ChatEngine.chat` 
/ `query` -> answer + context | Pinecone index (external); OpenAI key | server | orchestrator | real-lib-server | +| Cognita (TrueFoundry) | REST `/query` -> answer + sources | backend server, vector DB, metadata store | server | orchestrator | real-lib-server | +| R2R (SciPhi) | REST `/rag` query -> answer + citations | Postgres + pgvector; Docker compose | server | orchestrator | real-lib-server | +| RAGFlow (InfiniFlow) | REST/UI query -> answer + grounded chunks | Docker stack (deep parse, ES, MySQL, MinIO) | server | orchestrator | real-lib-server | +| Verba (Weaviate) | UI/API query -> answer + context windows | Weaviate instance | server | orchestrator | real-lib-server | +| Quivr | API query -> answer over uploaded docs | Supabase/Postgres backend | server | orchestrator | real-lib-server | +| Danswer / Onyx | API query -> answer + cited sources | Docker stack (Postgres, Vespa, connectors) | server | orchestrator | real-lib-server | +| Khoj | API/chat query -> answer over personal corpus | Django server, embeddings DB | server | orchestrator | real-lib-server | +| PrivateGPT | API/UI query -> answer + source chunks | local LLM + embed; Qdrant/Chroma; FastAPI | server (local) | orchestrator | real-lib-server | +| AnythingLLM | API query (workspace) -> answer + citations | Node server, LanceDB default, LLM provider | server | orchestrator | real-lib-server | +| Open WebUI (RAG) | query + uploaded docs -> answer + refs | Open WebUI server + Ollama/OpenAI | server | orchestrator | real-lib-server | +| Dify | API app query -> answer + retrieved refs | Docker stack (Postgres, Redis, vector DB) | server | orchestrator | real-lib-server | +| Flowise | deployed flow API query -> answer | Node server; vector store; LLM keys | server | orchestrator | real-lib-server | +| kotaemon (Cinnamon) | UI/API query -> answer + citations (GraphRAG opt) | local/server install; vector + doc store | server | orchestrator | real-lib-server | + +### Retrieval engines, vector stores, 
lexical / dense / late-interaction (-> `retrieve`) + +| System | Query contract (in -> out) | State needed | No-Docker? | Family | Pedigree | +|---|---|---|---|---|---| +| FAISS | `query_emb + index -> top_k ids + distances` | prebuilt in-mem index | in-mem | retrieve | real-lib-inmem | +| Chroma | `query_text\|emb + collection -> top_k docs + distances` | collection; optional embed fn | in-mem (embedded) / server | retrieve | real-lib-inmem | +| Qdrant | `query_vector(+filter) + collection -> scored points` | collection of vectors + payloads | in-mem (`:memory:`) / server | retrieve | real-lib-inmem | +| Milvus | `query_vectors + collection -> top_k ids + distances` | collection; index built | in-mem (Milvus-Lite) / server | retrieve | real-lib-inmem / real-lib-server | +| LanceDB | `query_vector + table -> top_k rows + distances` | on-disk Lance table | in-mem (embedded) | retrieve | real-lib-inmem | +| Weaviate | `near_vector\|near_text + class -> objects + scores` | schema + vectors; running node | server (embedded opt) | retrieve | real-lib-server | +| pgvector | `query_vector + table -> rows ORDER BY distance` | Postgres table + ANN index | server (Postgres) | retrieve | real-lib-server | +| Vespa | `YQL (ANN+BM25+rank) -> ranked hits` | deployed app package; running node | server | retrieve | real-lib-server | +| Elasticsearch (kNN + BM25) | `knn vector OR BM25 text + index -> ranked hits + _score` | index/mappings; running cluster | server | retrieve | real-lib-server | +| OpenSearch | `knn/neural OR BM25 + index -> ranked hits + _score` | index; running cluster (k-NN plugin) | server | retrieve | real-lib-server | +| Marqo | `query_text\|image + index -> ranked docs + scores` | running server + embed models | server | retrieve | real-lib-server | +| Vald | `query_vector via gRPC -> nearest ids + distances` | K8s cluster (agent-NGT, gateway) | server (K8s) | retrieve | real-lib-server | +| Annoy | `query_vector + index -> top_k ids + distances` | prebuilt immutable 
index | in-mem | retrieve | real-lib-inmem | +| hnswlib | `query_vector + index -> top_k labels + distances` | in-mem HNSW graph | in-mem | retrieve | real-lib-inmem | +| ScaNN | `query_vector + searcher -> top_k ids + scores` | built partition/quantized index | in-mem | retrieve | real-lib-inmem | +| nmslib | `query_vector + index -> top_k ids + distances` | built in-mem index | in-mem | retrieve | real-lib-inmem | +| rank_bm25 | `tokenized_query + corpus -> per-doc score array` | in-mem tokenized corpus | in-mem | retrieve | real-lib-inmem | +| bm25s | `query_tokens + sparse index -> top_k ids + scores` | eagerly-scored sparse matrix (scipy) | in-mem | retrieve | real-lib-inmem | +| Pyserini (Anserini/Lucene) | `query_text + Lucene index -> ranked docids + scores` | prebuilt Lucene index; JVM (no server) | in-mem (needs Java) | retrieve | real-lib-inmem | +| Tantivy / tantivy-py | `query + index -> top docs + BM25 scores` | on-disk inverted index (embedded) | in-mem (embedded) | retrieve | real-lib-inmem | +| Whoosh | `query + index -> ranked hits + scores` | on-disk inverted index | in-mem | retrieve | real-lib-inmem | +| ColBERT | `query_tok_emb + token index -> MaxSim passages` | ColBERT checkpoint + PLAID index | in-mem (GPU pref.) 
| retrieve | real-lib-inmem | +| RAGatouille | `query_text + indexed corpus -> ranked passages` | ColBERT model + built index | in-mem | retrieve | real-lib-inmem | +| PLAID | `query_emb + compressed centroid index -> top_k` | quantized ColBERT index | in-mem | retrieve | research-prototype | +| SPLADE | `query_text -> sparse term weights -> ranked passages` | SPLADE model + sparse/inverted index | in-mem (index may be ES) | retrieve | research-prototype | +| DPR | `question_emb + FAISS passage index -> top_k passages` | Q/ctx encoders + FAISS index | in-mem | retrieve | research-prototype | +| sentence-transformers (bi-encoder) | `query_emb + corpus_emb -> top_k (semantic_search)` | downloaded model; corpus embeddings | in-mem (model dl) | retrieve | real-lib-inmem | +| Instructor embeddings | `(instruction, text) -> embedding` (feed to ANN) | downloaded INSTRUCTOR model | in-mem (model dl) | retrieve | real-lib-inmem | +| BGE / FlagEmbedding (retrieval) | `text -> embedding` (dense+sparse+colbert) | downloaded BGE model | in-mem (model dl) | retrieve | real-lib-inmem | +| ELSER (ES learned sparse) | `query_text -> expanded sparse tokens -> ranked hits` | ES cluster + deployed ELSER model | server | retrieve | real-lib-server | + +### Rerankers (-> `rerank`) + +| System | Query contract (in -> out) | State needed | No-Docker? 
| Family | Pedigree | +|---|---|---|---|---|---| +| FlashRank | `query + candidates -> reordered passages + scores` | ONNX cross-encoder download | in-mem (model dl) | rerank | real-lib-inmem | +| sentence-transformers CrossEncoder | `query + candidates -> reordered + relevance scores` | cross-encoder download | in-mem (model dl) | rerank | real-lib-inmem | +| BGE-reranker (FlagEmbedding) | `query + candidates -> reordered + scores` | BGE reranker download | in-mem (model dl) | rerank | real-lib-inmem | +| MixedBread mxbai-rerank | `query + candidates -> reordered + scores` | mxbai-rerank download | in-mem (model dl) | rerank | real-lib-inmem | +| monoT5 / castorini | `query + passage -> relevance score -> reordered` | T5 reranker download (pygaggle) | in-mem (model dl) | rerank | real-lib-inmem | +| ColBERT-as-reranker | `query + candidates -> MaxSim scores -> reordered` | ColBERT checkpoint download | in-mem (model dl) | rerank | real-lib-inmem | +| rerankers (AnswerDotAI) | `query + candidates -> reordered` (unified API) | backend model / API key | in-mem (model dl) | rerank | real-lib-inmem | +| RankGPT | `query + candidates -> LLM permutation -> reordered` | LLM API key | server (LLM API) | rerank | research-prototype | +| Cohere-rerank | `query + candidates -> reordered + scores` | Cohere API key (not OSS) | server (hosted) | rerank | real-lib-server | +| Lexical token-overlap reranker | `query + candidates -> reordered by overlap` | none | in-mem | rerank | fixture-stub | + +### Answer generators (-> `generate`) + +| System | Query contract (in -> out) | State needed | No-Docker? 
| Family | Pedigree | +|---|---|---|---|---|---| +| Template / extractive responder | `query + passages -> templated/extractive answer` | none | in-mem | generate | fixture-stub | +| HuggingFace QA pipeline (extractive) | `question + context -> answer span + score` | QA model download | in-mem (model dl) | generate | real-lib-inmem | +| Haystack readers | `query + docs -> answer span / generated + citations` | reader/LLM download or API key | in-mem (model dl) | generate | real-lib-inmem | +| LangChain generation | `query + passages -> LLM answer + citations` | LLM API key or local model | in-mem (model dl) | generate | real-lib-inmem | +| llama.cpp / Ollama | `query + passages prompt -> generated answer` | GGUF download / Ollama daemon | in-mem (model dl) | generate | real-lib-inmem | +| FiD (fusion-in-decoder) | `query + N passages -> fused generated answer` | trained FiD checkpoint | in-mem (model dl) | generate | research-prototype | + +### Graph-RAG (-> `graph_rag`) + +| System | Query contract (in -> out) | State needed | No-Docker? 
| Family | Pedigree | +|---|---|---|---|---|---| +| GraphRAG via networkx | `query -> in-mem graph traversal -> passages` | graph build (in-process) | in-mem | graph_rag | fixture-stub | +| Microsoft GraphRAG | `query -> community graph traversal -> answer` | graph build (parquet artifacts); LLM | in-mem (post-index) | graph_rag | real-lib-inmem | +| nano-graphrag | `query -> graph traversal -> context -> answer` | graph build; LLM API key | in-mem (file artifacts) | graph_rag | real-lib-inmem | +| LlamaIndex KnowledgeGraph / PropertyGraphIndex | `query -> KG traversal -> passages -> answer` | graph build; LLM API key | in-mem | graph_rag | real-lib-inmem | +| LightRAG | `query -> dual-level graph + vector -> answer` | graph build; embed/LLM API key | in-mem (file artifacts) | graph_rag | real-lib-inmem | +| HippoRAG | `query -> personalized PageRank over KG -> passages` | graph build; model download; LLM | in-mem (model dl) | graph_rag | research-prototype | +| Neo4j GraphRAG | `query -> Cypher/vector graph retrieval -> passages` | running Neo4j DB; LLM API key | server | graph_rag | real-lib-server | + +### Text-to-SQL / structured retrieval (-> `structured`) + +| System | Query contract (in -> out) | State needed | No-Docker? 
| Family | Pedigree | +|---|---|---|---|---|---| +| Rule-based text-to-SQL (in-mem SQLite) | `question + table -> SQL -> typed rows` | in-mem SQLite copy of the table | in-mem | structured | fixture-stub | +| LlamaIndex NLSQLTableQueryEngine | `NL question + schema -> SQL -> rows -> answer` | SQL DB (SQLite ok); LLM API key | in-mem (SQLite) | structured | real-lib-inmem | +| LangChain SQLDatabaseChain | `NL question + schema -> SQL -> rows -> answer` | SQL DB (SQLite ok); LLM API key | in-mem (SQLite) | structured | real-lib-inmem | +| Vanna.AI | `NL question + trained schema -> SQL -> rows` | vector store of schema; DB; LLM API key | in-mem (embeddable) | structured | real-lib-inmem | +| sqlcoder (defog) | `NL question + schema prompt -> SQL` | sqlcoder model download | in-mem (model dl) | structured | real-lib-inmem | +| DAIL-SQL / DIN-SQL | `NL question + schema (few-shot) -> SQL` | LLM API key (GPT-4); benchmark data | server (LLM API) | structured | research-prototype | +| PICARD | `NL question + schema -> constrained decode -> SQL` | T5 model + PICARD parsing server | server | structured | research-prototype | + +### Evaluation harnesses (cross-cutting, not a connector family) + +These do not fit a retrieval family: they consume `(query, answer, contexts, +ground_truth)` and emit metric scores. They belong on top of the existing +`evaluation/` module, not as a connector family. Recorded for completeness. + +| System | Query contract (in -> out) | No-Docker? 
| Disposition | +|---|---|---|---| +| RAGAS | `(query, answer, contexts, ground_truth) -> faithfulness/relevance scores` | in-mem (LLM API) | out-of-scope (eval) | +| TruLens | `(query, answer, contexts) + feedback fns -> scores (logged)` | in-mem (sqlite) | out-of-scope (eval) | +| DeepEval | `(query, answer, contexts, ground_truth) -> scores (pytest-style)` | in-mem (LLM API) | out-of-scope (eval) | +| ARES | `(query, answer, contexts) -> trained-judge scores` | in-mem (model dl) | out-of-scope (eval) | +| Phoenix (Arize) | `(query, answer, contexts, gt) -> scores + traces` | in-mem (local app) | out-of-scope (eval) | +| Giskard RAG (RAGET) | `(query, answer, contexts, gt) -> component scores + tests` | in-mem (LLM API) | out-of-scope (eval) | +| continuous-eval (relari) | `(query, answer, contexts, gt) -> modular metric scores` | in-mem (LLM API) | out-of-scope (eval) | + +## The family map + +Six families. Each has at least one no-Docker concrete. Contracts below are the +contracts declared on the family base classes. 
+ +| Family | Reader contract (in -> out) | No-Docker concrete | Other backends | Pedigree of the anchor | +|---|---|---|---|---| +| `retrieve` | `query_text + corpus + top_k -> ranked passages w/ scores` (`retrieved_passages: [{doc_id, text, score, rank}]`) | `Bm25sRetriever` (`bm25s`, zero-download lexical) | `TfidfRetriever` (vector-space lexical); no dense/FAISS backend yet | real-lib-inmem | +| `rerank` | `query_text + candidates + top_k -> reordered passages w/ scores` (`reranked_passages`) | `LexicalReranker` (pure-Python token overlap, zero-download) | `FlashRankReranker` (ONNX cross-encoder, `rerank` extra, CI-skip on model download) | fixture-stub anchor + real-lib | +| `generate` | `query_text + passages -> answer + citations` (`generated_answer: {answer, citations}`), grounded by construction | `ExtractiveResponder` (stdlib sentence extraction) | `TemplateResponder` (multi-citation template) | fixture-stub anchor | +| `graph_rag` | `query_text + nodes + edges + top_k -> ranked passages` (`graph_passages`); query-overlap + one-hop neighbour bonus | `AdjacencyGraphRag` (stdlib adjacency map, zero-download) | `NetworkxGraphRag` (`networkx`, `graph` extra); parity test pins identical ranking | fixture-stub anchor + real-lib | +| `structured` | `question + table -> SQL -> typed rows` (`structured_rows: {sql, rows}`); in-mem SQLite, single-SELECT sqlglot guard | `RuleBasedSql` (deterministic NL->SQL over in-mem SQLite) | `AggregateSql` (aggregation queries) | fixture-stub anchor | +| `orchestrator` | `query_text + corpus + top_k -> answer + documents` (internals opaque) (`orchestrated_answer: {answer, documents}`) | `HaystackOrchestrator` (Haystack 2.x BM25 pipeline, offline, telemetry off) | `R2RFixtureOrchestrator` (file-fixture REST stub with `SUPPORTED_VALUES` + stripped params) | real-lib-inmem + fixture-stub | + +What each family is for: + +- **`retrieve`** holds the vector-store / lexical / late-interaction backends + (FAISS, Chroma, bm25s, ColBERT, 
...): all share `query + top_k -> ranked + passages`. Paradigm and vendor are backend and pedigree distinctions. +- **`rerank`** takes *candidates* in, not a corpus (FlashRank, cross-encoders, + RankGPT). +- **`generate`** returns prose plus citations, a different out-shape from a + ranked list (extractive QA, Haystack readers, local LLMs). +- **`graph_rag`** traverses a node/edge graph; the value is connected context + (GraphRAG, LightRAG, HippoRAG, networkx prototypes). +- **`structured`** returns typed rows via generated SQL (Vanna, + NLSQLTableQueryEngine). +- **`orchestrator`** is the opaque end-to-end surface for whole frameworks and + apps (LlamaIndex, Haystack, txtai, and the server-shaped R2R/RAGFlow/Verba/... reached through a fixture stub). + +## Cross-cutting properties + +These recur across families and are currently implemented inline in each +family's `base.py`. The shared axes: + +- **TopK / score-threshold** (retrieve, rerank, graph_rag, orchestrator) +- **Metadata filter** (corpus subset selection) +- **Corpus / index handle** (the locator: which prebuilt index or fixture) +- **Embedding-model selection** (retrieve dense backend, graph_rag) +- **Citation / provenance** (generate, orchestrator) + +## Relationship to the stage pipeline + +The stage pipeline (`feature_groups/rag_pipeline/`) has a FAISS-backed +`retrieval` stage and an `llm_response` stage, which cover the same ground as the +`retrieve` and `generate` connectors: + +- A connector and the corresponding stage emit the same passage / answer row + shape, so a downstream feature is agnostic to which produced it. +- Stages assemble a pipeline step by step; a connector drops in one external tool + that subsumes embed + index + retrieve. +- Switching between them is a change of connector id / options, same + `Feature -> run_all` shape, no pipeline rewrite. 
+ +The `retrieve` family currently has lexical backends (`bm25s`, `tfidf`) only; +there is no dense / FAISS backend yet, and the FAISS retrieval stage is not yet +wired in as one. + +## Package layout + +``` +rag_integration/feature_groups/connectors/ + <family>/ + base.py Base<Family>Connector (contract, option keys, validation) + <backend>.py concrete backend (declares its <family>_backend selector) +tests/connectors/ + <family>/ + <family>_contract.py inheritable contract-test suite + test_<backend>.py concrete adapter test +``` From 2dd668e2dce099dab53c22bd25553e7fce00da64 Mon Sep 17 00:00:00 2001 From: TKaltofen Date: Wed, 10 Jun 2026 12:45:50 +0200 Subject: [PATCH 3/9] refactor: extract shared connectors/mixins.py and connectors/errors.py (#35) (#40) * refactor: extract shared connectors/mixins.py and connectors/errors.py --- .../feature_groups/connectors/errors.py | 36 ++++++ .../connectors/generate/base.py | 39 +++--- .../connectors/graph_rag/base.py | 57 ++++----- .../feature_groups/connectors/mixins.py | 119 ++++++++++++++++++ .../connectors/orchestrator/base.py | 47 +++---- .../feature_groups/connectors/rerank/base.py | 50 +++----- .../connectors/retrieve/base.py | 81 ++++-------- .../connectors/structured/base.py | 38 +++--- tests/connectors/test_shared_mixins.py | 73 +++++++++++ 9 files changed, 341 insertions(+), 199 deletions(-) create mode 100644 rag_integration/feature_groups/connectors/errors.py create mode 100644 rag_integration/feature_groups/connectors/mixins.py create mode 100644 tests/connectors/test_shared_mixins.py diff --git a/rag_integration/feature_groups/connectors/errors.py b/rag_integration/feature_groups/connectors/errors.py new file mode 100644 index 0000000..ddd7f06 --- /dev/null +++ b/rag_integration/feature_groups/connectors/errors.py @@ -0,0 +1,36 @@ +"""Typed errors shared by the connector families. + +All subclass ``ValueError`` (via :class:`ConnectorError`), so callers and +contract tests that catch ``ValueError`` keep working. 
Messages stay at the +raise site, where the per-family wording differs. +""" + +from __future__ import annotations + + +class ConnectorError(ValueError): + """Base for every connector-family validation / rejection error.""" + + +class MissingOptionError(ConnectorError): + """A required option is absent.""" + + +class InvalidOptionError(ConnectorError): + """An option has the wrong type or an unusable value.""" + + +class DuplicateDocIdError(ConnectorError): + """Two entries share an effective ``doc_id``.""" + + +class RankingContractError(ConnectorError): + """A backend ``_rank`` result violates the ranking contract.""" + + +class GroundingError(ConnectorError): + """An answer cites or surfaces something not in the supplied input.""" + + +class SqlSafetyError(ConnectorError): + """Backend SQL is unsafe or not a single bare ``SELECT``.""" diff --git a/rag_integration/feature_groups/connectors/generate/base.py b/rag_integration/feature_groups/connectors/generate/base.py index c27b24f..0212bdf 100644 --- a/rag_integration/feature_groups/connectors/generate/base.py +++ b/rag_integration/feature_groups/connectors/generate/base.py @@ -32,8 +32,11 @@ PythonDictFramework, ) +from rag_integration.feature_groups.connectors.errors import DuplicateDocIdError, GroundingError +from rag_integration.feature_groups.connectors.mixins import DocCollectionMixin, OptionsMixin -class BaseGenerateConnector(FeatureGroup): + +class BaseGenerateConnector(OptionsMixin, DocCollectionMixin, FeatureGroup): """Root FeatureGroup for generate-connector backends. 
A concrete backend declares its selector value in ``GENERATE_BACKENDS`` and @@ -89,18 +92,12 @@ def input_features(self, options: Options, feature_name: FeatureName) -> None: @classmethod def _get_passages(cls, options: Options) -> List[Dict[str, Any]]: - passages = options.get(cls.PASSAGES) - if passages is None: - raise ValueError(f"{cls.__name__} requires '{cls.PASSAGES}' in options: a list of {{doc_id, text}} dicts.") - passages = list(passages) - seen: Set[str] = set() - for i, passage in enumerate(passages): - doc_id = str(passage.get("doc_id", str(i))) - if doc_id in seen: - raise ValueError( - f"{cls.__name__} received duplicate passage doc_id '{doc_id}'; doc_ids must be unique." - ) - seen.add(doc_id) + passages = cls._require_doc_list(options, cls.PASSAGES) + duplicate = cls._find_duplicate_doc_id(passages) + if duplicate is not None: + raise DuplicateDocIdError( + f"{cls.__name__} received duplicate passage doc_id '{duplicate}'; doc_ids must be unique." + ) return passages @classmethod @@ -119,14 +116,16 @@ def _generate(cls, query: str, passages: List[Dict[str, Any]]) -> Tuple[str, Lis @classmethod def _validate_citations(cls, citations: List[str], passages: List[Dict[str, Any]]) -> None: """Reject any citation that is not one of the supplied passage doc_ids, or cited twice.""" - known = {str(p.get("doc_id", str(i))) for i, p in enumerate(passages)} + known = cls._known_doc_ids(passages) for citation in citations: if citation not in known: - raise ValueError( + raise GroundingError( f"{cls.__name__}._generate cited '{citation}', which is not among the supplied passages." ) if len(citations) != len(set(citations)): - raise ValueError(f"{cls.__name__}._generate returned duplicate citations; each doc_id may be cited once.") + raise GroundingError( + f"{cls.__name__}._generate returned duplicate citations; each doc_id may be cited once." 
+ ) @classmethod def _answer(cls, query: str, passages: List[Dict[str, Any]]) -> Dict[str, Any]: @@ -138,12 +137,12 @@ def _answer(cls, query: str, passages: List[Dict[str, Any]]) -> Dict[str, Any]: # Grounded by construction, in both directions: a non-empty answer must # cite its source(s), and citations without an answer are meaningless. if answer.strip() and not citations: - raise ValueError( + raise GroundingError( f"{cls.__name__}._generate returned a non-empty answer with no citations; " f"a grounded answer must cite at least one supplied passage." ) if not answer.strip() and citations: - raise ValueError( + raise GroundingError( f"{cls.__name__}._generate returned citations with an empty answer; " f"citations are only valid for a non-empty answer." ) @@ -158,9 +157,7 @@ def calculate_feature(cls, data: Any, features: FeatureSet) -> List[Dict[str, An """Generate an answer from the passages, return the answer object.""" for feature in features.features: options = feature.options - query = options.get(cls.QUERY_TEXT) - if query is None: - raise ValueError(f"{cls.__name__} requires '{cls.QUERY_TEXT}' in options.") + query = cls._require_option(options, cls.QUERY_TEXT) passages = cls._get_passages(options) return [{cls.ROOT_FEATURE_NAME: cls._answer(str(query), passages)}] return [] diff --git a/rag_integration/feature_groups/connectors/graph_rag/base.py b/rag_integration/feature_groups/connectors/graph_rag/base.py index c9a1f10..4da5879 100644 --- a/rag_integration/feature_groups/connectors/graph_rag/base.py +++ b/rag_integration/feature_groups/connectors/graph_rag/base.py @@ -35,8 +35,16 @@ PythonDictFramework, ) +from rag_integration.feature_groups.connectors.errors import DuplicateDocIdError, InvalidOptionError +from rag_integration.feature_groups.connectors.mixins import ( + DocCollectionMixin, + OptionsMixin, + RankingValidationMixin, + TopKMixin, +) + -class BaseGraphRagConnector(FeatureGroup): +class BaseGraphRagConnector(OptionsMixin, TopKMixin, 
DocCollectionMixin, RankingValidationMixin, FeatureGroup): """Root FeatureGroup for graph-RAG connector backends. A concrete backend declares its selector value in ``GRAPH_BACKENDS`` and @@ -47,15 +55,12 @@ class BaseGraphRagConnector(FeatureGroup): ROOT_FEATURE_NAME = "graph_passages" - # Option keys. + # Option keys. ``TOP_K`` / ``DEFAULT_TOP_K`` come from ``TopKMixin``. GRAPH_BACKEND = "graph_backend" QUERY_TEXT = "query_text" - TOP_K = "top_k" NODES = "nodes" EDGES = "edges" - DEFAULT_TOP_K = 5 - # Filled per concrete; empty on the base so it never matches. GRAPH_BACKENDS: Dict[str, str] = {} @@ -64,7 +69,7 @@ class BaseGraphRagConnector(FeatureGroup): PROPERTY_MAPPING = { GRAPH_BACKEND: {"explanation": "Which graph-RAG backend to use"}, QUERY_TEXT: {"explanation": "Raw text query to search the graph"}, - TOP_K: {"explanation": f"Number of passages to return (default {DEFAULT_TOP_K})"}, + TopKMixin.TOP_K: {"explanation": f"Number of passages to return (default {TopKMixin.DEFAULT_TOP_K})"}, NODES: {"explanation": "Graph nodes: a list of {doc_id, text} dicts"}, EDGES: { "explanation": "Graph edges: a list of [doc_id_a, doc_id_b] pairs." @@ -97,18 +102,6 @@ def input_features(self, options: Options, feature_name: FeatureName) -> None: """Root feature: no input features (graph arrives via Options).""" return None - @classmethod - def _get_top_k(cls, options: Options) -> int: - val = options.get(cls.TOP_K) - return int(val) if val is not None else cls.DEFAULT_TOP_K - - @classmethod - def _get_nodes(cls, options: Options) -> List[Dict[str, Any]]: - nodes = options.get(cls.NODES) - if nodes is None: - raise ValueError(f"{cls.__name__} requires '{cls.NODES}' in options: a list of {{doc_id, text}} dicts.") - return list(nodes) - @classmethod def _resolve_edges(cls, options: Options) -> List[Tuple[str, str]]: """Resolve the optional ``EDGES`` option into ``(doc_id_a, doc_id_b)`` pairs. 
@@ -123,7 +116,7 @@ def _resolve_edges(cls, options: Options) -> List[Tuple[str, str]]: if raw_edges is None: return [] if not isinstance(raw_edges, (list, tuple)): - raise ValueError( + raise InvalidOptionError( f"{cls.__name__} '{cls.EDGES}' must be a list of [doc_id_a, doc_id_b] pairs, " f"got {type(raw_edges).__name__}." ) @@ -154,13 +147,7 @@ def _rank(cls, query: str, texts: List[str], edges: List[Tuple[int, int]], top_k @classmethod def _validate_ranking(cls, ranked: List[Tuple[int, float]], n_nodes: int) -> None: """Reject out-of-range or duplicate indices from a backend's ``_rank``.""" - seen: Set[int] = set() - for idx, _score in ranked: - if not 0 <= idx < n_nodes: - raise ValueError(f"{cls.__name__}._rank returned out-of-range index {idx} for {n_nodes} nodes.") - if idx in seen: - raise ValueError(f"{cls.__name__}._rank returned duplicate index {idx}.") - seen.add(idx) + cls._validate_rank_indices(ranked, n_nodes, f"{n_nodes} nodes") @classmethod def _retrieve( @@ -185,13 +172,13 @@ def _retrieve( if effective_k <= 0: return [] + duplicate = cls._find_duplicate_doc_id(nodes) + if duplicate is not None: + raise DuplicateDocIdError(f"{cls.__name__}: duplicate doc_id '{duplicate}': edges would be ambiguous.") + texts = [str(node.get("text", "")) for node in nodes] - doc_ids = [str(node.get("doc_id", str(i))) for i, node in enumerate(nodes)] - doc_id_to_index: Dict[str, int] = {} - for i, doc_id in enumerate(doc_ids): - if doc_id in doc_id_to_index: - raise ValueError(f"{cls.__name__}: duplicate doc_id '{doc_id}': edges would be ambiguous.") - doc_id_to_index[doc_id] = i + doc_ids = cls._effective_doc_ids(nodes) + doc_id_to_index: Dict[str, int] = {doc_id: i for i, doc_id in enumerate(doc_ids)} edge_indices = [ (doc_id_to_index[a], doc_id_to_index[b]) for a, b in edges if a in doc_id_to_index and b in doc_id_to_index ] @@ -209,10 +196,8 @@ def calculate_feature(cls, data: Any, features: FeatureSet) -> List[Dict[str, An """Score nodes by query overlap plus a 
one-hop neighbour bonus, return ranked passages.""" for feature in features.features: options = feature.options - query = options.get(cls.QUERY_TEXT) - if query is None: - raise ValueError(f"{cls.__name__} requires '{cls.QUERY_TEXT}' in options.") - nodes = cls._get_nodes(options) + query = cls._require_option(options, cls.QUERY_TEXT) + nodes = cls._require_doc_list(options, cls.NODES) edges = cls._resolve_edges(options) top_k = cls._get_top_k(options) passages = cls._retrieve(str(query), nodes, edges, top_k) diff --git a/rag_integration/feature_groups/connectors/mixins.py b/rag_integration/feature_groups/connectors/mixins.py new file mode 100644 index 0000000..6f807d1 --- /dev/null +++ b/rag_integration/feature_groups/connectors/mixins.py @@ -0,0 +1,119 @@ +"""Cross-cutting property mixins shared by the connector-family bases. + +Each mixin hoists one concern PR #31 duplicated inline (top_k parsing, doc +collection / doc_id bookkeeping, ranking validation). They are plain classes +listed ahead of ``FeatureGroup`` in a base, so mloda discovery still sees only +the ``FeatureGroup`` leaves; ``cls.__name__`` keeps messages naming the backend. 
+""" + +from __future__ import annotations + +from typing import Any, Dict, List, Optional, Sequence, Set, Tuple + +from mloda.user import Options + +from rag_integration.feature_groups.connectors.errors import ( + InvalidOptionError, + MissingOptionError, + RankingContractError, +) + + +class OptionsMixin: + """Read required values out of ``Options``.""" + + @classmethod + def _require_option(cls, options: Options, key: str) -> Any: + value = options.get(key) + if value is None: + raise MissingOptionError(f"{cls.__name__} requires '{key}' in options.") + return value + + +class TopKMixin: + """The ``top_k`` cut-off (retrieve, rerank, graph_rag, orchestrator).""" + + TOP_K = "top_k" + DEFAULT_TOP_K = 5 + + @classmethod + def _get_top_k(cls, options: Options) -> int: + val = options.get(cls.TOP_K) + if val is None: + return cls.DEFAULT_TOP_K + try: + return int(val) + except (ValueError, TypeError) as exc: + raise InvalidOptionError(f"{cls.__name__} option '{cls.TOP_K}' must be an integer, got {val!r}.") from exc + + +class DocCollectionMixin: + """A ``{doc_id, text}`` collection and its ``doc_id`` bookkeeping. + + Effective ``doc_id`` is the explicit value coerced to ``str``, else the + positional index. 
+ """ + + @staticmethod + def _effective_doc_id(item: Dict[str, Any], index: int) -> str: + return str(item.get("doc_id", str(index))) + + @classmethod + def _effective_doc_ids(cls, items: Sequence[Dict[str, Any]]) -> List[str]: + return [cls._effective_doc_id(item, i) for i, item in enumerate(items)] + + @classmethod + def _known_doc_ids(cls, items: Sequence[Dict[str, Any]]) -> Set[str]: + return set(cls._effective_doc_ids(items)) + + @classmethod + def _find_duplicate_doc_id(cls, items: Sequence[Dict[str, Any]]) -> Optional[str]: + """Return the first repeated effective ``doc_id``, or ``None``.""" + seen: Set[str] = set() + for i, item in enumerate(items): + doc_id = cls._effective_doc_id(item, i) + if doc_id in seen: + return doc_id + seen.add(doc_id) + return None + + @classmethod + def _require_doc_list(cls, options: Options, key: str) -> List[Dict[str, Any]]: + value = options.get(key) + if value is None: + raise MissingOptionError(f"{cls.__name__} requires '{key}' in options: a list of {{doc_id, text}} dicts.") + return list(value) + + +class RankingValidationMixin: + """Validate the ``(index, score)`` pairs a backend ``_rank`` returns.""" + + @classmethod + def _validate_rank_indices( + cls, + ranked: List[Tuple[int, float]], + count: int, + extent: str, + *, + non_increasing: bool = False, + ) -> None: + """Reject out-of-range / duplicate indices, and (if ``non_increasing``) rising scores. + + ``extent`` is the population label used in the out-of-range message + (e.g. ``"3 candidates"``). 
+ """ + seen: Set[int] = set() + previous_score: Optional[float] = None + for idx, score in ranked: + if not 0 <= idx < count: + raise RankingContractError(f"{cls.__name__}._rank returned out-of-range index {idx} for {extent}.") + if idx in seen: + raise RankingContractError(f"{cls.__name__}._rank returned duplicate index {idx}.") + seen.add(idx) + if non_increasing: + if previous_score is not None and score > previous_score: + raise RankingContractError( + f"{cls.__name__}._rank returned scores out of order: {score} after {previous_score} " + f"(scores must be non-increasing, best-first)." + ) + previous_score = score diff --git a/rag_integration/feature_groups/connectors/orchestrator/base.py b/rag_integration/feature_groups/connectors/orchestrator/base.py index 8e41d84..b1d1d13 100644 --- a/rag_integration/feature_groups/connectors/orchestrator/base.py +++ b/rag_integration/feature_groups/connectors/orchestrator/base.py @@ -33,8 +33,11 @@ PythonDictFramework, ) +from rag_integration.feature_groups.connectors.errors import DuplicateDocIdError, GroundingError +from rag_integration.feature_groups.connectors.mixins import DocCollectionMixin, OptionsMixin, TopKMixin -class BaseOrchestratorConnector(FeatureGroup): + +class BaseOrchestratorConnector(OptionsMixin, TopKMixin, DocCollectionMixin, FeatureGroup): """Root FeatureGroup for orchestrator connector backends. A concrete backend declares its selector value in ``ORCHESTRATOR_BACKENDS`` @@ -45,20 +48,19 @@ class BaseOrchestratorConnector(FeatureGroup): ROOT_FEATURE_NAME = "orchestrated_answer" - # Option keys. + # Option keys. ``TOP_K`` / ``DEFAULT_TOP_K`` come from ``TopKMixin``. 
ORCHESTRATOR_BACKEND = "orchestrator_backend" QUERY_TEXT = "query_text" - TOP_K = "top_k" CORPUS = "corpus" - DEFAULT_TOP_K = 5 - ORCHESTRATOR_BACKENDS: Dict[str, str] = {} PROPERTY_MAPPING = { ORCHESTRATOR_BACKEND: {"explanation": "Which orchestrator (external framework) backend to use"}, QUERY_TEXT: {"explanation": "The query to run through the framework pipeline"}, - TOP_K: {"explanation": f"Number of documents the pipeline should surface (default {DEFAULT_TOP_K})"}, + TopKMixin.TOP_K: { + "explanation": f"Number of documents the pipeline should surface (default {TopKMixin.DEFAULT_TOP_K})" + }, CORPUS: {"explanation": "Inline corpus: a list of {doc_id, text} dicts"}, } @@ -87,18 +89,6 @@ def input_features(self, options: Options, feature_name: FeatureName) -> None: """Root feature: no input features (the corpus arrives via Options).""" return None - @classmethod - def _get_top_k(cls, options: Options) -> int: - val = options.get(cls.TOP_K) - return int(val) if val is not None else cls.DEFAULT_TOP_K - - @classmethod - def _get_corpus(cls, options: Options) -> List[Dict[str, Any]]: - corpus = options.get(cls.CORPUS) - if corpus is None: - raise ValueError(f"{cls.__name__} requires '{cls.CORPUS}' in options: a list of {{doc_id, text}} dicts.") - return list(corpus) - @classmethod @abstractmethod def _run(cls, query: str, corpus: List[Dict[str, Any]], top_k: int) -> Tuple[str, List[Dict[str, Any]]]: @@ -121,20 +111,17 @@ def _validate_unique_doc_ids(cls, corpus: List[Dict[str, Any]]) -> None: explicit ``doc_id`` ``"1"`` collides with a missing ``doc_id`` at index 1; the check runs on the effective ids. 
""" - seen: Set[str] = set() - for i, doc in enumerate(corpus): - doc_id = str(doc.get("doc_id", str(i))) - if doc_id in seen: - raise ValueError(f"{cls.__name__}: duplicate doc_id {doc_id!r} in corpus; ids must be unique.") - seen.add(doc_id) + duplicate = cls._find_duplicate_doc_id(corpus) + if duplicate is not None: + raise DuplicateDocIdError(f"{cls.__name__}: duplicate doc_id {duplicate!r} in corpus; ids must be unique.") @classmethod def _validate_documents(cls, documents: List[Dict[str, Any]], corpus: List[Dict[str, Any]]) -> None: """Reject any surfaced document whose doc_id is not in the supplied corpus.""" - known = {str(doc.get("doc_id", str(i))) for i, doc in enumerate(corpus)} + known = cls._known_doc_ids(corpus) for document in documents: if str(document.get("doc_id")) not in known: - raise ValueError( + raise GroundingError( f"{cls.__name__}._run surfaced document {document.get('doc_id')!r}, " f"which is not in the supplied corpus." ) @@ -151,7 +138,7 @@ def _answer(cls, query: str, corpus: List[Dict[str, Any]], top_k: int) -> Dict[s # with no documents is a valid retrieve-only / no-match result; an empty # answer alongside documents is fine too (retrieve-only pipeline). 
if answer.strip() and not documents: - raise ValueError(f"{cls.__name__}._run returned a non-empty answer with no supporting documents.") + raise GroundingError(f"{cls.__name__}._run returned a non-empty answer with no supporting documents.") return {"answer": answer, "documents": documents} @classmethod @@ -159,10 +146,8 @@ def calculate_feature(cls, data: Any, features: FeatureSet) -> List[Dict[str, An """Run the framework pipeline, return the answer object.""" for feature in features.features: options = feature.options - query = options.get(cls.QUERY_TEXT) - if query is None: - raise ValueError(f"{cls.__name__} requires '{cls.QUERY_TEXT}' in options.") - corpus = cls._get_corpus(options) + query = cls._require_option(options, cls.QUERY_TEXT) + corpus = cls._require_doc_list(options, cls.CORPUS) top_k = cls._get_top_k(options) return [{cls.ROOT_FEATURE_NAME: cls._answer(str(query), corpus, top_k)}] return [] diff --git a/rag_integration/feature_groups/connectors/rerank/base.py b/rag_integration/feature_groups/connectors/rerank/base.py index d41a103..2593fe8 100644 --- a/rag_integration/feature_groups/connectors/rerank/base.py +++ b/rag_integration/feature_groups/connectors/rerank/base.py @@ -35,8 +35,15 @@ PythonDictFramework, ) +from rag_integration.feature_groups.connectors.mixins import ( + DocCollectionMixin, + OptionsMixin, + RankingValidationMixin, + TopKMixin, +) + -class BaseRerankConnector(FeatureGroup): +class BaseRerankConnector(OptionsMixin, TopKMixin, DocCollectionMixin, RankingValidationMixin, FeatureGroup): """Root FeatureGroup for rerank-connector backends. A concrete backend declares its selector value in ``RERANK_BACKENDS`` and @@ -51,14 +58,11 @@ class BaseRerankConnector(FeatureGroup): ROOT_FEATURE_NAME = "reranked_passages" - # Option keys. + # Option keys. ``TOP_K`` / ``DEFAULT_TOP_K`` come from ``TopKMixin``. 
RERANK_BACKEND = "rerank_backend" QUERY_TEXT = "query_text" - TOP_K = "top_k" CANDIDATES = "candidates" - DEFAULT_TOP_K = 5 - # Filled per concrete: {backend_value: human-readable description}. Disjoint # across backends; empty on the base so it never matches. RERANK_BACKENDS: Dict[str, str] = {} @@ -68,7 +72,9 @@ class BaseRerankConnector(FeatureGroup): PROPERTY_MAPPING = { RERANK_BACKEND: {"explanation": "Which rerank-connector backend to use"}, QUERY_TEXT: {"explanation": "Query the candidates are reranked against"}, - TOP_K: {"explanation": f"Number of passages to return after reranking (default {DEFAULT_TOP_K})"}, + TopKMixin.TOP_K: { + "explanation": f"Number of passages to return after reranking (default {TopKMixin.DEFAULT_TOP_K})" + }, CANDIDATES: {"explanation": "Candidate passages to rerank: a list of {doc_id, text} dicts"}, } @@ -101,20 +107,6 @@ def input_features(self, options: Options, feature_name: FeatureName) -> None: """Root feature: no input features (candidates arrive via Options).""" return None - @classmethod - def _get_top_k(cls, options: Options) -> int: - val = options.get(cls.TOP_K) - return int(val) if val is not None else cls.DEFAULT_TOP_K - - @classmethod - def _get_candidates(cls, options: Options) -> List[Dict[str, Any]]: - candidates = options.get(cls.CANDIDATES) - if candidates is None: - raise ValueError( - f"{cls.__name__} requires '{cls.CANDIDATES}' in options: a list of {{doc_id, text}} dicts." 
- ) - return list(candidates) - @classmethod @abstractmethod def _rank(cls, query: str, texts: List[str], top_k: int) -> List[Tuple[int, float]]: @@ -131,15 +123,7 @@ def _rank(cls, query: str, texts: List[str], top_k: int) -> List[Tuple[int, floa @classmethod def _validate_ranking(cls, ranked: List[Tuple[int, float]], n_candidates: int) -> None: """Reject out-of-range or duplicate indices from a backend's ``_rank``.""" - seen: Set[int] = set() - for idx, _score in ranked: - if not 0 <= idx < n_candidates: - raise ValueError( - f"{cls.__name__}._rank returned out-of-range index {idx} for {n_candidates} candidates." - ) - if idx in seen: - raise ValueError(f"{cls.__name__}._rank returned duplicate index {idx}.") - seen.add(idx) + cls._validate_rank_indices(ranked, n_candidates, f"{n_candidates} candidates") @classmethod def _rerank( @@ -156,7 +140,7 @@ def _rerank( return [] texts = [str(doc.get("text", "")) for doc in candidates] - doc_ids = [str(doc.get("doc_id", str(i))) for i, doc in enumerate(candidates)] + doc_ids = cls._effective_doc_ids(candidates) ranked = cls._rank(query, texts, effective_k) cls._validate_ranking(ranked, len(candidates)) @@ -171,10 +155,8 @@ def calculate_feature(cls, data: Any, features: FeatureSet) -> List[Dict[str, An """Rerank the candidates against the query, return reordered passages.""" for feature in features.features: options = feature.options - query = options.get(cls.QUERY_TEXT) - if query is None: - raise ValueError(f"{cls.__name__} requires '{cls.QUERY_TEXT}' in options.") - candidates = cls._get_candidates(options) + query = cls._require_option(options, cls.QUERY_TEXT) + candidates = cls._require_doc_list(options, cls.CANDIDATES) top_k = cls._get_top_k(options) passages = cls._rerank(str(query), candidates, top_k) return [{cls.ROOT_FEATURE_NAME: passages}] diff --git a/rag_integration/feature_groups/connectors/retrieve/base.py b/rag_integration/feature_groups/connectors/retrieve/base.py index 9b0ad37..088279b 100644 --- 
a/rag_integration/feature_groups/connectors/retrieve/base.py +++ b/rag_integration/feature_groups/connectors/retrieve/base.py @@ -30,8 +30,16 @@ PythonDictFramework, ) +from rag_integration.feature_groups.connectors.errors import DuplicateDocIdError, RankingContractError +from rag_integration.feature_groups.connectors.mixins import ( + DocCollectionMixin, + OptionsMixin, + RankingValidationMixin, + TopKMixin, +) + -class BaseRetrieveConnector(FeatureGroup): +class BaseRetrieveConnector(OptionsMixin, TopKMixin, DocCollectionMixin, RankingValidationMixin, FeatureGroup): """Root FeatureGroup for retrieve-connector backends. A concrete backend declares its selector value in ``RETRIEVE_BACKENDS`` and @@ -57,14 +65,11 @@ class BaseRetrieveConnector(FeatureGroup): ROOT_FEATURE_NAME = "retrieved_passages" - # Option keys. + # Option keys. ``TOP_K`` / ``DEFAULT_TOP_K`` come from ``TopKMixin``. RETRIEVE_BACKEND = "retrieve_backend" QUERY_TEXT = "query_text" - TOP_K = "top_k" CORPUS = "corpus" - DEFAULT_TOP_K = 5 - # Filled per concrete: {backend_value: human-readable description}. The base # stays empty so it never matches a feature. Values must be disjoint across # backends (see the class docstring). 
@@ -78,7 +83,7 @@ class BaseRetrieveConnector(FeatureGroup): PROPERTY_MAPPING = { RETRIEVE_BACKEND: {"explanation": "Which retrieve-connector backend to use"}, QUERY_TEXT: {"explanation": "Raw text query to search the corpus"}, - TOP_K: {"explanation": f"Number of passages to return (default {DEFAULT_TOP_K})"}, + TopKMixin.TOP_K: {"explanation": f"Number of passages to return (default {TopKMixin.DEFAULT_TOP_K})"}, CORPUS: {"explanation": "Inline corpus: a list of {doc_id, text} dicts"}, } @@ -113,23 +118,6 @@ def input_features(self, options: Options, feature_name: FeatureName) -> None: """Root feature: no input features.""" return None - @classmethod - def _get_top_k(cls, options: Options) -> int: - val = options.get(cls.TOP_K) - if val is None: - return cls.DEFAULT_TOP_K - try: - return int(val) - except (ValueError, TypeError) as exc: - raise ValueError(f"{cls.__name__} option '{cls.TOP_K}' must be an integer, got {val!r}.") from exc - - @classmethod - def _get_corpus(cls, options: Options) -> List[Dict[str, Any]]: - corpus = options.get(cls.CORPUS) - if corpus is None: - raise ValueError(f"{cls.__name__} requires '{cls.CORPUS}' in options: a list of {{doc_id, text}} dicts.") - return list(corpus) - @classmethod @abstractmethod def _rank(cls, query: str, texts: List[str], top_k: int) -> List[Tuple[int, float]]: @@ -151,29 +139,10 @@ def _rank(cls, query: str, texts: List[str], top_k: int) -> List[Tuple[int, floa @classmethod def _validate_ranking(cls, ranked: List[Tuple[int, float]], corpus_size: int, top_k: int) -> None: - """Reject a ``_rank`` result that breaks the contract. - - Enforces all four :meth:`_rank` requirements: indices in range, indices - unique, at most ``top_k`` pairs, and scores non-increasing (best-first). 
- """ + """Enforce the four :meth:`_rank` requirements (count is retrieve-specific; rest shared).""" if len(ranked) > top_k: - raise ValueError(f"{cls.__name__}._rank returned {len(ranked)} pairs for top_k={top_k}.") - seen: Set[int] = set() - previous_score: Optional[float] = None - for corpus_idx, score in ranked: - if not 0 <= corpus_idx < corpus_size: - raise ValueError( - f"{cls.__name__}._rank returned out-of-range index {corpus_idx} for a corpus of size {corpus_size}." - ) - if corpus_idx in seen: - raise ValueError(f"{cls.__name__}._rank returned duplicate index {corpus_idx}.") - seen.add(corpus_idx) - if previous_score is not None and score > previous_score: - raise ValueError( - f"{cls.__name__}._rank returned scores out of order: {score} after {previous_score} " - f"(scores must be non-increasing, best-first)." - ) - previous_score = score + raise RankingContractError(f"{cls.__name__}._rank returned {len(ranked)} pairs for top_k={top_k}.") + cls._validate_rank_indices(ranked, corpus_size, f"a corpus of size {corpus_size}", non_increasing=True) @classmethod def _retrieve( @@ -206,15 +175,13 @@ def _retrieve( f"Each entry must be a {{doc_id, text}} dict." ) - doc_ids = [str(doc.get("doc_id", str(i))) for i, doc in enumerate(corpus)] - seen_doc_ids: Set[str] = set() - for doc_id in doc_ids: - if doc_id in seen_doc_ids: - raise ValueError( - f"{cls.__name__} corpus contains duplicate doc_id {doc_id!r} " - f"(after str() coercion and the positional-index fallback)." - ) - seen_doc_ids.add(doc_id) + duplicate = cls._find_duplicate_doc_id(corpus) + if duplicate is not None: + raise DuplicateDocIdError( + f"{cls.__name__} corpus contains duplicate doc_id {duplicate!r} " + f"(after str() coercion and the positional-index fallback)." 
+ ) + doc_ids = cls._effective_doc_ids(corpus) effective_k = min(top_k, len(corpus)) if effective_k <= 0: @@ -252,10 +219,8 @@ def calculate_feature(cls, data: Any, features: FeatureSet) -> List[Dict[str, An ) for feature in feature_list: options = feature.options - query = options.get(cls.QUERY_TEXT) - if query is None: - raise ValueError(f"{cls.__name__} requires '{cls.QUERY_TEXT}' in options.") - corpus = cls._get_corpus(options) + query = cls._require_option(options, cls.QUERY_TEXT) + corpus = cls._require_doc_list(options, cls.CORPUS) top_k = cls._get_top_k(options) passages = cls._retrieve(str(query), corpus, top_k) return [{cls.ROOT_FEATURE_NAME: passages}] diff --git a/rag_integration/feature_groups/connectors/structured/base.py b/rag_integration/feature_groups/connectors/structured/base.py index fae03b4..2ca2fe3 100644 --- a/rag_integration/feature_groups/connectors/structured/base.py +++ b/rag_integration/feature_groups/connectors/structured/base.py @@ -33,10 +33,13 @@ PythonDictFramework, ) +from rag_integration.feature_groups.connectors.errors import InvalidOptionError, SqlSafetyError +from rag_integration.feature_groups.connectors.mixins import OptionsMixin + _IDENT_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$") -class BaseStructuredConnector(FeatureGroup): +class BaseStructuredConnector(OptionsMixin, FeatureGroup): """Root FeatureGroup for structured (text-to-SQL) connector backends. 
A concrete backend declares its selector value in ``STRUCTURED_BACKENDS`` and @@ -89,13 +92,6 @@ def input_features(self, options: Options, feature_name: FeatureName) -> None: """Root feature: no input features (the table arrives via Options).""" return None - @classmethod - def _require(cls, options: Options, key: str) -> Any: - value = options.get(key) - if value is None: - raise ValueError(f"{cls.__name__} requires '{key}' in options.") - return value - @classmethod def _validate_identifier(cls, name: str, kind: str) -> str: """Reject any table/column name that is not a simple SQL identifier. @@ -103,7 +99,9 @@ def _validate_identifier(cls, name: str, kind: str) -> str: Identifiers cannot be parameterised, so this whitelist is what keeps the generated SQL injection-safe (values, by contrast, are always bound).""" if not _IDENT_RE.fullmatch(name): - raise ValueError(f"{cls.__name__}: invalid {kind} identifier {name!r}; expected a simple SQL identifier.") + raise InvalidOptionError( + f"{cls.__name__}: invalid {kind} identifier {name!r}; expected a simple SQL identifier." 
+ ) return name @classmethod @@ -131,9 +129,11 @@ def _validate_select(cls, sql: str) -> None: try: statements = sqlglot.parse(sql, read="sqlite") except SqlglotError as error: - raise ValueError(f"{cls.__name__}._to_sql produced unparseable SQL: {sql!r}") from error + raise SqlSafetyError(f"{cls.__name__}._to_sql produced unparseable SQL: {sql!r}") from error if len(statements) != 1 or not isinstance(statements[0], exp.Select): - raise ValueError(f"{cls.__name__}._to_sql must produce a single top-level SELECT statement, got: {sql!r}") + raise SqlSafetyError( + f"{cls.__name__}._to_sql must produce a single top-level SELECT statement, got: {sql!r}" + ) @classmethod def _query( @@ -147,9 +147,9 @@ def _query( table = cls._validate_identifier(table, "table") columns = [cls._validate_identifier(c, "column") for c in columns] if not columns: - raise ValueError(f"{cls.__name__}: at least one column is required.") + raise InvalidOptionError(f"{cls.__name__}: at least one column is required.") if len({c.lower() for c in columns}) != len(columns): - raise ValueError( + raise InvalidOptionError( f"{cls.__name__}: duplicate column names (SQLite is case-insensitive) are not allowed: {columns}." 
) @@ -185,15 +185,15 @@ def calculate_feature(cls, data: Any, features: FeatureSet) -> List[Dict[str, An """Answer the question over the supplied table, return the SQL and rows.""" for feature in features.features: options = feature.options - question = str(cls._require(options, cls.QUESTION)) - table = str(cls._require(options, cls.TABLE)) - raw_columns = cls._require(options, cls.COLUMNS) + question = str(cls._require_option(options, cls.QUESTION)) + table = str(cls._require_option(options, cls.TABLE)) + raw_columns = cls._require_option(options, cls.COLUMNS) if not isinstance(raw_columns, (list, tuple)): - raise ValueError(f"{cls.__name__}: '{cls.COLUMNS}' must be a list or tuple of column names.") + raise InvalidOptionError(f"{cls.__name__}: '{cls.COLUMNS}' must be a list or tuple of column names.") columns = [str(c) for c in raw_columns] - raw_rows = cls._require(options, cls.ROWS) + raw_rows = cls._require_option(options, cls.ROWS) if not isinstance(raw_rows, (list, tuple)) or not all(isinstance(row, dict) for row in raw_rows): - raise ValueError(f"{cls.__name__}: '{cls.ROWS}' must be a list or tuple of dicts.") + raise InvalidOptionError(f"{cls.__name__}: '{cls.ROWS}' must be a list or tuple of dicts.") rows = [dict(row) for row in raw_rows] return [{cls.ROOT_FEATURE_NAME: cls._query(question, table, columns, rows)}] return [] diff --git a/tests/connectors/test_shared_mixins.py b/tests/connectors/test_shared_mixins.py new file mode 100644 index 0000000..b82ac94 --- /dev/null +++ b/tests/connectors/test_shared_mixins.py @@ -0,0 +1,73 @@ +"""Shared connector mixins and error types: every error is a ``ValueError``, and +``top_k`` parsing is uniform across the ``TopKMixin`` families.""" + +from __future__ import annotations + +from typing import Type + +import pytest + +from mloda.user import Options + +from rag_integration.feature_groups.connectors.errors import ( + ConnectorError, + DuplicateDocIdError, + GroundingError, + InvalidOptionError, + 
MissingOptionError, + RankingContractError, + SqlSafetyError, +) +from rag_integration.feature_groups.connectors.graph_rag.base import BaseGraphRagConnector +from rag_integration.feature_groups.connectors.mixins import TopKMixin +from rag_integration.feature_groups.connectors.orchestrator.base import BaseOrchestratorConnector +from rag_integration.feature_groups.connectors.rerank.base import BaseRerankConnector +from rag_integration.feature_groups.connectors.retrieve.base import BaseRetrieveConnector + + +class TestErrorHierarchy: + """Every connector error is a ``ValueError`` so existing callers keep catching it.""" + + @pytest.mark.parametrize( + "error_cls", + [ + ConnectorError, + MissingOptionError, + InvalidOptionError, + DuplicateDocIdError, + RankingContractError, + GroundingError, + SqlSafetyError, + ], + ) + def test_subclasses_value_error(self, error_cls: Type[Exception]) -> None: + assert issubclass(error_cls, ValueError) + + +_TOPK_FAMILIES = [BaseRetrieveConnector, BaseRerankConnector, BaseGraphRagConnector, BaseOrchestratorConnector] + + +class TestSharedTopK: + """``TopKMixin._get_top_k`` parses uniformly across every family that mixes it in.""" + + @pytest.mark.parametrize("base", _TOPK_FAMILIES) + def test_garbage_top_k_raises_naming_key_and_value(self, base: Type[TopKMixin]) -> None: + options = Options(context={base.TOP_K: "not-an-int"}) + with pytest.raises(InvalidOptionError, match="top_k.*not-an-int"): + base._get_top_k(options) + + @pytest.mark.parametrize("base", _TOPK_FAMILIES) + def test_non_coercible_type_raises_value_error(self, base: Type[TopKMixin]) -> None: + # A list cannot become an int; before the refactor the non-validating + # families surfaced a bare TypeError here, now it is a loud ValueError. 
+ options = Options(context={base.TOP_K: [1, 2]}) + with pytest.raises(ValueError): + base._get_top_k(options) + + @pytest.mark.parametrize("base", _TOPK_FAMILIES) + def test_absent_top_k_falls_back_to_default(self, base: Type[TopKMixin]) -> None: + assert base._get_top_k(Options(context={})) == base.DEFAULT_TOP_K + + @pytest.mark.parametrize("base", _TOPK_FAMILIES) + def test_string_integer_is_coerced(self, base: Type[TopKMixin]) -> None: + assert base._get_top_k(Options(context={base.TOP_K: "3"})) == 3 From 242d4a706c0e3f26651055a76b9436636751ff8f Mon Sep 17 00:00:00 2001 From: TKaltofen Date: Wed, 10 Jun 2026 14:51:55 +0200 Subject: [PATCH 4/9] docs: standalone connectors family-map README + top-level README link (#33) (#41) * docs: add connectors family-map README; link from top-level README Phase 2 (#33): move the per-family connector prose from the top-level README into rag_integration/feature_groups/connectors/README.md as a family-map table (contract, backends, no-Docker concrete, pedigree) plus per-family detail, cross-linked to docs/rag-connector-base-classes.md and each family's contract suite. The top-level README now links there instead of duplicating the detail (one source of truth). * docs: clarify per-family selector keys and pedigree/dep wording in connectors README --- README.md | 77 ++-------- .../feature_groups/connectors/README.md | 134 ++++++++++++++++++ 2 files changed, 144 insertions(+), 67 deletions(-) create mode 100644 rag_integration/feature_groups/connectors/README.md diff --git a/README.md b/README.md index e6d8620..9721ff6 100644 --- a/README.md +++ b/README.md @@ -124,15 +124,15 @@ feature = Feature( ## Connector families Alongside the build-your-own stage pipeline, the `connectors/` package wraps -whole external open-source RAG tools under one mloda surface, organized into -families by query-contract shape (see issue #25 for the taxonomy and the -backend-selection rationale). 
Each family is a thin `Base<Family>Connector` -FeatureGroup plus one or more concrete backends, with an inheritable -contract-test suite so a new backend's test is a handful of adapter methods. +whole external open-source RAG tools under one mloda surface, organized into six +families by query-contract shape (retrieve, rerank, generate, graph_rag, +structured, orchestrator). You swap backends by changing options, not by +rewriting a pipeline. -The first family is `retrieve` (`query_text + corpus + top_k -> ranked -passages`). Its canonical backend is `Bm25sRetriever` (BM25 lexical retrieval -via `bm25s`): zero-download, deterministic, MIT/numpy-only. +See [`feature_groups/connectors/README.md`](rag_integration/feature_groups/connectors/README.md) +for the family map (per-family contract, backends, no-Docker concrete, and +pedigree), runnable examples, and links to the contract suites. The design +rationale is in [`docs/rag-connector-base-classes.md`](docs/rag-connector-base-classes.md). ```python from mloda.user import mlodaAPI, Feature, Options, PluginCollector @@ -160,65 +160,8 @@ results = mlodaAPI.run_all( ) ``` -A second backend, `TfidfRetriever` (`retrieve_backend="tfidf"`), ranks the same -corpus by TF-IDF cosine similarity (a vector-space lexical counterpart to the -probabilistic `bm25s`): it vectorizes the corpus and query with the repo's -deterministic TF-IDF embedder and needs no extra dependency, so it is also -zero-download and a CI anchor. - -Install the family's backend with `uv sync --extra connectors`. - -The `rerank` family (`query_text + candidates + top_k -> reordered passages`) -reorders already-retrieved candidates. Its canonical backend is -`LexicalReranker` (`rerank_backend="lexical"`): pure-Python token overlap, -zero-download and deterministic. `FlashRankReranker` (`rerank_backend="flashrank"`, -`uv sync --extra rerank`) adds a real ONNX cross-encoder; its model downloads on -first use, so its test runs locally and is skipped on CI. 
- -The `generate` family (`query_text + passages -> answer + citations`) produces a -grounded answer from supporting passages. Its canonical backend is -`ExtractiveResponder` (`generate_backend="extractive"`): pure-Python sentence -extraction, zero-download and deterministic, and grounded by construction (every -citation is one of the supplied passages). A second backend, `TemplateResponder` -(`generate_backend="template"`), selects the top query-relevant sentences across -passages, joins them into a fixed template, and cites every passage it drew from -(multi-citation, vs the extractive responder's single citation); it is likewise -pure-Python, zero-download, and grounded by construction. LLM-backed generators -are pedigree backends for later. - -The `graph_rag` family (`query_text + nodes + edges + top_k -> ranked passages`) -scores nodes by query overlap plus a one-hop neighbour bonus: a passage -connected to a relevant one is surfaced even with no query-term overlap. Its canonical backend is `NetworkxGraphRag` -(`graph_backend="networkx"`, `uv sync --extra graph`): zero-download, -deterministic, BSD/pure-Python. A second backend, `AdjacencyGraphRag` -(`graph_backend="adjacency"`), applies the same overlap + neighbour-bonus -scoring over a hand-built adjacency map with no networkx (stdlib only), -demonstrating that the family contract is not tied to one graph library. - -The `structured` family (`question + table -> SQL -> typed rows`) answers a -natural-language question over a relational table. Its canonical backend is -`RuleBasedSql` (`structured_backend="rule_based"`, `uv sync --extra structured`): -rule-based NL->SQL executed on stdlib `sqlite3`, with `sqlglot` validating the -generated SQL is a single top-level `SELECT` statement. Zero-download, -deterministic, no LLM; values are always bound parameters and identifiers are -whitelisted. 
A second backend, `AggregateSql` (`structured_backend="aggregate"`), -adds aggregation intents (avg/min/max/sum over a column named in the question; -numericness is not validated, and SQLite's coercion means e.g. `AVG` over a text -column returns `0.0`) on top of the count/filter/list intents, reusing the same -identifier whitelist, SQL guard, and sqlite execution. - -The `orchestrator` family (`query_text + corpus + top_k -> answer + documents`) -wraps a whole external RAG framework as one connector (bring your existing -pipeline). Its canonical backend is `HaystackOrchestrator` -(`orchestrator_backend="haystack"`, `uv sync --extra orchestrator`): a real -Haystack 2.x in-memory BM25 pipeline, zero-download (no model, no server) so it -runs in CI. A second backend, `R2RFixtureOrchestrator` -(`orchestrator_backend="r2r"`), covers a different integration mode: it models a -server-shaped tool (R2R) over a static JSON fixture of canned responses (the -open-kgo `rest_public` pattern), answering with honest-surface narrowing -(surfacing only canned documents that are in the supplied corpus). No server, no -network, zero-dependency, deterministic. Other server-shaped tools (e.g. -RAGFlow) can follow the same fixture-stub pattern. +Install a family's backend with `uv sync --extra connectors` (or `rerank` / +`graph` / `structured` / `orchestrator`). ## Installation diff --git a/rag_integration/feature_groups/connectors/README.md b/rag_integration/feature_groups/connectors/README.md new file mode 100644 index 0000000..3b17f45 --- /dev/null +++ b/rag_integration/feature_groups/connectors/README.md @@ -0,0 +1,134 @@ +# Connector families + +The `connectors/` package wraps whole external open-source RAG tools under one +mloda surface, organized into families by **query-contract shape**. 
Each family +is a thin `Base<Family>Connector` FeatureGroup plus one or more concrete +backends gated by a per-family selector option (`retrieve_backend`, +`rerank_backend`, `generate_backend`, `graph_backend`, `structured_backend`, +`orchestrator_backend`), with an inheritable contract-test suite so a new +backend's test is a handful of adapter methods. + +This sits alongside the build-your-own stage pipeline (`../rag_pipeline/`): the +stages let you assemble a pipeline step by step, the connectors let you drop in +one external tool that subsumes several steps. You swap retrievers, rerankers, +or generators by changing options, not by rewriting a pipeline. + +For the design (how families are cut, the full landscape survey, and the base +classes) see [`docs/rag-connector-base-classes.md`](../../../docs/rag-connector-base-classes.md). +The shared cross-cutting mixins and error types live in [`mixins.py`](mixins.py) +and [`errors.py`](errors.py). + +## Family map + +The canonical concrete per family is the zero-download, deterministic backend +that anchors the CI contract suite. Pedigree tags: `real-lib-inmem` (a real +library running in-process), `fixture-stub` (deterministic stand-in, no model +download or server). The full survey in the design doc also uses +`real-lib-server` and `research-prototype`. 
+ +| Family | Reader contract (in -> out) | No-Docker concrete | Other backends | Pedigree of the anchor | Contract suite | +|---|---|---|---|---|---| +| [`retrieve`](retrieve/) | `query_text + corpus + top_k -> ranked passages` (`retrieved_passages: [{doc_id, text, score, rank}]`) | `Bm25sRetriever` (`bm25s`, zero-download lexical) | `TfidfRetriever` (vector-space lexical) | real-lib-inmem | [`retrieve_contract.py`](../../../tests/connectors/retrieve/retrieve_contract.py) | +| [`rerank`](rerank/) | `query_text + candidates + top_k -> reordered passages` (`reranked_passages`) | `LexicalReranker` (token overlap, zero-download) | `FlashRankReranker` (ONNX cross-encoder, `rerank` extra, CI-skip on model download) | fixture-stub | [`rerank_contract.py`](../../../tests/connectors/rerank/rerank_contract.py) | +| [`generate`](generate/) | `query_text + passages -> answer + citations` (`generated_answer: {answer, citations}`), grounded by construction | `ExtractiveResponder` (stdlib sentence extraction) | `TemplateResponder` (multi-citation template) | fixture-stub | [`generate_contract.py`](../../../tests/connectors/generate/generate_contract.py) | +| [`graph_rag`](graph_rag/) | `query_text + nodes + edges + top_k -> ranked passages` (`graph_passages`); query overlap + one-hop neighbour bonus | `AdjacencyGraphRag` (stdlib adjacency map, zero-download) | `NetworkxGraphRag` (`networkx`, `graph` extra); parity test pins identical ranking | fixture-stub | [`graph_rag_contract.py`](../../../tests/connectors/graph_rag/graph_rag_contract.py) | +| [`structured`](structured/) | `question + table -> SQL -> typed rows` (`structured_rows: {sql, rows}`); in-mem SQLite, single-SELECT `sqlglot` guard | `RuleBasedSql` (rule-based NL->SQL, `structured` extra) | `AggregateSql` (adds avg/min/max/sum intents) | fixture-stub | [`structured_contract.py`](../../../tests/connectors/structured/structured_contract.py) | +| [`orchestrator`](orchestrator/) | `query_text + corpus + top_k -> answer + 
documents` (internals opaque) (`orchestrated_answer: {answer, documents}`) | `HaystackOrchestrator` (Haystack 2.x BM25, offline, `orchestrator` extra) | `R2RFixtureOrchestrator` (file-fixture REST stub) | real-lib-inmem | [`orchestrator_contract.py`](../../../tests/connectors/orchestrator/orchestrator_contract.py) | + +## Families in detail + +### `retrieve` -- `query_text + corpus + top_k -> ranked passages` + +Holds the vector-store / lexical / late-interaction backends (FAISS, Chroma, +bm25s, ColBERT, ...). The anchor `Bm25sRetriever` (`retrieve_backend="bm25s"`) is +BM25 lexical retrieval via `bm25s`: zero-download, deterministic, numpy/scipy. +`TfidfRetriever` (`retrieve_backend="tfidf"`) ranks the same corpus by TF-IDF +cosine similarity using the repo's deterministic embedder, also zero-download. + +```python +from mloda.user import mlodaAPI, Feature, Options, PluginCollector +from mloda_plugins.compute_framework.base_implementations.python_dict.python_dict_framework import ( + PythonDictFramework, +) +from rag_integration.feature_groups.connectors.retrieve import Bm25sRetriever + +feature = Feature( + "retrieved_passages", + options=Options(context={ + "retrieve_backend": "bm25s", + "query_text": "cat pet", + "corpus": [ + {"doc_id": "d1", "text": "A cat is an independent and curious pet."}, + {"doc_id": "d2", "text": "Cars need regular engine oil and maintenance."}, + ], + "top_k": 3, + }), +) +results = mlodaAPI.run_all( + [feature], + compute_frameworks={PythonDictFramework}, + plugin_collector=PluginCollector.enabled_feature_groups({Bm25sRetriever}), +) +``` + +### `rerank` -- `query_text + candidates + top_k -> reordered passages` + +Takes *candidates* in (already retrieved), not a corpus. `LexicalReranker` +(`rerank_backend="lexical"`) is pure-Python token overlap, zero-download. 
+`FlashRankReranker` (`rerank_backend="flashrank"`, `--extra rerank`) adds a real +ONNX cross-encoder; its model downloads on first use, so its test runs locally +and is skipped on CI. + +### `generate` -- `query_text + passages -> answer + citations` + +Returns prose plus citations, grounded by construction (every citation is one of +the supplied passages). `ExtractiveResponder` (`generate_backend="extractive"`) +does pure-Python sentence extraction with a single citation. `TemplateResponder` +(`generate_backend="template"`) selects top query-relevant sentences across +passages into a fixed template and cites every passage it drew from. LLM-backed +generators are pedigree backends for later. + +### `graph_rag` -- `query_text + nodes + edges + top_k -> ranked passages` + +Scores nodes by query overlap plus a one-hop neighbour bonus: a passage +connected to a relevant one is surfaced even with no query-term overlap. +`AdjacencyGraphRag` (`graph_backend="adjacency"`) applies the scoring over a +hand-built adjacency map with stdlib only. `NetworkxGraphRag` +(`graph_backend="networkx"`, `--extra graph`) does the same over `networkx`; a +parity test pins identical ranking, showing the contract is not tied to one +graph library. + +### `structured` -- `question + table -> SQL -> typed rows` + +Answers a natural-language question over a relational table. `RuleBasedSql` +(`structured_backend="rule_based"`, `--extra structured`) does rule-based NL->SQL +executed on stdlib `sqlite3`, with `sqlglot` validating the generated SQL is a +single top-level `SELECT`. Values are always bound parameters and identifiers +whitelisted. `AggregateSql` (`structured_backend="aggregate"`) adds aggregation +intents (avg/min/max/sum) on top of the count/filter/list intents. + +### `orchestrator` -- `query_text + corpus + top_k -> answer + documents` + +Wraps a whole external RAG framework as one connector (bring your existing +pipeline); the internals are the framework's. 
`HaystackOrchestrator` +(`orchestrator_backend="haystack"`, `--extra orchestrator`) runs a real Haystack +2.x in-memory BM25 pipeline, zero-download (no model, no server) so it runs in +CI. `R2RFixtureOrchestrator` (`orchestrator_backend="r2r"`) models a +server-shaped tool over a static JSON fixture, surfacing only canned documents +that are in the supplied corpus. Other server-shaped tools can follow the same +fixture-stub pattern. + +## How a backend is selected + +Each base gates on its selector option in `match_feature_group_criteria` (named +per family above; note `graph_rag` uses `graph_backend`, not +`graph_rag_backend`); backends declare disjoint selector values, so at most one +ever claims a given `Options`. An unknown backend matches nothing. The +base owns the cross-backend contract (option extraction, validation, assembly); +a concrete backend implements only its one ranking / generation hook. + +## Install + +```bash +uv sync --extra connectors # or --extra rerank / graph / structured / orchestrator +``` From 3fe3e56fc417e6f4fa3ad926ed0e6b88e2791464 Mon Sep 17 00:00:00 2001 From: Tom Kaltofen Date: Wed, 10 Jun 2026 12:59:33 +0000 Subject: [PATCH 5/9] test: enforce PROPERTY_MAPPING default invariant across all feature groups Upstream mloda will validate PROPERTY_MAPPING defaults at class-definition time (FeatureChainParser.validate_property_mapping_defaults called from FeatureGroup.__init_subclass__). Add a repo-wide test that applies the same invariant now: every strict default must be one of the key's accepted values. Delegates to the upstream validator when the installed mloda provides it, otherwise replicates the logic on FeatureChainParser helpers from 0.8.x. 
--- .../test_property_mapping_defaults.py | 122 ++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 tests/feature_groups/test_property_mapping_defaults.py diff --git a/tests/feature_groups/test_property_mapping_defaults.py b/tests/feature_groups/test_property_mapping_defaults.py new file mode 100644 index 0000000..679a8d8 --- /dev/null +++ b/tests/feature_groups/test_property_mapping_defaults.py @@ -0,0 +1,122 @@ +"""Repo-wide invariant: every PROPERTY_MAPPING default must be an accepted value. + +Upstream mloda is adding ``FeatureChainParser.validate_property_mapping_defaults``, +called from ``FeatureGroup.__init_subclass__``, which rejects at class-definition +time any strict default that is not in the key's accepted values. Once that mloda +release ships, a violating feature group fails on import. This test enforces the +same invariant now so violations are caught here first. When the installed mloda +already exposes the validator, the test delegates to it; otherwise it replicates +the upstream logic on top of the ``FeatureChainParser`` helpers present in 0.8.x. 
+""" + +from __future__ import annotations + +import importlib +import pkgutil +from typing import Any, Dict, List, Optional, Type + +from mloda.provider import DefaultOptionKeys, FeatureChainParser, FeatureGroup + +import rag_integration.feature_groups + + +def _validate_with_replica(owner_name: str, property_mapping: Optional[Dict[str, Any]]) -> List[str]: + """Replicate upstream validate_property_mapping_defaults; returns violation messages.""" + violations: List[str] = [] + if property_mapping is None: + return violations + for key, spec in property_mapping.items(): + if not isinstance(spec, dict): + continue + if DefaultOptionKeys.default not in spec: + continue + default = spec[DefaultOptionKeys.default] + if default is None: + continue + validation_function = FeatureChainParser._get_validation_function(spec) + if validation_function is not None: + if not FeatureChainParser._is_strict_validation(spec): + continue + try: + verdict = validation_function(default) + except Exception as exc: + violations.append( + f"{owner_name}.PROPERTY_MAPPING['{key}'] default {default!r}: validation_function raised {exc!r}" + ) + continue + if not verdict: + violations.append( + f"{owner_name}.PROPERTY_MAPPING['{key}'] default {default!r}: rejected by validation_function" + ) + continue + accepted = FeatureChainParser._extract_property_values(spec) + try: + FeatureChainParser._validate_property_value(default, accepted, key, spec) + except (ValueError, TypeError): + violations.append( + f"{owner_name}.PROPERTY_MAPPING['{key}'] default {default!r}: " + f"not in accepted values {sorted(accepted, key=repr)}" + ) + return violations + + +def _validate(owner_name: str, property_mapping: Optional[Dict[str, Any]]) -> List[str]: + """Delegate to the upstream validator when available, else use the replica.""" + upstream = getattr(FeatureChainParser, "validate_property_mapping_defaults", None) + if upstream is None: + return _validate_with_replica(owner_name, property_mapping) + try: + 
upstream(owner_name, property_mapping) + except (ValueError, TypeError) as exc: + return [str(exc)] + return [] + + +def _all_feature_groups() -> List[Type[FeatureGroup]]: + """Import every feature_groups module and collect this package's FeatureGroup subclasses.""" + for module_info in pkgutil.walk_packages( + rag_integration.feature_groups.__path__, prefix="rag_integration.feature_groups." + ): + try: + importlib.import_module(module_info.name) + except ImportError: + # Optional-dependency modules (e.g. model backends) are exercised by their own tests. + continue + + collected: List[Type[FeatureGroup]] = [] + stack: List[Type[FeatureGroup]] = list(FeatureGroup.__subclasses__()) + seen: set[Type[FeatureGroup]] = set() + while stack: + candidate = stack.pop() + if candidate in seen: + continue + seen.add(candidate) + stack.extend(candidate.__subclasses__()) + if candidate.__module__.startswith("rag_integration."): + collected.append(candidate) + return sorted(collected, key=lambda c: f"{c.__module__}.{c.__name__}") + + +def test_validator_catches_bad_default() -> None: + """Guard against a vacuously green check: a default outside the accepted values must be flagged.""" + bad_mapping: Dict[str, Any] = { + "mode": { + "fast": "fast mode", + "slow": "slow mode", + DefaultOptionKeys.default: "turbo", + DefaultOptionKeys.strict_validation: True, + } + } + assert _validate("DummyOwner", bad_mapping) + + +def test_all_property_mapping_defaults_are_accepted_values() -> None: + feature_groups = _all_feature_groups() + assert len(feature_groups) >= 70, "feature group discovery looks broken" + + violations: List[str] = [] + for feature_group in feature_groups: + owner = f"{feature_group.__module__}.{feature_group.__name__}" + violations.extend(_validate(owner, feature_group.PROPERTY_MAPPING)) + + assert not violations, "PROPERTY_MAPPING defaults outside accepted values:\n" + "\n".join(violations) From 26cf42acfc10950095b028bb44550473e69684df Mon Sep 17 00:00:00 2001 From: Tom 
Kaltofen Date: Wed, 10 Jun 2026 13:08:32 +0000 Subject: [PATCH 6/9] test: fail on import errors during discovery and tighten guards Review findings: walk_packages silently ignores ImportError without onerror, and the per-module except hid internal import regressions. Collect every import failure and fail the test. Tighten the discovery floor to the current count of 74 and add a validation_function case to the anti-vacuous guard. --- .../test_property_mapping_defaults.py | 43 +++++++++++++------ 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/tests/feature_groups/test_property_mapping_defaults.py b/tests/feature_groups/test_property_mapping_defaults.py index 679a8d8..c90c02e 100644 --- a/tests/feature_groups/test_property_mapping_defaults.py +++ b/tests/feature_groups/test_property_mapping_defaults.py @@ -1,12 +1,9 @@ """Repo-wide invariant: every PROPERTY_MAPPING default must be an accepted value. -Upstream mloda is adding ``FeatureChainParser.validate_property_mapping_defaults``, -called from ``FeatureGroup.__init_subclass__``, which rejects at class-definition -time any strict default that is not in the key's accepted values. Once that mloda -release ships, a violating feature group fails on import. This test enforces the -same invariant now so violations are caught here first. When the installed mloda -already exposes the validator, the test delegates to it; otherwise it replicates -the upstream logic on top of the ``FeatureChainParser`` helpers present in 0.8.x. +Upstream mloda will enforce this at class-definition time via +``FeatureChainParser.validate_property_mapping_defaults`` in ``FeatureGroup.__init_subclass__``. +Enforce it here first: delegate to that validator when the installed mloda has it, +otherwise replicate it on the ``FeatureChainParser`` helpers present in 0.8.x. 
""" from __future__ import annotations @@ -73,15 +70,22 @@ def _validate(owner_name: str, property_mapping: Optional[Dict[str, Any]]) -> Li def _all_feature_groups() -> List[Type[FeatureGroup]]: - """Import every feature_groups module and collect this package's FeatureGroup subclasses.""" + """Import every feature_groups module and collect this package's FeatureGroup subclasses. + + Any import failure fails the test: a module that cannot import is a module whose + PROPERTY_MAPPING this invariant silently skips. + """ + import_failures: List[str] = [] for module_info in pkgutil.walk_packages( - rag_integration.feature_groups.__path__, prefix="rag_integration.feature_groups." + rag_integration.feature_groups.__path__, + prefix="rag_integration.feature_groups.", + onerror=lambda name: import_failures.append(f"{name}: failed during package walk"), ): try: importlib.import_module(module_info.name) - except ImportError: - # Optional-dependency modules (e.g. model backends) are exercised by their own tests. 
- continue + except Exception as exc: + import_failures.append(f"{module_info.name}: {exc!r}") + assert not import_failures, "feature_groups modules failed to import:\n" + "\n".join(import_failures) collected: List[Type[FeatureGroup]] = [] stack: List[Type[FeatureGroup]] = list(FeatureGroup.__subclasses__()) @@ -110,9 +114,22 @@ def test_validator_catches_bad_default() -> None: assert _validate("DummyOwner", bad_mapping) +def test_validator_catches_default_rejected_by_validation_function() -> None: + bad_mapping: Dict[str, Any] = { + "size": { + "explanation": "positive size", + DefaultOptionKeys.default: -1, + DefaultOptionKeys.strict_validation: True, + DefaultOptionKeys.validation_function: lambda value: isinstance(value, int) and value > 0, + } + } + assert _validate("DummyOwner", bad_mapping) + + def test_all_property_mapping_defaults_are_accepted_values() -> None: feature_groups = _all_feature_groups() - assert len(feature_groups) >= 70, "feature group discovery looks broken" + # 74 feature groups exist today; lower this only when groups are deliberately removed. + assert len(feature_groups) >= 74, f"feature group discovery looks broken, found {len(feature_groups)}" violations: List[str] = [] for feature_group in feature_groups: From b4f0504cfdde6fe866763cf61df6dc9ef626b765 Mon Sep 17 00:00:00 2001 From: Tom Kaltofen Date: Wed, 10 Jun 2026 13:28:53 +0000 Subject: [PATCH 7/9] docs: record feature_groups/connectors/ as the canonical package path Closes #37. The strategy issue (#25) proposed rag/ while PR #31 shipped feature_groups/connectors/; this records the shipped path as canonical and maps stale rag/ references. Also lists mixins.py and errors.py in the package layout. 
--- docs/rag-connector-base-classes.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/rag-connector-base-classes.md b/docs/rag-connector-base-classes.md index 0fe58b2..858a08c 100644 --- a/docs/rag-connector-base-classes.md +++ b/docs/rag-connector-base-classes.md @@ -239,6 +239,8 @@ wired in as one. ``` rag_integration/feature_groups/connectors/ + mixins.py shared cross-cutting option mixins + errors.py shared error types / base.py BaseConnector (contract, option keys, validation) .py concrete backend (declares its _backend selector) @@ -247,3 +249,14 @@ tests/connectors/ _contract.py inheritable contract-test suite test_.py concrete adapter test ``` + +### Canonical package path + +`rag_integration/feature_groups/connectors/` is the canonical package path +(decided in #37). The strategy issue (#25) proposed `rag/`, but the +implementation (#31) shipped under `feature_groups/`, matching both the repo's +existing convention (`rag_pipeline/`, `image_pipeline/`, `datasets/`, +`evaluation/`) and the open-kgo precedent, whose `kg/` package likewise lives at +`open_kgo/feature_groups/kg/`. Any remaining `rag/...` path references in older +issues or notes map to `rag_integration/feature_groups/connectors/...` (code) +and `tests/connectors/...` (contract suites). From 818b0f2758bb464325358c6d53d55edf44f6fffb Mon Sep 17 00:00:00 2001 From: Tom Kaltofen Date: Wed, 10 Jun 2026 14:33:49 +0000 Subject: [PATCH 8/9] feat: fold FAISS path in as canonical dense backend of retrieve (#36) FaissDenseRetriever (retrieve_backend="faiss") runs the stage pipeline's FAISS nearest-neighbor search behind the retrieve family contract: hash embeddings, in-memory IndexFlatIP, positive cosine scores best-first. The retrieval and llm_response stages now also emit the canonical retrieved_passages / generated_answer shapes and serve those feature names (gated on their defining options), so migrating between a stage and a connector is an option swap, not a pipeline rewrite. 
Parity is verified end to end by tests/integration/test_stage_connector_parity.py, and the migration seam is written down in the design doc and READMEs. --- README.md | 7 + docs/rag-connector-base-classes.md | 41 ++-- .../feature_groups/connectors/README.md | 14 +- .../connectors/retrieve/__init__.py | 3 +- .../connectors/retrieve/faiss_retriever.py | 81 +++++++ .../rag_pipeline/llm_response/base.py | 31 ++- .../rag_pipeline/retrieval/base.py | 61 +++++- .../retrieve/test_faiss_retriever.py | 45 ++++ .../test_stage_connector_parity.py | 203 ++++++++++++++++++ 9 files changed, 460 insertions(+), 26 deletions(-) create mode 100644 rag_integration/feature_groups/connectors/retrieve/faiss_retriever.py create mode 100644 tests/connectors/retrieve/test_faiss_retriever.py create mode 100644 tests/integration/test_stage_connector_parity.py diff --git a/README.md b/README.md index 9721ff6..e953fbd 100644 --- a/README.md +++ b/README.md @@ -129,6 +129,13 @@ families by query-contract shape (retrieve, rerank, generate, graph_rag, structured, orchestrator). You swap backends by changing options, not by rewriting a pipeline. +The two layers share one seam: the FAISS `retrieval` stage is the native dense +path of the `retrieve` family (`retrieve_backend="faiss"`), and a stage and its +connector counterpart emit the same passage / answer row shape under the same +canonical feature name, so migrating between them is an option swap. See +"Relationship to the stage pipeline" in +[`docs/rag-connector-base-classes.md`](docs/rag-connector-base-classes.md). + See [`feature_groups/connectors/README.md`](rag_integration/feature_groups/connectors/README.md) for the family map (per-family contract, backends, no-Docker concrete, and pedigree), runnable examples, and links to the contract suites. 
The design diff --git a/docs/rag-connector-base-classes.md b/docs/rag-connector-base-classes.md index 858a08c..10f00ee 100644 --- a/docs/rag-connector-base-classes.md +++ b/docs/rag-connector-base-classes.md @@ -218,22 +218,35 @@ family's `base.py`. The shared axis: - **Embedding-model selection** (retrieve dense backend, graph_rag) - **Citation / provenance** (generate, orchestrator) -## Relationship to the stage pipeline +## Relationship to the stage pipeline (the migration seam) The stage pipeline (`feature_groups/rag_pipeline/`) has a FAISS-backed -`retrieval` stage and an `llm_response` stage, which cover the same ground as the -`retrieve` and `generate` connectors: - -- A connector and the corresponding stage emit the same passage / answer row - shape, so a downstream feature is agnostic to which produced it. -- Stages assemble a pipeline step by step; a connector drops in one external tool - that subsumes embed + index + retrieve. -- Switching between them is a change of connector id / options, same - `Feature -> run_all` shape, no pipeline rewrite. - -The `retrieve` family currently has lexical backends (`bm25s`, `tfidf`) only; -there is no dense / FAISS backend yet, and the FAISS retrieval stage is not yet -wired in as one. +`retrieval` stage and an `llm_response` stage, which cover the same ground as +the `retrieve` and `generate` connectors. The division of labor: **stages = +build-your-own** (assemble source -> chunk -> embed -> index -> retrieve step +by step), **connectors = bring an existing tool** (drop in one external tool +that subsumes several steps). 
They are one world, not two parallel ones, and +the seam between them is pinned down in three ways (issue #36): + +- **The FAISS stage is the native dense path of `retrieve`, not a separate + concept.** `FaissDenseRetriever` (`retrieve_backend="faiss"`) runs the same + FAISS nearest-neighbor search over the same stage-pipeline embeddings + (`HashEmbedder`), serving the family's inline-corpus contract; the + `retrieval` stage serves the same search over a pre-built on-disk index. The + `retrieve` family thus has lexical (`bm25s`, `tfidf`) and dense (`faiss`) + backends under one contract. +- **Same row shape, same feature name.** A connector and the corresponding + stage emit the same passage / answer row shape under the same canonical + feature name: the `retrieval` stage also serves `retrieved_passages` + (`[{doc_id, text, score, rank}]`, score higher-is-better) when its + `index_path` option is present, and the `llm_response` stage also serves + `generated_answer` (`{answer, citations}`) when its `query` option is + present. A downstream feature is agnostic to which produced it. Verified by + [`tests/integration/test_stage_connector_parity.py`](../tests/integration/test_stage_connector_parity.py). +- **Migration is an option swap, not a pipeline rewrite.** Moving between a + stage and a connector (either direction) keeps the requested feature name + and the `Feature -> run_all` shape; only the options change (inline `corpus` + + `retrieve_backend` vs `index_path` + `embedding_method`). ## Package layout diff --git a/rag_integration/feature_groups/connectors/README.md b/rag_integration/feature_groups/connectors/README.md index 3b17f45..4938104 100644 --- a/rag_integration/feature_groups/connectors/README.md +++ b/rag_integration/feature_groups/connectors/README.md @@ -28,7 +28,7 @@ download or server). 
The full survey in the design doc also uses | Family | Reader contract (in -> out) | No-Docker concrete | Other backends | Pedigree of the anchor | Contract suite | |---|---|---|---|---|---| -| [`retrieve`](retrieve/) | `query_text + corpus + top_k -> ranked passages` (`retrieved_passages: [{doc_id, text, score, rank}]`) | `Bm25sRetriever` (`bm25s`, zero-download lexical) | `TfidfRetriever` (vector-space lexical) | real-lib-inmem | [`retrieve_contract.py`](../../../tests/connectors/retrieve/retrieve_contract.py) | +| [`retrieve`](retrieve/) | `query_text + corpus + top_k -> ranked passages` (`retrieved_passages: [{doc_id, text, score, rank}]`) | `Bm25sRetriever` (`bm25s`, zero-download lexical) | `TfidfRetriever` (vector-space lexical), `FaissDenseRetriever` (dense FAISS, `faiss` extra) | real-lib-inmem | [`retrieve_contract.py`](../../../tests/connectors/retrieve/retrieve_contract.py) | | [`rerank`](rerank/) | `query_text + candidates + top_k -> reordered passages` (`reranked_passages`) | `LexicalReranker` (token overlap, zero-download) | `FlashRankReranker` (ONNX cross-encoder, `rerank` extra, CI-skip on model download) | fixture-stub | [`rerank_contract.py`](../../../tests/connectors/rerank/rerank_contract.py) | | [`generate`](generate/) | `query_text + passages -> answer + citations` (`generated_answer: {answer, citations}`), grounded by construction | `ExtractiveResponder` (stdlib sentence extraction) | `TemplateResponder` (multi-citation template) | fixture-stub | [`generate_contract.py`](../../../tests/connectors/generate/generate_contract.py) | | [`graph_rag`](graph_rag/) | `query_text + nodes + edges + top_k -> ranked passages` (`graph_passages`); query overlap + one-hop neighbour bonus | `AdjacencyGraphRag` (stdlib adjacency map, zero-download) | `NetworkxGraphRag` (`networkx`, `graph` extra); parity test pins identical ranking | fixture-stub | [`graph_rag_contract.py`](../../../tests/connectors/graph_rag/graph_rag_contract.py) | @@ -44,6 +44,17 @@ bm25s, 
ColBERT, ...). The anchor `Bm25sRetriever` (`retrieve_backend="bm25s"`) i BM25 lexical retrieval via `bm25s`: zero-download, deterministic, numpy/scipy. `TfidfRetriever` (`retrieve_backend="tfidf"`) ranks the same corpus by TF-IDF cosine similarity using the repo's deterministic embedder, also zero-download. +`FaissDenseRetriever` (`retrieve_backend="faiss"`, `--extra faiss`) is the +canonical **dense** backend: the same FAISS nearest-neighbor search the stage +pipeline's `retrieval` stage runs, folded in behind this contract (cosine over +the repo's deterministic hash embeddings, in-memory `IndexFlatIP`). + +The FAISS `retrieval` stage and this family are one world: the stage serves the +same `retrieved_passages` shape from a pre-built on-disk index, so migrating +between stage and connector is an option swap, not a pipeline rewrite. See +"Relationship to the stage pipeline" in the +[design doc](../../../docs/rag-connector-base-classes.md) and the parity test +in [`tests/integration/test_stage_connector_parity.py`](../../../tests/integration/test_stage_connector_parity.py). ```python from mloda.user import mlodaAPI, Feature, Options, PluginCollector @@ -131,4 +142,5 @@ a concrete backend implements only its one ranking / generation hook. 
```bash uv sync --extra connectors # or --extra rerank / graph / structured / orchestrator +uv sync --extra faiss # dense retrieve backend (FaissDenseRetriever) ``` diff --git a/rag_integration/feature_groups/connectors/retrieve/__init__.py b/rag_integration/feature_groups/connectors/retrieve/__init__.py index 7b68a8f..ec1c5e1 100644 --- a/rag_integration/feature_groups/connectors/retrieve/__init__.py +++ b/rag_integration/feature_groups/connectors/retrieve/__init__.py @@ -4,6 +4,7 @@ from rag_integration.feature_groups.connectors.retrieve.base import BaseRetrieveConnector from rag_integration.feature_groups.connectors.retrieve.bm25s_retriever import Bm25sRetriever +from rag_integration.feature_groups.connectors.retrieve.faiss_retriever import FaissDenseRetriever from rag_integration.feature_groups.connectors.retrieve.tfidf_retriever import TfidfRetriever -__all__ = ["BaseRetrieveConnector", "Bm25sRetriever", "TfidfRetriever"] +__all__ = ["BaseRetrieveConnector", "Bm25sRetriever", "FaissDenseRetriever", "TfidfRetriever"] diff --git a/rag_integration/feature_groups/connectors/retrieve/faiss_retriever.py b/rag_integration/feature_groups/connectors/retrieve/faiss_retriever.py new file mode 100644 index 0000000..6fda2eb --- /dev/null +++ b/rag_integration/feature_groups/connectors/retrieve/faiss_retriever.py @@ -0,0 +1,81 @@ +"""Dense FAISS retrieve connector: the repo's FAISS path under the family contract. + +Canonical dense concrete for the ``retrieve`` family (issue #36): the same +FAISS nearest-neighbor search the stage pipeline's ``retrieval`` stage runs +(``feature_groups/rag_pipeline/retrieval``), folded in behind the family's +``query_text + corpus + top_k -> ranked passages`` contract. The stage searches +a pre-built on-disk index; this connector serves the family's inline-corpus +contract by embedding the corpus per call and searching an in-memory index. 
+Both paths emit the same passage row shape, so a downstream feature is agnostic +to which produced it (see ``tests/integration/test_stage_connector_parity.py``). + +Embeddings come from the stage pipeline's deterministic +:class:`~rag_integration.feature_groups.rag_pipeline.embedding.hash_embed.HashEmbedder` +(zero-download, unit-normalized, query embedded independently of the corpus, +exactly how a dense bi-encoder behaves). Requires the ``faiss`` extra +(``faiss-cpu``), like the rest of the FAISS path. +""" + +from __future__ import annotations + +from typing import List, Tuple + +import numpy as np + +from rag_integration.feature_groups.connectors.retrieve.base import BaseRetrieveConnector +from rag_integration.feature_groups.rag_pipeline.embedding.hash_embed import HashEmbedder + + +class FaissDenseRetriever(BaseRetrieveConnector): + """Dense FAISS retrieval over an inline corpus (``retrieve_backend="faiss"``). + + Embeds corpus and query with the repo's deterministic ``HashEmbedder``, + builds an in-memory ``IndexFlatIP`` over the corpus vectors, and ranks by + inner product. The embedder L2-normalizes every vector, so the inner + product is the cosine similarity and scores are higher-is-more-relevant. + The corpus is per-call, so there is no shared state to cache and repeated + calls are idempotent. Family rule: at most ``top_k`` passages come back and + only those scoring positively, so a degenerate query (empty, or sharing no + hashed terms with the corpus) yields no passages. + """ + + # The embedder hashes terms into a fixed-width vector; 384 is its own + # default and is ample for the small inline corpora this family serves. + # ``model_name`` is ignored by the hash embedder. 
+ _EMBED_DIM = 384 + + RETRIEVE_BACKENDS = { + "faiss": "Dense FAISS retrieval (cosine over deterministic hash embeddings)", + } + + PROPERTY_MAPPING = { + BaseRetrieveConnector.RETRIEVE_BACKEND: {"explanation": "Use 'faiss' for dense FAISS retrieval"}, + BaseRetrieveConnector.QUERY_TEXT: {"explanation": "Raw text query to search the corpus"}, + BaseRetrieveConnector.TOP_K: { + "explanation": f"Number of passages to return (default {BaseRetrieveConnector.DEFAULT_TOP_K})" + }, + BaseRetrieveConnector.CORPUS: {"explanation": "Inline corpus: a list of {doc_id, text} dicts"}, + } + + @classmethod + def _rank(cls, query: str, texts: List[str], top_k: int) -> List[Tuple[int, float]]: + import faiss + + # Embed corpus and query with the same stage-pipeline embedder. Each + # text embeds independently (no shared vocabulary), so the query is + # embedded exactly as a dense bi-encoder would embed it. + vectors = HashEmbedder._embed_texts(list(texts) + [query], cls._EMBED_DIM, "default") + corpus_array = np.array(vectors[:-1], dtype=np.float32) + query_array = np.array([vectors[-1]], dtype=np.float32) + + # Vectors are unit-length, so inner product == cosine similarity and + # FAISS returns the pairs best-first. + index = faiss.IndexFlatIP(cls._EMBED_DIM) + index.add(corpus_array) + scores, indices = index.search(query_array, top_k) + + pairs = [(int(idx), float(score)) for idx, score in zip(indices[0], scores[0])] + # Family rule: only positively scoring passages are returned. This also + # covers the degenerate query: it embeds to a zero or orthogonal + # vector, every cosine is <= 0, and no pair survives the filter. 
+ return [(idx, score) for idx, score in pairs if score > 0.0] diff --git a/rag_integration/feature_groups/rag_pipeline/llm_response/base.py b/rag_integration/feature_groups/rag_pipeline/llm_response/base.py index 148d111..1f3ba84 100644 --- a/rag_integration/feature_groups/rag_pipeline/llm_response/base.py +++ b/rag_integration/feature_groups/rag_pipeline/llm_response/base.py @@ -28,9 +28,16 @@ class BaseLLMResponse(FeatureGroup): - system_prompt: System prompt for the LLM (optional, has default) - llm_method: Which LLM implementation to use (discriminator) - Output rows contain: llm_response (the generated text) + Output rows contain: llm_response (the generated text) and the canonical + answer object under ANSWER_KEY (same shape as the generate connector + family), so a downstream feature is agnostic to which produced it. """ + # Mirrors BaseGenerateConnector.ROOT_FEATURE_NAME (kept as a literal here so + # the stage layer does not import the connectors layer; the parity test + # asserts the two stay equal). + ANSWER_KEY = "generated_answer" + QUERY = "query" CONTEXT = "context" SYSTEM_PROMPT = "system_prompt" @@ -64,7 +71,7 @@ def compute_framework_rule(cls) -> Optional[Set[Type[ComputeFramework]]]: @classmethod def input_data(cls) -> DataCreator: - return DataCreator({"llm_response"}) + return DataCreator({"llm_response", cls.ANSWER_KEY}) @classmethod def match_feature_group_criteria( @@ -73,8 +80,18 @@ def match_feature_group_criteria( options: Options, data_access_collection: Any = None, ) -> bool: - """Match features named 'llm_response' exactly.""" - return feature_name == "llm_response" + """Match 'llm_response', or the canonical answer feature for stage options. + + Serving ANSWER_KEY makes migration a pure option swap: a downstream + feature keeps requesting the same name whether a generate connector + (query_text + passages) or this stage (query + context) produces it. 
+ The gate on the stage's defining option (query) keeps the two worlds + from both claiming one request. + """ + name = str(feature_name) + if name == "llm_response": + return True + return name == cls.ANSWER_KEY and options.get(cls.QUERY) is not None def input_features(self, options: Options, feature_name: FeatureName) -> None: """Root feature: no input features.""" @@ -134,6 +151,10 @@ def calculate_feature(cls, data: Any, features: FeatureSet) -> List[Dict[str, An system_prompt = cls._get_system_prompt(options) response = cls._generate(query, context, system_prompt, options) - return [{"llm_response": response}] + # The canonical answer object mirrors the generate connector + # family's shape. The stage has no doc_id-tracked passages, so the + # citation list is honestly empty rather than fabricated. + answer = {"answer": response, "citations": []} + return [{"llm_response": response, cls.ANSWER_KEY: answer}] return [] diff --git a/rag_integration/feature_groups/rag_pipeline/retrieval/base.py b/rag_integration/feature_groups/rag_pipeline/retrieval/base.py index b32b270..50ecc00 100644 --- a/rag_integration/feature_groups/rag_pipeline/retrieval/base.py +++ b/rag_integration/feature_groups/rag_pipeline/retrieval/base.py @@ -34,9 +34,17 @@ class BaseRetriever(FeatureGroup): - query_text: Raw text query (requires embedding_method) - embedding_method: Which embedder to use for query_text - Output rows contain: indices, distances, texts, doc_ids + Output rows contain: indices, distances, texts, doc_ids, and the canonical + ranked-passage list under PASSAGES_KEY (same shape as the retrieve + connector family), so a downstream feature is agnostic to whether a stage + or a connector produced its passages. """ + # Mirrors BaseRetrieveConnector.ROOT_FEATURE_NAME (kept as a literal here so + # the stage layer does not import the connectors layer; the parity test + # asserts the two stay equal). 
+ PASSAGES_KEY = "retrieved_passages" + TOP_K = "top_k" QUERY_EMBEDDING = "query_embedding" QUERY_TEXT = "query_text" @@ -77,7 +85,7 @@ def compute_framework_rule(cls) -> Optional[Set[Type[ComputeFramework]]]: @classmethod def input_data(cls) -> DataCreator: - return DataCreator({"retrieved"}) + return DataCreator({"retrieved", cls.PASSAGES_KEY}) @classmethod def match_feature_group_criteria( @@ -86,8 +94,18 @@ def match_feature_group_criteria( options: Options, data_access_collection: Any = None, ) -> bool: - """Match features named 'retrieved' exactly.""" - return feature_name == "retrieved" + """Match 'retrieved', or the canonical passage feature for index-backed options. + + Serving PASSAGES_KEY makes migration a pure option swap: a downstream + feature keeps requesting the same name whether a retrieve connector + (inline corpus) or this stage (pre-built index) produces it. The gate + on the stage's defining option (index_path) keeps the two worlds from + both claiming one request. + """ + name = str(feature_name) + if name == "retrieved": + return True + return name == cls.PASSAGES_KEY and options.get(cls.INDEX_PATH) is not None def input_features(self, options: Options, feature_name: FeatureName) -> None: """Root feature: no input features.""" @@ -146,6 +164,39 @@ def _search( """ ... + @classmethod + def _to_passages(cls, results: Dict[str, Any]) -> List[Dict[str, Any]]: + """Convert a :meth:`_search` result into the canonical ranked-passage shape. + + The shape is the retrieve connector family's contract: + ``[{doc_id, text, score, rank}]``, best first. FAISS distances are + lower-is-better, so ``score`` is ``1 / (1 + distance)``: positive, + monotone, and higher-is-more-relevant. Without metadata the ``doc_id`` + falls back to the index position (mirroring the connector family's + positional fallback) and ``text`` to ``""``. 
+ """ + indices = results.get("indices", []) + distances = results.get("distances", []) + texts = results.get("texts", []) + doc_ids = results.get("doc_ids", []) + + passages: List[Dict[str, Any]] = [] + for rank, distance in enumerate(distances): + if rank < len(doc_ids): + doc_id = str(doc_ids[rank]) + else: + doc_id = str(indices[rank]) if rank < len(indices) else str(rank) + text = str(texts[rank]) if rank < len(texts) else "" + passages.append( + { + "doc_id": doc_id, + "text": text, + "score": 1.0 / (1.0 + float(distance)), + "rank": rank, + } + ) + return passages + @classmethod def calculate_feature(cls, data: Any, features: FeatureSet) -> List[Dict[str, Any]]: """Run retrieval: embed query if needed, search index, return results.""" @@ -172,6 +223,6 @@ def calculate_feature(cls, data: Any, features: FeatureSet) -> List[Dict[str, An top_k = cls._get_top_k(options) results = cls._search(query_vector, top_k, options) - return [{"retrieved": results, **results}] + return [{"retrieved": results, **results, cls.PASSAGES_KEY: cls._to_passages(results)}] return [] diff --git a/tests/connectors/retrieve/test_faiss_retriever.py b/tests/connectors/retrieve/test_faiss_retriever.py new file mode 100644 index 0000000..995dd55 --- /dev/null +++ b/tests/connectors/retrieve/test_faiss_retriever.py @@ -0,0 +1,45 @@ +"""Contract test for :class:`FaissDenseRetriever` (canonical dense backend). + +The whole suite is inherited from :class:`RetrieveConnectorContractBase`; this +class only wires up the five adapter methods. The corpus is crafted for the +hash embedder, which splits on whitespace without stripping punctuation: the +overlapping tokens are exact whitespace tokens (no trailing periods), the query +shares ``cat``/``pet`` with ``d2`` and just ``pet`` with ``d1`` (a positively +scoring runner-up for the score-margin assertion), and the distractors share no +token with the query, so their cosine is zero and the family drops them. 
+""" + +from __future__ import annotations + +from typing import Any, Dict, List, Type + +from rag_integration.feature_groups.connectors.retrieve.base import BaseRetrieveConnector +from rag_integration.feature_groups.connectors.retrieve.faiss_retriever import FaissDenseRetriever +from tests.connectors.retrieve.retrieve_contract import RetrieveConnectorContractBase + + +class TestFaissDenseRetriever(RetrieveConnectorContractBase): + @classmethod + def connector_class(cls) -> Type[BaseRetrieveConnector]: + return FaissDenseRetriever + + @classmethod + def backend_value(cls) -> str: + return "faiss" + + @classmethod + def sample_corpus(cls) -> List[Dict[str, Any]]: + return [ + {"doc_id": "d0", "text": "the mat lay flat on the floor by the window"}, + {"doc_id": "d1", "text": "a dog can be a loyal and energetic pet"}, + {"doc_id": "d2", "text": "a cat is an independent and curious pet"}, + {"doc_id": "d3", "text": "cars need regular engine oil and maintenance"}, + ] + + @classmethod + def sample_query(cls) -> str: + return "cat pet" + + @classmethod + def expected_top_doc_id(cls) -> str: + return "d2" diff --git a/tests/integration/test_stage_connector_parity.py b/tests/integration/test_stage_connector_parity.py new file mode 100644 index 0000000..8857509 --- /dev/null +++ b/tests/integration/test_stage_connector_parity.py @@ -0,0 +1,203 @@ +"""Stage <-> connector migration-seam parity tests (issue #36). + +The seam: the stage pipeline (build-your-own) and the connector families +(bring an existing tool) emit the same passage / answer row shape under the +same canonical feature name, so a downstream feature is agnostic to which +produced it and migration is a swap of options, not a pipeline rewrite. 
+ +Verified here end to end: the FAISS ``retrieval`` stage (pre-built on-disk +index) and the dense ``retrieve`` connector (inline corpus) answer the same +query over the same documents with identically shaped, identically ordered +passages; the ``llm_response`` stage and the ``generate`` connector emit the +same answer-object shape. +""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any, Dict, List + +import faiss +import numpy as np + +from mloda.user import mlodaAPI, Feature, Options, PluginCollector +from mloda.provider import FeatureGroup +from mloda_plugins.compute_framework.base_implementations.python_dict.python_dict_framework import ( + PythonDictFramework, +) + +from rag_integration.feature_groups.connectors.generate.base import BaseGenerateConnector +from rag_integration.feature_groups.connectors.generate.extractive_responder import ExtractiveResponder +from rag_integration.feature_groups.connectors.retrieve.base import BaseRetrieveConnector +from rag_integration.feature_groups.connectors.retrieve.faiss_retriever import FaissDenseRetriever +from rag_integration.feature_groups.rag_pipeline.embedding.hash_embed import HashEmbedder +from rag_integration.feature_groups.rag_pipeline.llm_response.base import BaseLLMResponse +from rag_integration.feature_groups.rag_pipeline.retrieval.base import BaseRetriever +from rag_integration.feature_groups.rag_pipeline.retrieval.faiss_retriever import FaissRetriever +from tests.integration.helpers import flatten_result + +# Crafted for the hash embedder (whitespace tokens, no punctuation): the query +# shares two tokens with d2 and one with d1; the distractors share none, so +# their cosine is zero and the connector family drops them. 
+CORPUS = [ + {"doc_id": "d0", "text": "the mat lay flat on the floor by the window"}, + {"doc_id": "d1", "text": "a dog can be a loyal and energetic pet"}, + {"doc_id": "d2", "text": "a cat is an independent and curious pet"}, + {"doc_id": "d3", "text": "cars need regular engine oil and maintenance"}, +] +QUERY = "cat pet" +TOP_K = 2 +EMBED_DIM = 384 + + +class _StubLLMResponse(BaseLLMResponse): + """Deterministic offline stand-in for an LLM stage backend.""" + + LLM_METHODS = {"stub": "Deterministic stub for tests"} + + @classmethod + def match_feature_group_criteria( + cls, + feature_name: Any, + options: Options, + data_access_collection: Any = None, + ) -> bool: + # Gate on the stub's own selector so this test-only group can never + # claim a request from an unrelated test sharing the process. + if options.get(cls.LLM_METHOD) != "stub": + return False + return bool(super().match_feature_group_criteria(feature_name, options, data_access_collection)) + + @classmethod + def _generate(cls, query: str, context: str, system_prompt: str, options: Options) -> str: + return f"stub answer to: {query}" + + +def _run_one(feature: Feature, groups: set[type[FeatureGroup]], key: str) -> Any: + result = mlodaAPI.run_all( + [feature], + compute_frameworks={PythonDictFramework}, + plugin_collector=PluginCollector.enabled_feature_groups(groups), + ) + for row in flatten_result(result): + if key in row: + return row[key] + raise AssertionError(f"run_all returned no '{key}' row: {result!r}") + + +def _build_stage_index(tmp_path: Path) -> tuple[str, str]: + """Build the on-disk FAISS index + metadata sidecar the stage consumes.""" + texts = [str(doc["text"]) for doc in CORPUS] + vectors = HashEmbedder._embed_texts(texts, EMBED_DIM, "default") + index = faiss.IndexFlatL2(EMBED_DIM) + index.add(np.array(vectors, dtype=np.float32)) + + index_path = str(tmp_path / "parity_index.faiss") + faiss.write_index(index, index_path) + + metadata = {"texts": texts, "doc_ids": [str(doc["doc_id"]) 
for doc in CORPUS]} + metadata_path = str(tmp_path / "parity_metadata.json") + with open(metadata_path, "w", encoding="utf-8") as f: + json.dump(metadata, f) + + FaissRetriever._index_cache = None + FaissRetriever._metadata_cache = None + return index_path, metadata_path + + +def _assert_passage_shape(passages: List[Dict[str, Any]]) -> None: + assert isinstance(passages, list) + assert passages, "no passages came back; parity assertions would be vacuous" + for rank, passage in enumerate(passages): + assert set(passage) == {"doc_id", "text", "score", "rank"} + assert isinstance(passage["doc_id"], str) + assert isinstance(passage["text"], str) + assert isinstance(passage["score"], float) + assert passage["rank"] == rank + scores = [p["score"] for p in passages] + assert scores == sorted(scores, reverse=True) + + +class TestCanonicalFeatureNames: + def test_stage_keys_match_connector_root_feature_names(self) -> None: + """The stage-side literals must stay equal to the connector contract names.""" + assert BaseRetriever.PASSAGES_KEY == BaseRetrieveConnector.ROOT_FEATURE_NAME + assert BaseLLMResponse.ANSWER_KEY == BaseGenerateConnector.ROOT_FEATURE_NAME + + +class TestRetrieveSeamParity: + def test_stage_and_connector_emit_same_passage_rows(self, tmp_path: Path) -> None: + index_path, metadata_path = _build_stage_index(tmp_path) + + stage_feature = Feature( + BaseRetriever.PASSAGES_KEY, + options=Options( + context={ + "index_path": index_path, + "metadata_path": metadata_path, + "query_text": QUERY, + "embedding_method": "hash", + "top_k": TOP_K, + } + ), + ) + stage_passages = _run_one(stage_feature, {FaissRetriever}, BaseRetriever.PASSAGES_KEY) + + connector_feature = Feature( + BaseRetrieveConnector.ROOT_FEATURE_NAME, + options=Options( + context={ + "retrieve_backend": "faiss", + "query_text": QUERY, + "corpus": CORPUS, + "top_k": TOP_K, + } + ), + ) + connector_passages = _run_one(connector_feature, {FaissDenseRetriever}, BaseRetrieveConnector.ROOT_FEATURE_NAME) + 
+ _assert_passage_shape(stage_passages) + _assert_passage_shape(connector_passages) + + # Same embedder, same vectors: both paths must rank the same documents + # in the same order (scores differ in scale: cosine vs 1/(1+L2)). + assert [p["doc_id"] for p in stage_passages] == [p["doc_id"] for p in connector_passages] + assert [p["text"] for p in stage_passages] == [p["text"] for p in connector_passages] + assert stage_passages[0]["doc_id"] == "d2" + + +class TestGenerateSeamParity: + def test_stage_and_connector_emit_same_answer_shape(self) -> None: + query = "what is a cat" + + stage_feature = Feature( + BaseLLMResponse.ANSWER_KEY, + options=Options( + context={ + "query": query, + "context": [str(doc["text"]) for doc in CORPUS], + "llm_method": "stub", + } + ), + ) + stage_answer = _run_one(stage_feature, {_StubLLMResponse}, BaseLLMResponse.ANSWER_KEY) + + connector_feature = Feature( + BaseGenerateConnector.ROOT_FEATURE_NAME, + options=Options( + context={ + "generate_backend": "extractive", + "query_text": query, + "passages": [{"doc_id": "d2", "text": "a cat is an independent and curious pet"}], + } + ), + ) + connector_answer = _run_one(connector_feature, {ExtractiveResponder}, BaseGenerateConnector.ROOT_FEATURE_NAME) + + for answer in (stage_answer, connector_answer): + assert set(answer) == {"answer", "citations"} + assert isinstance(answer["answer"], str) + assert answer["answer"], "answer must be non-empty for a shape comparison that means anything" + assert isinstance(answer["citations"], list) + assert all(isinstance(c, str) for c in answer["citations"]) From 9f8b15e38c5700753e2199ed18a41c6ee4c721f3 Mon Sep 17 00:00:00 2001 From: Tom Kaltofen Date: Wed, 10 Jun 2026 14:50:50 +0000 Subject: [PATCH 9/9] fix: align stage passage scores and matching with the retrieve family contract Review follow-ups: the retrieval stage's canonical passages now carry true cosine scores (1 - squared_L2 / 2 over the repo's unit-normalized embeddings) and apply the family's 
only-positive-scores rule, so a no-match query yields no passages on both paths; blank metadata doc_ids fall back to the FAISS index position; and both stages yield the canonical feature name to an explicit connector selector so mixed options cannot double-match. Parity test extended accordingly. --- docs/rag-connector-base-classes.md | 19 +++++--- .../rag_pipeline/llm_response/base.py | 9 +++- .../rag_pipeline/retrieval/base.py | 48 +++++++++++++------ .../test_stage_connector_parity.py | 46 +++++++++++++++++- 4 files changed, 100 insertions(+), 22 deletions(-) diff --git a/docs/rag-connector-base-classes.md b/docs/rag-connector-base-classes.md index 10f00ee..c43cbdb 100644 --- a/docs/rag-connector-base-classes.md +++ b/docs/rag-connector-base-classes.md @@ -235,13 +235,20 @@ the seam between them is pinned down in three ways (issue #36): `retrieval` stage serves the same search over a pre-built on-disk index. The `retrieve` family thus has lexical (`bm25s`, `tfidf`) and dense (`faiss`) backends under one contract. -- **Same row shape, same feature name.** A connector and the corresponding - stage emit the same passage / answer row shape under the same canonical - feature name: the `retrieval` stage also serves `retrieved_passages` - (`[{doc_id, text, score, rank}]`, score higher-is-better) when its - `index_path` option is present, and the `llm_response` stage also serves +- **Same row shape, same feature name, same rules.** A connector and the + corresponding stage emit the same passage / answer row shape under the same + canonical feature name: the `retrieval` stage also serves + `retrieved_passages` (`[{doc_id, text, score, rank}]`) when its `index_path` + option is present, and the `llm_response` stage also serves `generated_answer` (`{answer, citations}`) when its `query` option is - present. A downstream feature is agnostic to which produced it. Verified by + present. 
The stage's passage `score` is the cosine similarity (the repo's + embedders are unit-normalized, so FAISS's squared L2 converts exactly), the + same scale the dense connector emits, and the family's only-positive-scores + rule applies on both paths, so a no-match query yields no passages either + way. If a request carries an explicit connector selector + (`retrieve_backend` / `generate_backend`), the stage yields and the + connector serves it. A downstream feature is agnostic to which produced it. + Verified by [`tests/integration/test_stage_connector_parity.py`](../tests/integration/test_stage_connector_parity.py). - **Migration is an option swap, not a pipeline rewrite.** Moving between a stage and a connector (either direction) keeps the requested feature name diff --git a/rag_integration/feature_groups/rag_pipeline/llm_response/base.py b/rag_integration/feature_groups/rag_pipeline/llm_response/base.py index 1f3ba84..de5c156 100644 --- a/rag_integration/feature_groups/rag_pipeline/llm_response/base.py +++ b/rag_integration/feature_groups/rag_pipeline/llm_response/base.py @@ -91,7 +91,14 @@ def match_feature_group_criteria( name = str(feature_name) if name == "llm_response": return True - return name == cls.ANSWER_KEY and options.get(cls.QUERY) is not None + if name != cls.ANSWER_KEY: + return False + # Yield to an explicit generate-connector backend: with mixed options + # (e.g. a half-finished migration) the connector wins instead of both + # groups claiming the request. 
+ if options.get("generate_backend") is not None: + return False + return options.get(cls.QUERY) is not None def input_features(self, options: Options, feature_name: FeatureName) -> None: """Root feature: no input features.""" diff --git a/rag_integration/feature_groups/rag_pipeline/retrieval/base.py b/rag_integration/feature_groups/rag_pipeline/retrieval/base.py index 50ecc00..6deac95 100644 --- a/rag_integration/feature_groups/rag_pipeline/retrieval/base.py +++ b/rag_integration/feature_groups/rag_pipeline/retrieval/base.py @@ -105,7 +105,14 @@ def match_feature_group_criteria( name = str(feature_name) if name == "retrieved": return True - return name == cls.PASSAGES_KEY and options.get(cls.INDEX_PATH) is not None + if name != cls.PASSAGES_KEY: + return False + # Yield to an explicit retrieve-connector backend: with mixed options + # (e.g. a half-finished migration) the connector wins instead of both + # groups claiming the request. + if options.get("retrieve_backend") is not None: + return False + return options.get(cls.INDEX_PATH) is not None def input_features(self, options: Options, feature_name: FeatureName) -> None: """Root feature: no input features.""" @@ -164,16 +171,25 @@ def _search( """ ... + # Cosine scores within this margin of zero are float32 noise around + # orthogonality (a no-match hit), not relevance; the family rule drops them. + _SCORE_EPSILON = 1e-6 + @classmethod def _to_passages(cls, results: Dict[str, Any]) -> List[Dict[str, Any]]: """Convert a :meth:`_search` result into the canonical ranked-passage shape. - The shape is the retrieve connector family's contract: - ``[{doc_id, text, score, rank}]``, best first. FAISS distances are - lower-is-better, so ``score`` is ``1 / (1 + distance)``: positive, - monotone, and higher-is-more-relevant. Without metadata the ``doc_id`` - falls back to the index position (mirroring the connector family's - positional fallback) and ``text`` to ``""``. 
+ Shape and rules are the retrieve connector family's contract: + ``[{doc_id, text, score, rank}]``, best first, only positively scoring + passages. All of the repo's embedders L2-normalize, so the squared L2 + distance FAISS returns relates to cosine as ``cos = 1 - distance / 2``; + ``score`` is that cosine, the same scale the dense retrieve connector + emits, and the positive-score filter drops no-match hits exactly as the + family does (an index built from non-normalized vectors sees its + out-of-range hits filtered too; the raw ``distances`` stay available + unfiltered in the row). A blank or missing ``doc_id`` falls back to the + FAISS index position (mirroring the family's positional fallback) and a + missing ``text`` to ``""``. """ indices = results.get("indices", []) distances = results.get("distances", []) @@ -181,18 +197,22 @@ def _to_passages(cls, results: Dict[str, Any]) -> List[Dict[str, Any]]: doc_ids = results.get("doc_ids", []) passages: List[Dict[str, Any]] = [] - for rank, distance in enumerate(distances): - if rank < len(doc_ids): - doc_id = str(doc_ids[rank]) + for i, distance in enumerate(distances): + score = 1.0 - float(distance) / 2.0 + if score <= cls._SCORE_EPSILON: + continue + raw_doc_id = doc_ids[i] if i < len(doc_ids) else None + if raw_doc_id is not None and str(raw_doc_id) != "": + doc_id = str(raw_doc_id) else: - doc_id = str(indices[rank]) if rank < len(indices) else str(rank) - text = str(texts[rank]) if rank < len(texts) else "" + doc_id = str(indices[i]) if i < len(indices) else str(i) + text = str(texts[i]) if i < len(texts) else "" passages.append( { "doc_id": doc_id, "text": text, - "score": 1.0 / (1.0 + float(distance)), - "rank": rank, + "score": score, + "rank": len(passages), } ) return passages diff --git a/tests/integration/test_stage_connector_parity.py b/tests/integration/test_stage_connector_parity.py index 8857509..891fe2d 100644 --- a/tests/integration/test_stage_connector_parity.py +++ 
b/tests/integration/test_stage_connector_parity.py @@ -20,6 +20,7 @@ import faiss import numpy as np +import pytest from mloda.user import mlodaAPI, Feature, Options, PluginCollector from mloda.provider import FeatureGroup @@ -125,6 +126,21 @@ def test_stage_keys_match_connector_root_feature_names(self) -> None: assert BaseRetriever.PASSAGES_KEY == BaseRetrieveConnector.ROOT_FEATURE_NAME assert BaseLLMResponse.ANSWER_KEY == BaseGenerateConnector.ROOT_FEATURE_NAME + def test_stage_yield_gates_match_connector_selector_keys(self) -> None: + """The selector literals the stage gates yield on must stay the family selectors.""" + assert BaseRetrieveConnector.RETRIEVE_BACKEND == "retrieve_backend" + assert BaseGenerateConnector.GENERATE_BACKEND == "generate_backend" + + def test_stage_yields_canonical_name_to_explicit_connector_backend(self) -> None: + """With mixed options (half-finished migration) only the connector claims the request.""" + mixed_retrieve = Options(context={"index_path": "some/index.faiss", "retrieve_backend": "faiss"}) + assert FaissRetriever.match_feature_group_criteria(BaseRetriever.PASSAGES_KEY, mixed_retrieve) is False + assert FaissDenseRetriever.match_feature_group_criteria(BaseRetrieveConnector.ROOT_FEATURE_NAME, mixed_retrieve) + + mixed_generate = Options(context={"query": "q", "llm_method": "stub", "generate_backend": "extractive"}) + assert _StubLLMResponse.match_feature_group_criteria(BaseLLMResponse.ANSWER_KEY, mixed_generate) is False + assert ExtractiveResponder.match_feature_group_criteria(BaseGenerateConnector.ROOT_FEATURE_NAME, mixed_generate) + class TestRetrieveSeamParity: def test_stage_and_connector_emit_same_passage_rows(self, tmp_path: Path) -> None: @@ -161,11 +177,39 @@ def test_stage_and_connector_emit_same_passage_rows(self, tmp_path: Path) -> Non _assert_passage_shape(connector_passages) # Same embedder, same vectors: both paths must rank the same documents - # in the same order (scores differ in scale: cosine vs 1/(1+L2)). 
+ # in the same order with the same cosine scores (the stage converts the + # squared L2 distance over unit vectors back to cosine). assert [p["doc_id"] for p in stage_passages] == [p["doc_id"] for p in connector_passages] assert [p["text"] for p in stage_passages] == [p["text"] for p in connector_passages] + for stage_passage, connector_passage in zip(stage_passages, connector_passages): + assert stage_passage["score"] == pytest.approx(connector_passage["score"], abs=1e-5) assert stage_passages[0]["doc_id"] == "d2" + def test_no_match_query_returns_empty_on_both_paths(self, tmp_path: Path) -> None: + """Family rule parity: a query relevant to nothing yields no passages + from the connector AND from the stage (no fabricated-positive scores).""" + index_path, metadata_path = _build_stage_index(tmp_path) + + stage_feature = Feature( + BaseRetriever.PASSAGES_KEY, + options=Options( + context={ + "index_path": index_path, + "metadata_path": metadata_path, + "query_text": "zzzz qqqq", + "embedding_method": "hash", + "top_k": TOP_K, + } + ), + ) + stage_passages = _run_one(stage_feature, {FaissRetriever}, BaseRetriever.PASSAGES_KEY) + + connector = FaissDenseRetriever() + connector_passages = connector._retrieve("zzzz qqqq", CORPUS, TOP_K) + + assert stage_passages == [] + assert connector_passages == [] + class TestGenerateSeamParity: def test_stage_and_connector_emit_same_answer_shape(self) -> None: