diff --git a/examples/cross_encoder/evaluation/evaluation_nano_cross_encoder_bm25.py b/examples/cross_encoder/evaluation/evaluation_nano_cross_encoder_bm25.py new file mode 100644 index 000000000..ed0b7bc65 --- /dev/null +++ b/examples/cross_encoder/evaluation/evaluation_nano_cross_encoder_bm25.py @@ -0,0 +1,63 @@ +"""Simple CrossEncoder NanoBEIR reranking example. + +Run: + uv run --with datasets python examples/cross_encoder/evaluation/evaluation_nano_cross_encoder_bm25.py +""" + +import logging + +from sentence_transformers import CrossEncoder +from sentence_transformers.cross_encoder.evaluation import CrossEncoderNanoBEIREvaluator +from sentence_transformers.cross_encoder.evaluation.nano_beir import DATASET_NAME_TO_HUMAN_READABLE + +logging.basicConfig(format="%(message)s", level=logging.INFO) + +MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L6-v2" +DATASET_ID = "sentence-transformers/NanoBEIR-en" +DATASET_SPLITS = ["msmarco", "nq"] +RERANK_K = 100 + +model = CrossEncoder(MODEL_NAME) +evaluator = CrossEncoderNanoBEIREvaluator( + dataset_id=DATASET_ID, + dataset_names=DATASET_SPLITS, + rerank_k=RERANK_K, + at_k=10, + batch_size=32, + show_progress_bar=False, +) + +results = evaluator(model) +if evaluator.primary_metric is None: + raise ValueError("Expected evaluator.primary_metric to be set after evaluation.") + +primary_metric = evaluator.primary_metric +if primary_metric not in results: + primary_metric = f"{evaluator.name}_{primary_metric}" +if primary_metric not in results: + raise ValueError(f"Primary metric key not found: {primary_metric}") + +""" +Example output (actual run in this repo, to be updated if defaults change): + Model: cross-encoder/ms-marco-MiniLM-L6-v2 + Dataset: sentence-transformers/NanoBEIR-en + Splits: ['msmarco', 'nq'] + Split scores: + - NanoMSMARCO_R100_ndcg@10 = 0.6686 + - NanoNQ_R100_ndcg@10 = 0.7599 + Primary metric key: NanoBEIR_R100_mean_ndcg@10 + Primary metric value: 0.7142 +""" + +print(f"Model: {MODEL_NAME}") +print(f"Dataset: 
{DATASET_ID}") +print(f"Splits: {DATASET_SPLITS}") +metric_suffix = primary_metric.split("_mean_", maxsplit=1)[1] +print("Split scores:") +for split_name in DATASET_SPLITS: + human_readable = DATASET_NAME_TO_HUMAN_READABLE[split_name.lower()] + split_key = f"Nano{human_readable}_R{RERANK_K}_{metric_suffix}" + if split_key in results: + print(f"- {split_key} = {float(results[split_key]):.4f}") +print(f"Primary metric key: {primary_metric}") +print(f"Primary metric value: {float(results[primary_metric]):.4f}") diff --git a/examples/sentence_transformer/evaluation/evaluation_nano_dense_miracl.py b/examples/sentence_transformer/evaluation/evaluation_nano_dense_miracl.py new file mode 100644 index 000000000..7d319d5e1 --- /dev/null +++ b/examples/sentence_transformer/evaluation/evaluation_nano_dense_miracl.py @@ -0,0 +1,54 @@ +"""Simple NanoEvaluator example on NanoMIRACL. + +Run: + uv run --with datasets python examples/sentence_transformer/evaluation/evaluation_nano_dense_miracl.py +""" + +import logging + +from sentence_transformers import SentenceTransformer +from sentence_transformers.evaluation import NanoEvaluator + +logging.basicConfig(format="%(message)s", level=logging.INFO) + +# Keep this example light: evaluate two language splits. 
+MODEL_NAME = "intfloat/multilingual-e5-small" +DATASET_ID = "hotchpotch/NanoMIRACL" +DATASET_SPLITS = ["en", "ja"] + +model = SentenceTransformer(MODEL_NAME) +evaluator = NanoEvaluator( + dataset_id=DATASET_ID, + dataset_names=DATASET_SPLITS, + batch_size=32, + show_progress_bar=False, +) + +results = evaluator(model) +""" +Example output (actual run in this repo, to be updated if defaults change): + Model: intfloat/multilingual-e5-small + Dataset: hotchpotch/NanoMIRACL + Splits: ['en', 'ja'] + Split scores: + - NanoMIRACL_en_cosine_ndcg@10 = 0.6901 + - NanoMIRACL_ja_cosine_ndcg@10 = 0.7168 + Primary metric key: NanoMIRACL_mean_cosine_ndcg@10 + Primary metric value: 0.7034 +""" + +primary_metric = evaluator.primary_metric +if primary_metric is None: + raise ValueError("Expected evaluator.primary_metric to be set after evaluation.") + +print(f"Model: {MODEL_NAME}") +print(f"Dataset: {DATASET_ID}") +print(f"Splits: {DATASET_SPLITS}") +metric_suffix = primary_metric.split("_mean_", maxsplit=1)[1] +print("Split scores:") +for split_name in DATASET_SPLITS: + split_key = f"NanoMIRACL_{split_name}_{metric_suffix}" + if split_key in results: + print(f"- {split_key} = {results[split_key]:.4f}") +print(f"Primary metric key: {primary_metric}") +print(f"Primary metric value: {results[primary_metric]:.4f}") diff --git a/examples/sentence_transformer/evaluation/evaluation_nano_dense_multidataset_macro.py b/examples/sentence_transformer/evaluation/evaluation_nano_dense_multidataset_macro.py new file mode 100644 index 000000000..4121e6c90 --- /dev/null +++ b/examples/sentence_transformer/evaluation/evaluation_nano_dense_multidataset_macro.py @@ -0,0 +1,73 @@ +"""Simple dense multi-dataset Nano macro example. 
+ +Run: + uv run --with datasets python examples/sentence_transformer/evaluation/evaluation_nano_dense_multidataset_macro.py +""" + +import logging + +import numpy as np + +from sentence_transformers import SentenceTransformer +from sentence_transformers.evaluation import NanoEvaluator + +logging.basicConfig(format="%(message)s", level=logging.INFO) + +MODEL_NAME = "intfloat/multilingual-e5-small" +MULTILINGUAL_NANOBEIR_DATASET_IDS = [ + "sentence-transformers/NanoBEIR-en", + "LiquidAI/NanoBEIR-ja", +] +CUSTOM_DATASET_IDS = [ + "hotchpotch/NanoCodeSearchNet", +] + + +def evaluate_dataset(model: SentenceTransformer, dataset_id: str) -> tuple[str, str, float]: + evaluator = NanoEvaluator( + dataset_id=dataset_id, + dataset_names=None, + batch_size=32, + show_progress_bar=False, + ) + results = evaluator(model) + if evaluator.primary_metric is None: + raise ValueError(f"Expected evaluator.primary_metric for dataset_id={dataset_id}") + return dataset_id, evaluator.primary_metric, float(results[evaluator.primary_metric]) + + +model = SentenceTransformer(MODEL_NAME) + +multilingual_results = [evaluate_dataset(model, dataset_id) for dataset_id in MULTILINGUAL_NANOBEIR_DATASET_IDS] +custom_results = [evaluate_dataset(model, dataset_id) for dataset_id in CUSTOM_DATASET_IDS] + +multilingual_scores = [score for _, _, score in multilingual_results] +custom_scores = [score for _, _, score in custom_results] + +multilingual_macro = float(np.mean(multilingual_scores)) +custom_macro = float(np.mean(custom_scores)) +group_macro = float(np.mean([multilingual_macro, custom_macro])) + +""" +Example output (actual run in this repo, to be updated if defaults change): + Model: intfloat/multilingual-e5-small + Multilingual dataset scores: + - sentence-transformers/NanoBEIR-en | NanoBEIR-en_mean_cosine_ndcg@10 = 0.5542 + - LiquidAI/NanoBEIR-ja | NanoBEIR-ja_mean_cosine_ndcg@10 = 0.4985 + Custom dataset scores: + - hotchpotch/NanoCodeSearchNet | NanoCodeSearchNet_mean_cosine_ndcg@10 = 
0.7381 + Multilingual macro mean: 0.5263 + Custom macro mean: 0.7381 + Group macro mean: 0.6322 +""" + +print(f"Model: {MODEL_NAME}") +print("Multilingual dataset scores:") +for dataset_id, metric_key, score in multilingual_results: + print(f"- {dataset_id} | {metric_key} = {score:.4f}") +print("Custom dataset scores:") +for dataset_id, metric_key, score in custom_results: + print(f"- {dataset_id} | {metric_key} = {score:.4f}") +print(f"Multilingual macro mean: {multilingual_macro:.4f}") +print(f"Custom macro mean: {custom_macro:.4f}") +print(f"Group macro mean: {group_macro:.4f}") diff --git a/examples/sparse_encoder/evaluation/sparse_nano_multidataset_macro_evaluator.py b/examples/sparse_encoder/evaluation/sparse_nano_multidataset_macro_evaluator.py new file mode 100644 index 000000000..d6eeb5ede --- /dev/null +++ b/examples/sparse_encoder/evaluation/sparse_nano_multidataset_macro_evaluator.py @@ -0,0 +1,71 @@ +"""Simple sparse multi-dataset Nano macro example. + +Run: + uv run --with datasets python examples/sparse_encoder/evaluation/sparse_nano_multidataset_macro_evaluator.py +""" + +import logging + +import numpy as np + +from sentence_transformers import SparseEncoder +from sentence_transformers.sparse_encoder.evaluation import SparseNanoEvaluator + +logging.basicConfig(format="%(message)s", level=logging.INFO) + +MODEL_NAME = "sparse-encoder/example-inference-free-splade-distilbert-base-uncased-nq" +MULTILINGUAL_NANOBEIR_DATASET_IDS = [ + "sentence-transformers/NanoBEIR-en", +] +CUSTOM_DATASET_IDS = [ + "hotchpotch/NanoCodeSearchNet", +] + + +def evaluate_dataset(model: SparseEncoder, dataset_id: str) -> tuple[str, str, float]: + evaluator = SparseNanoEvaluator( + dataset_id=dataset_id, + dataset_names=None, + batch_size=32, + show_progress_bar=False, + ) + results = evaluator(model) + if evaluator.primary_metric is None: + raise ValueError(f"Expected evaluator.primary_metric for dataset_id={dataset_id}") + return dataset_id, evaluator.primary_metric, 
float(results[evaluator.primary_metric]) + + +model = SparseEncoder(MODEL_NAME) + +multilingual_results = [evaluate_dataset(model, dataset_id) for dataset_id in MULTILINGUAL_NANOBEIR_DATASET_IDS] +custom_results = [evaluate_dataset(model, dataset_id) for dataset_id in CUSTOM_DATASET_IDS] + +multilingual_scores = [score for _, _, score in multilingual_results] +custom_scores = [score for _, _, score in custom_results] + +multilingual_macro = float(np.mean(multilingual_scores)) +custom_macro = float(np.mean(custom_scores)) +group_macro = float(np.mean([multilingual_macro, custom_macro])) + +""" +Example output (actual run in this repo, to be updated if defaults change): + Model: sparse-encoder/example-inference-free-splade-distilbert-base-uncased-nq + Multilingual dataset scores: + - sentence-transformers/NanoBEIR-en | NanoBEIR-en_mean_dot_ndcg@10 = 0.5205 + Custom dataset scores: + - hotchpotch/NanoCodeSearchNet | NanoCodeSearchNet_mean_dot_ndcg@10 = 0.5867 + Multilingual macro mean: 0.5205 + Custom macro mean: 0.5867 + Group macro mean: 0.5536 +""" + +print(f"Model: {MODEL_NAME}") +print("Multilingual dataset scores:") +for dataset_id, metric_key, score in multilingual_results: + print(f"- {dataset_id} | {metric_key} = {score:.4f}") +print("Custom dataset scores:") +for dataset_id, metric_key, score in custom_results: + print(f"- {dataset_id} | {metric_key} = {score:.4f}") +print(f"Multilingual macro mean: {multilingual_macro:.4f}") +print(f"Custom macro mean: {custom_macro:.4f}") +print(f"Group macro mean: {group_macro:.4f}") diff --git a/sentence_transformers/cross_encoder/evaluation/__init__.py b/sentence_transformers/cross_encoder/evaluation/__init__.py index d41f87984..ccd21e3fc 100644 --- a/sentence_transformers/cross_encoder/evaluation/__init__.py +++ b/sentence_transformers/cross_encoder/evaluation/__init__.py @@ -12,6 +12,7 @@ CERerankingEvaluator, CESoftmaxAccuracyEvaluator, ) +from .nano_evaluator import CrossEncoderNanoEvaluator from .nano_beir import 
CrossEncoderNanoBEIREvaluator from .reranking import CrossEncoderRerankingEvaluator @@ -31,6 +32,7 @@ __all__ = [ "CrossEncoderClassificationEvaluator", "CrossEncoderCorrelationEvaluator", + "CrossEncoderNanoEvaluator", "CrossEncoderRerankingEvaluator", "CrossEncoderNanoBEIREvaluator", # Deprecated: diff --git a/sentence_transformers/cross_encoder/evaluation/nano_beir.py b/sentence_transformers/cross_encoder/evaluation/nano_beir.py index 9acb36db6..5e0b8c8a1 100644 --- a/sentence_transformers/cross_encoder/evaluation/nano_beir.py +++ b/sentence_transformers/cross_encoder/evaluation/nano_beir.py @@ -192,6 +192,8 @@ class CrossEncoderNanoBEIREvaluator(SentenceEvaluator): pprint({key: value for key, value in results.items() if "ndcg@10" in key}) """ + reranking_evaluator_class = CrossEncoderRerankingEvaluator + def __init__( self, dataset_names: list[DatasetNameType | str] | None = None, @@ -207,7 +209,6 @@ def __init__( ): super().__init__() if dataset_names is None: - # We exclude arguana and touche2020 because their Argument Retrieval meaningfully task differs from the others dataset_names = [key for key in DATASET_NAME_TO_HUMAN_READABLE if key not in ["arguana", "touche2020"]] self.dataset_names = dataset_names self.dataset_id = dataset_id @@ -220,7 +221,7 @@ def __init__( self.aggregate_fn = aggregate_fn self.aggregate_key = aggregate_key - self.name = f"NanoBEIR_R{rerank_k:d}_{self.aggregate_key}" + self.name = f"{self.description}_R{rerank_k:d}_{self.aggregate_key}" self._validate_dataset_names() @@ -234,10 +235,10 @@ def __init__( self.evaluators = [ self._load_dataset(name, **reranking_kwargs) - for name in tqdm(self.dataset_names, desc="Loading NanoBEIR datasets", leave=False) + for name in tqdm(self.dataset_names, desc=f"Loading {self.description} datasets", leave=False) ] - self.csv_file: str = f"NanoBEIR_evaluation_{aggregate_key}_results.csv" + self.csv_file: str = f"{self.description}_evaluation_{aggregate_key}_results.csv" self.csv_headers = ["epoch", 
"steps", "MAP", f"MRR@{self.at_k}", f"NDCG@{self.at_k}"] self.primary_metric = f"ndcg@{self.at_k}" @@ -254,17 +255,17 @@ def __call__( out_txt = f" in epoch {epoch} after {steps} steps" else: out_txt = "" - logger.info(f"NanoBEIR Evaluation of the model on {self.dataset_names} dataset{out_txt}:") + logger.info(f"{self.description} Evaluation of the model on {self.dataset_names} dataset{out_txt}:") for evaluator in tqdm(self.evaluators, desc="Evaluating datasets", disable=not self.show_progress_bar): logger.info(f"Evaluating {evaluator.name}") evaluation = evaluator(model, output_path, epoch, steps) - for k in evaluation: - dataset, _rerank_k, metric = k.split("_", maxsplit=2) + for full_key, metric_value in evaluation.items(): + result_key, metric = self._parse_evaluation_key(evaluator.name, full_key) if metric not in per_metric_results: per_metric_results[metric] = [] - per_dataset_results[f"{dataset}_R{self.rerank_k}_{metric}"] = evaluation[k] - per_metric_results[metric].append(evaluation[k]) + per_dataset_results[result_key] = metric_value + per_metric_results[metric].append(metric_value) logger.info("") agg_results = {} @@ -320,15 +321,12 @@ def __call__( return per_dataset_results def _get_human_readable_name(self, dataset_name: DatasetNameType | str) -> str: - return f"Nano{DATASET_NAME_TO_HUMAN_READABLE[dataset_name.lower()]}_R{self.rerank_k}" + return f"{self._get_split_name(dataset_name)}_R{self.rerank_k}" def _load_dataset( self, dataset_name: DatasetNameType | str, **ir_evaluator_kwargs ) -> CrossEncoderRerankingEvaluator: - if dataset_name.lower() not in DATASET_NAME_TO_HUMAN_READABLE: - raise ValueError(f"Dataset '{dataset_name}' is not a valid NanoBEIR dataset.") - human_readable = DATASET_NAME_TO_HUMAN_READABLE[dataset_name.lower()] - split_name = f"Nano{human_readable}" + split_name = self._get_split_name(dataset_name) corpus = self._load_dataset_subset_split("corpus", split=split_name, required_columns=["_id", "text"]) queries = 
self._load_dataset_subset_split("queries", split=split_name, required_columns=["_id", "text"]) @@ -347,6 +345,9 @@ def _load_dataset( qrels_mapping[sample["query-id"]].update(corpus_ids) else: qrels_mapping[sample["query-id"]].add(corpus_ids) + self._validate_retrieval_references( + dataset_name, split_name, query_mapping, corpus_mapping, qrels_mapping, bm25 + ) def mapper( sample, @@ -375,12 +376,16 @@ def mapper( ) human_readable_name = self._get_human_readable_name(dataset_name) - return CrossEncoderRerankingEvaluator( + return self.reranking_evaluator_class( samples=list(relevance), name=human_readable_name, **ir_evaluator_kwargs, ) + @property + def description(self) -> str: + return "NanoBEIR" + def _load_dataset_subset_split(self, subset: str, split: str, required_columns: list[str]): if not is_datasets_available(): raise ValueError( @@ -415,6 +420,24 @@ def _validate_dataset_names(self): f"Valid dataset names are: {list(DATASET_NAME_TO_HUMAN_READABLE.keys())}" ) + def _get_split_name(self, dataset_name: DatasetNameType | str) -> str: + return f"Nano{DATASET_NAME_TO_HUMAN_READABLE[dataset_name.lower()]}" + + def _parse_evaluation_key(self, _evaluator_name: str, full_key: str) -> tuple[str, str]: + _dataset, _rerank_k, metric = full_key.split("_", maxsplit=2) + return full_key, metric + + def _validate_retrieval_references( + self, + dataset_name: DatasetNameType | str, + split_name: str, + query_mapping: dict[str, str], + corpus_mapping: dict[str, str], + qrels_mapping: dict[str, set[str]], + retrieved, + ) -> None: + pass + def get_config_dict(self): return { "dataset_names": self.dataset_names, diff --git a/sentence_transformers/cross_encoder/evaluation/nano_evaluator.py b/sentence_transformers/cross_encoder/evaluation/nano_evaluator.py new file mode 100644 index 000000000..f69050c0a --- /dev/null +++ b/sentence_transformers/cross_encoder/evaluation/nano_evaluator.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +from collections.abc import 
Callable, Mapping +from typing import Any + +import numpy as np + +from sentence_transformers.cross_encoder.evaluation.nano_beir import CrossEncoderNanoBEIREvaluator +from sentence_transformers.evaluation._nano_utils import _GenericCrossEncoderNanoMixin + + +class CrossEncoderNanoEvaluator(_GenericCrossEncoderNanoMixin, CrossEncoderNanoBEIREvaluator): + """ + Generic cross-encoder evaluator for Nano-style reranking datasets on Hugging Face. + + This evaluator reuses :class:`~sentence_transformers.cross_encoder.evaluation.CrossEncoderNanoBEIREvaluator` + and overrides only dataset/split resolution plus candidate subset handling. + """ + + reranking_evaluator_class = CrossEncoderNanoBEIREvaluator.reranking_evaluator_class + + def __init__( + self, + dataset_names: list[str] | None = None, + dataset_id: str = "sentence-transformers/NanoBEIR-en", + rerank_k: int = 100, + at_k: int = 10, + always_rerank_positives: bool = True, + batch_size: int = 32, + show_progress_bar: bool = False, + write_csv: bool = True, + aggregate_fn: Callable[[list[float]], float] = np.mean, + aggregate_key: str = "mean", + dataset_name_to_human_readable: Mapping[str, str] | None = None, + split_prefix: str = "", + strict_dataset_name_validation: bool = False, + auto_expand_splits_when_dataset_names_none: bool = True, + name: str | None = None, + ) -> None: + self._initialize_generic_cross_encoder_state( + dataset_id=dataset_id, + dataset_name_to_human_readable=dataset_name_to_human_readable, + split_prefix=split_prefix, + strict_dataset_name_validation=strict_dataset_name_validation, + auto_expand_splits_when_dataset_names_none=auto_expand_splits_when_dataset_names_none, + name=name, + ) + dataset_names = self._resolve_dataset_names(dataset_names) + self.dataset_names = dataset_names + self._validate_dataset_names() + self._validate_mapping_splits() + super().__init__( + dataset_names=dataset_names, + dataset_id=dataset_id, + rerank_k=rerank_k, + at_k=at_k, + 
always_rerank_positives=always_rerank_positives, + batch_size=batch_size, + show_progress_bar=show_progress_bar, + write_csv=write_csv, + aggregate_fn=aggregate_fn, + aggregate_key=aggregate_key, + ) + + @property + def description(self) -> str: + return self.evaluator_name + + def _get_human_readable_name(self, dataset_name: str) -> str: + split_name = self._get_split_name(dataset_name) + if self.dataset_name_to_human_readable is None: + return f"{self.evaluator_name}_{split_name}_R{self.rerank_k}" + return f"{split_name}_R{self.rerank_k}" + + def _parse_evaluation_key(self, evaluator_name: str, full_key: str) -> tuple[str, str]: + prefix = f"{evaluator_name}_" + if full_key.startswith(prefix): + metric = full_key.removeprefix(prefix) + else: + metric = full_key.split("_", maxsplit=self.name.count("_"))[-1] + return full_key, metric + + def get_config_dict(self) -> dict[str, Any]: + return self._get_generic_cross_encoder_config_dict() diff --git a/sentence_transformers/evaluation/NanoBEIREvaluator.py b/sentence_transformers/evaluation/NanoBEIREvaluator.py index 7e10a4244..df0822dfc 100644 --- a/sentence_transformers/evaluation/NanoBEIREvaluator.py +++ b/sentence_transformers/evaluation/NanoBEIREvaluator.py @@ -247,7 +247,7 @@ def __init__( self.score_function_names = sorted(list(self.score_functions.keys())) if score_functions else [] self.main_score_function = main_score_function self.truncate_dim = truncate_dim - self.name = f"NanoBEIR_{aggregate_key}" + self.name = f"{self.description}_{aggregate_key}" if self.truncate_dim: self.name += f"_{self.truncate_dim}" @@ -276,10 +276,10 @@ def __init__( } self.evaluators = [ self._load_dataset(name, **ir_evaluator_kwargs) - for name in tqdm(self.dataset_names, desc="Loading NanoBEIR datasets", leave=False) + for name in tqdm(self.dataset_names, desc=f"Loading {self.description} datasets", leave=False) ] - self.csv_file: str = f"NanoBEIR_evaluation_{aggregate_key}_results.csv" + self.csv_file: str = 
f"{self.description}_evaluation_{aggregate_key}_results.csv" self.csv_headers = ["epoch", "steps"] self._append_csv_headers(self.score_function_names) @@ -322,7 +322,7 @@ def __call__( out_txt = "" if self.truncate_dim is not None: out_txt += f" (truncated to {self.truncate_dim})" - logger.info(f"NanoBEIR Evaluation of the model on {self.dataset_names} dataset{out_txt}:") + logger.info(f"{self.description} Evaluation of the model on {self.dataset_names} dataset{out_txt}:") if self.score_functions is None: self.score_functions = {model.similarity_fn_name: model.similarity} @@ -334,8 +334,7 @@ def __call__( logger.info(f"Evaluating {evaluator.name}") evaluation = evaluator(model, output_path, epoch, steps) for full_key, metric_value in evaluation.items(): - splits = full_key.split("_", maxsplit=num_underscores_in_name) - metric = splits[-1] + metric = self._get_metric_from_full_key(evaluator.name, full_key, num_underscores_in_name) if metric not in per_metric_results: per_metric_results[metric] = [] per_dataset_results[full_key] = metric_value @@ -419,7 +418,7 @@ def __call__( return per_dataset_results def _get_human_readable_name(self, dataset_name: DatasetNameType | str) -> str: - human_readable_name = f"Nano{DATASET_NAME_TO_HUMAN_READABLE[dataset_name.lower()]}" + human_readable_name = self._get_split_name(dataset_name) if self.truncate_dim is not None: human_readable_name += f"_{self.truncate_dim}" @@ -428,10 +427,7 @@ def _get_human_readable_name(self, dataset_name: DatasetNameType | str) -> str: def _load_dataset( self, dataset_name: DatasetNameType | str, **ir_evaluator_kwargs ) -> InformationRetrievalEvaluator: - if dataset_name.lower() not in DATASET_NAME_TO_HUMAN_READABLE: - raise ValueError(f"Dataset '{dataset_name}' is not a valid NanoBEIR dataset.") - human_readable = DATASET_NAME_TO_HUMAN_READABLE[dataset_name.lower()] - split_name = f"Nano{human_readable}" + split_name = self._get_split_name(dataset_name) corpus = 
self._load_dataset_subset_split("corpus", split=split_name, required_columns=["_id", "text"]) queries = self._load_dataset_subset_split("queries", split=split_name, required_columns=["_id", "text"]) @@ -464,6 +460,10 @@ def _load_dataset( **ir_evaluator_kwargs, ) + @property + def description(self) -> str: + return "NanoBEIR" + def _load_dataset_subset_split(self, subset: str, split: str, required_columns: list[str]): if not is_datasets_available(): raise ValueError( @@ -519,6 +519,13 @@ def _validate_prompts(self): if error_msg: raise ValueError(error_msg.strip()) + def _get_split_name(self, dataset_name: DatasetNameType | str) -> str: + return f"Nano{DATASET_NAME_TO_HUMAN_READABLE[dataset_name.lower()]}" + + def _get_metric_from_full_key(self, _evaluator_name: str, full_key: str, num_underscores_in_name: int) -> str: + splits = full_key.split("_", maxsplit=num_underscores_in_name) + return splits[-1] + def store_metrics_in_model_card_data(self, *args, **kwargs): # Only store metrics in the model card data if there is more than one dataset. # Otherwise the e.g. mean scores for NanoBEIR are the same as the scores for diff --git a/sentence_transformers/evaluation/NanoEvaluator.py b/sentence_transformers/evaluation/NanoEvaluator.py new file mode 100644 index 000000000..77dd5633b --- /dev/null +++ b/sentence_transformers/evaluation/NanoEvaluator.py @@ -0,0 +1,104 @@ +from __future__ import annotations + +from collections.abc import Callable, Mapping +from typing import Any + +import numpy as np +from torch import Tensor + +from sentence_transformers.evaluation._nano_utils import _GenericNanoDatasetMixin +from sentence_transformers.evaluation.NanoBEIREvaluator import NanoBEIREvaluator +from sentence_transformers.similarity_functions import SimilarityFunction + + +class NanoEvaluator(_GenericNanoDatasetMixin, NanoBEIREvaluator): + """ + Generic evaluator for Nano-style Information Retrieval datasets on Hugging Face. 
+ + This evaluator supports direct split names as well as short dataset names that are + expanded through ``dataset_name_to_human_readable`` and ``split_prefix``. + """ + + def __init__( + self, + dataset_names: list[str] | None = None, + dataset_id: str = "sentence-transformers/NanoBEIR-en", + mrr_at_k: list[int] = [10], + ndcg_at_k: list[int] = [10], + accuracy_at_k: list[int] = [1, 3, 5, 10], + precision_recall_at_k: list[int] = [1, 3, 5, 10], + map_at_k: list[int] = [100], + show_progress_bar: bool = False, + batch_size: int = 32, + write_csv: bool = True, + truncate_dim: int | None = None, + score_functions: dict[str, Callable[[Tensor, Tensor], Tensor]] | None = None, + main_score_function: str | SimilarityFunction | None = None, + aggregate_fn: Callable[[list[float]], float] = np.mean, + aggregate_key: str = "mean", + query_prompts: str | dict[str, str] | None = None, + corpus_prompts: str | dict[str, str] | None = None, + write_predictions: bool = False, + dataset_name_to_human_readable: Mapping[str, str] | None = None, + split_prefix: str = "", + strict_dataset_name_validation: bool = False, + auto_expand_splits_when_dataset_names_none: bool = True, + name: str | None = None, + ) -> None: + self._initialize_generic_nano_state( + dataset_id=dataset_id, + dataset_name_to_human_readable=dataset_name_to_human_readable, + split_prefix=split_prefix, + strict_dataset_name_validation=strict_dataset_name_validation, + auto_expand_splits_when_dataset_names_none=auto_expand_splits_when_dataset_names_none, + name=name, + ) + dataset_names = self._resolve_dataset_names(dataset_names) + self.dataset_names = dataset_names + self._validate_dataset_names() + self._validate_mapping_splits() + query_prompts = self._normalize_prompt_mapping(query_prompts, dataset_names) + corpus_prompts = self._normalize_prompt_mapping(corpus_prompts, dataset_names) + super().__init__( + dataset_names=dataset_names, + dataset_id=dataset_id, + mrr_at_k=mrr_at_k, + ndcg_at_k=ndcg_at_k, + 
accuracy_at_k=accuracy_at_k, + precision_recall_at_k=precision_recall_at_k, + map_at_k=map_at_k, + show_progress_bar=show_progress_bar, + batch_size=batch_size, + write_csv=write_csv, + truncate_dim=truncate_dim, + score_functions=score_functions, + main_score_function=main_score_function, + aggregate_fn=aggregate_fn, + aggregate_key=aggregate_key, + query_prompts=query_prompts, + corpus_prompts=corpus_prompts, + write_predictions=write_predictions, + ) + + @property + def description(self) -> str: + return self.evaluator_name + + def _get_human_readable_name(self, dataset_name: str) -> str: + split_name = self._get_split_name(dataset_name) + if self.dataset_name_to_human_readable is None: + human_readable_name = f"{self.evaluator_name}_{split_name}" + else: + human_readable_name = split_name + if self.truncate_dim is not None: + human_readable_name += f"_{self.truncate_dim}" + return human_readable_name + + def _get_metric_from_full_key(self, evaluator_name: str, full_key: str, num_underscores_in_name: int) -> str: + prefix = f"{evaluator_name}_" + if full_key.startswith(prefix): + return full_key.removeprefix(prefix) + return full_key.split("_", maxsplit=num_underscores_in_name)[-1] + + def get_config_dict(self) -> dict[str, Any]: + return self._get_generic_config_dict() diff --git a/sentence_transformers/evaluation/__init__.py b/sentence_transformers/evaluation/__init__.py index 821c9c4a3..6ee01cb0f 100644 --- a/sentence_transformers/evaluation/__init__.py +++ b/sentence_transformers/evaluation/__init__.py @@ -7,6 +7,7 @@ from .MSEEvaluator import MSEEvaluator from .MSEEvaluatorFromDataFrame import MSEEvaluatorFromDataFrame from .NanoBEIREvaluator import NanoBEIREvaluator +from .NanoEvaluator import NanoEvaluator from .ParaphraseMiningEvaluator import ParaphraseMiningEvaluator from .RerankingEvaluator import RerankingEvaluator from .SentenceEvaluator import SentenceEvaluator @@ -24,6 +25,7 @@ "LabelAccuracyEvaluator", "MSEEvaluator", "MSEEvaluatorFromDataFrame", 
+ "NanoEvaluator", "ParaphraseMiningEvaluator", "SequentialEvaluator", "TranslationEvaluator", diff --git a/sentence_transformers/evaluation/_nano_utils.py b/sentence_transformers/evaluation/_nano_utils.py new file mode 100644 index 000000000..88358b46d --- /dev/null +++ b/sentence_transformers/evaluation/_nano_utils.py @@ -0,0 +1,253 @@ +from __future__ import annotations + +from collections.abc import Mapping +from typing import Any + +from sentence_transformers.util import is_datasets_available + + +class _GenericNanoDatasetMixin: + def _initialize_generic_nano_state( + self, + *, + dataset_id: str, + dataset_name_to_human_readable: Mapping[str, str] | None, + split_prefix: str, + strict_dataset_name_validation: bool, + auto_expand_splits_when_dataset_names_none: bool, + name: str | None, + ) -> None: + self.dataset_id = dataset_id + self.dataset_name_to_human_readable = ( + dict(dataset_name_to_human_readable) if dataset_name_to_human_readable else None + ) + self.split_prefix = split_prefix + self.strict_dataset_name_validation = strict_dataset_name_validation + self.auto_expand_splits_when_dataset_names_none = auto_expand_splits_when_dataset_names_none + self._configured_name = name + self.evaluator_name = name or dataset_id.split("/")[-1] + self._subset_to_split_names_cache: dict[str, list[str]] = {} + + def _resolve_dataset_names(self, dataset_names: list[str] | None) -> list[str]: + if dataset_names is not None: + return dataset_names + if not self.auto_expand_splits_when_dataset_names_none: + raise ValueError("dataset_names cannot be None when auto split expansion is disabled.") + return self._get_available_splits("queries") + + def _normalize_prompt_mapping( + self, + prompt_mapping: str | dict[str, str] | None, + dataset_names: list[str], + ) -> str | dict[str, str] | None: + if prompt_mapping is None or isinstance(prompt_mapping, str): + return prompt_mapping + + lower_to_prompt = {key.lower(): value for key, value in prompt_mapping.items()} + 
normalized_prompt_mapping = {} + for dataset_name in dataset_names: + prompt = prompt_mapping.get(dataset_name) + if prompt is None: + prompt = lower_to_prompt.get(dataset_name.lower()) + if prompt is not None: + normalized_prompt_mapping[dataset_name] = prompt + return normalized_prompt_mapping + + def _is_known_split_name(self, dataset_name: str) -> bool: + return dataset_name in self._get_available_splits("queries") + + def _get_split_name(self, dataset_name: str) -> str: + if self.dataset_name_to_human_readable is None: + return dataset_name + if dataset_name in self.dataset_name_to_human_readable: + return f"{self.split_prefix}{self.dataset_name_to_human_readable[dataset_name]}" + lowered = dataset_name.lower() + if lowered in self.dataset_name_to_human_readable: + return f"{self.split_prefix}{self.dataset_name_to_human_readable[lowered]}" + if not self.strict_dataset_name_validation: + return dataset_name + if self._is_known_split_name(dataset_name): + return dataset_name + raise ValueError( + f"Dataset '{dataset_name}' does not exist in dataset_name_to_human_readable mapping. " + f"Available dataset names are: {list(self.dataset_name_to_human_readable.keys())}" + ) + + def _validate_dataset_names(self) -> None: + if len(self.dataset_names) == 0: + raise ValueError("dataset_names cannot be empty. Use None to evaluate on all datasets.") + if self.dataset_name_to_human_readable is None or not self.strict_dataset_name_validation: + return + + missing_datasets = [] + for dataset_name in self.dataset_names: + if dataset_name in self.dataset_name_to_human_readable: + continue + if dataset_name.lower() in self.dataset_name_to_human_readable: + continue + if self._is_known_split_name(dataset_name): + continue + missing_datasets.append(dataset_name) + if missing_datasets: + raise ValueError( + f"Dataset(s) {missing_datasets} do not exist in dataset_name_to_human_readable mapping. 
" + f"Available dataset names are: {list(self.dataset_name_to_human_readable.keys())}" + ) + + def _get_required_subset_names_for_split_validation(self) -> list[str]: + return ["corpus", "queries", "qrels"] + + def _validate_mapping_splits(self) -> None: + if self.dataset_name_to_human_readable is None: + return + for dataset_name in self.dataset_names: + split_name = self._get_split_name(dataset_name) + for subset_name in self._get_required_subset_names_for_split_validation(): + self._validate_split_exists(dataset_name, subset_name, split_name) + + def _get_available_splits(self, subset: str) -> list[str]: + if subset in self._subset_to_split_names_cache: + return self._subset_to_split_names_cache[subset] + if not is_datasets_available(): + raise ValueError(f"datasets is not available. Please install it to use the {type(self).__name__}.") + from datasets import get_dataset_split_names + + try: + split_names = get_dataset_split_names(self.dataset_id, subset) + except Exception as exc: + raise ValueError( + f"Could not list split names for subset '{subset}' from dataset '{self.dataset_id}'." + ) from exc + + if not split_names: + raise ValueError(f"No split names were found for subset '{subset}' in dataset '{self.dataset_id}'.") + self._subset_to_split_names_cache[subset] = list(split_names) + return self._subset_to_split_names_cache[subset] + + def _validate_split_exists(self, dataset_name: str, subset: str, split_name: str) -> None: + available_splits = self._get_available_splits(subset) + if split_name not in available_splits: + raise ValueError( + f"Dataset '{dataset_name}' maps to split '{split_name}', but it does not exist in subset '{subset}' " + f"for dataset '{self.dataset_id}'. 
Available splits: {available_splits}" + ) + + def _get_generic_config_dict(self) -> dict[str, Any]: + config_dict: dict[str, Any] = { + "dataset_names": self.dataset_names, + "dataset_id": self.dataset_id, + "dataset_name_to_human_readable": self.dataset_name_to_human_readable, + "split_prefix": self.split_prefix, + "strict_dataset_name_validation": self.strict_dataset_name_validation, + "auto_expand_splits_when_dataset_names_none": self.auto_expand_splits_when_dataset_names_none, + } + if self._configured_name is not None: + config_dict["name"] = self._configured_name + for key in ["truncate_dim", "query_prompts", "corpus_prompts"]: + value = getattr(self, key, None) + if value is not None: + config_dict[key] = value + return config_dict + + +class _GenericCrossEncoderNanoMixin(_GenericNanoDatasetMixin): + def _initialize_generic_cross_encoder_state( + self, + *, + dataset_id: str, + dataset_name_to_human_readable: Mapping[str, str] | None, + split_prefix: str, + strict_dataset_name_validation: bool, + auto_expand_splits_when_dataset_names_none: bool, + name: str | None, + ) -> None: + self._initialize_generic_nano_state( + dataset_id=dataset_id, + dataset_name_to_human_readable=dataset_name_to_human_readable, + split_prefix=split_prefix, + strict_dataset_name_validation=strict_dataset_name_validation, + auto_expand_splits_when_dataset_names_none=auto_expand_splits_when_dataset_names_none, + name=name, + ) + + def _get_required_subset_names_for_split_validation(self) -> list[str]: + return [*super()._get_required_subset_names_for_split_validation(), "bm25"] + + def _validate_retrieval_references( + self, + dataset_name: str, + split_name: str, + query_mapping: dict[str, str], + corpus_mapping: dict[str, str], + qrels_mapping: dict[str, set[str]], + retrieved: Any, + ) -> None: + missing_query_ids_in_qrels = [query_id for query_id in qrels_mapping if query_id not in query_mapping] + missing_positive_ids = sorted( + { + corpus_id + for corpus_ids in 
qrels_mapping.values() + for corpus_id in corpus_ids + if corpus_id not in corpus_mapping + } + ) + + missing_query_ids_in_candidates: set[str] = set() + missing_qrels_for_candidates: set[str] = set() + missing_retrieved_ids: set[str] = set() + for sample in retrieved: + query_id = sample["query-id"] + if query_id not in query_mapping: + missing_query_ids_in_candidates.add(query_id) + if query_id not in qrels_mapping: + missing_qrels_for_candidates.add(query_id) + for document_id in sample["corpus-ids"]: + if document_id not in corpus_mapping: + missing_retrieved_ids.add(document_id) + + if any( + [ + missing_query_ids_in_qrels, + missing_positive_ids, + missing_query_ids_in_candidates, + missing_qrels_for_candidates, + missing_retrieved_ids, + ] + ): + error_details: list[str] = [] + if missing_query_ids_in_qrels: + error_details.append(f"qrels references unknown query IDs: {sorted(missing_query_ids_in_qrels)[:5]}") + if missing_positive_ids: + error_details.append(f"qrels references unknown corpus IDs: {missing_positive_ids[:5]}") + if missing_query_ids_in_candidates: + error_details.append( + f"candidate subset references unknown query IDs: {sorted(missing_query_ids_in_candidates)[:5]}" + ) + if missing_qrels_for_candidates: + error_details.append( + f"candidate subset contains queries missing in qrels: {sorted(missing_qrels_for_candidates)[:5]}" + ) + if missing_retrieved_ids: + error_details.append( + f"candidate subset references unknown corpus IDs: {sorted(missing_retrieved_ids)[:5]}" + ) + raise ValueError( + f"Inconsistent IDs found for dataset '{dataset_name}' split '{split_name}' in '{self.dataset_id}'. 
" + + " | ".join(error_details) + ) + + def _get_generic_cross_encoder_config_dict(self) -> dict[str, Any]: + config_dict: dict[str, Any] = { + "dataset_names": self.dataset_names, + "dataset_id": self.dataset_id, + "rerank_k": self.rerank_k, + "at_k": self.at_k, + "always_rerank_positives": self.always_rerank_positives, + "dataset_name_to_human_readable": self.dataset_name_to_human_readable, + "split_prefix": self.split_prefix, + "strict_dataset_name_validation": self.strict_dataset_name_validation, + "auto_expand_splits_when_dataset_names_none": self.auto_expand_splits_when_dataset_names_none, + } + if self._configured_name is not None: + config_dict["name"] = self._configured_name + return config_dict diff --git a/sentence_transformers/sparse_encoder/evaluation/SparseNanoEvaluator.py b/sentence_transformers/sparse_encoder/evaluation/SparseNanoEvaluator.py new file mode 100644 index 000000000..8d1113e12 --- /dev/null +++ b/sentence_transformers/sparse_encoder/evaluation/SparseNanoEvaluator.py @@ -0,0 +1,107 @@ +from __future__ import annotations + +from collections.abc import Callable, Mapping +from typing import Any + +import numpy as np +from torch import Tensor + +from sentence_transformers.evaluation._nano_utils import _GenericNanoDatasetMixin +from sentence_transformers.similarity_functions import SimilarityFunction +from sentence_transformers.sparse_encoder.evaluation.SparseNanoBEIREvaluator import SparseNanoBEIREvaluator + + +class SparseNanoEvaluator(_GenericNanoDatasetMixin, SparseNanoBEIREvaluator): + """ + Generic Nano-style evaluator for sparse encoders. + + This evaluator reuses :class:`~sentence_transformers.sparse_encoder.evaluation.SparseNanoBEIREvaluator` + and overrides only dataset/split resolution so the sparse aggregation logic stays identical. 
+ """ + + def __init__( + self, + dataset_names: list[str] | None = None, + dataset_id: str = "sentence-transformers/NanoBEIR-en", + mrr_at_k: list[int] = [10], + ndcg_at_k: list[int] = [10], + accuracy_at_k: list[int] = [1, 3, 5, 10], + precision_recall_at_k: list[int] = [1, 3, 5, 10], + map_at_k: list[int] = [100], + show_progress_bar: bool = False, + batch_size: int = 32, + write_csv: bool = True, + max_active_dims: int | None = None, + score_functions: dict[str, Callable[[Tensor, Tensor], Tensor]] | None = None, + main_score_function: str | SimilarityFunction | None = None, + aggregate_fn: Callable[[list[float]], float] = np.mean, + aggregate_key: str = "mean", + query_prompts: str | dict[str, str] | None = None, + corpus_prompts: str | dict[str, str] | None = None, + write_predictions: bool = False, + dataset_name_to_human_readable: Mapping[str, str] | None = None, + split_prefix: str = "", + strict_dataset_name_validation: bool = False, + auto_expand_splits_when_dataset_names_none: bool = True, + name: str | None = None, + ) -> None: + self._initialize_generic_nano_state( + dataset_id=dataset_id, + dataset_name_to_human_readable=dataset_name_to_human_readable, + split_prefix=split_prefix, + strict_dataset_name_validation=strict_dataset_name_validation, + auto_expand_splits_when_dataset_names_none=auto_expand_splits_when_dataset_names_none, + name=name, + ) + dataset_names = self._resolve_dataset_names(dataset_names) + self.dataset_names = dataset_names + self._validate_dataset_names() + self._validate_mapping_splits() + query_prompts = self._normalize_prompt_mapping(query_prompts, dataset_names) + corpus_prompts = self._normalize_prompt_mapping(corpus_prompts, dataset_names) + super().__init__( + dataset_names=dataset_names, + dataset_id=dataset_id, + mrr_at_k=mrr_at_k, + ndcg_at_k=ndcg_at_k, + accuracy_at_k=accuracy_at_k, + precision_recall_at_k=precision_recall_at_k, + map_at_k=map_at_k, + show_progress_bar=show_progress_bar, + batch_size=batch_size, + 
write_csv=write_csv, + max_active_dims=max_active_dims, + score_functions=score_functions, + main_score_function=main_score_function, + aggregate_fn=aggregate_fn, + aggregate_key=aggregate_key, + query_prompts=query_prompts, + corpus_prompts=corpus_prompts, + write_predictions=write_predictions, + ) + + def _get_human_readable_name(self, dataset_name: str) -> str: + split_name = self._get_split_name(dataset_name) + if self.dataset_name_to_human_readable is None: + human_readable_name = f"{self.evaluator_name}_{split_name}" + else: + human_readable_name = split_name + if self.max_active_dims is not None: + human_readable_name += f"_{self.max_active_dims}" + return human_readable_name + + @property + def description(self) -> str: + return self.evaluator_name + + def _get_metric_from_full_key(self, evaluator_name: str, full_key: str, num_underscores_in_name: int) -> str: + prefix = f"{evaluator_name}_" + if full_key.startswith(prefix): + return full_key.removeprefix(prefix) + return full_key.split("_", maxsplit=num_underscores_in_name)[-1] + + def get_config_dict(self) -> dict[str, Any]: + config_dict = self._get_generic_config_dict() + if self.max_active_dims is not None: + config_dict["max_active_dims"] = self.max_active_dims + return config_dict diff --git a/sentence_transformers/sparse_encoder/evaluation/__init__.py b/sentence_transformers/sparse_encoder/evaluation/__init__.py index 139fe5d81..c30f0ab2b 100644 --- a/sentence_transformers/sparse_encoder/evaluation/__init__.py +++ b/sentence_transformers/sparse_encoder/evaluation/__init__.py @@ -18,6 +18,9 @@ from sentence_transformers.sparse_encoder.evaluation.SparseNanoBEIREvaluator import ( SparseNanoBEIREvaluator, ) +from sentence_transformers.sparse_encoder.evaluation.SparseNanoEvaluator import ( + SparseNanoEvaluator, +) from sentence_transformers.sparse_encoder.evaluation.SparseRerankingEvaluator import ( SparseRerankingEvaluator, ) @@ -34,6 +37,7 @@ "SparseBinaryClassificationEvaluator", 
"SparseMSEEvaluator", "SparseNanoBEIREvaluator", + "SparseNanoEvaluator", "SparseTripletEvaluator", "SparseTranslationEvaluator", "SparseRerankingEvaluator", diff --git a/tests/cross_encoder/test_nano_evaluator.py b/tests/cross_encoder/test_nano_evaluator.py new file mode 100644 index 000000000..9292ca6f1 --- /dev/null +++ b/tests/cross_encoder/test_nano_evaluator.py @@ -0,0 +1,214 @@ +from __future__ import annotations + +import sys +from types import SimpleNamespace +from typing import Any + +import pytest + +from sentence_transformers.cross_encoder.evaluation import CrossEncoderNanoBEIREvaluator, CrossEncoderNanoEvaluator +from tests.nano_evaluator_test_utils import build_fake_datasets_module + + +class FakeCrossEncoderRerankingEvaluator: + def __init__( + self, + samples: list[dict[str, str | list[str]]], + name: str, + at_k: int = 10, + **kwargs: Any, + ) -> None: + del kwargs + self.samples = samples + self.name = name + self.at_k = at_k + + def __call__( + self, + model: Any, + output_path: str | None = None, + epoch: int = -1, + steps: int = -1, + *args: Any, + **kwargs: Any, + ) -> dict[str, float]: + del model, output_path, epoch, steps, args, kwargs + return { + f"{self.name}_base_map": 0.10, + f"{self.name}_map": 0.20, + f"{self.name}_base_mrr@{self.at_k}": 0.30, + f"{self.name}_mrr@{self.at_k}": 0.40, + f"{self.name}_base_ndcg@{self.at_k}": 0.50, + f"{self.name}_ndcg@{self.at_k}": 0.60, + } + + +@pytest.fixture +def dummy_cross_encoder() -> Any: + return SimpleNamespace(model_card_data=SimpleNamespace(set_evaluation_metrics=lambda *args, **kwargs: None)) + + +@pytest.fixture +def fake_datasets_module() -> Any: + return build_fake_datasets_module( + { + "sentence-transformers/NanoBEIR-en": ["NanoMSMARCO", "NanoNQ"], + "example/FooBar": ["ds_foo", "ds_bar"], + }, + candidate_subsets={"bm25": "corpus-ids"}, + ) + + +@pytest.fixture +def patch_cross_nano_eval(monkeypatch: pytest.MonkeyPatch, fake_datasets_module: Any) -> None: + import 
sentence_transformers.cross_encoder.evaluation.nano_beir as cross_nanobeir_module + import sentence_transformers.evaluation._nano_utils as nano_utils_module + + monkeypatch.setattr(nano_utils_module, "is_datasets_available", lambda: True) + monkeypatch.setattr(cross_nanobeir_module, "is_datasets_available", lambda: True) + monkeypatch.setitem(sys.modules, "datasets", fake_datasets_module) + monkeypatch.setattr( + CrossEncoderNanoEvaluator, + "reranking_evaluator_class", + FakeCrossEncoderRerankingEvaluator, + ) + monkeypatch.setattr( + CrossEncoderNanoBEIREvaluator, + "reranking_evaluator_class", + FakeCrossEncoderRerankingEvaluator, + ) + monkeypatch.setattr( + cross_nanobeir_module, + "CrossEncoderRerankingEvaluator", + FakeCrossEncoderRerankingEvaluator, + ) + + +def test_cross_encoder_nano_evaluator_auto_expand_splits_and_auto_names( + patch_cross_nano_eval: None, + dummy_cross_encoder: Any, +) -> None: + evaluator = CrossEncoderNanoEvaluator( + dataset_names=None, + dataset_id="example/FooBar", + write_csv=False, + ) + + assert evaluator.dataset_names == ["ds_foo", "ds_bar"] + assert [sub_evaluator.name for sub_evaluator in evaluator.evaluators] == [ + "FooBar_ds_foo_R100", + "FooBar_ds_bar_R100", + ] + + metrics = evaluator(dummy_cross_encoder) + assert "FooBar_R100_mean_ndcg@10" in metrics + + +def test_cross_encoder_nano_evaluator_auto_expand_splits_with_mapping_in_strict_mode( + patch_cross_nano_eval: None, + dummy_cross_encoder: Any, +) -> None: + evaluator = CrossEncoderNanoEvaluator( + dataset_names=None, + dataset_id="example/FooBar", + dataset_name_to_human_readable={"msmarco": "MSMARCO"}, + split_prefix="Nano", + strict_dataset_name_validation=True, + write_csv=False, + ) + + assert evaluator.dataset_names == ["ds_foo", "ds_bar"] + metrics = evaluator(dummy_cross_encoder) + assert "FooBar_R100_mean_ndcg@10" in metrics + + +def test_cross_encoder_nano_evaluator_mapping_validates_split_exists(monkeypatch: pytest.MonkeyPatch) -> None: + import 
sentence_transformers.evaluation._nano_utils as nano_utils_module + + def get_dataset_split_names(dataset_id: str, subset: str) -> list[str]: + del dataset_id, subset + return ["NanoNQ"] + + monkeypatch.setattr(nano_utils_module, "is_datasets_available", lambda: True) + monkeypatch.setitem( + sys.modules, + "datasets", + SimpleNamespace(load_dataset=lambda *args, **kwargs: None, get_dataset_split_names=get_dataset_split_names), + ) + + with pytest.raises(ValueError, match="maps to split 'NanoMSMARCO'.*does not exist"): + CrossEncoderNanoEvaluator( + dataset_names=["msmarco"], + dataset_id="sentence-transformers/NanoBEIR-en", + dataset_name_to_human_readable={"msmarco": "MSMARCO"}, + split_prefix="Nano", + write_csv=False, + ) + + +def test_cross_encoder_nano_evaluator_accepts_direct_split_names_with_mapping( + patch_cross_nano_eval: None, + dummy_cross_encoder: Any, +) -> None: + evaluator = CrossEncoderNanoEvaluator( + dataset_names=["ds_foo"], + dataset_id="example/FooBar", + dataset_name_to_human_readable={"msmarco": "MSMARCO"}, + split_prefix="Nano", + write_csv=False, + ) + + assert [sub_evaluator.name for sub_evaluator in evaluator.evaluators] == ["ds_foo_R100"] + metrics = evaluator(dummy_cross_encoder) + assert "FooBar_R100_mean_ndcg@10" in metrics + + +def test_cross_encoder_nano_evaluator_custom_name_metric_root( + patch_cross_nano_eval: None, + dummy_cross_encoder: Any, +) -> None: + evaluator = CrossEncoderNanoEvaluator( + dataset_names=["ds_foo"], + dataset_id="example/FooBar", + write_csv=False, + name="CustomCrossNano", + ) + + assert evaluator.name == "CustomCrossNano_R100_mean" + assert [sub_evaluator.name for sub_evaluator in evaluator.evaluators] == ["CustomCrossNano_ds_foo_R100"] + metrics = evaluator(dummy_cross_encoder) + assert "CustomCrossNano_R100_mean_ndcg@10" in metrics + + +def test_cross_encoder_nano_evaluator_config_keeps_custom_name( + patch_cross_nano_eval: None, +) -> None: + evaluator = CrossEncoderNanoEvaluator( + 
dataset_names=["ds_foo"], + dataset_id="example/FooBar", + write_csv=False, + name="CustomCrossNano", + ) + + config = evaluator.get_config_dict() + + assert config["name"] == "CustomCrossNano" + assert "candidate_subset_name" not in config + + +def test_cross_encoder_nanobeir_invalid_dataset_name() -> None: + with pytest.raises(ValueError, match="are not valid NanoBEIR datasets"): + CrossEncoderNanoBEIREvaluator(dataset_names=["invalidDataset"]) + + +def test_cross_encoder_nanobeir_primary_metric_key( + patch_cross_nano_eval: None, + dummy_cross_encoder: Any, +) -> None: + evaluator = CrossEncoderNanoBEIREvaluator( + dataset_names=["msmarco"], + write_csv=False, + ) + + metrics = evaluator(dummy_cross_encoder) + assert "NanoBEIR_R100_mean_ndcg@10" in metrics diff --git a/tests/evaluation/test_nano_evaluator.py b/tests/evaluation/test_nano_evaluator.py new file mode 100644 index 000000000..b97ecb72d --- /dev/null +++ b/tests/evaluation/test_nano_evaluator.py @@ -0,0 +1,235 @@ +from __future__ import annotations + +import importlib +import sys +from types import SimpleNamespace +from typing import Any + +import pytest + +from sentence_transformers.evaluation import NanoBEIREvaluator, NanoEvaluator, SequentialEvaluator +from tests.nano_evaluator_test_utils import build_fake_datasets_module + + +class FakeInformationRetrievalEvaluator: + def __init__( + self, + queries: dict[str, str], + corpus: dict[str, str], + relevant_docs: dict[str, set[str]], + name: str, + mrr_at_k: list[int], + ndcg_at_k: list[int], + accuracy_at_k: list[int], + precision_recall_at_k: list[int], + map_at_k: list[int], + score_functions: dict[str, Any] | None = None, + **kwargs: Any, + ) -> None: + self.queries = queries + self.corpus = corpus + self.relevant_docs = relevant_docs + self.name = name + self.query_prompt = kwargs.get("query_prompt") + self.corpus_prompt = kwargs.get("corpus_prompt") + self.mrr_at_k = mrr_at_k + self.ndcg_at_k = ndcg_at_k + self.accuracy_at_k = accuracy_at_k + 
        self.precision_recall_at_k = precision_recall_at_k
        self.map_at_k = map_at_k
        self.score_names = sorted(score_functions.keys()) if score_functions else ["cosine"]

    def __call__(
        self,
        model: Any,
        output_path: str | None = None,
        epoch: int = -1,
        steps: int = -1,
        *args: Any,
        **kwargs: Any,
    ) -> dict[str, float]:
        del model, output_path, epoch, steps, args, kwargs

        # Name-dependent base value so different sub-evaluators produce distinct scores.
        base_value = 0.05 + (len(self.name) % 10) * 0.01
        metrics: dict[str, float] = {}
        for score_name in self.score_names:
            for k in self.accuracy_at_k:
                metrics[f"{self.name}_{score_name}_accuracy@{k}"] = base_value
            for k in self.precision_recall_at_k:
                metrics[f"{self.name}_{score_name}_precision@{k}"] = base_value
                metrics[f"{self.name}_{score_name}_recall@{k}"] = base_value
            for k in self.mrr_at_k:
                metrics[f"{self.name}_{score_name}_mrr@{k}"] = base_value
            for k in self.ndcg_at_k:
                metrics[f"{self.name}_{score_name}_ndcg@{k}"] = base_value
            for k in self.map_at_k:
                metrics[f"{self.name}_{score_name}_map@{k}"] = base_value
        return metrics


@pytest.fixture
def dummy_model() -> Any:
    # Minimal encoder stand-in: only the attributes the evaluator reads.
    return SimpleNamespace(
        similarity_fn_name="cosine",
        similarity=lambda a, b: a,
        model_card_data=SimpleNamespace(set_evaluation_metrics=lambda *args, **kwargs: None),
    )


@pytest.fixture
def fake_datasets_module() -> Any:
    return build_fake_datasets_module(
        {
            "sentence-transformers/NanoBEIR-en": ["NanoMSMARCO", "NanoNQ"],
            "example/FooBar": ["ds_foo", "ds_bar"],
        }
    )


@pytest.fixture
def patch_nano_eval(monkeypatch: pytest.MonkeyPatch, fake_datasets_module: Any) -> None:
    """Swap in the fake datasets module and the fake IR evaluator class."""
    nanobeir_module = importlib.import_module("sentence_transformers.evaluation.NanoBEIREvaluator")
    nano_utils_module = importlib.import_module("sentence_transformers.evaluation._nano_utils")

    monkeypatch.setattr(nanobeir_module, "is_datasets_available", lambda: True)
    monkeypatch.setattr(nano_utils_module, "is_datasets_available", lambda: True)
    monkeypatch.setitem(sys.modules, "datasets", fake_datasets_module)
    monkeypatch.setattr(NanoEvaluator, "information_retrieval_class", FakeInformationRetrievalEvaluator)
    monkeypatch.setattr(NanoBEIREvaluator, "information_retrieval_class", FakeInformationRetrievalEvaluator)


def test_nano_evaluator_auto_expand_splits_and_auto_names(patch_nano_eval: None, dummy_model: Any) -> None:
    evaluator = NanoEvaluator(
        dataset_names=None,
        dataset_id="example/FooBar",
        write_csv=False,
    )

    assert evaluator.dataset_names == ["ds_foo", "ds_bar"]
    assert [sub_evaluator.name for sub_evaluator in evaluator.evaluators] == [
        "FooBar_ds_foo",
        "FooBar_ds_bar",
    ]

    metrics = evaluator(dummy_model)
    assert evaluator.primary_metric == "FooBar_mean_cosine_ndcg@10"
    assert "FooBar_mean_cosine_ndcg@10" in metrics


def test_nano_evaluator_auto_expand_splits_with_mapping_in_strict_mode(
    patch_nano_eval: None,
    dummy_model: Any,
) -> None:
    evaluator = NanoEvaluator(
        dataset_names=None,
        dataset_id="example/FooBar",
        dataset_name_to_human_readable={"msmarco": "MSMARCO"},
        split_prefix="Nano",
        strict_dataset_name_validation=True,
        write_csv=False,
    )

    assert evaluator.dataset_names == ["ds_foo", "ds_bar"]
    metrics = evaluator(dummy_model)
    assert "FooBar_mean_cosine_ndcg@10" in metrics


def test_nano_evaluator_mapping_validates_split_exists(monkeypatch: pytest.MonkeyPatch) -> None:
    nano_utils_module = importlib.import_module("sentence_transformers.evaluation._nano_utils")

    def get_dataset_split_names(dataset_id: str, subset: str) -> list[str]:
        del dataset_id, subset
        return ["NanoNQ"]

    monkeypatch.setattr(nano_utils_module, "is_datasets_available", lambda: True)
    monkeypatch.setitem(
        sys.modules,
        "datasets",
        SimpleNamespace(load_dataset=lambda *args, **kwargs: None, get_dataset_split_names=get_dataset_split_names),
    )

    with pytest.raises(ValueError, match="maps to split 'NanoMSMARCO'.*does not exist"):
        NanoEvaluator(
            dataset_names=["msmarco"],
            dataset_id="sentence-transformers/NanoBEIR-en",
            dataset_name_to_human_readable={"msmarco": "MSMARCO"},
            split_prefix="Nano",
            write_csv=False,
        )


def test_nano_evaluator_accepts_direct_split_names_with_mapping(
    patch_nano_eval: None,
    dummy_model: Any,
) -> None:
    evaluator = NanoEvaluator(
        dataset_names=["ds_foo"],
        dataset_id="example/FooBar",
        dataset_name_to_human_readable={"msmarco": "MSMARCO"},
        split_prefix="Nano",
        write_csv=False,
    )

    assert [sub_evaluator.name for sub_evaluator in evaluator.evaluators] == ["ds_foo"]
    metrics = evaluator(dummy_model)
    assert "FooBar_mean_cosine_ndcg@10" in metrics


def test_nano_evaluator_custom_name_and_case_insensitive_prompts(
    patch_nano_eval: None,
    dummy_model: Any,
) -> None:
    # Prompts are keyed "DS_FOO" while the dataset is "ds_foo": must match case-insensitively.
    evaluator = NanoEvaluator(
        dataset_names=["ds_foo"],
        dataset_id="example/FooBar",
        query_prompts={"DS_FOO": "query: "},
        corpus_prompts={"DS_FOO": "passage: "},
        name="CustomNano",
        write_csv=False,
    )

    assert evaluator.name == "CustomNano_mean"
    assert [sub_evaluator.name for sub_evaluator in evaluator.evaluators] == ["CustomNano_ds_foo"]
    assert evaluator.evaluators[0].query_prompt == "query: "
    assert evaluator.evaluators[0].corpus_prompt == "passage: "
    metrics = evaluator(dummy_model)
    assert "CustomNano_mean_cosine_ndcg@10" in metrics


def test_nano_evaluator_config_keeps_custom_name(patch_nano_eval: None) -> None:
    evaluator = NanoEvaluator(
        dataset_names=["ds_foo"],
        dataset_id="example/FooBar",
        name="CustomNano",
        write_csv=False,
    )

    config = evaluator.get_config_dict()

    assert config["name"] == "CustomNano"


def test_sequential_evaluator_with_nanobeir_and_generic_nano_dataset(
    patch_nano_eval: None,
    dummy_model: Any,
) -> None:
    nanobeir_evaluator = NanoBEIREvaluator(
        dataset_names=["msmarco"],
        write_csv=False,
    )
    generic_nano_evaluator = NanoEvaluator(
        dataset_names=["ds_foo"],
        dataset_id="example/FooBar",
        write_csv=False,
    )
    seq_evaluator = SequentialEvaluator(
        [nanobeir_evaluator, generic_nano_evaluator],
        main_score_function=lambda scores: float(sum(scores) / len(scores)),
    )

    metrics = seq_evaluator(dummy_model)

    assert "sequential_score" in metrics
    assert any(key.startswith("NanoBEIR_mean_") for key in metrics)
    assert any(key.startswith("FooBar_mean_") for key in metrics)
    assert "NanoBEIR_mean_cosine_ndcg@10" in metrics
diff --git a/tests/evaluation/test_nanobeir_evaluator_unit.py b/tests/evaluation/test_nanobeir_evaluator_unit.py
new file mode 100644
index 000000000..8ae1754c7
--- /dev/null
+++ b/tests/evaluation/test_nanobeir_evaluator_unit.py
@@ -0,0 +1,141 @@
from __future__ import annotations

from pathlib import Path
from types import SimpleNamespace
from typing import Any

import pytest

from sentence_transformers.evaluation import NanoBEIREvaluator


class FakeInformationRetrievalEvaluator:
    """Stub IR evaluator returning a constant value for every requested metric."""

    def __init__(
        self,
        queries: dict[str, str],
        corpus: dict[str, str],
        relevant_docs: dict[str, set[str]],
        name: str,
        mrr_at_k: list[int],
        ndcg_at_k: list[int],
        accuracy_at_k: list[int],
        precision_recall_at_k: list[int],
        map_at_k: list[int],
        score_functions: dict[str, Any] | None = None,
        **kwargs: Any,
    ) -> None:
        del relevant_docs, kwargs
        self.queries = queries
        self.corpus = corpus
        self.name = name
        self.mrr_at_k = mrr_at_k
        self.ndcg_at_k = ndcg_at_k
        self.accuracy_at_k = accuracy_at_k
        self.precision_recall_at_k = precision_recall_at_k
        self.map_at_k = map_at_k
        self.score_names = sorted(score_functions.keys()) if score_functions else ["cosine"]

    def __call__(
        self,
        model: Any,
        output_path: str | None = None,
        epoch: int = -1,
        steps: int = -1,
        *args: Any,
        **kwargs: Any,
    ) -> dict[str, float]:
        del model, output_path, epoch, steps, args, kwargs
        metrics: dict[str, float] = {}
        base_value = 0.42
        for score_name in self.score_names:
            for k in self.accuracy_at_k:
metrics[f"{self.name}_{score_name}_accuracy@{k}"] = base_value + for k in self.precision_recall_at_k: + metrics[f"{self.name}_{score_name}_precision@{k}"] = base_value + metrics[f"{self.name}_{score_name}_recall@{k}"] = base_value + for k in self.mrr_at_k: + metrics[f"{self.name}_{score_name}_mrr@{k}"] = base_value + for k in self.ndcg_at_k: + metrics[f"{self.name}_{score_name}_ndcg@{k}"] = base_value + for k in self.map_at_k: + metrics[f"{self.name}_{score_name}_map@{k}"] = base_value + return metrics + + +@pytest.fixture +def dummy_model() -> Any: + return SimpleNamespace( + similarity_fn_name="cosine", + similarity=lambda a, b: a, + model_card_data=SimpleNamespace(set_evaluation_metrics=lambda *args, **kwargs: None), + ) + + +@pytest.fixture +def patch_nanobeir_loader(monkeypatch: pytest.MonkeyPatch) -> None: + def fake_load_dataset(self: NanoBEIREvaluator, dataset_name: str, **ir_evaluator_kwargs: Any) -> Any: + return FakeInformationRetrievalEvaluator( + queries={"q1": "query 1"}, + corpus={"d1": "doc 1"}, + relevant_docs={"q1": {"d1"}}, + name=self._get_human_readable_name(dataset_name), + **ir_evaluator_kwargs, + ) + + monkeypatch.setattr(NanoBEIREvaluator, "_load_dataset", fake_load_dataset) + + +def test_nanobeir_primary_metric_key_default(patch_nanobeir_loader: None, dummy_model: Any) -> None: + evaluator = NanoBEIREvaluator( + dataset_names=["msmarco"], + write_csv=False, + ) + + results = evaluator(dummy_model) + + assert evaluator.primary_metric == "NanoBEIR_mean_cosine_ndcg@10" + assert evaluator.primary_metric in results + + +def test_nanobeir_primary_metric_key_with_truncate_dim(patch_nanobeir_loader: None, dummy_model: Any) -> None: + evaluator = NanoBEIREvaluator( + dataset_names=["msmarco"], + truncate_dim=64, + write_csv=False, + ) + + results = evaluator(dummy_model) + + assert evaluator.primary_metric == "NanoBEIR_mean_64_cosine_ndcg@10" + assert evaluator.primary_metric in results + assert "NanoMSMARCO_64_cosine_ndcg@10" in results + + +def 
test_nanobeir_writes_csv_metrics( + patch_nanobeir_loader: None, + dummy_model: Any, + tmp_path: Path, +) -> None: + evaluator = NanoBEIREvaluator( + dataset_names=["msmarco", "nq"], + write_csv=True, + ) + + results = evaluator(dummy_model, output_path=str(tmp_path), epoch=1, steps=2) + + csv_path = tmp_path / "NanoBEIR_evaluation_mean_results.csv" + assert csv_path.exists() + + lines = csv_path.read_text(encoding="utf-8").splitlines() + assert len(lines) == 2 + + header = lines[0].split(",") + row = lines[1].split(",") + assert len(header) == len(row) + assert header[:2] == ["epoch", "steps"] + assert row[:2] == ["1", "2"] + + ndcg_idx = header.index("cosine-NDCG@10") + map_idx = header.index("cosine-MAP@100") + assert float(row[ndcg_idx]) == pytest.approx(results["NanoBEIR_mean_cosine_ndcg@10"]) + assert float(row[map_idx]) == pytest.approx(results["NanoBEIR_mean_cosine_map@100"]) diff --git a/tests/nano_evaluator_test_utils.py b/tests/nano_evaluator_test_utils.py new file mode 100644 index 000000000..f32dfee68 --- /dev/null +++ b/tests/nano_evaluator_test_utils.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +from collections.abc import Iterator, Mapping +from types import SimpleNamespace +from typing import Any + + +class FakeDataset: + def __init__(self, rows: list[dict[str, Any]]) -> None: + self.rows = rows + self.column_names = list(rows[0].keys()) if rows else [] + + def __iter__(self) -> Iterator[dict[str, Any]]: + return iter(self.rows) + + def __len__(self) -> int: + return len(self.rows) + + def __getitem__(self, key: str | int) -> Any: + if isinstance(key, str): + return [row[key] for row in self.rows] + return self.rows[key] + + def map(self, fn: Any, fn_kwargs: dict[str, Any] | None = None) -> FakeDataset: + kwargs = fn_kwargs or {} + return FakeDataset([fn(row, **kwargs) for row in self.rows]) + + +def build_fake_datasets_module( + dataset_splits: Mapping[str, list[str]], + candidate_subsets: Mapping[str, str] | None = None, +) -> Any: + 
    """Build a SimpleNamespace mimicking the `datasets` module for the given splits.

    ``candidate_subsets`` maps extra subset names (e.g. "bm25") to the column name
    holding the candidate corpus IDs.
    """
    data: dict[tuple[str, str, str], list[dict[str, Any]]] = {}
    split_names: dict[tuple[str, str], list[str]] = {}
    candidate_subsets = candidate_subsets or {}

    def add_split(dataset_id: str, split_name: str) -> None:
        # Two documents, one query, one qrel per split; IDs are split-scoped.
        data[(dataset_id, "corpus", split_name)] = [
            {"_id": f"{split_name}-d1", "text": "Document 1"},
            {"_id": f"{split_name}-d2", "text": "Document 2"},
        ]
        data[(dataset_id, "queries", split_name)] = [{"_id": f"{split_name}-q1", "text": "Query 1"}]
        data[(dataset_id, "qrels", split_name)] = [{"query-id": f"{split_name}-q1", "corpus-id": f"{split_name}-d1"}]
        for subset_name, column_name in candidate_subsets.items():
            data[(dataset_id, subset_name, split_name)] = [
                {"query-id": f"{split_name}-q1", column_name: [f"{split_name}-d2", f"{split_name}-d1"]}
            ]

    for dataset_id, splits in dataset_splits.items():
        for split_name in splits:
            add_split(dataset_id, split_name)
        for subset_name in ["corpus", "queries", "qrels", *candidate_subsets]:
            split_names[(dataset_id, subset_name)] = list(splits)

    def load_dataset(dataset_id: str, subset: str, split: str) -> FakeDataset:
        return FakeDataset(data[(dataset_id, subset, split)])

    def get_dataset_split_names(dataset_id: str, subset: str) -> list[str]:
        return split_names[(dataset_id, subset)]

    return SimpleNamespace(load_dataset=load_dataset, get_dataset_split_names=get_dataset_split_names)
diff --git a/tests/sparse_encoder/test_nano_evaluator.py b/tests/sparse_encoder/test_nano_evaluator.py
new file mode 100644
index 000000000..caae351ec
--- /dev/null
+++ b/tests/sparse_encoder/test_nano_evaluator.py
@@ -0,0 +1,238 @@
from __future__ import annotations

import importlib
import sys
from types import SimpleNamespace
from typing import Any

import pytest
import torch

from sentence_transformers.evaluation import SequentialEvaluator
from sentence_transformers.sparse_encoder.evaluation import SparseNanoBEIREvaluator, SparseNanoEvaluator
from tests.nano_evaluator_test_utils import build_fake_datasets_module


class FakeSparseInformationRetrievalEvaluator:
    """Stub sparse IR evaluator with deterministic metrics plus sparsity statistics."""

    def __init__(
        self,
        queries: dict[str, str],
        corpus: dict[str, str],
        relevant_docs: dict[str, set[str]],
        name: str,
        mrr_at_k: list[int],
        ndcg_at_k: list[int],
        accuracy_at_k: list[int],
        precision_recall_at_k: list[int],
        map_at_k: list[int],
        score_functions: dict[str, Any] | None = None,
        max_active_dims: int | None = None,
        **kwargs: Any,
    ) -> None:
        del relevant_docs, max_active_dims, kwargs
        self.queries = queries
        self.corpus = corpus
        self.name = name
        self.mrr_at_k = mrr_at_k
        self.ndcg_at_k = ndcg_at_k
        self.accuracy_at_k = accuracy_at_k
        self.precision_recall_at_k = precision_recall_at_k
        self.map_at_k = map_at_k
        self.score_names = sorted(score_functions.keys()) if score_functions else ["dot"]
        # Name-dependent but deterministic sparsity statistics for aggregation tests.
        base = float(10 + len(name) % 4)
        self.sparsity_stats: dict[str, float] = {
            "query_active_dims": base,
            "query_sparsity_ratio": 0.99,
            "corpus_active_dims": base + 2.0,
            "corpus_sparsity_ratio": 0.98,
            "avg_flops": 0.0,
        }
        self.count_vectors = {
            "query": torch.tensor([1.0, 2.0, 3.0]),
            "corpus": torch.tensor([3.0, 2.0, 1.0]),
        }

    def __call__(
        self,
        model: Any,
        output_path: str | None = None,
        epoch: int = -1,
        steps: int = -1,
        *args: Any,
        **kwargs: Any,
    ) -> dict[str, float]:
        del model, output_path, epoch, steps, args, kwargs

        base_value = 0.05 + (len(self.name) % 10) * 0.01
        metrics: dict[str, float] = {}
        for score_name in self.score_names:
            for k in self.accuracy_at_k:
                metrics[f"{self.name}_{score_name}_accuracy@{k}"] = base_value
            for k in self.precision_recall_at_k:
                metrics[f"{self.name}_{score_name}_precision@{k}"] = base_value
                metrics[f"{self.name}_{score_name}_recall@{k}"] = base_value
            for k in self.mrr_at_k:
                metrics[f"{self.name}_{score_name}_mrr@{k}"] = base_value
            for k in self.ndcg_at_k:
                metrics[f"{self.name}_{score_name}_ndcg@{k}"] = base_value
            for k in self.map_at_k:
                metrics[f"{self.name}_{score_name}_map@{k}"] = base_value
        return metrics


@pytest.fixture
def dummy_sparse_model() -> Any:
    # Minimal sparse encoder stand-in: only the attributes the evaluator reads.
    return SimpleNamespace(
        similarity_fn_name="dot",
        similarity=lambda a, b: a,
        model_card_data=SimpleNamespace(set_evaluation_metrics=lambda *args, **kwargs: None),
    )


@pytest.fixture
def fake_datasets_module() -> Any:
    return build_fake_datasets_module(
        {
            "sentence-transformers/NanoBEIR-en": ["NanoMSMARCO", "NanoNQ"],
            "example/FooBar": ["ds_foo", "ds_bar"],
        }
    )


@pytest.fixture
def patch_sparse_nano_eval(monkeypatch: pytest.MonkeyPatch, fake_datasets_module: Any) -> None:
    """Swap in the fake datasets module and the fake sparse IR evaluator class."""
    nanobeir_module = importlib.import_module("sentence_transformers.evaluation.NanoBEIREvaluator")
    nano_utils_module = importlib.import_module("sentence_transformers.evaluation._nano_utils")

    monkeypatch.setattr(nano_utils_module, "is_datasets_available", lambda: True)
    monkeypatch.setattr(nanobeir_module, "is_datasets_available", lambda: True)
    monkeypatch.setitem(sys.modules, "datasets", fake_datasets_module)
    monkeypatch.setattr(SparseNanoEvaluator, "information_retrieval_class", FakeSparseInformationRetrievalEvaluator)
    monkeypatch.setattr(
        SparseNanoBEIREvaluator,
        "information_retrieval_class",
        FakeSparseInformationRetrievalEvaluator,
    )


def test_sparse_nano_evaluator_auto_expand_splits_and_auto_names(
    patch_sparse_nano_eval: None,
    dummy_sparse_model: Any,
) -> None:
    evaluator = SparseNanoEvaluator(
        dataset_names=None,
        dataset_id="example/FooBar",
        write_csv=False,
    )

    assert evaluator.dataset_names == ["ds_foo", "ds_bar"]
    assert [sub_evaluator.name for sub_evaluator in evaluator.evaluators] == [
        "FooBar_ds_foo",
        "FooBar_ds_bar",
    ]

    metrics = evaluator(dummy_sparse_model)
    assert evaluator.primary_metric == "FooBar_mean_dot_ndcg@10"
    assert "FooBar_mean_dot_ndcg@10" in metrics
    assert "FooBar_mean_query_active_dims" in metrics
    assert "FooBar_mean_avg_flops" in metrics


def 
def test_sparse_nano_evaluator_single_split_path(
    patch_sparse_nano_eval: None,
    dummy_sparse_model: Any,
) -> None:
    # A single explicit split still yields per-split and mean metric keys.
    nano_eval = SparseNanoEvaluator(
        dataset_names=["ds_foo"],
        dataset_id="example/FooBar",
        write_csv=False,
    )

    scores = nano_eval(dummy_sparse_model)

    assert nano_eval.primary_metric == "FooBar_mean_dot_ndcg@10"
    assert "FooBar_ds_foo_dot_ndcg@10" in scores
    assert "FooBar_mean_avg_flops" in scores


def test_sparse_nano_evaluator_mapping_validates_split_exists(monkeypatch: pytest.MonkeyPatch) -> None:
    # A human-readable mapping that resolves to a split missing from the hub
    # listing must be rejected at construction time.
    nano_utils_module = importlib.import_module("sentence_transformers.evaluation._nano_utils")
    monkeypatch.setattr(nano_utils_module, "is_datasets_available", lambda: True)

    fake_datasets = SimpleNamespace(
        load_dataset=lambda *args, **kwargs: None,
        get_dataset_split_names=lambda *args, **kwargs: ["NanoNQ"],
    )
    monkeypatch.setitem(sys.modules, "datasets", fake_datasets)

    with pytest.raises(ValueError, match="maps to split 'NanoMSMARCO'.*does not exist"):
        SparseNanoEvaluator(
            dataset_names=["msmarco"],
            dataset_id="sentence-transformers/NanoBEIR-en",
            dataset_name_to_human_readable={"msmarco": "MSMARCO"},
            split_prefix="Nano",
            write_csv=False,
        )


def test_sparse_nano_evaluator_accepts_direct_split_names_with_mapping(
    patch_sparse_nano_eval: None,
    dummy_sparse_model: Any,
) -> None:
    # Direct split names bypass the human-readable mapping and prefix entirely.
    nano_eval = SparseNanoEvaluator(
        dataset_names=["ds_foo"],
        dataset_id="example/FooBar",
        dataset_name_to_human_readable={"msmarco": "MSMARCO"},
        split_prefix="Nano",
        write_csv=False,
    )

    assert [sub.name for sub in nano_eval.evaluators] == ["ds_foo"]
    scores = nano_eval(dummy_sparse_model)
    assert "FooBar_mean_dot_ndcg@10" in scores


def test_sparse_nano_evaluator_custom_name_metric_root(
    patch_sparse_nano_eval: None,
    dummy_sparse_model: Any,
) -> None:
    # A custom ``name`` becomes the root of both evaluator and metric names.
    nano_eval = SparseNanoEvaluator(
        dataset_names=["ds_foo"],
        dataset_id="example/FooBar",
        name="CustomSparseNano",
        write_csv=False,
    )

    assert nano_eval.name == "CustomSparseNano_mean"
    assert [sub.name for sub in nano_eval.evaluators] == ["CustomSparseNano_ds_foo"]
    scores = nano_eval(dummy_sparse_model)
    assert "CustomSparseNano_mean_dot_ndcg@10" in scores


def test_sequential_evaluator_with_sparse_nanobeir_and_generic_nano_dataset(
    patch_sparse_nano_eval: None,
    dummy_sparse_model: Any,
) -> None:
    # NanoBEIR and a generic Nano evaluator compose under SequentialEvaluator,
    # each contributing its own metric namespace plus the sequential score.
    seq_evaluator = SequentialEvaluator(
        [
            SparseNanoBEIREvaluator(dataset_names=["msmarco"], write_csv=False),
            SparseNanoEvaluator(dataset_names=["ds_foo"], dataset_id="example/FooBar", write_csv=False),
        ],
        main_score_function=lambda scores: float(sum(scores) / len(scores)),
    )

    combined = seq_evaluator(dummy_sparse_model)

    assert "sequential_score" in combined
    assert any(key.startswith("NanoBEIR_mean_") for key in combined)
    assert any(key.startswith("FooBar_mean_") for key in combined)
    assert "NanoBEIR_mean_dot_ndcg@10" in combined