21 commits:
c2e08f3 Add generic NanoEvaluator abstractions with NanoBEIR compatibility (hotchpotch, Feb 24, 2026)
beb18fe docs: add local AGENTS guidance for NanoEvaluator PR workflow (hotchpotch, Feb 24, 2026)
caff296 Add NanoEvaluator example scripts across dense/sparse/cross encoders (hotchpotch, Feb 24, 2026)
b6a40bc Restore NanoBEIR docs and move generic behavior docs to NanoEvaluator… (hotchpotch, Feb 24, 2026)
4226e95 Enhance Nano evaluator examples with per-dataset metric breakdown (hotchpotch, Feb 24, 2026)
3da882d Refine metric-key comments and fix cross Nano config serialization (hotchpotch, Feb 24, 2026)
776c498 Add NanoBEIR primary-metric unit tests and document sparse init invar… (hotchpotch, Feb 24, 2026)
8e07fc0 Add CSV output unit test for NanoBEIREvaluator (hotchpotch, Feb 24, 2026)
59750f6 Use neutral example dataset IDs in Nano evaluator unit tests (hotchpotch, Feb 24, 2026)
3e55ae6 Refactor nano evaluators to minimize main diff (hotchpotch, Mar 6, 2026)
8db2c83 Refactor nano evaluator test fixtures (hotchpotch, Mar 6, 2026)
97a5e26 Allow direct split names in nano evaluators (hotchpotch, Mar 6, 2026)
04a8408 Move generic nano hooks out of shared helper (hotchpotch, Mar 6, 2026)
69f294a Trim cross nano evaluator surface area (hotchpotch, Mar 6, 2026)
0183225 Fix generic nano evaluator validation (hotchpotch, Mar 6, 2026)
0a0d555 Reduce redundant nano evaluator hooks (hotchpotch, Mar 6, 2026)
572ae3a Simplify nano evaluator overrides (hotchpotch, Mar 18, 2026)
fe25211 Minimize nano_beir formatting diff (hotchpotch, Mar 18, 2026)
2837a8d Trim redundant nano evaluator test args (hotchpotch, Mar 18, 2026)
ba1219b Use neutral generic dataset names in tests (hotchpotch, Mar 18, 2026)
5444bd7 Use dummy split names in nano evaluator tests (hotchpotch, Mar 18, 2026)
63 changes: 63 additions & 0 deletions examples/cross_encoder/evaluation/evaluation_nano_cross_encoder_bm25.py
@@ -0,0 +1,63 @@
"""Simple CrossEncoder NanoBEIR reranking example.

Run:
    uv run --with datasets python examples/cross_encoder/evaluation/evaluation_nano_cross_encoder_bm25.py
"""

import logging

from sentence_transformers import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CrossEncoderNanoBEIREvaluator
from sentence_transformers.cross_encoder.evaluation.nano_beir import DATASET_NAME_TO_HUMAN_READABLE

logging.basicConfig(format="%(message)s", level=logging.INFO)

MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L6-v2"
DATASET_ID = "sentence-transformers/NanoBEIR-en"
DATASET_SPLITS = ["msmarco", "nq"]
RERANK_K = 100

model = CrossEncoder(MODEL_NAME)
evaluator = CrossEncoderNanoBEIREvaluator(
    dataset_id=DATASET_ID,
    dataset_names=DATASET_SPLITS,
    rerank_k=RERANK_K,
    at_k=10,
    batch_size=32,
    show_progress_bar=False,
)

results = evaluator(model)
if evaluator.primary_metric is None:
raise ValueError("Expected evaluator.primary_metric to be set after evaluation.")

primary_metric = evaluator.primary_metric
if primary_metric not in results:
    primary_metric = f"{evaluator.name}_{primary_metric}"
if primary_metric not in results:
    raise ValueError(f"Primary metric key not found: {primary_metric}")

"""
Example output (actual run in this repo, to be updated if defaults change):
Model: cross-encoder/ms-marco-MiniLM-L6-v2
Dataset: sentence-transformers/NanoBEIR-en
Splits: ['msmarco', 'nq']
Split scores:
- NanoMSMARCO_R100_ndcg@10 = 0.6686
- NanoNQ_R100_ndcg@10 = 0.7599
Primary metric key: NanoBEIR_R100_mean_ndcg@10
Primary metric value: 0.7142
"""

print(f"Model: {MODEL_NAME}")
print(f"Dataset: {DATASET_ID}")
print(f"Splits: {DATASET_SPLITS}")
metric_suffix = primary_metric.split("_mean_", maxsplit=1)[1]
print("Split scores:")
for split_name in DATASET_SPLITS:
    human_readable = DATASET_NAME_TO_HUMAN_READABLE[split_name.lower()]
    split_key = f"Nano{human_readable}_R{RERANK_K}_{metric_suffix}"
    if split_key in results:
        print(f"- {split_key} = {float(results[split_key]):.4f}")
print(f"Primary metric key: {primary_metric}")
print(f"Primary metric value: {float(results[primary_metric]):.4f}")
54 changes: 54 additions & 0 deletions examples/sentence_transformer/evaluation/evaluation_nano_dense_miracl.py
@@ -0,0 +1,54 @@
"""Simple NanoEvaluator example on NanoMIRACL.

Run:
    uv run --with datasets python examples/sentence_transformer/evaluation/evaluation_nano_dense_miracl.py
"""

import logging

from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import NanoEvaluator

logging.basicConfig(format="%(message)s", level=logging.INFO)

# Keep this example light: evaluate two language splits.
MODEL_NAME = "intfloat/multilingual-e5-small"
DATASET_ID = "hotchpotch/NanoMIRACL"
DATASET_SPLITS = ["en", "ja"]

model = SentenceTransformer(MODEL_NAME)
evaluator = NanoEvaluator(
    dataset_id=DATASET_ID,
    dataset_names=DATASET_SPLITS,
    batch_size=32,
    show_progress_bar=False,
)

results = evaluator(model)
"""
Example output (actual run in this repo, to be updated if defaults change):
Model: intfloat/multilingual-e5-small
Dataset: hotchpotch/NanoMIRACL
Splits: ['en', 'ja']
Split scores:
- NanoMIRACL_en_cosine_ndcg@10 = 0.6901
- NanoMIRACL_ja_cosine_ndcg@10 = 0.7168
Primary metric key: NanoMIRACL_mean_cosine_ndcg@10
Primary metric value: 0.7034
"""

primary_metric = evaluator.primary_metric
if primary_metric is None:
raise ValueError("Expected evaluator.primary_metric to be set after evaluation.")

print(f"Model: {MODEL_NAME}")
print(f"Dataset: {DATASET_ID}")
print(f"Splits: {DATASET_SPLITS}")
metric_suffix = primary_metric.split("_mean_", maxsplit=1)[1]
print("Split scores:")
for split_name in DATASET_SPLITS:
    split_key = f"NanoMIRACL_{split_name}_{metric_suffix}"
    if split_key in results:
        print(f"- {split_key} = {results[split_key]:.4f}")
print(f"Primary metric key: {primary_metric}")
print(f"Primary metric value: {results[primary_metric]:.4f}")
73 changes: 73 additions & 0 deletions examples/sentence_transformer/evaluation/evaluation_nano_dense_multidataset_macro.py
@@ -0,0 +1,73 @@
"""Simple dense multi-dataset Nano macro example.
Run:
uv run --with datasets python examples/sentence_transformer/evaluation/evaluation_nano_dense_multidataset_macro.py
"""

import logging

import numpy as np

from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import NanoEvaluator

logging.basicConfig(format="%(message)s", level=logging.INFO)

MODEL_NAME = "intfloat/multilingual-e5-small"
MULTILINGUAL_NANOBEIR_DATASET_IDS = [
"sentence-transformers/NanoBEIR-en",
"LiquidAI/NanoBEIR-ja",
]
CUSTOM_DATASET_IDS = [
"hotchpotch/NanoCodeSearchNet",
]


def evaluate_dataset(model: SentenceTransformer, dataset_id: str) -> tuple[str, str, float]:
    evaluator = NanoEvaluator(
        dataset_id=dataset_id,
        dataset_names=None,
        batch_size=32,
        show_progress_bar=False,
    )
    results = evaluator(model)
    if evaluator.primary_metric is None:
        raise ValueError(f"Expected evaluator.primary_metric for dataset_id={dataset_id}")
    return dataset_id, evaluator.primary_metric, float(results[evaluator.primary_metric])


model = SentenceTransformer(MODEL_NAME)

multilingual_results = [evaluate_dataset(model, dataset_id) for dataset_id in MULTILINGUAL_NANOBEIR_DATASET_IDS]
custom_results = [evaluate_dataset(model, dataset_id) for dataset_id in CUSTOM_DATASET_IDS]

multilingual_scores = [score for _, _, score in multilingual_results]
custom_scores = [score for _, _, score in custom_results]

multilingual_macro = float(np.mean(multilingual_scores))
custom_macro = float(np.mean(custom_scores))
group_macro = float(np.mean([multilingual_macro, custom_macro]))

"""
Example output (actual run in this repo, to be updated if defaults change):
Model: intfloat/multilingual-e5-small
Multilingual dataset scores:
- sentence-transformers/NanoBEIR-en | NanoBEIR-en_mean_cosine_ndcg@10 = 0.5542
- LiquidAI/NanoBEIR-ja | NanoBEIR-ja_mean_cosine_ndcg@10 = 0.4985
Custom dataset scores:
- hotchpotch/NanoCodeSearchNet | NanoCodeSearchNet_mean_cosine_ndcg@10 = 0.7381
Multilingual macro mean: 0.5263
Custom macro mean: 0.7381
Group macro mean: 0.6322
"""

print(f"Model: {MODEL_NAME}")
print("Multilingual dataset scores:")
for dataset_id, metric_key, score in multilingual_results:
print(f"- {dataset_id} | {metric_key} = {score:.4f}")
print("Custom dataset scores:")
for dataset_id, metric_key, score in custom_results:
print(f"- {dataset_id} | {metric_key} = {score:.4f}")
print(f"Multilingual macro mean: {multilingual_macro:.4f}")
print(f"Custom macro mean: {custom_macro:.4f}")
print(f"Group macro mean: {group_macro:.4f}")
71 changes: 71 additions & 0 deletions examples/sparse_encoder/evaluation/sparse_nano_multidataset_macro_evaluator.py
@@ -0,0 +1,71 @@
"""Simple sparse multi-dataset Nano macro example.

Run:
    uv run --with datasets python examples/sparse_encoder/evaluation/sparse_nano_multidataset_macro_evaluator.py
"""

import logging

import numpy as np

from sentence_transformers import SparseEncoder
from sentence_transformers.sparse_encoder.evaluation import SparseNanoEvaluator

logging.basicConfig(format="%(message)s", level=logging.INFO)

MODEL_NAME = "sparse-encoder/example-inference-free-splade-distilbert-base-uncased-nq"
MULTILINGUAL_NANOBEIR_DATASET_IDS = [
"sentence-transformers/NanoBEIR-en",
]
CUSTOM_DATASET_IDS = [
"hotchpotch/NanoCodeSearchNet",
]


def evaluate_dataset(model: SparseEncoder, dataset_id: str) -> tuple[str, str, float]:
    evaluator = SparseNanoEvaluator(
        dataset_id=dataset_id,
        dataset_names=None,
        batch_size=32,
        show_progress_bar=False,
    )
    results = evaluator(model)
    if evaluator.primary_metric is None:
        raise ValueError(f"Expected evaluator.primary_metric for dataset_id={dataset_id}")
    return dataset_id, evaluator.primary_metric, float(results[evaluator.primary_metric])


model = SparseEncoder(MODEL_NAME)

multilingual_results = [evaluate_dataset(model, dataset_id) for dataset_id in MULTILINGUAL_NANOBEIR_DATASET_IDS]
custom_results = [evaluate_dataset(model, dataset_id) for dataset_id in CUSTOM_DATASET_IDS]

multilingual_scores = [score for _, _, score in multilingual_results]
custom_scores = [score for _, _, score in custom_results]

multilingual_macro = float(np.mean(multilingual_scores))
custom_macro = float(np.mean(custom_scores))
group_macro = float(np.mean([multilingual_macro, custom_macro]))

"""
Example output (actual run in this repo, to be updated if defaults change):
Model: sparse-encoder/example-inference-free-splade-distilbert-base-uncased-nq
Multilingual dataset scores:
- sentence-transformers/NanoBEIR-en | NanoBEIR-en_mean_dot_ndcg@10 = 0.5205
Custom dataset scores:
- hotchpotch/NanoCodeSearchNet | NanoCodeSearchNet_mean_dot_ndcg@10 = 0.5867
Multilingual macro mean: 0.5205
Custom macro mean: 0.5867
Group macro mean: 0.5536
"""

print(f"Model: {MODEL_NAME}")
print("Multilingual dataset scores:")
for dataset_id, metric_key, score in multilingual_results:
print(f"- {dataset_id} | {metric_key} = {score:.4f}")
print("Custom dataset scores:")
for dataset_id, metric_key, score in custom_results:
print(f"- {dataset_id} | {metric_key} = {score:.4f}")
print(f"Multilingual macro mean: {multilingual_macro:.4f}")
print(f"Custom macro mean: {custom_macro:.4f}")
print(f"Group macro mean: {group_macro:.4f}")
2 changes: 2 additions & 0 deletions sentence_transformers/cross_encoder/evaluation/__init__.py
@@ -12,6 +12,7 @@
    CERerankingEvaluator,
    CESoftmaxAccuracyEvaluator,
)
from .nano_evaluator import CrossEncoderNanoEvaluator
from .nano_beir import CrossEncoderNanoBEIREvaluator
from .reranking import CrossEncoderRerankingEvaluator

@@ -31,6 +32,7 @@
__all__ = [
"CrossEncoderClassificationEvaluator",
"CrossEncoderCorrelationEvaluator",
"CrossEncoderNanoEvaluator",
"CrossEncoderRerankingEvaluator",
"CrossEncoderNanoBEIREvaluator",
# Deprecated:
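The newly exported CrossEncoderNanoEvaluator is not exercised by the example scripts above. A minimal usage sketch, assuming its constructor mirrors the generic NanoEvaluator plus the rerank_k/at_k options of CrossEncoderNanoBEIREvaluator shown earlier (argument names are assumptions, not confirmed by this diff):

from sentence_transformers import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CrossEncoderNanoEvaluator

model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L6-v2")
# Constructor arguments assumed from the NanoEvaluator examples in this PR.
evaluator = CrossEncoderNanoEvaluator(
    dataset_id="hotchpotch/NanoMIRACL",
    dataset_names=["en", "ja"],
    rerank_k=100,
    at_k=10,
    batch_size=32,
)
results = evaluator(model)
print(evaluator.primary_metric, results[evaluator.primary_metric])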