diff --git a/README.md b/README.md
index 5deeb16..41e4df4 100644
--- a/README.md
+++ b/README.md
@@ -60,6 +60,7 @@ results = workrb.evaluate(  # Returns BenchmarkResults (Pydantic model)
     model,
     tasks,
     output_folder="results/my_model",
+    save_rankings=False,  # Optional: store full per-target score arrays for ranking tasks
 )
 print(results)  # Benchmark/Per-task/Per-language metrics
 ```
@@ -208,6 +209,20 @@ results/my_model/
 └── config.yaml              # Final benchmark configuration dump
 ```
 
+If you pass `save_rankings=True` to `evaluate`, WorkRB also writes per-task,
+per-dataset ranking score artifacts under a model-scoped subdirectory:
+
+```
+results/my_model/
+└── rankings/
+    └── <model_name>/
+        └── <task_name>__<dataset_id>.json
+```
+
+Each JSON contains `model_name`, `task_name`, `dataset_id`, `num_queries`,
+`num_targets`, and `scores_by_target` (a mapping from target text to its
+scores across all queries).
+
 To load & parse results from a run:
 
 ```python
diff --git a/src/workrb/config.py b/src/workrb/config.py
index 14daf6a..4d6df53 100644
--- a/src/workrb/config.py
+++ b/src/workrb/config.py
@@ -7,6 +7,7 @@
 
 import json
 import logging
+import re
 import time
 from collections.abc import Sequence
 from dataclasses import asdict, dataclass, field
@@ -134,6 +135,47 @@ def get_results_path(self) -> Path:
         """Get the path where final results should be saved."""
         return self.get_output_path() / "results.json"
 
+    def get_rankings_dir(self) -> Path:
+        """Get the directory where per-dataset ranking artifacts are saved.
+
+        Rankings are nested under a sanitized model-name directory so that
+        running multiple models into the same ``output_folder`` cannot clobber
+        each other's ranking files.
+        """
+        safe_model_name = re.sub(r"[^A-Za-z0-9_.-]+", "_", self.model_name).strip("_")
+        return self.get_output_path() / "rankings" / safe_model_name
+
+    def get_task_rankings_path(self, task_name: str, dataset_id: str) -> Path:
+        """Get the output path for one task/dataset ranking artifact."""
+        safe_task_name = re.sub(r"[^A-Za-z0-9_.-]+", "_", task_name).strip("_")
+        safe_dataset_id = re.sub(r"[^A-Za-z0-9_.-]+", "_", dataset_id).strip("_")
+        filename = f"{safe_task_name}__{safe_dataset_id}.json"
+        return self.get_rankings_dir() / filename
+
+    def save_rankings_artifact(
+        self,
+        task_name: str,
+        dataset_id: str,
+        scores_by_target: dict[str, list[float]],
+        num_queries: int,
+        num_targets: int,
+    ) -> Path:
+        """Save full ranking scores for one dataset as a JSON artifact."""
+        rankings_path = self.get_task_rankings_path(task_name=task_name, dataset_id=dataset_id)
+        rankings_path.parent.mkdir(parents=True, exist_ok=True)
+        payload = {
+            "model_name": self.model_name,
+            "task_name": task_name,
+            "dataset_id": dataset_id,
+            "num_queries": num_queries,
+            "num_targets": num_targets,
+            "scores_by_target": scores_by_target,
+        }
+        with open(rankings_path, "w") as f:
+            json.dump(payload, f, indent=2)
+        logger.debug(f"Ranking artifact saved to {rankings_path}")
+        return rankings_path
+
     def has_checkpoint(self) -> bool:
         """Check if a checkpoint exists."""
         return self.get_checkpoint_path().exists()
diff --git a/src/workrb/run.py b/src/workrb/run.py
index aefdeb1..7ca99b1 100644
--- a/src/workrb/run.py
+++ b/src/workrb/run.py
@@ -23,6 +23,7 @@
     TaskResults,
 )
 from workrb.tasks.abstract.base import Task
+from workrb.tasks.abstract.ranking_base import RankingTask
 from workrb.types import ExecutionMode, LanguageAggregationMode, get_language_grouping_key
 
 logger = logging.getLogger(__name__)
@@ -36,6 +37,7 @@ def evaluate(
     metrics: dict[str, list[str]] | None = None,
     description: str = "",
     force_restart: bool = False,
+    save_rankings: bool = False,
     language_aggregation_mode: LanguageAggregationMode = LanguageAggregationMode.MONOLINGUAL_ONLY,
     execution_mode: ExecutionMode = ExecutionMode.LAZY,
 ) -> BenchmarkResults:
@@ -49,6 +51,10 @@ def evaluate(
         metrics: Optional dict mapping task names to custom metrics lists
         description: Description for the benchmark run
         force_restart: If True, ignore checkpoints and restart from beginning
+        save_rankings: If True, save per-target ranking score arrays for each
+            ranking task dataset under
+            ``<output_folder>/rankings/<model_name>/`` as JSON artifacts.
+            Has no effect for non-ranking tasks. Defaults to False.
         language_aggregation_mode: How per-language results should be grouped
             when calling ``get_summary_metrics()`` on the returned results.
             When ``execution_mode`` is ``LAZY``, datasets that are
@@ -111,6 +117,7 @@ def evaluate(
         results=results,
         model=model,
         metrics=metrics,
+        save_rankings=save_rankings,
         total_evaluations=total_evaluations,
     )
     if results.metadata.resumed_from_checkpoint:
@@ -429,6 +436,7 @@ def _run_pending_work(
     results: BenchmarkResults,
     model: ModelInterface,
     metrics: dict[str, list[str]] | None,
+    save_rankings: bool,
     total_evaluations: int,
 ) -> BenchmarkResults:
     """Run pending evaluations.
@@ -439,6 +447,7 @@ def _run_pending_work(
         results: BenchmarkResults object to store results.
         model: ModelInterface object to evaluate.
         metrics: Dictionary of task names to their custom metrics.
+        save_rankings: If True, save full ranking score artifacts for ranking tasks.
        total_evaluations: Total number of compatible evaluations (for progress display).
     """
     # Run pending evaluations
@@ -476,9 +485,30 @@
 
         try:
             start_time_eval = time.time()
-            dataset_results: dict[str, float] = task.evaluate(
-                model=model, metrics=task_metrics, dataset_id=dataset_id
-            )
+            if save_rankings and isinstance(task, RankingTask):
+                prediction_matrix = task.compute_prediction_matrix(
+                    model=model, dataset_id=dataset_id
+                )
+                dataset_results = task.compute_metrics_from_prediction_matrix(
+                    prediction_matrix=prediction_matrix,
+                    dataset_id=dataset_id,
+                    metrics=task_metrics,
+                )
+                rankings_path = config.save_rankings_artifact(
+                    task_name=task.name,
+                    dataset_id=dataset_id,
+                    scores_by_target=_build_scores_by_target(
+                        target_space=task.datasets[dataset_id].target_space,
+                        prediction_matrix=prediction_matrix,
+                    ),
+                    num_queries=prediction_matrix.shape[0],
+                    num_targets=prediction_matrix.shape[1],
+                )
+                logger.info(f"\tSaved ranking scores to: {rankings_path}")
+            else:
+                dataset_results: dict[str, float] = task.evaluate(
+                    model=model, metrics=task_metrics, dataset_id=dataset_id
+                )
             evaluation_time = time.time() - start_time_eval
 
             # Store results
@@ -508,3 +538,14 @@
         logger.info(f"Completed {run_idx} / {total_evaluations} evaluations. 
") return results + + +def _build_scores_by_target( + target_space: list[str], + prediction_matrix: Any, +) -> dict[str, list[float]]: + """Build a mapping from target text to its scores across all queries.""" + return { + target_text: prediction_matrix[:, idx].tolist() + for idx, target_text in enumerate(target_space) + } diff --git a/src/workrb/tasks/abstract/ranking_base.py b/src/workrb/tasks/abstract/ranking_base.py index ed99200..efc88ba 100644 --- a/src/workrb/tasks/abstract/ranking_base.py +++ b/src/workrb/tasks/abstract/ranking_base.py @@ -8,6 +8,7 @@ from enum import Enum from typing import TYPE_CHECKING +import numpy as np import torch from workrb.metrics.ranking import calculate_ranking_metrics @@ -306,30 +307,42 @@ def evaluate( dict[str, float] Dictionary containing metric scores and evaluation metadata. """ - if metrics is None: - metrics = self.default_metrics + prediction_matrix = self.compute_prediction_matrix(model=model, dataset_id=dataset_id) + return self.compute_metrics_from_prediction_matrix( + prediction_matrix=prediction_matrix, + dataset_id=dataset_id, + metrics=metrics, + ) - # Retrieve dataset by ID + def compute_prediction_matrix( + self, + model: ModelInterface, + dataset_id: str = "en", + ) -> np.ndarray: + """Compute the ranking score matrix for a dataset.""" dataset = self.datasets[dataset_id] - queries = dataset.query_texts - targets = dataset.target_space - labels = dataset.target_indices - - # Get model predictions (similarity matrix) prediction_matrix = model.compute_rankings( - queries=queries, - targets=targets, + queries=dataset.query_texts, + targets=dataset.target_space, query_input_type=self.query_input_type, target_input_type=self.target_input_type, ) - - # Convert to numpy if needed if isinstance(prediction_matrix, torch.Tensor): prediction_matrix = prediction_matrix.cpu().float().numpy() + return prediction_matrix - # Calculate metrics - metric_results = calculate_ranking_metrics( - prediction_matrix=prediction_matrix, pos_label_idxs=labels, metrics=metrics + def compute_metrics_from_prediction_matrix( + self, + prediction_matrix: np.ndarray, + dataset_id: str = "en", + metrics: list[str] | None = None, + ) -> dict[str, float]: + """Compute ranking metrics from a precomputed prediction matrix.""" + if metrics is None: + metrics = self.default_metrics + dataset = self.datasets[dataset_id] + return calculate_ranking_metrics( + prediction_matrix=prediction_matrix, + pos_label_idxs=dataset.target_indices, + metrics=metrics, ) - - return metric_results diff --git a/tests/test_save_rankings_artifacts.py b/tests/test_save_rankings_artifacts.py new file mode 100644 index 0000000..74c1fb0 --- /dev/null +++ b/tests/test_save_rankings_artifacts.py @@ -0,0 +1,156 @@ +"""Tests for ranking artifact persistence in evaluate(save_rankings=...).""" + +import json +import shutil +from pathlib import Path + +import pytest +import torch + +import workrb +from workrb.models.base import ModelInterface +from workrb.tasks.abstract.base import DatasetSplit, LabelType, Language +from workrb.tasks.abstract.ranking_base import RankingDataset, RankingTask, RankingTaskGroup +from workrb.types import ModelInputType + + +class TinyRankingTask(RankingTask): + """Minimal ranking task used to test ranking artifact persistence.""" + + @property + def name(self) -> str: + return "Tiny Ranking Task" + + @property + def description(self) -> str: + return "Tiny in-memory ranking task for tests." 
+ + @property + def supported_query_languages(self) -> list[Language]: + return [Language.EN] + + @property + def supported_target_languages(self) -> list[Language]: + return [Language.EN] + + @property + def task_group(self) -> RankingTaskGroup: + return RankingTaskGroup.SKILL_EXTRACTION + + @property + def label_type(self) -> LabelType: + return LabelType.MULTI_LABEL + + @property + def query_input_type(self) -> ModelInputType: + return ModelInputType.SKILL_SENTENCE + + @property + def target_input_type(self) -> ModelInputType: + return ModelInputType.SKILL_NAME + + def load_dataset(self, dataset_id: str, split: DatasetSplit) -> RankingDataset: + return RankingDataset( + query_texts=["query one", "query two"], + target_indices=[[0], [1]], + target_space=["target_a", "target_b", "target_c"], + dataset_id=dataset_id, + ) + + +class TinyDeterministicModel(ModelInterface): + """Deterministic model with fixed ranking scores for tests.""" + + @property + def name(self) -> str: + return "tiny-deterministic-model" + + @property + def description(self) -> str: + return "Tiny deterministic model for ranking artifact tests." + + def _compute_rankings( + self, + queries: list[str], + targets: list[str], + query_input_type: ModelInputType, + target_input_type: ModelInputType, + ) -> torch.Tensor: + # 2 queries x 3 targets + return torch.tensor( + [ + [0.1, 0.2, 0.3], + [0.4, 0.5, 0.6], + ], + dtype=torch.float32, + ) + + def _compute_classification( + self, + texts: list[str], + targets: list[str], + input_type: ModelInputType, + target_input_type: ModelInputType | None = None, + ) -> torch.Tensor: + return torch.zeros((len(texts), len(targets))) + + @property + def classification_label_space(self) -> list[str] | None: + return None + + +def test_evaluate_saves_rankings_artifact_when_enabled(): + """evaluate(save_rankings=True) writes one JSON artifact per ranking dataset with full per-target scores.""" + output_folder = Path("tmp/rankings_artifact_test_enabled") + if output_folder.exists(): + shutil.rmtree(output_folder, ignore_errors=True) + + model = TinyDeterministicModel() + tasks = [TinyRankingTask(split=DatasetSplit.TEST, languages=[Language.EN])] + + _ = workrb.evaluate( + model=model, + tasks=tasks, + output_folder=str(output_folder), + force_restart=True, + save_rankings=True, + ) + + rankings_dir = output_folder / "rankings" / model.name + ranking_files = list(rankings_dir.glob("*.json")) + assert len(ranking_files) == 1 + + with open(ranking_files[0]) as f: + payload = json.load(f) + + assert payload["model_name"] == model.name + assert payload["task_name"] == "Tiny Ranking Task" + assert payload["dataset_id"] == "en" + assert payload["num_queries"] == 2 + assert payload["num_targets"] == 3 + + scores_by_target = payload["scores_by_target"] + assert set(scores_by_target.keys()) == {"target_a", "target_b", "target_c"} + assert scores_by_target["target_a"] == pytest.approx([0.1, 0.4]) + assert scores_by_target["target_b"] == pytest.approx([0.2, 0.5]) + assert scores_by_target["target_c"] == pytest.approx([0.3, 0.6]) + + +def test_evaluate_does_not_save_rankings_artifact_by_default(): + """evaluate() without save_rankings does not create a rankings/ directory.""" + output_folder = Path("tmp/rankings_artifact_test_disabled") + if output_folder.exists(): + shutil.rmtree(output_folder, ignore_errors=True) + + model = TinyDeterministicModel() + tasks = [TinyRankingTask(split=DatasetSplit.TEST, languages=[Language.EN])] + + _ = workrb.evaluate( + model=model, + tasks=tasks, + 
output_folder=str(output_folder), + force_restart=True, + ) + + rankings_dir = output_folder / "rankings" + assert not rankings_dir.exists()
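
The artifacts written by `save_rankings=True` are plain JSON, so they can be inspected without WorkRB. Below is a minimal sketch, not part of the patch: the artifact path is hypothetical and follows the `<output_folder>/rankings/<model_name>/<task_name>__<dataset_id>.json` layout described in the README change, and the payload keys are those written by `save_rankings_artifact` above.

```python
import json
from pathlib import Path

# Hypothetical artifact path; the real file name is the sanitized task name
# and dataset id, e.g. "Tiny_Ranking_Task__en.json" in the test above.
artifact_path = Path("results/my_model/rankings/my-model/My_Task__en.json")

payload = json.loads(artifact_path.read_text())

# scores_by_target maps each target text to one score per query,
# so item[1][q] is that target's score for query q.
query_idx = 0
ranked_targets = sorted(
    payload["scores_by_target"].items(),
    key=lambda item: item[1][query_idx],
    reverse=True,
)
print(f"Top target for query {query_idx}: {ranked_targets[0][0]}")
```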