
Commit ba406e2

Introducing prompt benchmarking (#497)

* Add base support for benchmarking models with config
* bugFix: config normalization
* TypoFix: add 's' to configuration
* add display name in get_scores
* add tests for prompt benchmark
* uncomment first benchmark test
1 parent ea51442 commit ba406e2

File tree: 5 files changed, +96 -16 lines

* aixplain/factories/benchmark_factory.py
* aixplain/modules/benchmark_job.py
* aixplain/modules/model/__init__.py
* tests/functional/benchmark/benchmark_functional_test.py
* tests/functional/benchmark/data/benchmark_test_with_parameters.json
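Taken together, these changes let the same model be benchmarked under several prompt configurations. A rough end-to-end sketch, mirroring the new functional test below (the factory import path follows the existing SDK tests; every ID and name here is a placeholder):

from aixplain.factories import BenchmarkFactory, DatasetFactory, MetricFactory, ModelFactory

# Fetch the same LLM twice and attach a different prompt and display name to each copy.
model_hi = ModelFactory.get("<llm_model_id>")  # placeholder ID
model_hi.add_additional_info_for_benchmark(
    display_name="EnHi LLM",
    configuration={"prompt": "Translate the following text into Hindi."},
)

model_es = ModelFactory.get("<llm_model_id>")  # placeholder ID
model_es.add_additional_info_for_benchmark(
    display_name="EnEs LLM",
    configuration={"prompt": "Translate the following text into Spanish."},
)

dataset_list = [DatasetFactory.list(query="<benchmark dataset name>")["results"][0]]
metric_list = [MetricFactory.get("<metric_id>")]

# Both prompt variants run side by side and are reported under their display names.
benchmark = BenchmarkFactory.create("Prompt benchmark", dataset_list, [model_hi, model_es], metric_list)
benchmark_job = benchmark.start()
scores = benchmark_job.get_scores()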

aixplain/factories/benchmark_factory.py

Lines changed: 33 additions & 5 deletions
@@ -22,7 +22,7 @@
 """

 import logging
-from typing import Dict, List, Text
+from typing import Dict, List, Text, Any, Tuple
 import json
 from aixplain.enums.supplier import Supplier
 from aixplain.modules import Dataset, Metric, Model
@@ -150,9 +150,9 @@ def _validate_create_benchmark_payload(cls, payload):
         if len(payload["datasets"]) != 1:
             raise Exception("Please use exactly one dataset")
         if len(payload["metrics"]) == 0:
-            raise Exception("Please use exactly one metric")
-        if len(payload["model"]) == 0:
-            raise Exception("Please use exactly one model")
+            raise Exception("Please use at least one metric")
+        if len(payload["model"]) == 0 and payload.get("models", None) is None:
+            raise Exception("Please use at least one model")
         clean_metrics_info = {}
         for metric_info in payload["metrics"]:
             metric_id = metric_info["id"]
@@ -167,6 +167,31 @@ def _validate_create_benchmark_payload(cls, payload):
             {"id": metric_id, "configurations": metric_config} for metric_id, metric_config in clean_metrics_info.items()
         ]
         return payload
+
+    @classmethod
+    def _reformat_model_list(cls, model_list: List[Model]) -> Tuple[List[Any], List[Any]]:
+        """Reformat the model list to be used in the create benchmark API
+
+        Args:
+            model_list (List[Model]): List of models to be used in the benchmark
+
+        Returns:
+            Tuple[List[Any], List[Any]]: Reformatted model lists
+
+        """
+        model_list_without_parms, model_list_with_parms = [], []
+        for model in model_list:
+            if "displayName" in model.additional_info:
+                model_list_with_parms.append({"id": model.id, "displayName": model.additional_info["displayName"], "configurations": json.dumps(model.additional_info["configuration"])})
+            else:
+                model_list_without_parms.append(model.id)
+        if len(model_list_with_parms) > 0:
+            if len(model_list_without_parms) > 0:
+                raise Exception("Please provide addditional info for all models or for none of the models")
+        else:
+            model_list_with_parms = None
+        return model_list_without_parms, model_list_with_parms

     @classmethod
     def create(cls, name: str, dataset_list: List[Dataset], model_list: List[Model], metric_list: List[Metric]) -> Benchmark:
@@ -186,15 +211,18 @@ def create(cls, name: str, dataset_list: List[Dataset], model_list: List[Model],
         try:
             url = urljoin(cls.backend_url, "sdk/benchmarks")
             headers = {"Authorization": f"Token {config.TEAM_API_KEY}", "Content-Type": "application/json"}
+            model_list_without_parms, model_list_with_parms = cls._reformat_model_list(model_list)
             payload = {
                 "name": name,
                 "datasets": [dataset.id for dataset in dataset_list],
-                "model": [model.id for model in model_list],
                 "metrics": [{"id": metric.id, "configurations": metric.normalization_options} for metric in metric_list],
+                "model": model_list_without_parms,
                 "shapScores": [],
                 "humanEvaluationReport": False,
                 "automodeTraining": False,
             }
+            if model_list_with_parms is not None:
+                payload["models"] = model_list_with_parms
             clean_payload = cls._validate_create_benchmark_payload(payload)
             payload = json.dumps(clean_payload)
             r = _request_with_retry("post", url, headers=headers, data=payload)
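Restated with plain dictionaries, the split performed by _reformat_model_list looks roughly like this (a sketch only; the real method walks Model objects and their additional_info, and the hypothetical split_models helper below is not part of the SDK):

import json

def split_models(models: list) -> tuple:
    """Sketch of the split: plain IDs go to payload["model"], configured entries to payload["models"]."""
    without_params, with_params = [], []
    for m in models:  # each m stands in for a Model: {"id": ..., "additional_info": {...}}
        info = m.get("additional_info", {})
        if "displayName" in info:
            with_params.append({
                "id": m["id"],
                "displayName": info["displayName"],
                "configurations": json.dumps(info["configuration"]),
            })
        else:
            without_params.append(m["id"])
    # Mixing configured and unconfigured models in one call is rejected.
    if with_params and without_params:
        raise Exception("Please provide additional info for all models or for none of the models")
    return without_params, with_params or None

With only configured models the first list is empty and create() sends the entries under the "models" key; with only plain models the second value is None and the payload keeps using "model".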

aixplain/modules/benchmark_job.py

Lines changed: 5 additions & 0 deletions
@@ -3,6 +3,7 @@
 from aixplain.utils import config
 from urllib.parse import urljoin
 import pandas as pd
+import json
 from pathlib import Path
 from aixplain.utils.request_utils import _request_with_retry
 from aixplain.utils.file_utils import save_file
@@ -109,6 +110,10 @@ def get_scores(self, return_simplified=True, return_as_dataframe=True):
         scores = {}
         for iteration_info in iterations:
             model_id = iteration_info["pipeline"]
+            pipeline_json = json.loads(iteration_info["pipelineJson"])
+            if "benchmark" in pipeline_json:
+                model_id = pipeline_json["benchmark"]["displayName"]
+
             model_info = {
                 "creditsUsed": round(iteration_info.get("credits", 0), 5),
                 "timeSpent": round(iteration_info.get("runtime", 0), 2),

aixplain/modules/model/__init__.py

Lines changed: 11 additions & 0 deletions
@@ -432,6 +432,16 @@ def delete(self) -> None:
             message = "Model Deletion Error: Make sure the model exists and you are the owner."
             logging.error(message)
             raise Exception(f"{message}")
+
+    def add_additional_info_for_benchmark(self, display_name: str, configuration: Dict) -> None:
+        """Add additional info for benchmark
+
+        Args:
+            display_name (str): display name of the model
+            configuration (Dict): configuration of the model
+        """
+        self.additional_info["displayName"] = display_name
+        self.additional_info["configuration"] = configuration

     @classmethod
     def from_dict(cls, data: Dict) -> "Model":
@@ -451,3 +461,4 @@ def from_dict(cls, data: Dict) -> "Model":
             model_params=data.get("model_params"),
             **data.get("additional_info", {}),
         )
+
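A hypothetical call to the new helper, showing what lands in additional_info and is later picked up by BenchmarkFactory._reformat_model_list (the model ID is a placeholder):

from aixplain.factories import ModelFactory

model = ModelFactory.get("<llm_model_id>")  # placeholder ID
model.add_additional_info_for_benchmark(
    display_name="EnEs LLM",
    configuration={"prompt": "Translate the following text into Spanish."},
)

# additional_info now carries the two keys the benchmark payload builder looks for:
assert model.additional_info["displayName"] == "EnEs LLM"
assert model.additional_info["configuration"] == {"prompt": "Translate the following text into Spanish."}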

tests/functional/benchmark/benchmark_functional_test.py

Lines changed: 25 additions & 11 deletions
@@ -11,9 +11,7 @@
 from pathlib import Path

 import pytest
-
 import logging
-
 from aixplain import aixplain_v2 as v2

 logger = logging.getLogger()
@@ -22,6 +20,7 @@
 TIMEOUT = 60 * 30
 RUN_FILE = str(Path(r"tests/functional/benchmark/data/benchmark_test_run_data.json"))
 MODULE_FILE = str(Path(r"tests/functional/benchmark/data/benchmark_module_test_data.json"))
+RUN_WITH_PARAMETERS_FILE = str(Path(r"tests/functional/benchmark/data/benchmark_test_with_parameters.json"))


 def read_data(data_path):
@@ -33,6 +32,11 @@ def run_input_map(request):
     return request.param


+@pytest.fixture(scope="module", params=[(name, params) for name, params in read_data(RUN_WITH_PARAMETERS_FILE).items()])
+def run_with_parameters_input_map(request):
+    return request.param
+
+
 @pytest.fixture(scope="module", params=read_data(MODULE_FILE))
 def module_input_map(request):
     return request.param
@@ -79,12 +83,22 @@ def test_create_and_run(run_input_map, BenchmarkFactory):
     assert_correct_results(benchmark_job)


-# def test_module(module_input_map):
-#     benchmark = BenchmarkFactory.get(module_input_map["benchmark_id"])
-#     assert benchmark.id == module_input_map["benchmark_id"]
-#     benchmark_job = benchmark.job_list[0]
-#     assert benchmark_job.benchmark_id == module_input_map["benchmark_id"]
-#     job_status = benchmark_job.check_status()
-#     assert job_status in ["in_progress", "completed"]
-#     df = benchmark_job.download_results_as_csv(return_dataframe=True)
-#     assert type(df) is pd.DataFrame
+@pytest.mark.parametrize("BenchmarkFactory", [BenchmarkFactory, v2.Benchmark])
+def test_create_and_run_with_parameters(run_with_parameters_input_map, BenchmarkFactory):
+    name, params = run_with_parameters_input_map
+    model_list = []
+    for model_info in params["models_with_parameters"]:
+        model = ModelFactory.get(model_info["model_id"])
+        model.add_additional_info_for_benchmark(display_name=model_info["display_name"], configuration=model_info["configuration"])
+        model_list.append(model)
+    dataset_list = [DatasetFactory.list(query=dataset_name)["results"][0] for dataset_name in params["dataset_names"]]
+    metric_list = [MetricFactory.get(metric_id) for metric_id in params["metric_ids"]]
+    benchmark = BenchmarkFactory.create(f"SDK Benchmark Test With Parameters({name}) {uuid.uuid4()}", dataset_list, model_list, metric_list)
+    assert type(benchmark) is Benchmark, "Couldn't create benchmark"
+    benchmark_job = benchmark.start()
+    assert type(benchmark_job) is BenchmarkJob, "Couldn't start job"
+    assert is_job_finshed(benchmark_job), "Job did not finish in time"
+    assert_correct_results(benchmark_job)
+
+
+
tests/functional/benchmark/data/benchmark_test_with_parameters.json

Lines changed: 22 additions & 0 deletions

@@ -0,0 +1,22 @@
+{
+    "Translation With LLMs": {
+        "models_with_parameters": [
+            {
+                "model_id": "669a63646eb56306647e1091",
+                "display_name": "EnHi LLM",
+                "configuration": {
+                    "prompt": "Translate the following text into Hindi."
+                }
+            },
+            {
+                "model_id": "669a63646eb56306647e1091",
+                "display_name": "EnEs LLM",
+                "configuration": {
+                    "prompt": "Translate the following text into Spanish."
+                }
+            }
+        ],
+        "dataset_names": ["EnHi SDK Test - Benchmark Dataset"],
+        "metric_ids": ["639874ab506c987b1ae1acc6", "6408942f166427039206d71e"]
+    }
+}
