Commit fa53229

Add to_markdown() to InstanceScores to pretty print output (#1846)

* Add wrapped to_markdown() method to InstanceScores to pretty print the results
* Updated examples
* Updated example to run with existing models
* Fixed max_tokens renaming in the WatsonX SDK chat engine
* Improved summary printout of InstanceScores
* Used the new summary feature
* Changed the LiteLLM test to use a large model for better and more consistent results
* Updated the InstanceScores summary to ignore columns that do not exist

Signed-off-by: Yoav Katz <[email protected]>

1 parent c5acd23 · commit fa53229
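
For context, the new pretty-printing API reads like this (a minimal sketch; results stands for the EvaluationResults object returned by evaluate() in the examples below, and the column names are illustrative):

    # Render instance-level results as a markdown table; long cells are
    # wrapped to max_col_width characters (30 by default).
    print(
        results.instance_scores.to_markdown(
            columns=["source", "prediction", "processed_prediction", "score"],
            max_col_width=30,
        )
    )

    # The summary property now renders the first few rows the same way.
    print(results.instance_scores.summary)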

File tree

6 files changed: +35 -65 lines changed

examples/evaluate_batched_multiclass_classification.py

Lines changed: 4 additions & 22 deletions

@@ -5,7 +5,6 @@
 from unitxt import get_logger
 from unitxt.api import evaluate, load_dataset
 from unitxt.artifact import fetch_artifact
-from unitxt.formats import SystemFormat
 from unitxt.operators import CollateInstances, Copy, FieldOperator, Rename
 from unitxt.processors import PostProcess
 from unitxt.serializers import MultiTypeSerializer, SingleTypeSerializer
@@ -82,13 +81,11 @@ def serialize(self, value: EnumeratedList, instance: Dict[str, Any]) -> str:

 for provider in [
     "watsonx",
-    "bam",
 ]:
     for model_name in [
-        "granite-3-8b-instruct",
-        "llama-3-8b-instruct",
+        "granite-3-3-8b-instruct",
     ]:
-        batch_sizes = [30, 20, 10, 5, 1]
+        batch_sizes = [100, 50, 10, 5, 1]

         for batch_size in batch_sizes:
             card, _ = fetch_artifact("cards.banking77")
@@ -104,21 +101,6 @@ def serialize(self, value: EnumeratedList, instance: Dict[str, Any]) -> str:
             card.task = task
             card.templates = [template]
             format = "formats.chat_api"
-            if provider == "bam" and model_name.startswith("llama"):
-                format = "formats.llama3_instruct"
-            if provider == "bam" and model_name.startswith("granite"):
-                format = SystemFormat(
-                    demo_format=(
-                        "{instruction}\\N{source}\\N<|end_of_text|>\n"
-                        "<|start_of_role|>assistant<|end_of_role|>{target}\\N<|end_of_text|>\n"
-                        "<|start_of_role|>user<|end_of_role|>"
-                    ),
-                    model_input_format=(
-                        "<|start_of_role|>system<|end_of_role|>{system_prompt}<|end_of_text|>\n"
-                        "<|start_of_role|>user<|end_of_role|>{demos}{instruction}\\N{source}\\N<|end_of_text|>\n"
-                        "<|start_of_role|>assistant<|end_of_role|>"
-                    ),
-                )

             dataset = load_dataset(
                 card=card,
@@ -138,7 +120,7 @@ def serialize(self, value: EnumeratedList, instance: Dict[str, Any]) -> str:
             )
             """
             We are using a CrossProviderInferenceEngine inference engine that supply api access to provider such as:
-            watsonx, bam, openai, azure, aws and more.
+            watsonx, openai, azure, aws and more.

             For the arguments these inference engines can receive, please refer to the classes documentation or read
             about the the open ai api arguments the CrossProviderInferenceEngine follows.
@@ -148,7 +130,7 @@ def serialize(self, value: EnumeratedList, instance: Dict[str, Any]) -> str:
             results = evaluate(predictions=predictions, data=test_dataset)

             print(
-                results.instance_scores.to_df(
+                results.instance_scores.to_markdown(
                     columns=[
                         "source",
                         "prediction",
examples/evaluate_same_datasets_and_models_with_multiple_providers.py

Lines changed: 3 additions & 31 deletions

@@ -1,7 +1,6 @@
 import pandas as pd
 from unitxt.api import evaluate, load_dataset
 from unitxt.artifact import fetch_artifact
-from unitxt.formats import SystemFormat

 df = pd.DataFrame(
     columns=[
@@ -22,29 +21,11 @@
 ]:
     for model_name in [
         "granite-3-8b-instruct",
-        "llama-3-8b-instruct",
+        "llama-3-3-70b-instruct",
     ]:
-        for format_as_chat_api in [True, False]:
-            if format_as_chat_api and provider == "watsonx-sdk":
-                continue
+        for format_as_chat_api in [True]:
             if format_as_chat_api:
                 format = "formats.chat_api"
-            else:
-                if model_name.startswith("llama"):
-                    format = "formats.llama3_instruct"
-                if model_name.startswith("granite"):
-                    format = SystemFormat(
-                        demo_format=(
-                            "{instruction}\\N{source}\\N<|end_of_text|>\n"
-                            "<|start_of_role|>assistant<|end_of_role|>{target}\\N<|end_of_text|>\n"
-                            "<|start_of_role|>user<|end_of_role|>"
-                        ),
-                        model_input_format=(
-                            "<|start_of_role|>system<|end_of_role|>{system_prompt}<|end_of_text|>\n"
-                            "<|start_of_role|>user<|end_of_role|>{demos}{instruction}\\N{source}\\N<|end_of_text|>\n"
-                            "<|start_of_role|>assistant<|end_of_role|>"
-                        ),
-                    )
             card, _ = fetch_artifact("cards.sst2")

             dataset = load_dataset(
@@ -71,16 +52,7 @@
             # result_df = pd.json_normalize(evaluated_dataset)
             # result_df.to_csv(f"output.csv")
             # Print results
-            print(
-                results.instance_scores.to_df(
-                    columns=[
-                        "source",
-                        "prediction",
-                        "processed_prediction",
-                        "processed_references",
-                    ],
-                )
-            )
+            print(results.instance_scores.summary)

             global_scores = results.global_scores
             df.loc[len(df)] = [

examples/ner_evaluation.py

Lines changed: 3 additions & 3 deletions

@@ -41,7 +41,7 @@
 # )
 # Change to this to infer with external APIs:

-model = CrossProviderInferenceEngine(model="llama-3-8b-instruct", provider="watsonx")
+model = CrossProviderInferenceEngine(model="llama-3-3-70b-instruct", provider="watsonx")
 # The provider can be one of: ["watsonx", "together-ai", "open-ai", "aws", "ollama", "bam"]

@@ -57,7 +57,7 @@

 print("Instance Results:")
 print(
-    results.instance_scores.to_df(
+    results.instance_scores.to_markdown(
         columns=[
             "text",
             "prediction",
@@ -66,5 +66,5 @@
             "score",
             "score_name",
         ]
-    ).to_markdown()
+    )
 )

src/unitxt/inference.py

Lines changed: 1 addition & 1 deletion

@@ -3677,7 +3677,7 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):

     _provider_param_renaming = {
         "bam": {"max_tokens": "max_new_tokens", "model": "model_name"},
-        "watsonx-sdk": {"max_tokens": "max_new_tokens", "model": "model_name"},
+        "watsonx-sdk": {"model": "model_name"},
         "rits": {"model": "model_name"},
     }
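
For context, _provider_param_renaming maps the engine's generic, OpenAI-style argument names to each provider's native ones; with this fix, max_tokens is forwarded to watsonx-sdk unchanged instead of being renamed to max_new_tokens. A standalone sketch of how such a map is applied (rename_params is an illustrative helper, not the class's actual method):

    def rename_params(provider, params, renaming):
        # Rename only the keys the provider's map mentions; pass the rest through.
        mapping = renaming.get(provider, {})
        return {mapping.get(key, key): value for key, value in params.items()}

    renaming = {
        "bam": {"max_tokens": "max_new_tokens", "model": "model_name"},
        "watsonx-sdk": {"model": "model_name"},  # max_tokens now passes through as-is
        "rits": {"model": "model_name"},
    }

    print(rename_params("watsonx-sdk", {"model": "granite-3-3-8b-instruct", "max_tokens": 256}, renaming))
    # {'model_name': 'granite-3-3-8b-instruct', 'max_tokens': 256}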

src/unitxt/metric_utils.py

Lines changed: 21 additions & 5 deletions

@@ -1,5 +1,6 @@
 import json
 import re
+import textwrap
 from collections import defaultdict
 from functools import lru_cache
 from statistics import mean
@@ -683,21 +684,36 @@ def to_df(self, flatten=True, columns=None):
             return df[columns]
         return df

+    def _to_markdown(self, df, max_col_width=30, **kwargs):
+        def wrap_column(series, max_width=30):
+            """Wraps string values in a Pandas Series to a maximum width."""
+            return series.apply(lambda x: textwrap.fill(str(x), width=max_width))
+
+        wrapped_df = df.copy()
+        for col in wrapped_df.columns:
+            wrapped_df[col] = wrap_column(wrapped_df[col], max_col_width)
+        return wrapped_df.to_markdown(**kwargs)
+
+    def to_markdown(self, flatten=True, columns=None, max_col_width=30, **kwargs):
+        return self._to_markdown(self.to_df(flatten, columns), max_col_width, **kwargs)
+
     @property
     def summary(self):
-        return to_pretty_string(
+        return self._to_markdown(
             self.to_df()
             .head()
             .drop(
                 columns=[
                     "metadata",
                     "media",
-                    "data_classification_policy",
                     "groups",
                     "subset",
-                ]
-            ),
-            float_format=".2g",
+                    "demos",
+                    "metrics",
+                    "postprocessors",
+                ],
+                errors="ignore",
+            )
         )

     def __repr__(self):
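
The new wrapping logic can be exercised in isolation (a minimal sketch; needs pandas plus the tabulate package that backs DataFrame.to_markdown, and the sample strings are illustrative):

    import textwrap

    import pandas as pd

    df = pd.DataFrame(
        {
            "source": ["Classify the sentiment of the following sentence: the movie was surprisingly good."],
            "prediction": ["positive"],
        }
    )

    # Same approach as _to_markdown: wrap every cell to max_col_width
    # characters before delegating to pandas' markdown renderer.
    max_col_width = 30
    wrapped = df.copy()
    for col in wrapped.columns:
        wrapped[col] = wrapped[col].apply(lambda x: textwrap.fill(str(x), width=max_col_width))
    print(wrapped.to_markdown())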

tests/inference/test_inference_engine.py

Lines changed: 3 additions & 3 deletions

@@ -369,7 +369,7 @@ def test_watsonx_inference_with_images(self):

     def test_lite_llm_inference_engine(self):
         model = LiteLLMInferenceEngine(
-            model="watsonx/meta-llama/llama-3-2-1b-instruct",
+            model="watsonx/meta-llama/llama-3-3-70b-instruct",
             max_tokens=2,
             temperature=0,
             top_p=1,
@@ -379,11 +379,11 @@ def test_lite_llm_inference_engine(self):
         dataset = get_text_dataset(format="formats.chat_api")
         predictions = model(dataset)

-        self.assertListEqual(predictions, ["100", "```\n"])
+        self.assertListEqual(predictions, ["7", "2"])

     def test_lite_llm_inference_engine_without_task_data_not_failing(self):
         LiteLLMInferenceEngine(
-            model="watsonx/meta-llama/llama-3-2-1b-instruct",
+            model="watsonx/meta-llama/llama-3-3-70b-instruct",
             max_tokens=2,
             temperature=0,
             top_p=1,