Fix erroneous prompts in evaluation tasks (and clean up some JSON-schema issues) (#1920)

dafnapension · elronbandel · web-flow · commit 6aff8ad6cf30 · 2025-08-24T21:08:01.000+03:00
* cast None to str, to comply with json schema

Signed-off-by: dafnapension <dafnashein@yahoo.com>

* fix template and tasks of evaluation

Signed-off-by: dafnapension <dafnashein@yahoo.com>

---------

Signed-off-by: dafnapension <dafnashein@yahoo.com>
Co-authored-by: Elron Bandel <elronbandel@gmail.com>
diff --git a/prepare/cards/coedit.py b/prepare/cards/coedit.py
@@ -106,8 +106,8 @@
         Shuffle(page_size=sys.maxsize),
         "splitters.small_no_test",
         Split(field="src", by=": "),
+        Copy(field="src/0", to_field="instance_instruction"),
         Slice(field="src", start=1),
-        Copy(field="src/0", to_field="instruction"),
         Join(field="src", by=": "),
         ListFieldValues(
             fields=["tgt", "src"],
diff --git a/prepare/cards/hh_rlhf.py b/prepare/cards/hh_rlhf.py
@@ -29,7 +29,7 @@
             fields={
                 "input_type": "dialog",
                 "output_type": "response",
-                "instruction": "Respond the following dialog in an helpful and harmfull way.",
+                "instance_instruction": "Respond the following dialog in an helpful and harmless way.",
             }
         ),
         IndexOf(
diff --git a/prepare/cards/summarize_from_human_feedback.py b/prepare/cards/summarize_from_human_feedback.py
@@ -6,6 +6,7 @@
     TaskCard,
 )
 from unitxt.catalog import add_to_catalog
+from unitxt.operators import Cast
 from unitxt.test_utils.card import test_card
 
 card = TaskCard(
@@ -24,12 +25,13 @@
                 "summaries/*/text": "choices",
             }
         ),
+        Cast(field="input", to="str"),
         Rename(field_to_field={"choice": "output_choice"}),
         Set(
             fields={
                 "input_type": "post",
                 "output_type": "summary",
-                "instruction": "Summarize the following post",
+                "instance_instruction": "Summarize the following post",
             }
         ),
     ],
diff --git a/prepare/tasks/evaluation.py b/prepare/tasks/evaluation.py
@@ -10,7 +10,7 @@
             "input_type": str,
             "output_type": str,
             "choices": List[str],
-            "instruction": str,
+            "instance_instruction": str,
         },
         reference_fields={
             "choices": List[str],
@@ -19,7 +19,7 @@
         metrics=[
             "metrics.accuracy",
         ],
-        augmentable_inputs=["input", "instruction"],
+        augmentable_inputs=["input", "instance_instruction"],
     ),
     "tasks.evaluation.preference",
     overwrite=True,
diff --git a/prepare/templates/evaluation/preference.py b/prepare/templates/evaluation/preference.py
@@ -4,7 +4,7 @@
 add_to_catalog(
     MultipleChoiceTemplate(
         instruction="For any Instruction and {input_type} given to a model, assess which {output_type} written by the model aligns most closely with the given instruction (choose from {numerals}).",
-        input_format="Instruction:\n{instruction}\n{input_type}:\n{input}\nResponses:\n{choices}",
+        input_format="Instruction:\n{instance_instruction}\n{input_type}:\n{input}\nResponses:\n{choices}",
         target_prefix="{output_type}:\n",
         target_field="output_choice",
         choices_separator="\n",
diff --git a/src/unitxt/catalog/cards/coedit/preference.json b/src/unitxt/catalog/cards/coedit/preference.json
@@ -16,16 +16,16 @@
             "field": "src",
             "by": ": "
         },
+        {
+            "__type__": "copy",
+            "field": "src/0",
+            "to_field": "instance_instruction"
+        },
         {
             "__type__": "slice",
             "field": "src",
             "start": 1
         },
-        {
-            "__type__": "copy",
-            "field": "src/0",
-            "to_field": "instruction"
-        },
         {
             "__type__": "join",
             "field": "src",
diff --git a/src/unitxt/catalog/cards/hh_rlhf.json b/src/unitxt/catalog/cards/hh_rlhf.json
@@ -66,7 +66,7 @@
             "fields": {
                 "input_type": "dialog",
                 "output_type": "response",
-                "instruction": "Respond the following dialog in an helpful and harmfull way."
+                "instance_instruction": "Respond the following dialog in an helpful and harmless way."
             }
         },
         {
diff --git a/src/unitxt/catalog/cards/summarize_from_human_feedback.json b/src/unitxt/catalog/cards/summarize_from_human_feedback.json
@@ -22,6 +22,11 @@
                 "summaries/*/text": "choices"
             }
         },
+        {
+            "__type__": "cast",
+            "field": "input",
+            "to": "str"
+        },
         {
             "__type__": "rename",
             "field_to_field": {
@@ -33,7 +38,7 @@
             "fields": {
                 "input_type": "post",
                 "output_type": "summary",
-                "instruction": "Summarize the following post"
+                "instance_instruction": "Summarize the following post"
             }
         }
     ],
diff --git a/src/unitxt/catalog/tasks/evaluation/preference.json b/src/unitxt/catalog/tasks/evaluation/preference.json
@@ -5,7 +5,7 @@
         "input_type": "str",
         "output_type": "str",
         "choices": "List[str]",
-        "instruction": "str"
+        "instance_instruction": "str"
     },
     "reference_fields": {
         "choices": "List[str]",
@@ -16,6 +16,6 @@
     ],
     "augmentable_inputs": [
         "input",
-        "instruction"
+        "instance_instruction"
     ]
 }
diff --git a/src/unitxt/catalog/templates/evaluation/preference/default.json b/src/unitxt/catalog/templates/evaluation/preference/default.json
@@ -1,7 +1,7 @@
 {
     "__type__": "multiple_choice_template",
     "instruction": "For any Instruction and {input_type} given to a model, assess which {output_type} written by the model aligns most closely with the given instruction (choose from {numerals}).",
-    "input_format": "Instruction:\n{instruction}\n{input_type}:\n{input}\nResponses:\n{choices}",
+    "input_format": "Instruction:\n{instance_instruction}\n{input_type}:\n{input}\nResponses:\n{choices}",
     "target_prefix": "{output_type}:\n",
     "target_field": "output_choice",
     "choices_separator": "\n",

Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,7 @@`
`29`	`29`	`fields={`
`30`	`30`	`"input_type": "dialog",`
`31`	`31`	`"output_type": "response",`
`32`		`- "instruction": "Respond the following dialog in an helpful and harmfull way.",`
	`32`	`+ "instance_instruction": "Respond the following dialog in an helpful and harmless way.",`
`33`	`33`	`}`
`34`	`34`	`),`
`35`	`35`	`IndexOf(`
Original file line number	Diff line number	Diff line change
`@@ -66,7 +66,7 @@`
`66`	`66`	`"fields": {`
`67`	`67`	`"input_type": "dialog",`
`68`	`68`	`"output_type": "response",`
`69`		`- "instruction": "Respond the following dialog in an helpful and harmfull way."`
	`69`	`+ "instance_instruction": "Respond the following dialog in an helpful and harmless way."`
`70`	`70`	`}`
`71`	`71`	`},`
`72`	`72`	`{`
Original file line number	Diff line number	Diff line change
`@@ -22,6 +22,11 @@`
`22`	`22`	`"summaries/*/text": "choices"`
`23`	`23`	`}`
`24`	`24`	`},`
	`25`	`+ {`
	`26`	`+ "__type__": "cast",`
	`27`	`+ "field": "input",`
	`28`	`+ "to": "str"`
	`29`	`+ },`
`25`	`30`	`{`
`26`	`31`	`"__type__": "rename",`
`27`	`32`	`"field_to_field": {`
`@@ -33,7 +38,7 @@`
`33`	`38`	`"fields": {`
`34`	`39`	`"input_type": "post",`
`35`	`40`	`"output_type": "summary",`
`36`		`- "instruction": "Summarize the following post"`
	`41`	`+ "instance_instruction": "Summarize the following post"`
`37`	`42`	`}`
`38`	`43`	`}`
`39`	`44`	`],`
Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,7 @@`
`1`	`1`	`{`
`2`	`2`	`"__type__": "multiple_choice_template",`
`3`	`3`	`"instruction": "For any Instruction and {input_type} given to a model, assess which {output_type} written by the model aligns most closely with the given instruction (choose from {numerals}).",`
`4`		`- "input_format": "Instruction:\n{instruction}\n{input_type}:\n{input}\nResponses:\n{choices}",`
	`4`	`+ "input_format": "Instruction:\n{instance_instruction}\n{input_type}:\n{input}\nResponses:\n{choices}",`
`5`	`5`	`"target_prefix": "{output_type}:\n",`
`6`	`6`	`"target_field": "output_choice",`
`7`	`7`	`"choices_separator": "\n",`