trying to fix PERFORMANCE: use a github repository as a replacement for the gone HF 'lmsys/arena-hard-browser' (#1757)

dafnapension · web-flow · commit 3472805f3c07 · 2025-04-29T19:40:41.000+03:00
* a handy replacement for the gone HF 'lmsys/arena-hard-browser'

Signed-off-by: dafnapension <dafnashein@yahoo.com>

* specify commit point for the data from github

Signed-off-by: dafnapension <dafnashein@yahoo.com>

---------

Signed-off-by: dafnapension <dafnashein@yahoo.com>
diff --git a/prepare/cards/arena_hard/generation/english_gpt-4-0314_reference.py b/prepare/cards/arena_hard/generation/english_gpt-4-0314_reference.py
@@ -2,7 +2,7 @@
     TaskCard,
 )
 from unitxt.catalog import add_to_catalog
-from unitxt.loaders import LoadFromHFSpace
+from unitxt.loaders import LoadCSV
 from unitxt.operators import (
     Apply,
     Copy,
@@ -13,35 +13,39 @@
 from unitxt.stream_operators import DeleteSplits, JoinStreams
 from unitxt.test_utils.card import test_card
 
+# 'lmsys/arena-hard-browser' (https://huggingface.co/spaces/lmarena-ai/arena-hard-browser) is no longer available on HF.
+# https://paperswithcode.com/dataset/arena-hard points to the GitHub repository https://github.com/lmarena/arena-hard-auto,
+# from which the dataset is now fetched (pinned to a specific commit so the data stays reproducible):
+
+question_git_repo_file_path = "https://raw.githubusercontent.com/lmarena/arena-hard-auto/57451f35d2be7fef9f05d5567f36e4c959bb6630/data/arena-hard-v0.1/question.jsonl"
+model_answer_git_repo_file_path = "https://raw.githubusercontent.com/lmarena/arena-hard-auto/57451f35d2be7fef9f05d5567f36e4c959bb6630/data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl"
+
 card = TaskCard(
-    loader=LoadFromHFSpace(
-        space_name="lmsys/arena-hard-browser",
-        revision="03b91ca",  # May 26, 2024
-        data_files={
-            "questions": "data/arena-hard-v0.1/question.jsonl",
-            "model_answer": "data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl",
-        },
-        data_classification_policy = ["public"],
-    ),
+    loader=LoadCSV(
+        files={"questions": question_git_repo_file_path, "model_answer": model_answer_git_repo_file_path},
+            file_type="json",
+            lines=True,
+            data_classification_policy=["public"],
+        ),
     preprocess_steps=[
         # region Question file
         Rename(field_to_field={"cluster": "group"}, apply_to_streams=["questions"]),
         Copy(
-            field_to_field={"turns/0/content": "model_input"},
+            field_to_field={"prompt": "model_input"},
             apply_to_streams=["questions"],
         ),
         Set(fields={"reference_model": "gpt-4-0314"}, apply_to_streams=["questions"]),
         # endregion
         # region Answers file processing
         Copy(
             field_to_field={
-                "choices/0/turns/0/content": "reference_model_output",
-                "choices/0/turns/0/token_len": "reference_model_output_token_len",
+                "messages/1/content/answer": "reference_model_output"
+                # "choices/0/turns/0/token_len": "reference_model_output_token_len",
             },
             apply_to_streams=["model_answer"],
         ),
         Rename(
-            field_to_field={"model_id": "reference_model"},
+            field_to_field={"model": "reference_model"},
             apply_to_streams=["model_answer"],
         ),
         Apply(
@@ -56,13 +60,13 @@
             left_stream="questions",
             right_stream="model_answer",
             how="inner",
-            on=["question_id", "reference_model"],
+            on=["uid", "reference_model"],
             new_stream_name="test",
         ),
         DeleteSplits(splits=["questions", "model_answer"]),
         SelectFields(
             fields=[
-                "question_id",
+                "uid",
                 "category",
                 "model_input",
                 "reference_model",
@@ -71,6 +75,7 @@
         ),
         Rename(
             field_to_field={
+                "uid": "question_id",
                 "model_input": "input",
                 "category": "group",
                 "reference_model_output": "output",
diff --git a/src/unitxt/catalog/cards/arena_hard/generation/english_gpt_4_0314_reference.json b/src/unitxt/catalog/cards/arena_hard/generation/english_gpt_4_0314_reference.json
@@ -1,13 +1,13 @@
 {
     "__type__": "task_card",
     "loader": {
-        "__type__": "load_from_hf_space",
-        "space_name": "lmsys/arena-hard-browser",
-        "revision": "03b91ca",
-        "data_files": {
-            "questions": "data/arena-hard-v0.1/question.jsonl",
-            "model_answer": "data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl"
+        "__type__": "load_csv",
+        "files": {
+            "questions": "https://raw.githubusercontent.com/lmarena/arena-hard-auto/57451f35d2be7fef9f05d5567f36e4c959bb6630/data/arena-hard-v0.1/question.jsonl",
+            "model_answer": "https://raw.githubusercontent.com/lmarena/arena-hard-auto/57451f35d2be7fef9f05d5567f36e4c959bb6630/data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl"
         },
+        "file_type": "json",
+        "lines": true,
         "data_classification_policy": [
             "public"
         ]
@@ -25,7 +25,7 @@
         {
             "__type__": "copy",
             "field_to_field": {
-                "turns/0/content": "model_input"
+                "prompt": "model_input"
             },
             "apply_to_streams": [
                 "questions"
@@ -43,8 +43,7 @@
         {
             "__type__": "copy",
             "field_to_field": {
-                "choices/0/turns/0/content": "reference_model_output",
-                "choices/0/turns/0/token_len": "reference_model_output_token_len"
+                "messages/1/content/answer": "reference_model_output"
             },
             "apply_to_streams": [
                 "model_answer"
@@ -53,7 +52,7 @@
         {
             "__type__": "rename",
             "field_to_field": {
-                "model_id": "reference_model"
+                "model": "reference_model"
             },
             "apply_to_streams": [
                 "model_answer"
@@ -76,7 +75,7 @@
             "right_stream": "model_answer",
             "how": "inner",
             "on": [
-                "question_id",
+                "uid",
                 "reference_model"
             ],
             "new_stream_name": "test"
@@ -91,7 +90,7 @@
         {
             "__type__": "select_fields",
             "fields": [
-                "question_id",
+                "uid",
                 "category",
                 "model_input",
                 "reference_model",
@@ -101,6 +100,7 @@
         {
             "__type__": "rename",
             "field_to_field": {
+                "uid": "question_id",
                 "model_input": "input",
                 "category": "group",
                 "reference_model_output": "output"