Add LoadJsonFile loader and tests (#1801)

elronbandel · web-flow · commit 85c07cfe0939 · 2025-05-19T10:40:18.000+03:00
* Add LoadJsonFile loader and tests

Signed-off-by: elronbandel <elronbandel@gmail.com>

* Update artifact

Signed-off-by: elronbandel <elronbandel@gmail.com>

---------

Signed-off-by: elronbandel <elronbandel@gmail.com>
diff --git a/prepare/cards/arena_hard/generation/english_gpt-4-0314_reference.py b/prepare/cards/arena_hard/generation/english_gpt-4-0314_reference.py
@@ -2,7 +2,7 @@
     TaskCard,
 )
 from unitxt.catalog import add_to_catalog
-from unitxt.loaders import LoadCSV
+from unitxt.loaders import LoadJsonFile
 from unitxt.operators import (
     Apply,
     Copy,
@@ -21,12 +21,11 @@
 model_answer_git_repo_file_path = "https://raw.githubusercontent.com/lmarena/arena-hard-auto/57451f35d2be7fef9f05d5567f36e4c959bb6630/data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl"
 
 card = TaskCard(
-    loader=LoadCSV(
+    loader=LoadJsonFile(
         files={"questions": question_git_repo_file_path, "model_answer": model_answer_git_repo_file_path},
-            file_type="json",
-            lines=True,
-            data_classification_policy=["public"],
-        ),
+        lines=True,
+        data_classification_policy=["public"],
+    ),
     preprocess_steps=[
         # region Question file
         Rename(field_to_field={"cluster": "group"}, apply_to_streams=["questions"]),
diff --git a/prepare/cards/bfcl.py b/prepare/cards/bfcl.py
@@ -1,7 +1,7 @@
 import unitxt
 from unitxt.card import TaskCard
 from unitxt.catalog import add_to_catalog
-from unitxt.loaders import LoadCSV
+from unitxt.loaders import LoadJsonFile
 from unitxt.operators import (
     Copy,
     ExecuteExpression,
@@ -14,9 +14,8 @@
 
 with unitxt.settings.context(allow_unverified_code=True):
     card = TaskCard(
-        loader=LoadCSV(
+        loader=LoadJsonFile(
             files={"questions": base_path + "BFCL_v3_simple.json", "answers": base_path + "possible_answer/BFCL_v3_simple.json"},
-            file_type="json",
             lines=True,
             data_classification_policy=["public"],
         ),
diff --git a/prepare/cards/mtrag.py b/prepare/cards/mtrag.py
@@ -5,7 +5,7 @@
     TaskCard,
 )
 from unitxt.collections_operators import Dictify, Wrap
-from unitxt.loaders import LoadCSV
+from unitxt.loaders import LoadJsonFile
 from unitxt.operators import (
     Cast,
     Copy,
@@ -17,11 +17,10 @@
 from unitxt.test_utils.card import test_card
 
 card = TaskCard(
-    loader=LoadCSV(
+    loader=LoadJsonFile(
         files={
             "test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/human/generation_tasks/reference+RAG.jsonl"
         },
-        file_type="json",
         lines=True,
         data_classification_policy=["public"],
     ),
@@ -97,12 +96,11 @@
         subset_operators.append(Set(fields={"title": ""}))
 
     card = TaskCard(
-        loader=LoadCSV(
+        loader=LoadJsonFile(
             files={
                 "test": f"https://github.com/IBM/mt-rag-benchmark/raw/refs/heads/main/corpora/{subset}.jsonl.zip"
             },
             compression="zip",
-            file_type="json",
             lines=True,
             data_classification_policy=["public"],
         ),
diff --git a/prepare/cards/quality.py b/prepare/cards/quality.py
@@ -2,7 +2,7 @@
 from unitxt.card import TaskCard
 from unitxt.catalog import add_to_catalog
 from unitxt.collections_operators import Explode
-from unitxt.loaders import LoadCSV
+from unitxt.loaders import LoadJsonFile
 from unitxt.operators import (
     Copy,
     MapInstanceValues,
@@ -15,9 +15,8 @@
 
 with unitxt.settings.context(allow_unverified_code=True):
     card = TaskCard(
-        loader=LoadCSV(
+        loader=LoadJsonFile(
             files={"train": file_path + "train", "validation": file_path + "dev"},
-            file_type="json",
             lines=True,
             data_classification_policy=["public"],
         ),
diff --git a/src/unitxt/catalog/cards/arena_hard/generation/english_gpt_4_0314_reference.json b/src/unitxt/catalog/cards/arena_hard/generation/english_gpt_4_0314_reference.json
@@ -1,12 +1,11 @@
 {
     "__type__": "task_card",
     "loader": {
-        "__type__": "load_csv",
+        "__type__": "load_json_file",
         "files": {
             "questions": "https://raw.githubusercontent.com/lmarena/arena-hard-auto/57451f35d2be7fef9f05d5567f36e4c959bb6630/data/arena-hard-v0.1/question.jsonl",
             "model_answer": "https://raw.githubusercontent.com/lmarena/arena-hard-auto/57451f35d2be7fef9f05d5567f36e4c959bb6630/data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl"
         },
-        "file_type": "json",
         "lines": true,
         "data_classification_policy": [
             "public"
diff --git a/src/unitxt/catalog/cards/bfcl/simple_v3.json b/src/unitxt/catalog/cards/bfcl/simple_v3.json
@@ -1,12 +1,11 @@
 {
     "__type__": "task_card",
     "loader": {
-        "__type__": "load_csv",
+        "__type__": "load_json_file",
         "files": {
             "questions": "https://raw.githubusercontent.com/ShishirPatil/gorilla/70b6a4a2144597b1f99d1f4d3185d35d7ee532a4/berkeley-function-call-leaderboard/data/BFCL_v3_simple.json",
             "answers": "https://raw.githubusercontent.com/ShishirPatil/gorilla/70b6a4a2144597b1f99d1f4d3185d35d7ee532a4/berkeley-function-call-leaderboard/data/possible_answer/BFCL_v3_simple.json"
         },
-        "file_type": "json",
         "lines": true,
         "data_classification_policy": [
             "public"
diff --git a/src/unitxt/catalog/cards/quality.json b/src/unitxt/catalog/cards/quality.json
@@ -1,12 +1,11 @@
 {
     "__type__": "task_card",
     "loader": {
-        "__type__": "load_csv",
+        "__type__": "load_json_file",
         "files": {
             "train": "https://raw.githubusercontent.com/nyu-mll/quality/05e85750d4c5444d2a0a4ad299f6df5f4df06068/data/v1.0.1/QuALITY.v1.0.1.htmlstripped.train",
             "validation": "https://raw.githubusercontent.com/nyu-mll/quality/05e85750d4c5444d2a0a4ad299f6df5f4df06068/data/v1.0.1/QuALITY.v1.0.1.htmlstripped.dev"
         },
-        "file_type": "json",
         "lines": true,
         "data_classification_policy": [
             "public"
diff --git a/src/unitxt/catalog/cards/rag/mtrag.json b/src/unitxt/catalog/cards/rag/mtrag.json
@@ -1,11 +1,10 @@
 {
     "__type__": "task_card",
     "loader": {
-        "__type__": "load_csv",
+        "__type__": "load_json_file",
         "files": {
             "test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/human/generation_tasks/reference+RAG.jsonl"
         },
-        "file_type": "json",
         "lines": true,
         "data_classification_policy": [
             "public"
diff --git a/src/unitxt/catalog/cards/rag/mtrag/documents/clapnq.json b/src/unitxt/catalog/cards/rag/mtrag/documents/clapnq.json
@@ -1,12 +1,11 @@
 {
     "__type__": "task_card",
     "loader": {
-        "__type__": "load_csv",
+        "__type__": "load_json_file",
         "files": {
             "test": "https://github.com/IBM/mt-rag-benchmark/raw/refs/heads/main/corpora/clapnq.jsonl.zip"
         },
         "compression": "zip",
-        "file_type": "json",
         "lines": true,
         "data_classification_policy": [
             "public"
diff --git a/src/unitxt/catalog/cards/rag/mtrag/documents/cloud.json b/src/unitxt/catalog/cards/rag/mtrag/documents/cloud.json
@@ -1,12 +1,11 @@
 {
     "__type__": "task_card",
     "loader": {
-        "__type__": "load_csv",
+        "__type__": "load_json_file",
         "files": {
             "test": "https://github.com/IBM/mt-rag-benchmark/raw/refs/heads/main/corpora/cloud.jsonl.zip"
         },
         "compression": "zip",
-        "file_type": "json",
         "lines": true,
         "data_classification_policy": [
             "public"
diff --git a/src/unitxt/catalog/cards/rag/mtrag/documents/fiqa.json b/src/unitxt/catalog/cards/rag/mtrag/documents/fiqa.json
@@ -1,12 +1,11 @@
 {
     "__type__": "task_card",
     "loader": {
-        "__type__": "load_csv",
+        "__type__": "load_json_file",
         "files": {
             "test": "https://github.com/IBM/mt-rag-benchmark/raw/refs/heads/main/corpora/fiqa.jsonl.zip"
         },
         "compression": "zip",
-        "file_type": "json",
         "lines": true,
         "data_classification_policy": [
             "public"
diff --git a/src/unitxt/catalog/cards/rag/mtrag/documents/govt.json b/src/unitxt/catalog/cards/rag/mtrag/documents/govt.json
@@ -1,12 +1,11 @@
 {
     "__type__": "task_card",
     "loader": {
-        "__type__": "load_csv",
+        "__type__": "load_json_file",
         "files": {
             "test": "https://github.com/IBM/mt-rag-benchmark/raw/refs/heads/main/corpora/govt.jsonl.zip"
         },
         "compression": "zip",
-        "file_type": "json",
         "lines": true,
         "data_classification_policy": [
             "public"
diff --git a/src/unitxt/loaders.py b/src/unitxt/loaders.py
diff --git a/tests/library/test_loaders.py b/tests/library/test_loaders.py