Skip to content

Commit 85c07cf

Browse files
authored
Add LoadJsonFile loader and tests (#1801)
* Add LoadJsonFile loader and tests

  Signed-off-by: elronbandel <[email protected]>

* Update artifact

  Signed-off-by: elronbandel <[email protected]>

---------

Signed-off-by: elronbandel <[email protected]>
1 parent 079c208 commit 85c07cf

File tree

14 files changed: 188 additions (+188) and 86 deletions (-86).

prepare/cards/arena_hard/generation/english_gpt-4-0314_reference.py

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
TaskCard,
33
)
44
from unitxt.catalog import add_to_catalog
5-
from unitxt.loaders import LoadCSV
5+
from unitxt.loaders import LoadJsonFile
66
from unitxt.operators import (
77
Apply,
88
Copy,
@@ -21,12 +21,11 @@
2121
model_answer_git_repo_file_path = "https://raw.githubusercontent.com/lmarena/arena-hard-auto/57451f35d2be7fef9f05d5567f36e4c959bb6630/data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl"
2222

2323
card = TaskCard(
24-
loader=LoadCSV(
24+
loader=LoadJsonFile(
2525
files={"questions": question_git_repo_file_path, "model_answer": model_answer_git_repo_file_path},
26-
file_type="json",
27-
lines=True,
28-
data_classification_policy=["public"],
29-
),
26+
lines=True,
27+
data_classification_policy=["public"],
28+
),
3029
preprocess_steps=[
3130
# region Question file
3231
Rename(field_to_field={"cluster": "group"}, apply_to_streams=["questions"]),

prepare/cards/bfcl.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import unitxt
22
from unitxt.card import TaskCard
33
from unitxt.catalog import add_to_catalog
4-
from unitxt.loaders import LoadCSV
4+
from unitxt.loaders import LoadJsonFile
55
from unitxt.operators import (
66
Copy,
77
ExecuteExpression,
@@ -14,9 +14,8 @@
1414

1515
with unitxt.settings.context(allow_unverified_code=True):
1616
card = TaskCard(
17-
loader=LoadCSV(
17+
loader=LoadJsonFile(
1818
files={"questions": base_path + "BFCL_v3_simple.json", "answers": base_path + "possible_answer/BFCL_v3_simple.json"},
19-
file_type="json",
2019
lines=True,
2120
data_classification_policy=["public"],
2221
),

prepare/cards/mtrag.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
TaskCard,
66
)
77
from unitxt.collections_operators import Dictify, Wrap
8-
from unitxt.loaders import LoadCSV
8+
from unitxt.loaders import LoadJsonFile
99
from unitxt.operators import (
1010
Cast,
1111
Copy,
@@ -17,11 +17,10 @@
1717
from unitxt.test_utils.card import test_card
1818

1919
card = TaskCard(
20-
loader=LoadCSV(
20+
loader=LoadJsonFile(
2121
files={
2222
"test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/human/generation_tasks/reference+RAG.jsonl"
2323
},
24-
file_type="json",
2524
lines=True,
2625
data_classification_policy=["public"],
2726
),
@@ -97,12 +96,11 @@
9796
subset_operators.append(Set(fields={"title": ""}))
9897

9998
card = TaskCard(
100-
loader=LoadCSV(
99+
loader=LoadJsonFile(
101100
files={
102101
"test": f"https://github.com/IBM/mt-rag-benchmark/raw/refs/heads/main/corpora/{subset}.jsonl.zip"
103102
},
104103
compression="zip",
105-
file_type="json",
106104
lines=True,
107105
data_classification_policy=["public"],
108106
),

prepare/cards/quality.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from unitxt.card import TaskCard
33
from unitxt.catalog import add_to_catalog
44
from unitxt.collections_operators import Explode
5-
from unitxt.loaders import LoadCSV
5+
from unitxt.loaders import LoadJsonFile
66
from unitxt.operators import (
77
Copy,
88
MapInstanceValues,
@@ -15,9 +15,8 @@
1515

1616
with unitxt.settings.context(allow_unverified_code=True):
1717
card = TaskCard(
18-
loader=LoadCSV(
18+
loader=LoadJsonFile(
1919
files={"train": file_path + "train", "validation": file_path + "dev"},
20-
file_type="json",
2120
lines=True,
2221
data_classification_policy=["public"],
2322
),

src/unitxt/catalog/cards/arena_hard/generation/english_gpt_4_0314_reference.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
{
22
"__type__": "task_card",
33
"loader": {
4-
"__type__": "load_csv",
4+
"__type__": "load_json_file",
55
"files": {
66
"questions": "https://raw.githubusercontent.com/lmarena/arena-hard-auto/57451f35d2be7fef9f05d5567f36e4c959bb6630/data/arena-hard-v0.1/question.jsonl",
77
"model_answer": "https://raw.githubusercontent.com/lmarena/arena-hard-auto/57451f35d2be7fef9f05d5567f36e4c959bb6630/data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl"
88
},
9-
"file_type": "json",
109
"lines": true,
1110
"data_classification_policy": [
1211
"public"

src/unitxt/catalog/cards/bfcl/simple_v3.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
{
22
"__type__": "task_card",
33
"loader": {
4-
"__type__": "load_csv",
4+
"__type__": "load_json_file",
55
"files": {
66
"questions": "https://raw.githubusercontent.com/ShishirPatil/gorilla/70b6a4a2144597b1f99d1f4d3185d35d7ee532a4/berkeley-function-call-leaderboard/data/BFCL_v3_simple.json",
77
"answers": "https://raw.githubusercontent.com/ShishirPatil/gorilla/70b6a4a2144597b1f99d1f4d3185d35d7ee532a4/berkeley-function-call-leaderboard/data/possible_answer/BFCL_v3_simple.json"
88
},
9-
"file_type": "json",
109
"lines": true,
1110
"data_classification_policy": [
1211
"public"

src/unitxt/catalog/cards/quality.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
{
22
"__type__": "task_card",
33
"loader": {
4-
"__type__": "load_csv",
4+
"__type__": "load_json_file",
55
"files": {
66
"train": "https://raw.githubusercontent.com/nyu-mll/quality/05e85750d4c5444d2a0a4ad299f6df5f4df06068/data/v1.0.1/QuALITY.v1.0.1.htmlstripped.train",
77
"validation": "https://raw.githubusercontent.com/nyu-mll/quality/05e85750d4c5444d2a0a4ad299f6df5f4df06068/data/v1.0.1/QuALITY.v1.0.1.htmlstripped.dev"
88
},
9-
"file_type": "json",
109
"lines": true,
1110
"data_classification_policy": [
1211
"public"

src/unitxt/catalog/cards/rag/mtrag.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
11
{
22
"__type__": "task_card",
33
"loader": {
4-
"__type__": "load_csv",
4+
"__type__": "load_json_file",
55
"files": {
66
"test": "https://raw.githubusercontent.com/IBM/mt-rag-benchmark/refs/heads/main/human/generation_tasks/reference+RAG.jsonl"
77
},
8-
"file_type": "json",
98
"lines": true,
109
"data_classification_policy": [
1110
"public"

src/unitxt/catalog/cards/rag/mtrag/documents/clapnq.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
{
22
"__type__": "task_card",
33
"loader": {
4-
"__type__": "load_csv",
4+
"__type__": "load_json_file",
55
"files": {
66
"test": "https://github.com/IBM/mt-rag-benchmark/raw/refs/heads/main/corpora/clapnq.jsonl.zip"
77
},
88
"compression": "zip",
9-
"file_type": "json",
109
"lines": true,
1110
"data_classification_policy": [
1211
"public"

src/unitxt/catalog/cards/rag/mtrag/documents/cloud.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
{
22
"__type__": "task_card",
33
"loader": {
4-
"__type__": "load_csv",
4+
"__type__": "load_json_file",
55
"files": {
66
"test": "https://github.com/IBM/mt-rag-benchmark/raw/refs/heads/main/corpora/cloud.jsonl.zip"
77
},
88
"compression": "zip",
9-
"file_type": "json",
109
"lines": true,
1110
"data_classification_policy": [
1211
"public"

0 commit comments

Comments
 (0)