Skip to content

Commit 3472805

Browse files
authored
Trying to fix PERFORMANCE: use a GitHub repository as a replacement for the removed HF space 'lmsys/arena-hard-browser' (#1757)
* a handy replacement for the gone HF 'lmsys/arena-hard-browser' Signed-off-by: dafnapension <[email protected]> * specify commit point for the data from github Signed-off-by: dafnapension <[email protected]> --------- Signed-off-by: dafnapension <[email protected]>
1 parent cafc7eb commit 3472805

File tree

2 files changed

+33
-28
lines changed

2 files changed

+33
-28
lines changed

prepare/cards/arena_hard/generation/english_gpt-4-0314_reference.py

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
TaskCard,
33
)
44
from unitxt.catalog import add_to_catalog
5-
from unitxt.loaders import LoadFromHFSpace
5+
from unitxt.loaders import LoadCSV
66
from unitxt.operators import (
77
Apply,
88
Copy,
@@ -13,35 +13,39 @@
1313
from unitxt.stream_operators import DeleteSplits, JoinStreams
1414
from unitxt.test_utils.card import test_card
1515

16+
# The HF space 'lmsys/arena-hard-browser' (https://huggingface.co/spaces/lmarena-ai/arena-hard-browser) is no longer available on HF.
17+
# https://paperswithcode.com/dataset/arena-hard points to the GitHub repository https://github.com/lmarena/arena-hard-auto,
18+
# from which the dataset is now fetched:
19+
20+
question_git_repo_file_path = "https://raw.githubusercontent.com/lmarena/arena-hard-auto/57451f35d2be7fef9f05d5567f36e4c959bb6630/data/arena-hard-v0.1/question.jsonl"
21+
model_answer_git_repo_file_path = "https://raw.githubusercontent.com/lmarena/arena-hard-auto/57451f35d2be7fef9f05d5567f36e4c959bb6630/data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl"
22+
1623
card = TaskCard(
17-
loader=LoadFromHFSpace(
18-
space_name="lmsys/arena-hard-browser",
19-
revision="03b91ca", # May 26, 2024
20-
data_files={
21-
"questions": "data/arena-hard-v0.1/question.jsonl",
22-
"model_answer": "data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl",
23-
},
24-
data_classification_policy = ["public"],
25-
),
24+
loader=LoadCSV(
25+
files={"questions": question_git_repo_file_path, "model_answer": model_answer_git_repo_file_path},
26+
file_type="json",
27+
lines=True,
28+
data_classification_policy=["public"],
29+
),
2630
preprocess_steps=[
2731
# region Question file
2832
Rename(field_to_field={"cluster": "group"}, apply_to_streams=["questions"]),
2933
Copy(
30-
field_to_field={"turns/0/content": "model_input"},
34+
field_to_field={"prompt": "model_input"},
3135
apply_to_streams=["questions"],
3236
),
3337
Set(fields={"reference_model": "gpt-4-0314"}, apply_to_streams=["questions"]),
3438
# endregion
3539
# region Answers file processing
3640
Copy(
3741
field_to_field={
38-
"choices/0/turns/0/content": "reference_model_output",
39-
"choices/0/turns/0/token_len": "reference_model_output_token_len",
42+
"messages/1/content/answer": "reference_model_output"
43+
# NOTE(review): "choices/0/turns/0/token_len" -> "reference_model_output_token_len" is no longer copied; presumably the GitHub-hosted data has no such field — confirm.
4044
},
4145
apply_to_streams=["model_answer"],
4246
),
4347
Rename(
44-
field_to_field={"model_id": "reference_model"},
48+
field_to_field={"model": "reference_model"},
4549
apply_to_streams=["model_answer"],
4650
),
4751
Apply(
@@ -56,13 +60,13 @@
5660
left_stream="questions",
5761
right_stream="model_answer",
5862
how="inner",
59-
on=["question_id", "reference_model"],
63+
on=["uid", "reference_model"],
6064
new_stream_name="test",
6165
),
6266
DeleteSplits(splits=["questions", "model_answer"]),
6367
SelectFields(
6468
fields=[
65-
"question_id",
69+
"uid",
6670
"category",
6771
"model_input",
6872
"reference_model",
@@ -71,6 +75,7 @@
7175
),
7276
Rename(
7377
field_to_field={
78+
"uid": "question_id",
7479
"model_input": "input",
7580
"category": "group",
7681
"reference_model_output": "output",

src/unitxt/catalog/cards/arena_hard/generation/english_gpt_4_0314_reference.json

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
{
22
"__type__": "task_card",
33
"loader": {
4-
"__type__": "load_from_hf_space",
5-
"space_name": "lmsys/arena-hard-browser",
6-
"revision": "03b91ca",
7-
"data_files": {
8-
"questions": "data/arena-hard-v0.1/question.jsonl",
9-
"model_answer": "data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl"
4+
"__type__": "load_csv",
5+
"files": {
6+
"questions": "https://raw.githubusercontent.com/lmarena/arena-hard-auto/57451f35d2be7fef9f05d5567f36e4c959bb6630/data/arena-hard-v0.1/question.jsonl",
7+
"model_answer": "https://raw.githubusercontent.com/lmarena/arena-hard-auto/57451f35d2be7fef9f05d5567f36e4c959bb6630/data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl"
108
},
9+
"file_type": "json",
10+
"lines": true,
1111
"data_classification_policy": [
1212
"public"
1313
]
@@ -25,7 +25,7 @@
2525
{
2626
"__type__": "copy",
2727
"field_to_field": {
28-
"turns/0/content": "model_input"
28+
"prompt": "model_input"
2929
},
3030
"apply_to_streams": [
3131
"questions"
@@ -43,8 +43,7 @@
4343
{
4444
"__type__": "copy",
4545
"field_to_field": {
46-
"choices/0/turns/0/content": "reference_model_output",
47-
"choices/0/turns/0/token_len": "reference_model_output_token_len"
46+
"messages/1/content/answer": "reference_model_output"
4847
},
4948
"apply_to_streams": [
5049
"model_answer"
@@ -53,7 +52,7 @@
5352
{
5453
"__type__": "rename",
5554
"field_to_field": {
56-
"model_id": "reference_model"
55+
"model": "reference_model"
5756
},
5857
"apply_to_streams": [
5958
"model_answer"
@@ -76,7 +75,7 @@
7675
"right_stream": "model_answer",
7776
"how": "inner",
7877
"on": [
79-
"question_id",
78+
"uid",
8079
"reference_model"
8180
],
8281
"new_stream_name": "test"
@@ -91,7 +90,7 @@
9190
{
9291
"__type__": "select_fields",
9392
"fields": [
94-
"question_id",
93+
"uid",
9594
"category",
9695
"model_input",
9796
"reference_model",
@@ -101,6 +100,7 @@
101100
{
102101
"__type__": "rename",
103102
"field_to_field": {
103+
"uid": "question_id",
104104
"model_input": "input",
105105
"category": "group",
106106
"reference_model_output": "output"

0 commit comments

Comments
 (0)