-
Notifications
You must be signed in to change notification settings - Fork 110
/
Copy pathrpj_rw_as_CC.json
92 lines (92 loc) · 9.46 KB
/
rpj_rw_as_CC.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
{
"uuid": "ee329a3c-8056-4732-8312-716a07153946",
"name": "rpj_rw_as_CC",
"creation_date": "2024_01_09-09_15_53",
"dataset_url": "s3://***REMOVED***/openlm/dcnlp/datasets/rpj_rw_as_CC/",
"manifest_url": "s3://***REMOVED***/openlm/dcnlp/datasets/rpj_rw_as_CC/manifest.jsonl",
"sources": [
{
"uuid": "df16a14e-0f67-4623-933a-805522653f22",
"name": "rw_original"
},
{
"uuid": "edd67f24-49ae-4915-8c3a-dd4bcc62b9d8",
"name": "rpj_original_github"
},
{
"uuid": "c8b17a9b-6bd8-441a-8b9f-dbf486edf574",
"name": "rpj_original_arxiv"
},
{
"uuid": "d017c1fe-c9df-4e06-aa8f-d92b1097283b",
"name": "rpj_original_books"
},
{
"uuid": "3b25b18c-e724-4071-8c7a-d69c5e1aaeac",
"name": "rpj_original_stackexchange"
},
{
"uuid": "050bc436-8d61-4d73-b931-0306a4b26727",
"name": "rpj_original_wiki"
}
],
"tokenized": true,
"tokenizer": "EleutherAI/gpt-neox-20b",
"num_tokens": 705714313806,
"size": 1855918430866,
"dcnlp_commit_hash": "287ba500c592b05266ee17921c3e4a656f10b8ec",
"dcnlp_diff": "diff --git a/exp_data/datasets/untokenized/c4_wo_dedup.json b/exp_data/datasets/untokenized/c4_wo_dedup.json\nindex fd6fbbd..c5af54a 100644\n--- a/exp_data/datasets/untokenized/c4_wo_dedup.json\n+++ b/exp_data/datasets/untokenized/c4_wo_dedup.json\n@@ -2,7 +2,7 @@\n \"uuid\": \"5431063a-bcdb-4c9e-83df-b5b08243ab1d\",\n \"name\": \"c4_wo_dedup\",\n \"creation_date\": \"2023_12_20-17_59_20\",\n- \"dataset_url\": \"s3://dcnlp-west/cc_wet_2019_april_baselines/c4_wo_dedup/\",\n+ \"dataset_url\": \"s3://***REMOVED***/openlm/dcnlp/raw_datasets/cc_wet_2019_april_baselines/c4_wo_dedup/\",\n \"manifest_url\": null,\n \"sources\": [\n {\ndiff --git a/exp_data/datasets/untokenized/rw_original.json b/exp_data/datasets/untokenized/rw_original.json\nindex 3cc566d..aa35e58 100644\n--- a/exp_data/datasets/untokenized/rw_original.json\n+++ b/exp_data/datasets/untokenized/rw_original.json\n@@ -2,7 +2,7 @@\n \"uuid\": \"df16a14e-0f67-4623-933a-805522653f22\",\n \"name\": \"rw_original\",\n \"creation_date\": \"2023_11_22-12_31_00\",\n- \"dataset_url\": \"s3://dcnlp-west/refinedweb_raw_jsonl_keyfix/\",\n+ \"dataset_url\": \"s3://***REMOVED***/openlm/dcnlp/raw_datasets/refinedweb_raw_jsonl_keyfix/\",\n \"manifest_url\": null,\n \"sources\": [],\n \"tokenized\": false,\ndiff --git a/ray_processing/__init__.py b/ray_processing/__init__.py\nindex 5e1b41d..014c770 100644\n--- a/ray_processing/__init__.py\n+++ b/ray_processing/__init__.py\n@@ -1,4 +1,4 @@\n-from dedup_jsonl import dedup_jsonl\n+from ray_processing.dedup_jsonl import dedup_jsonl\n from baselines.core.constants import GLOBAL_FUNCTIONS\n \n-GLOBAL_FUNCTIONS['exact_dedup'] = dedup_jsonl\n\\ No newline at end of file\n+GLOBAL_FUNCTIONS['exact_dedup'] = dedup_jsonl\ndiff --git a/ray_processing/tokenize_shuffle.py b/ray_processing/tokenize_shuffle.py\nindex ba2ac32..14d4125 100644\n--- a/ray_processing/tokenize_shuffle.py\n+++ b/ray_processing/tokenize_shuffle.py\n@@ -53,7 +53,9 @@ if __name__ == \"__main__\":\n assert all(s is not None for s in source_refs), \"Not all source reference jsons could be found.\"\n \n # Collect args for tokenization and pass them into tokenize_shuffle\n- tokenize_shuffle_args = [str(i) for k,v in vars(args).items() for i in [f\"--{k}\", v] if k not in DCNLP_ARGS and v]\n+ tokenize_shuffle_args = [str(i) for k,v in vars(args).items() for i in [f\"--{k}\", v] if k not in DCNLP_ARGS and k != \"do_sample\" and v]\n+ if args.do_sample:\n+ tokenize_shuffle_args += [\"--do_sample\"]\n tokenize_shuffle.main(tokenize_shuffle_args)\n \n dataset_json = generate_tokenized_dataset_json(args, source_refs)\ndiff --git a/run_CC_plus_rpj_nonCC.sh b/run_CC_plus_rpj_nonCC.sh\nold mode 100644\nnew mode 100755\nindex ae82cc8..7a9e5ea\n--- a/run_CC_plus_rpj_nonCC.sh\n+++ b/run_CC_plus_rpj_nonCC.sh\n@@ -6,28 +6,28 @@ python ray_processing/tokenize_shuffle.py \\\n \t--default_dataset_yaml ray_processing/tokenization_configs/rpj_rw_as_CC.yaml \\\n \t--content_key text \\\n \t--ray_spill_location /tmp/ray \\\n-\t--force_parallelism \\\n-\t--do_sample \\\n-\n-\n-# C4 as the CC source\n-python ray_processing/tokenize_shuffle.py \\\n-\t--source_ref_paths exp_data/datasets/untokenized/c4_original.json exp_data/datasets/untokenized/rpj_original_github.json exp_data/datasets/untokenized/rpj_original_arxiv.json exp_data/datasets/untokenized/rpj_original_books.json exp_data/datasets/untokenized/rpj_original_stackexchange.json exp_data/datasets/untokenized/rpj_original_wiki.json \\\n-\t--readable_name rpj_c4_as_CC \\\n-\t--output s3://***REMOVED***/openlm/dcnlp/datasets/rpj_c4_as_CC/ \\\n-\t--default_dataset_yaml ray_processing/tokenization_configs/rpj_c4_as_CC.yaml \\\n-\t--content_key text \\\n-\t--ray_spill_location /tmp/ray \\\n-\t--force_parallelism \\\n+\t--force_parallelism 320 \\\n \t--do_sample\n \n-# RPJ-CC as the CC source\n-python ray_processing/tokenize_shuffle.py \\\n-\t--source_ref_paths exp_data/datasets/untokenized/rpj_original_cc.json exp_data/datasets/untokenized/rpj_original_github.json exp_data/datasets/untokenized/rpj_original_arxiv.json exp_data/datasets/untokenized/rpj_original_books.json exp_data/datasets/untokenized/rpj_original_stackexchange.json exp_data/datasets/untokenized/rpj_original_wiki.json \\\n-\t--readable_name rpj_rpjCC_as_CC \\\n-\t--output s3://***REMOVED***/openlm/dcnlp/datasets/rpj_rpjCC_as_CC/ \\\n-\t--default_dataset_yaml ray_processing/tokenization_configs/rpj_rpjCC_as_CC.yaml \\\n-\t--content_key text \\\n-\t--ray_spill_location /tmp/ray \\\n-\t--force_parallelism \\\n-\t--do_sample\n\\ No newline at end of file\n+\n+# # C4 as the CC source\n+# python ray_processing/tokenize_shuffle.py \\\n+# \t--source_ref_paths exp_data/datasets/untokenized/c4_original.json exp_data/datasets/untokenized/rpj_original_github.json exp_data/datasets/untokenized/rpj_original_arxiv.json exp_data/datasets/untokenized/rpj_original_books.json exp_data/datasets/untokenized/rpj_original_stackexchange.json exp_data/datasets/untokenized/rpj_original_wiki.json \\\n+# \t--readable_name rpj_c4_as_CC \\\n+# \t--output s3://***REMOVED***/openlm/dcnlp/datasets/rpj_c4_as_CC/ \\\n+# \t--default_dataset_yaml ray_processing/tokenization_configs/rpj_c4_as_CC.yaml \\\n+# \t--content_key text \\\n+# \t--ray_spill_location /tmp/ray \\\n+# \t--force_parallelism \\\n+# \t--do_sample\n+# \n+# # RPJ-CC as the CC source\n+# python ray_processing/tokenize_shuffle.py \\\n+# \t--source_ref_paths exp_data/datasets/untokenized/rpj_original_cc.json exp_data/datasets/untokenized/rpj_original_github.json exp_data/datasets/untokenized/rpj_original_arxiv.json exp_data/datasets/untokenized/rpj_original_books.json exp_data/datasets/untokenized/rpj_original_stackexchange.json exp_data/datasets/untokenized/rpj_original_wiki.json \\\n+# \t--readable_name rpj_rpjCC_as_CC \\\n+# \t--output s3://***REMOVED***/openlm/dcnlp/datasets/rpj_rpjCC_as_CC/ \\\n+# \t--default_dataset_yaml ray_processing/tokenization_configs/rpj_rpjCC_as_CC.yaml \\\n+# \t--content_key text \\\n+# \t--ray_spill_location /tmp/ray \\\n+# \t--force_parallelism \\\n+# \t--do_sample\ndiff --git a/training/configs/1b_1x.json b/training/configs/1b_1x.json\nindex bd0a40b..45b4656 100644\n--- a/training/configs/1b_1x.json\n+++ b/training/configs/1b_1x.json\n@@ -8,7 +8,7 @@\n \"wd\": 0.033,\n \"cd\": 3e-5,\n \"global_bs\": 256,\n- \"acc\": 2,\n+ \"acc\": 1,\n \"qk_norm\": true,\n \"z_loss\": 1e-4,\n \"grad_checkpointing\": false,\n@@ -18,4 +18,4 @@\n \"--fsdp-limit-all-gathers\"\n ],\n \"chinchilla_multiplier\": 1\n-}\n\\ No newline at end of file\n+}\ndiff --git a/training/configs/3b_1x.json b/training/configs/3b_1x.json\nindex d77a4d4..2e9e15b 100644\n--- a/training/configs/3b_1x.json\n+++ b/training/configs/3b_1x.json\n@@ -8,7 +8,7 @@\n \"wd\": 0.33,\n \"cd\": 3e-05,\n \"global_bs\": 2048,\n- \"acc\": 2,\n+ \"acc\": 4,\n \"qk_norm\": true,\n \"z_loss\": 1e-4,\n \"grad_checkpointing\": false,",
"data_key": "json.gz",
"sampling_yaml": {
"sources": [
{
"source": "REFINEDWEB",
"markers": [
"refinedweb"
]
},
{
"source": "GITHUB",
"markers": [
"github"
]
},
{
"source": "WIKIPEDIA",
"markers": [
"wiki"
]
},
{
"source": "BOOKS",
"markers": [
"book"
]
},
{
"source": "ARXIV",
"markers": [
"arxiv"
]
},
{
"source": "STACKEXCHANGE",
"markers": [
"stackexchange"
]
},
{
"source": "UNKNOWN",
"markers": []
}
],
"sampling_frequencies": {
"REFINEDWEB": 1.0,
"GITHUB": 0.5397182204780016,
"WIKIPEDIA": 1.2804846194277604,
"BOOKS": 1.2222185431808508,
"ARXIV": 0.6288823225508585,
"STACKEXCHANGE": 0.68807596592589
}
}
}