-
Notifications
You must be signed in to change notification settings - Fork 110
/
Copy pathrw_v2.json
21 lines (21 loc) · 4.5 KB
/
rw_v2.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
{
"uuid": "51d9bc41-bbd0-472a-aa10-61a25c0b26dc",
"name": "rw_v2",
"creation_date": "2023_12_31-09_49_09",
"dataset_url": "s3://***REMOVED***/openlm/dcnlp/datasets/cc_trafilatura_v2-baselines/refinedweb_V2_postsubstr_tokenized/",
"manifest_url": "s3://***REMOVED***/openlm/dcnlp/datasets/cc_trafilatura_v2-baselines/refinedweb_V2_postsubstr_tokenized/manifest.jsonl",
"sources": [
{
"uuid": "b3ea0a33-c563-449b-baaa-2360dcf11f7b",
"name": "rw_v2"
}
],
"tokenized": true,
"tokenizer": "EleutherAI/gpt-neox-20b",
"num_tokens": 250179740442,
"size": 680657229218,
"dcnlp_commit_hash": "43f155d94fd6435aec356dc752f48668d1e9472c",
"dcnlp_diff": "diff --git a/.dockerignore b/.dockerignore\nindex d80b79e..efa3158 100644\n--- a/.dockerignore\n+++ b/.dockerignore\n@@ -1,2 +1,3 @@\n checkpoints\n venv\n+training/eval_data\ndiff --git a/ray_processing/tokenize_shuffle.py b/ray_processing/tokenize_shuffle.py\nindex 14b652c..7247593 100644\n--- a/ray_processing/tokenize_shuffle.py\n+++ b/ray_processing/tokenize_shuffle.py\n@@ -53,8 +53,8 @@ if __name__ == \"__main__\":\n assert all(s is not None for s in source_refs), \"Not all source reference jsons could be found.\"\n \n # Collect args for tokenization and pass them into tokenize_shuffle\n- tokenize_shuffle_args = [str(i) for k,v in vars(args).items() for i in [f\"--{k}\", v] if k not in DCNLP_ARGS and v]\n- tokenize_shuffle.main(tokenize_shuffle_args)\n+ # tokenize_shuffle_args = [str(i) for k,v in vars(args).items() for i in [f\"--{k}\", v] if k not in DCNLP_ARGS and v]\n+ # tokenize_shuffle.main(tokenize_shuffle_args)\n \n dataset_json = generate_tokenized_dataset_json(args, source_refs)\n with open(json_path, \"w\") as ref_file:\ndiff --git a/training/configs_v2/3b_1x.json b/training/configs_v2/3b_1x.json\nindex d77a4d4..2e9e15b 100644\n--- a/training/configs_v2/3b_1x.json\n+++ b/training/configs_v2/3b_1x.json\n@@ -8,7 +8,7 @@\n \"wd\": 0.33,\n \"cd\": 3e-05,\n \"global_bs\": 2048,\n- \"acc\": 2,\n+ \"acc\": 4,\n \"qk_norm\": true,\n \"z_loss\": 1e-4,\n \"grad_checkpointing\": false,\ndiff --git a/training/params.py b/training/params.py\nindex 4be00f2..bb39d88 100644\n--- a/training/params.py\n+++ b/training/params.py\n@@ -99,9 +99,8 @@ def parse_dcnlp_args():\n \n def get_open_lm_args(args, hparams, dr, name):\n if args.manifest_prefix_override is not None:\n- manifiest_name = Path(dr.manifest_url).stem\n- manifest_extention = Path(dr.manifest_url).suffix\n- dr.manifest_url = os.path.join(args.manifest_prefix_override, f\"{manifiest_name}{manifest_extention}\")\n+ manifest_name = Path(dr.manifest_url).name\n+ dr.manifest_url = os.path.join(args.manifest_prefix_override, f\"{manifest_name}\")\n \n local_rank, _, _ = world_info_from_env()\n openlm_val_data = download_val_data(\"open_lm_val\", skip_download=local_rank != 0)\ndiff --git a/training/train_scripts/docker/Dockerfile_update b/training/train_scripts/docker/Dockerfile_update\nindex b46252b..18e49d8 100644\n--- a/training/train_scripts/docker/Dockerfile_update\n+++ b/training/train_scripts/docker/Dockerfile_update\n@@ -8,7 +8,7 @@ COPY . /opt/ml/code/\n \n # RUN pip install -e /opt/ml/code/\n \n-# # Prevent sagemaker from installing requirements again.\n+# Prevent sagemaker from installing requirements again.\n RUN rm /opt/ml/code/requirements.txt\n \n ENV SAGEMAKER_PROGRAM training/train.py\ndiff --git a/training/train_scripts/train_sagemaker.py b/training/train_scripts/train_sagemaker.py\nindex 0d29c07..8c09642 100644\n--- a/training/train_scripts/train_sagemaker.py\n+++ b/training/train_scripts/train_sagemaker.py\n@@ -85,6 +85,7 @@ def main():\n parser.add_argument(\"--scale\", required=True)\n parser.add_argument(\"--data-config\", required=True)\n parser.add_argument(\"--remote-sync\", required=True, help=\"S3 path to sync to\")\n+ parser.add_argument(\"--manifest-prefix-override\")\n \n # Docker / AWS args\n parser.add_argument(\"--docker-dir\", type=Path, default=Path(__file__).parent / \"docker\")\n@@ -165,6 +166,8 @@ def main_after_setup_move(args):\n \"logs\": f\"{checkpoint_local_path}/{job_name}\",\n \"report-to-wandb\": \"\",\n }\n+ if args.manifest_prefix_override:\n+ train_args[\"manifest-prefix-override\"] = args.manifest_prefix_override\n \n estimator = PyTorch(\n entry_point=\"training/train.py\",",
"data_key": "json.gz",
"sampling_yaml": null
}