diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 979e8466bc..ebf58cb04d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -26,7 +26,7 @@
 #
 # /WARNING!
 
-exclude: build|stubs|^bot/templates/$|openassistant/templates|docs/docs/api/openapi.json|scripts/postprocessing/regex_pii_detector.py
+exclude: build|stubs|^bot/templates/$|openassistant/templates|docs/docs/api/openapi.json|scripts/postprocessing/regex_pii_detector.py|oasst-data/examples/clean_dataset.py|oasst-data/examples/tree_to_messages.py
 
 default_language_version:
   python: python3
diff --git a/oasst-data/examples/clean_dataset.py b/oasst-data/examples/clean_dataset.py
index 92a9f3e8f2..d7c055213e 100644
--- a/oasst-data/examples/clean_dataset.py
+++ b/oasst-data/examples/clean_dataset.py
@@ -1,3 +1,11 @@
+"""
+Example usage:
+
+    python clean_dataset.py \
+        "2023-11-05_oasst_all.jsonl" \
+        "2023-11-05_oasst_all.clean.jsonl" \
+        --instructions "instructions.xlsx"
+"""
 import argparse
 from collections import OrderedDict
 
@@ -59,25 +67,36 @@ def delete_message(msg: ExportMessageNode):
             print(f"Tree deleted: {msg.message_id}")
         else:
             parent_msg = message_by_id[msg.parent_id]
-            parent_msg.replies.remove(msg)
-            print(f"Branch deleted: {msg.message_id} ({count_descendants(msg)} messages)")
+            try:
+                parent_msg.replies.remove(msg)
+                print(f"Branch deleted: {msg.message_id} ({count_descendants(msg)} messages)")
+            except ValueError:
+                print(f"Message not found: {msg.message_id}")
 
     # cleaning
     print("Cleaning...")
     for index, row in instructions_df.iterrows():
         id = row["UUID"]
+        print(f"Cleaning id={id}")
         msg = message_by_id.get(id)
         if msg is None:
             print(f"Not found: {id}")
+            print(f"Skipping instructions for : {id}")
+            continue
 
         action = row["Action"]
+        print(f"Action={action}")
+
+        # Delete
         if action == "Delete":
             print(f"deleting: {id}")
             delete_message(msg)
+        # Replace
         elif action == "Replace":
             print(f"replace: {id}")
             replace = row["Replace"]
             msg.text = replace
+        # Edit
         elif action == "Edit":
             print(f"edit: {id}")
             if row["Category"] == "Copy Code":
@@ -86,8 +105,13 @@ def delete_message(msg: ExportMessageNode):
             else:
                 find = row["Find"]
                 replace = row["Replace"]
-            msg.text.index(find)  # make sure text is present
-            msg.text = msg.text.replace(find, replace)
+            try:
+                msg.text.index(find)  # make sure text is present
+                msg.text = msg.text.replace(find, replace)
+            except ValueError as e:
+                print(e)
+                # print(f"find not found: {find}")
+                continue
         else:
             print(f"Unsupported action {action}")
 
diff --git a/oasst-data/examples/filter_messages.py b/oasst-data/examples/filter_messages.py
index b8005b569c..9ff0a9c656 100644
--- a/oasst-data/examples/filter_messages.py
+++ b/oasst-data/examples/filter_messages.py
@@ -126,7 +126,12 @@ def approve_message(msg: ExportMessageNode) -> bool:
         ):
             return False
 
-        if exclude_normal is True and not msg.deleted and not msg.synthetic and msg.review_result:
+        if (
+            exclude_normal is True
+            and not msg.deleted
+            and not msg.synthetic
+            and msg.review_result
+        ):
             return False
 
         if spam is not None and spam != (not msg.review_result):
diff --git a/oasst-data/examples/filter_trees.py b/oasst-data/examples/filter_trees.py
index 76c753aefb..ce757bc452 100644
--- a/oasst-data/examples/filter_trees.py
+++ b/oasst-data/examples/filter_trees.py
@@ -1,3 +1,12 @@
+"""
+Example usage:
+
+    python filter_trees.py \
+        "2023-11-05_oasst_all.jsonl" \
+        "2023-11-05_oasst_all.clean.jsonl" \
+        --states "ready_for_export"
+"""
+
 import argparse
 
 from oasst_data import read_message_trees, write_message_trees
diff --git a/oasst-data/examples/split_dataset.py b/oasst-data/examples/split_dataset.py
index 0a47a7ca0c..618185bb24 100644
--- a/oasst-data/examples/split_dataset.py
+++ b/oasst-data/examples/split_dataset.py
@@ -1,3 +1,12 @@
+"""
+Example usage:
+
+    python split_dataset.py \
+        "2023-11-05_oasst_all.messages.jsonl" \
+        --val_output "2023-11-05_oasst_all.messages.validation.jsonl" \
+        --train_output "2023-11-05_oasst_all.messages.train.jsonl"
+"""
+
 import argparse
 import random
 
diff --git a/oasst-data/examples/tree_to_messages.py b/oasst-data/examples/tree_to_messages.py
index fe4a8d42f9..924a329dd2 100644
--- a/oasst-data/examples/tree_to_messages.py
+++ b/oasst-data/examples/tree_to_messages.py
@@ -1,3 +1,11 @@
+"""
+Example usage:
+
+    python tree_to_messages.py \
+        "2023-11-05_oasst_all.jsonl" \
+        "2023-11-05_oasst_all.messages.jsonl"
+"""
+
 import argparse
 
 from oasst_data import ExportMessageNode, read_message_trees, visit_messages_depth_first, write_messages
diff --git a/oasst-data/oasst2/generate_oasst2.ipynb b/oasst-data/oasst2/generate_oasst2.ipynb
new file mode 100644
index 0000000000..3e47fd68e3
--- /dev/null
+++ b/oasst-data/oasst2/generate_oasst2.ipynb
@@ -0,0 +1,1011 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import hashlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# inputs\n",
+    "data_dir = \"C:/Users/andre/Downloads\"\n",
+    "data_out_dir = f\"{data_dir}/oasst2\"\n",
+    "raw_input_data_path = f\"{data_dir}/2023-11-05_oasst_all/2023-11-05_oasst_all.jsonl\"\n",
+    "instructions_path = f\"{data_dir}/instructions.xlsx\"\n",
+    "trees_filename = f\"2023-11-05_oasst_all.trees.jsonl\"\n",
+    "trees_ready_filename = f\"2023-11-05_oasst_all.trees.ready_for_export.jsonl\"\n",
+    "messages_filename = f\"2023-11-05_oasst_all.messages.jsonl\"\n",
+    "messages_ready_filename = f\"2023-11-05_oasst_all.messages.ready_for_export.jsonl\"\n",
+    "messages_train_filename = f\"2023-11-05_oasst_all.messages.train.jsonl\"\n",
+    "messages_ready_train_filename = (\n",
+    "    f\"2023-11-05_oasst_all.messages.ready_for_export.train.jsonl\"\n",
+    ")\n",
+    "messages_validation_filename = f\"2023-11-05_oasst_all.messages.validation.jsonl\"\n",
+    "messages_ready_validation_filename = (\n",
+    "    f\"2023-11-05_oasst_all.messages.ready_for_export.validation.jsonl\"\n",
+    ")\n",
+    "\n",
+    "# make data_out_dir if it doesn't exist\n",
+    "if not os.path.exists(data_out_dir):\n",
+    "    os.makedirs(data_out_dir)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Hash of input data: C:/Users/andre/Downloads/2023-11-05_oasst_all/2023-11-05_oasst_all.jsonl\n",
+      "8223a0083f70749ecf430b4057c50dc4\n",
+      "Hash of Instructions: C:/Users/andre/Downloads/instructions.xlsx\n",
+      "99e7a311f473b08781fad2b1855243dc\n"
+     ]
+    }
+   ],
+   "source": [
+    "# print hashes of input files\n",
+    "\n",
+    "print(f\"Hash of input data: {raw_input_data_path}\")\n",
+    "with open(raw_input_data_path, \"rb\") as f:\n",
+    "    print(hashlib.md5(f.read()).hexdigest())\n",
+    "\n",
+    "print(f\"Hash of Instructions: {instructions_path}\")\n",
+    "with open(instructions_path, \"rb\") as f:\n",
+    "    print(hashlib.md5(f.read()).hexdigest())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reading: C:/Users/andre/Downloads/2023-11-05_oasst_all/2023-11-05_oasst_all.jsonl\n",
+      "Loaded 70673 trees with 208686 messages.\n",
+      "Cleaning...\n",
+      "Cleaning id=36be40c8-2451-4b92-99b9-97f425f6955b\n",
+      "Action=Edit\n",
+      "edit: 36be40c8-2451-4b92-99b9-97f425f6955b\n",
+      "Cleaning id=e6933f01-4183-45bf-892c-31bdf778eee0\n",
+      "Action=Delete\n",
+      "deleting: e6933f01-4183-45bf-892c-31bdf778eee0\n",
+      "Tree deleted: e6933f01-4183-45bf-892c-31bdf778eee0\n",
+      "Cleaning id=449a995f-29a4-4d04-aa56-2dab1747a417\n",
+      "Action=Delete\n",
+      "deleting: 449a995f-29a4-4d04-aa56-2dab1747a417\n",
+      "Tree deleted: 449a995f-29a4-4d04-aa56-2dab1747a417\n",
+      "Cleaning id=4b6c83d4-b6c1-452e-b57a-a09bec46a887\n",
+      "Action=Delete\n",
+      "deleting: 4b6c83d4-b6c1-452e-b57a-a09bec46a887\n",
+      "Tree deleted: 4b6c83d4-b6c1-452e-b57a-a09bec46a887\n",
+      "Cleaning id=af60f432-7fb2-4f63-b73a-6092a35bf21b\n",
+      "Action=Delete\n",
+      "deleting: af60f432-7fb2-4f63-b73a-6092a35bf21b\n",
+      "Tree deleted: af60f432-7fb2-4f63-b73a-6092a35bf21b\n",
+      "Cleaning id=5ab3a8bd-bf74-4331-9564-c7e9a663de8e\n",
+      "Action=Edit\n",
+      "edit: 5ab3a8bd-bf74-4331-9564-c7e9a663de8e\n",
+      "Cleaning id=1c001f37-ae16-49fd-9877-48ffcc8091c4\n",
+      "Action=Edit\n",
+      "edit: 1c001f37-ae16-49fd-9877-48ffcc8091c4\n",
+      "Cleaning id=4fedd950-75b6-44f7-a908-3eac7d6b6056\n",
+      "Action=Delete\n",
+      "deleting: 4fedd950-75b6-44f7-a908-3eac7d6b6056\n",
+      "Tree deleted: 4fedd950-75b6-44f7-a908-3eac7d6b6056\n",
+      "Cleaning id=15b74473-55d4-41de-bdba-6aef17b08cc2\n",
+      "Action=Delete\n",
+      "deleting: 15b74473-55d4-41de-bdba-6aef17b08cc2\n",
+      "Tree deleted: 15b74473-55d4-41de-bdba-6aef17b08cc2\n",
+      "Cleaning id=e921db26-b14e-4a98-b854-ee8c07fef2d7\n",
+      "Action=Delete\n",
+      "deleting: e921db26-b14e-4a98-b854-ee8c07fef2d7\n",
+      "Tree deleted: e921db26-b14e-4a98-b854-ee8c07fef2d7\n",
+      "Cleaning id=65a641e2-bfd2-4481-b5fa-5ea4bffb7867\n",
+      "Action=Delete\n",
+      "deleting: 65a641e2-bfd2-4481-b5fa-5ea4bffb7867\n",
+      "Tree deleted: 65a641e2-bfd2-4481-b5fa-5ea4bffb7867\n",
+      "Cleaning id=ea20c005-fd42-4b1f-9089-f0545d4f379c\n",
+      "Action=Replace\n",
+      "replace: ea20c005-fd42-4b1f-9089-f0545d4f379c\n",
+      "Cleaning id=10dd0c91-f372-423b-96f5-045ca304166f\n",
+      "Action=Replace\n",
+      "replace: 10dd0c91-f372-423b-96f5-045ca304166f\n",
+      "Cleaning id=be6bb6d1-60ee-4418-ba29-5d61ba979935\n",
+      "Action=Delete\n",
+      "deleting: be6bb6d1-60ee-4418-ba29-5d61ba979935\n",
+      "Tree deleted: be6bb6d1-60ee-4418-ba29-5d61ba979935\n",
+      "Cleaning id=29bc510e-bba5-45a5-9005-d091f95351fb\n",
+      "Action=Delete\n",
+      "deleting: 29bc510e-bba5-45a5-9005-d091f95351fb\n",
+      "Tree deleted: 29bc510e-bba5-45a5-9005-d091f95351fb\n",
+      "Cleaning id=b60c32ae-2cc3-4896-9f92-347b5e65bce7\n",
+      "Action=Delete\n",
+      "deleting: b60c32ae-2cc3-4896-9f92-347b5e65bce7\n",
+      "Tree deleted: b60c32ae-2cc3-4896-9f92-347b5e65bce7\n",
+      "Cleaning id=18b6e6e2-f392-4bd4-9d12-ccd706dccbb6\n",
+      "Action=Delete\n",
+      "deleting: 18b6e6e2-f392-4bd4-9d12-ccd706dccbb6\n",
+      "Tree deleted: 18b6e6e2-f392-4bd4-9d12-ccd706dccbb6\n",
+      "Cleaning id=89141477-b899-4357-ae7f-50c2d33a119b\n",
+      "Action=Delete\n",
+      "deleting: 89141477-b899-4357-ae7f-50c2d33a119b\n",
+      "Tree deleted: 89141477-b899-4357-ae7f-50c2d33a119b\n",
+      "Cleaning id=ba17e6e7-bde1-41b3-99c8-e5b537f5abbe\n",
+      "Action=Delete\n",
+      "deleting: ba17e6e7-bde1-41b3-99c8-e5b537f5abbe\n",
+      "Tree deleted: ba17e6e7-bde1-41b3-99c8-e5b537f5abbe\n",
+      "Cleaning id=7e93d535-dd90-48a2-beeb-b71917912e90\n",
+      "Action=Delete\n",
+      "deleting: 7e93d535-dd90-48a2-beeb-b71917912e90\n",
+      "Tree deleted: 7e93d535-dd90-48a2-beeb-b71917912e90\n",
+      "Cleaning id=ede2cfed-22d2-421d-b218-adc7402f8a5a\n",
+      "Action=Delete\n",
+      "deleting: ede2cfed-22d2-421d-b218-adc7402f8a5a\n",
+      "Tree deleted: ede2cfed-22d2-421d-b218-adc7402f8a5a\n",
+      "Cleaning id=64bfc00b-9c53-4a5d-8187-91cd2c581e78\n",
+      "Action=Delete\n",
+      "deleting: 64bfc00b-9c53-4a5d-8187-91cd2c581e78\n",
+      "Tree deleted: 64bfc00b-9c53-4a5d-8187-91cd2c581e78\n",
+      "Cleaning id=80904428-577f-4cc1-95f7-c8383b093e49\n",
+      "Action=Delete\n",
+      "deleting: 80904428-577f-4cc1-95f7-c8383b093e49\n",
+      "Tree deleted: 80904428-577f-4cc1-95f7-c8383b093e49\n",
+      "Cleaning id=89a37680-a7b6-4437-bd24-ecfe8395eb29\n",
+      "Action=Delete\n",
+      "deleting: 89a37680-a7b6-4437-bd24-ecfe8395eb29\n",
+      "Branch deleted: 89a37680-a7b6-4437-bd24-ecfe8395eb29 (1 messages)\n",
+      "Cleaning id=549a7cf2-f6bc-4b13-855f-601c714bb033\n",
+      "Action=Delete\n",
+      "deleting: 549a7cf2-f6bc-4b13-855f-601c714bb033\n",
+      "Tree deleted: 549a7cf2-f6bc-4b13-855f-601c714bb033\n",
+      "Cleaning id=e5823ee5-ffd3-4604-8973-325aa1f3b2f1\n",
+      "Action=Delete\n",
+      "deleting: e5823ee5-ffd3-4604-8973-325aa1f3b2f1\n",
+      "Tree deleted: e5823ee5-ffd3-4604-8973-325aa1f3b2f1\n",
+      "Cleaning id=6b665310-180c-4a4b-95e0-b4cc4a0ff501\n",
+      "Action=Delete\n",
+      "deleting: 6b665310-180c-4a4b-95e0-b4cc4a0ff501\n",
+      "Branch deleted: 6b665310-180c-4a4b-95e0-b4cc4a0ff501 (1 messages)\n",
+      "Cleaning id=12bdba1b-e7be-432e-8619-4428e98dc144\n",
+      "Action=Delete\n",
+      "deleting: 12bdba1b-e7be-432e-8619-4428e98dc144\n",
+      "Branch deleted: 12bdba1b-e7be-432e-8619-4428e98dc144 (1 messages)\n",
+      "Cleaning id=4b0a0f5e-8e75-4917-b9de-5b7f0f651b2f\n",
+      "Action=Delete\n",
+      "deleting: 4b0a0f5e-8e75-4917-b9de-5b7f0f651b2f\n",
+      "Branch deleted: 4b0a0f5e-8e75-4917-b9de-5b7f0f651b2f (1 messages)\n",
+      "Cleaning id=5719d9f1-1cb7-4e83-bd36-d9728cefe2a3\n",
+      "Action=Delete\n",
+      "deleting: 5719d9f1-1cb7-4e83-bd36-d9728cefe2a3\n",
+      "Branch deleted: 5719d9f1-1cb7-4e83-bd36-d9728cefe2a3 (1 messages)\n",
+      "Cleaning id=b38fbad4-b43f-42f2-8561-1523165939a9\n",
+      "Action=Delete\n",
+      "deleting: b38fbad4-b43f-42f2-8561-1523165939a9\n",
+      "Branch deleted: b38fbad4-b43f-42f2-8561-1523165939a9 (1 messages)\n",
+      "Cleaning id=f3b49fe6-1ce5-44ae-b78f-95b4572a161d\n",
+      "Action=Delete\n",
+      "deleting: f3b49fe6-1ce5-44ae-b78f-95b4572a161d\n",
+      "Branch deleted: f3b49fe6-1ce5-44ae-b78f-95b4572a161d (1 messages)\n",
+      "Cleaning id=7dac84b7-407b-4b8a-8b77-ef384b2c117e\n",
+      "Action=Delete\n",
+      "deleting: 7dac84b7-407b-4b8a-8b77-ef384b2c117e\n",
+      "Branch deleted: 7dac84b7-407b-4b8a-8b77-ef384b2c117e (1 messages)\n",
+      "Cleaning id=3de67512-a8bd-4c0b-8b3a-c868fb868ee4\n",
+      "Action=Delete\n",
+      "deleting: 3de67512-a8bd-4c0b-8b3a-c868fb868ee4\n",
+      "Branch deleted: 3de67512-a8bd-4c0b-8b3a-c868fb868ee4 (1 messages)\n",
+      "Cleaning id=7effedf7-4857-4d60-b690-34814d9c9086\n",
+      "Action=Delete\n",
+      "deleting: 7effedf7-4857-4d60-b690-34814d9c9086\n",
+      "Branch deleted: 7effedf7-4857-4d60-b690-34814d9c9086 (3 messages)\n",
+      "Cleaning id=496ee685-a186-4498-ab9f-71b7602b9594\n",
+      "Action=Delete\n",
+      "deleting: 496ee685-a186-4498-ab9f-71b7602b9594\n",
+      "Branch deleted: 496ee685-a186-4498-ab9f-71b7602b9594 (1 messages)\n",
+      "Cleaning id=0a4d14a7-6831-4fa2-b04e-ff1186651200\n",
+      "Action=Delete\n",
+      "deleting: 0a4d14a7-6831-4fa2-b04e-ff1186651200\n",
+      "Branch deleted: 0a4d14a7-6831-4fa2-b04e-ff1186651200 (1 messages)\n",
+      "Cleaning id=0ed9a702-199c-4fc9-9dcc-1343e4140f88\n",
+      "Action=Delete\n",
+      "deleting: 0ed9a702-199c-4fc9-9dcc-1343e4140f88\n",
+      "Branch deleted: 0ed9a702-199c-4fc9-9dcc-1343e4140f88 (1 messages)\n",
+      "Cleaning id=803b00c5-cd83-41fd-a3f5-8c88220bfdfe\n",
+      "Action=Delete\n",
+      "deleting: 803b00c5-cd83-41fd-a3f5-8c88220bfdfe\n",
+      "Branch deleted: 803b00c5-cd83-41fd-a3f5-8c88220bfdfe (1 messages)\n",
+      "Cleaning id=eb2b4c9b-c040-41a8-a67c-dac68c020b0a\n",
+      "Action=Delete\n",
+      "deleting: eb2b4c9b-c040-41a8-a67c-dac68c020b0a\n",
+      "Tree deleted: eb2b4c9b-c040-41a8-a67c-dac68c020b0a\n",
+      "Cleaning id=61e0411e-27d4-4778-80f0-2501b1a36786\n",
+      "Action=Delete\n",
+      "deleting: 61e0411e-27d4-4778-80f0-2501b1a36786\n",
+      "Tree deleted: 61e0411e-27d4-4778-80f0-2501b1a36786\n",
+      "Cleaning id=cc193de3-0ccc-4844-b2fc-8b67d0cbf89c\n",
+      "Action=Delete\n",
+      "deleting: cc193de3-0ccc-4844-b2fc-8b67d0cbf89c\n",
+      "Branch deleted: cc193de3-0ccc-4844-b2fc-8b67d0cbf89c (1 messages)\n",
+      "Cleaning id=994141fc-e7b6-462f-9b64-305379038be1\n",
+      "Action=Delete\n",
+      "deleting: 994141fc-e7b6-462f-9b64-305379038be1\n",
+      "Tree deleted: 994141fc-e7b6-462f-9b64-305379038be1\n",
+      "Cleaning id=f3949958-ced8-4ed8-9de5-2c50e4296c1e\n",
+      "Action=Delete\n",
+      "deleting: f3949958-ced8-4ed8-9de5-2c50e4296c1e\n",
+      "Branch deleted: f3949958-ced8-4ed8-9de5-2c50e4296c1e (1 messages)\n",
+      "Cleaning id=bf12f013-a961-49eb-99a4-9f520f52bd52\n",
+      "Action=Delete\n",
+      "deleting: bf12f013-a961-49eb-99a4-9f520f52bd52\n",
+      "Branch deleted: bf12f013-a961-49eb-99a4-9f520f52bd52 (1 messages)\n",
+      "Cleaning id=59f41793-7f3a-480f-819f-b581faae1e7a\n",
+      "Action=Edit\n",
+      "edit: 59f41793-7f3a-480f-819f-b581faae1e7a\n",
+      "Cleaning id=a1f43d47-f0f2-42d3-804c-605714cc56f0\n",
+      "Action=Delete\n",
+      "deleting: a1f43d47-f0f2-42d3-804c-605714cc56f0\n",
+      "Branch deleted: a1f43d47-f0f2-42d3-804c-605714cc56f0 (1 messages)\n",
+      "Cleaning id=2c0b821c-2525-4c69-8480-44aaea186d14\n",
+      "Action=Delete\n",
+      "deleting: 2c0b821c-2525-4c69-8480-44aaea186d14\n",
+      "Tree deleted: 2c0b821c-2525-4c69-8480-44aaea186d14\n",
+      "Cleaning id=7695f9be-4196-4c74-918f-984ae75f0f94\n",
+      "Action=Delete\n",
+      "deleting: 7695f9be-4196-4c74-918f-984ae75f0f94\n",
+      "Tree deleted: 7695f9be-4196-4c74-918f-984ae75f0f94\n",
+      "Cleaning id=f70cd3b7-6df9-40fe-a73b-13528f3cf49b\n",
+      "Action=Delete\n",
+      "deleting: f70cd3b7-6df9-40fe-a73b-13528f3cf49b\n",
+      "Tree deleted: f70cd3b7-6df9-40fe-a73b-13528f3cf49b\n",
+      "Cleaning id=ed80741a-1e3b-4a86-879e-51369e18e796\n",
+      "Action=Delete\n",
+      "deleting: ed80741a-1e3b-4a86-879e-51369e18e796\n",
+      "Branch deleted: ed80741a-1e3b-4a86-879e-51369e18e796 (1 messages)\n",
+      "Cleaning id=1b36018f-7f42-49fe-8e83-b0af4b29a04e\n",
+      "Not found: 1b36018f-7f42-49fe-8e83-b0af4b29a04e\n",
+      "Skipping instructions for : 1b36018f-7f42-49fe-8e83-b0af4b29a04e\n",
+      "Cleaning id=52747b24-82dc-42f0-b764-f689e885b50b\n",
+      "Action=Delete\n",
+      "deleting: 52747b24-82dc-42f0-b764-f689e885b50b\n",
+      "Tree deleted: 52747b24-82dc-42f0-b764-f689e885b50b\n",
+      "Cleaning id=419354d2-2f8e-4d14-bd87-1c27b9253fea\n",
+      "Action=Delete\n",
+      "deleting: 419354d2-2f8e-4d14-bd87-1c27b9253fea\n",
+      "Branch deleted: 419354d2-2f8e-4d14-bd87-1c27b9253fea (1 messages)\n",
+      "Cleaning id=22dfa8a2-2776-47b0-972d-259b63597865\n",
+      "Action=Delete\n",
+      "deleting: 22dfa8a2-2776-47b0-972d-259b63597865\n",
+      "Tree deleted: 22dfa8a2-2776-47b0-972d-259b63597865\n",
+      "Cleaning id=2840a94b-ecfe-4281-8210-a866e63ee14b\n",
+      "Action=Delete\n",
+      "deleting: 2840a94b-ecfe-4281-8210-a866e63ee14b\n",
+      "Tree deleted: 2840a94b-ecfe-4281-8210-a866e63ee14b\n",
+      "Cleaning id=1c9fe3c7-19a6-4e7a-ba31-a7cacd8db4d0\n",
+      "Action=Delete\n",
+      "deleting: 1c9fe3c7-19a6-4e7a-ba31-a7cacd8db4d0\n",
+      "Branch deleted: 1c9fe3c7-19a6-4e7a-ba31-a7cacd8db4d0 (1 messages)\n",
+      "Cleaning id=36b09cbd-d369-47a0-813a-c5f124e18e38\n",
+      "Action=Delete\n",
+      "deleting: 36b09cbd-d369-47a0-813a-c5f124e18e38\n",
+      "Branch deleted: 36b09cbd-d369-47a0-813a-c5f124e18e38 (1 messages)\n",
+      "Cleaning id=cb551444-a1ab-4eb7-b0b2-d68c15da6d4d\n",
+      "Action=Delete\n",
+      "deleting: cb551444-a1ab-4eb7-b0b2-d68c15da6d4d\n",
+      "Branch deleted: cb551444-a1ab-4eb7-b0b2-d68c15da6d4d (1 messages)\n",
+      "Cleaning id=eb1d86db-4f14-4157-90d1-3d2f2e9940df\n",
+      "Action=Delete\n",
+      "deleting: eb1d86db-4f14-4157-90d1-3d2f2e9940df\n",
+      "Branch deleted: eb1d86db-4f14-4157-90d1-3d2f2e9940df (1 messages)\n",
+      "Cleaning id=b0f0b044-ecec-4222-ac17-d5c9cd954e11\n",
+      "Action=Delete\n",
+      "deleting: b0f0b044-ecec-4222-ac17-d5c9cd954e11\n",
+      "Branch deleted: b0f0b044-ecec-4222-ac17-d5c9cd954e11 (2 messages)\n",
+      "Cleaning id=29458fc2-5359-48c9-b542-cce9cb92da93\n",
+      "Action=Delete\n",
+      "deleting: 29458fc2-5359-48c9-b542-cce9cb92da93\n",
+      "Branch deleted: 29458fc2-5359-48c9-b542-cce9cb92da93 (1 messages)\n",
+      "Cleaning id=39a7a2e4-ca84-4c41-b229-891117eb54fb\n",
+      "Action=Delete\n",
+      "deleting: 39a7a2e4-ca84-4c41-b229-891117eb54fb\n",
+      "Branch deleted: 39a7a2e4-ca84-4c41-b229-891117eb54fb (1 messages)\n",
+      "Cleaning id=d757fdd4-5748-4a02-8924-6dfb5583596d\n",
+      "Not found: d757fdd4-5748-4a02-8924-6dfb5583596d\n",
+      "Skipping instructions for : d757fdd4-5748-4a02-8924-6dfb5583596d\n",
+      "Cleaning id=66ddd46c-299f-4394-b0d5-51ab66ca06bd\n",
+      "Not found: 66ddd46c-299f-4394-b0d5-51ab66ca06bd\n",
+      "Skipping instructions for : 66ddd46c-299f-4394-b0d5-51ab66ca06bd\n",
+      "Cleaning id=94da7786-44b5-4ca9-9414-fd4f85ae6ca5\n",
+      "Action=Delete\n",
+      "deleting: 94da7786-44b5-4ca9-9414-fd4f85ae6ca5\n",
+      "Branch deleted: 94da7786-44b5-4ca9-9414-fd4f85ae6ca5 (2 messages)\n",
+      "Cleaning id=f52c527e-b8d9-4948-87c7-12f40e5f4c18\n",
+      "Action=Delete\n",
+      "deleting: f52c527e-b8d9-4948-87c7-12f40e5f4c18\n",
+      "Branch deleted: f52c527e-b8d9-4948-87c7-12f40e5f4c18 (2 messages)\n",
+      "Cleaning id=3bd4b565-e359-49ea-ae35-9ef4918a3454\n",
+      "Action=Delete\n",
+      "deleting: 3bd4b565-e359-49ea-ae35-9ef4918a3454\n",
+      "Branch deleted: 3bd4b565-e359-49ea-ae35-9ef4918a3454 (1 messages)\n",
+      "Cleaning id=f01e097d-774f-43ab-b9cf-ad1c817cbb71\n",
+      "Action=Delete\n",
+      "deleting: f01e097d-774f-43ab-b9cf-ad1c817cbb71\n",
+      "Branch deleted: f01e097d-774f-43ab-b9cf-ad1c817cbb71 (1 messages)\n",
+      "Cleaning id=ccf8373b-a8cf-49be-b612-7336abf1394c\n",
+      "Action=Delete\n",
+      "deleting: ccf8373b-a8cf-49be-b612-7336abf1394c\n",
+      "Branch deleted: ccf8373b-a8cf-49be-b612-7336abf1394c (1 messages)\n",
+      "Cleaning id=3c419703-2097-4946-8901-c66d4e2c0ef7\n",
+      "Action=Delete\n",
+      "deleting: 3c419703-2097-4946-8901-c66d4e2c0ef7\n",
+      "Branch deleted: 3c419703-2097-4946-8901-c66d4e2c0ef7 (3 messages)\n",
+      "Cleaning id=31a5583c-874e-402b-970c-ba2d9a4dcaae\n",
+      "Action=Delete\n",
+      "deleting: 31a5583c-874e-402b-970c-ba2d9a4dcaae\n",
+      "Branch deleted: 31a5583c-874e-402b-970c-ba2d9a4dcaae (2 messages)\n",
+      "Cleaning id=d2fd731c-23cd-4b2f-b509-cf669f221756\n",
+      "Action=Delete\n",
+      "deleting: d2fd731c-23cd-4b2f-b509-cf669f221756\n",
+      "Branch deleted: d2fd731c-23cd-4b2f-b509-cf669f221756 (1 messages)\n",
+      "Cleaning id=c812f202-fff3-4c91-8d69-11c8b2658c0f\n",
+      "Not found: c812f202-fff3-4c91-8d69-11c8b2658c0f\n",
+      "Skipping instructions for : c812f202-fff3-4c91-8d69-11c8b2658c0f\n",
+      "Cleaning id=6c49c918-9a73-41ca-9dfb-74c0685e861a\n",
+      "Action=Delete\n",
+      "deleting: 6c49c918-9a73-41ca-9dfb-74c0685e861a\n",
+      "Tree deleted: 6c49c918-9a73-41ca-9dfb-74c0685e861a\n",
+      "Cleaning id=cf659a23-c5aa-494c-bb72-c08ff3cd9f93\n",
+      "Action=Delete\n",
+      "deleting: cf659a23-c5aa-494c-bb72-c08ff3cd9f93\n",
+      "Branch deleted: cf659a23-c5aa-494c-bb72-c08ff3cd9f93 (1 messages)\n",
+      "Cleaning id=753de123-8213-4f98-90ce-f6137f083db2\n",
+      "Action=Delete\n",
+      "deleting: 753de123-8213-4f98-90ce-f6137f083db2\n",
+      "Tree deleted: 753de123-8213-4f98-90ce-f6137f083db2\n",
+      "Cleaning id=3311a64f-3d64-40a0-8647-edb12616225e\n",
+      "Action=Delete\n",
+      "deleting: 3311a64f-3d64-40a0-8647-edb12616225e\n",
+      "Branch deleted: 3311a64f-3d64-40a0-8647-edb12616225e (1 messages)\n",
+      "Cleaning id=05c3893d-9c74-4618-b690-360317677d3f\n",
+      "Action=Delete\n",
+      "deleting: 05c3893d-9c74-4618-b690-360317677d3f\n",
+      "Branch deleted: 05c3893d-9c74-4618-b690-360317677d3f (1 messages)\n",
+      "Cleaning id=15f4aeab-de6c-43a0-9530-33c99e8386ff\n",
+      "Action=Delete\n",
+      "deleting: 15f4aeab-de6c-43a0-9530-33c99e8386ff\n",
+      "Branch deleted: 15f4aeab-de6c-43a0-9530-33c99e8386ff (1 messages)\n",
+      "Cleaning id=29e4ae6c-bbb6-4e07-8fc9-c400f61cbc12\n",
+      "Action=Delete\n",
+      "deleting: 29e4ae6c-bbb6-4e07-8fc9-c400f61cbc12\n",
+      "Branch deleted: 29e4ae6c-bbb6-4e07-8fc9-c400f61cbc12 (1 messages)\n",
+      "Cleaning id=ca34580a-e5df-4064-aac2-9eac4099e7ce\n",
+      "Action=Delete\n",
+      "deleting: ca34580a-e5df-4064-aac2-9eac4099e7ce\n",
+      "Branch deleted: ca34580a-e5df-4064-aac2-9eac4099e7ce (1 messages)\n",
+      "Cleaning id=4fba85a6-17b9-4f7a-b18c-f6e52f772700\n",
+      "Action=Delete\n",
+      "deleting: 4fba85a6-17b9-4f7a-b18c-f6e52f772700\n",
+      "Branch deleted: 4fba85a6-17b9-4f7a-b18c-f6e52f772700 (1 messages)\n",
+      "Cleaning id=2428fc50-d942-41e9-ac43-5252f7519485\n",
+      "Action=Delete\n",
+      "deleting: 2428fc50-d942-41e9-ac43-5252f7519485\n",
+      "Branch deleted: 2428fc50-d942-41e9-ac43-5252f7519485 (1 messages)\n",
+      "Cleaning id=f5482168-3fbc-4a15-a6e0-8660cea70f37\n",
+      "Action=Edit\n",
+      "edit: f5482168-3fbc-4a15-a6e0-8660cea70f37\n",
+      "substring not found\n",
+      "Cleaning id=9b48bb3f-94d4-4fa8-91ed-64683e63206c\n",
+      "Action=Edit\n",
+      "edit: 9b48bb3f-94d4-4fa8-91ed-64683e63206c\n",
+      "substring not found\n",
+      "Cleaning id=68c80cb8-9998-4ca3-a4f1-3e244a7aac2a\n",
+      "Action=Edit\n",
+      "edit: 68c80cb8-9998-4ca3-a4f1-3e244a7aac2a\n",
+      "Cleaning id=833b1b12-574a-4662-b1c6-33d0202a4a00\n",
+      "Action=Edit\n",
+      "edit: 833b1b12-574a-4662-b1c6-33d0202a4a00\n",
+      "Cleaning id=02702885-6c2e-4e49-8f92-f766c2a3b940\n",
+      "Not found: 02702885-6c2e-4e49-8f92-f766c2a3b940\n",
+      "Skipping instructions for : 02702885-6c2e-4e49-8f92-f766c2a3b940\n",
+      "Cleaning id=6a0823fe-4302-44e0-8a19-8bdc91a8ac7e\n",
+      "Not found: 6a0823fe-4302-44e0-8a19-8bdc91a8ac7e\n",
+      "Skipping instructions for : 6a0823fe-4302-44e0-8a19-8bdc91a8ac7e\n",
+      "Cleaning id=f4dd77d4-24dd-4ab8-a568-9d0edca5fe17\n",
+      "Not found: f4dd77d4-24dd-4ab8-a568-9d0edca5fe17\n",
+      "Skipping instructions for : f4dd77d4-24dd-4ab8-a568-9d0edca5fe17\n",
+      "Cleaning id=8ad4358b-9dd4-4f35-b3e2-17af38abc0d4\n",
+      "Action=Edit\n",
+      "edit: 8ad4358b-9dd4-4f35-b3e2-17af38abc0d4\n",
+      "Cleaning id=68a72d7a-e42e-4e5f-b22f-9964f0463cc4\n",
+      "Action=Edit\n",
+      "edit: 68a72d7a-e42e-4e5f-b22f-9964f0463cc4\n",
+      "substring not found\n",
+      "Cleaning id=ec357197-0ee8-4acd-8072-29aa8e76292e\n",
+      "Action=Edit\n",
+      "edit: ec357197-0ee8-4acd-8072-29aa8e76292e\n",
+      "substring not found\n",
+      "Cleaning id=edbecc63-47e4-4815-8744-5ad69f93bb33\n",
+      "Action=Edit\n",
+      "edit: edbecc63-47e4-4815-8744-5ad69f93bb33\n",
+      "substring not found\n",
+      "Cleaning id=5fb75daf-1926-47a2-85e8-f80c00ba7d03\n",
+      "Action=Edit\n",
+      "edit: 5fb75daf-1926-47a2-85e8-f80c00ba7d03\n",
+      "substring not found\n",
+      "Cleaning id=776f420b-6020-4f77-8b72-d48b580a0755\n",
+      "Action=Edit\n",
+      "edit: 776f420b-6020-4f77-8b72-d48b580a0755\n",
+      "substring not found\n",
+      "Cleaning id=4e97e7f6-cd85-4d8d-b73c-438faa23dd95\n",
+      "Action=Edit\n",
+      "edit: 4e97e7f6-cd85-4d8d-b73c-438faa23dd95\n",
+      "Cleaning id=fc010f62-4ebf-46cd-893f-180cde59f0f5\n",
+      "Action=Edit\n",
+      "edit: fc010f62-4ebf-46cd-893f-180cde59f0f5\n",
+      "Cleaning id=118f9a1c-f976-4120-8ff9-934b22545b0d\n",
+      "Action=Edit\n",
+      "edit: 118f9a1c-f976-4120-8ff9-934b22545b0d\n",
+      "substring not found\n",
+      "Cleaning id=8f70327e-1239-4564-a938-b9649465f14a\n",
+      "Action=Edit\n",
+      "edit: 8f70327e-1239-4564-a938-b9649465f14a\n",
+      "Cleaning id=b0ad34ae-4080-44bf-939f-7ece554fe9bb\n",
+      "Action=Edit\n",
+      "edit: b0ad34ae-4080-44bf-939f-7ece554fe9bb\n",
+      "Cleaning id=7800454d-b340-49d8-8d52-bd26f7c550e6\n",
+      "Action=Edit\n",
+      "edit: 7800454d-b340-49d8-8d52-bd26f7c550e6\n",
+      "Cleaning id=56de418b-c125-4d26-9d77-7db05d548faa\n",
+      "Action=Edit\n",
+      "edit: 56de418b-c125-4d26-9d77-7db05d548faa\n",
+      "Cleaning id=8b7c85bf-da3c-40e1-b3f8-b7fcd10424b6\n",
+      "Action=Edit\n",
+      "edit: 8b7c85bf-da3c-40e1-b3f8-b7fcd10424b6\n",
+      "Cleaning id=8d0d0f03-2fd0-4236-a872-3826cc9d36d6\n",
+      "Action=Edit\n",
+      "edit: 8d0d0f03-2fd0-4236-a872-3826cc9d36d6\n",
+      "Cleaning id=976893a1-416c-448d-84e4-7ee0311b6809\n",
+      "Action=Edit\n",
+      "edit: 976893a1-416c-448d-84e4-7ee0311b6809\n",
+      "substring not found\n",
+      "Cleaning id=34e3d882-9be9-44a7-859e-78a828efa0f8\n",
+      "Action=Edit\n",
+      "edit: 34e3d882-9be9-44a7-859e-78a828efa0f8\n",
+      "substring not found\n",
+      "Cleaning id=f0d3b169-1e2a-43d7-83ce-a792304e1de0\n",
+      "Action=Edit\n",
+      "edit: f0d3b169-1e2a-43d7-83ce-a792304e1de0\n",
+      "Cleaning id=2528f101-fbe9-4907-a44a-783e74e47aa6\n",
+      "Action=Edit\n",
+      "edit: 2528f101-fbe9-4907-a44a-783e74e47aa6\n",
+      "Cleaning id=84418163-5721-4653-9730-c483dd7b563e\n",
+      "Action=Edit\n",
+      "edit: 84418163-5721-4653-9730-c483dd7b563e\n",
+      "Cleaning id=7953dff8-09ec-4372-a1dc-f7c8a2ae6053\n",
+      "Action=Edit\n",
+      "edit: 7953dff8-09ec-4372-a1dc-f7c8a2ae6053\n",
+      "Cleaning id=cfa6b4b7-406a-4990-81b0-a2169cbed8d3\n",
+      "Action=Edit\n",
+      "edit: cfa6b4b7-406a-4990-81b0-a2169cbed8d3\n",
+      "Cleaning id=bfbf6135-3818-4d28-ac8e-cd9946bc72f3\n",
+      "Action=Edit\n",
+      "edit: bfbf6135-3818-4d28-ac8e-cd9946bc72f3\n",
+      "Cleaning id=ce81ca69-5a48-43eb-8df7-580a3d68578d\n",
+      "Action=Edit\n",
+      "edit: ce81ca69-5a48-43eb-8df7-580a3d68578d\n",
+      "Cleaning id=647add48-fbc9-4ac6-9930-1901da34520a\n",
+      "Action=Edit\n",
+      "edit: 647add48-fbc9-4ac6-9930-1901da34520a\n",
+      "Cleaning id=24545bf7-949d-446b-9b68-2553b2392357\n",
+      "Action=Edit\n",
+      "edit: 24545bf7-949d-446b-9b68-2553b2392357\n",
+      "Cleaning id=b15d9839-8e4a-4d7b-8b91-634eb1b37376\n",
+      "Action=Edit\n",
+      "edit: b15d9839-8e4a-4d7b-8b91-634eb1b37376\n",
+      "Cleaning id=f4e9c4d0-8686-451a-bc90-8dd7a5a51fbe\n",
+      "Action=Edit\n",
+      "edit: f4e9c4d0-8686-451a-bc90-8dd7a5a51fbe\n",
+      "Cleaning id=bea22011-334a-4e34-8c11-b1c1566de59c\n",
+      "Action=Edit\n",
+      "edit: bea22011-334a-4e34-8c11-b1c1566de59c\n",
+      "Cleaning id=756cdbe4-e04e-46bf-a049-e375e5100653\n",
+      "Action=Edit\n",
+      "edit: 756cdbe4-e04e-46bf-a049-e375e5100653\n",
+      "Cleaning id=b1aee184-1ec4-45dc-9a9c-515de51f636f\n",
+      "Action=Edit\n",
+      "edit: b1aee184-1ec4-45dc-9a9c-515de51f636f\n",
+      "Cleaning id=4f5227cc-1d36-4e19-b78f-d08a60dc2141\n",
+      "Action=Edit\n",
+      "edit: 4f5227cc-1d36-4e19-b78f-d08a60dc2141\n",
+      "Cleaning id=14d1d5e8-61a2-4e0b-898c-f4b2c38236c2\n",
+      "Action=Edit\n",
+      "edit: 14d1d5e8-61a2-4e0b-898c-f4b2c38236c2\n",
+      "substring not found\n",
+      "Cleaning id=a275451b-6674-467b-b7cf-539cfec31f64\n",
+      "Action=Edit\n",
+      "edit: a275451b-6674-467b-b7cf-539cfec31f64\n",
+      "Cleaning id=58adf822-cd49-4c18-b72e-526d9473bf6b\n",
+      "Action=Edit\n",
+      "edit: 58adf822-cd49-4c18-b72e-526d9473bf6b\n",
+      "Cleaning id=ae9d3f55-9f6f-41ea-a0bc-0f0eea162cb9\n",
+      "Action=Edit\n",
+      "edit: ae9d3f55-9f6f-41ea-a0bc-0f0eea162cb9\n",
+      "Cleaning id=1aed98d8-de97-449d-8f10-5ee506bd3ae4\n",
+      "Action=Edit\n",
+      "edit: 1aed98d8-de97-449d-8f10-5ee506bd3ae4\n",
+      "Cleaning id=d71ad38d-3bdd-4eed-8d19-526156f545a2\n",
+      "Action=Edit\n",
+      "edit: d71ad38d-3bdd-4eed-8d19-526156f545a2\n",
+      "Cleaning id=ae59b5aa-39cb-4ba0-8f05-8ae516ec9b3f\n",
+      "Action=Edit\n",
+      "edit: ae59b5aa-39cb-4ba0-8f05-8ae516ec9b3f\n",
+      "Cleaning id=7b0b2599-294f-4d4b-be67-724f6e17280b\n",
+      "Action=Edit\n",
+      "edit: 7b0b2599-294f-4d4b-be67-724f6e17280b\n",
+      "Cleaning id=6fc61c3c-af7a-4c7f-9556-3e2317795421\n",
+      "Action=Edit\n",
+      "edit: 6fc61c3c-af7a-4c7f-9556-3e2317795421\n",
+      "Cleaning id=307d9761-8dd0-4e7f-99fe-88fe9156a989\n",
+      "Action=Edit\n",
+      "edit: 307d9761-8dd0-4e7f-99fe-88fe9156a989\n",
+      "Cleaning id=4b509f26-ebb4-4de3-b665-2f506d9019ac\n",
+      "Action=Edit\n",
+      "edit: 4b509f26-ebb4-4de3-b665-2f506d9019ac\n",
+      "Cleaning id=7cbd1761-35d7-482d-8c6e-2cfde4677681\n",
+      "Action=Edit\n",
+      "edit: 7cbd1761-35d7-482d-8c6e-2cfde4677681\n",
+      "Cleaning id=2302958f-8b4b-467f-8955-8a991ddf7836\n",
+      "Action=Edit\n",
+      "edit: 2302958f-8b4b-467f-8955-8a991ddf7836\n",
+      "Cleaning id=377b1d72-cb10-4959-a20a-0a3846d34fbe\n",
+      "Action=Edit\n",
+      "edit: 377b1d72-cb10-4959-a20a-0a3846d34fbe\n",
+      "Cleaning id=6f272372-540c-47be-ac90-1f0b0f24b944\n",
+      "Action=Edit\n",
+      "edit: 6f272372-540c-47be-ac90-1f0b0f24b944\n",
+      "Cleaning id=ce4c4015-b5fd-464a-ab1b-df8d994e55ea\n",
+      "Action=Edit\n",
+      "edit: ce4c4015-b5fd-464a-ab1b-df8d994e55ea\n",
+      "substring not found\n",
+      "Cleaning id=c51c25d5-f632-436a-840c-e46ff07e3e79\n",
+      "Not found: c51c25d5-f632-436a-840c-e46ff07e3e79\n",
+      "Skipping instructions for : c51c25d5-f632-436a-840c-e46ff07e3e79\n",
+      "Cleaning id=598cc071-e847-48a6-a064-b1f6447654fb\n",
+      "Not found: 598cc071-e847-48a6-a064-b1f6447654fb\n",
+      "Skipping instructions for : 598cc071-e847-48a6-a064-b1f6447654fb\n",
+      "Cleaning id=743d0067-999c-4987-996b-8cf746a84195\n",
+      "Action=Edit\n",
+      "edit: 743d0067-999c-4987-996b-8cf746a84195\n",
+      "substring not found\n",
+      "Cleaning id=f372c08d-6054-491b-b503-b45f5996b854\n",
+      "Action=Edit\n",
+      "edit: f372c08d-6054-491b-b503-b45f5996b854\n",
+      "substring not found\n",
+      "Cleaning id=6df241aa-f5f8-4649-8313-1f8128f9bdcd\n",
+      "Action=Edit\n",
+      "edit: 6df241aa-f5f8-4649-8313-1f8128f9bdcd\n",
+      "Cleaning id=f817015f-0524-4ea4-a691-c8b6137858b4\n",
+      "Action=Edit\n",
+      "edit: f817015f-0524-4ea4-a691-c8b6137858b4\n",
+      "substring not found\n",
+      "Cleaning id=3eb60a82-2a3e-4a2b-b49f-7ea21002bd3d\n",
+      "Action=Edit\n",
+      "edit: 3eb60a82-2a3e-4a2b-b49f-7ea21002bd3d\n",
+      "substring not found\n",
+      "Cleaning id=b2763984-9f09-41b4-a1b3-18ecebf8eaaf\n",
+      "Action=Edit\n",
+      "edit: b2763984-9f09-41b4-a1b3-18ecebf8eaaf\n",
+      "substring not found\n",
+      "Cleaning id=01216bb7-2999-411d-9224-d9ad12aeb7ae\n",
+      "Action=Edit\n",
+      "edit: 01216bb7-2999-411d-9224-d9ad12aeb7ae\n",
+      "substring not found\n",
+      "Cleaning id=4e722d24-f373-48b9-b8df-afb5f564fd18\n",
+      "Action=Edit\n",
+      "edit: 4e722d24-f373-48b9-b8df-afb5f564fd18\n",
+      "Cleaning id=28df10e4-62a6-4b4e-84da-a9806f743b40\n",
+      "Action=Edit\n",
+      "edit: 28df10e4-62a6-4b4e-84da-a9806f743b40\n",
+      "substring not found\n",
+      "Cleaning id=2134fb02-ac5b-4c80-bfc9-0b4f7811ac22\n",
+      "Action=Edit\n",
+      "edit: 2134fb02-ac5b-4c80-bfc9-0b4f7811ac22\n",
+      "substring not found\n",
+      "Cleaning id=c67f523f-27a9-4648-a6ca-47856067b878\n",
+      "Action=Edit\n",
+      "edit: c67f523f-27a9-4648-a6ca-47856067b878\n",
+      "Cleaning id=7a875918-1bca-4cbe-9098-0cfdcf5c6a06\n",
+      "Action=Edit\n",
+      "edit: 7a875918-1bca-4cbe-9098-0cfdcf5c6a06\n",
+      "Cleaning id=4cf9f4c3-ab6d-4610-9fa4-6e2f6d7187c6\n",
+      "Action=Edit\n",
+      "edit: 4cf9f4c3-ab6d-4610-9fa4-6e2f6d7187c6\n",
+      "Cleaning id=5d41aea3-e16f-4453-8623-4d1e1fa46189\n",
+      "Action=Edit\n",
+      "edit: 5d41aea3-e16f-4453-8623-4d1e1fa46189\n",
+      "Cleaning id=8ab59449-3c58-4282-b491-9256056a0b06\n",
+      "Action=Edit\n",
+      "edit: 8ab59449-3c58-4282-b491-9256056a0b06\n",
+      "Cleaning id=287eba5e-29c9-4e67-b2ef-ebb3083b3003\n",
+      "Action=Edit\n",
+      "edit: 287eba5e-29c9-4e67-b2ef-ebb3083b3003\n",
+      "Cleaning id=ee378b30-e1db-4356-a3f1-57e6356fced4\n",
+      "Action=Edit\n",
+      "edit: ee378b30-e1db-4356-a3f1-57e6356fced4\n",
+      "Cleaning id=ac7a787a-b73c-46b4-87cd-5b5674b72898\n",
+      "Action=Edit\n",
+      "edit: ac7a787a-b73c-46b4-87cd-5b5674b72898\n",
+      "Cleaning id=aa96b30b-f1fe-4887-812a-b207240838be\n",
+      "Action=Edit\n",
+      "edit: aa96b30b-f1fe-4887-812a-b207240838be\n",
+      "Cleaning id=4b1f967e-15a5-4286-838f-f74a7542e365\n",
+      "Action=Edit\n",
+      "edit: 4b1f967e-15a5-4286-838f-f74a7542e365\n",
+      "Cleaning id=bc1503ac-bf8d-417a-9200-92fb1e18089c\n",
+      "Action=Edit\n",
+      "edit: bc1503ac-bf8d-417a-9200-92fb1e18089c\n",
+      "Cleaning id=5ca77f89-9460-4939-a971-940959fe8dff\n",
+      "Action=Edit\n",
+      "edit: 5ca77f89-9460-4939-a971-940959fe8dff\n",
+      "Cleaning id=a55cf83d-3980-4529-af25-7c16d81825f7\n",
+      "Action=Edit\n",
+      "edit: a55cf83d-3980-4529-af25-7c16d81825f7\n",
+      "Cleaning id=c752db2d-6bf9-4ac7-b46b-defed71c0252\n",
+      "Action=Edit\n",
+      "edit: c752db2d-6bf9-4ac7-b46b-defed71c0252\n",
+      "Cleaning id=adf3edbf-d1c4-40db-9ec6-be096f7d7353\n",
+      "Action=Edit\n",
+      "edit: adf3edbf-d1c4-40db-9ec6-be096f7d7353\n",
+      "Cleaning id=192a2529-525e-47f2-841b-b82c4b2feacf\n",
+      "Action=Edit\n",
+      "edit: 192a2529-525e-47f2-841b-b82c4b2feacf\n",
+      "Cleaning id=f59c1667-1fd2-4120-a924-e02a1c69ac73\n",
+      "Action=Edit\n",
+      "edit: f59c1667-1fd2-4120-a924-e02a1c69ac73\n",
+      "Cleaning id=754296b6-dc97-4340-8e21-e42f22ec538b\n",
+      "Action=Edit\n",
+      "edit: 754296b6-dc97-4340-8e21-e42f22ec538b\n",
+      "substring not found\n",
+      "Cleaning id=0d4e9ee5-54f0-4cd0-b026-5952700a5bb4\n",
+      "Action=Edit\n",
+      "edit: 0d4e9ee5-54f0-4cd0-b026-5952700a5bb4\n",
+      "Cleaning id=4a7094e2-25f4-4589-9547-c52827002cf3\n",
+      "Action=Edit\n",
+      "edit: 4a7094e2-25f4-4589-9547-c52827002cf3\n",
+      "Cleaning id=59973454-8b7f-44dc-b0e9-f6b9d2331e13\n",
+      "Action=Edit\n",
+      "edit: 59973454-8b7f-44dc-b0e9-f6b9d2331e13\n",
+      "Cleaning id=973ad122-16af-4e3f-92eb-1228a4ab04ec\n",
+      "Action=Edit\n",
+      "edit: 973ad122-16af-4e3f-92eb-1228a4ab04ec\n",
+      "Cleaning id=cfccf4f0-346f-4690-94b0-2676d697aeab\n",
+      "Action=Edit\n",
+      "edit: cfccf4f0-346f-4690-94b0-2676d697aeab\n",
+      "Cleaning id=f567161f-09c6-42de-b2b1-5e577eda5b46\n",
+      "Action=Edit\n",
+      "edit: f567161f-09c6-42de-b2b1-5e577eda5b46\n",
+      "Cleaning id=1ae6629a-c6cf-4df2-9bde-d5d437a7c412\n",
+      "Action=Edit\n",
+      "edit: 1ae6629a-c6cf-4df2-9bde-d5d437a7c412\n",
+      "Cleaning id=a96bdb41-f57a-46bf-9806-fc692d4a485a\n",
+      "Action=Edit\n",
+      "edit: a96bdb41-f57a-46bf-9806-fc692d4a485a\n",
+      "Cleaning id=c4d2323f-4cff-4816-8d6a-6e0fb39b4685\n",
+      "Action=Edit\n",
+      "edit: c4d2323f-4cff-4816-8d6a-6e0fb39b4685\n",
+      "Cleaning id=95d50a89-368d-46e9-872d-685b31a0ec6a\n",
+      "Action=Edit\n",
+      "edit: 95d50a89-368d-46e9-872d-685b31a0ec6a\n",
+      "Cleaning id=d469a34a-f234-405c-bb3a-d5d0328c2c3d\n",
+      "Action=Edit\n",
+      "edit: d469a34a-f234-405c-bb3a-d5d0328c2c3d\n",
+      "substring not found\n",
+      "Cleaning id=d3ae4235-90a4-4fad-9106-3075ee9c43ba\n",
+      "Action=Edit\n",
+      "edit: d3ae4235-90a4-4fad-9106-3075ee9c43ba\n",
+      "Cleaning id=5b56e959-7ce3-4d52-82dc-c0dcbb439a91\n",
+      "Action=Edit\n",
+      "edit: 5b56e959-7ce3-4d52-82dc-c0dcbb439a91\n",
+      "Cleaning id=5a25c3dc-b6f5-44ec-9a8a-9d0b51f4fcf1 \n",
+      "Not found: 5a25c3dc-b6f5-44ec-9a8a-9d0b51f4fcf1 \n",
+      "Skipping instructions for : 5a25c3dc-b6f5-44ec-9a8a-9d0b51f4fcf1 \n",
+      "Cleaning id=c91afb6c-1585-40a0-a529-8b9e0e0220d4 \n",
+      "Not found: c91afb6c-1585-40a0-a529-8b9e0e0220d4 \n",
+      "Skipping instructions for : c91afb6c-1585-40a0-a529-8b9e0e0220d4 \n",
+      "Cleaning id=36c6d5e6-d19e-435a-9a10-5b536ea52666\n",
+      "Action=Delete\n",
+      "deleting: 36c6d5e6-d19e-435a-9a10-5b536ea52666\n",
+      "Branch deleted: 36c6d5e6-d19e-435a-9a10-5b536ea52666 (1 messages)\n",
+      "Cleaning id=de142443-6c47-47f7-b849-9c1f3abacfeb \n",
+      "Not found: de142443-6c47-47f7-b849-9c1f3abacfeb \n",
+      "Skipping instructions for : de142443-6c47-47f7-b849-9c1f3abacfeb \n",
+      "Cleaning id=a7d5b481-d1dc-405d-83be-94189afd7050 \n",
+      "Not found: a7d5b481-d1dc-405d-83be-94189afd7050 \n",
+      "Skipping instructions for : a7d5b481-d1dc-405d-83be-94189afd7050 \n",
+      "Cleaning id=381503b9-7867-4c48-ad7a-287e889bc12a \n",
+      "Not found: 381503b9-7867-4c48-ad7a-287e889bc12a \n",
+      "Skipping instructions for : 381503b9-7867-4c48-ad7a-287e889bc12a \n",
+      "Cleaning id=8b814e22-5ea6-4092-8908-a76ca50e988c \n",
+      "Not found: 8b814e22-5ea6-4092-8908-a76ca50e988c \n",
+      "Skipping instructions for : 8b814e22-5ea6-4092-8908-a76ca50e988c \n",
+      "Cleaning id=e953b4f4-8476-48f2-8b1c-2dbf7a65a5da \n",
+      "Not found: e953b4f4-8476-48f2-8b1c-2dbf7a65a5da \n",
+      "Skipping instructions for : e953b4f4-8476-48f2-8b1c-2dbf7a65a5da \n",
+      "Cleaning id=ab8e193f-fd9f-45e7-90c5-f5d38cf84aab \n",
+      "Not found: ab8e193f-fd9f-45e7-90c5-f5d38cf84aab \n",
+      "Skipping instructions for : ab8e193f-fd9f-45e7-90c5-f5d38cf84aab \n",
+      "Cleaning id=c91afb6c-1585-40a0-a529-8b9e0e0220d4 \n",
+      "Not found: c91afb6c-1585-40a0-a529-8b9e0e0220d4 \n",
+      "Skipping instructions for : c91afb6c-1585-40a0-a529-8b9e0e0220d4 \n",
+      "Cleaning id=6f3e265c-b45e-44d7-9278-4d73e42811d4 \n",
+      "Not found: 6f3e265c-b45e-44d7-9278-4d73e42811d4 \n",
+      "Skipping instructions for : 6f3e265c-b45e-44d7-9278-4d73e42811d4 \n",
+      "Cleaning id=4dbe3bcf-489c-4a25-84f9-300b68f951c6 \n",
+      "Not found: 4dbe3bcf-489c-4a25-84f9-300b68f951c6 \n",
+      "Skipping instructions for : 4dbe3bcf-489c-4a25-84f9-300b68f951c6 \n",
+      "Cleaning id=a328a64e-20a7-46ce-b56a-622d694341d4\n",
+      "Action=Delete\n",
+      "deleting: a328a64e-20a7-46ce-b56a-622d694341d4\n",
+      "Branch deleted: a328a64e-20a7-46ce-b56a-622d694341d4 (1 messages)\n",
+      "Cleaning id=ca789bea-f12f-4814-a59b-b70455eb5e7c\n",
+      "Action=Delete\n",
+      "deleting: ca789bea-f12f-4814-a59b-b70455eb5e7c\n",
+      "Branch deleted: ca789bea-f12f-4814-a59b-b70455eb5e7c (1 messages)\n",
+      "Cleaning id=7531f5c3-0df0-4f9b-a11e-fe52ed3f809e\n",
+      "Action=Delete\n",
+      "deleting: 7531f5c3-0df0-4f9b-a11e-fe52ed3f809e\n",
+      "Branch deleted: 7531f5c3-0df0-4f9b-a11e-fe52ed3f809e (1 messages)\n",
+      "Cleaning id=36c6d5e6-d19e-435a-9a10-5b536ea52666\n",
+      "Action=Delete\n",
+      "deleting: 36c6d5e6-d19e-435a-9a10-5b536ea52666\n",
+      "Message not found: 36c6d5e6-d19e-435a-9a10-5b536ea52666\n",
+      "Cleaning id=970d2aa9-0089-428f-96ef-a94345231a58\n",
+      "Action=Delete\n",
+      "deleting: 970d2aa9-0089-428f-96ef-a94345231a58\n",
+      "Branch deleted: 970d2aa9-0089-428f-96ef-a94345231a58 (1 messages)\n",
+      "Cleaning id=e71da408-c9a8-4d28-9741-500ec1b02f0f\n",
+      "Action=Delete\n",
+      "deleting: e71da408-c9a8-4d28-9741-500ec1b02f0f\n",
+      "Branch deleted: e71da408-c9a8-4d28-9741-500ec1b02f0f (2 messages)\n",
+      "Cleaning id=b09ef779-8e2a-4ccb-aa1e-c0f108c1ea90\n",
+      "Action=Delete\n",
+      "deleting: b09ef779-8e2a-4ccb-aa1e-c0f108c1ea90\n",
+      "Branch deleted: b09ef779-8e2a-4ccb-aa1e-c0f108c1ea90 (2 messages)\n",
+      "Cleaning id=de142443-6c47-47f7-b849-9c1f3abacfeb\n",
+      "Action=Delete\n",
+      "deleting: de142443-6c47-47f7-b849-9c1f3abacfeb\n",
+      "Branch deleted: de142443-6c47-47f7-b849-9c1f3abacfeb (1 messages)\n",
+      "Cleaning id=fad47867-8c5d-4654-a8a4-97fda14fac1d\n",
+      "Action=Delete\n",
+      "deleting: fad47867-8c5d-4654-a8a4-97fda14fac1d\n",
+      "Branch deleted: fad47867-8c5d-4654-a8a4-97fda14fac1d (1 messages)\n",
+      "Cleaning id=8b814e22-5ea6-4092-8908-a76ca50e988c\n",
+      "Action=Delete\n",
+      "deleting: 8b814e22-5ea6-4092-8908-a76ca50e988c\n",
+      "Branch deleted: 8b814e22-5ea6-4092-8908-a76ca50e988c (1 messages)\n",
+      "Cleaning id=c91afb6c-1585-40a0-a529-8b9e0e0220d4\n",
+      "Action=Delete\n",
+      "deleting: c91afb6c-1585-40a0-a529-8b9e0e0220d4\n",
+      "Branch deleted: c91afb6c-1585-40a0-a529-8b9e0e0220d4 (1 messages)\n",
+      "Cleaning id=3f2913ba-49a6-4641-9267-2e7fc7f7fbd4\n",
+      "Action=Delete\n",
+      "deleting: 3f2913ba-49a6-4641-9267-2e7fc7f7fbd4\n",
+      "Branch deleted: 3f2913ba-49a6-4641-9267-2e7fc7f7fbd4 (1 messages)\n",
+      "Done\n",
+      "Writing: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.jsonl\n"
+     ]
+    }
+   ],
+   "source": [
+    "# use instructions file to clean the raw dataset\n",
+    "!python ../examples/clean_dataset.py \\\n",
+    "    \"{raw_input_data_path}\" \\\n",
+    "    \"{data_out_dir}/{trees_filename}\" \\\n",
+    "    --instructions \"{instructions_path}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Processing file: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.jsonl\n"
+     ]
+    }
+   ],
+   "source": [
+    "# run keyword flagging\n",
+    "!python ./keyword_flagging.py \\\n",
+    "    \"{data_out_dir}/{trees_filename}\" \\\n",
+    "    \"{data_out_dir}\"\n",
+    "    \n",
+    "# outputs have been manually reviewed and appended to instructions file and notebook has been rerun"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reading: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.jsonl\n",
+      "Found 13854 matching trees.\n",
+      "Writing: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.ready_for_export.jsonl\n"
+     ]
+    }
+   ],
+   "source": [
+    "# filter trees to make a version with status ready for export\n",
+    "!python ../examples/filter_trees.py \\\n",
+    "    \"{data_out_dir}/{trees_filename}\" \\\n",
+    "    \"{data_out_dir}/{trees_ready_filename}\" \\\n",
+    "    --states \"ready_for_export\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "reading: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.jsonl\n",
+      "70642 trees with 208584 total messages read.\n",
+      "writing: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.jsonl\n",
+      "208584 messages written.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# convert cleaned dataset from tree to messages\n",
+    "!python ../examples/tree_to_messages.py \\\n",
+    "    \"{data_out_dir}/{trees_filename}\" \\\n",
+    "    \"{data_out_dir}/{messages_filename}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "reading: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.ready_for_export.jsonl\n",
+      "13854 trees with 135174 total messages read.\n",
+      "writing: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.ready_for_export.jsonl\n",
+      "135174 messages written.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# convert cleaned state=ready_for_export dataset from tree to messages\n",
+    "!python ../examples/tree_to_messages.py \\\n",
+    "    \"{data_out_dir}/{trees_ready_filename}\" \\\n",
+    "    \"{data_out_dir}/{messages_ready_filename}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reading: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.jsonl\n",
+      "Found 208584 matching messages.\n",
+      "Writing train 198293 messages: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.train.jsonl\n",
+      "Writing valid 10291 messages: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.validation.jsonl\n"
+     ]
+    }
+   ],
+   "source": [
+    "# split messages into train and validation\n",
+    "!python ../examples/split_dataset.py \\\n",
+    "    \"{data_out_dir}/{messages_filename}\" \\\n",
+    "    --train_output \"{data_out_dir}/{messages_train_filename}\" \\\n",
+    "    --val_output \"{data_out_dir}/{messages_validation_filename}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reading: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.ready_for_export.jsonl\n",
+      "Found 135174 matching messages.\n",
+      "Writing train 128412 messages: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.ready_for_export.train.jsonl\n",
+      "Writing valid 6762 messages: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.ready_for_export.validation.jsonl\n"
+     ]
+    }
+   ],
+   "source": [
+    "# split ready messages into train and validation\n",
+    "!python ../examples/split_dataset.py \\\n",
+    "    \"{data_out_dir}/{messages_ready_filename}\" \\\n",
+    "    --train_output \"{data_out_dir}/{messages_ready_train_filename}\" \\\n",
+    "    --val_output \"{data_out_dir}/{messages_ready_validation_filename}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# make .gz files, keeping the original files\n",
+    "!gzip -c \"{data_out_dir}/{trees_filename}\" > \"{data_out_dir}/{trees_filename}.gz\"\n",
+    "!gzip -c \"{data_out_dir}/{trees_ready_filename}\" > \"{data_out_dir}/{trees_ready_filename}.gz\"\n",
+    "!gzip -c \"{data_out_dir}/{messages_filename}\" > \"{data_out_dir}/{messages_filename}.gz\"\n",
+    "!gzip -c \"{data_out_dir}/{messages_ready_filename}\" > \"{data_out_dir}/{messages_ready_filename}.gz\"\n",
+    "!gzip -c \"{data_out_dir}/{messages_train_filename}\" > \"{data_out_dir}/{messages_train_filename}.gz\"\n",
+    "!gzip -c \"{data_out_dir}/{messages_ready_train_filename}\" > \"{data_out_dir}/{messages_ready_train_filename}.gz\"\n",
+    "!gzip -c \"{data_out_dir}/{messages_validation_filename}\" > \"{data_out_dir}/{messages_validation_filename}.gz\"\n",
+    "!gzip -c \"{data_out_dir}/{messages_ready_validation_filename}\" > \"{data_out_dir}/{messages_ready_validation_filename}.gz\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO: add detoxify scores\n",
+    "# TODO: generate huggingface parquet format files\n",
+    "# TODO: create readme for oasst2 dataset"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/oasst-data/oasst2/keyword_flagging.py b/oasst-data/oasst2/keyword_flagging.py
new file mode 100644
index 0000000000..cae7355efc
--- /dev/null
+++ b/oasst-data/oasst2/keyword_flagging.py
@@ -0,0 +1,121 @@
+import argparse
+import csv
+import glob
+import json
+import re
+
+# Lower-case words and phrases looked up by contains_banned_word(); a message
+# containing any of them is written to the "hate_speech_ban_words" CSV.
+banned_words = {
+    "卐",
+    "mein führer",
+    "sieg heil",
+    # The next two entries must stay separate string literals with a comma in
+    # between: without it Python concatenates adjacent literals into the single
+    # entry "heil hitlerchild porn", and neither phrase is ever matched.
+    "heil hitler",
+    "child porn",
+    "childporn",
+    "loli",
+    "hentai",
+    "pedophile",
+    "nigger",
+    "nigga",
+    "faggot",
+    "tranny",
+    "faggy",
+    "пидор",
+    "хуесос",
+    "хуйло",
+    "хохол",
+    "хохлы",
+    "русня",
+}
+
+
+def parse_args():
+    """Parse the command line and return the resulting argparse namespace.
+
+    Two positional string arguments are defined:
+
+    * ``input_file_name`` -- path of the input file (the script entry point
+      expands it with ``glob.glob``).
+    * ``output_dir`` -- directory the CSV reports are written to.
+    """
+    cli = argparse.ArgumentParser(description="filter_dataset")
+    # (argument name, help text) for each positional, in command-line order.
+    positionals = (
+        ("input_file_name", "path to input .jsonl or .jsonl.gz input file"),
+        ("output_dir", "dir to output"),
+    )
+    for arg_name, arg_help in positionals:
+        cli.add_argument(arg_name, type=str, help=arg_help)
+    return cli.parse_args()
+
+
+# Compiled lazily by contains_banned_word() and then reused for every message.
+_banned_words_regex = None
+
+
+def contains_banned_word(text):
+    """Return True if ``text`` contains any entry of ``banned_words``.
+
+    Entries are matched literally (regex-escaped), case-insensitively, and only
+    between regex word boundaries, so a banned word embedded in a longer word
+    is not reported.
+
+    The pattern is built from ``banned_words`` on the first call and cached;
+    changes made to ``banned_words`` after that call are not picked up.
+    """
+    global _banned_words_regex
+    if _banned_words_regex is None:
+        # Sorting only makes the generated pattern reproducible between runs
+        # (set iteration order is not); it does not change what matches.
+        alternatives = "|".join(re.escape(word) for word in sorted(banned_words))
+        _banned_words_regex = re.compile(r"\b(?:" + alternatives + r")\b", re.IGNORECASE)
+    return bool(_banned_words_regex.search(text))
+
+
+def process_message(msg, writers):
+    """Write a CSV row for every flag ``msg`` raises, then recurse into its replies.
+
+    A row is ``[message_id, text]``. It is written to:
+
+    * ``writers["hate_speech_ban_words"]`` when the text contains a banned word;
+    * ``writers[label]`` for each of the labels hate_speech, toxicity, pii,
+      not_appropriate and violence whose ``"value"`` is above 0.85;
+    * ``writers["junk_by_len"]`` when the text is shorter than 10 characters.
+
+    Args:
+        msg: message dict. Reads "message_id" and the optional keys "text",
+            "labels" and "replies" (a list of message dicts of the same form).
+        writers: dict mapping the names above to objects with a ``writerow``
+            method (``csv.writer`` instances in this script).
+    """
+    # A JSON null arrives here as None. Treat it like a missing field rather
+    # than crashing in the regex search, len() or the iterations below.
+    text = msg.get("text") or ""
+    labels = msg.get("labels") or {}
+    replies = msg.get("replies") or []
+
+    def flag(category):
+        # "message_id" is only looked up when a row is actually written.
+        writers[category].writerow([msg["message_id"], text])
+
+    if contains_banned_word(text):
+        flag("hate_speech_ban_words")
+    for label in ("hate_speech", "toxicity", "pii", "not_appropriate", "violence"):
+        if label in labels and labels[label]["value"] > 0.85:
+            flag(label)
+    if len(text) < 10:
+        flag("junk_by_len")
+    for reply in replies:
+        process_message(reply, writers)
+
+
+def process_jsonl_file(file, writers):
+    """Run process_message() on the "prompt" object of every line of a JSONL file.
+
+    Args:
+        file: path of a UTF-8 ``.jsonl`` file, or a gzip-compressed one whose
+            name ends in ``.gz``. Each non-blank line must be one JSON value.
+        writers: passed through unchanged to process_message().
+
+    Lines whose JSON value has no "prompt" key are ignored.
+
+    Raises:
+        json.JSONDecodeError: if a non-blank line is not valid JSON.
+    """
+    import gzip  # local import: nothing else in this file needs it
+
+    print(f"Processing file: {file}")
+    # The command-line help advertises .jsonl.gz input, so decompress it
+    # transparently instead of failing while decoding the compressed bytes.
+    opener = gzip.open if str(file).endswith(".gz") else open
+    with opener(file, "rt", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                # Tolerate blank lines (e.g. a trailing empty line).
+                continue
+            data = json.loads(line)
+            if "prompt" in data:
+                process_message(data["prompt"], writers)
+
+
+if __name__ == "__main__":
+    # Local import: ExitStack keeps all output files open together and closes
+    # them when the block ends, even if processing raises.
+    from contextlib import ExitStack
+
+    args = parse_args()
+    # input_file_name is expanded as a glob pattern, so it may match several files.
+    files = glob.glob(args.input_file_name)
+    if not files:
+        print("No files found")
+    else:
+        # Writer key used by process_message() -> CSV file name inside output_dir.
+        output_names = {
+            "hate_speech": "hate_speech_labelled.csv",
+            "hate_speech_ban_words": "hate_speech_ban_words.csv",
+            "junk_by_len": "junk_len.csv",
+            "toxicity": "toxicity_labelled.csv",
+            "pii": "pii_labelled.csv",
+            "not_appropriate": "not_appropriate_labelled.csv",
+            "violence": "violence_labelled.csv",
+        }
+        # Open every CSV exactly once, before looping over the inputs. Opening
+        # them in "w" mode inside the loop would truncate them for each input
+        # file, leaving only the rows of the last matched file.
+        with ExitStack() as stack:
+            writers = {
+                key: csv.writer(
+                    stack.enter_context(
+                        open(
+                            f"{args.output_dir}/{name}",
+                            "w",
+                            newline="",
+                            encoding="utf-8",
+                        )
+                    )
+                )
+                for key, name in output_names.items()
+            }
+            for file in files:
+                process_jsonl_file(file, writers)