Improvements to the dataset split script

mitdbg · Nov 23, 2023 · e7e7b1a
1 parent e8e7d45
commit e7e7b1a
Showing 1 changed file with 11 additions and 4 deletions.
diff --git a/tools/query_dataset/split_parsed.py b/tools/query_dataset/split_parsed.py
@@ -17,11 +17,14 @@ def main():
     with open(args.queries_file, "r", encoding="UTF-8") as file:
         queries = {line.strip() for line in file}
 
-    matched_pp, matched_pq, matched_sql = [], [], []
-    unmatched_pp, unmatched_pq, unmatched_sql = [], [], []
+    matched_pp, matched_pq, matched_sql, matched_da = [], [], [], []
+    unmatched_pp, unmatched_pq, unmatched_sql, unmatched_da = [], [], [], []
 
-    for pp, pq, sql in zip(
-        source["parsed_plans"], source["parsed_queries"], source["sql_queries"]
+    for pp, pq, sql, da in zip(
+        source["parsed_plans"],
+        source["parsed_queries"],
+        source["sql_queries"],
+        source["bytes_scanned"],
     ):
         if not sql.endswith(";"):
             match_sql = sql + ";"
@@ -32,10 +35,12 @@ def main():
             matched_pp.append(pp)
             matched_pq.append(pq)
             matched_sql.append(sql)
+            matched_da.append(da)
         else:
             unmatched_pp.append(pp)
             unmatched_pq.append(pq)
             unmatched_sql.append(sql)
+            unmatched_da.append(da)
 
     print("Matching:", len(matched_pp))
     print("Unmatched:", len(unmatched_pp))
@@ -45,6 +50,7 @@ def main():
         "parsed_plans": matched_pp,
         "parsed_queries": matched_pq,
         "sql_queries": matched_sql,
+        "bytes_scanned": matched_da,
     }
 
     with open(args.out_file_1, "w", encoding="UTF-8") as file:
@@ -61,6 +67,7 @@ def main():
         "parsed_plans": unmatched_pp,
         "parsed_queries": unmatched_pq,
         "sql_queries": unmatched_sql,
+        "bytes_scanned": unmatched_da,
     }
 
     with open(args.out_file_2, "w", encoding="UTF-8") as file: