Skip to content

Commit

Permalink
Improvements to the dataset split script
Browse files (browse the repository at this point in the history)
  • Loading branch information
geoffxy committed Nov 23, 2023
1 parent e8e7d45 commit e7e7b1a
Showing 1 changed file with 11 additions and 4 deletions.
15 changes: 11 additions & 4 deletions tools/query_dataset/split_parsed.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -17,11 +17,14 @@ def main():
with open(args.queries_file, "r", encoding="UTF-8") as file:
queries = {line.strip() for line in file}

matched_pp, matched_pq, matched_sql = [], [], []
unmatched_pp, unmatched_pq, unmatched_sql = [], [], []
matched_pp, matched_pq, matched_sql, matched_da = [], [], [], []
unmatched_pp, unmatched_pq, unmatched_sql, unmatched_da = [], [], [], []

for pp, pq, sql in zip(
source["parsed_plans"], source["parsed_queries"], source["sql_queries"]
for pp, pq, sql, da in zip(
source["parsed_plans"],
source["parsed_queries"],
source["sql_queries"],
source["bytes_scanned"],
):
if not sql.endswith(";"):
match_sql = sql + ";"
Expand All @@ -32,10 +35,12 @@ def main():
matched_pp.append(pp)
matched_pq.append(pq)
matched_sql.append(sql)
matched_da.append(da)
else:
unmatched_pp.append(pp)
unmatched_pq.append(pq)
unmatched_sql.append(sql)
unmatched_da.append(da)

print("Matching:", len(matched_pp))
print("Unmatched:", len(unmatched_pp))
Expand All @@ -45,6 +50,7 @@ def main():
"parsed_plans": matched_pp,
"parsed_queries": matched_pq,
"sql_queries": matched_sql,
"bytes_scanned": matched_da,
}

with open(args.out_file_1, "w", encoding="UTF-8") as file:
Expand All @@ -61,6 +67,7 @@ def main():
"parsed_plans": unmatched_pp,
"parsed_queries": unmatched_pq,
"sql_queries": unmatched_sql,
"bytes_scanned": unmatched_da,
}

with open(args.out_file_2, "w", encoding="UTF-8") as file:
Expand Down

0 comments on commit e7e7b1a

Please sign in to comment.