diff --git a/tools/one_off/extract_telemetry_queries.py b/tools/one_off/extract_telemetry_queries.py
new file mode 100644
index 00000000..de4c1301
--- /dev/null
+++ b/tools/one_off/extract_telemetry_queries.py
@@ -0,0 +1,32 @@
+import json
+from typing import Dict, List
+
+
+def process_file(filename: str) -> Dict[str, List[str]]:
+    """Map each top-level key of a workload JSON file to its SQL strings.
+
+    The file must hold an object whose values each carry an "athena_result"
+    list of records with a "sql" field.
+    """
+    with open(filename, "r", encoding="UTF-8") as file:
+        raw = json.load(file)
+
+    return {
+        epoch_key: [q["sql"] for q in inner["athena_result"]]
+        for epoch_key, inner in raw.items()
+    }
+
+
+def main():
+    """Merge the queries of both workload files into telemetry_queries.json."""
+    data1 = process_file("telemetry_workload.json")
+    data2 = process_file("telemetry_workload_100g.json")
+
+    # On a key collision the entry from the second (100g) file wins.
+    combined = {**data1, **data2}
+    with open("telemetry_queries.json", "w", encoding="UTF-8") as file:
+        json.dump(combined, file, indent=2)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/one_off/gather_athena_telemetry.sh b/tools/one_off/gather_athena_telemetry.sh
new file mode 100644
index 00000000..af0e7d5d
--- /dev/null
+++ b/tools/one_off/gather_athena_telemetry.sh
@@ -0,0 +1,52 @@
+#! /bin/bash
+
+# Runs each epoch's telemetry queries on Athena via run_cost_model.py, then
+# expands the telemetry table with load_telemetry.py before the next epoch.
+#
+# NOTE: the script changes into its own directory first, so relative path
+# arguments are resolved against that directory, not the caller's.
+
+SCRIPT_PATH=$(cd "$(dirname "$0")" && pwd -P)
+cd "$SCRIPT_PATH"
+
+if [ -z "$4" ]; then
+  echo "Usage: $0 queries_json out_dir config_file schema_name"
+  exit 1
+fi
+
+queries_json=$1
+out_dir=$2
+config_file=$3
+schema_name=$4
+
+mkdir -p "$out_dir/sql"
+mkdir -p "$out_dir/raw"
+
+# The list must stay unquoted so the loop runs once per epoch.
+for epoch in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22; do
+  echo "Processing $epoch"
+  jq -r '.["epoch_'"$epoch"'"] | .[]' "$queries_json" > "$out_dir/sql/epoch_${epoch}.sql"
+
+  echo "Gathering data..."
+  python ../../run_cost_model.py \
+    --run_workload \
+    --run_workload_rank 0 \
+    --run_workload_world_size 1 \
+    --database athena \
+    --db_name imdb_specialized_100g \
+    --query_timeout 300 \
+    --s3_output_path "s3://geoffxy-research/athena/out" \
+    --source "$out_dir/sql/epoch_${epoch}.sql" \
+    --target "$out_dir/raw/athena_epoch_${epoch}.json"
+
+  # Expand the table.
+  echo "Expanding the table for the next epoch..."
+  python3 ../load_telemetry.py \
+    --config-file "$config_file" \
+    --engines athena \
+    --data-s3-bucket geoffxy-research \
+    --data-s3-path imdb_specialized_100g/telemetry/telemetry.csv \
+    --times 1 \
+    --schema-name "$schema_name"
+done
+