Skip to content

Commit

Permalink
Adding chbench
Browse files Browse the repository at this point in the history
  • Loading branch information
amlatyrngom committed May 13, 2024
1 parent 6891b5f commit 2948b5e
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 14 deletions.
8 changes: 4 additions & 4 deletions load_baseline.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,9 @@ def main():
with open("good_adhoc_queries.sql", "w", encoding="utf-8") as f:
f.write("\n".join(res))

if __name__ == "__main__":
main()
sys.exit(0)
# if __name__ == "__main__":
# main()
# sys.exit(0)

import yaml

Expand Down Expand Up @@ -164,7 +164,7 @@ def index_definition(table_name, index_columns):


def yaml_main():
with open("config/schemas/chbenchmark.yml", "r", encoding="utf-8") as f:
with open("config/schemas/imdb_extended.yml", "r", encoding="utf-8") as f:
tables = yaml.safe_load(f)
print(f"Tables: {tables}")

Expand Down
29 changes: 19 additions & 10 deletions workloads/IMDB_extended/workload_utils/baseline.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,15 +151,23 @@ def __init__(self, config_file="config/baseline.yml"):
cur.execute("SET GLOBAL local_infile = 1;")
self.conn.commit()

def manually_copy_s3_data(self, table_names, source_dir="chbenchmark", source_bucket="geoffxy-research", source_ext="tbl"):
    """Copy per-table data files from a source S3 bucket into this loader's bucket.

    For each table ``t`` the object ``{source_dir}/{t}/{t}.{source_ext}`` is
    copied to ``{source_dir}/test.{t}.csv`` in ``self.s3_bucket``. If that key
    is missing ("Not Found"), the alternate layout
    ``{source_dir}/{t}/test.{t}.{source_ext}`` is tried instead.

    Args:
        table_names: Iterable of table names to copy.
        source_dir: Key prefix inside the source bucket.
        source_bucket: Name of the bucket holding the source files.
        source_ext: File extension of the source objects (e.g. "tbl", "csv").
    """
    s3 = boto3.resource("s3")
    for t in table_names:
        source_key = f"{source_dir}/{t}/{t}.{source_ext}"
        target_key = f"{source_dir}/test.{t}.csv"
        print(f"Copying {t}")
        start_t = time.perf_counter()
        try:
            s3.meta.client.copy(
                {"Bucket": source_bucket, "Key": source_key},
                self.s3_bucket,
                target_key,
            )
        except Exception as err:
            # Only fall back to the alternate key layout when the object is
            # genuinely missing. Anything else (permissions, throttling, ...)
            # is re-raised instead of being silently swallowed — the original
            # code dropped such errors and still printed a success line.
            if "Not Found" not in str(err):
                raise
            fallback_key = f"{source_dir}/{t}/test.{t}.{source_ext}"
            s3.meta.client.copy(
                {"Bucket": source_bucket, "Key": fallback_key},
                self.s3_bucket,
                target_key,
            )
        print(f"Copied {t} in {time.perf_counter() - start_t:.2f} secs")

def fetch_metrics(self, start_time=None, end_time=None):
Expand All @@ -169,9 +177,9 @@ def load_database(self, schema_file, table_names):
with open(schema_file, "r", encoding="utf-8") as f:
schema = f.read()
self.submit_query(schema, until_success=True)
for t in table_names:
replica_cmd = f"ALTER TABLE {t} SET TIFLASH REPLICA 1"
self.submit_query(replica_cmd, until_success=True)
# for t in table_names:
# replica_cmd = f"ALTER TABLE {t} SET TIFLASH REPLICA 1"
# self.submit_query(replica_cmd, until_success=True)

# print("Creating Indexes")
# indexes_sql = load_schema_sql(dataset, "indexes.sql")
Expand Down Expand Up @@ -564,9 +572,10 @@ def run_query_with_results(self, sql: str):

if __name__ == "__main__":
baseline = TiDBLoader()
chtables = ['warehouse', 'item', 'stock', 'district', 'customer', 'history', 'orders', 'new_order', 'order_line', 'region', 'nation', 'supplier']
# baseline.load_database("tables.sql", chtables)
baseline.manually_copy_s3_data(chtables, source_dir="chbenchmark")
imtables = ['homes', 'theatres', 'showings', 'ticket_orders', 'aka_name', 'aka_title', 'cast_info', 'char_name', 'comp_cast_type', 'company_name', 'company_type', 'complete_cast', 'info_type', 'keyword', 'kind_type', 'link_type', 'movie_companies', 'movie_info_idx', 'movie_keyword', 'movie_link', 'name', 'role_type', 'title', 'movie_info', 'person_info']
# chtables = ['warehouse', 'item', 'stock', 'district', 'customer', 'history', 'orders', 'new_order', 'order_line', 'region', 'nation', 'supplier']
baseline.load_database("tables.sql", imtables)
# baseline.manually_copy_s3_data(imtables, source_bucket="brad-personal-data", source_dir="imdb_extended", source_ext="csv")
# baseline.manual_unload("imdb_extended", do_unload=False, start_chunk=-1, end_chunk=-1)
# baseline.manual_count_all("imdb_extended")
# import sys
Expand Down
3 changes: 3 additions & 0 deletions workloads/chbenchmark/py-tpcc/pytpcc/tpcc.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
from .runtime import *
from .drivers.auroradriver import AuroraDriver
from .drivers.braddriver import BradDriver
from .drivers.tidbdriver import TiDBDriver

logging.basicConfig(
level=logging.INFO,
Expand All @@ -61,6 +62,8 @@ def createDriverClass(name):
return BradDriver
elif name == "aurora":
return AuroraDriver
elif name == "tidb":
return TiDBDriver
else:
raise NotImplementedError

Expand Down

0 comments on commit 2948b5e

Please sign in to comment.