Commit b5fad14 (parent 42681a1)

support running multiple queries in the TPC-H benchmark

5 files changed: 81 additions & 46 deletions

datafusion_ray/util.py (1 addition & 1 deletion)

@@ -1,4 +1,4 @@
 from datafusion_ray._datafusion_ray_internal import (
-    exec_sql_on_tables,
+    exec_sqls_on_tables,
     prettify,
 )

docs/contributing.md (2 additions & 2 deletions)

@@ -80,15 +80,15 @@ RAY_COLOR_PREFIX=1 RAY_DEDUP_LOGS=0 python tips.py --data-dir=$(pwd)/../testdata
 - In the `tpch` directory, use `make_data.py` to create a TPCH dataset at a provided scale factor, then
 
 ```bash
-RAY_COLOR_PREFIX=1 RAY_DEDUP_LOGS=0 python tpc.py --data=file:///path/to/your/tpch/directory/ --concurrency=2 --batch-size=8182 --worker-pool-min=10 --qnum 2
+RAY_COLOR_PREFIX=1 RAY_DEDUP_LOGS=0 python tpcbench.py --data=file:///path/to/your/tpch/directory/ --concurrency=2 --batch-size=8182 --worker-pool-min=10 --qnum 2
 ```
 
 This executes TPCH query #2. To execute an arbitrary query against the TPCH dataset, provide it with `--query` instead of `--qnum`. This is useful for validating plans that DataFusion Ray will create.
 
 For example, to execute the following query:
 
 ```bash
-RAY_COLOR_PREFIX=1 RAY_DEDUP_LOGS=0 python tpc.py --data=file:///path/to/your/tpch/directory/ --concurrency=2 --batch-size=8182 --worker-pool-min=10 --query 'select c.c_name, sum(o.o_totalprice) as total from orders o inner join customer c on o.o_custkey = c.c_custkey group by c_name limit 1'
+RAY_COLOR_PREFIX=1 RAY_DEDUP_LOGS=0 python tpcbench.py --data=file:///path/to/your/tpch/directory/ --concurrency=2 --batch-size=8182 --worker-pool-min=10 --query 'select c.c_name, sum(o.o_totalprice) as total from orders o inner join customer c on o.o_custkey = c.c_custkey group by c_name limit 1'
 ```
 
 To further parallelize execution, you can choose how many partitions will be served by each Stage with `--partitions-per-processor`. If this number is less than `--concurrency`, then multiple Actors will host portions of the stage. For example, if there are 10 stages calculated for a query, `concurrency=16` and `partitions-per-processor=4`, then `40` `RayStage` Actors will be created. If `partitions-per-processor=16` or is absent, then `10` `RayStage` Actors will be created.
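To sanity-check the Actor-count arithmetic in that last context paragraph, here is a minimal sketch; the formula `stages * ceil(concurrency / partitions_per_processor)` is inferred from the example numbers in the docs, not taken verbatim from the code:

```python
import math

def ray_stage_actor_count(stages: int, concurrency: int,
                          partitions_per_processor: int | None) -> int:
    # Each stage serves `concurrency` partitions; each Actor hosts at most
    # `partitions_per_processor` of them (all of them when the flag is absent).
    ppp = partitions_per_processor or concurrency
    return stages * math.ceil(concurrency / ppp)

# The two cases quoted in contributing.md:
assert ray_stage_actor_count(10, 16, 4) == 40
assert ray_stage_actor_count(10, 16, None) == 10
```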

src/lib.rs (1 addition & 1 deletion)

@@ -44,7 +44,7 @@ fn _datafusion_ray_internal(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<dataframe::PyDFRayStage>()?;
     m.add_class::<processor_service::DFRayProcessorService>()?;
     m.add_function(wrap_pyfunction!(util::prettify, m)?)?;
-    m.add_function(wrap_pyfunction!(util::exec_sql_on_tables, m)?)?;
+    m.add_function(wrap_pyfunction!(util::exec_sqls_on_tables, m)?)?;
     Ok(())
 }

src/util.rs (24 additions & 19 deletions)

@@ -14,7 +14,7 @@ use arrow::error::ArrowError;
 use arrow::ipc::convert::fb_to_schema;
 use arrow::ipc::reader::StreamReader;
 use arrow::ipc::writer::{IpcWriteOptions, StreamWriter};
-use arrow::ipc::{root_as_message, MetadataVersion};
+use arrow::ipc::{MetadataVersion, root_as_message};
 use arrow::pyarrow::*;
 use arrow::util::pretty;
 use arrow_flight::{FlightClient, FlightData, Ticket};
@@ -30,16 +30,16 @@ use datafusion::error::DataFusionError;
 use datafusion::execution::object_store::ObjectStoreUrl;
 use datafusion::execution::{RecordBatchStream, SendableRecordBatchStream, SessionStateBuilder};
 use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
-use datafusion::physical_plan::{displayable, ExecutionPlan, ExecutionPlanProperties};
+use datafusion::physical_plan::{ExecutionPlan, ExecutionPlanProperties, displayable};
 use datafusion::prelude::{SessionConfig, SessionContext};
 use datafusion_proto::physical_plan::AsExecutionPlan;
 use datafusion_python::utils::wait_for_future;
 use futures::{Stream, StreamExt};
 use log::debug;
+use object_store::ObjectStore;
 use object_store::aws::AmazonS3Builder;
 use object_store::gcp::GoogleCloudStorageBuilder;
 use object_store::http::HttpBuilder;
-use object_store::ObjectStore;
 use parking_lot::Mutex;
 use pyo3::prelude::*;
 use pyo3::types::{PyBytes, PyList};
@@ -412,9 +412,9 @@ fn print_node(plan: &Arc<dyn ExecutionPlan>, indent: usize, output: &mut String)
 }
 
 async fn exec_sql(
-    query: String,
+    queries: Vec<String>,
     tables: Vec<(String, String)>,
-) -> Result<RecordBatch, DataFusionError> {
+) -> Result<Vec<RecordBatch>, DataFusionError> {
     let ctx = SessionContext::new();
     for (name, path) in tables {
         let opt =
@@ -428,34 +428,39 @@ async fn exec_sql(
         ctx.register_listing_table(&name, &path, opt, None, None)
             .await?;
     }
-    let df = ctx.sql(&query).await?;
-    let schema = df.schema().inner().clone();
-    let batches = df.collect().await?;
-    concat_batches(&schema, batches.iter()).map_err(|e| DataFusionError::ArrowError(e, None))
+    let mut results = vec![];
+    for query in queries {
+        let df = ctx.sql(&query).await?;
+        let schema = df.schema().inner().clone();
+        let batches = df.collect().await?;
+        let result = concat_batches(&schema, &batches)?;
+        results.push(result);
+    }
+    Ok(results)
 }
 
-/// Executes a query on the specified tables using DataFusion without Ray.
+/// Executes queries on the specified tables using DataFusion without Ray.
 ///
-/// Returns the query results as a RecordBatch that can be used to verify the
-/// correctness of DataFusion-Ray execution of the same query.
+/// Returns the query results as a Vec of RecordBatch that can be used to verify the
+/// correctness of DataFusion-Ray execution of the same queries.
 ///
 /// # Arguments
 ///
 /// * `py`: the Python token
-/// * `query`: the SQL query string to execute
+/// * `queries`: the SQL query strings to execute
 /// * `tables`: a list of `(name, url)` tuples specifying the tables to query;
 ///   the `url` identifies the parquet files for each listing table; see
 ///   [`datafusion::datasource::listing::ListingTableUrl::parse`] for details
 ///   of supported URL formats
 /// * `listing`: boolean indicating whether this is a listing table path or not
 #[pyfunction]
-#[pyo3(signature = (query, tables, listing=false))]
-pub fn exec_sql_on_tables(
+#[pyo3(signature = (queries, tables, listing=false))]
+pub fn exec_sqls_on_tables(
     py: Python,
-    query: String,
+    queries: Vec<String>,
     tables: Bound<'_, PyList>,
     listing: bool,
-) -> PyResult<PyObject> {
+) -> PyResult<Vec<PyObject>> {
     let table_vec = {
         let mut v = Vec::with_capacity(tables.len());
         for entry in tables.iter() {
@@ -465,8 +470,8 @@ pub fn exec_sql_on_tables(
         }
         v
     };
-    let batch = wait_for_future(py, exec_sql(query, table_vec))?;
-    batch.to_pyarrow(py)
+    let batches = wait_for_future(py, exec_sql(queries, table_vec))?;
+    batches.iter().map(|b| b.to_pyarrow(py)).collect()
 }
 
 pub(crate) fn register_object_store_for_paths_in_plan(
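A minimal sketch of calling the renamed binding from Python after this change; the table name and parquet URL below are placeholders for illustration, not taken from the commit:

```python
from datafusion_ray.util import exec_sqls_on_tables, prettify

# Hypothetical (name, url) pair; any parquet-backed TPCH table works here.
tables = [("customer", "file:///data/tpch/customer.parquet")]

# One pyarrow RecordBatch comes back per statement, in order.
batches = exec_sqls_on_tables(
    ["select count(*) from customer",
     "select c_name from customer limit 5"],
    tables,
    listing=False,
)
for batch in batches:
    print(prettify([batch]))
```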

tpch/tpcbench.py (53 additions & 23 deletions)

@@ -18,7 +18,7 @@
 import argparse
 import ray
 from datafusion_ray import DFRayContext, df_ray_runtime_env
-from datafusion_ray.util import exec_sql_on_tables, prettify
+from datafusion_ray.util import exec_sqls_on_tables, prettify
 from datetime import datetime
 import json
 import os
@@ -31,7 +31,7 @@ def tpch_query(qnum: int) -> str:
 
 
 def main(
-    qnum: int,
+    queries: list[(str, str)],
     data_path: str,
     concurrency: int,
     batch_size: int,
@@ -95,35 +95,43 @@ def main(
     if validate:
         results["validated"] = {}
 
-    queries = range(1, 23) if qnum == -1 else [qnum]
-    for qnum in queries:
-        sql = tpch_query(qnum)
-
-        statements = sql.split(";")
-        sql = statements[0]
-
+    for (qid, sql) in queries:
         print("executing ", sql)
 
+        statements = [s for s in sql.split(";") if s.strip() != ""]
         start_time = time.time()
-        df = ctx.sql(sql)
-        batches = df.collect()
+        batches = [ctx.sql(s).collect() for s in statements]
        end_time = time.time()
-        results["queries"][qnum] = end_time - start_time
+        results["queries"][qid] = end_time - start_time
 
-        calculated = prettify(batches)
-        print(calculated)
+        calculated = [prettify(batch) for batch in batches if batch]
+        for pretty_batch in calculated:
+            print(pretty_batch)
         if validate:
             tables = [
                 (name, os.path.join(data_path, f"{name}.parquet"))
                 for name in table_names
             ]
-            answer_batches = [
-                b for b in [exec_sql_on_tables(sql, tables, listing_tables)] if b
-            ]
-            expected = prettify(answer_batches)
-
-            results["validated"][qnum] = calculated == expected
-        print(f"done with query {qnum}")
+            answer_batches = [b for b in exec_sqls_on_tables(
+                statements, tables, listing_tables) if b]
+
+            validated = True
+            if len(answer_batches) == len(calculated):
+                expected = [prettify([answer_batch])
+                            for answer_batch in answer_batches]
+                validated = all(x[0] == x[1]
+                                for x in zip(calculated, expected))
+                for x in zip(calculated, expected):
+                    if x[0] != x[1]:
+                        print(f"Expected:\n{x[1]}")
+                        print(f"Got:\n{x[0]}")
+            else:
+                print(
+                    f"Expected {len(answer_batches)} batches, got {len(calculated)}")
+                validated = False
+
+            results["validated"][qid] = validated
+        print(f"done with query {qid}")
 
     # write the results as we go, so you can peek at them
     results_dump = json.dumps(results, indent=4)
@@ -151,7 +159,10 @@ def main(
     parser.add_argument(
         "--concurrency", required=True, help="Number of concurrent tasks"
     )
-    parser.add_argument("--qnum", type=int, default=-1, help="TPCH query number, 1-22")
+    parser.add_argument("--qnum", type=int, default=-1,
+                        help="TPCH query number, 1-22")
+    parser.add_argument("--query", required=False, type=str,
+                        help="Custom query to run with tpch tables")
     parser.add_argument("--listing-tables", action="store_true")
     parser.add_argument("--validate", action="store_true")
     parser.add_argument(
@@ -183,8 +194,27 @@ def main(
 
     args = parser.parse_args()
 
+    if (args.qnum != -1 and args.query is not None):
+        print("Please specify either --qnum or --query, but not both")
+
+    queries = []
+    if (args.qnum != -1):
+        if args.qnum < 1 or args.qnum > 22:
+            print("Invalid query number. Please specify a number between 1 and 22.")
+            exit(1)
+        else:
+            queries.append((str(args.qnum), tpch_query(args.qnum)))
+            print("Executing tpch query ", args.qnum)
+
+    elif (args.query is not None):
+        queries.append(("custom query", args.query))
+        print("Executing custom query ", args.query)
+    else:
+        print("Executing all tpch queries")
+        queries = [(str(i), tpch_query(i)) for i in range(1, 23)]
+
     main(
-        args.qnum,
+        queries,
         args.data,
         int(args.concurrency),
        int(args.batch_size),
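The semicolon splitting above is what lets the benchmark run multi-statement TPC-H queries; q15, in its standard form, is three statements (create a revenue view, select from it, drop it). A minimal sketch of that splitting, using an abbreviated q15-shaped string rather than the real query text:

```python
sql = """create view revenue0 as
    select l_suppkey as supplier_no from lineitem;
select supplier_no from revenue0;
drop view revenue0;"""

# Mirrors tpcbench.py: empty fragments (e.g. after the final
# semicolon) are dropped before execution.
statements = [s for s in sql.split(";") if s.strip() != ""]
assert len(statements) == 3
```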
