Merged
Changes from 1 commit
2 changes: 1 addition & 1 deletion datafusion_ray/__init__.py
@@ -20,6 +20,6 @@
except ImportError:
import importlib_metadata

from .core import RayContext, prettify, runtime_env, RayStagePool
from .core import RayContext, exec_sql_on_tables, prettify, runtime_env, RayStagePool

__version__ = importlib_metadata.version(__name__)
10 changes: 8 additions & 2 deletions datafusion_ray/core.py
@@ -31,6 +31,7 @@
from datafusion_ray._datafusion_ray_internal import (
RayContext as RayContextInternal,
RayDataFrame as RayDataFrameInternal,
exec_sql_on_tables,
prettify,
)

@@ -465,6 +466,9 @@ def stages(self):

return self._stages

def schema(self):
return self.df.schema()

def execution_plan(self):
return self.df.execution_plan()

@@ -479,7 +483,7 @@ def collect(self) -> list[pa.RecordBatch]:
t1 = time.time()
self.stages()
t2 = time.time()
log.debug(f"creating stages took {t2 -t1}s")
log.debug(f"creating stages took {t2 - t1}s")

last_stage_id = max([stage.stage_id for stage in self._stages])
log.debug(f"last stage is {last_stage_id}")
@@ -553,7 +557,9 @@ def __init__(
s = time.time()
call_sync(wait_for([start_ref], "RayContextSupervisor start"))
e = time.time()
log.info(f"RayContext::__init__ waiting for supervisor to be ready took {e-s}s")
log.info(
f"RayContext::__init__ waiting for supervisor to be ready took {e - s}s"
)

def register_parquet(self, name: str, path: str):
self.ctx.register_parquet(name, path)
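As a side note, here is a minimal sketch of how the new `schema()` accessor on a `RayDataFrame` might be used from Python. Only `register_parquet`, `sql`, `schema`, and `collect` are taken from this diff; the `RayContext` constructor arguments and the data path are placeholders.

```python
from datafusion_ray import RayContext

# Placeholder setup: the real RayContext constructor arguments (batch size,
# prefetch buffer, etc.) are not shown in this diff.
ctx = RayContext()
ctx.register_parquet("customer", "/data/customer.parquet")  # hypothetical path

df = ctx.sql("SELECT c_custkey, c_name FROM customer LIMIT 10")
print(df.schema())      # pyarrow schema of the query result (new in this PR)
batches = df.collect()  # list of pyarrow RecordBatches
```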
4 changes: 4 additions & 0 deletions src/dataframe.rs
@@ -266,6 +266,10 @@ impl RayDataFrame {
Ok(PyLogicalPlan::new(self.df.logical_plan().clone()))
}

fn schema(&self, py: Python) -> PyResult<PyObject> {
self.df.schema().as_arrow().to_pyarrow(py)
}

fn optimized_logical_plan(&self) -> PyResult<PyLogicalPlan> {
Ok(PyLogicalPlan::new(self.df.clone().into_optimized_plan()?))
}
1 change: 1 addition & 0 deletions src/lib.rs
@@ -44,6 +44,7 @@ fn _datafusion_ray_internal(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<dataframe::PyDataFrameStage>()?;
m.add_class::<stage_service::StageService>()?;
m.add_function(wrap_pyfunction!(util::prettify, m)?)?;
m.add_function(wrap_pyfunction!(util::exec_sql_on_tables, m)?)?;
Ok(())
}

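Once registered here, the binding lives on the private extension module and is re-exported by `datafusion_ray/core.py` and `datafusion_ray/__init__.py` above. A short illustration of the import chain, nothing more:

```python
# The pyo3 binding lives in the private extension module; user code would
# normally use the package-level re-export instead (see __init__.py above).
from datafusion_ray._datafusion_ray_internal import exec_sql_on_tables as _internal_fn
from datafusion_ray import exec_sql_on_tables

assert exec_sql_on_tables is _internal_fn  # same underlying Rust function
```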
51 changes: 51 additions & 0 deletions src/util.rs
@@ -8,6 +8,7 @@ use std::task::{Context, Poll};
use std::time::Duration;

use arrow::array::RecordBatch;
use arrow::compute::concat_batches;
use arrow::datatypes::SchemaRef;
use arrow::error::ArrowError;
use arrow::ipc::convert::fb_to_schema;
@@ -20,13 +21,17 @@ use arrow_flight::{FlightClient, FlightData, Ticket};
use async_stream::stream;
use datafusion::common::internal_datafusion_err;
use datafusion::common::tree_node::{Transformed, TreeNode};
use datafusion::datasource::file_format::options::ParquetReadOptions;
use datafusion::datasource::file_format::parquet::ParquetFormat;
use datafusion::datasource::listing::ListingOptions;
use datafusion::datasource::physical_plan::ParquetExec;
use datafusion::error::DataFusionError;
use datafusion::execution::{RecordBatchStream, SendableRecordBatchStream, SessionStateBuilder};
use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
use datafusion::physical_plan::{displayable, ExecutionPlan, ExecutionPlanProperties};
use datafusion::prelude::{SessionConfig, SessionContext};
use datafusion_proto::physical_plan::AsExecutionPlan;
use datafusion_python::utils::wait_for_future;
use futures::{Stream, StreamExt};
use parking_lot::Mutex;
use pyo3::prelude::*;
@@ -397,6 +402,52 @@ fn print_node(plan: &Arc<dyn ExecutionPlan>, indent: usize, output: &mut String)
}
}

async fn exec_sql(query: String, tables: Vec<(String, String)>) -> PyResult<RecordBatch> {
let ctx = SessionContext::new();
for (name, path) in tables {
if path.ends_with(".parquet") {
[Inline review comment on the line above]

Member:
It may be better to check whether path is a file or a directory rather than basing the logic on the file extension.
For example, in my local setup, I have a directory named customer.parquet that contains multiple Parquet files.

@vmingchen (Contributor, author) on Feb 27, 2025:
Thanks for pointing this out! I looked into it, and it turns out register_parquet internally uses register_listing_table as well. The latter is capable of registering both a single file and a directory of files. So I have changed the function to use register_listing_table only in b8e0c6b; the new commit also adds a unit test to check that it works for both a file and a directory, and a doc pointing to the format of the URI.
Please take another look; thanks!

Member:
Thanks for updating that. LGTM.

let opt = ParquetReadOptions::default();
ctx.register_parquet(&name, &path, opt).await?;
} else {
let opt =
ListingOptions::new(Arc::new(ParquetFormat::new())).with_file_extension(".parquet");
ctx.register_listing_table(&name, &path, opt, None, None)
.await?;
}
}
let df = ctx.sql(&query).await?;
let schema = df.schema().inner().clone();
let batches = df.collect().await?;
concat_batches(&schema, batches.iter()).to_py_err()
}
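Regarding the review thread above: a rough Python-level illustration of the listing-table approach the author says the follow-up commit switches to, using the datafusion `SessionContext` API that tpcbench.py already relies on. The paths are hypothetical.

```python
from datafusion import SessionContext

ctx = SessionContext()

# Per the review reply, a listing table can be backed by either a directory of
# Parquet files or a single file, so no extension-based branching is needed.
# Both paths below are made up for illustration.
ctx.register_listing_table("lineitem", "/data/lineitem.parquet/")  # directory of Parquet files
ctx.register_listing_table("nation", "/data/nation.parquet")       # single Parquet file

print(ctx.sql("SELECT count(*) FROM lineitem").collect())
```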

/// Executes a query on the specified tables using DataFusion without Ray.
///
/// Returns the query results as a RecordBatch that can be used to verify the
/// correctness of DataFusion-Ray execution of the same query.
///
/// # Arguments
///
/// * `py`: the Python token
/// * `query`: the SQL query string to execute
/// * `tables`: a list of `(name, path)` tuples specifying the tables to query
#[pyfunction]
pub fn exec_sql_on_tables(
py: Python,
query: String,
tables: Bound<'_, PyList>,
) -> PyResult<PyObject> {
let table_vec = {
let mut v = Vec::with_capacity(tables.len());
for entry in tables.iter() {
v.push(entry.extract::<(String, String)>()?);
}
v
};
let batch = wait_for_future(py, exec_sql(query, table_vec))?;
batch.to_pyarrow(py)
}
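A minimal sketch of calling the new binding from Python, mirroring the way tpcbench.py uses it further down; the table name and path are placeholders.

```python
from datafusion_ray import exec_sql_on_tables

# (name, path) pairs. In this commit a path ending in ".parquet" is registered
# as a single file, anything else as a directory of Parquet files; the path
# below is a placeholder.
tables = [("customer", "/data/customer.parquet")]

batch = exec_sql_on_tables("SELECT count(*) AS n FROM customer", tables)
print(batch.num_rows, batch.schema)  # a single pyarrow RecordBatch with the full result
```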

#[cfg(test)]
mod test {
use std::sync::Arc;
14 changes: 7 additions & 7 deletions tpch/tpcbench.py
@@ -18,7 +18,7 @@
import argparse
import ray
from datafusion import SessionContext, SessionConfig
from datafusion_ray import RayContext, prettify, runtime_env
from datafusion_ray import RayContext, exec_sql_on_tables, prettify, runtime_env
from datetime import datetime
import json
import os
@@ -49,7 +49,6 @@ def main(
validate: bool,
prefetch_buffer_size: int,
):

# Register the tables
table_names = [
"customer",
@@ -79,17 +78,13 @@

local_config = SessionConfig()

local_ctx = SessionContext(local_config)

for table in table_names:
path = os.path.join(data_path, f"{table}.parquet")
print(f"Registering table {table} using path {path}")
if listing_tables:
ctx.register_listing_table(table, f"{path}/")
local_ctx.register_listing_table(table, f"{path}/")
else:
ctx.register_parquet(table, path)
local_ctx.register_parquet(table, path)

current_time_millis = int(datetime.now().timestamp() * 1000)
results_path = f"datafusion-ray-tpch-{current_time_millis}.json"
Expand Down Expand Up @@ -125,6 +120,7 @@ def main(
start_time = time.time()
df = ctx.sql(sql)
end_time = time.time()
print(f"Ray output schema {df.schema()}")
print("Logical plan \n", df.logical_plan().display_indent())
print("Optimized Logical plan \n", df.optimized_logical_plan().display_indent())
part1 = end_time - start_time
@@ -143,7 +139,11 @@ def main(
print(calculated)
if validate:
start_time = time.time()
answer_batches = local_ctx.sql(sql).collect()
tables = [
(name, os.path.join(data_path, f"{name}.parquet"))
for name in table_names
]
answer_batches = [b for b in [exec_sql_on_tables(sql, tables)] if b]
end_time = time.time()
results["local_queries"][qnum] = end_time - start_time

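For the validation step, one plausible way to compare the two results, sketched with plain pyarrow rather than any helper from this repository. Here `ray_batches` is assumed to hold the DataFusion-Ray output, `answer_batches` the `exec_sql_on_tables` output, and `sort_keys` an assumed column ordering that makes row order deterministic.

```python
import pyarrow as pa

def results_match(ray_batches, answer_batches, sort_keys):
    # Concatenate each side into a single table and normalize row order before
    # comparing; sort_keys is e.g. [("c_custkey", "ascending")] and is an
    # assumption, since the query's ORDER BY may not cover every column.
    ray_tbl = pa.Table.from_batches(ray_batches).sort_by(sort_keys)
    ans_tbl = pa.Table.from_batches(answer_batches).sort_by(sort_keys)
    return ray_tbl.equals(ans_tbl)
```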