diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md index 4b0e64ebdb7e..3e696d6b93a6 100644 --- a/datafusion-examples/README.md +++ b/datafusion-examples/README.md @@ -62,6 +62,7 @@ cargo run --example csv_sql - [`memtable.rs`](examples/memtable.rs): Create an query data in memory using SQL and `RecordBatch`es - [`parquet_sql.rs`](examples/parquet_sql.rs): Build and run a query plan from a SQL statement against a local Parquet file - [`parquet_sql_multiple_files.rs`](examples/parquet_sql_multiple_files.rs): Build and run a query plan from a SQL statement against multiple local Parquet files +- [`parquet_exec_visitor.rs`](examples/parquet_exec_visitor.rs): Extract statistics by visiting an `ExecutionPlan` after execution - [`pruning.rs`](examples/parquet_sql.rs): Use pruning to rule out files based on statistics - [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3 - [`query-http-csv.rs`](examples/query-http-csv.rs): Configure `object_store` and run a query against files vi HTTP diff --git a/datafusion-examples/examples/parquet_exec_visitor.rs b/datafusion-examples/examples/parquet_exec_visitor.rs new file mode 100644 index 000000000000..eeb288beb0df --- /dev/null +++ b/datafusion-examples/examples/parquet_exec_visitor.rs @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use datafusion::datasource::file_format::parquet::ParquetFormat;
+use datafusion::datasource::listing::{ListingOptions, PartitionedFile};
+use datafusion::datasource::physical_plan::ParquetExec;
+use datafusion::execution::context::SessionContext;
+use datafusion::physical_plan::metrics::MetricValue;
+use datafusion::physical_plan::{
+    execute_stream, visit_execution_plan, ExecutionPlan, ExecutionPlanVisitor,
+};
+use futures::StreamExt;
+
+/// Example of collecting metrics after execution by visiting the `ExecutionPlan`
+#[tokio::main]
+async fn main() {
+    let ctx = SessionContext::new();
+
+    let test_data = datafusion::test_util::parquet_test_data();
+
+    // Configure listing options
+    let file_format = ParquetFormat::default().with_enable_pruning(true);
+    let listing_options = ListingOptions::new(Arc::new(file_format));
+
+    // Register the file by absolute path (no extra setup); fail fast if registration fails.
+    ctx.register_listing_table(
+        "my_table",
+        &format!("file://{test_data}/alltypes_plain.parquet"),
+        listing_options.clone(),
+        None,
+        None,
+    )
+    .await
+    .unwrap();
+
+    let df = ctx.sql("SELECT * FROM my_table").await.unwrap();
+    let plan = df.create_physical_plan().await.unwrap();
+
+    // Create empty visitor
+    let mut visitor = ParquetExecVisitor {
+        file_groups: None,
+        bytes_scanned: None,
+    };
+
+    // Make sure you execute the plan to collect actual execution statistics.
+    // For example, in this example the `file_scan_config` is known without executing
+    // but the `bytes_scanned` would be None if we did not execute.
+    let mut batch_stream = execute_stream(plan.clone(), ctx.task_ctx()).unwrap();
+    while let Some(batch) = batch_stream.next().await {
+        println!("Batch rows: {}", batch.unwrap().num_rows());
+    }
+
+    visit_execution_plan(plan.as_ref(), &mut visitor).unwrap();
+
+    println!(
+        "ParquetExecVisitor bytes_scanned: {:?}",
+        visitor.bytes_scanned
+    );
+    println!(
+        "ParquetExecVisitor file_groups: {:?}",
+        visitor.file_groups.unwrap()
+    );
+}
+
+/// Define a struct with fields to hold the execution information you want to
+/// collect. In this case, I want information on how many bytes were scanned
+/// and `file_groups` from the `FileScanConfig`.
+#[derive(Debug)]
+struct ParquetExecVisitor {
+    file_groups: Option<Vec<Vec<PartitionedFile>>>,
+    bytes_scanned: Option<MetricValue>,
+}
+
+impl ExecutionPlanVisitor for ParquetExecVisitor {
+    type Error = datafusion_common::DataFusionError;
+
+    /// This function is called once for every node in the tree.
+    /// Based on your needs implement either `pre_visit` (visit each node before its children/inputs)
+    /// or `post_visit` (visit each node after its children/inputs)
+    fn pre_visit(&mut self, plan: &dyn ExecutionPlan) -> Result<bool, Self::Error> {
+        // If needed match on a specific `ExecutionPlan` node type
+        let maybe_parquet_exec = plan.as_any().downcast_ref::<ParquetExec>();
+        if let Some(parquet_exec) = maybe_parquet_exec {
+            self.file_groups = Some(parquet_exec.base_config().file_groups.clone());
+
+            // Leave `bytes_scanned` untouched when the node reports no metrics;
+            // either way fall through to `Ok(true)` below.
+            if let Some(metrics) = parquet_exec.metrics() {
+                self.bytes_scanned = metrics.sum_by_name("bytes_scanned");
+            }
+        }
+        Ok(true)
+    }
+}