From 415821361bc4a950e3e956569a89adf02d06c723 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 14 Feb 2025 08:51:29 -0500 Subject: [PATCH] Improve docs `TableSource` and `DefaultTableSource` --- datafusion/catalog/src/table.rs | 5 ++-- .../src/datasource/default_table_source.rs | 13 ++++++---- datafusion/expr/src/table_source.rs | 25 +++++++++++++------ 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/datafusion/catalog/src/table.rs b/datafusion/catalog/src/table.rs index 88d2d8bde51e..ecc792f73d30 100644 --- a/datafusion/catalog/src/table.rs +++ b/datafusion/catalog/src/table.rs @@ -33,18 +33,19 @@ use datafusion_expr::{ }; use datafusion_physical_plan::ExecutionPlan; -/// A named table which can be queried. +/// A table which can be queried and modified. /// /// Please see [`CatalogProvider`] for details of implementing a custom catalog. /// /// [`TableProvider`] represents a source of data which can provide data as -/// Apache Arrow `RecordBatch`es. Implementations of this trait provide +/// Apache Arrow [`RecordBatch`]es. Implementations of this trait provide /// important information for planning such as: /// /// 1. [`Self::schema`]: The schema (columns and their types) of the table /// 2. [`Self::supports_filters_pushdown`]: Should filters be pushed into this scan /// 2. 
[`Self::scan`]: An [`ExecutionPlan`] that can read data /// +/// [`RecordBatch`]: https://docs.rs/arrow/latest/arrow/record_batch/struct.RecordBatch.html /// [`CatalogProvider`]: super::CatalogProvider #[async_trait] pub trait TableProvider: Debug + Sync + Send { diff --git a/datafusion/core/src/datasource/default_table_source.rs b/datafusion/core/src/datasource/default_table_source.rs index 91c1e0ac97fc..541e0b6dfa91 100644 --- a/datafusion/core/src/datasource/default_table_source.rs +++ b/datafusion/core/src/datasource/default_table_source.rs @@ -26,12 +26,15 @@ use arrow::datatypes::SchemaRef; use datafusion_common::{internal_err, Constraints}; use datafusion_expr::{Expr, TableProviderFilterPushDown, TableSource, TableType}; -/// DataFusion default table source, wrapping TableProvider. +/// Implements [`TableSource`] for a [`TableProvider`] /// -/// This structure adapts a `TableProvider` (physical plan trait) to the `TableSource` -/// (logical plan trait) and is necessary because the logical plan is contained in -/// the `datafusion_expr` crate, and is not aware of table providers, which exist in -/// the core `datafusion` crate. +/// This structure adapts a [`TableProvider`] (a physical plan trait) to the +/// [`TableSource`] (logical plan trait). +/// +/// It is used so logical plans in the `datafusion_expr` crate do not have a +/// direct dependency on physical plans, such as [`TableProvider`]s. +/// +/// [`TableProvider`]: https://docs.rs/datafusion/latest/datafusion/datasource/provider/trait.TableProvider.html pub struct DefaultTableSource { /// table provider pub table_provider: Arc, diff --git a/datafusion/expr/src/table_source.rs b/datafusion/expr/src/table_source.rs index d62484153f53..d6155cfb5dc0 100644 --- a/datafusion/expr/src/table_source.rs +++ b/datafusion/expr/src/table_source.rs @@ -71,24 +71,33 @@ impl std::fmt::Display for TableType { } } -/// Access schema information and filter push-down capabilities. 
+/// Planning-time information about a table. /// -/// The TableSource trait is used during logical query planning and -/// optimizations and provides a subset of the functionality of the -/// `TableProvider` trait in the (core) `datafusion` crate. The `TableProvider` -/// trait provides additional capabilities needed for physical query execution -/// (such as the ability to perform a scan). +/// This trait is used during logical query planning and optimizations, and +/// provides a subset of the [`TableProvider`] trait, such as schema information +/// and filter push-down capabilities. The [`TableProvider`] trait provides +/// additional information needed for physical query execution, such as the +/// ability to perform a scan or insert data. +/// +/// # See Also +/// +/// See [`DefaultTableSource`] to convert a [`TableProvider`] into a `TableSource`. +/// +/// # Rationale /// /// The reason for having two separate traits is to avoid having the logical /// plan code be dependent on the DataFusion execution engine. Some projects use /// DataFusion's logical plans and have their own execution engine. +/// +/// [`TableProvider`]: https://docs.rs/datafusion/latest/datafusion/datasource/provider/trait.TableProvider.html +/// [`DefaultTableSource`]: https://docs.rs/datafusion/latest/datafusion/datasource/default_table_source/struct.DefaultTableSource.html pub trait TableSource: Sync + Send { fn as_any(&self) -> &dyn Any; /// Get a reference to the schema for this table fn schema(&self) -> SchemaRef; - /// Get primary key indices, if one exists. + /// Get the table's constraints (such as primary key indices), if any fn constraints(&self) -> Option<&Constraints> { None } @@ -110,6 +119,8 @@ pub trait TableSource: Sync + Send { } /// Get the Logical plan of this table provider, if available. + /// + /// For example, a view may have a logical plan, but a CSV file does not. fn get_logical_plan(&self) -> Option> { None }