diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs index 265595c59d41..50aea80fc9b4 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! SessionContext contains methods for registering data sources and executing queries +//! [`SessionContext`] contains methods for registering data sources and executing queries use crate::{ catalog::catalog::{CatalogList, MemoryCatalogList}, datasource::{ @@ -158,11 +158,15 @@ where } } -/// SessionContext is the main interface for executing queries with DataFusion. It stands for -/// the connection between user and DataFusion/Ballista cluster. -/// The context provides the following functionality +/// Main interface for executing queries with DataFusion. Maintains +/// the state of the connection between a user and an instance of the +/// DataFusion engine. /// -/// * Create DataFrame from a CSV or Parquet data source. +/// # Overview +/// +/// [`SessionContext`] provides the following functionality: +/// +/// * Create a DataFrame from a CSV or Parquet data source. /// * Register a CSV or Parquet data source as a table that can be referenced from a SQL query. /// * Register a custom data source that can be referenced from a SQL query. /// * Execution a SQL query @@ -199,6 +203,20 @@ where /// # Ok(()) /// # } /// ``` +/// +/// # `SessionContext`, `SessionState`, and `TaskContext` +/// +/// A [`SessionContext`] can be created from a [`SessionConfig`] and +/// stores the state for a particular query session. A single +/// [`SessionContext`] can run multiple queries. +/// +/// [`SessionState`] contains information available during query +/// planning (creating [`LogicalPlan`]s and [`ExecutionPlan`]s). +/// +/// [`TaskContext`] contains the state available during query +/// execution [`ExecutionPlan::execute`]. It contains a subset of the +/// information in[`SessionState`] and is created from a +/// [`SessionContext`] or a [`SessionState`]. #[derive(Clone)] pub struct SessionContext { /// UUID for the session @@ -216,7 +234,7 @@ impl Default for SessionContext { } impl SessionContext { - /// Creates a new execution context using a default session configuration. + /// Creates a new `SessionContext` using the default [`SessionConfig`]. pub fn new() -> Self { Self::with_config(SessionConfig::new()) } @@ -241,19 +259,35 @@ impl SessionContext { Ok(()) } - /// Creates a new session context using the provided session configuration. + /// Creates a new `SessionContext` using the provided + /// [`SessionConfig`] and a new [`RuntimeEnv`]. + /// + /// See [`Self::with_config_rt`] for more details on resource + /// limits. pub fn with_config(config: SessionConfig) -> Self { let runtime = Arc::new(RuntimeEnv::default()); Self::with_config_rt(config, runtime) } - /// Creates a new session context using the provided configuration and [`RuntimeEnv`]. + /// Creates a new `SessionContext` using the provided + /// [`SessionConfig`] and a [`RuntimeEnv`]. + /// + /// # Resource Limits + /// + /// By default, each new `SessionContext` creates a new + /// `RuntimeEnv`, and therefore will not enforce memory or disk + /// limits for queries run on different `SessionContext`s. + /// + /// To enforce resource limits (e.g. to limit the total amount of + /// memory used) across all DataFusion queries in a process, + /// all `SessionContext`'s should be configured with the + /// same `RuntimeEnv`. pub fn with_config_rt(config: SessionConfig, runtime: Arc) -> Self { let state = SessionState::with_config_rt(config, runtime); Self::with_state(state) } - /// Creates a new session context using the provided session state. + /// Creates a new `SessionContext` using the provided [`SessionState`] pub fn with_state(state: SessionState) -> Self { Self { session_id: state.session_id.clone(), @@ -262,7 +296,7 @@ impl SessionContext { } } - /// Returns the time this session was created + /// Returns the time this `SessionContext` was created pub fn session_start_time(&self) -> DateTime { self.session_start_time } @@ -282,12 +316,12 @@ impl SessionContext { ) } - /// Return the [RuntimeEnv] used to run queries with this [SessionContext] + /// Return the [RuntimeEnv] used to run queries with this `SessionContext` pub fn runtime_env(&self) -> Arc { self.state.read().runtime_env.clone() } - /// Return the `session_id` of this Session + /// Returns an id that uniquely identifies this `SessionContext`. pub fn session_id(&self) -> String { self.session_id.clone() } @@ -1205,7 +1239,7 @@ impl QueryPlanner for DefaultQueryPlanner { /// Execution context for registering data sources and executing queries #[derive(Clone)] pub struct SessionState { - /// UUID for the session + /// A unique UUID that identifies the session session_id: String, /// Responsible for analyzing and rewrite a logical plan before optimization analyzer: Analyzer, @@ -1252,7 +1286,8 @@ pub fn default_session_builder(config: SessionConfig) -> SessionState { } impl SessionState { - /// Returns new SessionState using the provided configuration and runtime + /// Returns new [`SessionState`] using the provided + /// [`SessionConfig`] and [`RuntimeEnv`]. pub fn with_config_rt(config: SessionConfig, runtime: Arc) -> Self { let catalog_list = Arc::new(MemoryCatalogList::new()) as Arc; Self::with_config_rt_and_catalog_list(config, runtime, catalog_list) diff --git a/datafusion/core/src/execution/mod.rs b/datafusion/core/src/execution/mod.rs index ad9b9ce2125b..fa6c4e118e5d 100644 --- a/datafusion/core/src/execution/mod.rs +++ b/datafusion/core/src/execution/mod.rs @@ -15,30 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! This module contains the shared state available at different parts -//! of query planning and execution -//! -//! # Runtime Environment -//! -//! [`runtime_env::RuntimeEnv`] can be created from a [`runtime_env::RuntimeConfig`] and -//! stores state to be shared across multiple sessions. In most applications there will -//! be a single [`runtime_env::RuntimeEnv`] for the entire process -//! -//! # Session Context -//! -//! [`context::SessionContext`] can be created from a [`context::SessionConfig`] and -//! an optional [`runtime_env::RuntimeConfig`], and stores the state for a particular -//! query session. -//! -//! In particular [`context::SessionState`] is the information available to query planning -//! -//! # Task Context -//! -//! [`context::TaskContext`] is typically created from a [`context::SessionContext`] or -//! [`context::SessionState`], and represents the state available to query execution. -//! -//! In particular it is the state passed to [`crate::physical_plan::ExecutionPlan::execute`] -//! +//! Shared state for query planning and execution. pub mod context; // backwards compatibility diff --git a/datafusion/execution/src/runtime_env.rs b/datafusion/execution/src/runtime_env.rs index 67736edf6804..8f9c594681d0 100644 --- a/datafusion/execution/src/runtime_env.rs +++ b/datafusion/execution/src/runtime_env.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -//! Execution runtime environment that holds object Store, memory manager, disk manager -//! and various system level components that are used during physical plan execution. +//! Execution [`RuntimeEnv`] environment that manages access to object +//! store, memory manager, disk manager. use crate::{ disk_manager::{DiskManager, DiskManagerConfig}, @@ -32,7 +32,15 @@ use std::sync::Arc; use url::Url; #[derive(Clone)] -/// Execution runtime environment. +/// Execution runtime environment that manages system resources such +/// as memory, disk and storage. +/// +/// A [`RuntimeEnv`] is created from a [`RuntimeConfig`] and has the +/// following resource management functionality: +/// +/// * [`MemoryPool`]: Manage memory +/// * [`DiskManager`]: Manage temporary files on local disk +/// * [`ObjectStoreRegistry`]: Manage mapping URLs to object store instances pub struct RuntimeEnv { /// Runtime memory management pub memory_pool: Arc, diff --git a/datafusion/execution/src/task.rs b/datafusion/execution/src/task.rs index 9f73f767af05..ca1bc9369e35 100644 --- a/datafusion/execution/src/task.rs +++ b/datafusion/execution/src/task.rs @@ -32,6 +32,11 @@ use crate::{ }; /// Task Execution Context +/// +/// A [`TaskContext`] has represents the state available during a single query's +/// execution. +/// +/// # Task Context pub struct TaskContext { /// Session Id session_id: String, @@ -98,7 +103,7 @@ impl TaskContext { )) } - /// Return the SessionConfig associated with the Task + /// Return the SessionConfig associated with this [TaskContext] pub fn session_config(&self) -> &SessionConfig { &self.session_config } diff --git a/datafusion/physical-expr/src/execution_props.rs b/datafusion/physical-expr/src/execution_props.rs index ff413be36141..5849850031b1 100644 --- a/datafusion/physical-expr/src/execution_props.rs +++ b/datafusion/physical-expr/src/execution_props.rs @@ -20,10 +20,12 @@ use chrono::{DateTime, TimeZone, Utc}; use std::collections::HashMap; use std::sync::Arc; -/// Holds per-execution properties and data (such as starting timestamps, etc). -/// An instance of this struct is created each time a [`LogicalPlan`] is prepared for -/// execution (optimized). If the same plan is optimized multiple times, a new -/// `ExecutionProps` is created each time. +/// Holds per-query execution properties and data (such as statment +/// starting timestamps). +/// +/// An [`ExecutionProps`] is created each time a [`LogicalPlan`] is +/// prepared for execution (optimized). If the same plan is optimized +/// multiple times, a new `ExecutionProps` is created each time. /// /// It is important that this structure be cheap to create as it is /// done so during predicate pruning and expression simplification