Skip to content

Commit ae70d72

Browse files
author
Wang
committed
Refactor ExecutionContext and related conf to support multi-tenancy configurations
Do not use static singleton RuntimeEnv in both DataFusion and Ballista Add SessionBuilder fn type to SchedulerServer to allow customized SessionContext creation fix client BallistaContext
1 parent 0e440ea commit ae70d72

File tree

146 files changed

+3932
-2663
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

146 files changed

+3932
-2663
lines changed

ballista-examples/src/bin/ballista-dataframe.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ async fn main() -> Result<()> {
2525
let config = BallistaConfig::builder()
2626
.set("ballista.shuffle.partitions", "4")
2727
.build()?;
28-
let ctx = BallistaContext::remote("localhost", 50050, &config);
28+
let ctx = BallistaContext::remote("localhost", 50050, &config).await?;
2929

3030
let testdata = datafusion::arrow::util::test_util::parquet_test_data();
3131

ballista-examples/src/bin/ballista-sql.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ async fn main() -> Result<()> {
2525
let config = BallistaConfig::builder()
2626
.set("ballista.shuffle.partitions", "4")
2727
.build()?;
28-
let ctx = BallistaContext::remote("localhost", 50050, &config);
28+
let ctx = BallistaContext::remote("localhost", 50050, &config).await?;
2929

3030
let testdata = datafusion::arrow::util::test_util::arrow_test_data();
3131

ballista/rust/client/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ async fn main() -> Result<()> {
106106
.build()?;
107107
108108
// connect to Ballista scheduler
109-
let ctx = BallistaContext::remote("localhost", 50050, &config);
109+
let ctx = BallistaContext::remote("localhost", 50050, &config).await?;
110110
111111
// register csv file with the execution context
112112
ctx.register_csv(

ballista/rust/client/src/context.rs

Lines changed: 126 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
//! Distributed execution context.
1919
20+
use log::info;
2021
use parking_lot::Mutex;
2122
use sqlparser::ast::Statement;
2223
use std::collections::HashMap;
@@ -25,7 +26,8 @@ use std::path::PathBuf;
2526
use std::sync::Arc;
2627

2728
use ballista_core::config::BallistaConfig;
28-
use ballista_core::serde::protobuf::LogicalPlanNode;
29+
use ballista_core::serde::protobuf::scheduler_grpc_client::SchedulerGrpcClient;
30+
use ballista_core::serde::protobuf::{ExecuteQueryParams, KeyValuePair, LogicalPlanNode};
2931
use ballista_core::utils::create_df_ctx_with_ballista_query_planner;
3032

3133
use datafusion::catalog::TableReference;
@@ -35,7 +37,7 @@ use datafusion::error::{DataFusionError, Result};
3537
use datafusion::execution::dataframe_impl::DataFrameImpl;
3638
use datafusion::logical_plan::{CreateExternalTable, LogicalPlan, TableScan};
3739
use datafusion::prelude::{
38-
AvroReadOptions, CsvReadOptions, ExecutionConfig, ExecutionContext,
40+
AvroReadOptions, CsvReadOptions, SessionConfig, SessionContext,
3941
};
4042
use datafusion::sql::parser::{DFParser, FileType, Statement as DFStatement};
4143

@@ -64,26 +66,81 @@ impl BallistaContextState {
6466
}
6567
}
6668

69+
pub fn config(&self) -> &BallistaConfig {
70+
&self.config
71+
}
72+
}
73+
74+
pub struct BallistaContext {
75+
state: Arc<Mutex<BallistaContextState>>,
76+
context: Arc<SessionContext>,
77+
}
78+
79+
impl BallistaContext {
80+
/// Create a context for executing queries against a remote Ballista scheduler instance
81+
pub async fn remote(host: &str, port: u16, config: &BallistaConfig) -> Result<Self> {
82+
let state = BallistaContextState::new(host.to_owned(), port, config);
83+
let scheduler_url =
84+
format!("http://{}:{}", &state.scheduler_host, state.scheduler_port);
85+
info!(
86+
"Connecting to Ballista scheduler at {}",
87+
scheduler_url.clone()
88+
);
89+
let mut scheduler = SchedulerGrpcClient::connect(scheduler_url.clone())
90+
.await
91+
.map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?;
92+
93+
let remote_session_id = scheduler
94+
.execute_query(ExecuteQueryParams {
95+
query: None,
96+
settings: config
97+
.settings()
98+
.iter()
99+
.map(|(k, v)| KeyValuePair {
100+
key: k.to_owned(),
101+
value: v.to_owned(),
102+
})
103+
.collect::<Vec<_>>(),
104+
optional_session_id: None,
105+
})
106+
.await
107+
.map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?
108+
.into_inner()
109+
.session_id;
110+
111+
info!(
112+
"Server side SessionContext created with Session id: {}",
113+
remote_session_id
114+
);
115+
116+
let ctx = {
117+
create_df_ctx_with_ballista_query_planner::<LogicalPlanNode>(
118+
scheduler_url,
119+
remote_session_id,
120+
state.config(),
121+
)
122+
};
123+
124+
Ok(Self {
125+
state: Arc::new(Mutex::new(state)),
126+
context: Arc::new(ctx),
127+
})
128+
}
129+
67130
#[cfg(feature = "standalone")]
68-
pub async fn new_standalone(
131+
pub async fn standalone(
69132
config: &BallistaConfig,
70133
concurrent_tasks: usize,
71134
) -> ballista_core::error::Result<Self> {
72-
use ballista_core::serde::protobuf::scheduler_grpc_client::SchedulerGrpcClient;
73135
use ballista_core::serde::protobuf::PhysicalPlanNode;
74136
use ballista_core::serde::BallistaCodec;
75137

76138
log::info!("Running in local mode. Scheduler will be run in-proc");
77139

78140
let addr = ballista_scheduler::standalone::new_standalone_scheduler().await?;
79-
80-
let scheduler = loop {
81-
match SchedulerGrpcClient::connect(format!(
82-
"http://localhost:{}",
83-
addr.port()
84-
))
85-
.await
86-
{
141+
let scheduler_url = format!("http://localhost:{}", addr.port());
142+
let mut scheduler = loop {
143+
match SchedulerGrpcClient::connect(scheduler_url.clone()).await {
87144
Err(_) => {
88145
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
89146
log::info!("Attempting to connect to in-proc scheduler...");
@@ -92,6 +149,37 @@ impl BallistaContextState {
92149
}
93150
};
94151

152+
let remote_session_id = scheduler
153+
.execute_query(ExecuteQueryParams {
154+
query: None,
155+
settings: config
156+
.settings()
157+
.iter()
158+
.map(|(k, v)| KeyValuePair {
159+
key: k.to_owned(),
160+
value: v.to_owned(),
161+
})
162+
.collect::<Vec<_>>(),
163+
optional_session_id: None,
164+
})
165+
.await
166+
.map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?
167+
.into_inner()
168+
.session_id;
169+
170+
info!(
171+
"Server side SessionContext created with Session id: {}",
172+
remote_session_id
173+
);
174+
175+
let ctx = {
176+
create_df_ctx_with_ballista_query_planner::<LogicalPlanNode>(
177+
scheduler_url,
178+
remote_session_id,
179+
config,
180+
)
181+
};
182+
95183
let default_codec: BallistaCodec<LogicalPlanNode, PhysicalPlanNode> =
96184
BallistaCodec::default();
97185

@@ -102,43 +190,12 @@ impl BallistaContextState {
102190
)
103191
.await?;
104192

105-
Ok(Self {
106-
config: config.clone(),
107-
scheduler_host: "localhost".to_string(),
108-
scheduler_port: addr.port(),
109-
tables: HashMap::new(),
110-
})
111-
}
112-
113-
pub fn config(&self) -> &BallistaConfig {
114-
&self.config
115-
}
116-
}
117-
118-
pub struct BallistaContext {
119-
state: Arc<Mutex<BallistaContextState>>,
120-
}
121-
122-
impl BallistaContext {
123-
/// Create a context for executing queries against a remote Ballista scheduler instance
124-
pub fn remote(host: &str, port: u16, config: &BallistaConfig) -> Self {
125-
let state = BallistaContextState::new(host.to_owned(), port, config);
126-
127-
Self {
128-
state: Arc::new(Mutex::new(state)),
129-
}
130-
}
131-
132-
#[cfg(feature = "standalone")]
133-
pub async fn standalone(
134-
config: &BallistaConfig,
135-
concurrent_tasks: usize,
136-
) -> ballista_core::error::Result<Self> {
137193
let state =
138-
BallistaContextState::new_standalone(config, concurrent_tasks).await?;
194+
BallistaContextState::new("localhost".to_string(), addr.port(), config);
139195

140196
Ok(Self {
141197
state: Arc::new(Mutex::new(state)),
198+
context: Arc::new(ctx),
142199
})
143200
}
144201

@@ -154,15 +211,10 @@ impl BallistaContext {
154211
let path = fs::canonicalize(&path)?;
155212

156213
// use local DataFusion context for now but later this might call the scheduler
157-
let mut ctx = {
158-
let guard = self.state.lock();
159-
create_df_ctx_with_ballista_query_planner::<LogicalPlanNode>(
160-
&guard.scheduler_host,
161-
guard.scheduler_port,
162-
guard.config(),
163-
)
164-
};
165-
let df = ctx.read_avro(path.to_str().unwrap(), options).await?;
214+
let df = self
215+
.context
216+
.read_avro(path.to_str().unwrap(), options)
217+
.await?;
166218
Ok(df)
167219
}
168220

@@ -174,15 +226,7 @@ impl BallistaContext {
174226
let path = fs::canonicalize(&path)?;
175227

176228
// use local DataFusion context for now but later this might call the scheduler
177-
let mut ctx = {
178-
let guard = self.state.lock();
179-
create_df_ctx_with_ballista_query_planner::<LogicalPlanNode>(
180-
&guard.scheduler_host,
181-
guard.scheduler_port,
182-
guard.config(),
183-
)
184-
};
185-
let df = ctx.read_parquet(path.to_str().unwrap()).await?;
229+
let df = self.context.read_parquet(path.to_str().unwrap()).await?;
186230
Ok(df)
187231
}
188232

@@ -198,15 +242,10 @@ impl BallistaContext {
198242
let path = fs::canonicalize(&path)?;
199243

200244
// use local DataFusion context for now but later this might call the scheduler
201-
let mut ctx = {
202-
let guard = self.state.lock();
203-
create_df_ctx_with_ballista_query_planner::<LogicalPlanNode>(
204-
&guard.scheduler_host,
205-
guard.scheduler_port,
206-
guard.config(),
207-
)
208-
};
209-
let df = ctx.read_csv(path.to_str().unwrap(), options).await?;
245+
let df = self
246+
.context
247+
.read_csv(path.to_str().unwrap(), options)
248+
.await?;
210249
Ok(df)
211250
}
212251

@@ -292,34 +331,30 @@ impl BallistaContext {
292331
/// This method is `async` because queries of type `CREATE EXTERNAL TABLE`
293332
/// might require the schema to be inferred.
294333
pub async fn sql(&self, sql: &str) -> Result<Arc<dyn DataFrame>> {
295-
let mut ctx = {
296-
let state = self.state.lock();
297-
create_df_ctx_with_ballista_query_planner::<LogicalPlanNode>(
298-
&state.scheduler_host,
299-
state.scheduler_port,
300-
state.config(),
301-
)
302-
};
303-
334+
let mut ctx = self.context.clone();
304335
let is_show = self.is_show_statement(sql).await?;
305336
// the show tables、 show columns sql can not run at scheduler because the tables is store at client
306337
if is_show {
307338
let state = self.state.lock();
308-
ctx = ExecutionContext::with_config(
309-
ExecutionConfig::new().with_information_schema(
339+
ctx = Arc::new(SessionContext::with_config(
340+
SessionConfig::new().with_information_schema(
310341
state.config.default_with_information_schema(),
311342
),
312-
);
343+
));
313344
}
314345

315346
// register tables with DataFusion context
316347
{
317348
let state = self.state.lock();
318349
for (name, prov) in &state.tables {
319-
ctx.register_table(
320-
TableReference::Bare { table: name },
321-
Arc::clone(prov),
322-
)?;
350+
// ctx is shared between queries, check table exists or not before register
351+
let table_ref = TableReference::Bare { table: name };
352+
if !ctx.table_exist(table_ref)? {
353+
ctx.register_table(
354+
TableReference::Bare { table: name },
355+
Arc::clone(prov),
356+
)?;
357+
}
323358
}
324359
}
325360

@@ -342,16 +377,16 @@ impl BallistaContext {
342377
.has_header(*has_header),
343378
)
344379
.await?;
345-
Ok(Arc::new(DataFrameImpl::new(ctx.state, &plan)))
380+
Ok(Arc::new(DataFrameImpl::new(ctx.state.clone(), &plan)))
346381
}
347382
FileType::Parquet => {
348383
self.register_parquet(name, location).await?;
349-
Ok(Arc::new(DataFrameImpl::new(ctx.state, &plan)))
384+
Ok(Arc::new(DataFrameImpl::new(ctx.state.clone(), &plan)))
350385
}
351386
FileType::Avro => {
352387
self.register_avro(name, location, AvroReadOptions::default())
353388
.await?;
354-
Ok(Arc::new(DataFrameImpl::new(ctx.state, &plan)))
389+
Ok(Arc::new(DataFrameImpl::new(ctx.state.clone(), &plan)))
355390
}
356391
_ => Err(DataFusionError::NotImplemented(format!(
357392
"Unsupported file type {:?}.",
@@ -476,17 +511,13 @@ mod tests {
476511
use datafusion::arrow::datatypes::Schema;
477512
use datafusion::arrow::util::pretty;
478513
use datafusion::datasource::file_format::csv::CsvFormat;
479-
use datafusion::datasource::file_format::parquet::ParquetFormat;
480514
use datafusion::datasource::listing::{
481515
ListingOptions, ListingTable, ListingTableConfig,
482516
};
483517

484518
use ballista_core::config::{
485519
BallistaConfigBuilder, BALLISTA_WITH_INFORMATION_SCHEMA,
486520
};
487-
use std::fs::File;
488-
use std::io::Write;
489-
use tempfile::TempDir;
490521
let config = BallistaConfigBuilder::default()
491522
.set(BALLISTA_WITH_INFORMATION_SCHEMA, "true")
492523
.build()

ballista/rust/client/src/prelude.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
2020
pub use crate::context::BallistaContext;
2121
pub use ballista_core::config::BallistaConfig;
22+
pub use ballista_core::config::BALLISTA_DEFAULT_BATCH_SIZE;
2223
pub use ballista_core::config::BALLISTA_DEFAULT_SHUFFLE_PARTITIONS;
2324
pub use ballista_core::error::{BallistaError, Result};
2425

0 commit comments

Comments
 (0)