Skip to content

Commit 43ab96e

Browse files
committed
Refactor SessionContext, BallistaContext to support multi-tenancy configurations - Part 3
resolve review comments
1 parent 5936edc commit 43ab96e

File tree

91 files changed

+1573
-1036
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

91 files changed

+1573
-1036
lines changed

ballista-examples/src/bin/ballista-dataframe.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ async fn main() -> Result<()> {
2525
let config = BallistaConfig::builder()
2626
.set("ballista.shuffle.partitions", "4")
2727
.build()?;
28-
let ctx = BallistaContext::remote("localhost", 50050, &config);
28+
let ctx = BallistaContext::remote("localhost", 50050, &config).await?;
2929

3030
let testdata = datafusion::test_util::parquet_test_data();
3131

ballista-examples/src/bin/ballista-sql.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ async fn main() -> Result<()> {
2525
let config = BallistaConfig::builder()
2626
.set("ballista.shuffle.partitions", "4")
2727
.build()?;
28-
let ctx = BallistaContext::remote("localhost", 50050, &config);
28+
let ctx = BallistaContext::remote("localhost", 50050, &config).await?;
2929

3030
let testdata = datafusion::test_util::arrow_test_data();
3131

ballista/rust/client/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ async fn main() -> Result<()> {
106106
.build()?;
107107
108108
// connect to Ballista scheduler
109-
let ctx = BallistaContext::remote("localhost", 50050, &config);
109+
let ctx = BallistaContext::remote("localhost", 50050, &config).await?;
110110
111111
// register csv file with the execution context
112112
ctx.register_csv(

ballista/rust/client/src/context.rs

Lines changed: 123 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
//! Distributed execution context.
1919
20+
use log::info;
2021
use parking_lot::Mutex;
2122
use sqlparser::ast::Statement;
2223
use std::collections::HashMap;
@@ -25,7 +26,8 @@ use std::path::PathBuf;
2526
use std::sync::Arc;
2627

2728
use ballista_core::config::BallistaConfig;
28-
use ballista_core::serde::protobuf::LogicalPlanNode;
29+
use ballista_core::serde::protobuf::scheduler_grpc_client::SchedulerGrpcClient;
30+
use ballista_core::serde::protobuf::{ExecuteQueryParams, KeyValuePair, LogicalPlanNode};
2931
use ballista_core::utils::create_df_ctx_with_ballista_query_planner;
3032

3133
use datafusion::catalog::TableReference;
@@ -63,26 +65,86 @@ impl BallistaContextState {
6365
}
6466
}
6567

68+
pub fn config(&self) -> &BallistaConfig {
69+
&self.config
70+
}
71+
}
72+
73+
pub struct BallistaContext {
74+
state: Arc<Mutex<BallistaContextState>>,
75+
context: Arc<SessionContext>,
76+
}
77+
78+
impl BallistaContext {
79+
/// Create a context for executing queries against a remote Ballista scheduler instance
80+
pub async fn remote(
81+
host: &str,
82+
port: u16,
83+
config: &BallistaConfig,
84+
) -> ballista_core::error::Result<Self> {
85+
let state = BallistaContextState::new(host.to_owned(), port, config);
86+
87+
let scheduler_url =
88+
format!("http://{}:{}", &state.scheduler_host, state.scheduler_port);
89+
info!(
90+
"Connecting to Ballista scheduler at {}",
91+
scheduler_url.clone()
92+
);
93+
let mut scheduler = SchedulerGrpcClient::connect(scheduler_url.clone())
94+
.await
95+
.map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?;
96+
97+
let remote_session_id = scheduler
98+
.execute_query(ExecuteQueryParams {
99+
query: None,
100+
settings: config
101+
.settings()
102+
.iter()
103+
.map(|(k, v)| KeyValuePair {
104+
key: k.to_owned(),
105+
value: v.to_owned(),
106+
})
107+
.collect::<Vec<_>>(),
108+
optional_session_id: None,
109+
})
110+
.await
111+
.map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?
112+
.into_inner()
113+
.session_id;
114+
115+
info!(
116+
"Server side SessionContext created with session id: {}",
117+
remote_session_id
118+
);
119+
120+
let ctx = {
121+
create_df_ctx_with_ballista_query_planner::<LogicalPlanNode>(
122+
scheduler_url,
123+
remote_session_id,
124+
state.config(),
125+
)
126+
};
127+
128+
Ok(Self {
129+
state: Arc::new(Mutex::new(state)),
130+
context: Arc::new(ctx),
131+
})
132+
}
133+
66134
#[cfg(feature = "standalone")]
67-
pub async fn new_standalone(
135+
pub async fn standalone(
68136
config: &BallistaConfig,
69137
concurrent_tasks: usize,
70138
) -> ballista_core::error::Result<Self> {
71-
use ballista_core::serde::protobuf::scheduler_grpc_client::SchedulerGrpcClient;
72139
use ballista_core::serde::protobuf::PhysicalPlanNode;
73140
use ballista_core::serde::BallistaCodec;
74141

75142
log::info!("Running in local mode. Scheduler will be run in-proc");
76143

77144
let addr = ballista_scheduler::standalone::new_standalone_scheduler().await?;
78-
79-
let scheduler = loop {
80-
match SchedulerGrpcClient::connect(format!(
81-
"http://localhost:{}",
82-
addr.port()
83-
))
84-
.await
85-
{
145+
let scheduler_url = format!("http://localhost:{}", addr.port());
146+
let mut scheduler = loop {
147+
match SchedulerGrpcClient::connect(scheduler_url.clone()).await {
86148
Err(_) => {
87149
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
88150
log::info!("Attempting to connect to in-proc scheduler...");
@@ -91,6 +153,37 @@ impl BallistaContextState {
91153
}
92154
};
93155

156+
let remote_session_id = scheduler
157+
.execute_query(ExecuteQueryParams {
158+
query: None,
159+
settings: config
160+
.settings()
161+
.iter()
162+
.map(|(k, v)| KeyValuePair {
163+
key: k.to_owned(),
164+
value: v.to_owned(),
165+
})
166+
.collect::<Vec<_>>(),
167+
optional_session_id: None,
168+
})
169+
.await
170+
.map_err(|e| DataFusionError::Execution(format!("{:?}", e)))?
171+
.into_inner()
172+
.session_id;
173+
174+
info!(
175+
"Server side SessionContext created with session id: {}",
176+
remote_session_id
177+
);
178+
179+
let ctx = {
180+
create_df_ctx_with_ballista_query_planner::<LogicalPlanNode>(
181+
scheduler_url,
182+
remote_session_id,
183+
config,
184+
)
185+
};
186+
94187
let default_codec: BallistaCodec<LogicalPlanNode, PhysicalPlanNode> =
95188
BallistaCodec::default();
96189

@@ -101,43 +194,12 @@ impl BallistaContextState {
101194
)
102195
.await?;
103196

104-
Ok(Self {
105-
config: config.clone(),
106-
scheduler_host: "localhost".to_string(),
107-
scheduler_port: addr.port(),
108-
tables: HashMap::new(),
109-
})
110-
}
111-
112-
pub fn config(&self) -> &BallistaConfig {
113-
&self.config
114-
}
115-
}
116-
117-
pub struct BallistaContext {
118-
state: Arc<Mutex<BallistaContextState>>,
119-
}
120-
121-
impl BallistaContext {
122-
/// Create a context for executing queries against a remote Ballista scheduler instance
123-
pub fn remote(host: &str, port: u16, config: &BallistaConfig) -> Self {
124-
let state = BallistaContextState::new(host.to_owned(), port, config);
125-
126-
Self {
127-
state: Arc::new(Mutex::new(state)),
128-
}
129-
}
130-
131-
#[cfg(feature = "standalone")]
132-
pub async fn standalone(
133-
config: &BallistaConfig,
134-
concurrent_tasks: usize,
135-
) -> ballista_core::error::Result<Self> {
136197
let state =
137-
BallistaContextState::new_standalone(config, concurrent_tasks).await?;
198+
BallistaContextState::new("localhost".to_string(), addr.port(), config);
138199

139200
Ok(Self {
140201
state: Arc::new(Mutex::new(state)),
202+
context: Arc::new(ctx),
141203
})
142204
}
143205

@@ -152,15 +214,7 @@ impl BallistaContext {
152214
let path = PathBuf::from(path);
153215
let path = fs::canonicalize(&path)?;
154216

155-
// use local DataFusion context for now but later this might call the scheduler
156-
let mut ctx = {
157-
let guard = self.state.lock();
158-
create_df_ctx_with_ballista_query_planner::<LogicalPlanNode>(
159-
&guard.scheduler_host,
160-
guard.scheduler_port,
161-
guard.config(),
162-
)
163-
};
217+
let ctx = self.context.clone();
164218
let df = ctx.read_avro(path.to_str().unwrap(), options).await?;
165219
Ok(df)
166220
}
@@ -172,15 +226,7 @@ impl BallistaContext {
172226
let path = PathBuf::from(path);
173227
let path = fs::canonicalize(&path)?;
174228

175-
// use local DataFusion context for now but later this might call the scheduler
176-
let mut ctx = {
177-
let guard = self.state.lock();
178-
create_df_ctx_with_ballista_query_planner::<LogicalPlanNode>(
179-
&guard.scheduler_host,
180-
guard.scheduler_port,
181-
guard.config(),
182-
)
183-
};
229+
let ctx = self.context.clone();
184230
let df = ctx.read_parquet(path.to_str().unwrap()).await?;
185231
Ok(df)
186232
}
@@ -196,15 +242,7 @@ impl BallistaContext {
196242
let path = PathBuf::from(path);
197243
let path = fs::canonicalize(&path)?;
198244

199-
// use local DataFusion context for now but later this might call the scheduler
200-
let mut ctx = {
201-
let guard = self.state.lock();
202-
create_df_ctx_with_ballista_query_planner::<LogicalPlanNode>(
203-
&guard.scheduler_host,
204-
guard.scheduler_port,
205-
guard.config(),
206-
)
207-
};
245+
let ctx = self.context.clone();
208246
let df = ctx.read_csv(path.to_str().unwrap(), options).await?;
209247
Ok(df)
210248
}
@@ -291,34 +329,31 @@ impl BallistaContext {
291329
/// This method is `async` because queries of type `CREATE EXTERNAL TABLE`
292330
/// might require the schema to be inferred.
293331
pub async fn sql(&self, sql: &str) -> Result<Arc<DataFrame>> {
294-
let mut ctx = {
295-
let state = self.state.lock();
296-
create_df_ctx_with_ballista_query_planner::<LogicalPlanNode>(
297-
&state.scheduler_host,
298-
state.scheduler_port,
299-
state.config(),
300-
)
301-
};
332+
let mut ctx = self.context.clone();
302333

303334
let is_show = self.is_show_statement(sql).await?;
304335
// `show tables` and `show columns` statements cannot run on the scheduler because the tables are stored on the client
305336
if is_show {
306337
let state = self.state.lock();
307-
ctx = SessionContext::with_config(
338+
ctx = Arc::new(SessionContext::with_config(
308339
SessionConfig::new().with_information_schema(
309340
state.config.default_with_information_schema(),
310341
),
311-
);
342+
));
312343
}
313344

314345
// register tables with DataFusion context
315346
{
316347
let state = self.state.lock();
317348
for (name, prov) in &state.tables {
318-
ctx.register_table(
319-
TableReference::Bare { table: name },
320-
Arc::clone(prov),
321-
)?;
349+
// ctx is shared between queries, so check whether the table already exists before registering it
350+
let table_ref = TableReference::Bare { table: name };
351+
if !ctx.table_exist(table_ref)? {
352+
ctx.register_table(
353+
TableReference::Bare { table: name },
354+
Arc::clone(prov),
355+
)?;
356+
}
322357
}
323358
}
324359

@@ -341,16 +376,16 @@ impl BallistaContext {
341376
.has_header(*has_header),
342377
)
343378
.await?;
344-
Ok(Arc::new(DataFrame::new(ctx.state, &plan)))
379+
Ok(Arc::new(DataFrame::new(ctx.state.clone(), &plan)))
345380
}
346381
FileType::Parquet => {
347382
self.register_parquet(name, location).await?;
348-
Ok(Arc::new(DataFrame::new(ctx.state, &plan)))
383+
Ok(Arc::new(DataFrame::new(ctx.state.clone(), &plan)))
349384
}
350385
FileType::Avro => {
351386
self.register_avro(name, location, AvroReadOptions::default())
352387
.await?;
353-
Ok(Arc::new(DataFrame::new(ctx.state, &plan)))
388+
Ok(Arc::new(DataFrame::new(ctx.state.clone(), &plan)))
354389
}
355390
_ => Err(DataFusionError::NotImplemented(format!(
356391
"Unsupported file type {:?}.",
@@ -475,17 +510,13 @@ mod tests {
475510
use datafusion::arrow::datatypes::Schema;
476511
use datafusion::arrow::util::pretty;
477512
use datafusion::datasource::file_format::csv::CsvFormat;
478-
use datafusion::datasource::file_format::parquet::ParquetFormat;
479513
use datafusion::datasource::listing::{
480514
ListingOptions, ListingTable, ListingTableConfig,
481515
};
482516

483517
use ballista_core::config::{
484518
BallistaConfigBuilder, BALLISTA_WITH_INFORMATION_SCHEMA,
485519
};
486-
use std::fs::File;
487-
use std::io::Write;
488-
use tempfile::TempDir;
489520
let config = BallistaConfigBuilder::default()
490521
.set(BALLISTA_WITH_INFORMATION_SCHEMA, "true")
491522
.build()

ballista/rust/client/src/prelude.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
2020
pub use crate::context::BallistaContext;
2121
pub use ballista_core::config::BallistaConfig;
22+
pub use ballista_core::config::BALLISTA_DEFAULT_BATCH_SIZE;
2223
pub use ballista_core::config::BALLISTA_DEFAULT_SHUFFLE_PARTITIONS;
2324
pub use ballista_core::error::{BallistaError, Result};
2425

ballista/rust/core/proto/ballista.proto

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,8 @@ message TaskDefinition {
712712
bytes plan = 2;
713713
// Output partition for shuffle writer
714714
PhysicalHashRepartition output_partitioning = 3;
715+
string session_id = 4;
716+
repeated KeyValuePair props = 5;
715717
}
716718

717719
message PollWorkResult {
@@ -757,7 +759,10 @@ message ExecuteQueryParams {
757759
bytes logical_plan = 1;
758760
string sql = 2;
759761
}
760-
repeated KeyValuePair settings = 3;
762+
oneof optional_session_id {
763+
string session_id = 3;
764+
}
765+
repeated KeyValuePair settings = 4;
761766
}
762767

763768
message ExecuteSqlParams {
@@ -766,6 +771,7 @@ message ExecuteSqlParams {
766771

767772
message ExecuteQueryResult {
768773
string job_id = 1;
774+
string session_id = 2;
769775
}
770776

771777
message GetJobStatusParams {

0 commit comments

Comments
 (0)