17
17
18
18
//! Distributed execution context.
19
19
20
+ use log:: info;
20
21
use parking_lot:: Mutex ;
21
22
use sqlparser:: ast:: Statement ;
22
23
use std:: collections:: HashMap ;
@@ -25,7 +26,8 @@ use std::path::PathBuf;
25
26
use std:: sync:: Arc ;
26
27
27
28
use ballista_core:: config:: BallistaConfig ;
28
- use ballista_core:: serde:: protobuf:: LogicalPlanNode ;
29
+ use ballista_core:: serde:: protobuf:: scheduler_grpc_client:: SchedulerGrpcClient ;
30
+ use ballista_core:: serde:: protobuf:: { ExecuteQueryParams , KeyValuePair , LogicalPlanNode } ;
29
31
use ballista_core:: utils:: create_df_ctx_with_ballista_query_planner;
30
32
31
33
use datafusion:: catalog:: TableReference ;
@@ -35,7 +37,7 @@ use datafusion::error::{DataFusionError, Result};
35
37
use datafusion:: execution:: dataframe_impl:: DataFrameImpl ;
36
38
use datafusion:: logical_plan:: { CreateExternalTable , LogicalPlan , TableScan } ;
37
39
use datafusion:: prelude:: {
38
- AvroReadOptions , CsvReadOptions , ExecutionConfig , ExecutionContext ,
40
+ AvroReadOptions , CsvReadOptions , SessionConfig , SessionContext ,
39
41
} ;
40
42
use datafusion:: sql:: parser:: { DFParser , FileType , Statement as DFStatement } ;
41
43
@@ -64,26 +66,81 @@ impl BallistaContextState {
64
66
}
65
67
}
66
68
69
+ pub fn config ( & self ) -> & BallistaConfig {
70
+ & self . config
71
+ }
72
+ }
73
+
74
+ pub struct BallistaContext {
75
+ state : Arc < Mutex < BallistaContextState > > ,
76
+ context : Arc < SessionContext > ,
77
+ }
78
+
79
+ impl BallistaContext {
80
+ /// Create a context for executing queries against a remote Ballista scheduler instance
81
+ pub async fn remote ( host : & str , port : u16 , config : & BallistaConfig ) -> Result < Self > {
82
+ let state = BallistaContextState :: new ( host. to_owned ( ) , port, config) ;
83
+ let scheduler_url =
84
+ format ! ( "http://{}:{}" , & state. scheduler_host, state. scheduler_port) ;
85
+ info ! (
86
+ "Connecting to Ballista scheduler at {}" ,
87
+ scheduler_url. clone( )
88
+ ) ;
89
+ let mut scheduler = SchedulerGrpcClient :: connect ( scheduler_url. clone ( ) )
90
+ . await
91
+ . map_err ( |e| DataFusionError :: Execution ( format ! ( "{:?}" , e) ) ) ?;
92
+
93
+ let remote_session_id = scheduler
94
+ . execute_query ( ExecuteQueryParams {
95
+ query : None ,
96
+ settings : config
97
+ . settings ( )
98
+ . iter ( )
99
+ . map ( |( k, v) | KeyValuePair {
100
+ key : k. to_owned ( ) ,
101
+ value : v. to_owned ( ) ,
102
+ } )
103
+ . collect :: < Vec < _ > > ( ) ,
104
+ optional_session_id : None ,
105
+ } )
106
+ . await
107
+ . map_err ( |e| DataFusionError :: Execution ( format ! ( "{:?}" , e) ) ) ?
108
+ . into_inner ( )
109
+ . session_id ;
110
+
111
+ info ! (
112
+ "Server side SessionContext created with Session id: {}" ,
113
+ remote_session_id
114
+ ) ;
115
+
116
+ let ctx = {
117
+ create_df_ctx_with_ballista_query_planner :: < LogicalPlanNode > (
118
+ scheduler_url,
119
+ remote_session_id,
120
+ state. config ( ) ,
121
+ )
122
+ } ;
123
+
124
+ Ok ( Self {
125
+ state : Arc :: new ( Mutex :: new ( state) ) ,
126
+ context : Arc :: new ( ctx) ,
127
+ } )
128
+ }
129
+
67
130
#[ cfg( feature = "standalone" ) ]
68
- pub async fn new_standalone (
131
+ pub async fn standalone (
69
132
config : & BallistaConfig ,
70
133
concurrent_tasks : usize ,
71
134
) -> ballista_core:: error:: Result < Self > {
72
- use ballista_core:: serde:: protobuf:: scheduler_grpc_client:: SchedulerGrpcClient ;
73
135
use ballista_core:: serde:: protobuf:: PhysicalPlanNode ;
74
136
use ballista_core:: serde:: BallistaCodec ;
75
137
76
138
log:: info!( "Running in local mode. Scheduler will be run in-proc" ) ;
77
139
78
140
let addr = ballista_scheduler:: standalone:: new_standalone_scheduler ( ) . await ?;
79
-
80
- let scheduler = loop {
81
- match SchedulerGrpcClient :: connect ( format ! (
82
- "http://localhost:{}" ,
83
- addr. port( )
84
- ) )
85
- . await
86
- {
141
+ let scheduler_url = format ! ( "http://localhost:{}" , addr. port( ) ) ;
142
+ let mut scheduler = loop {
143
+ match SchedulerGrpcClient :: connect ( scheduler_url. clone ( ) ) . await {
87
144
Err ( _) => {
88
145
tokio:: time:: sleep ( tokio:: time:: Duration :: from_millis ( 100 ) ) . await ;
89
146
log:: info!( "Attempting to connect to in-proc scheduler..." ) ;
@@ -92,6 +149,37 @@ impl BallistaContextState {
92
149
}
93
150
} ;
94
151
152
+ let remote_session_id = scheduler
153
+ . execute_query ( ExecuteQueryParams {
154
+ query : None ,
155
+ settings : config
156
+ . settings ( )
157
+ . iter ( )
158
+ . map ( |( k, v) | KeyValuePair {
159
+ key : k. to_owned ( ) ,
160
+ value : v. to_owned ( ) ,
161
+ } )
162
+ . collect :: < Vec < _ > > ( ) ,
163
+ optional_session_id : None ,
164
+ } )
165
+ . await
166
+ . map_err ( |e| DataFusionError :: Execution ( format ! ( "{:?}" , e) ) ) ?
167
+ . into_inner ( )
168
+ . session_id ;
169
+
170
+ info ! (
171
+ "Server side SessionContext created with Session id: {}" ,
172
+ remote_session_id
173
+ ) ;
174
+
175
+ let ctx = {
176
+ create_df_ctx_with_ballista_query_planner :: < LogicalPlanNode > (
177
+ scheduler_url,
178
+ remote_session_id,
179
+ config,
180
+ )
181
+ } ;
182
+
95
183
let default_codec: BallistaCodec < LogicalPlanNode , PhysicalPlanNode > =
96
184
BallistaCodec :: default ( ) ;
97
185
@@ -102,43 +190,12 @@ impl BallistaContextState {
102
190
)
103
191
. await ?;
104
192
105
- Ok ( Self {
106
- config : config. clone ( ) ,
107
- scheduler_host : "localhost" . to_string ( ) ,
108
- scheduler_port : addr. port ( ) ,
109
- tables : HashMap :: new ( ) ,
110
- } )
111
- }
112
-
113
- pub fn config ( & self ) -> & BallistaConfig {
114
- & self . config
115
- }
116
- }
117
-
118
- pub struct BallistaContext {
119
- state : Arc < Mutex < BallistaContextState > > ,
120
- }
121
-
122
- impl BallistaContext {
123
- /// Create a context for executing queries against a remote Ballista scheduler instance
124
- pub fn remote ( host : & str , port : u16 , config : & BallistaConfig ) -> Self {
125
- let state = BallistaContextState :: new ( host. to_owned ( ) , port, config) ;
126
-
127
- Self {
128
- state : Arc :: new ( Mutex :: new ( state) ) ,
129
- }
130
- }
131
-
132
- #[ cfg( feature = "standalone" ) ]
133
- pub async fn standalone (
134
- config : & BallistaConfig ,
135
- concurrent_tasks : usize ,
136
- ) -> ballista_core:: error:: Result < Self > {
137
193
let state =
138
- BallistaContextState :: new_standalone ( config , concurrent_tasks ) . await ? ;
194
+ BallistaContextState :: new ( "localhost" . to_string ( ) , addr . port ( ) , config ) ;
139
195
140
196
Ok ( Self {
141
197
state : Arc :: new ( Mutex :: new ( state) ) ,
198
+ context : Arc :: new ( ctx) ,
142
199
} )
143
200
}
144
201
@@ -154,15 +211,10 @@ impl BallistaContext {
154
211
let path = fs:: canonicalize ( & path) ?;
155
212
156
213
// use local DataFusion context for now but later this might call the scheduler
157
- let mut ctx = {
158
- let guard = self . state . lock ( ) ;
159
- create_df_ctx_with_ballista_query_planner :: < LogicalPlanNode > (
160
- & guard. scheduler_host ,
161
- guard. scheduler_port ,
162
- guard. config ( ) ,
163
- )
164
- } ;
165
- let df = ctx. read_avro ( path. to_str ( ) . unwrap ( ) , options) . await ?;
214
+ let df = self
215
+ . context
216
+ . read_avro ( path. to_str ( ) . unwrap ( ) , options)
217
+ . await ?;
166
218
Ok ( df)
167
219
}
168
220
@@ -174,15 +226,7 @@ impl BallistaContext {
174
226
let path = fs:: canonicalize ( & path) ?;
175
227
176
228
// use local DataFusion context for now but later this might call the scheduler
177
- let mut ctx = {
178
- let guard = self . state . lock ( ) ;
179
- create_df_ctx_with_ballista_query_planner :: < LogicalPlanNode > (
180
- & guard. scheduler_host ,
181
- guard. scheduler_port ,
182
- guard. config ( ) ,
183
- )
184
- } ;
185
- let df = ctx. read_parquet ( path. to_str ( ) . unwrap ( ) ) . await ?;
229
+ let df = self . context . read_parquet ( path. to_str ( ) . unwrap ( ) ) . await ?;
186
230
Ok ( df)
187
231
}
188
232
@@ -198,15 +242,10 @@ impl BallistaContext {
198
242
let path = fs:: canonicalize ( & path) ?;
199
243
200
244
// use local DataFusion context for now but later this might call the scheduler
201
- let mut ctx = {
202
- let guard = self . state . lock ( ) ;
203
- create_df_ctx_with_ballista_query_planner :: < LogicalPlanNode > (
204
- & guard. scheduler_host ,
205
- guard. scheduler_port ,
206
- guard. config ( ) ,
207
- )
208
- } ;
209
- let df = ctx. read_csv ( path. to_str ( ) . unwrap ( ) , options) . await ?;
245
+ let df = self
246
+ . context
247
+ . read_csv ( path. to_str ( ) . unwrap ( ) , options)
248
+ . await ?;
210
249
Ok ( df)
211
250
}
212
251
@@ -292,34 +331,30 @@ impl BallistaContext {
292
331
/// This method is `async` because queries of type `CREATE EXTERNAL TABLE`
293
332
/// might require the schema to be inferred.
294
333
pub async fn sql ( & self , sql : & str ) -> Result < Arc < dyn DataFrame > > {
295
- let mut ctx = {
296
- let state = self . state . lock ( ) ;
297
- create_df_ctx_with_ballista_query_planner :: < LogicalPlanNode > (
298
- & state. scheduler_host ,
299
- state. scheduler_port ,
300
- state. config ( ) ,
301
- )
302
- } ;
303
-
334
+ let mut ctx = self . context . clone ( ) ;
304
335
let is_show = self . is_show_statement ( sql) . await ?;
305
336
// the show tables、 show columns sql can not run at scheduler because the tables is store at client
306
337
if is_show {
307
338
let state = self . state . lock ( ) ;
308
- ctx = ExecutionContext :: with_config (
309
- ExecutionConfig :: new ( ) . with_information_schema (
339
+ ctx = Arc :: new ( SessionContext :: with_config (
340
+ SessionConfig :: new ( ) . with_information_schema (
310
341
state. config . default_with_information_schema ( ) ,
311
342
) ,
312
- ) ;
343
+ ) ) ;
313
344
}
314
345
315
346
// register tables with DataFusion context
316
347
{
317
348
let state = self . state . lock ( ) ;
318
349
for ( name, prov) in & state. tables {
319
- ctx. register_table (
320
- TableReference :: Bare { table : name } ,
321
- Arc :: clone ( prov) ,
322
- ) ?;
350
+ // ctx is shared between queries, check table exists or not before register
351
+ let table_ref = TableReference :: Bare { table : name } ;
352
+ if !ctx. table_exist ( table_ref) ? {
353
+ ctx. register_table (
354
+ TableReference :: Bare { table : name } ,
355
+ Arc :: clone ( prov) ,
356
+ ) ?;
357
+ }
323
358
}
324
359
}
325
360
@@ -342,16 +377,16 @@ impl BallistaContext {
342
377
. has_header ( * has_header) ,
343
378
)
344
379
. await ?;
345
- Ok ( Arc :: new ( DataFrameImpl :: new ( ctx. state , & plan) ) )
380
+ Ok ( Arc :: new ( DataFrameImpl :: new ( ctx. state . clone ( ) , & plan) ) )
346
381
}
347
382
FileType :: Parquet => {
348
383
self . register_parquet ( name, location) . await ?;
349
- Ok ( Arc :: new ( DataFrameImpl :: new ( ctx. state , & plan) ) )
384
+ Ok ( Arc :: new ( DataFrameImpl :: new ( ctx. state . clone ( ) , & plan) ) )
350
385
}
351
386
FileType :: Avro => {
352
387
self . register_avro ( name, location, AvroReadOptions :: default ( ) )
353
388
. await ?;
354
- Ok ( Arc :: new ( DataFrameImpl :: new ( ctx. state , & plan) ) )
389
+ Ok ( Arc :: new ( DataFrameImpl :: new ( ctx. state . clone ( ) , & plan) ) )
355
390
}
356
391
_ => Err ( DataFusionError :: NotImplemented ( format ! (
357
392
"Unsupported file type {:?}." ,
@@ -476,17 +511,13 @@ mod tests {
476
511
use datafusion:: arrow:: datatypes:: Schema ;
477
512
use datafusion:: arrow:: util:: pretty;
478
513
use datafusion:: datasource:: file_format:: csv:: CsvFormat ;
479
- use datafusion:: datasource:: file_format:: parquet:: ParquetFormat ;
480
514
use datafusion:: datasource:: listing:: {
481
515
ListingOptions , ListingTable , ListingTableConfig ,
482
516
} ;
483
517
484
518
use ballista_core:: config:: {
485
519
BallistaConfigBuilder , BALLISTA_WITH_INFORMATION_SCHEMA ,
486
520
} ;
487
- use std:: fs:: File ;
488
- use std:: io:: Write ;
489
- use tempfile:: TempDir ;
490
521
let config = BallistaConfigBuilder :: default ( )
491
522
. set ( BALLISTA_WITH_INFORMATION_SCHEMA , "true" )
492
523
. build ( )
0 commit comments