@@ -21,11 +21,12 @@ use futures::future::join_all;
21
21
use rand:: prelude:: * ;
22
22
use std:: ops:: Div ;
23
23
use std:: {
24
- fs,
24
+ fs:: { self , File } ,
25
+ io:: Write ,
25
26
iter:: Iterator ,
26
27
path:: { Path , PathBuf } ,
27
28
sync:: Arc ,
28
- time:: Instant ,
29
+ time:: { Instant , SystemTime } ,
29
30
} ;
30
31
31
32
use ballista:: context:: BallistaContext ;
@@ -42,6 +43,7 @@ use datafusion::prelude::*;
42
43
use datafusion:: {
43
44
arrow:: datatypes:: { DataType , Field , Schema } ,
44
45
datasource:: file_format:: { csv:: CsvFormat , FileFormat } ,
46
+ DATAFUSION_VERSION ,
45
47
} ;
46
48
use datafusion:: {
47
49
arrow:: record_batch:: RecordBatch , datasource:: file_format:: parquet:: ParquetFormat ,
@@ -56,6 +58,7 @@ use datafusion::{
56
58
57
59
use datafusion:: datasource:: file_format:: csv:: DEFAULT_CSV_EXTENSION ;
58
60
use datafusion:: datasource:: file_format:: parquet:: DEFAULT_PARQUET_EXTENSION ;
61
+ use serde:: Serialize ;
59
62
use structopt:: StructOpt ;
60
63
61
64
#[ cfg( feature = "snmalloc" ) ]
@@ -105,6 +108,10 @@ struct BallistaBenchmarkOpt {
105
108
/// Ballista executor port
106
109
#[ structopt( long = "port" ) ]
107
110
port : Option < u16 > ,
111
+
112
+ /// Path to output directory where JSON summary file should be written to
113
+ #[ structopt( parse( from_os_str) , short = "o" , long = "output" ) ]
114
+ output_path : Option < PathBuf > ,
108
115
}
109
116
110
117
#[ derive( Debug , StructOpt , Clone ) ]
@@ -140,6 +147,10 @@ struct DataFusionBenchmarkOpt {
140
147
/// Load the data into a MemTable before executing the query
141
148
#[ structopt( short = "m" , long = "mem-table" ) ]
142
149
mem_table : bool ,
150
+
151
+ /// Path to output directory where JSON summary file should be written to
152
+ #[ structopt( parse( from_os_str) , short = "o" , long = "output" ) ]
153
+ output_path : Option < PathBuf > ,
143
154
}
144
155
145
156
#[ derive( Debug , StructOpt , Clone ) ]
@@ -261,6 +272,7 @@ async fn main() -> Result<()> {
261
272
262
273
async fn benchmark_datafusion ( opt : DataFusionBenchmarkOpt ) -> Result < Vec < RecordBatch > > {
263
274
println ! ( "Running benchmarks with the following options: {:?}" , opt) ;
275
+ let mut benchmark_run = BenchmarkRun :: new ( opt. query ) ;
264
276
let config = ExecutionConfig :: new ( )
265
277
. with_target_partitions ( opt. partitions )
266
278
. with_batch_size ( opt. batch_size ) ;
@@ -302,17 +314,27 @@ async fn benchmark_datafusion(opt: DataFusionBenchmarkOpt) -> Result<Vec<RecordB
302
314
result = execute_query ( & mut ctx, & plan, opt. debug ) . await ?;
303
315
let elapsed = start. elapsed ( ) . as_secs_f64 ( ) * 1000.0 ;
304
316
millis. push ( elapsed as f64 ) ;
305
- println ! ( "Query {} iteration {} took {:.1} ms" , opt. query, i, elapsed) ;
317
+ let row_count = result. iter ( ) . map ( |b| b. num_rows ( ) ) . sum ( ) ;
318
+ println ! (
319
+ "Query {} iteration {} took {:.1} ms and returned {} rows" ,
320
+ opt. query, i, elapsed, row_count
321
+ ) ;
322
+ benchmark_run. add_result ( elapsed, row_count) ;
306
323
}
307
324
308
325
let avg = millis. iter ( ) . sum :: < f64 > ( ) / millis. len ( ) as f64 ;
309
326
println ! ( "Query {} avg time: {:.2} ms" , opt. query, avg) ;
310
327
328
+ if let Some ( path) = & opt. output_path {
329
+ write_summary_json ( & mut benchmark_run, path) ?;
330
+ }
331
+
311
332
Ok ( result)
312
333
}
313
334
314
335
async fn benchmark_ballista ( opt : BallistaBenchmarkOpt ) -> Result < ( ) > {
315
336
println ! ( "Running benchmarks with the following options: {:?}" , opt) ;
337
+ let mut benchmark_run = BenchmarkRun :: new ( opt. query ) ;
316
338
317
339
let config = BallistaConfig :: builder ( )
318
340
. set (
@@ -350,7 +372,12 @@ async fn benchmark_ballista(opt: BallistaBenchmarkOpt) -> Result<()> {
350
372
. unwrap ( ) ;
351
373
let elapsed = start. elapsed ( ) . as_secs_f64 ( ) * 1000.0 ;
352
374
millis. push ( elapsed as f64 ) ;
353
- println ! ( "Query {} iteration {} took {:.1} ms" , opt. query, i, elapsed) ;
375
+ let row_count = batches. iter ( ) . map ( |b| b. num_rows ( ) ) . sum ( ) ;
376
+ println ! (
377
+ "Query {} iteration {} took {:.1} ms and returned {} rows" ,
378
+ opt. query, i, elapsed, row_count
379
+ ) ;
380
+ benchmark_run. add_result ( elapsed, row_count) ;
354
381
if opt. debug {
355
382
pretty:: print_batches ( & batches) ?;
356
383
}
@@ -359,6 +386,27 @@ async fn benchmark_ballista(opt: BallistaBenchmarkOpt) -> Result<()> {
359
386
let avg = millis. iter ( ) . sum :: < f64 > ( ) / millis. len ( ) as f64 ;
360
387
println ! ( "Query {} avg time: {:.2} ms" , opt. query, avg) ;
361
388
389
+ if let Some ( path) = & opt. output_path {
390
+ write_summary_json ( & mut benchmark_run, path) ?;
391
+ }
392
+
393
+ Ok ( ( ) )
394
+ }
395
+
396
+ fn write_summary_json ( benchmark_run : & mut BenchmarkRun , path : & Path ) -> Result < ( ) > {
397
+ let json =
398
+ serde_json:: to_string_pretty ( & benchmark_run) . expect ( "summary is serializable" ) ;
399
+ let filename = format ! (
400
+ "tpch-q{}-{}.json" ,
401
+ benchmark_run. query, benchmark_run. start_time
402
+ ) ;
403
+ let path = path. join ( filename) ;
404
+ println ! (
405
+ "Writing summary file to {}" ,
406
+ path. as_os_str( ) . to_str( ) . unwrap( )
407
+ ) ;
408
+ let mut file = File :: create ( path) ?;
409
+ file. write_all ( json. as_bytes ( ) ) ?;
362
410
Ok ( ( ) )
363
411
}
364
412
@@ -779,6 +827,54 @@ fn get_schema(table: &str) -> Schema {
779
827
}
780
828
}
781
829
830
+ #[ derive( Debug , Serialize ) ]
831
+ struct BenchmarkRun {
832
+ /// Benchmark crate version
833
+ benchmark_version : String ,
834
+ /// DataFusion crate version
835
+ datafusion_version : String ,
836
+ /// Number of CPU cores
837
+ num_cpus : usize ,
838
+ /// Start time
839
+ start_time : u64 ,
840
+ /// CLI arguments
841
+ arguments : Vec < String > ,
842
+ /// query number
843
+ query : usize ,
844
+ /// list of individual run times and row counts
845
+ iterations : Vec < QueryResult > ,
846
+ }
847
+
848
+ impl BenchmarkRun {
849
+ fn new ( query : usize ) -> Self {
850
+ Self {
851
+ benchmark_version : env ! ( "CARGO_PKG_VERSION" ) . to_owned ( ) ,
852
+ datafusion_version : DATAFUSION_VERSION . to_owned ( ) ,
853
+ num_cpus : num_cpus:: get ( ) ,
854
+ start_time : SystemTime :: now ( )
855
+ . duration_since ( SystemTime :: UNIX_EPOCH )
856
+ . expect ( "current time is later than UNIX_EPOCH" )
857
+ . as_secs ( ) ,
858
+ arguments : std:: env:: args ( )
859
+ . skip ( 1 )
860
+ . into_iter ( )
861
+ . collect :: < Vec < String > > ( ) ,
862
+ query,
863
+ iterations : vec ! [ ] ,
864
+ }
865
+ }
866
+
867
+ fn add_result ( & mut self , elapsed : f64 , row_count : usize ) {
868
+ self . iterations . push ( QueryResult { elapsed, row_count } )
869
+ }
870
+ }
871
+
872
+ #[ derive( Debug , Serialize ) ]
873
+ struct QueryResult {
874
+ elapsed : f64 ,
875
+ row_count : usize ,
876
+ }
877
+
782
878
#[ cfg( test) ]
783
879
mod tests {
784
880
use super :: * ;
@@ -1235,6 +1331,7 @@ mod tests {
1235
1331
path : PathBuf :: from ( path. to_string ( ) ) ,
1236
1332
file_format : "tbl" . to_string ( ) ,
1237
1333
mem_table : false ,
1334
+ output_path : None ,
1238
1335
} ;
1239
1336
let actual = benchmark_datafusion ( opt) . await ?;
1240
1337
0 commit comments