
Commit 1431ef3

andygrove and Dandandan authored
TPC-H benchmark can optionally write JSON output file with benchmark summary (#1766)
* use ordered-float 2.10

Signed-off-by: Andy Grove <[email protected]>

* Add DATAFUSION_VERSION constant

Signed-off-by: Andy Grove <[email protected]>

* Add option to write JSON summary file with benchmark results

* update test

* Clippy fix

Co-authored-by: Daniël Heres <[email protected]>
1 parent ecd0081 commit 1431ef3
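In summary: both the DataFusion and Ballista benchmark runners gain an optional `-o <dir>` / `--output <dir>` argument. When it is set, each benchmark run writes a pretty-printed JSON file named `tpch-q<query>-<start_time>.json` into that directory, recording the benchmark and DataFusion crate versions, the number of CPU cores, the start time (Unix seconds), the CLI arguments, the query number, and the elapsed time and row count of every iteration.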

File tree

benchmarks/Cargo.toml
benchmarks/src/bin/tpch.rs
datafusion/src/lib.rs

3 files changed: +107 -4 lines changed


benchmarks/Cargo.toml

Lines changed: 3 additions & 0 deletions
@@ -41,6 +41,9 @@ env_logger = "0.9"
 mimalloc = { version = "0.1", optional = true, default-features = false }
 snmalloc-rs = {version = "0.2", optional = true, features= ["cache-friendly"] }
 rand = "0.8.4"
+serde = "1.0.136"
+serde_json = "1.0.78"
+num_cpus = "1.13.0"

 [dev-dependencies]
 ballista-core = { path = "../ballista/rust/core" }

benchmarks/src/bin/tpch.rs

Lines changed: 101 additions & 4 deletions
@@ -21,11 +21,12 @@ use futures::future::join_all;
 use rand::prelude::*;
 use std::ops::Div;
 use std::{
-    fs,
+    fs::{self, File},
+    io::Write,
     iter::Iterator,
     path::{Path, PathBuf},
     sync::Arc,
-    time::Instant,
+    time::{Instant, SystemTime},
 };

 use ballista::context::BallistaContext;
@@ -42,6 +43,7 @@ use datafusion::prelude::*;
 use datafusion::{
     arrow::datatypes::{DataType, Field, Schema},
     datasource::file_format::{csv::CsvFormat, FileFormat},
+    DATAFUSION_VERSION,
 };
 use datafusion::{
     arrow::record_batch::RecordBatch, datasource::file_format::parquet::ParquetFormat,
@@ -56,6 +58,7 @@ use datafusion::{

 use datafusion::datasource::file_format::csv::DEFAULT_CSV_EXTENSION;
 use datafusion::datasource::file_format::parquet::DEFAULT_PARQUET_EXTENSION;
+use serde::Serialize;
 use structopt::StructOpt;

 #[cfg(feature = "snmalloc")]
@@ -105,6 +108,10 @@ struct BallistaBenchmarkOpt {
     /// Ballista executor port
     #[structopt(long = "port")]
     port: Option<u16>,
+
+    /// Path to output directory where JSON summary file should be written to
+    #[structopt(parse(from_os_str), short = "o", long = "output")]
+    output_path: Option<PathBuf>,
 }

 #[derive(Debug, StructOpt, Clone)]
@@ -140,6 +147,10 @@ struct DataFusionBenchmarkOpt {
     /// Load the data into a MemTable before executing the query
     #[structopt(short = "m", long = "mem-table")]
     mem_table: bool,
+
+    /// Path to output directory where JSON summary file should be written to
+    #[structopt(parse(from_os_str), short = "o", long = "output")]
+    output_path: Option<PathBuf>,
 }

 #[derive(Debug, StructOpt, Clone)]
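For context, a minimal self-contained sketch (not the benchmark's actual option structs; struct and program names here are hypothetical) of how an optional -o/--output directory flag declared this way behaves with StructOpt:

use std::path::PathBuf;
use structopt::StructOpt;

#[derive(Debug, StructOpt)]
struct Opt {
    /// Path to output directory where JSON summary file should be written to
    #[structopt(parse(from_os_str), short = "o", long = "output")]
    output_path: Option<PathBuf>,
}

fn main() {
    // `prog -o /tmp/results` yields Some("/tmp/results"); omitting the flag yields None,
    // in which case the benchmark simply skips writing the summary file.
    let opt = Opt::from_args();
    println!("{:?}", opt.output_path);
}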
@@ -261,6 +272,7 @@ async fn main() -> Result<()> {

 async fn benchmark_datafusion(opt: DataFusionBenchmarkOpt) -> Result<Vec<RecordBatch>> {
     println!("Running benchmarks with the following options: {:?}", opt);
+    let mut benchmark_run = BenchmarkRun::new(opt.query);
     let config = ExecutionConfig::new()
         .with_target_partitions(opt.partitions)
         .with_batch_size(opt.batch_size);
@@ -302,17 +314,27 @@ async fn benchmark_datafusion(opt: DataFusionBenchmarkOpt) -> Result<Vec<RecordB
         result = execute_query(&mut ctx, &plan, opt.debug).await?;
         let elapsed = start.elapsed().as_secs_f64() * 1000.0;
         millis.push(elapsed as f64);
-        println!("Query {} iteration {} took {:.1} ms", opt.query, i, elapsed);
+        let row_count = result.iter().map(|b| b.num_rows()).sum();
+        println!(
+            "Query {} iteration {} took {:.1} ms and returned {} rows",
+            opt.query, i, elapsed, row_count
+        );
+        benchmark_run.add_result(elapsed, row_count);
     }

     let avg = millis.iter().sum::<f64>() / millis.len() as f64;
     println!("Query {} avg time: {:.2} ms", opt.query, avg);

+    if let Some(path) = &opt.output_path {
+        write_summary_json(&mut benchmark_run, path)?;
+    }
+
     Ok(result)
 }

 async fn benchmark_ballista(opt: BallistaBenchmarkOpt) -> Result<()> {
     println!("Running benchmarks with the following options: {:?}", opt);
+    let mut benchmark_run = BenchmarkRun::new(opt.query);

     let config = BallistaConfig::builder()
         .set(
@@ -350,7 +372,12 @@ async fn benchmark_ballista(opt: BallistaBenchmarkOpt) -> Result<()> {
             .unwrap();
         let elapsed = start.elapsed().as_secs_f64() * 1000.0;
         millis.push(elapsed as f64);
-        println!("Query {} iteration {} took {:.1} ms", opt.query, i, elapsed);
+        let row_count = batches.iter().map(|b| b.num_rows()).sum();
+        println!(
+            "Query {} iteration {} took {:.1} ms and returned {} rows",
+            opt.query, i, elapsed, row_count
+        );
+        benchmark_run.add_result(elapsed, row_count);
         if opt.debug {
             pretty::print_batches(&batches)?;
         }
@@ -359,6 +386,27 @@ async fn benchmark_ballista(opt: BallistaBenchmarkOpt) -> Result<()> {
     let avg = millis.iter().sum::<f64>() / millis.len() as f64;
     println!("Query {} avg time: {:.2} ms", opt.query, avg);

+    if let Some(path) = &opt.output_path {
+        write_summary_json(&mut benchmark_run, path)?;
+    }
+
+    Ok(())
+}
+
+fn write_summary_json(benchmark_run: &mut BenchmarkRun, path: &Path) -> Result<()> {
+    let json =
+        serde_json::to_string_pretty(&benchmark_run).expect("summary is serializable");
+    let filename = format!(
+        "tpch-q{}-{}.json",
+        benchmark_run.query, benchmark_run.start_time
+    );
+    let path = path.join(filename);
+    println!(
+        "Writing summary file to {}",
+        path.as_os_str().to_str().unwrap()
+    );
+    let mut file = File::create(path)?;
+    file.write_all(json.as_bytes())?;
     Ok(())
 }

@@ -779,6 +827,54 @@ fn get_schema(table: &str) -> Schema {
     }
 }

+#[derive(Debug, Serialize)]
+struct BenchmarkRun {
+    /// Benchmark crate version
+    benchmark_version: String,
+    /// DataFusion crate version
+    datafusion_version: String,
+    /// Number of CPU cores
+    num_cpus: usize,
+    /// Start time
+    start_time: u64,
+    /// CLI arguments
+    arguments: Vec<String>,
+    /// query number
+    query: usize,
+    /// list of individual run times and row counts
+    iterations: Vec<QueryResult>,
+}
+
+impl BenchmarkRun {
+    fn new(query: usize) -> Self {
+        Self {
+            benchmark_version: env!("CARGO_PKG_VERSION").to_owned(),
+            datafusion_version: DATAFUSION_VERSION.to_owned(),
+            num_cpus: num_cpus::get(),
+            start_time: SystemTime::now()
+                .duration_since(SystemTime::UNIX_EPOCH)
+                .expect("current time is later than UNIX_EPOCH")
+                .as_secs(),
+            arguments: std::env::args()
+                .skip(1)
+                .into_iter()
+                .collect::<Vec<String>>(),
+            query,
+            iterations: vec![],
+        }
+    }
+
+    fn add_result(&mut self, elapsed: f64, row_count: usize) {
+        self.iterations.push(QueryResult { elapsed, row_count })
+    }
+}
+
+#[derive(Debug, Serialize)]
+struct QueryResult {
+    elapsed: f64,
+    row_count: usize,
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
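As a rough illustration of how the resulting file could be consumed, here is a minimal sketch (not part of this commit) that loads one summary with serde_json and prints its iterations; the path and timestamp are hypothetical, and the field names follow the BenchmarkRun and QueryResult structs above:

use std::fs;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical path; the real file is named tpch-q<query>-<start_time>.json.
    let json = fs::read_to_string("/tmp/results/tpch-q1-1644000000.json")?;
    let summary: serde_json::Value = serde_json::from_str(&json)?;
    println!(
        "query {} on DataFusion {} with {} cpus",
        summary["query"], summary["datafusion_version"], summary["num_cpus"]
    );
    if let Some(iterations) = summary["iterations"].as_array() {
        for (i, run) in iterations.iter().enumerate() {
            // Each entry carries the elapsed time in milliseconds and the row count.
            println!("iteration {}: {} ms, {} rows", i, run["elapsed"], run["row_count"]);
        }
    }
    Ok(())
}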
@@ -1235,6 +1331,7 @@ mod tests {
             path: PathBuf::from(path.to_string()),
             file_format: "tbl".to_string(),
             mem_table: false,
+            output_path: None,
         };
         let actual = benchmark_datafusion(opt).await?;

datafusion/src/lib.rs

Lines changed: 3 additions & 0 deletions
@@ -201,6 +201,9 @@
 //! cargo run --example simple_udf
 //! ```

+/// DataFusion crate version
+pub const DATAFUSION_VERSION: &str = env!("CARGO_PKG_VERSION");
+
 extern crate sqlparser;

 pub mod avro_to_arrow;
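A quick illustration of using the new constant from a crate that depends on datafusion (hypothetical downstream code, not part of this commit):

fn main() {
    // Prints the version of the DataFusion crate this binary was compiled against.
    println!("DataFusion version: {}", datafusion::DATAFUSION_VERSION);
}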
