Skip to content

Commit fc9aeaf

Browse files
authored
Add support for computing summary stats (#342)
* Add support for computing summary stats * Renaming summary functions * Simplify `summary_stats` `match` branches. * Add some unit tests
1 parent fc38eb5 commit fc9aeaf

File tree

4 files changed

+246
-3
lines changed

4 files changed

+246
-3
lines changed

crates/ark/src/data_explorer/r_data_explorer.rs

Lines changed: 88 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use amalthea::comm::data_explorer_comm::ColumnProfileResult;
1515
use amalthea::comm::data_explorer_comm::ColumnProfileType;
1616
use amalthea::comm::data_explorer_comm::ColumnSchema;
1717
use amalthea::comm::data_explorer_comm::ColumnSortKey;
18+
use amalthea::comm::data_explorer_comm::ColumnSummaryStats;
1819
use amalthea::comm::data_explorer_comm::DataExplorerBackendReply;
1920
use amalthea::comm::data_explorer_comm::DataExplorerBackendRequest;
2021
use amalthea::comm::data_explorer_comm::DataExplorerFrontendEvent;
@@ -29,6 +30,9 @@ use amalthea::comm::data_explorer_comm::SearchSchemaFeatures;
2930
use amalthea::comm::data_explorer_comm::SetRowFiltersFeatures;
3031
use amalthea::comm::data_explorer_comm::SetRowFiltersParams;
3132
use amalthea::comm::data_explorer_comm::SetSortColumnsParams;
33+
use amalthea::comm::data_explorer_comm::SummaryStatsBoolean;
34+
use amalthea::comm::data_explorer_comm::SummaryStatsNumber;
35+
use amalthea::comm::data_explorer_comm::SummaryStatsString;
3236
use amalthea::comm::data_explorer_comm::SupportedFeatures;
3337
use amalthea::comm::data_explorer_comm::TableData;
3438
use amalthea::comm::data_explorer_comm::TableSchema;
@@ -473,6 +477,26 @@ impl RDataExplorer {
473477
frequency_table: None,
474478
}
475479
},
480+
ColumnProfileType::SummaryStats => {
481+
let summary_stats =
482+
r_task(|| self.r_summary_stats(request.column_index as i32));
483+
ColumnProfileResult {
484+
null_count: None,
485+
summary_stats: match summary_stats {
486+
Err(err) => {
487+
log::error!(
488+
"Error getting summary stats for column {}: {}",
489+
request.column_index,
490+
err
491+
);
492+
None
493+
},
494+
Ok(stats) => Some(stats),
495+
},
496+
histogram: None,
497+
frequency_table: None,
498+
}
499+
},
476500
_ => {
477501
// Other kinds of column profiles are not yet
478502
// implemented in R
@@ -577,6 +601,66 @@ impl RDataExplorer {
577601
Ok(result.try_into()?)
578602
}
579603

604+
fn r_summary_stats(&self, column_index: i32) -> anyhow::Result<ColumnSummaryStats> {
605+
// Get the column to compute summary stats for
606+
let column = tbl_get_column(self.table.get().sexp, column_index, self.shape.kind)?;
607+
let dtype = display_type(column.sexp);
608+
609+
let call_summary_fn = |fun| {
610+
RFunction::new("", fun)
611+
.param("column", column)
612+
.param("filtered_indices", match &self.filtered_indices {
613+
Some(indices) => RObject::try_from(indices)?,
614+
None => RObject::null(),
615+
})
616+
.call_in(ARK_ENVS.positron_ns)
617+
};
618+
619+
let mut stats = ColumnSummaryStats {
620+
type_display: dtype.clone(),
621+
number_stats: None,
622+
string_stats: None,
623+
boolean_stats: None,
624+
};
625+
626+
match dtype {
627+
ColumnDisplayType::Number => {
628+
let r_stats: HashMap<String, String> =
629+
call_summary_fn("number_summary_stats")?.try_into()?;
630+
631+
stats.number_stats = Some(SummaryStatsNumber {
632+
min_value: r_stats["min_value"].clone(),
633+
max_value: r_stats["max_value"].clone(),
634+
mean: r_stats["mean"].clone(),
635+
median: r_stats["median"].clone(),
636+
stdev: r_stats["stdev"].clone(),
637+
});
638+
},
639+
ColumnDisplayType::String => {
640+
let r_stats: HashMap<String, i32> =
641+
call_summary_fn("string_summary_stats")?.try_into()?;
642+
643+
stats.string_stats = Some(SummaryStatsString {
644+
num_empty: r_stats["num_empty"].clone() as i64,
645+
num_unique: r_stats["num_unique"].clone() as i64,
646+
});
647+
},
648+
ColumnDisplayType::Boolean => {
649+
let r_stats: HashMap<String, i32> =
650+
call_summary_fn("boolean_summary_stats")?.try_into()?;
651+
652+
stats.boolean_stats = Some(SummaryStatsBoolean {
653+
true_count: r_stats["true_count"].clone() as i64,
654+
false_count: r_stats["false_count"].clone() as i64,
655+
});
656+
},
657+
_ => {
658+
bail!("Summary stats not implemented for type: {:?}", dtype);
659+
},
660+
}
661+
Ok(stats)
662+
}
663+
580664
/// Sort the rows of the data object according to the sort keys in
581665
/// self.sort_keys.
582666
///
@@ -761,7 +845,10 @@ impl RDataExplorer {
761845
supported_features: SupportedFeatures {
762846
get_column_profiles: GetColumnProfilesFeatures {
763847
supported: true,
764-
supported_types: vec![ColumnProfileType::NullCount],
848+
supported_types: vec![
849+
ColumnProfileType::NullCount,
850+
ColumnProfileType::SummaryStats,
851+
],
765852
},
766853
search_schema: SearchSchemaFeatures { supported: false },
767854
set_row_filters: SetRowFiltersFeatures {

crates/ark/src/modules/positron/r_data_explorer.R

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,35 @@
4444
}
4545
}
4646

47+
# Compute summary statistics for a numeric column.
#
# `column` is the full column and `filtered_indices` is passed on to
# `col_filter_indices()` to restrict the column to the filtered rows (`NULL`
# keeps every row). Missing values are ignored by every statistic.
#
# Returns a named character vector with the elements `min_value`,
# `max_value`, `mean`, `median` and `stdev`.
number_summary_stats <- function(column, filtered_indices) {
    col <- col_filter_indices(column, filtered_indices)

    stats <- c(
        min_value = min(col, na.rm = TRUE),
        max_value = max(col, na.rm = TRUE),
        mean = mean(col, na.rm = TRUE),
        median = stats::median(col, na.rm = TRUE),
        stdev = stats::sd(col, na.rm = TRUE)
    )

    # Format each statistic on its own. Calling `format()` on the whole vector
    # applies a common width and number of digits to all elements, which pads
    # values with spaces and adds spurious trailing zeros (e.g. "  1.00000").
    # `vapply()` keeps the element names.
    vapply(stats, format, character(1))
}
58+
59+
# Compute summary statistics for a character column.
#
# `column` is the full column and `filtered_indices` is passed on to
# `col_filter_indices()` to restrict the column to the filtered rows.
#
# Returns a named integer vector with `num_empty` (number of empty strings)
# and `num_unique` (number of distinct values as reported by `unique()`).
string_summary_stats <- function(column, filtered_indices) {
    values <- col_filter_indices(column, filtered_indices)

    n_empty <- sum(!nzchar(values))
    n_unique <- length(unique(values))

    c(num_empty = n_empty, num_unique = n_unique)
}
63+
64+
# Compute summary statistics for a logical column.
#
# `column` is the full column and `filtered_indices` is passed on to
# `col_filter_indices()` to restrict the column to the filtered rows.
#
# Returns a named integer vector with `true_count` and `false_count`.
# Missing values are counted in neither.
boolean_summary_stats <- function(column, filtered_indices) {
    values <- col_filter_indices(column, filtered_indices)

    n_true <- sum(values, na.rm = TRUE)
    n_false <- sum(!values, na.rm = TRUE)

    c(true_count = n_true, false_count = n_false)
}
68+
69+
# Subset a column by a vector of indices.
#
# When `idx` is `NULL` the column is returned untouched; otherwise the result
# is `col[idx]`.
col_filter_indices <- function(col, idx = NULL) {
    if (is.null(idx)) {
        return(col)
    }

    col[idx]
}
75+
4776
.ps.filter_rows <- function(table, row_filters) {
4877
# Are we working with a matrix here?
4978
is_matrix <- is.matrix(table)

crates/ark/tests/data_explorer.rs

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ use amalthea::comm::data_explorer_comm::SearchFilterParams;
2525
use amalthea::comm::data_explorer_comm::SearchFilterType;
2626
use amalthea::comm::data_explorer_comm::SetRowFiltersParams;
2727
use amalthea::comm::data_explorer_comm::SetSortColumnsParams;
28+
use amalthea::comm::data_explorer_comm::SummaryStatsBoolean;
29+
use amalthea::comm::data_explorer_comm::SummaryStatsNumber;
30+
use amalthea::comm::data_explorer_comm::SummaryStatsString;
2831
use amalthea::comm::event::CommManagerEvent;
2932
use amalthea::socket;
3033
use ark::data_explorer::r_data_explorer::DataObjectEnvInfo;
@@ -704,6 +707,70 @@ fn test_data_explorer() {
704707
assert_eq!(num_rows, 3);
705708
});
706709

710+
// --- summary stats ---
711+
712+
// Create a data frame with some numbers, characters and booleans to test
713+
// summary statistics.
714+
r_parse_eval0(
715+
"df <- data.frame(num = c(1, 2, 3, NA), char = c('a', 'a', '', NA), bool = c(TRUE, TRUE, FALSE, NA))",
716+
R_ENVS.global,
717+
)
718+
.unwrap();
719+
720+
// Open the `df` data frame in the data explorer.
721+
let socket = open_data_explorer(String::from("df"));
722+
723+
// Ask for summary stats for the columns
724+
let req = DataExplorerBackendRequest::GetColumnProfiles(GetColumnProfilesParams {
725+
profiles: (0..3)
726+
.map(|i| ColumnProfileRequest {
727+
column_index: i,
728+
profile_type: ColumnProfileType::SummaryStats,
729+
})
730+
.collect(),
731+
});
732+
733+
assert_match!(socket_rpc(&socket, req),
734+
DataExplorerBackendReply::GetColumnProfilesReply(data) => {
735+
// We asked for summary stats for all 3 columns
736+
assert!(data.len() == 3);
737+
738+
// The first column is numeric and has 3 non-NA values.
739+
assert!(data[0].summary_stats.is_some());
740+
let number_stats = data[0].summary_stats.clone().unwrap().number_stats;
741+
assert!(number_stats.is_some());
742+
let number_stats = number_stats.unwrap();
743+
assert_eq!(number_stats, SummaryStatsNumber {
744+
min_value: String::from("1"),
745+
max_value: String::from("3"),
746+
mean: String::from("2"),
747+
median: String::from("2"),
748+
stdev: String::from("1"),
749+
});
750+
751+
// The second column is a character column
752+
assert!(data[1].summary_stats.is_some());
753+
let string_stats = data[1].summary_stats.clone().unwrap().string_stats;
754+
assert!(string_stats.is_some());
755+
let string_stats = string_stats.unwrap();
756+
assert_eq!(string_stats, SummaryStatsString {
757+
num_empty: 1,
758+
num_unique: 3, // NA's are counted as unique values
759+
});
760+
761+
// The third column is boolean
762+
assert!(data[2].summary_stats.is_some());
763+
let boolean_stats = data[2].summary_stats.clone().unwrap().boolean_stats;
764+
assert!(boolean_stats.is_some());
765+
let boolean_stats = boolean_stats.unwrap();
766+
assert_eq!(boolean_stats, SummaryStatsBoolean {
767+
true_count: 2,
768+
false_count: 1,
769+
});
770+
771+
}
772+
);
773+
707774
// --- search filters ---
708775

709776
// Create a data frame with a bunch of words to use for regex testing.

crates/harp/src/object.rs

Lines changed: 62 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -930,6 +930,8 @@ impl TryFrom<&Vec<i32>> for RObject {
930930
}
931931
}
932932

933+
// Converts an R named character vector to a HashMap<String, String>
934+
// Note: Duplicated names are silently ignored, and only the first occurrence is kept.
933935
impl TryFrom<RObject> for HashMap<String, String> {
934936
type Error = crate::error::Error;
935937
fn try_from(value: RObject) -> Result<Self, Self::Error> {
@@ -945,7 +947,7 @@ impl TryFrom<RObject> for HashMap<String, String> {
945947
let n = Rf_xlength(names);
946948
let mut map = HashMap::<String, String>::with_capacity(n as usize);
947949

948-
for i in 0..Rf_xlength(names) {
950+
for i in (0..Rf_xlength(names)).rev() {
949951
// Translate the name and value into Rust strings.
950952
let lhs = r_chr_get_owned_utf8(names, i)?;
951953
let rhs = r_chr_get_owned_utf8(value, i)?;
@@ -958,6 +960,36 @@ impl TryFrom<RObject> for HashMap<String, String> {
958960
}
959961
}
960962

963+
// Converts an R named integer vector to a HashMap<String, i32>
964+
// Note: Duplicated names are silently ignored, and only the first occurence is kept.
965+
impl TryFrom<RObject> for HashMap<String, i32> {
966+
type Error = crate::error::Error;
967+
fn try_from(value: RObject) -> Result<Self, Self::Error> {
968+
unsafe {
969+
r_assert_type(*value, &[INTSXP, VECSXP])?;
970+
971+
let mut protect = RProtect::new();
972+
let names = protect.add(Rf_getAttrib(*value, R_NamesSymbol));
973+
r_assert_type(names, &[STRSXP])?;
974+
975+
let value = protect.add(Rf_coerceVector(*value, INTSXP));
976+
977+
let n = Rf_xlength(names);
978+
let mut map = HashMap::<String, i32>::with_capacity(n as usize);
979+
980+
for i in (0..Rf_xlength(names)).rev() {
981+
// Translate the name and value into Rust strings.
982+
let name = r_chr_get_owned_utf8(names, i)?;
983+
let val = r_int_get(value, i);
984+
985+
map.insert(name, val);
986+
}
987+
988+
Ok(map)
989+
}
990+
}
991+
}
992+
961993
// Converts a named R object into a HashMap<String, RObject> whose names are used as keys.
962994
// Duplicated names are silently ignored, and only the first occurrence is kept.
963995
impl TryFrom<RObject> for HashMap<String, RObject> {
@@ -974,7 +1006,7 @@ impl TryFrom<RObject> for HashMap<String, RObject> {
9741006
// iterate in the reverse order to keep the first occurrence of a name
9751007
for i in (0..n).rev() {
9761008
let name = r_chr_get_owned_utf8(names, i)?;
977-
let value = RObject::new(VECTOR_ELT(*value, i));
1009+
let value: RObject = RObject::new(VECTOR_ELT(*value, i));
9781010
map.insert(name, value);
9791011
}
9801012

@@ -1403,6 +1435,34 @@ mod tests {
14031435
assert_eq!(out.get("pepperoni").unwrap(), "OK");
14041436
assert_eq!(out.get("sausage").unwrap(), "OK");
14051437
assert_eq!(out.get("pineapple").unwrap(), "NOT OK");
1438+
1439+
1440+
let v = r_parse_eval0("c(x = 'a', y = 'b', z = 'c')", R_ENVS.global).unwrap();
1441+
let out: HashMap<String, String> = v.try_into().unwrap();
1442+
assert_eq!(out["x"], "a"); // NOTE(review): this vector has no duplicated name, so first-occurrence semantics are not exercised here
1443+
assert_eq!(out["y"], "b");
1444+
}
1445+
}
1446+
1447+
#[test]
#[allow(non_snake_case)]
fn test_tryfrom_RObject_hashmap_i32() {
    r_test! {
        // A named list in which the name `x` appears twice.
        let named_list = r_parse_eval0("list(x = 1L, y = 2L, x = 3L)", R_ENVS.global).unwrap();
        assert_eq!(named_list.length(), 3);

        let from_list: HashMap<String, i32> = named_list.try_into().unwrap();

        // The first `x` wins; the later duplicate is dropped.
        assert_eq!(from_list.get("x"), Some(&1));
        assert_eq!(from_list.get("y"), Some(&2));

        // The same holds for a named integer vector.
        let named_ints = r_parse_eval0("c(x = 1L, y = 2L, x = 3L)", R_ENVS.global).unwrap();
        let from_ints: HashMap<String, i32> = named_ints.try_into().unwrap();

        assert_eq!(from_ints.get("x"), Some(&1));
        assert_eq!(from_ints.get("y"), Some(&2));
    }
}
14081468

0 commit comments

Comments
 (0)