Skip to content

Data Explorer: Add preliminary customizable float formatting options for data values, summary stats #3310

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,12 @@
ColumnSchema,
ColumnSortKey,
ColumnSummaryStats,
ColumnValue,
CompareFilterParamsOp,
DataExplorerBackendMessageContent,
DataExplorerFrontendEvent,
FilterResult,
FormatOptions,
GetColumnProfilesFeatures,
GetColumnProfilesRequest,
GetDataValuesRequest,
Expand Down Expand Up @@ -134,6 +136,7 @@ def get_data_values(self, request: GetDataValuesRequest):
request.params.row_start_index,
request.params.num_rows,
request.params.column_indices,
request.params.format_options,
).dict()

def set_row_filters(self, request: SetRowFiltersRequest):
Expand All @@ -151,7 +154,7 @@ def get_column_profiles(self, request: GetColumnProfilesRequest):
count = self._prof_null_count(req.column_index)
result = ColumnProfileResult(null_count=int(count))
elif req.profile_type == ColumnProfileType.SummaryStats:
stats = self._prof_summary_stats(req.column_index)
stats = self._prof_summary_stats(req.column_index, request.params.format_options)
result = ColumnProfileResult(summary_stats=stats)
elif req.profile_type == ColumnProfileType.FrequencyTable:
freq_table = self._prof_freq_table(req.column_index)
Expand Down Expand Up @@ -193,6 +196,7 @@ def _get_data_values(
row_start: int,
num_rows: int,
column_indices: Sequence[int],
format_options: FormatOptions,
) -> TableData:
pass

Expand All @@ -213,7 +217,7 @@ def _prof_null_count(self, column_index: int) -> int:
pass

@abc.abstractmethod
def _prof_summary_stats(self, column_index: int) -> ColumnSummaryStats:
def _prof_summary_stats(self, column_index: int, options: FormatOptions) -> ColumnSummaryStats:
pass

@abc.abstractmethod
Expand All @@ -236,21 +240,71 @@ def _get_state(self) -> BackendState:
_VALUE_NONE = 4


def _format_value(x):
if isinstance(x, float) and np_.isnan(x):
return _VALUE_NAN
elif x is None:
return _VALUE_NONE
elif x is getattr(pd_, "NaT"):
return _VALUE_NAT
elif x is getattr(pd_, "NA"):
return _VALUE_NA
else:
return str(x)
if np_ is not None:

def _is_float_scalar(x):
return isinstance(x, (float, np_.floating))

_isnan = np_.isnan # type: ignore
else:

def _is_float_scalar(x):
return isinstance(x, float)

def _isnan(x: float) -> bool:
return x != x


def _get_float_formatter(options: FormatOptions) -> Callable:
sci_format = f".{options.large_num_digits}E"
medium_format = f".{options.large_num_digits}f"
small_format = f".{options.small_num_digits}f"

# The limit for large numbers before switching to scientific
# notation
upper_threshold = float("1" + "0" * options.max_integral_digits)

# The limit for small numbers before switching to scientific
# notation
lower_threshold = float("0." + "0" * (options.small_num_digits - 1) + "1")

thousands_sep = options.thousands_sep

if thousands_sep is not None:
# We format with comma then replace later
medium_format = "," + medium_format

def base_float_format(x) -> str:
abs_x = abs(x)

if abs_x >= 1:
if abs_x < upper_threshold:
# Has non-zero integral part but below
return format(x, medium_format)
else:
return format(x, sci_format)
elif abs_x == 0:
# Special case 0 to align with other "medium" numbers
return format(x, medium_format)
else:
if abs_x >= lower_threshold:
# Less than 1 but above lower threshold
return format(x, small_format)
else:
return format(x, sci_format)

if thousands_sep is not None:
if thousands_sep != ",":

def _pandas_format_values(col):
return [_format_value(x) for x in col]
def float_format(x) -> str:
base = base_float_format(x)
return base.replace(",", thousands_sep)

return float_format
else:
return base_float_format
else:
return base_float_format


_FILTER_RANGE_COMPARE_SUPPORTED = {
Expand Down Expand Up @@ -653,7 +707,11 @@ def _get_single_column_schema(self, column_index: int):
)

def _get_data_values(
self, row_start: int, num_rows: int, column_indices: Sequence[int]
self,
row_start: int,
num_rows: int,
column_indices: Sequence[int],
format_options: FormatOptions,
) -> TableData:
formatted_columns = []

Expand Down Expand Up @@ -685,16 +743,37 @@ def _get_data_values(
indices = self.table.index[row_start : row_start + num_rows]
columns = [col.iloc[row_start : row_start + num_rows] for col in columns]

formatted_columns = [_pandas_format_values(col) for col in columns]
formatted_columns = [self._format_values(col, format_options) for col in columns]

# Currently, we format MultiIndex in its flat tuple
# representation. In the future we will return multiple lists
# of row labels to be formatted more nicely in the UI
if isinstance(self.table.index, pd_.MultiIndex):
indices = indices.to_flat_index()
row_labels = [_pandas_format_values(indices)]
row_labels = [[str(x) for x in indices]]
return TableData(columns=formatted_columns, row_labels=row_labels)

@classmethod
def _format_values(cls, values, options: FormatOptions) -> List[ColumnValue]:
    """
    Format every element of ``values`` for display.

    The float formatter is built once from ``options`` and shared by all
    elements; each element is then passed through ``cls._format_value``.
    """
    formatter = _get_float_formatter(options)
    format_one = cls._format_value
    return [format_one(item, formatter) for item in values]

@staticmethod
def _format_value(x, float_format: Callable):
    """
    Format a single value for display.

    Float scalars become ``_VALUE_NAN`` when NaN and are otherwise rendered
    by ``float_format``. ``None``, ``NaT`` and ``NA`` map to their special
    value codes; any other value is returned as ``str(x)``.
    """
    if _is_float_scalar(x):
        return _VALUE_NAN if _isnan(x) else float_format(x)
    if x is None:
        return _VALUE_NONE
    if x is getattr(pd_, "NaT", None):
        return _VALUE_NAT
    if x is getattr(pd_, "NA", None):
        return _VALUE_NA
    return str(x)

def _update_view_indices(self):
if len(self.sort_keys) == 0:
self.view_indices = self.filtered_indices
Expand Down Expand Up @@ -942,7 +1021,7 @@ def _get_column(self, column_index: int) -> "pd.Series":
def _prof_null_count(self, column_index: int):
return self._get_column(column_index).isnull().sum()

def _prof_summary_stats(self, column_index: int):
def _prof_summary_stats(self, column_index: int, options: FormatOptions):
col_schema = self._get_single_column_schema(column_index)
col = self._get_column(column_index)

Expand All @@ -953,31 +1032,31 @@ def _prof_summary_stats(self, column_index: int):
# Return nothing for types we don't yet know how to summarize
return ColumnSummaryStats(type_display=ui_type)
else:
return handler(col)

@staticmethod
def _summarize_number(col: "pd.Series"):
import pandas.io.formats.format as fmt
return handler(col, options)

minmax = pd_.Series([col.min(), col.max()], dtype=col.dtype)
numeric_stats = pd_.Series([col.mean(), col.median(), col.std()])
@classmethod
def _summarize_number(cls, col: "pd.Series", options: FormatOptions):
float_format = _get_float_formatter(options)

min_value, max_value = fmt.format_array(minmax.to_numpy(), None, leading_space=False)
mean, median, stdev = fmt.format_array(numeric_stats.to_numpy(), None, leading_space=False)
min_val = col.min()
max_val = col.max()
mean_val = col.mean()
median_val = col.median()
std_val = col.std()

return ColumnSummaryStats(
type_display=ColumnDisplayType.Number,
number_stats=SummaryStatsNumber(
min_value=min_value,
max_value=max_value,
mean=mean,
median=median,
stdev=stdev,
min_value=float_format(min_val),
max_value=float_format(max_val),
mean=float_format(mean_val),
median=float_format(median_val),
stdev=float_format(std_val),
),
)

@staticmethod
def _summarize_string(col: "pd.Series"):
def _summarize_string(col: "pd.Series", options: FormatOptions):
num_empty = (col.str.len() == 0).sum()
num_unique = col.nunique()

Expand All @@ -987,7 +1066,7 @@ def _summarize_string(col: "pd.Series"):
)

@staticmethod
def _summarize_boolean(col: "pd.Series"):
def _summarize_boolean(col: "pd.Series", options: FormatOptions):
null_count = col.isnull().sum()
true_count = col.sum()
false_count = len(col) - true_count - null_count
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,29 @@ class TableData(BaseModel):
)


class FormatOptions(BaseModel):
    """
    Formatting options for returning data values as strings
    """

    # Number of decimal places used for fixed-point output of values with
    # magnitude of at least 1, and for the mantissa in scientific notation.
    large_num_digits: StrictInt = Field(
        description="Fixed number of decimal places to display for numbers over 1, or in scientific notation",
    )

    # Number of decimal places used for values with magnitude below 1; it
    # also sets how small a value may get before scientific notation is used.
    small_num_digits: StrictInt = Field(
        description="Fixed number of decimal places to display for small numbers, and to determine lower threshold for switching to scientific notation",
    )

    # Values whose integral part needs more digits than this are shown in
    # scientific notation instead of fixed-point.
    max_integral_digits: StrictInt = Field(
        description="Maximum number of integral digits to display before switching to scientific notation",
    )

    # Optional; defaults to None when the field is omitted.
    thousands_sep: Optional[StrictStr] = Field(
        default=None,
        description="Thousands separator string",
    )


class TableSchema(BaseModel):
"""
The schema for a table-like object
Expand Down Expand Up @@ -770,6 +793,10 @@ class GetDataValuesParams(BaseModel):
description="Indices to select, which can be a sequential, sparse, or random selection",
)

format_options: FormatOptions = Field(
description="Formatting options for returning data values as strings",
)


class GetDataValuesRequest(BaseModel):
"""
Expand Down Expand Up @@ -859,6 +886,10 @@ class GetColumnProfilesParams(BaseModel):
description="Array of requested profiles",
)

format_options: FormatOptions = Field(
description="Formatting options for returning data values as strings",
)


class GetColumnProfilesRequest(BaseModel):
"""
Expand Down Expand Up @@ -931,6 +962,8 @@ class DataExplorerFrontendEvent(str, enum.Enum):

TableData.update_forward_refs()

FormatOptions.update_forward_refs()

TableSchema.update_forward_refs()

TableShape.update_forward_refs()
Expand Down
Loading
Loading