
Commit b88d2bf

Merge pull request #334 from sfu-db/perf/plot
perf(eda.plot): changed drop_null to dropna
2 parents 1dbf297 + 0a7fe56

5 files changed, +35 -27 lines changed
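
The change is mechanical but worth illustrating: everywhere the project's own drop_null helper was applied to a series or frame, the native dask/pandas dropna method is used instead. A minimal sketch (illustrative, not from the PR) of the replacement call:

    import numpy as np
    import pandas as pd
    import dask.dataframe as dd

    pdf = pd.DataFrame({"x": [1.0, np.nan, 3.0], "y": ["a", "b", None]})
    df = dd.from_pandas(pdf, npartitions=2)

    # dropna is lazy on a dask DataFrame: it only extends the task graph,
    # and rows containing any null are removed when .compute() runs.
    print(df.dropna().compute())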

dataprep/eda/distribution/compute/bivariate.py

Lines changed: 5 additions & 6 deletions
@@ -14,7 +14,6 @@
     DTypeDef,
     Nominal,
     detect_dtype,
-    drop_null,
     is_dtype,
 )
 from .common import (
@@ -98,7 +97,7 @@ def compute_bivariate(
         except TypeError:
             df[x] = df[x].astype(str)

-        (comps,) = dask.compute(nom_cont_comps(drop_null(df), bins, ngroups, largest))
+        (comps,) = dask.compute(nom_cont_comps(df.dropna(), bins, ngroups, largest))

         return Intermediate(
             x=x, y=y, data=comps, ngroups=ngroups, visual_type="cat_and_num_cols"
@@ -110,7 +109,7 @@ def compute_bivariate(
         and is_dtype(ytype, DateTime())
     ):
         x, y = (x, y) if is_dtype(xtype, DateTime()) else (y, x)
-        df = drop_null(df[[x, y]])
+        df = df[[x, y]].dropna()
         dtnum: List[Any] = []
         # line chart
         dtnum.append(dask.delayed(_calc_line_dt)(df, timeunit, agg))
@@ -131,7 +130,7 @@ def compute_bivariate(
         and is_dtype(ytype, DateTime())
     ):
         x, y = (x, y) if is_dtype(xtype, DateTime()) else (y, x)
-        df = drop_null(df[[x, y]])
+        df = df[[x, y]].dropna()
         df[y] = df[y].apply(str, meta=(y, str))
         dtcat: List[Any] = []
         # line chart
@@ -160,7 +159,7 @@ def compute_bivariate(
         except TypeError:
             df[y] = df[y].astype(str)

-        (comps,) = dask.compute(drop_null(df).groupby([x, y]).size())
+        (comps,) = dask.compute(df.dropna().groupby([x, y]).size())

         return Intermediate(
             x=x,
@@ -171,7 +170,7 @@ def compute_bivariate(
             visual_type="two_cat_cols",
         )
     elif is_dtype(xtype, Continuous()) and is_dtype(ytype, Continuous()):
-        df = drop_null(df[[x, y]])
+        df = df[[x, y]].dropna()

         data: Dict[str, Any] = {}
         # scatter plot data
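
One detail in the hunks above: df[[x, y]].dropna() selects the two plotted columns before dropping rows, so a null in an unrelated column does not discard an otherwise complete (x, y) pair. A small illustration with made-up data:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"x": [1, 2, 3], "y": [4.0, 5.0, 6.0], "z": [np.nan, 1.0, 2.0]})

    print(len(df.dropna()))              # 2: the null in "z" costs a row
    print(len(df[["x", "y"]].dropna()))  # 3: only nulls in x or y matter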

dataprep/eda/distribution/compute/common.py

Lines changed: 10 additions & 1 deletion
@@ -9,6 +9,7 @@
 from scipy.stats import gaussian_kde as gaussian_kde_
 from scipy.stats import ks_2samp as ks_2samp_
 from scipy.stats import normaltest as normaltest_
+from scipy.stats import skewtest as skewtest_

 from ...dtypes import drop_null

@@ -233,5 +234,13 @@ def ks_2samp(data1: np.ndarray, data2: np.ndarray) -> Tuple[float, float]:
     name="scipy-gaussian_kde", pure=True, nout=2
 )
 def gaussian_kde(arr: np.ndarray) -> Tuple[float, float]:
-    """Delayed version of scipy ks_2samp."""
+    """Delayed version of scipy gaussian_kde."""
     return cast(Tuple[np.ndarray, np.ndarray], gaussian_kde_(arr))
+
+
+@dask.delayed(  # pylint: disable=no-value-for-parameter
+    name="scipy-skewtest", pure=True, nout=2
+)
+def skewtest(arr: np.ndarray) -> Tuple[float, float]:
+    """Delayed version of scipy skewtest."""
+    return cast(Tuple[float, float], skewtest_(arr))
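
The new wrapper mirrors the existing normaltest and ks_2samp wrappers in the same module: nout=2 makes the returned Delayed unpackable into a lazy (statistic, p-value) pair. A usage sketch (assuming a checkout that includes this module; the data is made up):

    import dask
    import numpy as np
    from dataprep.eda.distribution.compute.common import skewtest

    counts = np.array([1, 4, 12, 30, 24, 11, 5, 2, 1, 0])  # e.g. histogram bin counts
    stat, pval = skewtest(counts)          # both are dask Delayed objects
    stat, pval = dask.compute(stat, pval)  # evaluated with the rest of the graph
    print(stat, pval)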

dataprep/eda/distribution/compute/overview.py

Lines changed: 17 additions & 15 deletions
@@ -8,7 +8,7 @@
 import dask.dataframe as dd
 import numpy as np
 import pandas as pd
-from dask.array.stats import chisquare, skew
+from dask.array.stats import chisquare

 from ....errors import UnreachableError
 from ...dtypes import (
@@ -18,12 +18,11 @@
     DTypeDef,
     Nominal,
     detect_dtype,
-    drop_null,
     get_dtype_cnts_and_num_cols,
     is_dtype,
 )
 from ...intermediate import Intermediate
-from .common import _calc_line_dt, ks_2samp, normaltest
+from .common import _calc_line_dt, ks_2samp, normaltest, skewtest


 def compute_overview(
@@ -80,11 +79,11 @@ def compute_overview(
                 first_rows[col].apply(hash)
             except TypeError:
                 srs = df[col] = srs.astype(str)
-            datas.append(calc_nom_col(drop_null(srs), ngroups, largest))
+            datas.append(calc_nom_col(srs.dropna(), first_rows[col], ngroups, largest))
             col_names_dtypes.append((col, Nominal()))
         elif is_dtype(col_dtype, Continuous()):
             ## if cfg.hist_enable or cfg.any_insights("hist"):
-            datas.append(calc_cont_col(drop_null(srs), bins))
+            datas.append(calc_cont_col(srs.dropna(), bins))
             col_names_dtypes.append((col, Continuous()))
         elif is_dtype(col_dtype, DateTime()):
             datas.append(dask.delayed(_calc_line_dt)(df[[col]], timeunit))
@@ -145,10 +144,11 @@ def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:
     data["npres"] = srs.shape[0]

     ## if cfg.insight.infinity_enable:
-    data["ninf"] = srs.isin({np.inf, -np.inf}).sum()
+    is_inf_srs = srs.isin({np.inf, -np.inf})
+    data["ninf"] = is_inf_srs.sum()

     # remove infinite values
-    srs = srs[~srs.isin({np.inf, -np.inf})]
+    srs = srs[~is_inf_srs]

     ## if cfg.hist_enable or config.insight.uniform_enable or cfg.insight.normal_enable:
     ## bins = cfg.hist_bins
@@ -164,7 +164,7 @@ def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:
     data["nneg"] = (srs < 0).sum()

     ## if cfg.insight.skew_enabled:
-    data["skew"] = skew(srs)
+    data["skew"] = skewtest(data["hist"][0])

     ## if cfg.insight.unique_enabled:
     data["nuniq"] = srs.nunique()
@@ -176,7 +176,9 @@ def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:


 ## def calc_nom_col(srs: dd.Series, first_rows: pd.Series, cfg: Config)
-def calc_nom_col(srs: dd.Series, ngroups: int, largest: bool) -> Dict[str, Any]:
+def calc_nom_col(
+    srs: dd.Series, first_rows: pd.Series, ngroups: int, largest: bool
+) -> Dict[str, Any]:
     """
     Computations for a categorical column in plot(df)

@@ -222,8 +224,10 @@ def calc_nom_col(srs: dd.Series, ngroups: int, largest: bool) -> Dict[str, Any]:
     ## data["npresent"] = srs.shape[0]

     ## if cfg.insight.constant_length_enable:
-    length = srs.apply(lambda v: len(str(v)), meta=(srs.name, np.int64))
-    data["min_len"], data["max_len"] = length.min(), length.max()
+    if not first_rows.apply(lambda x: isinstance(x, str)).all():
+        srs = srs.astype(str)  # srs must be a string to compute the value lengths
+    lengths = srs.str.len()
+    data["min_len"], data["max_len"] = lengths.min(), lengths.max()

     return data

@@ -247,7 +251,6 @@ def calc_stats(df: dd.DataFrame, dtype: Optional[DTypeDef]) -> Dict[str, Any]:

     ## if cfg.stats_enable
     dtype_cnts, num_cols = get_dtype_cnts_and_num_cols(df, dtype)
-    stats["nrows"] = df.shape[0]
     stats["ncols"] = df.shape[1]
     stats["npresent_cells"] = df.count().sum()
     stats["nrows_wo_dups"] = df.drop_duplicates().shape[0]
@@ -327,9 +330,8 @@ def format_cont(col: str, data: Dict[str, Any], nrows: int) -> Any:
         ins.append({"Missing": f"{col} has {nmiss} ({pmiss}%) missing values"})

     ## if cfg.insight.skewed_enable:
-    if data["skew"] >= 20:  ## cfg.insight.skewed_threshold
-        skew_val = np.round(data["skew"], 4)
-        ins.append({"Skewed": f"{col} is skewed (\u03B31 = {skew_val})"})
+    if data["skew"][1] < 1e-5:  ## cfg.insight.skewed_threshold
+        ins.append({"Skewed": f"{col} is skewed"})

     ## if cfg.insight.infinity_enable:
     pinf = round(data["ninf"] / nrows * 100, 2)
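
Two behavioural changes are bundled into this file. First, calc_nom_col now receives first_rows so it can cast to string only when necessary before taking .str.len(). Second, the skew insight switches from thresholding a skewness coefficient to a hypothesis test: data["skew"][1] is the skewtest p-value, and a value below 1e-5 rejects the null of zero skew. A quick illustration of the new criterion (illustrative values, not from the PR):

    import numpy as np
    from scipy.stats import skewtest

    rng = np.random.default_rng(0)
    symmetric = rng.normal(size=1_000)
    right_skewed = rng.exponential(size=1_000)

    print(skewtest(symmetric).pvalue < 1e-5)     # False: no "Skewed" insight
    print(skewtest(right_skewed).pvalue < 1e-5)  # True: column is flagged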

dataprep/eda/distribution/compute/univariate.py

Lines changed: 2 additions & 4 deletions
@@ -18,7 +18,6 @@
     DTypeDef,
     Nominal,
     detect_dtype,
-    drop_null,
     is_dtype,
 )
 from ...intermediate import Intermediate
@@ -177,7 +176,7 @@ def nom_comps(
     except TypeError:
         srs = srs.astype(str)
     # drop null values
-    srs = drop_null(srs)
+    srs = srs.dropna()

     ## if cfg.bar_enable or cfg.pie_enable
     # counts of unique values in the series
@@ -223,7 +222,7 @@ def cont_comps(srs: dd.Series, bins: int) -> Dict[str, Any]:
     ## if cfg.stats_enable or cfg.hist_enable or
     # calculate the total number of rows then drop the missing values
     data["nrows"] = srs.shape[0]
-    srs = drop_null(srs)
+    srs = srs.dropna()
     ## if cfg.stats_enable
     # number of not null (present) values
     data["npres"] = srs.shape[0]
@@ -236,7 +235,6 @@ def cont_comps(srs: dd.Series, bins: int) -> Dict[str, Any]:
     ## if cfg.hist_enable or cfg.qqplot_enable and cfg.ingsights_enable:
     data["hist"] = da.histogram(srs, bins=bins, range=[data["min"], data["max"]])
     ## if cfg.insights_enable and (cfg.qqplot_enable or cfg.hist_enable):
-    # NOTE normal test does a .compute() and I cannot fix it with delayed
     data["norm"] = normaltest(data["hist"][0])
     ## if cfg.qqplot_enable
     data["qntls"] = srs.quantile(np.linspace(0.01, 0.99, 99))

dataprep/eda/distribution/render.py

Lines changed: 1 addition & 1 deletion
@@ -1636,7 +1636,7 @@ def cont_insights(data: Dict[str, Any], col: str) -> Dict[str, List[str]]:

     ## if cfg.insight.normal_enable:
     if data["norm"][1] > 0.99:
-        ins["hist"].append(f"{col} is normally distributed")
+        ins["Histogram"].append(f"{col} is normally distributed")

     ## if cfg.insight.uniform_enable:
     if data["chisq"][1] > 0.999:  ## cfg.insight.uniform_threshold
