88import dask .dataframe as dd
99import numpy as np
1010import pandas as pd
11- from dask .array .stats import chisquare , skew
11+ from dask .array .stats import chisquare
1212
1313from ....errors import UnreachableError
1414from ...dtypes import (
1818 DTypeDef ,
1919 Nominal ,
2020 detect_dtype ,
21- drop_null ,
2221 get_dtype_cnts_and_num_cols ,
2322 is_dtype ,
2423)
2524from ...intermediate import Intermediate
26- from .common import _calc_line_dt , ks_2samp , normaltest
25+ from .common import _calc_line_dt , ks_2samp , normaltest , skewtest
2726
2827
2928def compute_overview (
@@ -80,11 +79,11 @@ def compute_overview(
8079 first_rows [col ].apply (hash )
8180 except TypeError :
8281 srs = df [col ] = srs .astype (str )
83- datas .append (calc_nom_col (drop_null ( srs ) , ngroups , largest ))
82+ datas .append (calc_nom_col (srs . dropna (), first_rows [ col ] , ngroups , largest ))
8483 col_names_dtypes .append ((col , Nominal ()))
8584 elif is_dtype (col_dtype , Continuous ()):
8685 ## if cfg.hist_enable or cfg.any_insights("hist"):
87- datas .append (calc_cont_col (drop_null ( srs ), bins ))
86+ datas .append (calc_cont_col (srs . dropna ( ), bins ))
8887 col_names_dtypes .append ((col , Continuous ()))
8988 elif is_dtype (col_dtype , DateTime ()):
9089 datas .append (dask .delayed (_calc_line_dt )(df [[col ]], timeunit ))
@@ -145,10 +144,11 @@ def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:
145144 data ["npres" ] = srs .shape [0 ]
146145
147146 ## if cfg.insight.infinity_enable:
148- data ["ninf" ] = srs .isin ({np .inf , - np .inf }).sum ()
147+ is_inf_srs = srs .isin ({np .inf , - np .inf })
148+ data ["ninf" ] = is_inf_srs .sum ()
149149
150150 # remove infinite values
151- srs = srs [~ srs . isin ({ np . inf , - np . inf }) ]
151+ srs = srs [~ is_inf_srs ]
152152
153153 ## if cfg.hist_enable or cfg.insight.uniform_enable or cfg.insight.normal_enable:
154154 ## bins = cfg.hist_bins
@@ -164,7 +164,7 @@ def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:
164164 data ["nneg" ] = (srs < 0 ).sum ()
165165
166166 ## if cfg.insight.skew_enabled:
167- data ["skew" ] = skew ( srs )
167+ data ["skew" ] = skewtest ( data [ "hist" ][ 0 ] )
168168
169169 ## if cfg.insight.unique_enabled:
170170 data ["nuniq" ] = srs .nunique ()
@@ -176,7 +176,9 @@ def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:
176176
177177
178178## def calc_nom_col(srs: dd.Series, first_rows: pd.Series, cfg: Config)
179- def calc_nom_col (srs : dd .Series , ngroups : int , largest : bool ) -> Dict [str , Any ]:
179+ def calc_nom_col (
180+ srs : dd .Series , first_rows : pd .Series , ngroups : int , largest : bool
181+ ) -> Dict [str , Any ]:
180182 """
181183 Computations for a categorical column in plot(df)
182184
@@ -222,8 +224,10 @@ def calc_nom_col(srs: dd.Series, ngroups: int, largest: bool) -> Dict[str, Any]:
222224 ## data["npresent"] = srs.shape[0]
223225
224226 ## if cfg.insight.constant_length_enable:
225- length = srs .apply (lambda v : len (str (v )), meta = (srs .name , np .int64 ))
226- data ["min_len" ], data ["max_len" ] = length .min (), length .max ()
227+ if not first_rows .apply (lambda x : isinstance (x , str )).all ():
228+ srs = srs .astype (str ) # srs must be a string to compute the value lengths
229+ lengths = srs .str .len ()
230+ data ["min_len" ], data ["max_len" ] = lengths .min (), lengths .max ()
227231
228232 return data
229233
@@ -247,7 +251,6 @@ def calc_stats(df: dd.DataFrame, dtype: Optional[DTypeDef]) -> Dict[str, Any]:
247251
248252 ## if cfg.stats_enable
249253 dtype_cnts , num_cols = get_dtype_cnts_and_num_cols (df , dtype )
250- stats ["nrows" ] = df .shape [0 ]
251254 stats ["ncols" ] = df .shape [1 ]
252255 stats ["npresent_cells" ] = df .count ().sum ()
253256 stats ["nrows_wo_dups" ] = df .drop_duplicates ().shape [0 ]
@@ -327,9 +330,8 @@ def format_cont(col: str, data: Dict[str, Any], nrows: int) -> Any:
327330 ins .append ({"Missing" : f"{ col } has { nmiss } ({ pmiss } %) missing values" })
328331
329332 ## if cfg.insight.skewed_enable:
330- if data ["skew" ] >= 20 : ## cfg.insight.skewed_threshold
331- skew_val = np .round (data ["skew" ], 4 )
332- ins .append ({"Skewed" : f"{ col } is skewed (\u03B3 1 = { skew_val } )" })
333+ if data ["skew" ][1 ] < 1e-5 : ## cfg.insight.skewed_threshold
334+ ins .append ({"Skewed" : f"{ col } is skewed" })
333335
334336 ## if cfg.insight.infinity_enable:
335337 pinf = round (data ["ninf" ] / nrows * 100 , 2 )
0 commit comments