11"""Computations for plot(df) function."""
22
3- from typing import Any , Dict , List , Optional , Tuple
43from itertools import combinations
4+ from typing import Any , Dict , List , Optional , Tuple
55
66import dask
77import dask .array as da
88import dask .dataframe as dd
99import numpy as np
1010import pandas as pd
11- from dask .array .stats import chisquare , normaltest , skew
12- from scipy .stats import ks_2samp
11+ from dask .array .stats import chisquare , skew
1312
1413from ....errors import UnreachableError
1514from ...dtypes import (
2423 is_dtype ,
2524)
2625from ...intermediate import Intermediate
27- from .common import _calc_line_dt
26+ from .common import _calc_line_dt , ks_2samp , normaltest
2827
2928
3029def compute_overview (
@@ -81,9 +80,7 @@ def compute_overview(
8180 first_rows [col ].apply (hash )
8281 except TypeError :
8382 srs = df [col ] = srs .astype (str )
84- datas .append (
85- calc_nom_col (drop_null (srs ), first_rows [col ], ngroups , largest )
86- )
83+ datas .append (calc_nom_col (drop_null (srs ), ngroups , largest ))
8784 col_names_dtypes .append ((col , Nominal ()))
8885 elif is_dtype (col_dtype , Continuous ()):
8986 ## if cfg.hist_enable or cfg.any_insights("hist"):
@@ -179,9 +176,7 @@ def calc_cont_col(srs: dd.Series, bins: int) -> Dict[str, Any]:
179176
180177
181178## def calc_nom_col(srs: dd.Series, first_rows: pd.Series, cfg: Config)
182- def calc_nom_col (
183- srs : dd .Series , first_rows : pd .Series , ngroups : int , largest : bool
184- ) -> Dict [str , Any ]:
179+ def calc_nom_col (srs : dd .Series , ngroups : int , largest : bool ) -> Dict [str , Any ]:
185180 """
186181 Computations for a categorical column in plot(df)
187182
@@ -227,9 +222,7 @@ def calc_nom_col(
227222 ## data["npresent"] = srs.shape[0]
228223
229224 ## if cfg.insight.constant_length_enable:
230- if not first_rows .apply (lambda x : isinstance (x , str )).all ():
231- srs = srs .astype (str ) # srs must be a string to compute the value lengths
232- length = srs .str .len ()
225+ length = srs .apply (lambda v : len (str (v )), meta = (srs .name , np .int64 ))
233226 data ["min_len" ], data ["max_len" ] = length .min (), length .max ()
234227
235228 return data
@@ -269,12 +262,13 @@ def calc_stats(
269262 # compute distribution similarity on a data sample
270263 # TODO .map_partitions() fails for create_report since it calls calc_stats() with a pd dataframe
271264 # df_smp = df.map_partitions(lambda x: x.sample(min(1000, x.shape[0])), meta=df)
272- # NOTE ks_2samp triggers a .compute(), could use .delayed()
265+
273266 if num_cols : # remove this if statement when create_report is refactored
274267 stats ["ks_tests" ] = []
275268 for col1 , col2 in list (combinations (num_cols , 2 )):
276- if ks_2samp (df [col1 ], df [col2 ])[1 ] > 0.05 :
277- stats ["ks_tests" ].append ((col1 , col2 ))
269+ stats ["ks_tests" ].append (
270+ (col1 , col2 , ks_2samp (df [col1 ], df [col2 ])[1 ] > 0.05 )
271+ )
278272
279273 return stats
280274
@@ -299,9 +293,10 @@ def format_overview(data: Dict[str, Any]) -> List[Dict[str, str]]:
299293 ins .append ({"Duplicates" : f"Dataset has { ndup } ({ pdup } %) duplicate rows" })
300294
301295 ## if cfg.insight.similar_distribution_enable
302- for cols in data .get ("ks_tests" , []):
303- msg = f"{ cols [0 ]} and { cols [1 ]} have similar distributions"
304- ins .append ({"Similar Distribution" : msg })
296+ for (* cols , test_result ) in data .get ("ks_tests" , []):
297+ if test_result :
298+ msg = f"{ cols [0 ]} and { cols [1 ]} have similar distributions"
299+ ins .append ({"Similar Distribution" : msg })
305300
306301 data .pop ("ks_tests" , None )
307302
0 commit comments