Commit fd9dcf0

Merge pull request #381 from sfu-db/fix/create_report_hash
fix(eda.create_report): handle unhashable dtypes

2 parents 2153b74 + 7743749, commit fd9dcf0

4 files changed, +51 -44 lines changed

dataprep/eda/create_report/formatter.py

Lines changed: 42 additions & 34 deletions
@@ -137,42 +137,45 @@ def format_basic(df: dd.DataFrame) -> Dict[str, Any]:
             "col_type": itmdt.visual_type.replace("_column", ""),
         }

-    # interactions
-    res["has_interaction"] = True
-    itmdt = Intermediate(data=data["scat"], visual_type="correlation_crossfilter")
-    rndrd = render_correlation(itmdt)
-    rndrd.sizing_mode = "stretch_width"
-    res["interactions"] = components(rndrd)
-
-    # correlations
-    res["has_correlation"] = True
-    dfs: Dict[str, pd.DataFrame] = {}
-    for method, corr in data["corrs"].items():
-        ndf = pd.DataFrame(
-            {
-                "x": data["num_cols"][data["cordx"]],
-                "y": data["num_cols"][data["cordy"]],
-                "correlation": corr.ravel(),
-            }
+    if len(data["num_cols"]) > 0:
+        # interactions
+        res["has_interaction"] = True
+        itmdt = Intermediate(data=data["scat"], visual_type="correlation_crossfilter")
+        rndrd = render_correlation(itmdt)
+        rndrd.sizing_mode = "stretch_width"
+        res["interactions"] = components(rndrd)
+
+        # correlations
+        res["has_correlation"] = True
+        dfs: Dict[str, pd.DataFrame] = {}
+        for method, corr in data["corrs"].items():
+            ndf = pd.DataFrame(
+                {
+                    "x": data["num_cols"][data["cordx"]],
+                    "y": data["num_cols"][data["cordy"]],
+                    "correlation": corr.ravel(),
+                }
+            )
+            dfs[method.name] = ndf[data["cordy"] > data["cordx"]]
+        itmdt = Intermediate(
+            data=dfs,
+            axis_range=list(data["num_cols"]),
+            visual_type="correlation_heatmaps",
         )
-        dfs[method.name] = ndf[data["cordy"] > data["cordx"]]
-    itmdt = Intermediate(
-        data=dfs, axis_range=list(data["num_cols"]), visual_type="correlation_heatmaps",
-    )
-    rndrd = render_correlation(itmdt)
-    figs.clear()
-    for tab in rndrd.tabs:
-        fig = tab.child
-        fig.sizing_mode = "stretch_width"
-        fig.title = Title(text=tab.title, align="center", text_font_size="20px")
-        figs.append(fig)
-    res["correlations"] = components(figs)
+        rndrd = render_correlation(itmdt)
+        figs.clear()
+        for tab in rndrd.tabs:
+            fig = tab.child
+            fig.sizing_mode = "stretch_width"
+            fig.title = Title(text=tab.title, align="center", text_font_size="20px")
+            figs.append(fig)
+        res["correlations"] = components(figs)
+    else:
+        res["has_interaction"], res["has_correlation"] = False, False

     # missing
     res["has_missing"] = True
-
     itmdt = completions["miss"](data["miss"])
-
     rndrd = render_missing(itmdt)
     figs.clear()
     for tab in rndrd.tabs:
@@ -200,16 +203,21 @@ def basic_computations(df: dd.DataFrame) -> Tuple[Dict[str, Any], Dict[str, Any]
     data["num_cols"] = df_num.columns
     first_rows = df.select_dtypes(CATEGORICAL_DTYPES).head

-    # overview
-    data["ov"] = calc_stats(df.frame, None)
-    # # variables
+    # variables
     for col in df.columns:
         if is_dtype(detect_dtype(df.frame[col]), Continuous()):
             data[col] = cont_comps(df.frame[col], 20)
         elif is_dtype(detect_dtype(df.frame[col]), Nominal()):
+            # cast the column as string type if it contains a mutable type
+            try:
+                first_rows[col].apply(hash)
+            except TypeError:
+                df.frame[col] = df.frame[col].astype(str)
             data[col] = nom_comps(
                 df.frame[col], first_rows[col], 10, True, 10, 20, True, False, False
             )
+    # overview
+    data["ov"] = calc_stats(df.frame, None)
     # interactions
     data["scat"] = df_num.frame.map_partitions(
         lambda x: x.sample(min(1000, x.shape[0])), meta=df_num.frame
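A note on the change above (added commentary, not from the commit): first_rows is the head() of the categorical subset of the frame, so the try/except probe hashes only a handful of rows per column and casts the whole column to str when any of them is unhashable; columns whose unhashable values appear only past the sampled head would not be caught. A minimal sketch of the same probe-and-cast pattern on a plain pandas DataFrame, using a hypothetical helper name:

import pandas as pd

def coerce_unhashable_columns(frame: pd.DataFrame, sample: int = 5) -> pd.DataFrame:
    """Cast object columns whose sampled cells are unhashable (e.g. lists) to str."""
    for col in frame.select_dtypes(include="object").columns:
        try:
            frame[col].head(sample).apply(hash)   # cheap probe on a few rows
        except TypeError:                         # unhashable type encountered
            frame[col] = frame[col].astype(str)
    return frame

frame = pd.DataFrame({"ok": ["x", "y", "z"], "bad": [["a"], ["b", "c"], ["a"]]})
frame = coerce_unhashable_columns(frame)
print(frame["bad"].nunique())   # works after the cast: 2
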

dataprep/eda/distribution/compute/overview.py

Lines changed: 2 additions & 4 deletions
@@ -241,10 +241,8 @@ def calc_stats(df: dd.DataFrame, dtype: Optional[DTypeDef]) -> Dict[str, Any]:
     ----------
     df
         a DataFrame
-    dtype_cnts
-        a dictionary that contains the count for each type
-    num_cols:
-        numerical columns in the dataset
+    dtype
+        str or DType or dict of str or dict of DType
     """

     stats = {"nrows": df.shape[0]}

dataprep/eda/distribution/compute/univariate.py

Lines changed: 5 additions & 5 deletions
@@ -88,6 +88,11 @@ def compute_univariate(
     col_dtype = detect_dtype(df[x], dtype)
     if is_dtype(col_dtype, Nominal()):
         first_rows = df[x].head()  # dd.Series.head() triggers a (small) data read
+        # cast the column as string type if it contains a mutable type
+        try:
+            first_rows.apply(hash)
+        except TypeError:
+            df[x] = df[x].astype(str)
         # all computations for plot(df, Nominal())
         data = nom_comps(
             df[x],
@@ -170,11 +175,6 @@ def nom_comps(

     # total rows
     data["nrows"] = srs.shape[0]
-    # cast the column as string type if it contains a mutable type
-    try:
-        first_rows.apply(hash)
-    except TypeError:
-        srs = srs.astype(str)
     # drop null values
     srs = srs.dropna()
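Why the probe moved (commentary added here, not from the commit): the old cast in nom_comps only rebound the local variable srs, leaving the caller's column untouched, whereas the new placement assigns the casted column back through df[x] (and df.frame[col] in the report path) so downstream computations see the string-typed column as well. A small sketch of that Python-level distinction, with a hypothetical helper:

import pandas as pd

def cast_locally(srs: pd.Series) -> None:
    # Rebinding a local name does not change the caller's dataframe.
    srs = srs.astype(str)

df = pd.DataFrame({"c": [["x"], ["y"]]})

cast_locally(df["c"])
print(type(df["c"].iloc[0]))    # still <class 'list'>

# Assigning back through the dataframe, as the fix now does, sticks:
df["c"] = df["c"].astype(str)
print(type(df["c"].iloc[0]))    # <class 'str'>
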

dataprep/tests/eda/test_create_report.py

Lines changed: 2 additions & 1 deletion
@@ -17,6 +17,7 @@ def simpledf() -> pd.DataFrame:
     df = pd.concat(
         [df, pd.Series(np.random.choice(["a", "b", "c"], 1000, replace=True))], axis=1
     )
+    df = pd.concat([df, pd.Series([["foo"] * 1000])], axis=1)
     df = pd.concat(
         [
             df,
@@ -29,7 +30,7 @@ def simpledf() -> pd.DataFrame:
         axis=1,
     )
     # df = pd.concat([df, pd.Series(np.zeros(1000))], axis=1)
-    df.columns = ["a", "b", "c", "d", "e"]
+    df.columns = ["a", "b", "c", "d", "e", "f"]
     # df["e"] = pd.to_datetime(df["e"])

     idx = np.arange(1000)
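The new column "f" in the fixture holds list values, so the report tests now exercise the unhashable-dtype path end to end. A quick manual check outside the test suite might look like the sketch below (the data is made up; create_report is dataprep's public entry point, and the failure note reflects the behavior this PR fixes):

import numpy as np
import pandas as pd
from dataprep.eda import create_report

df = pd.DataFrame(
    {
        "num": np.random.rand(100),
        "tags": [["foo", "bar"]] * 100,   # unhashable list cells
    }
)

# Before this fix, report generation on such a frame could fail with
# "TypeError: unhashable type: 'list'"; with it, the list column is
# handled as strings and the report is built.
report = create_report(df)
print(type(report))
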
