Skip to content

Commit 28908d6

Browse files
authored
Merge pull request #41 from python-qds/feature/36_37_invalid_inputs_and_categorical
Feature/36 37 invalid inputs and categorical
2 parents 2a43dac + ace9df6 commit 28908d6

File tree

6 files changed

+198
-32
lines changed

6 files changed

+198
-32
lines changed

.github/workflows/base.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@ jobs:
4646
strategy:
4747
fail-fast: false
4848
matrix:
49-
os: [ ubuntu-latest ] # , macos-latest, windows-latest]
49+
# see https://github.com/actions/setup-python/issues/544
50+
os: [ ubuntu-20.04 ] # ubuntu-latest, macos-latest, windows-latest]
5051
# all nox sessions: manually > dynamically from previous job
5152
# nox_session: ["tests-2.7", "tests-3.7"]
5253
nox_session: ${{ fromJson(needs.list_nox_test_sessions.outputs.matrix) }}

docs/changelog.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
# Changelog
22

3+
### 0.6.4 - Bugfixes
4+
5+
- Replaced usage of deprecated `scipy_mode`. Fixes [#39](https://github.com/python-qds/qdscreen/issues/39)
6+
- Fixed `ValueError: invalid literal for int() with base 10` in `predict_qd`. Fixes [#40](https://github.com/python-qds/qdscreen/issues/40)
7+
- Added input validators to raise human-readable error messages when the input is not correct. Fixes [#37](https://github.com/python-qds/qdscreen/issues/37)
8+
- Fixed `AttributeError: module 'numpy' has no attribute 'object'.`. Fixes [#38](https://github.com/python-qds/qdscreen/issues/38)
9+
310
### 0.6.3 - Bugfixes
411

512
- Fixed `ValueError` with recent versions of `SciPy`, due to usage of sparse arrays with object dtype. Fixes [#31](https://github.com/python-qds/qdscreen/issues/31)

qdscreen/main.py

Lines changed: 69 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,14 @@ def _add_names_to_parents_idx_series(parents):
3434

3535
class QDForest(object):
3636
"""A quasi-deterministic forest returned by `qd_screen`"""
37-
__slots__ = ('_adjmat', # a square numpy array or pandas DataFrame containing the adjacency matrix (parent->child)
38-
'_parents', # a 1d np array or a pandas Series relating each child to its parent index or -1 if a root
39-
'is_nparray', # a boolean indicating if this was built from numpy array (and not pandas dataframe)
40-
'_roots_mask', # a 1d np array or pd Series containing a boolean mask for root variables
41-
'_roots_wc_mask', # a 1d np array or pd Series containing a boolean mask for root with children
42-
'stats' # an optional `Entropies` object stored for debug
43-
)
37+
__slots__ = (
38+
'_adjmat', # a square np array or pd DataFrame containing the adjacency matrix (parent->child)
39+
'_parents', # a 1d np array or a pandas Series relating each child to its parent index or -1 if a root
40+
'is_nparray', # a boolean indicating if this was built from numpy array (and not pandas dataframe)
41+
'_roots_mask', # a 1d np array or pd Series containing a boolean mask for root variables
42+
'_roots_wc_mask', # a 1d np array or pd Series containing a boolean mask for root with children
43+
'stats' # an optional `Entropies` object stored for debug
44+
)
4445

4546
def __init__(self,
4647
adjmat=None, # type: Union[np.ndarray, pd.DataFrame]
@@ -129,13 +130,13 @@ def mask_to_indices(self, mask):
129130
@property
130131
def adjmat_ar(self):
131132
"""The adjacency matrix as a 2D numpy array"""
132-
return self.adjmat if self.is_nparray else self.adjmat.values
133+
return self.adjmat if self.is_nparray else self.adjmat.values
133134

134135
@property
135136
def adjmat(self):
136137
"""The adjacency matrix as a pandas DataFrame or a 2D numpy array"""
137138
if self._adjmat is None:
138-
# compute adjmat from parents.
139+
# compute adjmat from parents and cache it
139140
n = self.nb_vars
140141
adjmat = np.zeros((n, n), dtype=bool)
141142
# from https://stackoverflow.com/a/46018613/7262247
@@ -543,10 +544,30 @@ def plot_increasing_entropies(self):
543544
self.stats.plot_increasing_entropies()
544545

545546

546-
def qd_screen(X, # type: Union[pd.DataFrame, np.ndarray]
547+
def assert_df_or_2D_array(df_or_array # type: Union[pd.DataFrame, np.ndarray]
548+
):
549+
"""
550+
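Raises a `TypeError` if `df_or_array` is neither a `pd.DataFrame` nor a `np.ndarray`, and a `ValueError` if it is a `np.ndarray` that is not 2-dimensional.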
551+
552+
:param df_or_array:
553+
:return:
554+
"""
555+
if isinstance(df_or_array, pd.DataFrame):
556+
pass
557+
elif isinstance(df_or_array, np.ndarray):
558+
# see https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes
559+
if len(df_or_array.shape) != 2:
560+
raise ValueError("Provided data is not a 2D array, the number of dimensions is %s" % len(df_or_array.shape))
561+
else:
562+
# Raise error
563+
raise TypeError("Provided data is neither a `pd.DataFrame` nor a `np.ndarray`")
564+
565+
566+
def qd_screen(X, # type: Union[pd.DataFrame, np.ndarray]
547567
absolute_eps=None, # type: float
548568
relative_eps=None, # type: float
549-
keep_stats=False # type: bool
569+
keep_stats=False, # type: bool
570+
non_categorical_mode='strict',
550571
):
551572
# type: (...) -> QDForest
552573
"""
@@ -574,12 +595,18 @@ def qd_screen(X, # type: Union[pd.DataFrame, np.ndarray]
574595
memory in the resulting forest object (`<QDForest>.stats`), for further analysis. By default this is `False`.
575596
:return:
576597
"""
577-
# only work on the categorical features
578-
X = get_categorical_features(X)
598+
# Make sure this is a 2D table
599+
assert_df_or_2D_array(X)
579600

580-
# sanity check
601+
# Sanity check: are there rows in here ?
581602
if len(X) == 0:
582-
raise ValueError("Empty dataset provided")
603+
raise ValueError("Provided dataset does not contain any row")
604+
605+
# Only work on the categorical features
606+
X = get_categorical_features(X, non_categorical_mode=non_categorical_mode)
607+
608+
# Sanity check concerning the number of columns
609+
assert X.shape[1] > 0, "Internal error: no columns remain in dataset after preprocessing."
583610

584611
# parameters check and defaults
585612
if absolute_eps is None:
@@ -1143,28 +1170,49 @@ def get_arcs_from_adjmat(A, # type: Union[np.ndarray, pd.DataFra
11431170
return ((cols[i], cols[j]) for i, j in zip(*res_ar))
11441171

11451172

1146-
def get_categorical_features(df_or_array # type: Union[np.ndarray, pd.DataFrame]
1173+
def get_categorical_features(df_or_array, # type: Union[np.ndarray, pd.DataFrame]
1174+
non_categorical_mode="strict" # type: str
11471175
):
11481176
# type: (...) -> Union[np.ndarray, pd.DataFrame]
11491177
"""
11501178
11511179
:param df_or_array:
1180+
:param non_categorical_mode:
11521181
:return: a dataframe or array with the categorical features
11531182
"""
1183+
assert_df_or_2D_array(df_or_array)
1184+
1185+
if non_categorical_mode == "strict":
1186+
strict_mode = True
1187+
elif non_categorical_mode == "remove":
1188+
strict_mode = False
1189+
else:
1190+
raise ValueError("Unsupported value for `non_categorical_mode`: %r" % non_categorical_mode)
1191+
11541192
if isinstance(df_or_array, pd.DataFrame):
11551193
is_categorical_dtype = df_or_array.dtypes.astype(str).isin(["object", "categorical"])
1156-
if not is_categorical_dtype.any():
1157-
raise TypeError("Provided dataframe columns do not contain any categorical datatype (dtype in 'object' or "
1194+
if strict_mode and not is_categorical_dtype.all():
1195+
raise ValueError("Provided dataframe columns contains non-categorical datatypes (dtype in 'object' or "
1196+
"'categorical'): found dtypes %r. This is not supported when `non_categorical_mode` is set to "
1197+
"`'strict'`" % df_or_array.dtypes[~is_categorical_dtype].to_dict())
1198+
elif not is_categorical_dtype.any():
1199+
raise ValueError("Provided dataframe columns do not contain any categorical datatype (dtype in 'object' or "
11581200
"'categorical'): found dtypes %r" % df_or_array.dtypes[~is_categorical_dtype].to_dict())
11591201
return df_or_array.loc[:, is_categorical_dtype]
1202+
11601203
elif isinstance(df_or_array, np.ndarray):
11611204
# see https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes
11621205
if df_or_array.dtype.names is not None:
11631206
# structured array
11641207
is_categorical_dtype = np.array([str(df_or_array.dtype.fields[n][0]) == "object"
11651208
for n in df_or_array.dtype.names])
1166-
if not is_categorical_dtype.any():
1167-
raise TypeError(
1209+
if strict_mode and not is_categorical_dtype.all():
1210+
invalid_dtypes = df_or_array.dtype[~is_categorical_dtype].asdict()
1211+
raise ValueError("Provided numpy array columns contains non-categorical datatypes ('object' dtype): "
1212+
"found dtypes %r. This is not supported when `non_categorical_mode` is set to "
1213+
"`'strict'`" % invalid_dtypes)
1214+
elif not is_categorical_dtype.any():
1215+
raise ValueError(
11681216
"Provided dataframe columns do not contain any categorical datatype (dtype in 'object' or "
11691217
"'categorical'): found dtypes %r" % df_or_array.dtype.fields)
11701218
categorical_names = np.array(df_or_array.dtype.names)[is_categorical_dtype]
@@ -1176,6 +1224,7 @@ def get_categorical_features(df_or_array # type: Union[np.ndarray, pd.DataFrame
11761224
% df_or_array.dtype)
11771225
return df_or_array
11781226
else:
1227+
# Should not happen since `assert_df_or_2D_array` is called upfront now.
11791228
raise TypeError("Provided data is neither a pd.DataFrame nor a np.ndarray")
11801229

11811230

qdscreen/selector.py

Lines changed: 89 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,48 @@
1010
from .main import QDForest
1111

1212

13+
class InvalidDataInputError(ValueError):
14+
"""Raised when input data is invalid"""
15+
16+
1317
def _get_most_common_value(x):
1418
# From https://stackoverflow.com/a/47778607/7262247
1519
# `scipy_mode` is the most robust to the various pitfalls (nans, ...)
16-
return scipy_mode(x)[0][0]
20+
# but they will deprecate it
21+
# return scipy_mode(x, nan_policy=None)[0][0]
22+
res = x.mode(dropna=True)
23+
if len(res) == 0:
24+
return np.nan
25+
else:
26+
return res
27+
28+
29+
class ParentChildMapping:
30+
__slots__ = ('_mapping_dct', '_otypes')
31+
32+
def __init__(
33+
self,
34+
mapping_dct # type: Dict
35+
):
36+
self._mapping_dct = mapping_dct
37+
# Find the correct otype to use in the vectorized operation
38+
self._otypes = [np.array(mapping_dct.values()).dtype]
39+
40+
def predict_child_from_parent_ar(
41+
self,
42+
parent_values # type: np.ndarray
43+
):
44+
"""For numpy"""
45+
# apply the learned map efficiently https://stackoverflow.com/q/16992713/7262247
46+
return np.vectorize(self._mapping_dct.__getitem__, otypes=self._otypes)(parent_values)
47+
48+
def predict_child_from_parent(
49+
self,
50+
parent_values # type: pd.DataFrame
51+
):
52+
"""For pandas"""
53+
# See https://stackoverflow.com/questions/47930052/pandas-vectorized-lookup-of-dictionary
54+
return parent_values.map(self._mapping_dct)
1755

1856

1957
class QDSelectorModel(object):
@@ -36,12 +74,47 @@ def __init__(self,
3674
self.forest = qd_forest
3775
self._maps = None # type: Optional[Dict[Any, Dict[Any, Dict]]]
3876

39-
def fit(self,
40-
X # type: Union[np.ndarray, pd.DataFrame]
41-
):
77+
def assert_valid_input(
78+
self,
79+
X, # type: Union[np.ndarray, pd.DataFrame]
80+
df_extras_allowed=False # type: bool
81+
):
82+
"""Raises an InvalidDataInputError if X does not match the expectation"""
83+
84+
if self.forest.is_nparray:
85+
if not isinstance(X, np.ndarray):
86+
raise InvalidDataInputError(
87+
"Input data must be an numpy array. Found: %s" % type(X))
88+
89+
if X.shape[1] != self.forest.nb_vars: # or X.shape[0] != X.shape[1]:
90+
raise InvalidDataInputError(
91+
"Input numpy array must have %s columns. Found %s columns" % (self.forest.nb_vars, X.shape[1]))
92+
else:
93+
if not isinstance(X, pd.DataFrame):
94+
raise InvalidDataInputError(
95+
"Input data must be a pandas DataFrame. Found: %s" % type(X))
96+
97+
actual = set(X.columns)
98+
expected = set(self.forest.varnames)
99+
if actual != expected:
100+
missing = expected - actual
101+
if missing or not df_extras_allowed:
102+
extra = actual - expected
103+
raise InvalidDataInputError(
104+
"Input pandas DataFrame must have column names matching the ones in the model. "
105+
"Missing: %s. Extra: %s " % (missing, extra)
106+
)
107+
108+
def fit(
109+
self,
110+
X # type: Union[np.ndarray, pd.DataFrame]
111+
):
42112
"""Fits the maps able to predict determined features from others"""
43113
forest = self.forest
44114

115+
# Validate the input
116+
self.assert_valid_input(X, df_extras_allowed=False)
117+
45118
# we will create a sparse coordinate representation of maps
46119
n = forest.nb_vars
47120

@@ -79,8 +152,11 @@ def fit(self,
79152
pc_df = pd.DataFrame(X[:, (parent, child)], columns=["parent", "child"])
80153
levels_mapping_df = pc_df.groupby(by="parent").agg(_get_most_common_value)
81154

155+
# Init the dict for parent if it does not exist
82156
maps.setdefault(parent, dict())
83-
maps[parent][child] = levels_mapping_df.iloc[:, 0].to_dict()
157+
158+
# Fill the parent-child item with the mapping object
159+
maps[parent][child] = ParentChildMapping(levels_mapping_df.iloc[:, 0].to_dict())
84160

85161
else:
86162
assert isinstance(X, pd.DataFrame)
@@ -100,8 +176,11 @@ def fit(self,
100176
pc_df = pd.DataFrame(X_ar[:, (parent, child)], columns=["parent", "child"])
101177
levels_mapping_df = pc_df.groupby("parent").agg(_get_most_common_value)
102178

179+
# Init the dict for parent if it does not exist
103180
maps.setdefault(parent, dict())
104-
maps[parent][child] = levels_mapping_df.iloc[:, 0].to_dict()
181+
182+
# Fill the parent-child item with the mapping object
183+
maps[parent][child] = ParentChildMapping(levels_mapping_df.iloc[:, 0].to_dict())
105184

106185
def remove_qd(self,
107186
X, # type: Union[np.ndarray, pd.DataFrame]
@@ -118,6 +197,8 @@ def remove_qd(self,
118197
"""
119198
forest = self.forest
120199

200+
self.assert_valid_input(X, df_extras_allowed=True)
201+
121202
is_x_nparray = isinstance(X, np.ndarray)
122203
assert is_x_nparray == forest.is_nparray
123204

@@ -187,17 +268,15 @@ def predict_qd(self,
187268

188269
# walk the tree from the roots
189270
for _, parent, child in forest.walk_arcs():
190-
# apply the learned map efficienty https://stackoverflow.com/q/16992713/7262247
191-
X[:, child] = np.vectorize(self._maps[parent][child].__getitem__)(X[:, parent])
271+
X[:, child] = self._maps[parent][child].predict_child_from_parent_ar(X[:, parent])
192272
else:
193273
if not inplace:
194274
X = X.copy()
195275

196276
# walk the tree from the roots
197277
varnames = forest.varnames
198278
for _, parent, child in forest.walk_arcs(names=False):
199-
# apply the learned map efficienty https://stackoverflow.com/q/16992713/7262247
200-
X.loc[:, varnames[child]] = np.vectorize(self._maps[parent][child].__getitem__)(X.loc[:, varnames[parent]])
279+
X.loc[:, varnames[child]] = self._maps[parent][child].predict_child_from_parent(X.loc[:, varnames[parent]])
201280

202281
if not inplace:
203282
return X

qdscreen/sklearn.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ def fit(self, X, y=None):
8989
self
9090
"""
9191
X = self._validate_data(X, accept_sparse=False, #('csr', 'csc'),
92-
dtype=np.object,
92+
dtype=object,
9393
force_all_finite='allow-nan')
9494

9595
# if hasattr(X, "toarray"): # sparse matrix

qdscreen/tests/test_core.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# -*- coding: utf-8 -*-
22
# the above encoding declaration is needed to have non-ascii characters in this file (anywhere even in comments)
33
# from __future__ import unicode_literals # no, since we want to match the return type of str() which is bytes in py2
4+
import sys
5+
46
import numpy as np
57
import pandas as pd
68
import pytest
@@ -306,4 +308,32 @@ def test_nans_in_data_sklearn():
306308

307309
selector = QDScreen()
308310
Xsel = selector.fit_transform(df.to_numpy())
311+
309312
assert Xsel.tolist() == [['A'], ['A'], ['N']]
313+
314+
315+
def test_issue_37_non_categorical():
316+
df = pd.DataFrame({
317+
"nb": [1, 2],
318+
"name": ["A", "B"]
319+
})
320+
with pytest.raises(ValueError, match="Provided dataframe columns contains non-categorical"):
321+
qd_screen(df)
322+
323+
324+
@pytest.mark.skipif(sys.version_info < (3, 6),
325+
reason="This test is known to fail for 3.5 and 2.7, see GH#43")
326+
def test_issue_40_nan_then_str():
327+
df = pd.DataFrame({
328+
"foo": ["1", "2"],
329+
"bar": [np.nan, "B"]
330+
})
331+
qd_forest = qd_screen(df)
332+
assert list(qd_forest.roots) == ["foo"]
333+
334+
feat_selector = qd_forest.fit_selector_model(df)
335+
only_important_features_df = feat_selector.remove_qd(df)
336+
assert list(only_important_features_df.columns) == ["foo"]
337+
338+
result = feat_selector.predict_qd(only_important_features_df)
339+
pd.testing.assert_frame_equal(df, result)

0 commit comments

Comments
 (0)