Merge pull request #41 from python-qds/feature/36_37_invalid_inputs_and_categorical

smarie · web-flow · commit 28908d6ff072 · 2023-03-17T12:00:27.000+01:00
Feature/36 37 invalid inputs and categorical
diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml
@@ -46,7 +46,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ ubuntu-latest ]  # , macos-latest, windows-latest]
+        # see https://github.com/actions/setup-python/issues/544
+        os: [ ubuntu-20.04 ]  # ubuntu-latest, macos-latest, windows-latest]
         # all nox sessions: manually > dynamically from previous job
         # nox_session: ["tests-2.7", "tests-3.7"]
         nox_session: ${{ fromJson(needs.list_nox_test_sessions.outputs.matrix) }}
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -1,5 +1,12 @@
 # Changelog
 
+### 0.6.4 - Bugfixes
+
+ - Replaced usage of deprecated `scipy_mode`. Fixed  [#39](https://github.com/python-qds/qdscreen/issues/39)
+ - Fixed `ValueError: invalid literal for int() with base 10` in `predict_qd`. Fixed [#40](https://github.com/python-qds/qdscreen/issues/40)
+ - Added input validators to raise human-readable error messages when the input is not correct. Fixes [#37](https://github.com/python-qds/qdscreen/issues/37)
+ - Fixed `AttributeError: module 'numpy' has no attribute 'object'.`. Fixes [#38](https://github.com/python-qds/qdscreen/issues/38)
+
 ### 0.6.3 - Bugfixes
 
  - Fixed `ValueError` with recent versions of `SciPy`, due to usage of sparse arrays with object dtype. Fixes [#31](https://github.com/python-qds/qdscreen/issues/31)
diff --git a/qdscreen/main.py b/qdscreen/main.py
@@ -34,13 +34,14 @@ def _add_names_to_parents_idx_series(parents):
 
 class QDForest(object):
     """A quasi-deterministic forest returned by `qd_screen`"""
-    __slots__ = ('_adjmat',         # a square numpy array or pandas DataFrame containing the adjacency matrix (parent->child)
-                 '_parents',        # a 1d np array or a pandas Series relating each child to its parent index or -1 if a root
-                 'is_nparray',      # a boolean indicating if this was built from numpy array (and not pandas dataframe)
-                 '_roots_mask',     # a 1d np array or pd Series containing a boolean mask for root variables
-                 '_roots_wc_mask',  # a 1d np array or pd Series containing a boolean mask for root with children
-                 'stats'            # an optional `Entropies` object stored for debug
-                 )
+    __slots__ = (
+        '_adjmat',         # a square np array or pd DataFrame containing the adjacency matrix (parent->child)
+        '_parents',        # a 1d np array or a pandas Series relating each child to its parent index or -1 if a root
+        'is_nparray',      # a boolean indicating if this was built from numpy array (and not pandas dataframe)
+        '_roots_mask',     # a 1d np array or pd Series containing a boolean mask for root variables
+        '_roots_wc_mask',  # a 1d np array or pd Series containing a boolean mask for root with children
+        'stats'            # an optional `Entropies` object stored for debug
+    )
 
     def __init__(self,
                  adjmat=None,    # type: Union[np.ndarray, pd.DataFrame]
@@ -129,13 +130,13 @@ def mask_to_indices(self, mask):
     @property
     def adjmat_ar(self):
         """The adjacency matrix as a 2D numpy array"""
-        return self.adjmat if self.is_nparray else  self.adjmat.values
+        return self.adjmat if self.is_nparray else self.adjmat.values
 
     @property
     def adjmat(self):
         """The adjacency matrix as a pandas DataFrame or a 2D numpy array"""
         if self._adjmat is None:
-            # compute adjmat from parents.
+            # compute adjmat from parents and cache it
             n = self.nb_vars
             adjmat = np.zeros((n, n), dtype=bool)
             # from https://stackoverflow.com/a/46018613/7262247
@@ -543,10 +544,30 @@ def plot_increasing_entropies(self):
         self.stats.plot_increasing_entropies()
 
 
-def qd_screen(X,                  # type: Union[pd.DataFrame, np.ndarray]
+def assert_df_or_2D_array(df_or_array  # type: Union[pd.DataFrame, np.ndarray]
+                          ):
+    """
+    Raises a ValueError if `df_or_array` is
+
+    :param df_or_array:
+    :return:
+    """
+    if isinstance(df_or_array, pd.DataFrame):
+        pass
+    elif isinstance(df_or_array, np.ndarray):
+        # see https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes
+        if len(df_or_array.shape) != 2:
+            raise ValueError("Provided data is not a 2D array, the number of dimensions is %s" % len(df_or_array.shape))
+    else:
+        # Raise error
+        raise TypeError("Provided data is neither a `pd.DataFrame` nor a `np.ndarray`")
+
+
+def qd_screen(X,  # type: Union[pd.DataFrame, np.ndarray]
               absolute_eps=None,  # type: float
               relative_eps=None,  # type: float
-              keep_stats=False    # type: bool
+              keep_stats=False,  # type: bool
+              non_categorical_mode='strict',
               ):
     # type: (...) -> QDForest
     """
@@ -574,12 +595,18 @@ def qd_screen(X,                  # type: Union[pd.DataFrame, np.ndarray]
         memory in the resulting forest object (`<QDForest>.stats`), for further analysis. By default this is `False`.
     :return:
     """
-    # only work on the categorical features
-    X = get_categorical_features(X)
+    # Make sure this is a 2D table
+    assert_df_or_2D_array(X)
 
-    # sanity check
+    # Sanity check: are there rows in here ?
     if len(X) == 0:
-        raise ValueError("Empty dataset provided")
+        raise ValueError("Provided dataset does not contain any row")
+
+    # Only work on the categorical features
+    X = get_categorical_features(X, non_categorical_mode=non_categorical_mode)
+
+    # Sanity check concerning the number of columns
+    assert X.shape[1] > 0, "Internal error: no columns remain in dataset after preprocessing."
 
     # parameters check and defaults
     if absolute_eps is None:
@@ -1143,28 +1170,49 @@ def get_arcs_from_adjmat(A,                 # type: Union[np.ndarray, pd.DataFra
                 return ((cols[i], cols[j]) for i, j in zip(*res_ar))
 
 
-def get_categorical_features(df_or_array  # type: Union[np.ndarray, pd.DataFrame]
+def get_categorical_features(df_or_array,  # type: Union[np.ndarray, pd.DataFrame]
+                             non_categorical_mode="strict"  # type: str
                              ):
     # type: (...) -> Union[np.ndarray, pd.DataFrame]
     """
 
     :param df_or_array:
+    :param non_categorical_mode:
     :return: a dataframe or array with the categorical features
     """
+    assert_df_or_2D_array(df_or_array)
+
+    if non_categorical_mode == "strict":
+        strict_mode = True
+    elif non_categorical_mode == "remove":
+        strict_mode = False
+    else:
+        raise ValueError("Unsupported value for `non_categorical_mode`: %r" % non_categorical_mode)
+
     if isinstance(df_or_array, pd.DataFrame):
         is_categorical_dtype = df_or_array.dtypes.astype(str).isin(["object", "categorical"])
-        if not is_categorical_dtype.any():
-            raise TypeError("Provided dataframe columns do not contain any categorical datatype (dtype in 'object' or "
+        if strict_mode and not is_categorical_dtype.all():
+            raise ValueError("Provided dataframe columns contains non-categorical datatypes (dtype in 'object' or "
+                            "'categorical'): found dtypes %r. This is not supported when `non_categorical_mode` is set to "
+                            "`'strict'`" % df_or_array.dtypes[~is_categorical_dtype].to_dict())
+        elif not is_categorical_dtype.any():
+            raise ValueError("Provided dataframe columns do not contain any categorical datatype (dtype in 'object' or "
                             "'categorical'): found dtypes %r" % df_or_array.dtypes[~is_categorical_dtype].to_dict())
         return df_or_array.loc[:, is_categorical_dtype]
+
     elif isinstance(df_or_array, np.ndarray):
         # see https://numpy.org/doc/stable/user/basics.rec.html#manipulating-and-displaying-structured-datatypes
         if df_or_array.dtype.names is not None:
             # structured array
             is_categorical_dtype = np.array([str(df_or_array.dtype.fields[n][0]) == "object"
                                              for n in df_or_array.dtype.names])
-            if not is_categorical_dtype.any():
-                raise TypeError(
+            if strict_mode and not is_categorical_dtype.all():
+                invalid_dtypes = df_or_array.dtype[~is_categorical_dtype].asdict()
+                raise ValueError("Provided numpy array columns contains non-categorical datatypes ('object' dtype): "
+                                 "found dtypes %r. This is not supported when `non_categorical_mode` is set to "
+                                 "`'strict'`" % invalid_dtypes)
+            elif not is_categorical_dtype.any():
+                raise ValueError(
                     "Provided dataframe columns do not contain any categorical datatype (dtype in 'object' or "
                     "'categorical'): found dtypes %r" % df_or_array.dtype.fields)
             categorical_names = np.array(df_or_array.dtype.names)[is_categorical_dtype]
@@ -1176,6 +1224,7 @@ def get_categorical_features(df_or_array  # type: Union[np.ndarray, pd.DataFrame
                                 % df_or_array.dtype)
             return df_or_array
     else:
+        # Should not happen since `assert_df_or_2D_array` is called upfront now.
         raise TypeError("Provided data is neither a pd.DataFrame nor a np.ndarray")
 
 
diff --git a/qdscreen/selector.py b/qdscreen/selector.py
@@ -10,10 +10,48 @@
 from .main import QDForest
 
 
+class InvalidDataInputError(ValueError):
+    """Raised when input data is invalid"""
+
+
 def _get_most_common_value(x):
     # From https://stackoverflow.com/a/47778607/7262247
     # `scipy_mode` is the most robust to the various pitfalls (nans, ...)
-    return scipy_mode(x)[0][0]
+    # but they will deprecate it
+    # return scipy_mode(x, nan_policy=None)[0][0]
+    res = x.mode(dropna=True)
+    if len(res) == 0:
+        return np.nan
+    else:
+        return res
+
+
+class ParentChildMapping:
+    __slots__ = ('_mapping_dct', '_otypes')
+
+    def __init__(
+        self,
+        mapping_dct  # type: Dict
+    ):
+        self._mapping_dct = mapping_dct
+        # Find the correct otype to use in the vectorized operation
+        self._otypes = [np.array(mapping_dct.values()).dtype]
+
+    def predict_child_from_parent_ar(
+        self,
+        parent_values  # type: np.ndarray
+    ):
+        """For numpy"""
+        # apply the learned map efficienty https://stackoverflow.com/q/16992713/7262247
+        return np.vectorize(self._mapping_dct.__getitem__, otypes=self._otypes)(parent_values)
+
+    def predict_child_from_parent(
+        self,
+        parent_values  # type: pd.DataFrame
+    ):
+        """For pandas"""
+        # See https://stackoverflow.com/questions/47930052/pandas-vectorized-lookup-of-dictionary
+        return parent_values.map(self._mapping_dct)
 
 
 class QDSelectorModel(object):
@@ -36,12 +74,47 @@ def __init__(self,
         self.forest = qd_forest
         self._maps = None  # type: Optional[Dict[Any, Dict[Any, Dict]]]
 
-    def fit(self,
-            X  # type: Union[np.ndarray, pd.DataFrame]
-            ):
+    def assert_valid_input(
+        self,
+        X,  # type: Union[np.ndarray, pd.DataFrame]
+        df_extras_allowed=False  # type: bool
+    ):
+        """Raises an InvalidDataInputError if X does not match the expectation"""
+
+        if self.forest.is_nparray:
+            if not isinstance(X, np.ndarray):
+                raise InvalidDataInputError(
+                    "Input data must be an numpy array. Found: %s" % type(X))
+
+            if X.shape[1] != self.forest.nb_vars:  # or X.shape[0] != X.shape[1]:
+                raise InvalidDataInputError(
+                    "Input numpy array must have %s columns. Found %s columns" % (self.forest.nb_vars, X.shape[1]))
+        else:
+            if not isinstance(X, pd.DataFrame):
+                raise InvalidDataInputError(
+                    "Input data must be a pandas DataFrame. Found: %s" % type(X))
+
+            actual = set(X.columns)
+            expected = set(self.forest.varnames)
+            if actual != expected:
+                missing = expected - actual
+                if missing or not df_extras_allowed:
+                    extra = actual - expected
+                    raise InvalidDataInputError(
+                        "Input pandas DataFrame must have column names matching the ones in the model. "
+                        "Missing: %s. Extra: %s " % (missing, extra)
+                    )
+
+    def fit(
+        self,
+        X  # type: Union[np.ndarray, pd.DataFrame]
+    ):
         """Fits the maps able to predict determined features from others"""
         forest = self.forest
 
+        # Validate the input
+        self.assert_valid_input(X, df_extras_allowed=False)
+
         # we will create a sparse coordinate representation of maps
         n = forest.nb_vars
 
@@ -79,8 +152,11 @@ def fit(self,
                 pc_df = pd.DataFrame(X[:, (parent, child)], columns=["parent", "child"])
                 levels_mapping_df = pc_df.groupby(by="parent").agg(_get_most_common_value)
 
+                # Init the dict for parent if it does not exit
                 maps.setdefault(parent, dict())
-                maps[parent][child] = levels_mapping_df.iloc[:, 0].to_dict()
+
+                # Fill the parent-child item with the mapping object
+                maps[parent][child] = ParentChildMapping(levels_mapping_df.iloc[:, 0].to_dict())
 
         else:
             assert isinstance(X, pd.DataFrame)
@@ -100,8 +176,11 @@ def fit(self,
                 pc_df = pd.DataFrame(X_ar[:, (parent, child)], columns=["parent", "child"])
                 levels_mapping_df = pc_df.groupby("parent").agg(_get_most_common_value)
 
+                # Init the dict for parent if it does not exit
                 maps.setdefault(parent, dict())
-                maps[parent][child] = levels_mapping_df.iloc[:, 0].to_dict()
+
+                # Fill the parent-child item with the mapping object
+                maps[parent][child] = ParentChildMapping(levels_mapping_df.iloc[:, 0].to_dict())
 
     def remove_qd(self,
                   X,             # type: Union[np.ndarray, pd.DataFrame]
@@ -118,6 +197,8 @@ def remove_qd(self,
         """
         forest = self.forest
 
+        self.assert_valid_input(X, df_extras_allowed=True)
+
         is_x_nparray = isinstance(X, np.ndarray)
         assert is_x_nparray == forest.is_nparray
 
@@ -187,17 +268,15 @@ def predict_qd(self,
 
             # walk the tree from the roots
             for _, parent, child in forest.walk_arcs():
-                # apply the learned map efficienty https://stackoverflow.com/q/16992713/7262247
-                X[:, child] = np.vectorize(self._maps[parent][child].__getitem__)(X[:, parent])
+                X[:, child] = self._maps[parent][child].predict_child_from_parent_ar(X[:, parent])
         else:
             if not inplace:
                 X = X.copy()
 
             # walk the tree from the roots
             varnames = forest.varnames
             for _, parent, child in forest.walk_arcs(names=False):
-                # apply the learned map efficienty https://stackoverflow.com/q/16992713/7262247
-                X.loc[:, varnames[child]] = np.vectorize(self._maps[parent][child].__getitem__)(X.loc[:, varnames[parent]])
+                X.loc[:, varnames[child]] = self._maps[parent][child].predict_child_from_parent(X.loc[:, varnames[parent]])
 
         if not inplace:
             return X
diff --git a/qdscreen/sklearn.py b/qdscreen/sklearn.py
@@ -89,7 +89,7 @@ def fit(self, X, y=None):
         self
         """
         X = self._validate_data(X, accept_sparse=False,  #('csr', 'csc'),
-                                dtype=np.object,
+                                dtype=object,
                                 force_all_finite='allow-nan')
 
         # if hasattr(X, "toarray"):   # sparse matrix
diff --git a/qdscreen/tests/test_core.py b/qdscreen/tests/test_core.py
@@ -1,6 +1,8 @@
 # -*- coding: utf-8 -*-
 # the above encoding declaration is needed to have non-ascii characters in this file (anywhere even in comments)
 # from __future__ import unicode_literals  # no, since we want to match the return type of str() which is bytes in py2
+import sys
+
 import numpy as np
 import pandas as pd
 import pytest
@@ -306,4 +308,32 @@ def test_nans_in_data_sklearn():
 
     selector = QDScreen()
     Xsel = selector.fit_transform(df.to_numpy())
+
     assert Xsel.tolist() == [['A'], ['A'], ['N']]
+
+
+def test_issue_37_non_categorical():
+    df = pd.DataFrame({
+        "nb": [1, 2],
+        "name": ["A", "B"]
+    })
+    with pytest.raises(ValueError, match="Provided dataframe columns contains non-categorical"):
+        qd_screen(df)
+
+
+@pytest.mark.skipif(sys.version_info < (3, 6),
+                    reason="This test is known to fail for 3.5 and 2.7, see GH#43")
+def test_issue_40_nan_then_str():
+    df = pd.DataFrame({
+        "foo": ["1", "2"],
+        "bar": [np.nan, "B"]
+    })
+    qd_forest = qd_screen(df)
+    assert list(qd_forest.roots) == ["foo"]
+
+    feat_selector = qd_forest.fit_selector_model(df)
+    only_important_features_df = feat_selector.remove_qd(df)
+    assert list(only_important_features_df.columns) == ["foo"]
+
+    result = feat_selector.predict_qd(only_important_features_df)
+    pd.testing.assert_frame_equal(df, result)