Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,6 @@ dmypy.json

# Pyre type checker
.pyre/

# PyCharm
.idea/
68 changes: 57 additions & 11 deletions src/skprometheus/impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,63 @@
from skprometheus.utils import get_feature_names


class SimpleImputer(impute.SimpleImputer):
@wraps(impute.SimpleImputer.__init__, assigned=["__signature__"])
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
MetricRegistry.add_counter("imputed", "the number of values imputed", additional_labels=("method", "feature"))
def register_imputer_metrics(X, method):
"""Register the number of missing values per feature.

def transform(self, X):
features = get_feature_names(X)
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
The input data to complete.
method: {string}, the imputer method that is being registered
"""
features = get_feature_names(X)

missing = np.isnan(X).sum(axis=0)
for idx, feature in enumerate(features):
MetricRegistry.imputed(feature=feature, method="SimpleImputer").inc(missing[idx])
missing = np.isnan(X).sum(axis=0)
for idx, feature in enumerate(features):
MetricRegistry.imputed(feature=feature, method=method).inc(missing[idx])

return super().transform(X)

class ImputerCreated(type):
    """Metaclass that instruments an imputer class with imputation metrics.

    The class built on top of ``bases[0]`` receives:

    * an ``__init__`` that forwards all arguments to the base initializer and
      then registers the ``imputed`` counter on ``MetricRegistry``;
    * a ``transform`` that reports the missing values of ``X`` through
      ``register_imputer_metrics`` (labelled with the base class name) before
      delegating to the base ``transform``.
    """

    def __new__(mcs, name, bases, d):
        """Create the class and, for first-level classes, install the wrappers.

        Parameters
        ----------
        name : str
            Name of the class to create.
        bases : tuple of type
            Base classes; ``bases[0]`` is the imputer being instrumented.
        d : dict
            Namespace of the class to create.
        """
        class_obj = super().__new__(mcs, name, bases, d)

        # A metaclass is inherited, so subclasses of an already instrumented
        # imputer are created through this method as well. Wrapping them again
        # would overwrite an ``__init__``/``transform`` the subclass defines
        # and would report each transform twice (once per wrapper level), so
        # only classes built directly on a non-instrumented base are wrapped.
        if any(isinstance(base, ImputerCreated) for base in bases):
            return class_obj

        class_obj.__init__ = ImputerCreated.init(class_obj, bases[0])
        class_obj.transform = ImputerCreated.transform(class_obj, bases[0])
        return class_obj

    @staticmethod
    def init(class_obj, base):
        """Build the ``__init__`` installed on ``class_obj``.

        Only ``__signature__`` is requested from ``base.__init__``; ``wraps``
        additionally sets ``__wrapped__``, which lets signature introspection
        see the parameters of the base initializer instead of
        ``*args, **kwargs``.
        """
        @wraps(base.__init__, assigned=["__signature__"])
        def new_init(self, *args, **kwargs):
            super(class_obj, self).__init__(*args, **kwargs)
            MetricRegistry.add_counter("imputed", "the number of values imputed",
                                       additional_labels=("method", "feature"))

        return new_init

    @staticmethod
    def transform(class_obj, base):
        """Build the ``transform`` installed on ``class_obj``.

        The wrapper keeps the name and docstring of ``base.transform`` so the
        instrumented method documents itself like the original one.
        """
        # ``updated=()`` keeps the base method's ``__dict__`` from being
        # copied onto the wrapper; only the descriptive attributes are taken.
        @wraps(base.transform, assigned=("__name__", "__qualname__", "__doc__"), updated=())
        def new_transform(self, X):
            register_imputer_metrics(X, method=base.__name__)
            return super(class_obj, self).transform(X)

        return new_transform


# Instrumented imputers: each is created by the ImputerCreated metaclass from
# the same-named class of the imported `impute` module, with an empty class
# namespace. The base class name is what ends up in the `method` metric label.
KNNImputer = ImputerCreated("KNNImputer", (impute.KNNImputer,), {})
MissingIndicator = ImputerCreated("MissingIndicator", (impute.MissingIndicator,), {})
SimpleImputer = ImputerCreated("SimpleImputer", (impute.SimpleImputer,), {})

if __name__ == "__main__":
    # Manual smoke run: fit and transform a small matrix with missing values
    # in the first, second and last columns.
    sample_rows = [
        [np.nan, 3, 4, 6],
        [np.nan, np.nan, 4, 5],
        [np.nan, 5, 6, np.nan],
        [np.nan, 0, 0, np.nan],
        [np.nan, 7, 8, 9],
    ]
    X = np.array(sample_rows)

    imputer = SimpleImputer()
    imputer.fit(X)
    imputer.transform(X)
81 changes: 53 additions & 28 deletions tests/test_impute.py
Original file line number Diff line number Diff line change
@@ -1,59 +1,84 @@
import pytest

from skprometheus.impute import SimpleImputer
from skprometheus.impute import SimpleImputer, MissingIndicator, KNNImputer
from skprometheus.utils import flatten
import numpy as np
from prometheus_client import REGISTRY
import pandas as pd
from tests.conftest import general_checks, transformer_checks, select_tests


@pytest.mark.parametrize(
"test_fn",
select_tests(
flatten([general_checks, transformer_checks]),
)
)
def test_standard_checks(test_fn):
trf = SimpleImputer()
test_fn(SimpleImputer.__name__, trf)
# Imputer classes that the parametrized tests in this module iterate over.
IMPUTERS = [SimpleImputer, MissingIndicator, KNNImputer]


def test_simple_imputer():
imputer = SimpleImputer()
X = np.array([
@pytest.fixture()
def missing_values():
    """Return a 5x4 array whose columns contain 5, 1, 0 and 2 NaN values."""
    nan = np.nan
    rows = [
        [nan, 3, 4, 6],
        [nan, nan, 4, 5],
        [nan, 5, 6, nan],
        [nan, 0, 0, nan],
        [nan, 7, 8, 9],
    ]
    return np.array(rows)


@pytest.mark.parametrize(
    "test_fn",
    select_tests(
        flatten([general_checks, transformer_checks]),
    ),
)
@pytest.mark.parametrize(
    "trf",
    IMPUTERS,
)
def test_standard_checks_imputer(test_fn, trf):
    """Run every selected general and transformer check on each imputer class."""
    # `trf` is the imputer class itself, so its own `__name__` is the name to
    # report. `trf.__class__.__name__` is the name of the class's type (its
    # metaclass) and would be identical for every parametrized imputer.
    test_fn(trf.__name__, trf())


@pytest.mark.parametrize(
"method",
IMPUTERS
)
def test_imputer(missing_values, method):
imputer = method()
X = missing_values

imputer.fit(X)
imputer.transform(X)
assert 'skprom_imputed' in [m.name for m in REGISTRY.collect()]

assert REGISTRY.get_sample_value('skprom_imputed_total', {'feature': '0', 'method': 'SimpleImputer'}) == 5
assert REGISTRY.get_sample_value('skprom_imputed_total', {'feature': '1', 'method': 'SimpleImputer'}) == 1
assert REGISTRY.get_sample_value('skprom_imputed_total', {'feature': '3', 'method': 'SimpleImputer'}) == 2
assert REGISTRY.get_sample_value(
'skprom_imputed_total', {'feature': '0', 'method': imputer.__class__.__name__}
) == 5
assert REGISTRY.get_sample_value(
'skprom_imputed_total', {'feature': '1', 'method': imputer.__class__.__name__}
) == 1
assert REGISTRY.get_sample_value(
'skprom_imputed_total', {'feature': '3', 'method': imputer.__class__.__name__}
) == 2


def test_simple_imputer_pandas():
imputer = SimpleImputer()
X = np.array([
[np.nan, 3, 4, 6],
[np.nan, np.nan, 4, 5],
[np.nan, 5, 6, np.nan],
[np.nan, 0, 0, np.nan],
[np.nan, 7, 8, 9]
])
@pytest.mark.parametrize(
"method",
IMPUTERS
)
def test_simple_imputer_pandas(missing_values, method):
imputer = method()
X = missing_values

df = pd.DataFrame.from_records(X, columns=['A', 'B', 'C', 'D'])

imputer.fit(df)
imputer.transform(df)
assert 'skprom_imputed' in [m.name for m in REGISTRY.collect()]

assert REGISTRY.get_sample_value('skprom_imputed_total', {'feature': 'A', 'method': 'SimpleImputer'}) == 5
assert REGISTRY.get_sample_value('skprom_imputed_total', {'feature': 'B', 'method': 'SimpleImputer'}) == 1
assert REGISTRY.get_sample_value('skprom_imputed_total', {'feature': 'D', 'method': 'SimpleImputer'}) == 2
assert REGISTRY.get_sample_value(
'skprom_imputed_total', {'feature': 'A', 'method': imputer.__class__.__name__}
) == 5
assert REGISTRY.get_sample_value(
'skprom_imputed_total', {'feature': 'B', 'method': imputer.__class__.__name__}
) == 1
assert REGISTRY.get_sample_value(
'skprom_imputed_total', {'feature': 'D', 'method': imputer.__class__.__name__}
) == 2