diff --git a/src/skprometheus/preprocessing.py b/src/skprometheus/preprocessing.py index 1dd7711..69f096e 100644 --- a/src/skprometheus/preprocessing.py +++ b/src/skprometheus/preprocessing.py @@ -1,3 +1,4 @@ +import numpy as np from functools import wraps from sklearn import preprocessing @@ -5,6 +6,17 @@ from skprometheus.utils import get_feature_names +def feature_category_count(X, categories): + + features = get_feature_names(X) + + for idx, row in enumerate(categories.T): + for category in row: + if category is None: + category = "missing" + MetricRegistry.model_categorical(feature=str(features[idx]), category=str(category)).inc() + + class OneHotEncoder(preprocessing.OneHotEncoder): """ OneHotEncoder that adds metrics to the prometheus metric registry. @@ -24,15 +36,38 @@ def transform(self, X): metric registry. """ transformed_X = super().transform(X) - features = get_feature_names(X) # Use inverse method on transformed_X to get all missing values back as 'None' categories = self.inverse_transform(transformed_X) - for idx, row in enumerate(categories.T): - for category in row: - if not category: - category = "missing" - MetricRegistry.model_categorical(feature=str(features[idx]), category=str(category)).inc() + feature_category_count(X, categories) + + return transformed_X + + +class OrdinalEncoder(preprocessing.OrdinalEncoder): + """ + OrdinalEncoder that adds metrics to the prometheus metric registry. + """ + @wraps(preprocessing.OrdinalEncoder.__init__, assigned=["__signature__"]) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + MetricRegistry.add_counter( + "model_categorical", + "Counts category occurrence for each categorical feature.", + additional_labels=("feature", "category"), + ) + + def transform(self, X): + """ + Transform method that adds the count for each category in each feature to the prometheus + metric registry. + """ + transformed_X = super().transform(X) + + # Use inverse method on transformed_X to get all missing values back as 'None' + categories = self.inverse_transform(transformed_X) + + feature_category_count(X, categories) return transformed_X diff --git a/src/skprometheus/utils.py b/src/skprometheus/utils.py index 3f189bb..3f77616 100644 --- a/src/skprometheus/utils.py +++ b/src/skprometheus/utils.py @@ -32,5 +32,5 @@ def get_feature_names(X): if isinstance(X, pd.DataFrame): return X.columns else: - X = check_array(X, force_all_finite=False) + X = check_array(X, dtype = None, force_all_finite=False) return list(range(X.shape[1])) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 864b5d7..f1169b6 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -37,6 +37,7 @@ def test_OneHotEncoder(): assert REGISTRY.get_sample_value('skprom_model_categorical_total', {'feature': '2', 'category': '4'}) == 2 assert REGISTRY.get_sample_value('skprom_model_categorical_total', {'feature': '3', 'category': '9'}) == 1 + assert REGISTRY.get_sample_value('skprom_model_categorical_total', {'feature': '3', 'category': '0'}) == 1 def test_OneHotEncoder_pandas():