introduce robust utility functions

martin-springer · martin-springer · commit 1670fc618e9d · 2024-12-12T13:56:57.000-05:00
diff --git a/rdtools/__init__.py b/rdtools/__init__.py
@@ -36,6 +36,9 @@
 # from rdtools.plotting import soiling_rate_histogram
 # from rdtools.plotting import availability_summary_plots
 # from rdtools.availability import AvailabilityAnalysis
+from rdtools.utilities import robust_quantile
+from rdtools.utilities import robust_median
+from rdtools.utilities import robust_mean
 
 from . import _version
 __version__ = _version.get_versions()['version']
diff --git a/rdtools/degradation.py b/rdtools/degradation.py
@@ -5,6 +5,7 @@
 import statsmodels.api as sm
 from rdtools.bootstrap import _make_time_series_bootstrap_samples, \
     _construct_confidence_intervals
+from rdtools import utilities
 
 
 def degradation_ols(energy_normalized, confidence_level=68.2):
@@ -259,7 +260,7 @@ def degradation_year_on_year(energy_normalized, recenter=True,
     if recenter:
         start = energy_normalized.index[0]
         oneyear = start + pd.Timedelta('364d')
-        renorm = energy_normalized[start:oneyear].median()
+        renorm = utilities.robust_median(energy_normalized[start:oneyear])
     else:
         renorm = 1.0
 
diff --git a/rdtools/filtering.py b/rdtools/filtering.py
@@ -8,6 +8,7 @@
 from scipy.interpolate import interp1d
 import rdtools
 import xgboost as xgb
+from rdtools import utilities
 
 # Load in the XGBoost clipping model using joblib.
 xgboost_clipping_model = None
@@ -335,8 +336,7 @@ def quantile_clip_filter(power_ac, quantile=0.98):
     """
     Filter data points likely to be affected by clipping
     with power or energy greater than or equal to 99% of the `quant`
-    quantile. NaN's and small values (power_ac(quantile) / 1000) are
-    removed before calculating clipping threshold.
+    quantile.
 
     Parameters
     ----------
@@ -351,16 +351,8 @@ def quantile_clip_filter(power_ac, quantile=0.98):
         Boolean Series of whether the given measurement is below 99% of the
         quantile filter.
     """
-    # Replace NaN's and small values for quantile calculation
-    # This ensures that power series with NaN's instead of zero values
-    # provide the same result.
-    lower = power_ac.fillna(0).quantile(quantile) / 1000
-
-    # Calculate the quantile and upper clipping threshold
-    q = power_ac[power_ac > lower].quantile(quantile)
-    upper = q * 0.99
-
-    return power_ac < upper
+    v = utilities.robust_quantile(power_ac, quantile)
+    return power_ac < v * 0.99
 
 
 def _format_clipping_time_series(power_ac, mounting_type):
@@ -519,18 +511,15 @@ def _apply_overall_clipping_threshold(power_ac, clipping_mask, clipped_power_ac)
         periods are labeled as True and non-clipping periods are
         labeled as False. Has a pandas datetime index.
     """
-
-    # Ensure that series with NaN's return same results as series with 0's
-    lower = power_ac.fillna(0).quantile(0.99) / 1000
-    power_ac_quant = power_ac[power_ac > lower].quantile(0.99)
+    q_power_ac = utilities.robust_quantile(power_ac, 0.99)
+    q_clipped_power_ac = utilities.robust_quantile(clipped_power_ac, 0.99)
 
     upper_bound_pdiff = abs(
-        (power_ac_quant - clipped_power_ac.quantile(0.99))
-        / ((power_ac_quant + clipped_power_ac.quantile(0.99)) / 2)
+        (q_power_ac - q_clipped_power_ac) / ((q_power_ac + q_clipped_power_ac) / 2)
     )
     percent_clipped = len(clipped_power_ac) / len(power_ac) * 100
     if (upper_bound_pdiff < 0.005) & (percent_clipped > 4):
-        max_clip = power_ac >= power_ac_quant
+        max_clip = power_ac >= q_power_ac
         clipping_mask = clipping_mask | max_clip
     return clipping_mask
 
@@ -656,15 +645,14 @@ def logic_clip_filter(
         # Set any values within the clipping max + clipping min threshold
         # as clipping. This is done specifically for capturing the noise
         # for high frequency data sets.
-
-        # Ensure that time series with zeros and nan's return same result
-        lower = clip_pwr.fillna(0).quantile(0.99) / 1000
-        clip_pwr_no_nan = clip_pwr[clip_pwr > lower]
-
-        daily_mean = clip_pwr_no_nan.resample("D").mean()
+        daily_mean = clip_pwr.resample("D").mean()
         df_daily = daily_mean.to_frame(name="mean")
-        df_daily["clipping_max"] = clip_pwr_no_nan.groupby(pd.Grouper(freq="D")).quantile(0.99)
-        df_daily["clipping_min"] = clip_pwr_no_nan.groupby(pd.Grouper(freq="D")).quantile(0.075)
+        df_daily["clipping_max"] = clip_pwr.groupby(pd.Grouper(freq="D")).agg(
+            utilities.robust_quantile, q=0.99
+        )
+        df_daily["clipping_min"] = clip_pwr.groupby(pd.Grouper(freq="D")).agg(
+            utilities.robust_quantile, q=0.075
+        )
         daily_clipping_max = df_daily["clipping_max"].reindex(
             index=power_ac_copy.index, method="ffill"
         )
diff --git a/rdtools/utilities.py b/rdtools/utilities.py
@@ -0,0 +1,79 @@
+"""Utility functions for rdtools."""
+
+
+def robust_quantile(x, q):
+    """
+    Compute the q-th quantile of a time series (x), ignoring small values and NaN's.
+    NaN's and small values [x <= Q(x.fillna(0), q) / 1000] are removed before the
+    quantile is calculated. The intent is that a time series containing NaN's and
+    the same time series with those NaN's replaced by zeros return the same result.
+
+    Parameters
+    ----------
+    x : pandas.Series
+        Input time series.
+    q : float
+        Probability value, used for both the threshold and the returned quantile.
+
+    Returns
+    -------
+    quantile : float
+        The q-th quantile of x, ignoring small values and NaN's.
+    """
+    # Threshold: 1/1000 of the q-th quantile of x with NaN's counted as zeros.
+    small = x.fillna(0).quantile(q) / 1000
+    q = x[x > small].quantile(q)  # NaN's fail the comparison and are dropped too
+    # Note: the name q now holds the result, no longer the probability value.
+    return q
+
+
+def robust_median(x, q=0.99):
+    """
+    Compute the median of a time series (x), ignoring small values and NaN's.
+    NaN's and small values [x <= Q(x.fillna(0), q) / 1000] are removed before the
+    median is calculated. The intent is that a time series containing NaN's and
+    the same time series with those NaN's replaced by zeros return the same result.
+
+    Parameters
+    ----------
+    x : pandas.Series
+        Input time series.
+    q : float, default 0.99
+        Probability value to use for the small values threshold calculation [Q(x,q)/1000].
+
+    Returns
+    -------
+    median : float
+        The median of x, ignoring small values and NaN's.
+    """
+    # Threshold: 1/1000 of the q-th quantile of x with NaN's counted as zeros.
+    small = x.fillna(0).quantile(q) / 1000
+    mdn = x[x > small].median()
+
+    return mdn
+
+
+def robust_mean(x, q=0.99):
+    """
+    Compute the mean of a time series (x), ignoring small values and NaN's.
+    NaN's and small values [x <= Q(x.fillna(0), q) / 1000] are removed before the
+    mean is calculated. The intent is that a time series containing NaN's and
+    the same time series with those NaN's replaced by zeros return the same result.
+
+    Parameters
+    ----------
+    x : pandas.Series
+        Input time series.
+    q : float, default 0.99
+        Probability value to use for the small values threshold calculation [Q(x,q)/1000].
+
+    Returns
+    -------
+    mean : float
+        The mean of x, ignoring small values and NaN's.
+    """
+    # Threshold: 1/1000 of the q-th quantile of x with NaN's counted as zeros.
+    small = x.fillna(0).quantile(q) / 1000
+    m = x[x > small].mean()
+
+    return m