Merge pull request #442 from NREL/fix_issue_#313

mdeceglie · web-flow · commit 9903aaa9ffca · 2024-12-30T14:56:44.000-07:00
Fix issue #313 (different results with Nan's and Zeros in power series)
diff --git a/.github/workflows/nbval.yaml b/.github/workflows/nbval.yaml
@@ -29,7 +29,7 @@ jobs:
     - name: Run notebook and check output
       run: |
         # --sanitize-with:  pre-process text to remove irrelevant differences (e.g. warning filepaths)
-        pytest --nbval --sanitize-with docs/nbval_sanitization_rules.cfg docs/${{ matrix.notebook-file }}
+        pytest --nbval docs/${{ matrix.notebook-file }} --sanitize-with docs/nbval_sanitization_rules.cfg
     - name: Run notebooks again, save files
       run: |
         pip install nbconvert[webpdf]
diff --git a/.gitignore b/.gitignore
@@ -27,6 +27,7 @@ docs/sphinx/source/generated
 .eggs/
 build/
 dist/
+tmp/
 rdtools.egg-info*
 
 # emacs temp files
diff --git a/docs/sphinx/source/changelog/pending.rst b/docs/sphinx/source/changelog/pending.rst
@@ -12,7 +12,8 @@ Bug fixes
 ---------
 * Set marker linewidth to zero in `rdtools.plotting.degradation_summary_plots` (:pull:`433`)
 * Fix :py:func:`~rdtools.normalization.energy_from_power` returns incorrect index for shifted hourly data (:issue:`370`, :pull:`437`)
-* Add warning to clearsky workflow when power_expected is passed by user (:pull:`439`)
+* Add warning to clearsky workflow when ``power_expected`` is passed by user (:pull:`439`)
+* Fix different results with Nan's and Zeros in power series (:issue:`313`, :pull:`442`)
 
 
 Requirements
diff --git a/rdtools/__init__.py b/rdtools/__init__.py
@@ -36,6 +36,9 @@
 # from rdtools.plotting import soiling_rate_histogram
 # from rdtools.plotting import availability_summary_plots
 # from rdtools.availability import AvailabilityAnalysis
+from rdtools.utilities import robust_quantile
+from rdtools.utilities import robust_median
+from rdtools.utilities import robust_mean
 
 from . import _version
 __version__ = _version.get_versions()['version']
diff --git a/rdtools/analysis_chains.py b/rdtools/analysis_chains.py
@@ -8,7 +8,7 @@
 import numpy as np
 import matplotlib.pyplot as plt
 from rdtools import normalization, filtering, aggregation, degradation
-from rdtools import clearsky_temperature, plotting
+from rdtools import clearsky_temperature, plotting, utilities
 import warnings
 
 
@@ -493,8 +493,9 @@ def _pvwatts_norm(self, poa_global, temperature_cell):
         if renorm:
             # Normalize to the 95th percentile for convenience, this is renormalized out
             # in the calculations but is relevant to normalized_filter()
-            x = energy_normalized[np.isfinite(energy_normalized)]
-            energy_normalized = energy_normalized / x.quantile(0.95)
+            q = utilities.robust_quantile(energy_normalized[np.isfinite(energy_normalized)], 0.95)
+
+            energy_normalized = energy_normalized / q
 
         return energy_normalized, insolation
 
@@ -1012,7 +1013,6 @@ def sensor_analysis(
         -------
         None
         """
-
         self._sensor_preprocess()
         sensor_results = {}
 
diff --git a/rdtools/degradation.py b/rdtools/degradation.py
@@ -5,6 +5,7 @@
 import statsmodels.api as sm
 from rdtools.bootstrap import _make_time_series_bootstrap_samples, \
     _construct_confidence_intervals
+from rdtools import utilities
 
 
 def degradation_ols(energy_normalized, confidence_level=68.2):
@@ -259,7 +260,7 @@ def degradation_year_on_year(energy_normalized, recenter=True,
     if recenter:
         start = energy_normalized.index[0]
         oneyear = start + pd.Timedelta('364d')
-        renorm = energy_normalized[start:oneyear].median()
+        renorm = utilities.robust_median(energy_normalized[start:oneyear])
     else:
         renorm = 1.0
 
diff --git a/rdtools/filtering.py b/rdtools/filtering.py
@@ -8,6 +8,7 @@
 from scipy.interpolate import interp1d
 import rdtools
 import xgboost as xgb
+from rdtools import utilities
 
 # Load in the XGBoost clipping model using joblib.
 xgboost_clipping_model = None
@@ -294,7 +295,7 @@ def pvlib_clearsky_filter(
 def clip_filter(power_ac, model="logic", **kwargs):
     """
     Master wrapper for running one of the desired clipping filters.
-    The default filter run is the quantile clipping filter.
+    The default filter run is the logic clipping filter.
 
     Parameters
     ----------
@@ -350,7 +351,7 @@ def quantile_clip_filter(power_ac, quantile=0.98):
         Boolean Series of whether the given measurement is below 99% of the
         quantile filter.
     """
-    v = power_ac.quantile(quantile)
+    v = utilities.robust_quantile(power_ac, quantile)
     return power_ac < v * 0.99
 
 
@@ -510,13 +511,15 @@ def _apply_overall_clipping_threshold(power_ac, clipping_mask, clipped_power_ac)
         periods are labeled as True and non-clipping periods are
         labeled as False. Has a pandas datetime index.
     """
+    q_power_ac = utilities.robust_quantile(power_ac, 0.99)
+    q_clipped_power_ac = utilities.robust_quantile(clipped_power_ac, 0.99)
+
     upper_bound_pdiff = abs(
-        (power_ac.quantile(0.99) - clipped_power_ac.quantile(0.99))
-        / ((power_ac.quantile(0.99) + clipped_power_ac.quantile(0.99)) / 2)
+        (q_power_ac - q_clipped_power_ac) / ((q_power_ac + q_clipped_power_ac) / 2)
     )
     percent_clipped = len(clipped_power_ac) / len(power_ac) * 100
     if (upper_bound_pdiff < 0.005) & (percent_clipped > 4):
-        max_clip = power_ac >= power_ac.quantile(0.99)
+        max_clip = power_ac >= q_power_ac
         clipping_mask = clipping_mask | max_clip
     return clipping_mask
 
@@ -644,9 +647,11 @@ def logic_clip_filter(
         # for high frequency data sets.
         daily_mean = clip_pwr.resample("D").mean()
         df_daily = daily_mean.to_frame(name="mean")
-        df_daily["clipping_max"] = clip_pwr.groupby(pd.Grouper(freq="D")).quantile(0.99)
-        df_daily["clipping_min"] = clip_pwr.groupby(pd.Grouper(freq="D")).quantile(
-            0.075
+        df_daily["clipping_max"] = clip_pwr.groupby(pd.Grouper(freq="D")).agg(
+            utilities.robust_quantile, q=0.99
+        )
+        df_daily["clipping_min"] = clip_pwr.groupby(pd.Grouper(freq="D")).agg(
+            utilities.robust_quantile, q=0.075
         )
         daily_clipping_max = df_daily["clipping_max"].reindex(
             index=power_ac_copy.index, method="ffill"
diff --git a/rdtools/test/analysis_chains_test.py b/rdtools/test/analysis_chains_test.py
@@ -68,6 +68,48 @@ def sensor_analysis(sensor_parameters):
     return rd_analysis
 
 
+@pytest.fixture
+def sensor_analysis_nans(sensor_parameters):
+    def randomly_replace_with(series, replace_with=0, fraction=0.1, seed=None):
+        """
+        Randomly replace a fraction of entries in a pandas Series with input value `replace_with`.
+
+        Parameters:
+        series (pd.Series): The input pandas Series.
+        fraction (float): The fraction of entries to replace with 0. Default is 0.1 (10%).
+        seed (int, optional): Seed for the random number generator for reproducibility.
+
+        Returns:
+        pd.Series: The modified pandas Series with some entries replaced by 0.
+        """
+        if seed is not None:
+            np.random.seed(seed)
+
+        # Determine the number of entries to replace
+        n_replace = int(len(series) * fraction)
+
+        # Randomly select indices to replace
+        replace_indices = np.random.choice(series.index, size=n_replace, replace=False)
+
+        # Replace selected entries with
+        series.loc[replace_indices] = replace_with
+
+        return series
+
+    sensor_parameters_zeros = sensor_parameters.copy()
+    sensor_parameters_nans = sensor_parameters.copy()
+
+    sensor_parameters_zeros["pv"] = randomly_replace_with(sensor_parameters["pv"], seed=0)
+    sensor_parameters_nans["pv"] = sensor_parameters_zeros["pv"].replace(0, np.nan)
+
+    rd_analysis_zeros = TrendAnalysis(**sensor_parameters_zeros)
+    rd_analysis_zeros.sensor_analysis(analyses=["yoy_degradation"])
+
+    rd_analysis_nans = TrendAnalysis(**sensor_parameters_nans)
+    rd_analysis_nans.sensor_analysis(analyses=["yoy_degradation"])
+    return rd_analysis_zeros, rd_analysis_nans
+
+
 @pytest.fixture
 def sensor_analysis_exp_power(sensor_parameters):
     power_expected = normalization.pvwatts_dc_power(
@@ -209,6 +251,21 @@ def test_sensor_analysis(sensor_analysis):
     assert [-1, -1] == pytest.approx(ci, abs=1e-2)
 
 
+def test_sensor_analysis_nans(sensor_analysis_nans):
+    rd_analysis_zeros, rd_analysis_nans = sensor_analysis_nans
+
+    yoy_results_zeros = rd_analysis_zeros.results["sensor"]["yoy_degradation"]
+    rd_zeros = yoy_results_zeros["p50_rd"]
+    ci_zeros = yoy_results_zeros["rd_confidence_interval"]
+
+    yoy_results_nans = rd_analysis_nans.results["sensor"]["yoy_degradation"]
+    rd_nans = yoy_results_nans["p50_rd"]
+    ci_nans = yoy_results_nans["rd_confidence_interval"]
+
+    assert rd_zeros == pytest.approx(rd_nans, abs=1e-2)
+    assert ci_zeros == pytest.approx(ci_nans, abs=1e-1)
+
+
 def test_sensor_analysis_filter_components(sensor_analysis):
     columns = sensor_analysis.sensor_filter_components_aggregated.columns
     assert {'two_way_window_filter'} == set(columns)
diff --git a/rdtools/test/utilities_test.py b/rdtools/test/utilities_test.py
@@ -0,0 +1,43 @@
+import pandas as pd
+import numpy as np
+import pytest
+from rdtools.utilities import robust_quantile, robust_median, robust_mean
+
+
+@pytest.fixture
+def data():
+    data_zeros = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
+    data_nan = pd.Series([np.nan, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
+    return data_zeros, data_nan
+
+
+def test_robust_quantile(data):
+    data_zeros, data_nan = data
+    quantile = 0.5
+    expected_result = 5.5
+    assert expected_result == robust_quantile(data_zeros, quantile)
+    assert expected_result == robust_quantile(data_nan, quantile)
+
+    quantile = 0.25
+    expected_result = 3.25
+    assert expected_result == robust_quantile(data_zeros, quantile)
+    assert expected_result == robust_quantile(data_nan, quantile)
+
+    quantile = 0.75
+    expected_result = 7.75
+    assert expected_result == robust_quantile(data_zeros, quantile)
+    assert expected_result == robust_quantile(data_nan, quantile)
+
+
+def test_robust_median(data):
+    data_zeros, data_nan = data
+    expected_result = 5.5
+    assert expected_result == robust_median(data_zeros)
+    assert expected_result == robust_median(data_nan)
+
+
+def test_robust_mean(data):
+    data_zeros, data_nan = data
+    expected_result = 5.5
+    assert expected_result == robust_mean(data_zeros)
+    assert expected_result == robust_mean(data_nan)
diff --git a/rdtools/utilities.py b/rdtools/utilities.py
@@ -0,0 +1,79 @@
+"""Utility functions for rdtools."""
+
+
+def robust_quantile(x, q):
+    """
+    Compute the q-th quantile of a time series (x), ignoring small values and NaN's.
+    NaN's and small values [x < Q(x,q)/1000] are removed before calculating the quantile.
+    This function ensures that time series with NaN's and distributions without
+    NaN's return the same results. Should only be used if x is expected to be ≥0.
+
+    Parameters
+    ----------
+    x : pandas.Series
+        Input time series.
+    q : float
+        Probability value.
+
+    Returns
+    -------
+    quantile : float
+        The q-th quantile of x, ignoring small values and NaN's.
+    """
+
+    small = x.astype(float).fillna(0).quantile(q) / 1000
+    q = x[x > small].quantile(q)
+
+    return q
+
+
+def robust_median(x, q=0.99):
+    """
+    Compute the median of a time series (x), ignoring small values and NaN's.
+    NaN's and small values [Q(x,q)/1000] are removed before calculating the mean.
+    This function ensures that time series with NaN's and distributions without
+    NaN's return the same results.  Should only be used if x is expected to be ≥0.
+
+    Parameters
+    ----------
+    x : pandas.Series
+        Input time series.
+    q : float, default 0.99
+        Probability value to use for the small values threshold calculation [Q(x,q)/1000].
+
+    Returns
+    -------
+    quantile : float
+        The q-th quantile of x, ignoring small values and NaN's.
+    """
+
+    small = x.astype(float).fillna(0).quantile(q) / 1000
+    mdn = x[x > small].median()
+
+    return mdn
+
+
+def robust_mean(x, q=0.99):
+    """
+    Compute the mean of a time series (x), ignoring small values and NaN's.
+    NaN's and small values [x < Q(x,q)/1000] are removed before calculating the mean.
+    This function ensures that time series with NaN's and distributions without
+    NaN's return the same results.  Should only be used if x is expected to be ≥0.
+
+    Parameters
+    ----------
+    x : pandas.Series
+        Input time series.
+    q : float, default 0.99
+        Probability value to use for the small values threshold calculation.
+
+    Returns
+    -------
+    quantile : float
+        The q-th quantile of x, ignoring small values and NaN's.
+    """
+
+    small = x.astype(float).fillna(0).quantile(q) / 1000
+    m = x[x > small].mean()
+
+    return m