Skip to content

Commit 1670fc6

Browse files
introduce robust utility functions
1 parent 0052423 commit 1670fc6

File tree

4 files changed

+99
-28
lines changed

4 files changed

+99
-28
lines changed

rdtools/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@
3636
# from rdtools.plotting import soiling_rate_histogram
3737
# from rdtools.plotting import availability_summary_plots
3838
# from rdtools.availability import AvailabilityAnalysis
39+
from rdtools.utilities import robust_quantile
40+
from rdtools.utilities import robust_median
41+
from rdtools.utilities import robust_mean
3942

4043
from . import _version
4144
__version__ = _version.get_versions()['version']

rdtools/degradation.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import statsmodels.api as sm
66
from rdtools.bootstrap import _make_time_series_bootstrap_samples, \
77
_construct_confidence_intervals
8+
from rdtools import utilities
89

910

1011
def degradation_ols(energy_normalized, confidence_level=68.2):
@@ -259,7 +260,7 @@ def degradation_year_on_year(energy_normalized, recenter=True,
259260
if recenter:
260261
start = energy_normalized.index[0]
261262
oneyear = start + pd.Timedelta('364d')
262-
renorm = energy_normalized[start:oneyear].median()
263+
renorm = utilities.robust_median(energy_normalized[start:oneyear])
263264
else:
264265
renorm = 1.0
265266

rdtools/filtering.py

Lines changed: 15 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from scipy.interpolate import interp1d
99
import rdtools
1010
import xgboost as xgb
11+
from rdtools import utilities
1112

1213
# Load in the XGBoost clipping model using joblib.
1314
xgboost_clipping_model = None
@@ -335,8 +336,7 @@ def quantile_clip_filter(power_ac, quantile=0.98):
335336
"""
336337
Filter data points likely to be affected by clipping
337338
with power or energy greater than or equal to 99% of the `quant`
338-
quantile. NaN's and small values (power_ac(quantile) / 1000) are
339-
removed before calculating clipping threshold.
339+
quantile.
340340
341341
Parameters
342342
----------
@@ -351,16 +351,8 @@ def quantile_clip_filter(power_ac, quantile=0.98):
351351
Boolean Series of whether the given measurement is below 99% of the
352352
quantile filter.
353353
"""
354-
# Replace NaN's and small values for quantile calculation
355-
# This ensures that power series with NaN's instead of zero values
356-
# provide the same result.
357-
lower = power_ac.fillna(0).quantile(quantile) / 1000
358-
359-
# Calculate the quantile and upper clipping threshold
360-
q = power_ac[power_ac > lower].quantile(quantile)
361-
upper = q * 0.99
362-
363-
return power_ac < upper
354+
v = utilities.robust_quantile(power_ac, quantile)
355+
return power_ac < v * 0.99
364356

365357

366358
def _format_clipping_time_series(power_ac, mounting_type):
@@ -519,18 +511,15 @@ def _apply_overall_clipping_threshold(power_ac, clipping_mask, clipped_power_ac)
519511
periods are labeled as True and non-clipping periods are
520512
labeled as False. Has a pandas datetime index.
521513
"""
522-
523-
# Ensure that series with NaN's return same results as series with 0's
524-
lower = power_ac.fillna(0).quantile(0.99) / 1000
525-
power_ac_quant = power_ac[power_ac > lower].quantile(0.99)
514+
q_power_ac = utilities.robust_quantile(power_ac, 0.99)
515+
q_clipped_power_ac = utilities.robust_quantile(clipped_power_ac, 0.99)
526516

527517
upper_bound_pdiff = abs(
528-
(power_ac_quant - clipped_power_ac.quantile(0.99))
529-
/ ((power_ac_quant + clipped_power_ac.quantile(0.99)) / 2)
518+
(q_power_ac - q_clipped_power_ac) / ((q_power_ac + q_clipped_power_ac) / 2)
530519
)
531520
percent_clipped = len(clipped_power_ac) / len(power_ac) * 100
532521
if (upper_bound_pdiff < 0.005) & (percent_clipped > 4):
533-
max_clip = power_ac >= power_ac_quant
522+
max_clip = power_ac >= q_power_ac
534523
clipping_mask = clipping_mask | max_clip
535524
return clipping_mask
536525

@@ -656,15 +645,14 @@ def logic_clip_filter(
656645
# Set any values within the clipping max + clipping min threshold
657646
# as clipping. This is done specifically for capturing the noise
658647
# for high frequency data sets.
659-
660-
# Ensure that time series with zeros and nan's return same result
661-
lower = clip_pwr.fillna(0).quantile(0.99) / 1000
662-
clip_pwr_no_nan = clip_pwr[clip_pwr > lower]
663-
664-
daily_mean = clip_pwr_no_nan.resample("D").mean()
648+
daily_mean = clip_pwr.resample("D").mean()
665649
df_daily = daily_mean.to_frame(name="mean")
666-
df_daily["clipping_max"] = clip_pwr_no_nan.groupby(pd.Grouper(freq="D")).quantile(0.99)
667-
df_daily["clipping_min"] = clip_pwr_no_nan.groupby(pd.Grouper(freq="D")).quantile(0.075)
650+
df_daily["clipping_max"] = clip_pwr.groupby(pd.Grouper(freq="D")).agg(
651+
utilities.robust_quantile, q=0.99
652+
)
653+
df_daily["clipping_min"] = clip_pwr.groupby(pd.Grouper(freq="D")).agg(
654+
utilities.robust_quantile, q=0.075
655+
)
668656
daily_clipping_max = df_daily["clipping_max"].reindex(
669657
index=power_ac_copy.index, method="ffill"
670658
)

rdtools/utilities.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
"""Utility functions for rdtools."""
2+
3+
4+
def robust_quantile(x, q):
    """
    Compute the q-th quantile of a time series, ignoring small values and NaN's.

    NaN's and small values [x <= Q(x, q) / 1000] are removed before
    calculating the quantile. The threshold quantile Q(x, q) is evaluated
    with NaN's replaced by zero, which is intended to make a series that
    uses NaN's in place of zeros give the same result as the zero-filled
    series.

    Parameters
    ----------
    x : pandas.Series
        Input time series.
    q : float
        Probability value passed to ``pandas.Series.quantile``.

    Returns
    -------
    quantile : float
        The q-th quantile of x, ignoring small values and NaN's.
    """
    # NaN's are counted as zeros only while locating the threshold.
    small = x.fillna(0).quantile(q) / 1000

    # ``NaN > small`` evaluates to False, so the mask drops NaN's as well
    # as every value at or below the threshold.
    return x[x > small].quantile(q)
28+
29+
30+
def robust_median(x, q=0.99):
    """
    Compute the median of a time series, ignoring small values and NaN's.

    NaN's and small values [x <= Q(x, q) / 1000] are removed before
    calculating the median. The threshold quantile Q(x, q) is evaluated
    with NaN's replaced by zero, which is intended to make a series that
    uses NaN's in place of zeros give the same result as the zero-filled
    series.

    Parameters
    ----------
    x : pandas.Series
        Input time series.
    q : float, default 0.99
        Probability value used for the small-value threshold
        calculation [Q(x, q) / 1000].

    Returns
    -------
    median : float
        The median of x, ignoring small values and NaN's.
    """
    # NaN's are counted as zeros only while locating the threshold.
    small = x.fillna(0).quantile(q) / 1000

    # ``NaN > small`` evaluates to False, so the mask drops NaN's as well
    # as every value at or below the threshold.
    return x[x > small].median()
54+
55+
56+
def robust_mean(x, q=0.99):
    """
    Compute the mean of a time series, ignoring small values and NaN's.

    NaN's and small values [x <= Q(x, q) / 1000] are removed before
    calculating the mean. The threshold quantile Q(x, q) is evaluated
    with NaN's replaced by zero, which is intended to make a series that
    uses NaN's in place of zeros give the same result as the zero-filled
    series.

    Parameters
    ----------
    x : pandas.Series
        Input time series.
    q : float, default 0.99
        Probability value used for the small-value threshold
        calculation [Q(x, q) / 1000].

    Returns
    -------
    mean : float
        The mean of x, ignoring small values and NaN's.
    """
    # NaN's are counted as zeros only while locating the threshold.
    small = x.fillna(0).quantile(q) / 1000

    # ``NaN > small`` evaluates to False, so the mask drops NaN's as well
    # as every value at or below the threshold.
    return x[x > small].mean()

0 commit comments

Comments
 (0)