Skip to content

Commit e2e7260

Browse files
committed
Merged changes from branch BUG_df.rolling_#61416 to be included in PR
1 parent e2bd8e6 commit e2e7260

File tree

3 files changed

+121
-53
lines changed

3 files changed

+121
-53
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -830,6 +830,7 @@ Groupby/resample/rolling
830830
- Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`)
831831
- Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default. (:issue:`59666`)
832832
- Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_periods`` periods if ``method="table"``. (:issue:`58868`)
833+
- Bug in :meth:`Rolling.kurt` where the computed kurtosis could be influenced by values outside of the current rolling window. (:issue:`61416`)
833834
- Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. (:issue:`58380`)
834835

835836
Reshaping

pandas/_libs/window/aggregations.pyx

Lines changed: 70 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,18 @@
11
# cython: boundscheck=False, wraparound=False, cdivision=True
2-
32
from libc.math cimport (
43
round,
54
signbit,
65
sqrt,
6+
pow,
7+
log10,
8+
abs,
9+
isfinite,
710
)
811
from libcpp.deque cimport deque
912
from libcpp.stack cimport stack
1013
from libcpp.unordered_map cimport unordered_map
14+
from libcpp cimport bool
15+
1116

1217
from pandas._libs.algos cimport TiebreakEnumType
1318

@@ -21,6 +26,8 @@ from numpy cimport (
2126
ndarray,
2227
)
2328

29+
30+
2431
cnp.import_array()
2532

2633
import cython
@@ -724,6 +731,46 @@ cdef float64_t calc_kurt(int64_t minp, int64_t nobs,
724731

725732
return result
726733

734+
cdef void update_sum_of_window(float64_t val,
                               float64_t **x_value,
                               float64_t **comp_value,
                               int power_of_element,
                               bool add_mode,  # 1 for add_kurt, 0 for remove_kurt
                               ) noexcept nogil:
    """
    Fold ``val ** power_of_element`` into one running power sum of the window.

    Parameters
    ----------
    val : value entering (``add_mode`` true) or leaving (``add_mode`` false)
        the window.
    x_value : pointer to the pointer of the running sum; updated in place.
    comp_value : pointer to the pointer of the side accumulator paired with
        ``x_value``; updated in place.
    power_of_element : exponent applied to ``val`` before accumulation.
    add_mode : true adds the powered value, false subtracts it.

    A number is treated as "extreme" when ``abs(log10(abs(number)))`` is
    finite and greater than 15, i.e. its magnitude is above 1e15 or below
    1e-15.  Zero and infinities are never flagged because their log10 is not
    finite.
    """
    cdef:
        float64_t term, sum_magnitude, term_magnitude
        float64_t *running_sum
        float64_t *side_sum
        bool sum_is_extreme, term_is_extreme

    running_sum = x_value[0]
    side_sum = comp_value[0]

    # Signed contribution of this element to the power sum.
    term = pow(val, power_of_element)
    if not add_mode:
        term = -term

    sum_magnitude = abs(log10(abs(running_sum[0])))
    sum_is_extreme = sum_magnitude > 15 and isfinite(sum_magnitude)

    term_magnitude = abs(log10(abs(term)))
    term_is_extreme = term_magnitude > 15 and isfinite(term_magnitude)

    # The side accumulator holds the non-extreme contributions so that they
    # are not rounded away while the running sum holds an extreme value.
    if sum_is_extreme == term_is_extreme:
        # Both extreme (> 1e15 or < 1e-15) or both ordinary: add directly.
        running_sum[0] += term

        if not sum_is_extreme and side_sum[0] != 0:
            # Neither is extreme, so the parked contributions can be folded
            # back into the running sum and the side accumulator cleared.
            running_sum[0] += side_sum[0]
            side_sum[0] = 0

    elif sum_is_extreme:
        # Ordinary term against an extreme sum: park it on the side.
        side_sum[0] += term

    else:
        # Extreme term against an ordinary sum: park the current sum on the
        # side and let the extreme term become the running sum.
        side_sum[0] += running_sum[0]
        running_sum[0] = term
727774

728775
cdef void add_kurt(float64_t val, int64_t *nobs,
729776
float64_t *x, float64_t *xx,
@@ -736,29 +783,15 @@ cdef void add_kurt(float64_t val, int64_t *nobs,
736783
float64_t *prev_value
737784
) noexcept nogil:
738785
""" add a value from the kurotic calc """
739-
cdef:
740-
float64_t y, t
741786

742787
# Not NaN
743788
if val == val:
744789
nobs[0] = nobs[0] + 1
745790

746-
y = val - compensation_x[0]
747-
t = x[0] + y
748-
compensation_x[0] = t - x[0] - y
749-
x[0] = t
750-
y = val * val - compensation_xx[0]
751-
t = xx[0] + y
752-
compensation_xx[0] = t - xx[0] - y
753-
xx[0] = t
754-
y = val * val * val - compensation_xxx[0]
755-
t = xxx[0] + y
756-
compensation_xxx[0] = t - xxx[0] - y
757-
xxx[0] = t
758-
y = val * val * val * val - compensation_xxxx[0]
759-
t = xxxx[0] + y
760-
compensation_xxxx[0] = t - xxxx[0] - y
761-
xxxx[0] = t
791+
update_sum_of_window(val, &x, &compensation_x, 1, 1)
792+
update_sum_of_window(val, &xx, &compensation_xx, 2, 1)
793+
update_sum_of_window(val, &xxx, &compensation_xxx, 3, 1)
794+
update_sum_of_window(val, &xxxx, &compensation_xxxx, 4, 1)
762795

763796
# GH#42064, record num of same values to remove floating point artifacts
764797
if val == prev_value[0]:
@@ -768,7 +801,6 @@ cdef void add_kurt(float64_t val, int64_t *nobs,
768801
num_consecutive_same_value[0] = 1
769802
prev_value[0] = val
770803

771-
772804
cdef void remove_kurt(float64_t val, int64_t *nobs,
773805
float64_t *x, float64_t *xx,
774806
float64_t *xxx, float64_t *xxxx,
@@ -777,40 +809,25 @@ cdef void remove_kurt(float64_t val, int64_t *nobs,
777809
float64_t *compensation_xxx,
778810
float64_t *compensation_xxxx) noexcept nogil:
779811
""" remove a value from the kurotic calc """
780-
cdef:
781-
float64_t y, t
782812

783813
# Not NaN
784814
if val == val:
785815
nobs[0] = nobs[0] - 1
786816

787-
y = - val - compensation_x[0]
788-
t = x[0] + y
789-
compensation_x[0] = t - x[0] - y
790-
x[0] = t
791-
y = - val * val - compensation_xx[0]
792-
t = xx[0] + y
793-
compensation_xx[0] = t - xx[0] - y
794-
xx[0] = t
795-
y = - val * val * val - compensation_xxx[0]
796-
t = xxx[0] + y
797-
compensation_xxx[0] = t - xxx[0] - y
798-
xxx[0] = t
799-
y = - val * val * val * val - compensation_xxxx[0]
800-
t = xxxx[0] + y
801-
compensation_xxxx[0] = t - xxxx[0] - y
802-
xxxx[0] = t
803-
817+
update_sum_of_window(val, &x, &compensation_x, 1, 0)
818+
update_sum_of_window(val, &xx, &compensation_xx, 2, 0)
819+
update_sum_of_window(val, &xxx, &compensation_xxx, 3, 0)
820+
update_sum_of_window(val, &xxxx, &compensation_xxxx, 4, 0)
804821

805822
def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
806823
ndarray[int64_t] end, int64_t minp) -> np.ndarray:
807824
cdef:
808825
Py_ssize_t i, j
809826
float64_t val, mean_val, min_val, sum_val = 0
810-
float64_t compensation_xxxx_add, compensation_xxxx_remove
811-
float64_t compensation_xxx_remove, compensation_xxx_add
812-
float64_t compensation_xx_remove, compensation_xx_add
813-
float64_t compensation_x_remove, compensation_x_add
827+
float64_t compensation_xxxx
828+
float64_t compensation_xxx
829+
float64_t compensation_xx
830+
float64_t compensation_x
814831
float64_t x, xx, xxx, xxxx
815832
float64_t prev_value
816833
int64_t nobs, s, e, num_consecutive_same_value
@@ -851,16 +868,16 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
851868
prev_value = values[s]
852869
num_consecutive_same_value = 0
853870

854-
compensation_xxxx_add = compensation_xxxx_remove = 0
855-
compensation_xxx_remove = compensation_xxx_add = 0
856-
compensation_xx_remove = compensation_xx_add = 0
857-
compensation_x_remove = compensation_x_add = 0
871+
compensation_xxxx = 0
872+
compensation_xxx = 0
873+
compensation_xx = 0
874+
compensation_x = 0
858875
x = xx = xxx = xxxx = 0
859876
nobs = 0
860877
for j in range(s, e):
861878
add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
862-
&compensation_x_add, &compensation_xx_add,
863-
&compensation_xxx_add, &compensation_xxxx_add,
879+
&compensation_x, &compensation_xx,
880+
&compensation_xxx, &compensation_xxxx,
864881
&num_consecutive_same_value, &prev_value)
865882

866883
else:
@@ -870,14 +887,14 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
870887
# calculate deletes
871888
for j in range(start[i - 1], s):
872889
remove_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
873-
&compensation_x_remove, &compensation_xx_remove,
874-
&compensation_xxx_remove, &compensation_xxxx_remove)
890+
&compensation_x, &compensation_xx,
891+
&compensation_xxx, &compensation_xxxx)
875892

876893
# calculate adds
877894
for j in range(end[i - 1], e):
878895
add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
879-
&compensation_x_add, &compensation_xx_add,
880-
&compensation_xxx_add, &compensation_xxxx_add,
896+
&compensation_x, &compensation_xx,
897+
&compensation_xxx, &compensation_xxxx,
881898
&num_consecutive_same_value, &prev_value)
882899

883900
output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx,

pandas/tests/window/test_rolling_skew_kurt.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,3 +225,53 @@ def test_rolling_kurt_eq_value_fperr(step):
225225
a = Series([1.1] * 15).rolling(window=10, step=step).kurt()
226226
assert (a[a.index >= 9] == -3).all()
227227
assert a[a.index < 9].isna().all()
228+
229+
@pytest.mark.parametrize(
    "test_len, window_size, modifiers",
    [
        ([0, 10], 5, [[0, 1e6], [3, -1e6]]),
        ([0, 10], 5, [[0, 1e-6], [3, 1e6]]),
        ([10, 100], 20, [[40, -1e10], [59, -9e9]]),
        # NOTE(review): 109900 is not a value in range(10500, 11000), so that
        # replacement is a no-op; possibly 10990 was intended -- confirm.
        ([10500, 11000], 200, [[10581, 0], [109900, -1e6], [10999, 0]]),
    ],
)
def test_rolling_kurt_outlier_influence(test_len, window_size, modifiers):
    # GH#61416 Extreme values cause the kurtosis value to become incorrect.
    # The rolling kurtosis of the full series is compared, over its last
    # ``window_size`` results, with that of a copy whose leading elements
    # have been dropped (reindexed back to the same labels).
    first, stop = test_len
    labels = range(first, stop)
    test_series = Series(labels, index=labels)

    # Values equal their index labels here, so replacing by value swaps the
    # element sitting at that label.
    for old_value, new_value in modifiers:
        test_series = test_series.replace(old_value, new_value)

    # Minimum elements needed for "window_size" number of kurts.
    n_leading_dropped = (stop - first) - 2 * window_size + 1
    truncated = test_series[n_leading_dropped:].reindex(labels)

    actual = test_series.rolling(window_size, min_periods=1).kurt()
    expected = truncated.rolling(window_size, min_periods=1).kurt()

    tm.assert_series_equal(actual.tail(window_size), expected.tail(window_size))
253+
254+
@pytest.mark.parametrize(
    "array_param, window_size, modifiers",
    [
        ([10, 10, 10], 5, [[0, 1e6], [3, -1e6]]),
        ([-15, 10, 10], 5, [[0, 1e2], [3, 1e6]]),
        ([1e4, 1e3, 100], 20, [[90, -1e7], [0, 1e7]]),
        ([1e-3, 3e-3, 100], 20, [[90, 100], [20, 1e4]]),
    ],
)
def test_rolling_kurt_outlier_influence_rand(array_param, window_size, modifiers):
    # GH#61416 Extreme values cause the kurtosis value to become incorrect.
    # ``array_param`` is (mean, standard deviation, size) for the normal
    # draw; ``modifiers`` is a list of (position, value) outliers to inject.
    loc, scale, size = array_param
    rand_array = np.random.default_rng(5).normal(loc, scale, size)
    test_series = Series(rand_array)

    # Assign by position.  ``Series.replace`` matches on *values*, and the
    # random floats never equal the modifier positions, so using it here
    # would silently leave the series without any outliers.
    for position, number in modifiers:
        test_series.iloc[position] = number

    # Minimum elements needed for "window_size" number of kurts.
    min_elements_needed = size - 2 * window_size + 1
    expected_series = test_series[min_elements_needed:]

    actual = test_series.rolling(window_size, min_periods=1).kurt()
    expected = expected_series.rolling(window_size, min_periods=1).kurt()

    tm.assert_series_equal(actual.tail(window_size), expected.tail(window_size))

0 commit comments

Comments
 (0)