Merged changes from branch BUG_df.rolling_#61416 to be included in PR

eicchen · eicchen · commit e2e726004684 · 2025-05-22T17:19:47.000-05:00
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -830,6 +830,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`)
 - Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default. (:issue:`59666`)
 - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`)
+- Bug in :meth:`Rolling.kurt` where kurtosis calculations would unintentionally be influenced by values outside of scope. (:issue:`61416`)
 - Bug in :meth:`Series.resample` could raise when the the date range ended shortly before a non-existent time. (:issue:`58380`)
 
 Reshaping
diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
@@ -1,13 +1,18 @@
 # cython: boundscheck=False, wraparound=False, cdivision=True
-
 from libc.math cimport (
     round,
     signbit,
     sqrt,
+    pow,
+    log10,
+    abs,
+    isfinite,
 )
 from libcpp.deque cimport deque
 from libcpp.stack cimport stack
 from libcpp.unordered_map cimport unordered_map
+from libcpp cimport bool
+
 
 from pandas._libs.algos cimport TiebreakEnumType
 
@@ -21,6 +26,8 @@ from numpy cimport (
     ndarray,
 )
 
+
+
 cnp.import_array()
 
 import cython
@@ -724,6 +731,46 @@ cdef float64_t calc_kurt(int64_t minp, int64_t nobs,
 
     return result
 
+cdef void update_sum_of_window( float64_t val,
+                                float64_t **x_value,
+                                float64_t **comp_value,
+                                int power_of_element,  
+                                bool add_mode,   #1 for add_kurt, 0 for remove_kurt
+                                ) noexcept nogil:
+
+    cdef:
+        float64_t val_raised, new_sum
+        bool val_length_flag, x_length_flag
+    
+    if add_mode:
+        val_raised = pow(val, power_of_element)
+    else:
+        val_raised = -pow(val, power_of_element)
+
+    x_length_flag = abs(log10(abs(x_value[0][0]))) > 15 and isfinite(abs(log10(abs(x_value[0][0])))) == 1
+    val_length_flag = abs(log10(abs(val_raised))) > 15 and isfinite(abs(log10(abs(val_raised)))) == 1
+
+    # We'll try to maintain comp_value as the counter for 
+    # numbers <1e15 to keep it from getting rounded out.
+    if x_length_flag and val_length_flag:
+        #Both > 1e15 or < 1-e15
+        x_value[0][0] += val_raised
+
+    elif x_length_flag:
+        comp_value[0][0] += val_raised
+
+
+    elif val_length_flag:
+        comp_value[0][0] += x_value[0][0]
+        x_value[0][0] = val_raised
+        
+    else:
+        #Neither are >1e15/<1e-15, safe to proceed
+        x_value[0][0] += val_raised
+
+        if comp_value[0][0] != 0:
+            x_value[0][0] += comp_value[0][0]
+            comp_value[0][0] = 0
 
 cdef void add_kurt(float64_t val, int64_t *nobs,
                    float64_t *x, float64_t *xx,
@@ -736,29 +783,15 @@ cdef void add_kurt(float64_t val, int64_t *nobs,
                    float64_t *prev_value
                    ) noexcept nogil:
     """ add a value from the kurotic calc """
-    cdef:
-        float64_t y, t
 
     # Not NaN
     if val == val:
         nobs[0] = nobs[0] + 1
 
-        y = val - compensation_x[0]
-        t = x[0] + y
-        compensation_x[0] = t - x[0] - y
-        x[0] = t
-        y = val * val - compensation_xx[0]
-        t = xx[0] + y
-        compensation_xx[0] = t - xx[0] - y
-        xx[0] = t
-        y = val * val * val - compensation_xxx[0]
-        t = xxx[0] + y
-        compensation_xxx[0] = t - xxx[0] - y
-        xxx[0] = t
-        y = val * val * val * val - compensation_xxxx[0]
-        t = xxxx[0] + y
-        compensation_xxxx[0] = t - xxxx[0] - y
-        xxxx[0] = t
+        update_sum_of_window(val, &x,    &compensation_x,    1, 1)
+        update_sum_of_window(val, &xx,   &compensation_xx,   2, 1)
+        update_sum_of_window(val, &xxx,  &compensation_xxx,  3, 1)
+        update_sum_of_window(val, &xxxx, &compensation_xxxx, 4, 1)
 
         # GH#42064, record num of same values to remove floating point artifacts
         if val == prev_value[0]:
@@ -768,7 +801,6 @@ cdef void add_kurt(float64_t val, int64_t *nobs,
             num_consecutive_same_value[0] = 1
         prev_value[0] = val
 
-
 cdef void remove_kurt(float64_t val, int64_t *nobs,
                       float64_t *x, float64_t *xx,
                       float64_t *xxx, float64_t *xxxx,
@@ -777,40 +809,25 @@ cdef void remove_kurt(float64_t val, int64_t *nobs,
                       float64_t *compensation_xxx,
                       float64_t *compensation_xxxx) noexcept nogil:
     """ remove a value from the kurotic calc """
-    cdef:
-        float64_t y, t
 
     # Not NaN
     if val == val:
         nobs[0] = nobs[0] - 1
 
-        y = - val - compensation_x[0]
-        t = x[0] + y
-        compensation_x[0] = t - x[0] - y
-        x[0] = t
-        y = - val * val - compensation_xx[0]
-        t = xx[0] + y
-        compensation_xx[0] = t - xx[0] - y
-        xx[0] = t
-        y = - val * val * val - compensation_xxx[0]
-        t = xxx[0] + y
-        compensation_xxx[0] = t - xxx[0] - y
-        xxx[0] = t
-        y = - val * val * val * val - compensation_xxxx[0]
-        t = xxxx[0] + y
-        compensation_xxxx[0] = t - xxxx[0] - y
-        xxxx[0] = t
-
+        update_sum_of_window(val, &x,    &compensation_x,    1, 0)
+        update_sum_of_window(val, &xx,   &compensation_xx,   2, 0)
+        update_sum_of_window(val, &xxx,  &compensation_xxx,  3, 0)
+        update_sum_of_window(val, &xxxx, &compensation_xxxx, 4, 0)
 
 def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
               ndarray[int64_t] end, int64_t minp) -> np.ndarray:
     cdef:
         Py_ssize_t i, j
         float64_t val, mean_val, min_val, sum_val = 0
-        float64_t compensation_xxxx_add, compensation_xxxx_remove
-        float64_t compensation_xxx_remove, compensation_xxx_add
-        float64_t compensation_xx_remove, compensation_xx_add
-        float64_t compensation_x_remove, compensation_x_add
+        float64_t compensation_xxxx
+        float64_t compensation_xxx
+        float64_t compensation_xx
+        float64_t compensation_x
         float64_t x, xx, xxx, xxxx
         float64_t prev_value
         int64_t nobs, s, e, num_consecutive_same_value
@@ -851,16 +868,16 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
                 prev_value = values[s]
                 num_consecutive_same_value = 0
 
-                compensation_xxxx_add = compensation_xxxx_remove = 0
-                compensation_xxx_remove = compensation_xxx_add = 0
-                compensation_xx_remove = compensation_xx_add = 0
-                compensation_x_remove = compensation_x_add = 0
+                compensation_xxxx = 0
+                compensation_xxx = 0
+                compensation_xx = 0
+                compensation_x = 0
                 x = xx = xxx = xxxx = 0
                 nobs = 0
                 for j in range(s, e):
                     add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
-                             &compensation_x_add, &compensation_xx_add,
-                             &compensation_xxx_add, &compensation_xxxx_add,
+                             &compensation_x, &compensation_xx,
+                             &compensation_xxx, &compensation_xxxx,
                              &num_consecutive_same_value, &prev_value)
 
             else:
@@ -870,14 +887,14 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
                 # calculate deletes
                 for j in range(start[i - 1], s):
                     remove_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
-                                &compensation_x_remove, &compensation_xx_remove,
-                                &compensation_xxx_remove, &compensation_xxxx_remove)
+                                &compensation_x, &compensation_xx,
+                                &compensation_xxx, &compensation_xxxx)
 
                 # calculate adds
                 for j in range(end[i - 1], e):
                     add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
-                             &compensation_x_add, &compensation_xx_add,
-                             &compensation_xxx_add, &compensation_xxxx_add,
+                             &compensation_x, &compensation_xx,
+                             &compensation_xxx, &compensation_xxxx,
                              &num_consecutive_same_value, &prev_value)
 
             output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx,
diff --git a/pandas/tests/window/test_rolling_skew_kurt.py b/pandas/tests/window/test_rolling_skew_kurt.py
@@ -225,3 +225,53 @@ def test_rolling_kurt_eq_value_fperr(step):
     a = Series([1.1] * 15).rolling(window=10, step=step).kurt()
     assert (a[a.index >= 9] == -3).all()
     assert a[a.index < 9].isna().all()
+
+@pytest.mark.parametrize("test_len, window_size, modifiers",
+                         [([0, 10], 5,  [[0,1e6], [3, -1e6]]),
+                          ([0, 10], 5,  [[0,1e-6], [3, 1e6]]),
+                          ([10, 100], 20,[[40, -1e10], [59, -9e9]]),
+                          ([10500, 11000], 200,[[10581, 0], [109900, -1e6], [10999, 0]]),
+                          ]
+                        )
+def test_rolling_kurt_outlier_influence(test_len, window_size, modifiers):
+    # #61416 Extreme values causes kurtosis value to become incorrect
+    test_series = Series(range(test_len[0], test_len[1]), index = range(test_len[0], test_len[1]))
+    for ind, number in modifiers:
+        test_series = test_series.replace(ind, number)
+        
+    #minimum elements needed for "window_size" number of kurts 
+    test_len_diff = test_len[1] - test_len[0]
+    min_elements_needed = test_len_diff - 2*window_size + 1 
+    expected_series = (test_series[min_elements_needed:].reindex(range(test_len[0], test_len[1])))
+    
+    actual = test_series.rolling(window_size,min_periods=1).kurt()
+    expected = expected_series.rolling(window_size,min_periods=1).kurt()
+    
+    tm.assert_series_equal(actual.tail(window_size), 
+                           expected.tail(window_size)
+                        )
+    
+@pytest.mark.parametrize("array_param, window_size, modifiers",
+                         [([10, 10, 10], 5,  [[0,1e6], [3, -1e6]]),
+                          ([-15, 10, 10], 5,  [[0,1e2], [3, 1e6]]),
+                          ([1e4, 1e3, 100], 20, [[90,-1e7], [0, 1e7]]),
+                          ([1e-3, 3e-3, 100], 20, [[90,100], [20, 1e4]]),
+                          ]
+                        )
+def test_rolling_kurt_outlier_influence_rand(array_param, window_size, modifiers):
+    # #61416 Extreme values causes kurtosis value to become incorrect
+    rand_array = np.random.default_rng(5).normal(array_param[0], array_param[1], array_param[2])
+    test_series = Series(rand_array)
+    for ind, number in modifiers:
+        test_series = test_series.replace(ind, number)
+        
+    #minimum elements needed for "window_size" number of kurts 
+    min_elements_needed = array_param[2] - 2*window_size + 1 
+    expected_series = (test_series[min_elements_needed:])
+        
+    actual = test_series.rolling(window_size,min_periods=1).kurt()
+    expected = expected_series.rolling(window_size,min_periods=1).kurt()
+    
+    tm.assert_series_equal(actual.tail(window_size), 
+                           expected.tail(window_size)
+                        )