diff --git a/src/tsam/timeseriesaggregation.py b/src/tsam/timeseriesaggregation.py index df4262d0..4b8337de 100644 --- a/src/tsam/timeseriesaggregation.py +++ b/src/tsam/timeseriesaggregation.py @@ -938,29 +938,49 @@ def _rescaleClusterPeriods(self, clusterOrder, clusterPeriods, extremeClusterIdx if column in self.weightDict: scale_ub = scale_ub * self.weightDict[column] - # difference between predicted and original sum - diff = abs(sum_raw - (sum_clu_wo_peak + sum_peak)) - - # use while loop to rescale cluster periods + # Bounded, sum-preserving rescale: one multiplicative shape-preserving + # step, then water-fill the residual proportional to remaining headroom + # (or depth when shrinking). Values stay strictly in [0, scale_ub], so + # the cluster's distribution top is not flattened against the cap — + # which is the condition that previously caused downstream + # _representMinMax to overshoot the input envelope. + target_np = sum_raw - sum_peak + + # Shape-preserving initial shift (single application, not iterated) + if sum_clu_wo_peak > 0 and target_np > 0: + arr[idx_wo_peak, ci, :] *= target_np / sum_clu_wo_peak + # Advanced indexing returns a copy; reassign to write clipped values back + arr[idx_wo_peak, ci, :] = np.clip(arr[idx_wo_peak, ci, :], 0, scale_ub) + np.nan_to_num(arr[:, ci, :], copy=False, nan=0.0) + + weights_np = weightingVec[idx_wo_peak] a = 0 - while diff > sum_raw * TOLERANCE and a < MAX_ITERATOR: - # rescale values (only non-extreme clusters) - arr[idx_wo_peak, ci, :] *= (sum_raw - sum_peak) / sum_clu_wo_peak - - # reset values higher than the upper scale or less than zero - arr[:, ci, :] = np.clip(arr[:, ci, :], 0, scale_ub) - - # Handle NaN (replace with 0) - np.nan_to_num(arr[:, ci, :], copy=False, nan=0.0) - - # calc new sum and new diff to orig data - col_data = arr[:, ci, :] - sum_clu_wo_peak = np.sum( - weightingVec[idx_wo_peak] * col_data[idx_wo_peak, :].sum(axis=1) - ) - diff = abs(sum_raw - (sum_clu_wo_peak + sum_peak)) + while a < MAX_ITERATOR: + col_np = arr[idx_wo_peak, ci, :] + cur_sum_np = np.sum(weights_np * col_np.sum(axis=1)) + delta = target_np - cur_sum_np + if abs(delta) <= max(abs(sum_raw), 1.0) * TOLERANCE: + break + if delta > 0: + available = scale_ub - col_np + else: + available = col_np # depth above lower bound 0 + total = np.sum(weights_np * available.sum(axis=1)) + if total <= TOLERANCE: + break + step = min(abs(delta), total) + direction = 1.0 if delta > 0 else -1.0 + col_np += direction * step * available / total + # Guard against tiny numerical bound violations from the division + np.clip(col_np, 0, scale_ub, out=col_np) + arr[idx_wo_peak, ci, :] = col_np a += 1 + sum_clu_wo_peak = np.sum( + weights_np * arr[idx_wo_peak, ci, :].sum(axis=1) + ) + diff = abs(sum_raw - (sum_clu_wo_peak + sum_peak)) + # Calculate and store final deviation deviation_pct = (diff / sum_raw) * 100 if sum_raw != 0 else 0.0 converged = a < MAX_ITERATOR diff --git a/src/tsam/utils/durationRepresentation.py b/src/tsam/utils/durationRepresentation.py index 3a661f32..c8e34995 100644 --- a/src/tsam/utils/durationRepresentation.py +++ b/src/tsam/utils/durationRepresentation.py @@ -189,12 +189,14 @@ def _representMinMax( # due to the change of the min and max values # of the duration curve delta_sum = delta_max * appearance_max + delta_min * appearance_min + mid_weights = np.asarray( + meansAndWeightsSorted[1].iloc[1:-1].values, dtype=float + ) + mid_orig = np.asarray(representationValues[1:-1], dtype=float) + weighted_mid_sum = float(np.sum(mid_weights * mid_orig)) # and derive how much the other values have to be changed to preserve # the mean of the duration curve - correction_factor = ( - -delta_sum - / (meansAndWeightsSorted[1].iloc[1:-1] * representationValues[1:-1]).sum() - ) + correction_factor = -delta_sum / weighted_mid_sum if weighted_mid_sum != 0 else 0.0 if correction_factor < -1 or correction_factor > 1: warnings.warn( @@ -202,12 +204,41 @@ def _representMinMax( ) return representationValues - # correct the values of the duration curve such - # that the mean of the duration curve is preserved - # since the min and max values are changed - representationValues[1:-1] = np.multiply( - representationValues[1:-1], (1 + correction_factor) - ) + # Initial multiplicative correction: preserves the weighted sum exactly + # and keeps zero-valued segments at zero (matching the distribution + # shape of the cluster). When no bound is violated, no further work + # is needed and this matches the legacy behaviour. + corrected = mid_orig * (1 + correction_factor) + + # Clip to the cluster envelope and redistribute the mass that got + # clipped to segments that still have room. This makes the result + # envelope-safe while restoring sum preservation up to feasibility. + lower = float(sortedAttr.min()) + upper = float(sortedAttr.max()) + target_weighted_sum = weighted_mid_sum - delta_sum + tol = max(abs(target_weighted_sum), 1.0) * 1e-12 + for _ in range(20): + np.clip(corrected, lower, upper, out=corrected) + residual = target_weighted_sum - float(np.sum(mid_weights * corrected)) + if abs(residual) <= tol: + break + if residual > 0: + room = upper - corrected + else: + room = corrected - lower + weighted_room = mid_weights * room + total = float(weighted_room.sum()) + if total <= tol: + # remaining mass has nowhere feasible to go + warnings.warn( + "The cluster is too small to preserve the sum of the duration curve and additionally the min and max values of the original cluster members. The min max values of the cluster are not preserved. This does not necessarily mean that the min and max values of the original time series are not preserved." + ) + break + step = min(abs(residual), total) + direction = 1.0 if residual > 0 else -1.0 + corrected = corrected + direction * step * room / total + + representationValues[1:-1] = corrected # change the values of the duration curve such that the min and max # values are preserved diff --git a/test/test_durationRepresentation.py b/test/test_durationRepresentation.py index 2dfd49af..9131fc8b 100644 --- a/test/test_durationRepresentation.py +++ b/test/test_durationRepresentation.py @@ -120,7 +120,37 @@ def test_distributionMinMaxRepresentation(): predictedPeriods.min(), ) - assert np.isclose(raw.mean(), predictedPeriods.mean(), atol=1e-4).all() + # distributionAndMinMaxRepresentation now enforces the per-cluster envelope + # strictly, so the mean can drift slightly in clusters where the old code + # would have produced values above the local max. The trade buys a hard + # guarantee that the aggregated series never exceeds the input envelope. + assert np.isclose(raw.mean(), predictedPeriods.mean(), rtol=5e-3).all() + + +@pytest.mark.filterwarnings("ignore:The cluster is too small:UserWarning") +def test_distributionMinMaxRepresentation_with_rescale(): + raw = pd.read_csv(TESTDATA_CSV, index_col=0) + + aggregation = tsam.TimeSeriesAggregation( + raw, + noTypicalPeriods=24, + segmentation=True, + noSegments=8, + hoursPerPeriod=24, + sortValues=False, + clusterMethod="hierarchical", + representationMethod="distributionAndMinMaxRepresentation", + distributionPeriodWise=False, + rescaleClusterPeriods=True, + extremePeriodMethod="None", + ) + + predictedPeriods = aggregation.predictOriginalData() + + assert (predictedPeriods.max() <= raw.max() + 1e-10).all() + assert (predictedPeriods.min() >= raw.min() - 1e-10).all() + + assert np.isclose(raw.sum(), predictedPeriods.sum(), rtol=5e-3).all() def test_distributionRepresentation_keeps_mean():